Update README.md
Browse files
README.md
CHANGED
@@ -31,18 +31,25 @@ pip install sentencepiece
```
31
32   ```python
33   import sentencepiece as spm
34
35   # Load the tokenizer model
36   sp = spm.SentencePieceProcessor()
37 - sp.load(
38
39   # Sample text for tokenization
40   text = "ଦୀପାବଳି ଏକ ଭାରତୀୟ ପର୍ବ ।"
41
42 - # Tokenize the text
43   tokens = sp.encode_as_pieces(text)
44   token_ids = sp.encode_as_ids(text)
45
46   print("Tokens:", tokens)
47   print("Token IDs:", token_ids)
48   ```
```
|
```
31
32   ```python
33   import sentencepiece as spm
34 + from huggingface_hub import hf_hub_download
35 +
36 + # Download the model file from Hugging Face
37 + model_path = hf_hub_download(repo_id="shantipriya/OdiaTokenizer", filename="odia_tokenizers_test.model")
38
39   # Load the tokenizer model
40   sp = spm.SentencePieceProcessor()
41 + sp.load(model_path)
42
43   # Sample text for tokenization
44   text = "ଦୀପାବଳି ଏକ ଭାରତୀୟ ପର୍ବ ।"
45
46 + # Tokenize the text into pieces (subwords or tokens)
47   tokens = sp.encode_as_pieces(text)
48 +
49 + # Tokenize the text into token IDs (integer representations of the tokens)
50   token_ids = sp.encode_as_ids(text)
51
52 + # Print the tokenized output
53   print("Tokens:", tokens)
54   print("Token IDs:", token_ids)
55   ```
```