nioushasadjadi
committed on
Commit
·
92d46e2
1
Parent(s):
eb1e311
Adding automap to the tokenizer config
Browse files- tokenizer.py +8 -1
- tokenizer_config.json +6 -0
tokenizer.py
CHANGED
|
@@ -93,6 +93,12 @@ class KmerTokenizer(PreTrainedTokenizer):
|
|
| 93 |
"1": {"content": "[UNK]", "lstrip": False, "normalized": False, "rstrip": False, "single_word": False,
|
| 94 |
"special": True}
|
| 95 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 96 |
"clean_up_tokenization_spaces": True,
|
| 97 |
"mask_token": "[MASK]",
|
| 98 |
"model_max_length": 1e12, # Set a high number, or adjust as needed
|
|
@@ -119,7 +125,8 @@ class KmerTokenizer(PreTrainedTokenizer):
|
|
| 119 |
# stride = vocab_content["model"]["stride"]
|
| 120 |
|
| 121 |
# Load k and stride from tokenizer_config.json
|
| 122 |
-
tokenizer_config_file = os.path.join(pretrained_dir, "tokenizer_config.json")
|
|
|
|
| 123 |
if os.path.exists(tokenizer_config_file):
|
| 124 |
with open(tokenizer_config_file, "r", encoding="utf-8") as f:
|
| 125 |
tokenizer_config = json.load(f)
|
|
|
|
| 93 |
"1": {"content": "[UNK]", "lstrip": False, "normalized": False, "rstrip": False, "single_word": False,
|
| 94 |
"special": True}
|
| 95 |
},
|
| 96 |
+
"auto_map": {
|
| 97 |
+
"AutoTokenizer": [
|
| 98 |
+
"tokenizer.KmerTokenizer",
|
| 99 |
+
None
|
| 100 |
+
]
|
| 101 |
+
},
|
| 102 |
"clean_up_tokenization_spaces": True,
|
| 103 |
"mask_token": "[MASK]",
|
| 104 |
"model_max_length": 1e12, # Set a high number, or adjust as needed
|
|
|
|
| 125 |
# stride = vocab_content["model"]["stride"]
|
| 126 |
|
| 127 |
# Load k and stride from tokenizer_config.json
|
| 128 |
+
# tokenizer_config_file = os.path.join(pretrained_dir, "tokenizer_config.json")
|
| 129 |
+
tokenizer_config_file = hf_hub_download(repo_id=pretrained_dir, filename="tokenizer_config.json")
|
| 130 |
if os.path.exists(tokenizer_config_file):
|
| 131 |
with open(tokenizer_config_file, "r", encoding="utf-8") as f:
|
| 132 |
tokenizer_config = json.load(f)
|
tokenizer_config.json
CHANGED
|
@@ -17,6 +17,12 @@
|
|
| 17 |
"special": true
|
| 18 |
}
|
| 19 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
"clean_up_tokenization_spaces": true,
|
| 21 |
"mask_token": "[MASK]",
|
| 22 |
"model_max_length": 1000000000000.0,
|
|
|
|
| 17 |
"special": true
|
| 18 |
}
|
| 19 |
},
|
| 20 |
+
"auto_map": {
|
| 21 |
+
"AutoTokenizer": [
|
| 22 |
+
"tokenizer.KmerTokenizer",
|
| 23 |
+
null
|
| 24 |
+
]
|
| 25 |
+
},
|
| 26 |
"clean_up_tokenization_spaces": true,
|
| 27 |
"mask_token": "[MASK]",
|
| 28 |
"model_max_length": 1000000000000.0,
|