nioushasadjadi
committed on
Commit
·
92d46e2
1
Parent(s):
eb1e311
Adding automap to the tokenizer config
Browse files- tokenizer.py +8 -1
- tokenizer_config.json +6 -0
tokenizer.py
CHANGED
|
@@ -93,6 +93,12 @@ class KmerTokenizer(PreTrainedTokenizer):
|
|
| 93 |
"1": {"content": "[UNK]", "lstrip": False, "normalized": False, "rstrip": False, "single_word": False,
|
| 94 |
"special": True}
|
| 95 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 96 |
"clean_up_tokenization_spaces": True,
|
| 97 |
"mask_token": "[MASK]",
|
| 98 |
"model_max_length": 1e12, # Set a high number, or adjust as needed
|
|
@@ -119,7 +125,8 @@ class KmerTokenizer(PreTrainedTokenizer):
|
|
| 119 |
# stride = vocab_content["model"]["stride"]
|
| 120 |
|
| 121 |
# Load k and stride from tokenizer_config.json
|
| 122 |
-
tokenizer_config_file = os.path.join(pretrained_dir, "tokenizer_config.json")
|
|
|
|
| 123 |
if os.path.exists(tokenizer_config_file):
|
| 124 |
with open(tokenizer_config_file, "r", encoding="utf-8") as f:
|
| 125 |
tokenizer_config = json.load(f)
|
|
|
|
| 93 |
"1": {"content": "[UNK]", "lstrip": False, "normalized": False, "rstrip": False, "single_word": False,
|
| 94 |
"special": True}
|
| 95 |
},
|
| 96 |
+
"auto_map": {
|
| 97 |
+
"AutoTokenizer": [
|
| 98 |
+
"tokenizer.KmerTokenizer",
|
| 99 |
+
None
|
| 100 |
+
]
|
| 101 |
+
},
|
| 102 |
"clean_up_tokenization_spaces": True,
|
| 103 |
"mask_token": "[MASK]",
|
| 104 |
"model_max_length": 1e12, # Set a high number, or adjust as needed
|
|
|
|
| 125 |
# stride = vocab_content["model"]["stride"]
|
| 126 |
|
| 127 |
# Load k and stride from tokenizer_config.json
|
| 128 |
+
# tokenizer_config_file = os.path.join(pretrained_dir, "tokenizer_config.json")
|
| 129 |
+
tokenizer_config_file = hf_hub_download(repo_id=pretrained_dir, filename="tokenizer_config.json")
|
| 130 |
if os.path.exists(tokenizer_config_file):
|
| 131 |
with open(tokenizer_config_file, "r", encoding="utf-8") as f:
|
| 132 |
tokenizer_config = json.load(f)
|
tokenizer_config.json
CHANGED
|
@@ -17,6 +17,12 @@
|
|
| 17 |
"special": true
|
| 18 |
}
|
| 19 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
"clean_up_tokenization_spaces": true,
|
| 21 |
"mask_token": "[MASK]",
|
| 22 |
"model_max_length": 1000000000000.0,
|
|
|
|
| 17 |
"special": true
|
| 18 |
}
|
| 19 |
},
|
| 20 |
+
"auto_map": {
|
| 21 |
+
"AutoTokenizer": [
|
| 22 |
+
"tokenizer.KmerTokenizer",
|
| 23 |
+
null
|
| 24 |
+
]
|
| 25 |
+
},
|
| 26 |
"clean_up_tokenization_spaces": true,
|
| 27 |
"mask_token": "[MASK]",
|
| 28 |
"model_max_length": 1000000000000.0,
|