Spaces:
Paused
Paused
| # Copyright (c) Facebook, Inc. and its affiliates. | |
| # All rights reserved. | |
| # | |
| # This source code is licensed under the license found in the | |
| # LICENSE file in the root directory of this source tree. | |
| import logging | |
| import re | |
| import typing as tp | |
| from pathlib import Path | |
| from botok.tokenizers import sentencetokenizer as bod_sent_tok | |
| # Indic NLP | |
| from indicnlp import common as indic_common | |
| from indicnlp import loader as indic_loader | |
| from indicnlp.normalize.indic_normalize import IndicNormalizerFactory | |
| from indicnlp.tokenize import sentence_tokenize as indic_sent_tok | |
| from khmernltk import sentence_tokenize as khm_sent_tok | |
| # pythainlp for Thai | |
| # Seahorse for Indonesian, Thai, Vietnamese | |
| # botok for tibetan | |
| # Spacy for | |
| # various tool-kits | |
| from laonlp.tokenize import sent_tokenize as lao_sent_tok | |
| # --- sentence splitters | |
| # Moses-style | |
| from sentence_splitter import SentenceSplitter | |
# Path handed to the Indic NLP toolkit's set_resources_path(). While this is
# falsy, get_split_algo() skips the toolkit initialisation entirely.
INDIC_NLP_RESOURCES = None  # apparently not needed for splitting and normalization
# Module-wide logger, registered under the fixed name "sentence_split".
logger = logging.getLogger("sentence_split")
# Maps a "<language>_<Script>" code to the language code used by the
# per-algorithm tables in this module (LANGS_MOSES, LANGS_INDIC, ...).
# get_split_algo() applies this mapping first; codes that are not listed
# here are used unchanged.
#
# NOTE: "arb_Arab" used to be listed twice ("ara" and then "ara_Arab"). In a
# dict literal the later entry silently wins, so only the effective mapping
# ("ara_Arab") is kept and the lookup result is unchanged.
split_lang_code_map = {
    "ace_Arab": "ace_Arab",
    "ace_Latn": "ace_Latn",
    "acm_Arab": "acm",
    "acq_Arab": "acq",
    "aeb_Arab": "aeb",
    "afr_Latn": "afr",
    "ajp_Arab": "ajp",
    "aka_Latn": "aka",
    "amh_Ethi": "amh",
    "apc_Arab": "apc",
    "arb_Arab": "ara_Arab",
    "arb_Latn": "ara_Latn",
    "ars_Arab": "ars",
    "ary_Arab": "ary",
    "arz_Arab": "arz",
    "asm_Beng": "asm",
    "ast_Latn": "ast",
    "awa_Deva": "awa",
    "ayr_Latn": "ayr",
    "azb_Arab": "azb",
    "azj_Latn": "azj",
    "bak_Cyrl": "bak",
    "bam_Latn": "bam",
    "ban_Latn": "ban",
    "bel_Cyrl": "bel",
    "bem_Latn": "bem",
    "ben_Beng": "ben",
    "bho_Deva": "bho",
    "bjn_Arab": "bjn_Arab",
    "bjn_Latn": "bjn_Latn",
    "bod_Tibt": "bod",
    "bos_Latn": "bos",
    "bug_Latn": "bug",
    "bul_Cyrl": "bul",
    "cat_Latn": "cat",
    "ceb_Latn": "ceb",
    "ces_Latn": "ces",
    "cjk_Latn": "cjk",
    "ckb_Arab": "ckb",
    "crh_Latn": "crh_Latn",
    "cym_Latn": "cym",
    "dan_Latn": "dan",
    "deu_Latn": "deu",
    "dik_Latn": "dik",
    "diq_Latn": "diq",
    "dyu_Latn": "dyu",
    "dzo_Tibt": "dzo",
    "ell_Grek": "ell",
    "eng_Latn": "eng",
    "epo_Latn": "epo",
    "est_Latn": "est",
    "eus_Latn": "eus",
    "ewe_Latn": "ewe",
    "fao_Latn": "fao",
    "pes_Arab": "fas",
    "fij_Latn": "fij",
    "fin_Latn": "fin",
    "fon_Latn": "fon",
    "fra_Latn": "fra",
    "fur_Latn": "fur",
    "fuv_Latn": "fuv",
    "gla_Latn": "gla",
    "gle_Latn": "gle",
    "glg_Latn": "glg",
    "grn_Latn": "grn",
    "guj_Gujr": "guj",
    "hat_Latn": "hat",
    "hau_Latn": "hau",
    "heb_Hebr": "heb",
    "hin_Deva": "hin",
    "hne_Deva": "hne",
    "hrv_Latn": "hrv",
    "hun_Latn": "hun",
    "hye_Armn": "hye",
    "ibo_Latn": "ibo",
    "ilo_Latn": "ilo",
    "ind_Latn": "ind",
    "isl_Latn": "isl",
    "ita_Latn": "ita",
    "jav_Latn": "jav",
    "jpn_Jpan": "jpn",
    "kab_Latn": "kab",
    "kac_Latn": "kac",
    "kam_Latn": "kam",
    "kan_Knda": "kan",
    "kas_Arab": "kas_Arab",
    "kas_Deva": "kas_Deva",
    "kat_Geor": "kat",
    "knc_Arab": "kau_Arab",
    "knc_Latn": "kau_Latn",
    "kaz_Cyrl": "kaz",
    "kbp_Latn": "kbp",
    "kea_Latn": "kea",
    "khm_Khmr": "khm",
    "kik_Latn": "kik",
    "kin_Latn": "kin",
    "kir_Cyrl": "kir",
    "kmb_Latn": "kmb",
    "kon_Latn": "kon",
    "kor_Hang": "kor",
    "kmr_Latn": "kur",
    "lao_Laoo": "lao",
    "lvs_Latn": "lav",
    "lij_Latn": "lij",
    "lim_Latn": "lim",
    "lin_Latn": "lin",
    "lit_Latn": "lit",
    "lmo_Latn": "lmo",
    "ltg_Latn": "ltg",
    "ltz_Latn": "ltz",
    "lua_Latn": "lua",
    "lug_Latn": "lug",
    "luo_Latn": "luo",
    "lus_Latn": "lus",
    "mag_Deva": "mag",
    "mai_Deva": "mai",
    "mal_Mlym": "mal",
    "mar_Deva": "mar",
    "min_Arab": "min_Arab",
    "min_Latn": "min_Latn",
    "mkd_Cyrl": "mkd",
    "plt_Latn": "mlg",
    "mlt_Latn": "mlt",
    "khk_Cyrl": "mon",
    "mos_Latn": "mos",
    "mri_Latn": "mri",
    "zsm_Latn": "msa",
    "mya_Mymr": "mya",
    "nld_Latn": "nld",
    "nno_Latn": "nno",
    "nob_Latn": "nob",
    "npi_Deva": "npi",
    "nso_Latn": "nso",
    "nus_Latn": "nus",
    "nya_Latn": "nya",
    "oci_Latn": "oci",
    "gaz_Latn": "orm",
    "ory_Orya": "ory",
    "pag_Latn": "pag",
    "pan_Guru": "pan",
    "pap_Latn": "pap",
    "pol_Latn": "pol",
    "por_Latn": "por",
    "prs_Arab": "prs",
    "pbt_Arab": "pus",
    "quy_Latn": "que",
    "ron_Latn": "ron",
    "run_Latn": "run",
    "rus_Cyrl": "rus",
    "sag_Latn": "sag",
    "san_Deva": "san",
    "sat_Olck": "sat",
    "scn_Latn": "scn",
    "shn_Mymr": "shn",
    "sin_Sinh": "sin",
    "slk_Latn": "slk",
    "slv_Latn": "slv",
    "smo_Latn": "smo",
    "sna_Latn": "sna",
    "snd_Arab": "snd",
    "som_Latn": "som",
    "sot_Latn": "sot",
    "spa_Latn": "spa",
    "als_Latn": "sqi",
    "srd_Latn": "srd",
    "srp_Cyrl": "srp_Cyrl",
    "ssw_Latn": "ssw",
    "sun_Latn": "sun",
    "swe_Latn": "swe",
    "swh_Latn": "swh",
    "szl_Latn": "szl",
    "tam_Taml": "tam",
    "tat_Cyrl": "tat_Cyrl",
    "tel_Telu": "tel",
    "tgk_Cyrl": "tgk",
    "tgl_Latn": "tgl",
    "tha_Thai": "tha",
    "tir_Ethi": "tir",
    "taq_Latn": "tmh_Latn",
    "taq_Tfng": "tmh_Tfng",
    "ton_Latn": "ton",
    "tpi_Latn": "tpi",
    "tsn_Latn": "tsn",
    "tso_Latn": "tso",
    "tuk_Latn": "tuk",
    "tum_Latn": "tum",
    "tur_Latn": "tur",
    "twi_Latn": "twi",
    "tzm_Tfng": "tzm",
    "uig_Arab": "uig",
    "ukr_Cyrl": "ukr",
    "umb_Latn": "umb",
    "urd_Arab": "urd",
    "uzn_Latn": "uzb",
    "vec_Latn": "vec",
    "vie_Latn": "vie",
    "war_Latn": "war",
    "wol_Latn": "wol",
    "xho_Latn": "xho",
    "ydd_Hebr": "yid",
    "yor_Latn": "yor",
    "yue_Hant": "yue",
    "zho_Hans": "zho_Hans",
    "zho_Hant": "zho_Hant",
    "zul_Latn": "zul",
}
# ----------------------------------
# Supported tokenization algorithms

# Languages with dedicated Moses-style rules: three-letter language code ->
# two-letter code passed to SentenceSplitter(language=...). get_split_algo()
# falls back to "en" for any language missing from this table.
LANGS_MOSES = {
    "cat": "ca",
    "ces": "cs",
    "dan": "da",
    "nld": "nl",
    "eng": "en",
    "fin": "fi",
    "fra": "fr",
    "deu": "de",
    "ell": "el",
    "hun": "hu",
    "isl": "is",
    "ita": "it",
    "lav": "lv",
    "lit": "lt",
    "nob": "no",
    "pol": "pl",
    "por": "pt",
    "ron": "ro",
    "rus": "ru",
    "slk": "sk",
    "slv": "sl",
    "spa": "es",
    "swe": "sv",
    "tur": "tr",
}
# Language tables for the toolkit-backed splitters. Of these, only
# LANGS_KHMER is consulted when get_split_algo() picks a default algorithm;
# "laonlp" and "bodnlp" have to be requested explicitly.
LANGS_LAONLP = {"lao": "lao"}
LANGS_KHMER = {"khm": "khm"}
LANGS_BODNLP = {
    "bod": "bod",
    "dzo": "dzo",
}  # languages with tibetan script
# ----------------------------------------------
# Languages routed to the Indic NLP splitter: language code (as produced by
# split_lang_code_map, hence entries such as "kas_Deva") -> the language code
# that get_split_algo() passes to IndicNormalizerFactory.get_normalizer() and
# to sentence_split(lang=...). Several languages reuse the "hi" rules.
LANGS_INDIC = {
    "asm": "as",
    "awa": "hi",
    "ben": "bn",
    "bho": "hi",
    "brx": "bD",
    "gom": "xx",
    "guj": "gu",
    "hin": "hi",
    "hne": "hi",
    "kan": "kn",
    "kas": "hi",
    "kas_Deva": "hi",
    "kok": "kK",
    "mni": "bn",  # our meitei is in bengali script, so swapped it to bengali here
    "mag": "hi",
    "mai": "hi",
    "mal": "ml",
    "mar": "mr",
    "npi": "ne",
    "ory": "or",
    "pan": "pa",
    "san": "sa",
    "snd": "sd",
    "tam": "ta",
    "tel": "te",
    "urd": "ur",
}
| # ---------------------------------------------- | |
# Languages handled by the rule-based Ge'ez splitter below.
LANGS_GEEZ = {"amh": "amh", "tir": "tir"}


def split_geez(line: str) -> tp.Iterable[str]:
    """Yield the sentences of a line of Ge'ez-script text.

    The line is first normalised: a doubled "፡" becomes the full stop "።",
    and a "•" that follows an end-of-sentence mark plus a space is dropped.
    Each yielded sentence is a run of non-terminator characters followed by
    at most one terminator; surrounding whitespace is left untouched.
    """
    normalized = line.replace("፡፡", "։".replace("։", "።"))
    # drop "•" when an end-of-sentence marker already precedes it
    for marked, bare in (("። •", "።"), ("? •", "?"), ("! •", "!"), (". •", ".")):
        normalized = normalized.replace(marked, bare)
    yield from re.findall(
        r"[^።•!?\!\?\.]+[።•!?।৷\?\!\.]?", normalized, flags=re.U
    )
| # ---------------------------------------------- | |
# Santali written in Ol Chiki is "sat" (see "sat_Olck" in split_lang_code_map).
# The table used to be keyed on "san", which is Sanskrit and is already
# handled through LANGS_INDIC.
LANGS_OLCHIKI = {"sat": "sat"}


def split_olchiki(line: str) -> tp.Iterable[str]:
    """Split Santali (Ol Chiki script) text into sentences.

    Yields each run of non-terminator characters together with the single
    terminator that follows it, if any. Whitespace is not stripped, so a
    sentence may start with the space that followed the previous one.

    NOTE(review): inside the brackets "|" is a literal character, so an ASCII
    pipe also ends a sentence here -- confirm this is intended.
    """
    yield from re.findall(r"[^᱾|᱿!?\!\?]+[᱾|᱿!?\?\!]?", line, flags=re.U)


# test sentence: ᱱᱤᱭᱟᱹ ᱣᱤᱠᱤᱯᱤᱰᱤᱭᱟ ᱫᱚ ᱥᱟᱱᱛᱟᱲᱤ ᱛᱮ ᱚᱞ ᱟᱠᱟᱱᱟ᱾ ᱚᱨᱦᱚᱸ ᱮᱴᱟᱜ ᱯᱟᱹᱨᱥᱤᱛᱮ ᱦᱚᱸ ᱟᱭᱢᱟ ᱣᱤᱠᱤᱯᱤᱰᱤᱭᱟ ᱢᱮᱱᱟᱜᱼᱟ ᱾ ᱱᱚᱸᱰᱮ ᱠᱤᱪᱷᱩ ᱛᱟᱹᱞᱠᱟᱹ ᱮᱢ ᱦᱩᱭᱱᱟ ᱾
# splits three times
| # ---------------------------------------------- | |
# Languages handled by the rule-based Burmese-script splitter below.
LANGS_BURMESE = {"mya": "mya", "shn": "shn"}


def split_burmese(line: str) -> tp.Iterable[str]:
    """Yield the sentences of a line of Burmese-script text.

    A closing quote that directly follows the full stop "။" must stay with
    its sentence, so the pair is temporarily rewritten with a placeholder
    in front of the full stop and restored on every yielded sentence.
    """
    guarded = line.replace("။”", "APOS။")
    pieces = re.findall(r"[^။!?\!\?\.]+[။!?।৷\?\!\.]?", guarded, flags=re.U)
    yield from (piece.replace("APOS။", "။”") for piece in pieces)
| # ---------------------------------- | |
def get_split_algo(
    lang: str, split_algo: str
) -> tp.Optional[tp.Callable[[str], tp.Iterable[str]]]:
    """Return a function that splits one line of text into sentences.

    Args:
        lang: language code. Codes listed in ``split_lang_code_map`` are
            translated first; any other value is used as given.
        split_algo: one of "default", "none", "moses", "indic", "laonlp",
            "khmer", "bodnlp", "geez", "burmese" or "olchiki". "default"
            picks an algorithm from the language tables of this module and
            ends up with "moses" when no table lists the language.

    Returns:
        A callable taking a line and returning an iterable of sentences, or
        ``None`` (after logging an error) when ``split_algo`` is unknown.
    """
    lang = split_lang_code_map.get(lang, lang)

    # get default algorithm if requested
    if split_algo == "default":
        # use best algorithm in function of language; first matching table wins
        for supported, algo in (
            (LANGS_MOSES, "moses"),
            (LANGS_INDIC, "indic"),
            (LANGS_GEEZ, "geez"),
            (LANGS_KHMER, "khmer"),
            (LANGS_BURMESE, "burmese"),
        ):
            if lang in supported:
                split_algo = algo
                break
        else:
            # use Moses by default (which likely will fall-back to English)
            split_algo = "moses"
        logger.info(f" - default algorithm for {lang} is {split_algo}")

    if split_algo == "none" or lang == "TODO":
        logger.info(" - no sentence splitting")
        return lambda line: [line]

    if split_algo == "moses":
        moses_lang = LANGS_MOSES.get(lang)
        if moses_lang is not None:
            logger.info(
                f" - Moses sentence splitter: using rules for '{moses_lang}'"
            )
        else:
            moses_lang = "en"
            # Report the requested language, not the fallback, as the subject.
            logger.info(
                f" - Moses sentence splitter for {lang}: "
                f"falling back to {moses_lang} rules"
            )
        splitter = SentenceSplitter(language=moses_lang)
        # non_breaking_prefix_file=non_breaking_prefix_file
        return splitter.split

    if split_algo == "indic":
        # initialize toolkit (apparently not needed for sentence segmentation)
        if INDIC_NLP_RESOURCES:
            logger.info(" - Initialize Indic NLP toolkit")
            indic_common.set_resources_path(INDIC_NLP_RESOURCES)
            indic_loader.load()
        indic_lang = LANGS_INDIC.get(lang)
        if indic_lang is not None:
            logger.info(
                f" - Indic sentence splitter: using rules for '{indic_lang}'"
            )
        else:
            indic_lang = "hi"
            # Report the requested language, not the fallback, as the subject.
            logger.info(
                f" - Indic sentence splitter for {lang}: "
                f"falling back to {indic_lang} rules"
            )
        # setup normalizer
        indic_normalizer = IndicNormalizerFactory().get_normalizer(indic_lang)

        def split_indic(line: str) -> tp.Iterable[str]:
            """Split Indian text into sentences using Indic NLP tool."""
            line = indic_normalizer.normalize(line)
            yield from indic_sent_tok.sentence_split(line, lang=indic_lang)

        return split_indic

    if split_algo == "laonlp":
        logger.info(f" - LaoNLP sentence splitter applied to '{lang}'")
        return lao_sent_tok

    if split_algo == "khmer":
        logger.info(f" - Khmer NLTK sentence splitter applied to '{lang}'")
        return khm_sent_tok

    if split_algo == "bodnlp":
        logger.info(f" - Tibetan NLTK sentence splitter applied to '{lang}'")
        # NOTE(review): bod_sent_tok is bound by
        # `from botok.tokenizers import sentencetokenizer`, which looks like a
        # module rather than a function taking a str -- confirm it is callable.
        return bod_sent_tok

    if split_algo == "geez":
        logger.info(f" - Ge'ez rule-based sentence splitter applied to '{lang}'")
        return split_geez

    if split_algo == "burmese":
        logger.info(f" - Burmese rule-based sentence splitter applied to '{lang}'")
        return split_burmese

    if split_algo == "olchiki":
        # Only available on explicit request; "default" never selects it.
        logger.info(f" - Ol Chiki rule-based sentence splitter applied to '{lang}'")
        return split_olchiki

    logger.error(f"Unknown splitting algorithm {split_algo}")
    return None