# Lookup table from a language identifier to a spaCy pipeline name.
# Two kinds of keys are present:
#   * short language codes ("en", "it", ...), each pointing at an "_sm" pipeline;
#   * full pipeline names, each pointing at itself (added by the update below).
SPACY_LANGUAGE_MAPPER = {
    "ca": "ca_core_news_sm",
    "da": "da_core_news_sm",
    "de": "de_core_news_sm",
    "el": "el_core_news_sm",
    "en": "en_core_web_sm",
    "es": "es_core_news_sm",
    "fr": "fr_core_news_sm",
    "it": "it_core_news_sm",
    "ja": "ja_core_news_sm",
    "lt": "lt_core_news_sm",
    "mk": "mk_core_news_sm",
    "nb": "nb_core_news_sm",
    "nl": "nl_core_news_sm",
    "pl": "pl_core_news_sm",
    "pt": "pt_core_news_sm",
    "ro": "ro_core_news_sm",
    "ru": "ru_core_news_sm",
    "xx": "xx_sent_ud_sm",
    "zh": "zh_core_web_sm",
}

# Every full pipeline name is registered as an identity entry, so a caller may
# pass either a language code or an explicit pipeline name as the key.
SPACY_LANGUAGE_MAPPER.update(
    (pipeline, pipeline)
    for pipeline in (
        # Catalan
        "ca_core_news_sm",
        "ca_core_news_md",
        "ca_core_news_lg",
        "ca_core_news_trf",
        # Danish
        "da_core_news_sm",
        "da_core_news_md",
        "da_core_news_lg",
        "da_core_news_trf",
        # German
        "de_core_news_sm",
        "de_core_news_md",
        "de_core_news_lg",
        "de_dep_news_trf",
        # Greek
        "el_core_news_sm",
        "el_core_news_md",
        "el_core_news_lg",
        # English
        "en_core_web_sm",
        "en_core_web_md",
        "en_core_web_lg",
        "en_core_web_trf",
        # Spanish
        "es_core_news_sm",
        "es_core_news_md",
        "es_core_news_lg",
        "es_dep_news_trf",
        # French
        "fr_core_news_sm",
        "fr_core_news_md",
        "fr_core_news_lg",
        "fr_dep_news_trf",
        # Italian
        "it_core_news_sm",
        "it_core_news_md",
        "it_core_news_lg",
        # Japanese
        "ja_core_news_sm",
        "ja_core_news_md",
        "ja_core_news_lg",
        "ja_dep_news_trf",
        # Lithuanian
        "lt_core_news_sm",
        "lt_core_news_md",
        "lt_core_news_lg",
        # Macedonian
        "mk_core_news_sm",
        "mk_core_news_md",
        "mk_core_news_lg",
        # Norwegian Bokmal
        "nb_core_news_sm",
        "nb_core_news_md",
        "nb_core_news_lg",
        # Dutch
        "nl_core_news_sm",
        "nl_core_news_md",
        "nl_core_news_lg",
        # Polish
        "pl_core_news_sm",
        "pl_core_news_md",
        "pl_core_news_lg",
        # Portuguese
        "pt_core_news_sm",
        "pt_core_news_md",
        "pt_core_news_lg",
        # Romanian
        "ro_core_news_sm",
        "ro_core_news_md",
        "ro_core_news_lg",
        # Russian
        "ru_core_news_sm",
        "ru_core_news_md",
        "ru_core_news_lg",
        # Multi-language
        "xx_ent_wiki_sm",
        "xx_sent_ud_sm",
        # Chinese
        "zh_core_web_sm",
        "zh_core_web_md",
        "zh_core_web_lg",
        "zh_core_web_trf",
    )
)

# NOTE(review): these imports deliberately stay below the mapping, as in the
# original. Presumably the tokenizer modules import SPACY_LANGUAGE_MAPPER from
# this package, so it must be fully populated first -- confirm before moving.
from relik.inference.data.tokenizers.regex_tokenizer import RegexTokenizer
from relik.inference.data.tokenizers.spacy_tokenizer import SpacyTokenizer
from relik.inference.data.tokenizers.whitespace_tokenizer import WhitespaceTokenizer