from typing import List

from surya.languages import CODE_TO_LANGUAGE, LANGUAGE_TO_CODE
from surya.model.recognition.tokenizer import _tokenize as lang_tokenize

from marker.ocr.tesseract import LANGUAGE_TO_TESSERACT_CODE, TESSERACT_CODE_TO_LANGUAGE
from marker.settings import settings


def langs_to_ids(langs: List[str]):
    """Return the language tokens produced by the Surya tokenizer for *langs*.

    Duplicates are removed before tokenizing. The tokenizer is called with an
    empty text string and only the second element of its result (the language
    tokens) is returned.
    """
    # NOTE(review): set() does not preserve input order, so the order in which
    # languages reach the tokenizer is not guaranteed -- confirm the tokenizer
    # does not depend on it.
    unique_langs = list(set(langs))
    _, lang_tokens = lang_tokenize("", unique_langs)
    return lang_tokens


def replace_langs_with_codes(langs: List[str]) -> List[str]:
    """Replace language names in *langs* with codes for the configured OCR engine.

    The list is modified in place and the same list object is returned.
    Entries that are not found in the engine's name-to-code table are left
    untouched (they may already be codes; ``validate_langs`` rejects anything
    that is still invalid).

    When ``settings.OCR_ENGINE`` is ``"surya"`` the name is title-cased before
    lookup in ``LANGUAGE_TO_CODE``; for any other engine the name is looked up
    as given in ``LANGUAGE_TO_TESSERACT_CODE``.
    """
    if settings.OCR_ENGINE == "surya":
        for i, lang in enumerate(langs):
            title = lang.title()
            if title in LANGUAGE_TO_CODE:
                langs[i] = LANGUAGE_TO_CODE[title]
    else:
        for i, lang in enumerate(langs):
            # Check membership in the same table that is indexed below.
            # Testing against the Surya table here could raise KeyError for
            # names Tesseract lacks, and skipped names only Tesseract knows.
            if lang in LANGUAGE_TO_TESSERACT_CODE:
                langs[i] = LANGUAGE_TO_TESSERACT_CODE[lang]
    return langs


def validate_langs(langs: List[str]) -> None:
    """Check that every entry of *langs* is a code known to the configured OCR engine.

    Raises:
        ValueError: on the first entry that is not a key of
            ``CODE_TO_LANGUAGE`` (when ``settings.OCR_ENGINE`` is ``"surya"``)
            or of ``TESSERACT_CODE_TO_LANGUAGE`` (any other engine).
    """
    if settings.OCR_ENGINE == "surya":
        for lang in langs:
            if lang not in CODE_TO_LANGUAGE:
                raise ValueError(f"Invalid language code {lang} for Surya OCR")
    else:
        for lang in langs:
            if lang not in TESSERACT_CODE_TO_LANGUAGE:
                raise ValueError(f"Invalid language code {lang} for Tesseract")