"""Detect language via polyglot and fastlid.""" # pylint: disable= from typing import Any, Callable, List, Optional from polyglot.text import Detector import polyglot.detect.base from polyglot.detect.base import UnknownLanguage from fastlid import fastlid from logzero import logger polyglot.detect.base.logger.setLevel("ERROR") def with_func_attrs(**attrs: Any) -> Callable: """Define func_attrs.""" def with_attrs(fct: Callable) -> Callable: for key, val in attrs.items(): setattr(fct, key, val) return fct return with_attrs # @with_func_attrs(set_languages=None) # def detect(text: str) -> str: def detect(text: str, set_languages: Optional[List[str]] = None) -> str: """Detect language via polyglot and fastlid. check first with fastlid, if conf < 0.3, check with polyglot.text.Detector Alternative in detec_alt.py """ # if not text.strip(): return "en" fastlid.set_languages = set_languages lang, conf = fastlid(text) detect.lang_conf = lang, conf if conf >= 0.3 or lang in ["zh"]: return lang try: langs = [(elm.code[:2], elm.confidence) for elm in Detector(text).languages] detect.lang_conf = langs # lang, conf = _[0] except UnknownLanguage: if set_languages is None: def_lang = "en" else: # def_lang = set_languages[-1] def_lang = set_languages[0] logger.warning(" UnknownLanguage exception: probably snippet too short, setting to %s", def_lang) langs = [(def_lang, 0)] except Exception as exc: logger.error(exc) langs = [("en", 0)] del conf # return first enrty's lang if set_languages is None: def_lang = langs[0][0] else: def_lang = "en" # pick the first in Detector(text).languages # just to silence pyright # set_languages_: List[str] = [""] if set_languages is None else set_languages for elm in langs: if elm[0] in set_languages: # type: ignore def_lang = elm[0] break # set_languages is set if not isinstance(set_languages, (list, tuple)): logger.warning("set_languages (%s) ought to be a list/tuple") return def_lang