File size: 2,288 Bytes
7dce6dc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7fd4e54
 
1ca37ad
7fd4e54
 
 
7dce6dc
7fd4e54
 
 
 
 
 
7dce6dc
7fd4e54
 
 
7dce6dc
 
 
 
 
 
 
7fd4e54
7dce6dc
 
7fd4e54
7dce6dc
 
 
7fd4e54
7dce6dc
7fd4e54
 
 
 
 
 
 
 
 
 
 
 
 
7dce6dc
 
 
 
 
7fd4e54
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
"""Detect language via polyglot and fastlid."""
# pylint: disable=

from typing import Any, Callable, List, Optional

from polyglot.text import Detector
import polyglot.detect.base
from polyglot.detect.base import UnknownLanguage
from fastlid import fastlid

from logzero import logger

# Raise the threshold of polyglot's detector logger to ERROR so that records
# below that level are dropped (presumably to quiet its warnings on
# short/unreliable input -- TODO confirm).
polyglot.detect.base.logger.setLevel("ERROR")


def with_func_attrs(**attrs: Any) -> Callable:
    """Return a decorator that stores each keyword argument on the callable.

    Every ``name=value`` pair passed here becomes an attribute ``name`` with
    value ``value`` on whatever callable the returned decorator is applied
    to. The callable itself is handed back (it is not wrapped).
    """

    def decorator(func: Callable) -> Callable:
        # Attach the requested attributes one by one, then return the very
        # same object so that decoration is transparent to callers.
        for name in attrs:
            setattr(func, name, attrs[name])
        return func

    return decorator


# @with_func_attrs(set_languages=None)
# def detect(text: str) -> str:
def detect(text: str, set_languages: Optional[List[str]] = None) -> str:
    """Detect the language of *text* via fastlid, falling back to polyglot.

    fastlid is tried first; its answer is returned directly when the
    confidence is at least 0.3 or the detected language is "zh". Otherwise
    polyglot.text.Detector is consulted.

    Alternative in detec_alt.py

    Args:
        text: the text whose language is to be detected.
        set_languages: optional list/tuple of candidate language codes. It is
            assigned to ``fastlid.set_languages`` and, on the polyglot
            fallback path, used to filter polyglot's candidates. ``None``
            (the default) means no restriction.

    Returns:
        A language code. On the polyglot path with ``set_languages`` given,
        the first polyglot candidate contained in ``set_languages`` is
        returned, or "en" when none matches.

    Side effects:
        ``detect.lang_conf`` is set to fastlid's ``(lang, conf)`` pair, and
        is overwritten with polyglot's list of ``(code, confidence)`` pairs
        when the polyglot fallback succeeds.
    """
    # if not text.strip(): return "en"
    fastlid.set_languages = set_languages
    lang, conf = fastlid(text)
    detect.lang_conf = lang, conf
    if conf >= 0.3 or lang == "zh":
        return lang

    try:
        langs = [(elm.code[:2], elm.confidence) for elm in Detector(text).languages]
    except UnknownLanguage:
        # Fall back to the caller's first candidate; guard against an empty
        # or missing list so the handler itself cannot raise IndexError.
        fallback = set_languages[0] if set_languages else "en"
        logger.warning(
            " UnknownLanguage exception: probably snippet too short, setting to %s",
            fallback,
        )
        langs = [(fallback, 0)]
    except Exception as exc:
        # Best effort: any other detector failure degrades to English.
        logger.error(exc)
        langs = [("en", 0)]
    else:
        detect.lang_conf = langs

    if set_languages is None:
        # No restriction: return the first entry's lang ("en" if polyglot
        # yielded no candidates at all).
        return langs[0][0] if langs else "en"

    # set_languages is set: validate it before using it for membership tests.
    if not isinstance(set_languages, (list, tuple)):
        logger.warning("set_languages (%s) ought to be a list/tuple", set_languages)

    # Pick the first polyglot candidate that the caller allows.
    for code, _ in langs:
        if code in set_languages:
            return code

    return "en"