File size: 3,509 Bytes
44d8b8c
cbd7b15
fe1893c
cbd7b15
 
 
fe1893c
8bfc488
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fe1893c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
07568d4
 
fe1893c
 
 
 
 
 
 
 
 
 
 
 
 
 
cbd7b15
 
528f383
 
cbd7b15
8bfc488
 
07568d4
8bfc488
98f3f94
 
 
9d37ac1
 
98f3f94
 
cbd7b15
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
from typing import Optional, List, Set, Union, Tuple
from huggingface_hub import hf_hub_download
import gradio as gr
import fasttext

# Fetch the NB-Nordic-LID fastText weights from the Hugging Face Hub and load
# them once, at import time, so every request reuses the same model object.
model_path = hf_hub_download("NbAiLab/nb-nordic-lid", "model.bin")
model = fasttext.load_model(model_path)

# The last three characters of every model label, used as the language code
# (presumably labels look like `__label__nob` -- confirm against the model).
model_labels = {label[-3:] for label in model.get_labels()}

# Three-letter language code -> English language name.
language_dict = {
    "dan": "Danish",
    "eng": "English",
    "fao": "Faroese",
    "fin": "Finnish",
    "isl": "Icelandic",
    "nno": "Norwegian Nynorsk",
    "nob": "Norwegian Bokmål",
    "sma": "Southern Sami",
    "sme": "Northern Sami",
    "smj": "Lule Sami",
    "smn": "Inari Sami",
    "sms": "Skolt Sami",
    "swe": "Swedish",
    "und": "Undetermined",
}

def detect_lang(
    text: str,
    langs: Optional[Union[List, Set]] = None,
    threshold: float = -1.0,
    return_proba: bool = False,
) -> Union[str, List[Tuple[str, float]]]:
    """
    Detect the language of a text with the module-level fastText model.

    Texts with fewer than four whitespace-separated words are never sent to
    the model; they are reported as undetermined ("und"). Otherwise every
    label the model returns at or above ``threshold`` is reduced to its
    trailing three-letter code and kept only if that code is in ``langs``.

    Args:
    - text (str): The text to detect the language of. Newlines are replaced
        by spaces before prediction, so multi-line input is accepted.
    - langs (List or Set, optional): The language codes that may be
        reported. When omitted or empty, all codes found in the model's
        labels are allowed.
    - threshold (float, optional): The minimum probability passed on to the
        model's ``predict`` call. Defaults to `-1.0`.
    - return_proba (bool, optional): Whether to return the candidates
        together with their probabilities instead of a single code.
        Defaults to `False`.

    Returns:
    str or List[Tuple[str, float]]: With ``return_proba=False``, the code of
        the first accepted prediction, or "und" if there is none. With
        ``return_proba=True``, a list of ``(code, probability)`` tuples in
        the order the model returned them (expected to be most probable
        first), or ``[("und", 1.0)]`` if there is none. Probabilities are
        capped at 1.0.
    """
    # Fallback result, shaped like whichever return form the caller asked for.
    undetermined = [("und", 1.0)] if return_proba else "und"

    # Too short to judge: skip the model entirely.
    if len(text.split()) < 4:
        return undetermined

    allowed = set(langs) if langs else model_labels

    # fastText's predict() processes one line at a time and rejects input
    # containing "\n", so flatten the text instead of failing on it.
    single_line = text.replace("\n", " ")
    raw_prediction = model.predict(single_line, threshold=threshold, k=-1)

    predictions = [
        # Cap at 1.0 so a reported score never exceeds a valid probability.
        (label[-3:], min(probability, 1.0))
        for label, probability in zip(*raw_prediction)
        if label[-3:] in allowed
    ]
    if not predictions:
        return undetermined
    return predictions if return_proba else predictions[0][0]


def identify(text, threshold):
    """
    Map language names to probabilities for the given text.

    Args:
    - text: The text to classify; newlines are replaced by spaces first.
    - threshold: The probability threshold as a percentage; it is divided
        by 100 before being handed to `detect_lang`.

    Returns:
    dict: One entry per prediction from `detect_lang`, keyed by the name
        looked up in `language_dict` and valued by its probability.
    """
    flattened = text.replace("\n", " ")
    scored = detect_lang(
        flattened,
        threshold=threshold / 100.0,
        return_proba=True,
    )
    result = {}
    for code, probability in scored:
        result[language_dict[code]] = probability
    return result

# Input widgets: the text to classify and the threshold as a percentage,
# which `identify` receives as its two positional arguments.
text_box = gr.Textbox(label="Text to identify language for")
threshold_slider = gr.Slider(0, 100, value=80, step=1, label="Probability threshold (%)")

# Pre-filled (text, threshold) pairs offered as examples in the UI.
example_rows = [
    ["Jeg heter Svein Arne", 80],
    ["Dán lágan li biejadusá dárogiela, rijkalasj unneplågogielaj ja dáro siejvvemgiela birra", 80],
]

iface = gr.Interface(
    fn=identify,
    inputs=[text_box, threshold_slider],
    outputs="label",
    examples=example_rows,
    title="NB Nordic Language Identification",
    description="""This demo uses the [NB-Nordic-LID](https://huggingface.co/NbAiLab/nb-nordic-lid) model to classify a given text into one of the 12 Nordic languages supported. <b>At least 3 or 4 words are needed to identify the language.</b>""",
)
# Start the web app as soon as the module is executed.
iface.launch()