from typing import Optional, List, Set, Union, Tuple

from huggingface_hub import hf_hub_download
import gradio as gr
import fasttext

# Texts with fewer whitespace-separated words than this are reported as
# undetermined ("und") without consulting the model.
MIN_WORDS = 4

model = fasttext.load_model(hf_hub_download("NbAiLab/nb-nordic-lid", "model.bin"))
# The last three characters of every model label are used as the language code.
model_labels = set(label[-3:] for label in model.get_labels())

# Display names for the language codes shown in the UI.
language_dict = {
    'dan': 'Danish',
    'eng': 'English',
    'fao': 'Faroese',
    'fin': 'Finnish',
    'isl': 'Icelandic',
    'nno': 'Norwegian Nynorsk',
    'nob': 'Norwegian Bokmål',
    'sma': 'Southern Sami',
    'sme': 'Northern Sami',
    'smj': 'Lule Sami',
    'smn': 'Inari Sami',
    'sms': 'Skolt Sami',
    'swe': 'Swedish',
    'und': 'Undetermined',
}


def detect_lang(
    text: str,
    langs: Optional[Union[List, Set]] = None,
    threshold: float = -1.0,
    return_proba: bool = False,
) -> Union[str, List[Tuple[str, float]]]:
    """Detect the language of ``text`` with the loaded fastText model.

    Args:
        text: The text to identify. Newlines are replaced by spaces before
            prediction.
        langs: Optional list or set of three-letter language codes to restrict
            the result to. When empty or ``None``, every code found in the
            model's labels is allowed.
        threshold: Minimum probability passed to the model's ``predict`` call.
            Defaults to ``-1.0``.
        return_proba: When ``True``, return the predictions together with
            their probabilities instead of only the first code.

    Returns:
        With ``return_proba=False``: the language code of the first matching
        prediction, or ``"und"`` when the text has fewer than ``MIN_WORDS``
        words or no prediction matches.
        With ``return_proba=True``: a list of ``(code, probability)`` tuples
        in the order the model returned them, or ``[("und", 1.0)]`` in the
        undetermined cases.
    """
    # fastText's predict() refuses input containing "\n", so flatten the text
    # here rather than relying on every caller to do it.
    text = text.replace("\n", " ")

    if len(text.split()) < MIN_WORDS:
        return [("und", 1.0)] if return_proba else "und"

    allowed = set(langs) if langs else model_labels

    # k=-1 asks for every label that clears the threshold.
    labels, probabilities = model.predict(text, threshold=threshold, k=-1)
    predictions = [
        # Clamp to 1.0 and convert to a plain Python float.
        (label[-3:], float(min(probability, 1.0)))
        for label, probability in zip(labels, probabilities)
        if label[-3:] in allowed
    ]

    if not predictions:
        return [("und", 1.0)] if return_proba else "und"
    return predictions if return_proba else predictions[0][0]


def identify(text, threshold):
    """Gradio callback: map language display names to probabilities.

    Args:
        text: The text entered by the user.
        threshold: Probability threshold as a percentage (0-100); it is
            divided by 100 before being passed to ``detect_lang``.

    Returns:
        A dict from language display name to probability. A code missing from
        ``language_dict`` is shown as the raw code instead of raising.
    """
    predictions = detect_lang(
        text,
        threshold=threshold / 100.0,
        return_proba=True,
    )
    return {language_dict.get(lang, lang): proba for lang, proba in predictions}


iface = gr.Interface(
    title="NB Nordic Language Identification",
    description=(
        "This demo uses the "
        "[NB-Nordic-LID](https://huggingface.co/NbAiLab/nb-nordic-lid) "
        "model to classify a given text into one of the 12 Nordic languages "
        "supported. At least 3 or 4 words are needed to identify the language."
    ),
    fn=identify,
    inputs=[
        gr.Textbox(label="Text to identify language for"),
        gr.Slider(0, 100, value=80, step=1, label="Probability threshold (%)"),
    ],
    outputs="label",
    examples=[
        ["Jeg heter Svein Arne", 80],
        ["Dán lágan li biejadusá dárogiela, rijkalasj unneplågogielaj ja dáro siejvvemgiela birra", 80],
    ],
)
iface.launch()