language-identification

Sleeping

File size: 10,883 Bytes

# coding=utf-8
# Copyright 2023 The GlotLID Authors.
# Lint as: python3


# This space is built based on AMR-KELEG/ALDi space.
# GlotLID Space

import string
import constants
import pandas as pd
import streamlit as st
from huggingface_hub import hf_hub_download
from GlotScript import get_script_predictor
import matplotlib.pyplot as plt
import fasttext
import altair as alt
from altair import X, Y, Scale
import base64
import json
import os
import re

@st.cache_resource
def load_sp():
    sp = get_script_predictor()
    return sp


sp = load_sp()

def get_script(text):
    """Get the writing systems of given text.

    Args:
        text: The text to be preprocessed.

    Returns:
        The main script and list of all scripts.
    """
    res = sp(text)
    main_script = res[0] if res[0] else 'Zyyy'
    all_scripts_dict = res[2]['details']
    if all_scripts_dict:
        all_scripts = list(all_scripts_dict.keys())
    else:
        all_scripts = 'Zyyy'

    for ws in all_scripts:
        if ws in ['Kana', 'Hrkt', 'Hani', 'Hira']:
            all_scripts.append('Jpan')

    all_scripts = list(set(all_scripts))
    return main_script, all_scripts


def preprocess_text(text):
    """Apply preprocessing to the given text.
    Args:
        text: Thetext to be preprocessed.
    Returns:
        The preprocessed text.
    """

    # remove \n
    text = text.replace('\n', ' ')

    # get rid of characters that are ubiquitous
    replace_by = " " 
    replacement_map = {
        ord(c): replace_by
        for c in ':•#{|}' + string.digits
    }
    text = text.translate(replacement_map)

    # make multiple space one space
    text = re.sub(r'\s+', ' ', text)

    # strip the text
    text = text.strip()

    return text


@st.cache_data
def language_names(json_path):
    with open(json_path, 'r') as json_file:
        data = json.load(json_file)
    return data

label2name = language_names("assets/language_names.json")

def get_name(label):
    """Get the name of language from label"""
    iso_3 = label.split('_')[0]
    name = label2name[iso_3]
    return name


@st.cache_data
def render_svg(svg):
    """Renders the given svg string."""
    b64 = base64.b64encode(svg.encode("utf-8")).decode("utf-8")
    html = rf'<p align="center"> <img src="data:image/svg+xml;base64,{b64}", width="40%"/> </p>'
    c = st.container()
    c.write(html, unsafe_allow_html=True)


@st.cache_data
def render_metadata():
    """Renders the metadata."""
    html = r"""<p align="center">
        <a href="https://huggingface.co/cis-lmu/glotlid"><img alt="HuggingFace Model" src="https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Model-8A2BE2"></a>
        <a href="https://github.com/cisnlp/GlotLID"><img alt="GitHub" src="https://img.shields.io/badge/%F0%9F%93%A6%20GitHub-orange"></a>
        <a href="https://github.com/cisnlp/GlotLID/blob/main/LICENSE"><img alt="GitHub license" src="https://img.shields.io/github/license/cisnlp/GlotLID?logoColor=blue"></a>
        <a href="https://github.com/cisnlp/GlotLID"><img alt="GitHub stars" src="https://img.shields.io/github/stars/cisnlp/GlotLID"></a>
        <a href="https://arxiv.org/abs/2310.16248"><img alt="arXiv" src="https://img.shields.io/badge/arXiv-2310.16248-b31b1b.svg"></a>
        </p>"""
    c = st.container()
    c.write(html, unsafe_allow_html=True)

@st.cache_data
def citation():
    """Renders the metadata."""
    _CITATION  = """
    @inproceedings{
      kargaran2023glotlid,
      title={GlotLID: Language Identification for Low-Resource Languages},
      author={Kargaran, Amir Hossein and Imani, Ayyoob and Yvon, Fran{\c{c}}ois and Sch{\"u}tze, Hinrich},
      booktitle={The 2023 Conference on Empirical Methods in Natural Language Processing},
      year={2023},
      url={https://openreview.net/forum?id=dl4e3EBz5j}
    }"""
    st.code(_CITATION, language="python", line_numbers=False)


@st.cache_data
def convert_df(df):
    # IMPORTANT: Cache the conversion to prevent computation on every rerun
    return df.to_csv(index=None).encode("utf-8")


@st.cache_resource
def load_model(model_name, file_name):
    model_path = hf_hub_download(repo_id=model_name, filename=file_name)
    model = fasttext.load_model(model_path)
    return model


model_1 = load_model(constants.MODEL_NAME, "model_v1.bin")
model_2 = load_model(constants.MODEL_NAME, "model_v2.bin")
model_3 = load_model(constants.MODEL_NAME, "model_v3.bin")
openlid = load_model('laurievb/OpenLID', "model.bin")
nllb = load_model('facebook/fasttext-language-identification', "model.bin")


# @st.cache_resource
def plot(label, prob):

    ORANGE_COLOR = "#FF8000"
    BLACK_COLOR = "#31333F"
    fig, ax = plt.subplots(figsize=(8, 1))
    fig.patch.set_facecolor("none")
    ax.set_facecolor("none")

    ax.spines["left"].set_color(BLACK_COLOR)
    ax.spines["bottom"].set_color(BLACK_COLOR)
    ax.tick_params(axis="x", colors=BLACK_COLOR)

    ax.spines[["right", "top"]].set_visible(False)

    ax.barh(y=[0], width=[prob], color=ORANGE_COLOR)
    ax.set_xlim(0, 1)
    ax.set_ylim(-1, 1)
    ax.set_title(f"Label: {label}, Language: {get_name(label)}", color=BLACK_COLOR)
    ax.get_yaxis().set_visible(False)
    ax.set_xlabel("Confidence", color=BLACK_COLOR)
    st.pyplot(fig)

def compute(sentences, version = 'v3'):
    """Computes the language probablities and labels for the given sentences.

    Args:
        sentences: A list of sentences.

    Returns:
        A list of language probablities and labels for the given sentences.
    """
    progress_text = "Computing Language..."

    if version == 'nllb-218':
        model_choice = nllb
    elif version == 'openlid-201':
        model_choice = openlid
    elif version == 'v3':
        model_choice = model_3   
    elif version == 'v2':
        model_choice = model_2
    else:
        model_choice = model_1
        
    my_bar = st.progress(0, text=progress_text)

    probs = []
    labels = []

    sentences = [preprocess_text(sent) for sent in sentences]
    
    for index, sent in enumerate(sentences):

        output = model_choice.predict(sent)
        
        output_label  = output[0][0].split('__')[-1].replace('_Hans', '_Hani').replace('_Hant', '_Hani')
        output_prob = max(min(output[1][0], 1), 0) 
        output_label_language = output_label.split('_')[0]

        # script control
        if version in ['v3', 'v2', 'openlid-201', 'nllb-218'] and output_label_language!= 'zxx':
            main_script, all_scripts = get_script(sent)
            output_label_script = output_label.split('_')[1]

            if output_label_script not in all_scripts:
                output_label_script = main_script
                output_label = f"und_{output_label_script}"
                output_prob = 0

    
        labels = labels + [output_label]
        probs = probs + [output_prob]

        my_bar.progress(
            min((index) / len(sentences), 1),
            text=progress_text,
        )
    my_bar.empty()
    return probs, labels

#  st.markdown("[![Duplicate Space](https://img.shields.io/badge/-Duplicate%20Space-blue?labelColor=white&style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABAAAAAQCAYAAAAf8/9hAAAAAXNSR0IArs4c6QAAAP5JREFUOE+lk7FqAkEURY+ltunEgFXS2sZGIbXfEPdLlnxJyDdYB62sbbUKpLbVNhyYFzbrrA74YJlh9r079973psed0cvUD4A+4HoCjsA85X0Dfn/RBLBgBDxnQPfAEJgBY+A9gALA4tcbamSzS4xq4FOQAJgCDwV2CPKV8tZAJcAjMMkUe1vX+U+SMhfAJEHasQIWmXNN3abzDwHUrgcRGmYcgKe0bxrblHEB4E/pndMazNpSZGcsZdBlYJcEL9Afo75molJyM2FxmPgmgPqlWNLGfwZGG6UiyEvLzHYDmoPkDDiNm9JR9uboiONcBXrpY1qmgs21x1QwyZcpvxt9NS09PlsPAAAAAElFTkSuQmCC&logoWidth=14)](https://huggingface.co/spaces/cis-lmu/glotlid-space?duplicate=true)")

#  render_svg(open("assets/glotlid_logo.svg").read())

render_metadata()

st.markdown("**GlotLID** is an open-source language identification model with support for more than **2000 languages (V3)**.")



tab1, tab2 = st.tabs(["Input a Sentence", "Upload a File"])

with tab1:
    
    # choice = st.radio(
    #     "Set granularity level",
    #     ["default", "merge", "individual"],
    #     captions=["enable both macrolanguage and its varieties (default)", "merge macrolanguage and its varieties into one label", "remove macrolanguages - only shows individual langauges"],
    # )

    version = st.radio(
        "Choose model",
        ["nllb-218", "openlid-201", "v1", "v2", "v3"],
        captions=["NLLB", "OpenLID", "GlotLID version 1", "GlotLID version 2", "GlotLID version 3 (More languages, better quality data)"],
        index = 4,
        key = 'version_tab1',
        horizontal = True
    )
    
    sent = st.text_input(
        "Sentence:", placeholder="Enter a sentence.", on_change=None
    )

    # TODO: Check if this is needed!

    clicked = st.button("Submit")

    if sent:
        
        probs, labels = compute([sent], version=version)
        prob = probs[0]
        label = labels[0]

        
        # Check if the file exists
        if not os.path.exists('logs.txt'):
            with open('logs.txt', 'w') as file:
                pass

        print(f"{sent}, {label}: {prob}")
        with open("logs.txt", "a") as f:
            f.write(f"{sent}, {label}: {prob}\n")
        
        # plot
        plot(label, prob)
        

with tab2:

    version = st.radio(
        "Choose model",
        ["nllb-218", "openlid-201", "v1", "v2", "v3"],
        captions=["NLLB", "OpenLID", "GlotLID version 1", "GlotLID version 2 (more data and languages), GlotLID version 3 (More languages, better quality data)"],
        index = 4,
        key = 'version_tab2',
        horizontal = True
    )

    file = st.file_uploader("Upload a file", type=["txt"])
    if file is not None:
        df = pd.read_csv(file, sep="¦\t¦", header=None, engine='python')
        df.columns = ["Sentence"]
        df.reset_index(drop=True, inplace=True)

        # TODO: Run the model
        df['Prob'], df["Label"] = compute(df["Sentence"].tolist(), version= version)
        df['Language'] = df["Label"].apply(get_name)

        # A horizontal rule
        st.markdown("""---""")

        chart = (
            alt.Chart(df.reset_index())
            .mark_area(color="darkorange", opacity=0.5)
            .encode(
                x=X(field="index", title="Sentence Index"),
                y=Y("Prob", scale=Scale(domain=[0, 1])),
            )
        )
        st.altair_chart(chart.interactive(), use_container_width=True)

        col1, col2 = st.columns([4, 1])

        with col1:
            # Display the output
            st.table(
                df,
            )

        with col2:
            # Add a download button
            csv = convert_df(df)
            st.download_button(
                label=":file_folder: Download predictions as CSV",
                data=csv,
                file_name="GlotLID.csv",
                mime="text/csv",
            )



# citation()