MaskLID / app.py
kargaranamir's picture
add app.
2c9efe4 verified
raw
history blame contribute delete
No virus
3.97 kB
# Author: Amir Hossein Kargaran
# Date: August, 2023
# Description: This code applies MaskLID (code-switch language identification) with GlotLID, a fastText-based language identification tool.
# MIT License
import gradio as gr
from masklid import MaskLID
from huggingface_hub import hf_hub_download
from fasttext.FastText import _FastText
def render_metadata():
    """Return the HTML blurb that is passed to the interface as its description.

    The markup contains a GitHub-stars badge linking to the project
    repository, a link to the paper, and a note about why the demo reloads
    the model on every submission.
    """
    return """
<p align="center">
<a href="https://github.com/cisnlp/MaskLID"><img alt="GitHub stars" src="https://img.shields.io/github/stars/cisnlp/MaskLID"></a>
This is the demo for <a href="https://arxiv.org/abs/2406.06263">MaskLID</a> paper (ACL 2024). You can see the whole code in our GitHub. Please also note that if you increase the number of languages, you also need larger alpha and beta values.
MaskLID does not add much overhead to language identification. You first fix the languages your model is limited to and then run the MaskLID code. However, in this demo, we load the model each time (that takes couple of seconds) you hit submit to ensure the results are not cached and to make it possible to change the set of languages each time. We may later change the demo code to resolve this.
</p>
"""
def get_model_path():
    """Download the GlotLID fastText model from the Hugging Face Hub.

    Returns:
        Whatever ``hf_hub_download`` returns for ``model_v3.bin`` in the
        ``cis-lmu/glotlid`` repository; the result is handed straight to
        the ``MaskLID`` constructor by ``get_masklid``.
    """
    return hf_hub_download(repo_id="cis-lmu/glotlid", filename="model_v3.bin")
def get_masklid():
    """Construct a new MaskLID object and collect the labels it may use.

    A fresh instance is created on every call (nothing is cached here).

    Returns:
        A ``(masklid_model, labels)`` tuple, where ``labels`` is the list
        reported by ``masklid_model.model.get_labels()`` minus every entry
        whose name starts with ``__label__und`` or ``__label__zxx``.
    """
    # Label prefixes that are filtered out of the selectable set.
    excluded_prefixes = ('__label__und', '__label__zxx')

    detector = MaskLID(get_model_path())
    usable_labels = [
        name
        for name in detector.model.get_labels()
        if not name.startswith(excluded_prefixes)
    ]
    return detector, usable_labels
def predict_codeswitch(
    text,
    top_labels=200,
    beta=20,
    alpha=3,
    max_lambda=3,
    min_length=10,
    min_prob=0.90,
    max_retry=3,
    alpha_step_increase=3,
    beta_step_increase=5,
):
    """Run MaskLID code-switch prediction on ``text`` with a restricted label set.

    A new MaskLID instance is obtained from ``get_masklid()`` on every call,
    its label set is narrowed to the first ``top_labels`` entries of the
    filtered label list, and the call is delegated to
    ``MaskLID.predict_codeswitch``.

    Args:
        text: Input text to analyse.
        top_labels: How many labels (taken from the start of the filtered
            label list) the model is limited to.
        beta: Forwarded to MaskLID after being capped at ``top_labels``.
        alpha: Forwarded to MaskLID after being capped at the (already
            capped) ``beta``.
        max_lambda, min_length, min_prob, max_retry, alpha_step_increase,
            beta_step_increase: Forwarded to MaskLID unchanged.

    Returns:
        The value returned by ``MaskLID.predict_codeswitch``.
    """
    # Constraints: alpha <= beta <= top_labels.
    beta = min(beta, top_labels)
    alpha = min(alpha, beta)

    # Override the label set of a freshly loaded MaskLID instance.
    masklid_model, labels = get_masklid()
    masklid_model.language_indices = masklid_model._compute_language_indices(
        labels[:top_labels]
    )

    # Fetch the full label list once instead of once per selected index;
    # the previous per-index call rebuilt the whole list on every iteration.
    all_labels = masklid_model.model.get_labels()
    masklid_model.labels = [all_labels[i] for i in masklid_model.language_indices]

    return masklid_model.predict_codeswitch(
        text,
        beta=beta,
        alpha=alpha,
        max_lambda=max_lambda,
        min_length=min_length,
        min_prob=min_prob,
        max_retry=max_retry,
        alpha_step_increase=alpha_step_increase,
        beta_step_increase=beta_step_increase,
    )
# Text box for the sentence to analyse, pre-filled with a sample input.
inputs = gr.Textbox(
    lines=2,
    label="Enter the text",
    value="bir kahve dükkanında geçen film tadında güzel bir şarkıya ayrılsın gece falling in love at a coffee shop",
)

# One slider per tunable argument of predict_codeswitch. The insertion order
# matters: the values are appended after the text box in the interface's
# input list, in this order.
parameters = dict(
    # The upper bound is the number of labels returned by get_masklid(),
    # which builds a MaskLID instance when this module is executed.
    top_labels=gr.Slider(
        minimum=2,
        maximum=len(get_masklid()[1]),
        step=1,
        value=200,
        label="Limit LID to X Top Languages",
    ),
    beta=gr.Slider(minimum=1, maximum=100, value=20, step=1, label="Beta"),
    alpha=gr.Slider(minimum=1, maximum=30, value=3, step=1, label="Alpha"),
    max_lambda=gr.Slider(minimum=1, maximum=10, value=3, step=1, label="Max Iteration"),
    min_length=gr.Slider(minimum=1, maximum=100, value=10, step=1, label="Min Length"),
    min_prob=gr.Slider(minimum=0.0, maximum=1.0, value=0.9, step=0.01, label="Min Probability"),
    max_retry=gr.Slider(minimum=1, maximum=10, value=3, step=1, label="Max Retry In total"),
    alpha_step_increase=gr.Slider(minimum=1, maximum=10, value=3, step=1, label="Alpha Step Increase"),
    beta_step_increase=gr.Slider(minimum=1, maximum=15, value=5, step=1, label="Beta Step Increase"),
)

# The prediction result is displayed as JSON.
output = gr.JSON(label="Output")

# Assemble the interface and start serving it.
gr.Interface(
    fn=predict_codeswitch,
    inputs=[inputs] + list(parameters.values()),
    outputs=output,
    title="MaskLID (Code-Switch Language Identification)",
    description=render_metadata(),
    cache_examples=False,
).launch()