nllb / app.py
davanstrien's picture
davanstrien HF staff
Remove low quality languages from target languages
49900d0 verified
raw
history blame
3.1 kB
import spaces
import gradio as gr
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from flores import code_mapping
import platform
import torch
import nltk
# Fetch the sentence-splitting data that nltk.sent_tokenize needs.
# NLTK 3.8.2+ reads the "punkt_tab" resource while older releases read the
# pickled "punkt" models, so request both to work with either version.
nltk.download("punkt")
nltk.download("punkt_tab")
# Languages that remain selectable as a source but are left out of the
# target-language dropdown.
REMOVED_TARGET_LANGUAGES = {"Ligurian", "Lombard", "Sicilian"}

# macOS runs on the CPU; every other platform is assumed to have CUDA.
device = "cuda" if platform.system() != "Darwin" else "cpu"

MODEL_NAME = "facebook/nllb-200-3.3B"

# Rebuild the mapping ordered by its values so the dropdown choices derived
# from it follow that order.
code_mapping = dict(sorted(code_mapping.items(), key=lambda pair: pair[1]))

# Despite the name, these are the mapping's keys, i.e. the entries offered
# in the dropdowns and accepted by translate().
flores_codes = list(code_mapping)
target_languages = [
    name for name in flores_codes if name not in REMOVED_TARGET_LANGUAGES
]
def load_model():
    """Load the ``MODEL_NAME`` seq2seq checkpoint and move it to ``device``."""
    return AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME).to(device)
# Load the model once at import time; translate() reuses this module-level
# instance for every request.
model = load_model()
def load_tokenizer(src_lang, tgt_lang):
    """Build a ``MODEL_NAME`` tokenizer configured for one language pair.

    ``src_lang`` and ``tgt_lang`` are keys of ``code_mapping``; the values
    they map to are what is handed to the tokenizer. An unknown key raises
    ``KeyError``.
    """
    source_code = code_mapping[src_lang]
    target_code = code_mapping[tgt_lang]
    return AutoTokenizer.from_pretrained(
        MODEL_NAME, src_lang=source_code, tgt_lang=target_code
    )
@spaces.GPU
def translate(text: str, src_lang: str, tgt_lang: str):
    """Translate ``text`` from ``src_lang`` into ``tgt_lang``.

    The text is split on newlines into paragraphs, and each paragraph is
    split into sentences with ``nltk.sent_tokenize``. Sentences are
    translated one at a time; translated sentences are joined with a space
    and paragraphs with a newline, so the input's line structure is kept.

    Args:
        text: Text to translate.
        src_lang: Source language, a key of ``code_mapping``.
        tgt_lang: Target language, a key of ``code_mapping``.

    Returns:
        The translated text.

    Raises:
        gr.Error: If either language is not a key of ``code_mapping``
            (for example when a dropdown delivered no selection).
    """
    # Report a missing/unknown language as a readable UI error instead of
    # letting the code_mapping lookup fail with a bare KeyError.
    if src_lang not in code_mapping or tgt_lang not in code_mapping:
        raise gr.Error("Please select both a source and a target language.")

    tokenizer = load_tokenizer(src_lang, tgt_lang)

    # Resolve the target-language token once, outside the loops. This goes
    # through convert_tokens_to_ids because the tokenizer's lang_code_to_id
    # mapping is deprecated and missing from recent transformers releases.
    target_token_id = tokenizer.convert_tokens_to_ids(code_mapping[tgt_lang])

    translated_paragraphs = []
    for paragraph in text.split("\n"):
        translated_sentences = []
        # NOTE(review): sent_tokenize is called without a language argument,
        # so its default model is used regardless of src_lang.
        for sentence in nltk.sent_tokenize(paragraph):
            input_ids = tokenizer(sentence, return_tensors="pt").input_ids.to(device)
            generated = model.generate(
                input_ids=input_ids,
                forced_bos_token_id=target_token_id,
                # Cap the output at 50 tokens more than the input length.
                max_length=input_ids.shape[1] + 50,
                num_return_sequences=1,
            )
            translated_sentences.append(
                tokenizer.decode(generated[0], skip_special_tokens=True)
            )
        translated_paragraphs.append(" ".join(translated_sentences))
    return "\n".join(translated_paragraphs)
# Introductory Markdown rendered directly below the page title.
description = """
UNESCO, Meta, and Hugging Face have come together to create an accessible, high-quality translation experience in 200 languages.
This is made possible through an open approach to AI innovation using Meta’s open-sourced No Language Left Behind (NLLB) AI model, hosted on Hugging Face Spaces.
"""
# Page layout, top to bottom: title, introduction, language pickers,
# input box, translate button, output box.
with gr.Blocks() as demo:
    gr.Markdown("# UNESCO Language Translator, powered by Meta and Hugging Face")
    gr.Markdown(description)

    # The source picker offers every language; the target picker omits the
    # ones listed in REMOVED_TARGET_LANGUAGES.
    with gr.Row():
        src_lang = gr.Dropdown(choices=flores_codes, label="Source Language")
        target_lang = gr.Dropdown(choices=target_languages, label="Target Language")

    with gr.Row():
        input_text = gr.Textbox(lines=6, label="Input Text")

    with gr.Row():
        btn = gr.Button("Translate text")

    with gr.Row():
        output = gr.Textbox(lines=6, label="Output Text")

    # Clicking the button calls translate(text, source, target) and shows
    # its return value in the output box.
    btn.click(
        fn=translate,
        inputs=[input_text, src_lang, target_lang],
        outputs=output,
    )

demo.launch()