Spaces:
Running
on
Zero
Running
on
Zero
File size: 3,169 Bytes
38742d7 71ae380 38742d7 71ae380 5e1003d 38742d7 6bcde50 38742d7 3f23d73 38742d7 3f23d73 38742d7 5e1003d 3f23d73 e72a9c0 15ccfd9 38742d7 d3ad9e6 7dc20b3 d3ad9e6 7dc20b3 1473813 38742d7 0badc10 38742d7 d3ad9e6 38742d7 bd312ad 0badc10 38742d7 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 |
import spaces
import gradio as gr
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from flores import code_mapping
import platform
import torch
import nltk
# Fetch the "punkt" resource at import time; nltk.sent_tokenize is used for
# sentence splitting further down in this file.
nltk.download("punkt")

# macOS runs on the CPU; every other platform is assumed to provide CUDA.
if platform.system() == "Darwin":
    device = "cpu"
else:
    device = "cuda"

MODEL_NAME = "facebook/nllb-200-3.3B"

# Rebuild the imported mapping ordered by its values, then expose its keys as
# a list (these keys are what the dropdowns offer as choices).
code_mapping = dict(sorted(code_mapping.items(), key=lambda pair: pair[1]))
flores_codes = list(code_mapping)
def load_model():
    """Load the seq2seq checkpoint named by MODEL_NAME and place it on `device`."""
    seq2seq = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
    return seq2seq.to(device)
# Load the model once at import time; translate() reads this module-level name.
model = load_model()
def load_tokenizer(src_lang, tgt_lang):
    """Return a tokenizer for MODEL_NAME configured for the given language pair.

    Both arguments are keys of ``code_mapping``; the mapped values are what
    gets passed to the tokenizer as ``src_lang`` / ``tgt_lang``.

    Raises:
        KeyError: if either argument is not a key of ``code_mapping``.
    """
    source_code = code_mapping[src_lang]
    target_code = code_mapping[tgt_lang]
    return AutoTokenizer.from_pretrained(
        MODEL_NAME, src_lang=source_code, tgt_lang=target_code
    )
@spaces.GPU
def translate(text: str, src_lang: str, tgt_lang: str):
    """Translate ``text`` from ``src_lang`` into ``tgt_lang``.

    The text is split into paragraphs on newlines and each paragraph into
    sentences with ``nltk.sent_tokenize``. Every sentence is translated on
    its own; sentences are re-joined with single spaces and paragraphs with
    newlines, so the paragraph layout of the input is kept.

    Args:
        text: The text to translate.
        src_lang: Source language; must be a key of ``code_mapping``.
        tgt_lang: Target language; must be a key of ``code_mapping``.

    Returns:
        The translated text as a single string.

    Raises:
        KeyError: if ``src_lang`` or ``tgt_lang`` is not in ``code_mapping``.
    """
    tokenizer = load_tokenizer(src_lang, tgt_lang)

    # Resolve the target-language token through the vocabulary rather than the
    # deprecated ``lang_code_to_id`` attribute, which newer transformers
    # releases no longer provide. It does not depend on the sentence, so it is
    # computed once up front.
    forced_bos_token_id = tokenizer.convert_tokens_to_ids(code_mapping[tgt_lang])

    translated_paragraphs = []
    for paragraph in text.split("\n"):
        translated_sentences = []
        for sentence in nltk.sent_tokenize(paragraph):
            # Keep the encoded sentence as a (1, seq_len) tensor and move it
            # straight to the model's device.
            input_ids = tokenizer(sentence, return_tensors="pt").input_ids.to(device)
            generated = model.generate(
                input_ids=input_ids,
                forced_bos_token_id=forced_bos_token_id,
                # Allow the output to run up to 50 tokens past the input length.
                max_length=input_ids.shape[1] + 50,
                num_return_sequences=1,
            )
            translated_sentences.append(
                tokenizer.decode(generated[0], skip_special_tokens=True)
            )
        translated_paragraphs.append(" ".join(translated_sentences))

    return "\n".join(translated_paragraphs)
# Introductory blurb rendered as Markdown near the top of the page.
description = """
UNESCO, Meta, and Hugging Face have come together to create an accessible, high-quality translation experience in 200 languages.
This is made possible through an open approach to AI innovation using Meta’s open-sourced No Language Left Behind (NLLB) AI model, hosted on Hugging Face Spaces.
"""
# Step-by-step usage text rendered as Markdown under the "Instructions" heading.
instructions = """
1. Select the source and target language from the dropdown menus.
2. Enter the text you would like to translate.
3. Click the 'Translate text' button.
"""
# Build the page: heading and help text first, then one row each for the
# language pickers, the input box, the trigger button and the output box.
with gr.Blocks() as demo:
    for markdown_text in (
        "# UNESCO Language Translator, powered by Meta and Hugging Face",
        description,
        "## Instructions",
        instructions,
    ):
        gr.Markdown(markdown_text)

    with gr.Row():
        src_lang = gr.Dropdown(choices=flores_codes, label="Source Language")
        target_lang = gr.Dropdown(choices=flores_codes, label="Target Language")

    with gr.Row():
        input_text = gr.Textbox(lines=6, label="Input Text")

    with gr.Row():
        btn = gr.Button("Translate text")

    with gr.Row():
        output = gr.Textbox(lines=6, label="Output Text")

    # Clicking the button runs translate() on the current text and language
    # selections and writes the result into the output box.
    btn.click(
        fn=translate,
        inputs=[input_text, src_lang, target_lang],
        outputs=output,
    )

demo.launch()
|