import functools
import platform

import gradio as gr
import nltk
import spaces
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

from flores import code_mapping

# Sentence-splitter models used to chunk paragraphs before translation.
nltk.download("punkt_tab")

# FLORES languages that should not be offered as translation targets.
REMOVED_TARGET_LANGUAGES = {"Ligurian", "Lombard", "Sicilian"}

# Local macOS development runs on CPU; deployed Spaces assume a CUDA GPU.
device = "cpu" if platform.system() == "Darwin" else "cuda"
MODEL_NAME = "facebook/nllb-200-3.3B"

# Sort the language-name -> FLORES-code mapping by code so dropdown order
# is deterministic.
code_mapping = dict(sorted(code_mapping.items(), key=lambda item: item[1]))
flores_codes = list(code_mapping.keys())
target_languages = [
    language for language in flores_codes if language not in REMOVED_TARGET_LANGUAGES
]


def load_model():
    """Load the NLLB seq2seq model onto the selected device once at startup."""
    model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME).to(device)
    print(f"Model loaded in {device}")
    return model


model = load_model()


@functools.lru_cache(maxsize=None)
def load_tokenizer(src_lang: str, tgt_lang: str):
    """Return a tokenizer configured for the given language pair.

    Cached: instantiating a tokenizer reads files from disk, so repeated
    translate() calls for the same pair reuse one instance.
    """
    tokenizer = AutoTokenizer.from_pretrained(
        MODEL_NAME, src_lang=code_mapping[src_lang], tgt_lang=code_mapping[tgt_lang]
    )
    return tokenizer


@spaces.GPU
def translate(text: str, src_lang: str, tgt_lang: str) -> str:
    """Translate ``text`` from ``src_lang`` to ``tgt_lang``.

    Paragraph structure (newlines) is preserved; each paragraph is split
    into sentences so every generate() call stays short, and the translated
    sentences are rejoined with spaces.
    """
    tokenizer = load_tokenizer(src_lang, tgt_lang)
    # Token id the decoder must start with to emit the target language.
    target_token_id = tokenizer.convert_tokens_to_ids(code_mapping[tgt_lang])

    translated_paragraphs = []
    for paragraph in text.split("\n"):
        translated_sentences = []
        for sentence in nltk.sent_tokenize(paragraph):
            # Tokenize straight to a tensor on the device — no need for the
            # tensor -> list -> tensor round trip.
            input_ids = tokenizer(sentence, return_tensors="pt").input_ids.to(device)
            with torch.inference_mode():  # no autograd bookkeeping at inference
                output_ids = model.generate(
                    input_ids=input_ids,
                    forced_bos_token_id=target_token_id,
                    # Allow some expansion over the source length.
                    max_length=input_ids.shape[1] + 50,
                    num_return_sequences=1,
                )
            translated_sentences.append(
                tokenizer.decode(output_ids[0], skip_special_tokens=True)
            )
        translated_paragraphs.append(" ".join(translated_sentences))
    return "\n".join(translated_paragraphs)


description = """ UNESCO, Meta, and Hugging Face have come together to create an accessible, high-quality translation experience in 200 languages. 
This is made possible through an open approach to AI innovation using Meta’s open-sourced No Language Left Behind (NLLB) AI model, hosted on Hugging Face Spaces. """

with gr.Blocks() as demo:
    gr.Markdown("# UNESCO Language Translator, powered by Meta and Hugging Face")
    gr.Markdown(description)
    with gr.Row():
        src_lang = gr.Dropdown(label="Source Language", choices=flores_codes)
        target_lang = gr.Dropdown(label="Target Language", choices=target_languages)
    with gr.Row():
        input_text = gr.Textbox(label="Input Text", lines=6)
    with gr.Row():
        btn = gr.Button("Translate text")
    with gr.Row():
        output = gr.Textbox(label="Output Text", lines=6)
    btn.click(
        translate,
        inputs=[input_text, src_lang, target_lang],
        outputs=output,
    )

demo.launch()