SumYin nobrowning committed on
Commit
534e8dd
0 Parent(s):

Duplicate from nobrowning/M2M

Browse files

Co-authored-by: Wenshu Geng <nobrowning@users.noreply.huggingface.co>

Files changed (5) hide show
  1. .gitattributes +27 -0
  2. README.md +13 -0
  3. app.py +198 -0
  4. languages.py +47 -0
  5. requirements.txt +4 -0
.gitattributes ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ftz filter=lfs diff=lfs merge=lfs -text
6
+ *.gz filter=lfs diff=lfs merge=lfs -text
7
+ *.h5 filter=lfs diff=lfs merge=lfs -text
8
+ *.joblib filter=lfs diff=lfs merge=lfs -text
9
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
10
+ *.model filter=lfs diff=lfs merge=lfs -text
11
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
12
+ *.onnx filter=lfs diff=lfs merge=lfs -text
13
+ *.ot filter=lfs diff=lfs merge=lfs -text
14
+ *.parquet filter=lfs diff=lfs merge=lfs -text
15
+ *.pb filter=lfs diff=lfs merge=lfs -text
16
+ *.pt filter=lfs diff=lfs merge=lfs -text
17
+ *.pth filter=lfs diff=lfs merge=lfs -text
18
+ *.rar filter=lfs diff=lfs merge=lfs -text
19
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
20
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
21
+ *.tflite filter=lfs diff=lfs merge=lfs -text
22
+ *.tgz filter=lfs diff=lfs merge=lfs -text
23
+ *.wasm filter=lfs diff=lfs merge=lfs -text
24
+ *.xz filter=lfs diff=lfs merge=lfs -text
25
+ *.zip filter=lfs diff=lfs merge=lfs -text
26
+ *.zstandard filter=lfs diff=lfs merge=lfs -text
27
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: M2M
3
+ emoji: 💻
4
+ colorFrom: green
5
+ colorTo: gray
6
+ sdk: streamlit
7
+ sdk_version: 1.9.0
8
+ app_file: app.py
9
+ pinned: false
10
+ duplicated_from: nobrowning/M2M
11
+ ---
12
+
13
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces#reference
app.py ADDED
@@ -0,0 +1,198 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import os
3
+ import io
4
+ from transformers import M2M100Tokenizer, M2M100ForConditionalGeneration
5
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
6
+ from languages import LANGUANGE_MAP
7
+ import time
8
+ import json
9
+ from typing import List
10
+ import torch
11
+ import random
12
+ import logging
13
+
14
# Select the inference device once at import time: prefer the first CUDA GPU,
# otherwise fall back to CPU (with a warning, since the 1.2B-parameter model
# is very slow on CPU).
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
    logging.warning("GPU not found, using CPU, translation will be very slow.")

# NOTE(review): the original code also called `st.cache(...)` bare at module
# level, which only constructs a decorator and discards it — a no-op. Caching
# is actually applied via the `@st.cache` decorators on the loader functions
# below, so the stray call has been removed.
st.set_page_config(page_title="M2M100 Translator")
22
+
23
+ lang_id = {
24
+ "Afrikaans": "af",
25
+ "Amharic": "am",
26
+ "Arabic": "ar",
27
+ "Asturian": "ast",
28
+ "Azerbaijani": "az",
29
+ "Bashkir": "ba",
30
+ "Belarusian": "be",
31
+ "Bulgarian": "bg",
32
+ "Bengali": "bn",
33
+ "Breton": "br",
34
+ "Bosnian": "bs",
35
+ "Catalan": "ca",
36
+ "Cebuano": "ceb",
37
+ "Czech": "cs",
38
+ "Welsh": "cy",
39
+ "Danish": "da",
40
+ "German": "de",
41
+ "Greeek": "el",
42
+ "English": "en",
43
+ "Spanish": "es",
44
+ "Estonian": "et",
45
+ "Persian": "fa",
46
+ "Fulah": "ff",
47
+ "Finnish": "fi",
48
+ "French": "fr",
49
+ "Western Frisian": "fy",
50
+ "Irish": "ga",
51
+ "Gaelic": "gd",
52
+ "Galician": "gl",
53
+ "Gujarati": "gu",
54
+ "Hausa": "ha",
55
+ "Hebrew": "he",
56
+ "Hindi": "hi",
57
+ "Croatian": "hr",
58
+ "Haitian": "ht",
59
+ "Hungarian": "hu",
60
+ "Armenian": "hy",
61
+ "Indonesian": "id",
62
+ "Igbo": "ig",
63
+ "Iloko": "ilo",
64
+ "Icelandic": "is",
65
+ "Italian": "it",
66
+ "Japanese": "ja",
67
+ "Javanese": "jv",
68
+ "Georgian": "ka",
69
+ "Kazakh": "kk",
70
+ "Central Khmer": "km",
71
+ "Kannada": "kn",
72
+ "Korean": "ko",
73
+ "Luxembourgish": "lb",
74
+ "Ganda": "lg",
75
+ "Lingala": "ln",
76
+ "Lao": "lo",
77
+ "Lithuanian": "lt",
78
+ "Latvian": "lv",
79
+ "Malagasy": "mg",
80
+ "Macedonian": "mk",
81
+ "Malayalam": "ml",
82
+ "Mongolian": "mn",
83
+ "Marathi": "mr",
84
+ "Malay": "ms",
85
+ "Burmese": "my",
86
+ "Nepali": "ne",
87
+ "Dutch": "nl",
88
+ "Norwegian": "no",
89
+ "Northern Sotho": "ns",
90
+ "Occitan": "oc",
91
+ "Oriya": "or",
92
+ "Panjabi": "pa",
93
+ "Polish": "pl",
94
+ "Pushto": "ps",
95
+ "Portuguese": "pt",
96
+ "Romanian": "ro",
97
+ "Russian": "ru",
98
+ "Sindhi": "sd",
99
+ "Sinhala": "si",
100
+ "Slovak": "sk",
101
+ "Slovenian": "sl",
102
+ "Somali": "so",
103
+ "Albanian": "sq",
104
+ "Serbian": "sr",
105
+ "Swati": "ss",
106
+ "Sundanese": "su",
107
+ "Swedish": "sv",
108
+ "Swahili": "sw",
109
+ "Tamil": "ta",
110
+ "Thai": "th",
111
+ "Tagalog": "tl",
112
+ "Tswana": "tn",
113
+ "Turkish": "tr",
114
+ "Ukrainian": "uk",
115
+ "Urdu": "ur",
116
+ "Uzbek": "uz",
117
+ "Vietnamese": "vi",
118
+ "Wolof": "wo",
119
+ "Xhosa": "xh",
120
+ "Yiddish": "yi",
121
+ "Yoruba": "yo",
122
+ "Chinese": "zh",
123
+ "Zulu": "zu",
124
+ }
125
+
126
+
127
@st.cache(suppress_st_warning=True, allow_output_mutation=True)
def load_model(
    pretrained_model: str = "facebook/m2m100_1.2B",
    cache_dir: str = "models/",
):
    """Load the M2M100 translation model and its tokenizer.

    Cached by Streamlit so the (large) checkpoint is downloaded and
    initialised only once per process. The model is moved to the
    module-level `device` and put in eval mode for inference.

    Returns:
        (tokenizer, model) tuple ready for generation.
    """
    translator = M2M100ForConditionalGeneration.from_pretrained(
        pretrained_model, cache_dir=cache_dir
    )
    translator = translator.to(device)
    translator.eval()
    m2m_tokenizer = M2M100Tokenizer.from_pretrained(
        pretrained_model, cache_dir=cache_dir
    )
    return m2m_tokenizer, translator
138
+
139
+
140
@st.cache(suppress_st_warning=True, allow_output_mutation=True)
def load_detection_model(
    pretrained_model: str = "ivanlau/language-detection-fine-tuned-on-xlm-roberta-base",
    cache_dir: str = "models/",
):
    """Load the language-identification classifier and its tokenizer.

    Cached by Streamlit so the checkpoint is downloaded and initialised
    only once per process. The classifier is moved to the module-level
    `device` and put in eval mode.

    Returns:
        (tokenizer, model) tuple for sequence classification.
    """
    detector = AutoModelForSequenceClassification.from_pretrained(
        pretrained_model, cache_dir=cache_dir
    )
    detector = detector.to(device)
    detector.eval()
    detection_tokenizer = AutoTokenizer.from_pretrained(
        pretrained_model, cache_dir=cache_dir
    )
    return detection_tokenizer, detector
149
+
150
+
151
st.title("M2M100 Translator")
st.write("M2M100 is a multilingual encoder-decoder (seq-to-seq) model trained for Many-to-Many multilingual translation. It was introduced in this paper https://arxiv.org/abs/2010.11125 and first released in https://github.com/pytorch/fairseq/tree/master/examples/m2m_100 repository. The model that can directly translate between the 9,900 directions of 100 languages.\n")

st.write(" This demo uses the facebook/m2m100_1.2B model. For local inference see https://github.com/ikergarcia1996/Easy-Translate")


# Free-text input to translate (capped to keep generation time bounded).
user_input: str = st.text_area(
    "Input text",
    height=200,
    max_chars=5120,
)

# The source language is auto-detected below, so the user only picks a target.
target_lang = st.selectbox(label="Target language", options=list(lang_id.keys()))

if st.button("Run"):
    time_start = time.time()
    tokenizer, model = load_model()
    de_tokenizer, de_model = load_detection_model()

    with torch.no_grad():

        # Step 1: identify the source language with the XLM-R classifier.
        # BUGFIX: move the encoded inputs onto the same device as de_model —
        # previously they stayed on CPU, which raises a device-mismatch error
        # whenever a GPU is available (de_model was moved to `device`).
        tokenized_sentence = de_tokenizer(user_input, return_tensors='pt').to(device)
        output = de_model(**tokenized_sentence)
        de_predictions = torch.nn.functional.softmax(output.logits, dim=-1)
        _, preds = torch.max(de_predictions, dim=-1)

        # Map the argmax class index to a human-readable language name.
        lang_type = LANGUANGE_MAP[preds.item()]

        if lang_type not in lang_id:
            # Detected language is not one M2M100 supports.
            # NOTE(review): `st.success` is used for a failure message here;
            # `st.error` would be clearer — kept as-is to preserve the UI.
            time_end = time.time()
            st.success('Unsupported Language')
            st.write(f"Computation time: {round((time_end-time_start),3)} segs")
        else:
            # Step 2: translate from the detected language to the chosen one.
            src_lang = lang_id[lang_type]
            trg_lang = lang_id[target_lang]
            tokenizer.src_lang = src_lang
            encoded_input = tokenizer(user_input, return_tensors="pt").to(device)
            # Forcing the BOS token is how M2M100 selects the output language.
            generated_tokens = model.generate(
                **encoded_input, forced_bos_token_id=tokenizer.get_lang_id(trg_lang)
            )
            translated_text = tokenizer.batch_decode(
                generated_tokens, skip_special_tokens=True
            )[0]

            time_end = time.time()
            st.success(translated_text)

            st.write(f"Computation time: {round((time_end-time_start),3)} segs")
languages.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Class index of the language-detection model -> human-readable language name.
# Names that also appear as keys of `lang_id` in app.py are treated as
# translatable; anything else triggers the "Unsupported Language" branch.
# (The constant's misspelled name is kept: it is the module's public interface.)
# BUGFIX: 43 was 'Ukranian', which never matched lang_id's "Ukrainian" key, so
# Ukrainian input was always reported as unsupported.
# NOTE(review): indices 4-6 all map to 'Chinese' — presumably separate
# zh variants collapsed to one label; 15 'Frisian' does not match lang_id's
# "Western Frisian" key — confirm against the detection model's label set.
LANGUANGE_MAP = {
    0: 'Arabic',
    1: 'Basque',
    2: 'Breton',
    3: 'Catalan',
    4: 'Chinese',
    5: 'Chinese',
    6: 'Chinese',
    7: 'Chuvash',
    8: 'Czech',
    9: 'Dhivehi',
    10: 'Dutch',
    11: 'English',
    12: 'Esperanto',
    13: 'Estonian',
    14: 'French',
    15: 'Frisian',
    16: 'Georgian',
    17: 'German',
    18: 'Greek',
    19: 'Hakha_Chin',
    20: 'Indonesian',
    21: 'Interlingua',
    22: 'Italian',
    23: 'Japanese',
    24: 'Kabyle',
    25: 'Kinyarwanda',
    26: 'Kyrgyz',
    27: 'Latvian',
    28: 'Maltese',
    29: 'Mongolian',
    30: 'Persian',
    31: 'Polish',
    32: 'Portuguese',
    33: 'Romanian',
    34: 'Romansh_Sursilvan',
    35: 'Russian',
    36: 'Sakha',
    37: 'Slovenian',
    38: 'Spanish',
    39: 'Swedish',
    40: 'Tamil',
    41: 'Tatar',
    42: 'Turkish',
    43: 'Ukrainian',
    44: 'Welsh'
}
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ streamlit
2
+ torch
3
+ transformers
4
+ transformers[sentencepiece]