Update app.py
Browse files
app.py
CHANGED
|
@@ -2,7 +2,7 @@ import streamlit as st
|
|
| 2 |
import polars as pl
|
| 3 |
from transformers import T5Tokenizer, T5ForConditionalGeneration, AutoTokenizer, AutoModelForSeq2SeqLM, pipeline, logging, AutoModelForCausalLM
|
| 4 |
import torch
|
| 5 |
-
import os
|
| 6 |
import httpx
|
| 7 |
|
| 8 |
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
|
|
@@ -19,11 +19,25 @@ langs.extend(list(all_langs.keys())) # Language options as list, add favourite l
|
|
| 19 |
# iso1_to_name = {codes[0]: lang for entry in all_langs for lang, codes in entry.items()} # {'ro': 'Romanian', 'de': 'German'}
|
| 20 |
iso1_to_name = {iso[1]: iso[0] for iso in non_empty_isos} # {'ro': 'Romanian', 'de': 'German'}
|
| 21 |
|
| 22 |
-
models = ["Helsinki-NLP", "QUICKMT", "Argos", "
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
"utter-project/EuroLLM-1.7B", "utter-project/EuroLLM-1.7B-Instruct",
|
| 24 |
-
|
| 25 |
-
"HuggingFaceTB/SmolLM3-3B", "winninghealth/WiNGPT-Babel-2",
|
| 26 |
-
"
|
|
|
|
|
|
|
| 27 |
allmodels = ["Helsinki-NLP",
|
| 28 |
"Helsinki-NLP/opus-mt-tc-bible-big-mul-mul", "Helsinki-NLP/opus-mt-tc-bible-big-mul-deu_eng_nld",
|
| 29 |
"Helsinki-NLP/opus-mt-tc-bible-big-mul-deu_eng_fra_por_spa", "Helsinki-NLP/opus-mt-tc-bible-big-deu_eng_fra_por_spa-mul",
|
|
@@ -49,11 +63,25 @@ class Translators:
|
|
| 49 |
response = httpx.get(url)
|
| 50 |
return response.json()[0][0][0]
|
| 51 |
|
| 52 |
-
def hplt(self):
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 57 |
|
| 58 |
@staticmethod
|
| 59 |
def quickmttranslate(model_path, input_text):
|
|
@@ -96,7 +124,7 @@ class Translators:
|
|
| 96 |
# Direct translation model
|
| 97 |
if f"{self.sl}-{self.tl}" in quickmt_models:
|
| 98 |
model_path = Translators.quickmtdownload(model_name)
|
| 99 |
-
|
| 100 |
message = f'Translated from {iso1_to_name[self.sl]} to {iso1_to_name[self.tl]} with {model_name}.'
|
| 101 |
# Pivot language English
|
| 102 |
elif self.sl in available_languages and self.tl in available_languages:
|
|
@@ -105,12 +133,12 @@ class Translators:
|
|
| 105 |
entranslation = Translators.quickmttranslate(model_path, self.input_text)
|
| 106 |
model_name = f"quickmt-en-{self.tl}"
|
| 107 |
model_path = Translators.quickmtdownload(model_name)
|
| 108 |
-
|
| 109 |
message = f'Translated from {iso1_to_name[self.sl]} to {iso1_to_name[self.tl]} with pivot language English.'
|
| 110 |
else:
|
| 111 |
-
|
| 112 |
message = f"Available models: {', '.join(quickmt_models)}"
|
| 113 |
-
return
|
| 114 |
|
| 115 |
@staticmethod
|
| 116 |
def download_argos_model(from_code, to_code):
|
|
@@ -172,6 +200,15 @@ class Translators:
|
|
| 172 |
# output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
|
| 173 |
output_text = tokenizer.decode(outputs[0][tokenized_chat.shape[-1]:], skip_special_tokens=True) # Decode only the new tokens
|
| 174 |
return output_text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 175 |
|
| 176 |
def HelsinkiNLP_mulroa(self):
|
| 177 |
try:
|
|
@@ -501,16 +538,22 @@ def translate_text(model_name: str, s_language: str, t_language: str, input_text
|
|
| 501 |
return translated_text, message_text
|
| 502 |
message_text = f'Translated from {s_language} to {t_language} with {model_name}'
|
| 503 |
translated_text = None
|
| 504 |
-
try:
|
| 505 |
-
if
|
|
|
|
|
|
|
|
|
|
| 506 |
translated_text, message_text = Translators(model_name, sl, tl, input_text).HelsinkiNLP_mulroa()
|
| 507 |
|
| 508 |
elif model_name == "Helsinki-NLP":
|
| 509 |
translated_text, message_text = Translators(model_name, sl, tl, input_text).HelsinkiNLP()
|
| 510 |
|
| 511 |
-
elif
|
| 512 |
-
|
| 513 |
-
|
|
|
|
|
|
|
|
|
|
| 514 |
elif model_name == 'Argos':
|
| 515 |
translated_text = Translators(model_name, sl, tl, input_text).argos()
|
| 516 |
|
|
|
|
| 2 |
import polars as pl
|
| 3 |
from transformers import T5Tokenizer, T5ForConditionalGeneration, AutoTokenizer, AutoModelForSeq2SeqLM, pipeline, logging, AutoModelForCausalLM
|
| 4 |
import torch
|
| 5 |
+
import os
|
| 6 |
import httpx
|
| 7 |
|
| 8 |
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
|
|
|
|
| 19 |
# iso1_to_name = {codes[0]: lang for entry in all_langs for lang, codes in entry.items()} # {'ro': 'Romanian', 'de': 'German'}
|
| 20 |
iso1_to_name = {iso[1]: iso[0] for iso in non_empty_isos} # {'ro': 'Romanian', 'de': 'German'}
|
| 21 |
|
| 22 |
+
# Model identifiers, in their original order.
# NOTE(review): presumably the selectable model options in the UI — confirm against the caller.
models = [
    # Short names without a fixed checkpoint, plus Lego-MT
    "Helsinki-NLP",
    "QUICKMT",
    "Argos",
    "Lego-MT/Lego-MT",
    "HPLT",
    "HPLT-OPUS",
    "Google",
    # Helsinki-NLP opus-mt-tc-bible-big checkpoints
    "Helsinki-NLP/opus-mt-tc-bible-big-mul-mul",
    "Helsinki-NLP/opus-mt-tc-bible-big-mul-deu_eng_nld",
    "Helsinki-NLP/opus-mt-tc-bible-big-mul-deu_eng_fra_por_spa",
    "Helsinki-NLP/opus-mt-tc-bible-big-deu_eng_fra_por_spa-mul",
    "Helsinki-NLP/opus-mt-tc-bible-big-roa-deu_eng_fra_por_spa",
    "Helsinki-NLP/opus-mt-tc-bible-big-deu_eng_fra_por_spa-roa",
    "Helsinki-NLP/opus-mt-tc-bible-big-roa-en",
    # facebook NLLB-200
    "facebook/nllb-200-distilled-600M",
    "facebook/nllb-200-distilled-1.3B",
    "facebook/nllb-200-1.3B",
    "facebook/nllb-200-3.3B",
    # facebook mBART-50
    "facebook/mbart-large-50-many-to-many-mmt",
    "facebook/mbart-large-50-one-to-many-mmt",
    "facebook/mbart-large-50-many-to-one-mmt",
    # facebook M2M-100
    "facebook/m2m100_418M",
    "facebook/m2m100_1.2B",
    # bigscience mT0
    "bigscience/mt0-small",
    "bigscience/mt0-base",
    "bigscience/mt0-large",
    "bigscience/mt0-xl",
    # bigscience BLOOMZ
    "bigscience/bloomz-560m",
    "bigscience/bloomz-1b1",
    "bigscience/bloomz-1b7",
    "bigscience/bloomz-3b",
    # T5 and FLAN-T5
    "t5-small",
    "t5-base",
    "t5-large",
    "google/flan-t5-small",
    "google/flan-t5-base",
    "google/flan-t5-large",
    "google/flan-t5-xl",
    # MADLAD-400
    "google/madlad400-3b-mt",
    "jbochi/madlad400-3b-mt",
    # EuroLLM
    "utter-project/EuroLLM-1.7B",
    "utter-project/EuroLLM-1.7B-Instruct",
    # Unbabel Tower
    "Unbabel/Tower-Plus-2B",
    "Unbabel/TowerInstruct-7B-v0.2",
    "Unbabel/TowerInstruct-Mistral-7B-v0.2",
    # Other checkpoints
    "HuggingFaceTB/SmolLM3-3B",
    "winninghealth/WiNGPT-Babel-2",
    "tencent/Hunyuan-MT-7B",
    "openGPT-X/Teuken-7B-instruct-commercial-v0.4",
    "openGPT-X/Teuken-7B-instruct-v0.6",
]
|
| 41 |
allmodels = ["Helsinki-NLP",
|
| 42 |
"Helsinki-NLP/opus-mt-tc-bible-big-mul-mul", "Helsinki-NLP/opus-mt-tc-bible-big-mul-deu_eng_nld",
|
| 43 |
"Helsinki-NLP/opus-mt-tc-bible-big-mul-deu_eng_fra_por_spa", "Helsinki-NLP/opus-mt-tc-bible-big-deu_eng_fra_por_spa-mul",
|
|
|
|
| 63 |
response = httpx.get(url)
|
| 64 |
return response.json()[0][0][0]
|
| 65 |
|
| 66 |
+
def hplt(self, opus=False):
    """Translate ``self.input_text`` with an HPLT checkpoint for the ``self.sl``-``self.tl`` pair.

    Args:
        opus: When true, select the ``-hplt_opus`` checkpoint variant;
            otherwise select the ``-hplt`` variant.

    Returns:
        A ``(translated_text, message)`` tuple. When the language pair is
        not in the supported list, ``translated_text`` is a "not available"
        notice and ``message`` lists the supported pairs; no pipeline is
        created in that case.
    """
    # Supported directions: each of ar, bs, ca, et, eu, fi, ga, gl, hi, hr,
    # is, mt, nn, sq, sw, zh_hant paired with English, both ways.
    hplt_models = (
        'ar-en', 'bs-en', 'ca-en', 'en-ar', 'en-bs', 'en-ca', 'en-et', 'en-eu', 'en-fi',
        'en-ga', 'en-gl', 'en-hi', 'en-hr', 'en-is', 'en-mt', 'en-nn', 'en-sq', 'en-sw',
        'en-zh_hant', 'et-en', 'eu-en', 'fi-en', 'ga-en', 'gl-en', 'hi-en', 'hr-en',
        'is-en', 'mt-en', 'nn-en', 'sq-en', 'sw-en', 'zh_hant-en',
    )

    # Fall back to the raw code when it has no entry in iso1_to_name, so that
    # building the status text can never raise KeyError (e.g. for 'zh_hant',
    # which is presumably absent from an ISO-639-1 keyed map — confirm).
    source_name = iso1_to_name.get(self.sl, self.sl)
    target_name = iso1_to_name.get(self.tl, self.tl)

    pair = f'{self.sl}-{self.tl}'
    if pair not in hplt_models:
        translated_text = f'HPLT model from {source_name} to {target_name} not available!'
        message = f"Available models: {', '.join(hplt_models)}"
        return translated_text, message

    variant = 'hplt_opus' if opus else 'hplt'
    hplt_model = f'HPLT/translate-{pair}-v1.0-{variant}'  # e.g. HPLT/translate-en-hr-v1.0-hplt
    pipe = pipeline("translation", model=hplt_model, device=self.device)
    translation = pipe(self.input_text)
    translated_text = translation[0]['translation_text']
    message = f'Translated from {source_name} to {target_name} with {hplt_model}.'
    return translated_text, message
|
| 85 |
|
| 86 |
@staticmethod
|
| 87 |
def quickmttranslate(model_path, input_text):
|
|
|
|
| 124 |
# Direct translation model
|
| 125 |
if f"{self.sl}-{self.tl}" in quickmt_models:
|
| 126 |
model_path = Translators.quickmtdownload(model_name)
|
| 127 |
+
translated_text = Translators.quickmttranslate(model_path, self.input_text)
|
| 128 |
message = f'Translated from {iso1_to_name[self.sl]} to {iso1_to_name[self.tl]} with {model_name}.'
|
| 129 |
# Pivot language English
|
| 130 |
elif self.sl in available_languages and self.tl in available_languages:
|
|
|
|
| 133 |
entranslation = Translators.quickmttranslate(model_path, self.input_text)
|
| 134 |
model_name = f"quickmt-en-{self.tl}"
|
| 135 |
model_path = Translators.quickmtdownload(model_name)
|
| 136 |
+
translated_text = Translators.quickmttranslate(model_path, entranslation)
|
| 137 |
message = f'Translated from {iso1_to_name[self.sl]} to {iso1_to_name[self.tl]} with pivot language English.'
|
| 138 |
else:
|
| 139 |
+
translated_text = f'Model {model_name} from {iso1_to_name[self.sl]} to {iso1_to_name[self.tl]} not available!'
|
| 140 |
message = f"Available models: {', '.join(quickmt_models)}"
|
| 141 |
+
return translated_text, message
|
| 142 |
|
| 143 |
@staticmethod
|
| 144 |
def download_argos_model(from_code, to_code):
|
|
|
|
| 200 |
# output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
|
| 201 |
output_text = tokenizer.decode(outputs[0][tokenized_chat.shape[-1]:], skip_special_tokens=True) # Decode only the new tokens
|
| 202 |
return output_text
|
| 203 |
+
|
| 204 |
+
def simplepipe(self):
    """Translate ``self.input_text`` with a ``"translation"`` pipeline built from ``self.model_name``.

    Returns:
        ``(translated_text, message)`` on success. If creating or running
        the pipeline raises, ``(error_notice, error)`` where ``error`` is
        the caught exception object.
    """
    try:
        pipe = pipeline("translation", model=self.model_name, device=self.device)
        translation = pipe(self.input_text)
        translated_text = translation[0]['translation_text']
    except Exception as error:
        return f"Error translating with model: {self.model_name}! Try other available language combination or model.", error

    # Built outside the try and with a raw-code fallback: a language code
    # missing from iso1_to_name must not turn a successful translation
    # into the error result above.
    source_name = iso1_to_name.get(self.sl, self.sl)
    target_name = iso1_to_name.get(self.tl, self.tl)
    message = f'Translated from {source_name} to {target_name} with {self.model_name}.'
    return translated_text, message
|
| 212 |
|
| 213 |
def HelsinkiNLP_mulroa(self):
|
| 214 |
try:
|
|
|
|
| 538 |
return translated_text, message_text
|
| 539 |
message_text = f'Translated from {s_language} to {t_language} with {model_name}'
|
| 540 |
translated_text = None
|
| 541 |
+
try:
|
| 542 |
+
if in model_name == "Helsinki-NLP/opus-mt-tc-bible-big-roa-en":
|
| 543 |
+
translated_text, message_text = Translators(model_name, sl, tl, input_text).simplepipe()
|
| 544 |
+
|
| 545 |
+
elif "-mul" in model_name.lower() or "mul-" in model_name.lower() or "-roa" in model_name.lower():
|
| 546 |
translated_text, message_text = Translators(model_name, sl, tl, input_text).HelsinkiNLP_mulroa()
|
| 547 |
|
| 548 |
elif model_name == "Helsinki-NLP":
|
| 549 |
translated_text, message_text = Translators(model_name, sl, tl, input_text).HelsinkiNLP()
|
| 550 |
|
| 551 |
+
elif "HPLT" in model_name:
|
| 552 |
+
if model_name == "HPLT-OPUS":
|
| 553 |
+
translated_text, message = Translators(model_name, sl, tl, input_text).hplt(opus = True)
|
| 554 |
+
else:
|
| 555 |
+
translated_text, message = Translators(model_name, sl, tl, input_text).hplt()
|
| 556 |
+
|
| 557 |
elif model_name == 'Argos':
|
| 558 |
translated_text = Translators(model_name, sl, tl, input_text).argos()
|
| 559 |
|