|
|
|
import re |
|
import sqlite3 |
|
from flask import g |
|
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer |
|
|
|
# Both the model and its tokenizer are loaded once, at import time, from the
# same pretrained checkpoint so that they stay in sync with each other.
_M2M100_CHECKPOINT = "facebook/m2m100_1.2B"

model = M2M100ForConditionalGeneration.from_pretrained(_M2M100_CHECKPOINT)
tokenizer = M2M100Tokenizer.from_pretrained(_M2M100_CHECKPOINT)
|
|
|
|
|
def translate_text(input_text, source_language, target_language):
    """Translate a user story from one language into another with M2M100.

    The text is validated before anything else happens, so a rejected call
    never modifies the module-level tokenizer.

    Args:
        input_text: The user story to translate.
        source_language: Language code assigned to ``tokenizer.src_lang``.
        target_language: Language code handed to ``tokenizer.get_lang_id`` to
            force the first generated token.

    Returns:
        The first decoded sequence produced by the model, with special tokens
        removed.

    Raises:
        ValueError: With message "Empty input!" when ``input_text`` is
            ``None``, empty or whitespace-only; with message
            "Incorrect format!" when ``validate_input`` rejects the text.
    """
    # Guard clauses first: None is treated like empty text so callers only
    # ever have to handle ValueError for bad input.
    if input_text is None or not input_text.strip():
        raise ValueError("Empty input!")

    if not validate_input(input_text):
        raise ValueError("Incorrect format!")

    # src_lang is state on the tokenizer shared by every caller; set it only
    # once the request is known to be valid.
    tokenizer.src_lang = source_language

    encoded_text = tokenizer(input_text, return_tensors="pt")

    generated_tokens = model.generate(
        **encoded_text,
        forced_bos_token_id=tokenizer.get_lang_id(target_language),
        max_new_tokens=512,
    )

    # batch_decode returns one string per generated sequence; a single input
    # was encoded, so the first entry is the translation.
    return tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
|
|
|
|
|
|
|
|
|
def switch(lang):
    """Return the English display name for a language code.

    Recognised codes are "en", "zh", "ms", "ta" and "th"; any other value
    yields ``None``.
    """
    display_names = {
        "en": "English",
        "zh": "Chinese",
        "ms": "Malay",
        "ta": "Tamil",
        "th": "Thai",
    }
    return display_names.get(lang)
|
|
|
|
|
|
|
# User-story templates, one per supported language (English, Chinese, Malay,
# Tamil, Thai), compiled once at import time.
#
# The separator between the goal and the benefit clause used to be written as
# `(,|.)+` in the English, Malay, Tamil and Thai patterns. The unescaped `.`
# matches any character, so both alternatives matched a comma and a run of
# commas with no closing keyword made the regex engine backtrack
# exponentially. With DOTALL that group accepts any non-empty text, so `.+`
# accepts exactly the same inputs without the blow-up.
# NOTE(review): the separator was probably meant to be a literal comma or
# period (`[,.]+`); it is kept permissive here so that input accepted today
# is not rejected - confirm the intended strictness.
_USER_STORY_PATTERNS = tuple(
    re.compile(template, flags=re.DOTALL)
    for template in (
        r'As a (?P<role>[^,.]+), I want to (?P<goal>[^,.]+).+so that (?P<benefit>.+)',
        r'作为(?P<role>[^,.]+),我想要(?P<goal>[^,.]+)(,|。)+以便(?P<benefit>.+)',
        r'Sebagai(?P<role>[^,.]+), saya mahu(?P<goal>[^,.]+).+supaya(?P<benefit>.+)',
        r'என(?P<role>[^,.]+) எனக்கு வேண்டும்(?P<goal>[^,.]+).+அதனால்(?P<benefit>.+) பயன்படுத்தி வைக்கும்',
        r'ในฐานะ(?P<role>[^,.]+) ฉันต้องการ(?P<goal>[^,.]+).+เพื่อที่ฉัน(?P<benefit>.+)',
    )
)


def validate_input(input_text):
    """Check whether the text contains a user story in a supported template.

    The text is searched (not anchored) for a role / goal / benefit sentence
    in any of the English, Chinese, Malay, Tamil or Thai templates.

    Args:
        input_text: The text to check.

    Returns:
        ``True`` if at least one template is found in the text, otherwise
        ``False``.
    """
    return any(pattern.search(input_text) for pattern in _USER_STORY_PATTERNS)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def getTranslatedContents():
    """Return every (input_text, translated_text) row of the Translation table.

    The SQLite connection is stored on Flask's ``g`` object as ``_database``
    and is opened on first use.
    """
    connection = getattr(g, '_database', None)
    if connection is None:
        connection = sqlite3.connect('Refineverse.db')
        g._database = connection

    query = "SELECT input_text, translated_text FROM Translation"
    return connection.execute(query).fetchall()
|
|
|
|
|
|
|
|
|
def insertTranslationRow(input_text, translated_text):
    """Store one translation in the Translation table of Refineverse.db.

    Args:
        input_text: The original text.
        translated_text: The translation of ``input_text``.

    Raises:
        sqlite3.Error: If the insert fails; the transaction is rolled back
            before the error propagates.
    """
    conn = sqlite3.connect('Refineverse.db')
    try:
        # `with conn` commits on success and rolls back on error, but it does
        # not close the connection - hence the explicit close() below.
        with conn:
            conn.execute(
                "INSERT INTO Translation (input_text, translated_text) VALUES (?, ?)",
                (input_text, translated_text),
            )
    finally:
        conn.close()
|
|