# Contains only the NLP translation code.
import re
import sqlite3

from flask import g
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer

# Model and tokenizer are loaded once, at import time, and shared by every call.
model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_1.2B")
tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_1.2B")


def translate_text(input_text, source_language, target_language):
    """Translate ``input_text`` from ``source_language`` into ``target_language``.

    Args:
        input_text: Text to translate; must pass ``validate_input``.
        source_language: Language code assigned to ``tokenizer.src_lang``.
        target_language: Language code passed to ``tokenizer.get_lang_id`` to
            pick the token the generated sequence is forced to start with.

    Returns:
        The first decoded sequence produced by the model, with special
        tokens stripped.

    Raises:
        ValueError: "Empty input!" when the text is empty or whitespace only;
            "Incorrect format!" when ``validate_input`` rejects the text.
    """
    # The source language is set on the shared tokenizer before any checks run.
    tokenizer.src_lang = source_language

    # Guard clauses: reject blank text first, then text in the wrong format.
    if not input_text.strip():
        raise ValueError("Empty input!")
    if not validate_input(input_text):
        raise ValueError("Incorrect format!")

    # Encode the text as PyTorch tensors for the model.
    model_inputs = tokenizer(input_text, return_tensors="pt")

    # Generate at most 512 new tokens, starting from the target-language token.
    output_ids = model.generate(
        **model_inputs,
        forced_bos_token_id=tokenizer.get_lang_id(target_language),
        max_new_tokens=512,
    )

    # Only the first decoded sequence is returned.
    decoded = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    return decoded[0]


# Helper function for displaying appropriate language names in flash messages.
# Note: Python does not have a built-in switch function, so this is just a rough implementation of the logic.
def switch(lang):
    """Map a language code to its English display name.

    Args:
        lang: One of "en", "zh", "ms", "ta", "th".

    Returns:
        The display name, or None when the code is not one of the five above.
    """
    display_names = {
        "en": "English",
        "zh": "Chinese",
        "ms": "Malay",
        "ta": "Tamil",
        "th": "Thai",
    }
    return display_names.get(lang)


# User-story templates, one per supported language, compiled once at import.
# Each pattern names its three captured parts: role, goal and benefit.
# Between goal and benefit the English, Malay, Tamil and Thai patterns use
# ``.+``: with re.DOTALL this accepts exactly what the former ``(,|.)+`` did
# (the unescaped ``.`` already matched any character) but avoids the
# exponential backtracking caused by two alternatives both matching a comma.
_STORY_PATTERNS = tuple(
    re.compile(pattern, flags=re.DOTALL)
    for pattern in (
        # English
        r'As a (?P<role>[^,.]+), I want to (?P<goal>[^,.]+).+so that (?P<benefit>.+)',
        # Chinese
        r'作为(?P<role>[^,.]+),我想要(?P<goal>[^,.]+)(,|。)+以便(?P<benefit>.+)',
        # Malay
        r'Sebagai(?P<role>[^,.]+), saya mahu(?P<goal>[^,.]+).+supaya(?P<benefit>.+)',
        # Tamil
        r'என(?P<role>[^,.]+) எனக்கு வேண்டும்(?P<goal>[^,.]+).+அதனால்(?P<benefit>.+) பயன்படுத்தி வைக்கும்',
        # Thai
        r'ในฐานะ(?P<role>[^,.]+) ฉันต้องการ(?P<goal>[^,.]+).+เพื่อที่ฉัน(?P<benefit>.+)',
    )
)


def validate_input(input_text):
    """Check whether the text follows a user-story template in any language.

    Args:
        input_text: The text to check.

    Returns:
        True if at least one of the English, Chinese, Malay, Tamil or Thai
        patterns is found anywhere in ``input_text``, otherwise False.
    """
    return any(pattern.search(input_text) for pattern in _STORY_PATTERNS)


# English-only variant, kept for reference:
# def validate_input(input_text):
#     pattern = r'As a (?P<role>[^,.]+), I want to (?P<goal>[^,.]+).+so that (?P<benefit>.+)'
#     match = re.search(pattern, input_text, flags=re.DOTALL)
#     return bool(match)


def getTranslatedContents():
    """Fetch every row of the "Translation" table, without the unique ids.

    If additional attributes are added to the table, the SELECT below has to
    be updated accordingly.

    Returns:
        The list returned by ``fetchall()``; each row holds ``input_text``
        and ``translated_text``, in that order.
    """
    # Reuse the connection stored on flask's ``g`` if there is one; otherwise
    # open a new connection and store it there.
    db = getattr(g, '_database', None)
    if db is None:
        db = g._database = sqlite3.connect('Refineverse.db')

    cursor = db.cursor()
    try:
        cursor.execute("SELECT input_text, translated_text FROM Translation")
        return cursor.fetchall()
    finally:
        # Release the cursor; the connection itself stays open on ``g``.
        cursor.close()


def insertTranslationRow(input_text, translated_text):
    """Insert one row into the "Translation" table.

    A dedicated connection is opened for the insert rather than using the one
    stored on flask's ``g``.

    Args:
        input_text: The original text.
        translated_text: The translation of ``input_text``.
    """
    conn = sqlite3.connect('Refineverse.db')
    try:
        # ``with conn`` commits on success and rolls back on an exception.
        # It does NOT close the connection, hence the explicit close() below.
        with conn:
            conn.execute(
                "INSERT INTO Translation (input_text, translated_text) VALUES (?, ?)",
                (input_text, translated_text),
            )
    finally:
        conn.close()