File size: 4,728 Bytes
147da27
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
# Contains the NLP translation code and the helpers for the "Translation" database table.
import re
import sqlite3
from flask import g
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer

# Checkpoint name shared by the model and its tokenizer, so the two cannot drift apart.
_M2M100_CHECKPOINT = "facebook/m2m100_1.2B"

# Both objects are created once, when this module is imported.
model = M2M100ForConditionalGeneration.from_pretrained(_M2M100_CHECKPOINT)  # Translation model
tokenizer = M2M100Tokenizer.from_pretrained(_M2M100_CHECKPOINT)  # Matching tokenizer

# Main function of the translation feature. Performs translation!
def translate_text(input_text, source_language, target_language):
    """Translate a user story from one language into another.

    Args:
        input_text: The text to translate. It must be non-empty and must be
            accepted by validate_input().
        source_language: Language code of input_text; assigned to the
            tokenizer's ``src_lang``.
        target_language: Language code to translate into; resolved through
            ``tokenizer.get_lang_id()``.

    Returns:
        The first decoded sequence generated by the model, with special
        tokens stripped.

    Raises:
        ValueError: With the message "Empty input!" when input_text is None
            or only whitespace, or "Incorrect format!" when validate_input()
            rejects it.
    """
    # Validate before touching the tokenizer: it is a module-level object, so
    # a rejected request must not change its source language.
    if input_text is None or not input_text.strip():
        raise ValueError("Empty input!")

    # Validate that the input is in the correct format
    if not validate_input(input_text):
        raise ValueError("Incorrect format!")

    # Tell the tokenizer which language the input is written in
    tokenizer.src_lang = source_language

    # Creates encoded text
    encoded_text = tokenizer(input_text, return_tensors="pt")

    # Forcing the first generated token to the target language id selects
    # the output language; generation is capped at 512 new tokens.
    generated_tokens = model.generate(
        **encoded_text,
        forced_bos_token_id=tokenizer.get_lang_id(target_language),
        max_new_tokens=512,
    )

    # Decode the generated tokens and return the first (only) sequence
    return tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]


# Helper function for displaying appropriate language names in flash messages.
# The known codes are kept in a small table that is scanned in order.
def switch(lang):
    """Return the display name for a language code, or None if it is unknown."""
    known_languages = (
        ("en", "English"),
        ("zh", "Chinese"),
        ("ms", "Malay"),
        ("ta", "Tamil"),
        ("th", "Thai"),
    )
    for code, display_name in known_languages:
        if lang == code:
            return display_name
    return None


# User Input Format Validation Function for all 5 supported languages
def validate_input(input_text):
    """Return True when input_text matches at least one user-story template.

    One template exists per language (English, Chinese, Malay, Tamil, Thai).
    Each is searched anywhere in the text with re.DOTALL, so the text may
    span several lines.

    NOTE(review): in the ``(,|.)+`` separator the ``.`` is unescaped, so it
    matches any character, not just a literal full stop. Confirm whether a
    literal comma/period was intended before tightening it.
    """
    story_templates = (
        # English
        r'As a (?P<role>[^,.]+), I want to (?P<goal>[^,.]+)(,|.)+so that (?P<benefit>.+)',
        # Chinese
        r'作为(?P<role>[^,.]+),我想要(?P<goal>[^,.]+)(,|。)+以便(?P<benefit>.+)',
        # Malay
        r'Sebagai(?P<role>[^,.]+), saya mahu(?P<goal>[^,.]+)(,|.)+supaya(?P<benefit>.+)',
        # Tamil
        r'என(?P<role>[^,.]+) எனக்கு வேண்டும்(?P<goal>[^,.]+)(,|.)+அதனால்(?P<benefit>.+) பயன்படுத்தி வைக்கும்',
        # Thai
        r'ในฐานะ(?P<role>[^,.]+) ฉันต้องการ(?P<goal>[^,.]+)(,|.)+เพื่อที่ฉัน(?P<benefit>.+)',
    )

    # A single matching template is enough for the text to be valid.
    return any(
        re.search(template, input_text, flags=re.DOTALL) is not None
        for template in story_templates
    )

# English-only variant of validate_input (commented out):
#def validate_input(input_text):
    #pattern = r'As a (?P<role>[^,.]+), I want to (?P<goal>[^,.]+)(,|.)+so that (?P<benefit>.+)'
    #match = re.search(pattern, input_text, flags=re.DOTALL)
    #return bool(match)


# Function to grab all contents in the "Translation" table (except for unique ids)
# If adding any additional attributes to the table, this has to be updated accordingly
def getTranslatedContents():
    """Return every (input_text, translated_text) row of the Translation table.

    The SQLite connection is cached on Flask's ``g`` object as ``_database``.
    It is opened against 'Refineverse.db' only when no connection is cached.

    Returns:
        A list of ``(input_text, translated_text)`` tuples, as produced by
        ``cursor.fetchall()``.
    """
    db = getattr(g, '_database', None)  # None when no connection has been cached on 'g' yet
    if db is None:
        # Open the connection and cache it on 'g' for later calls.
        db = g._database = sqlite3.connect('Refineverse.db')

    # The query must run whether the connection was just opened or was already
    # cached; otherwise there is nothing to return on the cached path.
    cursor = db.cursor()
    try:
        cursor.execute("SELECT input_text, translated_text FROM Translation")
        return cursor.fetchall()
    finally:
        cursor.close()


# Function to insert a new row into the "Translation" table
def insertTranslationRow(input_text, translated_text):
    """Insert one (input_text, translated_text) row into the Translation table.

    A dedicated connection to 'Refineverse.db' is opened for the insert and
    always closed afterwards.

    Args:
        input_text: The original text that was submitted for translation.
        translated_text: The translation to store alongside it.
    """
    conn = sqlite3.connect('Refineverse.db')
    try:
        # Used as a context manager, a sqlite3 connection commits on success
        # and rolls back on an exception. It does NOT close the connection,
        # which is why the close happens in the 'finally' below.
        with conn:
            conn.execute(
                "INSERT INTO Translation (input_text, translated_text) VALUES (?, ?)",
                (input_text, translated_text),
            )
    finally:
        conn.close()