import gradio as gr
import torch
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
import re

# Select the GPU if one is available, otherwise fall back to CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load model and tokenizer
model_name = "facebook/mbart-large-50-many-to-many-mmt"
model = MBartForConditionalGeneration.from_pretrained(model_name).to(device)
tokenizer = MBart50TokenizerFast.from_pretrained(model_name)

# Set languages
src_lang, tgt_lang = "en_XX", "ta_IN"

# Define a set of technical terms (expand this list as needed)
technical_terms = {
    "machine translation", "natural language processing", "nlp", "transformer architecture",
    "machine learning", "deep learning", "artificial intelligence", "ai", "neural network",
    "algorithms", "data science", "big data", "cloud computing", "internet of things", "iot",
    "blockchain", "cybersecurity", "virtual reality", "vr", "augmented reality", "ar",
    "robotics", "automation", "quantum computing", "5g", "edge computing", "devops",
    "microservices", "api", "serverless", "container", "docker", "kubernetes", "ml",
    "computer vision", "natural language understanding", "nlu", "speech recognition",
    "sentiment analysis", "chatbot", "reinforcement learning", "supervised learning",
    "unsupervised learning", "convolutional neural network", "cnn", "recurrent neural network", "rnn",
    "long short-term memory", "lstm", "generative adversarial network", "gan",
    "transfer learning", "federated learning", "explainable ai", "xai"
}

# One combined pattern, longest term first, so multi-word terms win over their
# substrings and one left-to-right pass never re-wraps already-wrapped text
term_pattern = re.compile(
    r'\b(' + '|'.join(re.escape(t) for t in sorted(technical_terms, key=len, reverse=True)) + r')\b',
    re.IGNORECASE
)

def preprocess_text(text):
    # Wrap technical terms in <keep>...</keep>; m.group() keeps original casing
    return term_pattern.sub(lambda m: f"<keep>{m.group()}</keep>", text)
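
# A quick illustration of the wrapper's behavior (hypothetical input string):
#   preprocess_text("Docker and Kubernetes power DevOps")
#   -> "<keep>Docker</keep> and <keep>Kubernetes</keep> power <keep>DevOps</keep>"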

def postprocess_text(text):
    # Strip surviving <keep>...</keep> markers and highlight the preserved term
    # as **term**. Note: <keep> is ordinary text to mBART, not a registered
    # special token, so the model may translate, move, or drop the markers;
    # only tags that come through the translation intact are recovered here.
    return re.sub(r'<keep>(.*?)</keep>', r'**\1**', text)
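
# For example (hypothetical translated string; surrounding Tamil text omitted):
#   postprocess_text("... <keep>API</keep> ...") -> "... **API** ..."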

def translate(text, src_lang=src_lang, tgt_lang=tgt_lang):
    # Wrap technical terms before translation
    preprocessed_text = preprocess_text(text)

    # Tell the tokenizer the source language so it prepends the correct
    # language code token to the input
    tokenizer.src_lang = src_lang

    # Tokenize the preprocessed text and move the tensors to the model's device
    inputs = tokenizer(preprocessed_text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    
    # Generate translation
    translated = model.generate(
        **inputs,
        forced_bos_token_id=tokenizer.lang_code_to_id[tgt_lang],
        max_length=512,
        num_beams=5,
        length_penalty=1.0,
        early_stopping=True
    )
    
    # Decode the generated tokens
    translated_text = tokenizer.batch_decode(translated, skip_special_tokens=True)[0]
    
    # Postprocess the translated text
    return postprocess_text(translated_text)
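
# Usage sketch (hypothetical input; the Tamil output is model-dependent):
#   translate("Machine learning powers modern chatbots")
#   returns the Tamil translation, with any technical terms that survived
#   translation rendered as **term** by postprocess_text()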

# Gradio interface: the wrapper pins translate() to the default
# en_XX -> ta_IN language pair, since Gradio passes only the textbox value
def gradio_translate(text):
    return translate(text)

iface = gr.Interface(
    fn=gradio_translate,
    inputs=gr.Textbox(lines=5, label="English Text"),
    outputs=gr.Textbox(lines=5, label="Tamil Translation"),
    title="English to Tamil Translation with Technical Terms Preserved",
    description="This app translates English text to Tamil while preserving technical terms."
)

iface.launch()