File size: 2,143 Bytes
f038f7c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 |
import gradio as gr
from transformers import MarianMTModel, MarianTokenizer
import torch
from nltk.tokenize import sent_tokenize, LineTokenizer
import math
import nltk
# Download the 'punkt_tab' NLTK resource at import time; it is the data used
# by NLTK's sentence tokenizer (sent_tokenize) called during translation.
nltk.download('punkt_tab')
# Load the Marian tokenizer and model once at import time so every call to the
# translation function reuses the same objects.
# NOTE(review): "opus-mt-id-en" has no organisation prefix; presumably it is a
# local directory or stands for "Helsinki-NLP/opus-mt-id-en" -- confirm that
# from_pretrained can resolve it in the deployment environment.
model_name = "opus-mt-id-en"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)
# Translate Indonesian text to English, paragraph by paragraph
def translate_id_en(text):
    """Translate Indonesian text to English with the module-level Marian model.

    The input is split into paragraphs (one per line) and each paragraph into
    sentences. Sentences are translated in batches, re-joined with single
    spaces, and the translated paragraphs are joined with a blank line.

    Args:
        text: Indonesian text to translate. May be empty or ``None``.

    Returns:
        The English translation as a string, or ``""`` when there is nothing
        to translate.
    """
    # Guard: None would crash the line tokenizer, and empty input needs no
    # model call at all.
    if not text:
        return ""

    batch_size = 8  # number of sentences sent to the model per generate() call
    translated_paragraphs = []

    for paragraph in LineTokenizer().tokenize(text):
        sentences = sent_tokenize(paragraph)
        translated = []

        # Walk the sentence list in fixed-size slices
        for start in range(0, len(sentences), batch_size):
            sent_batch = sentences[start:start + batch_size]
            # padding=True lets sentences of different lengths share one
            # tensor; truncation=True cuts over-long sentences.
            model_inputs = tokenizer(
                sent_batch, return_tensors="pt", padding=True, truncation=True
            )
            # Inference only: no gradients needed
            with torch.no_grad():
                generated = model.generate(**model_inputs)
            # Decode the generated token ids back into text
            translated.extend(
                tokenizer.batch_decode(generated, skip_special_tokens=True)
            )

        translated_paragraphs.append(" ".join(translated))

    # Combine all paragraphs into the final translated text
    return "\n\n".join(translated_paragraphs)
# Build the web UI: one Indonesian input box wired to one English output box
iface = gr.Interface(
    title="Indonesian to English Translator",
    description="Translate Indonesian text to English using the opus-mt-id-en model.",
    fn=translate_id_en,  # callback that performs the translation
    inputs=gr.Textbox(
        label="Input (Indonesian)",
        placeholder="Enter Indonesian text...",
        lines=12,
    ),
    outputs=gr.Textbox(
        label="Output (English)",
        lines=12,
    ),
)
# Start serving the interface
iface.launch()
|