# English-Faroese / app.py
# Author: barbaroo
# Commit message: Update app.py
# Commit: b8bccd5 (verified)
import gradio as gr
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import torch
import re
import spaces # Import spaces for ZeroGPU compatibility
# Checkpoints used for the two translation directions
FO_EN_CHECKPOINT = "barbaroo/nllb_200_600M_fo_en"
EN_FO_CHECKPOINT = "barbaroo/nllb_200_1.3B_en_fo"

# Faroese -> English: model plus a tokenizer whose source language is Faroese
model_faero_eng = AutoModelForSeq2SeqLM.from_pretrained(FO_EN_CHECKPOINT)
tokenizer_faero_eng = AutoTokenizer.from_pretrained(FO_EN_CHECKPOINT, src_lang="fao_Latn")

# English -> Faroese: model plus a tokenizer whose source language is English
model_eng_faero = AutoModelForSeq2SeqLM.from_pretrained(EN_FO_CHECKPOINT)
tokenizer_eng_faero = AutoTokenizer.from_pretrained(EN_FO_CHECKPOINT, src_lang="eng_Latn")

# Pick CUDA when torch reports it as available, otherwise fall back to CPU,
# and place both models on that device
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
for _loaded_model in (model_faero_eng, model_eng_faero):
    _loaded_model.to(device)
# Sentence boundary: whitespace that follows '.', '?' or '!', unless the
# preceding characters look like a dotted abbreviation ("e.g.") or a short
# capitalised title ("Mr.", "Dr.").
_SENTENCE_BOUNDARY = re.compile(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s')


def _split_oversized_sentence(sentence, max_length, tokenizer):
    """Split one sentence that is too long on its own into word-bounded pieces.

    Words are accumulated greedily while ``tokenizer.encode`` of the piece
    stays within ``max_length``. A single word that exceeds the budget by
    itself is emitted as its own piece, since it cannot be split on whitespace.
    """
    pieces = []
    current = ""
    for word in sentence.split():
        candidate = f"{current} {word}" if current else word
        if current and len(tokenizer.encode(candidate)) > max_length:
            pieces.append(current)
            current = word
        else:
            current = candidate
    if current:
        pieces.append(current)
    return pieces


def chunk_text_simple(text, max_length, tokenizer):
    """Split ``text`` into chunks whose encoded length fits ``max_length``.

    The text is first split into sentences on simple punctuation, and
    consecutive sentences are packed into one chunk for as long as
    ``len(tokenizer.encode(chunk)) <= max_length``.

    Args:
        text: The text to split.
        max_length: Maximum number of items ``tokenizer.encode`` may return
            for a single chunk.
        tokenizer: Any object with an ``encode(str)`` method returning a
            sized sequence.

    Returns:
        A list of non-empty, stripped chunk strings. Empty or whitespace-only
        input yields an empty list. A sentence that does not fit on its own is
        split at word boundaries instead of being passed through oversized.
    """
    def fits(candidate):
        return len(tokenizer.encode(candidate)) <= max_length

    chunks = []
    current_chunk = ""
    for sentence in _SENTENCE_BOUNDARY.split(text):
        if not sentence.strip():
            # Nothing to translate here (empty input, repeated whitespace).
            continue

        if current_chunk:
            candidate = current_chunk + " " + sentence
            if fits(candidate):
                current_chunk = candidate
                continue
            # The sentence does not fit any more: close the running chunk.
            chunks.append(current_chunk.strip())
            current_chunk = ""

        if fits(sentence):
            current_chunk = sentence
        else:
            # Too long even alone: emit all full pieces and keep the last one
            # open so that following sentences can still be packed onto it.
            *full_pieces, current_chunk = _split_oversized_sentence(
                sentence, max_length, tokenizer
            )
            chunks.extend(full_pieces)

    if current_chunk:
        chunks.append(current_chunk.strip())
    return chunks
@spaces.GPU  # Apply ZeroGPU decorator to ensure the function uses GPU when available
def translate_long_text(text, direction, max_length=256):  # Reduced max_length to leave room for output
    """Translate ``text`` between Faroese and English, one chunk at a time.

    Args:
        text: Text to translate. ``None``, empty or whitespace-only input
            returns ``""`` without running the model.
        direction: ``"Faroese to English"`` selects the Faroese->English
            model and tokenizer; any other value selects English->Faroese.
        max_length: Token budget used for chunking the input, for truncating
            each encoded chunk, and as the ``max_length`` passed to
            ``generate``.

    Returns:
        The translated chunks joined with single spaces.
    """
    # Nothing to translate: do not send empty input through the model.
    if not text or not text.strip():
        return ""

    # Select the appropriate model and tokenizer
    if direction == "Faroese to English":
        model, tokenizer = model_faero_eng, tokenizer_faero_eng
    else:
        model, tokenizer = model_eng_faero, tokenizer_eng_faero

    translated_chunks = []
    # Chunk the text based on the max token limit, then translate each chunk
    for chunk in chunk_text_simple(text, max_length, tokenizer):
        if not chunk.strip():
            # Defensive: never run generation on an empty chunk.
            continue
        inputs = tokenizer(
            chunk, return_tensors="pt", max_length=max_length, truncation=True
        ).to(device)
        # Pass the full encoding so the attention mask reaches generate(),
        # not just input_ids.
        # NOTE(review): NLLB checkpoints normally take forced_bos_token_id for
        # the target language; presumably these fine-tuned checkpoints already
        # default to the right one - confirm.
        outputs = model.generate(
            **inputs, num_beams=4, max_length=max_length, early_stopping=True
        )
        translated_chunks.append(
            tokenizer.decode(outputs[0], skip_special_tokens=True)
        )

    # Combine translated chunks
    return " ".join(translated_chunks)
# Build the UI components first, then wire them into the Gradio interface
source_textbox = gr.Textbox(label="Input Text")
direction_radio = gr.Radio(
    ["Faroese to English", "English to Faroese"],
    label="Translation Direction",
)
translation_textbox = gr.Textbox(label="Translated Text", lines=20)  # 20-line output box

iface = gr.Interface(
    fn=translate_long_text,
    inputs=[source_textbox, direction_radio],
    outputs=translation_textbox,
    title="Faroese-English Translator",
    description="Translate between Faroese and English with support for longer texts.",
)

# Start serving the Gradio Space
iface.launch()