Spaces: Running on Zero
import gradio as gr
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import torch
import re
import spaces  # Import spaces for ZeroGPU compatibility
# Load the models and tokenizers for each translation direction
# Faroese to English
model_faero_eng = AutoModelForSeq2SeqLM.from_pretrained("barbaroo/nllb_200_600M_fo_en")
tokenizer_faero_eng = AutoTokenizer.from_pretrained("barbaroo/nllb_200_600M_fo_en", src_lang="fao_Latn")

# English to Faroese
model_eng_faero = AutoModelForSeq2SeqLM.from_pretrained("barbaroo/nllb_200_1.3B_en_fo")
tokenizer_eng_faero = AutoTokenizer.from_pretrained("barbaroo/nllb_200_1.3B_en_fo", src_lang="eng_Latn")

# Check if a GPU is available and move the models to it
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_faero_eng.to(device)
model_eng_faero.to(device)
# Split text into sentence-sized chunks based on simple punctuation
def chunk_text_simple(text, max_length, tokenizer):
    # Split on sentence-ending punctuation (period, question mark, or exclamation mark),
    # avoiding splits after abbreviations and initials
    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s', text)
    chunks = []
    current_chunk = ""
    for sentence in sentences:
        # Combine sentences until adding another would exceed max_length tokens
        if len(tokenizer.encode(current_chunk + " " + sentence)) <= max_length:
            current_chunk += " " + sentence
        else:
            chunks.append(current_chunk.strip())
            current_chunk = sentence
    if current_chunk:
        chunks.append(current_chunk.strip())
    return chunks
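
# Illustrative example (not part of the app): with the default max_length of 256,
# chunk_text_simple("First sentence. Second sentence? Third!", 256, tokenizer_eng_faero)
# would typically return a single chunk, since all three sentences fit in one 256-token window.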
# Apply the ZeroGPU decorator so the function is allocated a GPU when it runs
@spaces.GPU
def translate_long_text(text, direction, max_length=256):  # Keep max_length modest to leave room for the output
    # Select the appropriate model and tokenizer for the chosen direction
    if direction == "Faroese to English":
        model = model_faero_eng
        tokenizer = tokenizer_faero_eng
    else:
        model = model_eng_faero
        tokenizer = tokenizer_eng_faero

    # Chunk the text so each piece stays within the token limit
    chunks = chunk_text_simple(text, max_length, tokenizer)

    translated_chunks = []
    for chunk in chunks:
        # Encode and translate each chunk
        inputs = tokenizer(chunk, return_tensors="pt", max_length=max_length, truncation=True).to(device)
        outputs = model.generate(**inputs, num_beams=4, max_length=max_length, early_stopping=True)
        translated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        translated_chunks.append(translated_text)

    # Combine the translated chunks into a single string
    return " ".join(translated_chunks)
# Gradio interface with a scrollable output box
iface = gr.Interface(
    fn=translate_long_text,
    inputs=[
        gr.Textbox(label="Input Text"),
        gr.Radio(["Faroese to English", "English to Faroese"], label="Translation Direction")
    ],
    outputs=gr.Textbox(label="Translated Text", lines=20),  # Scrollable output box
    title="Faroese-English Translator",
    description="Translate between Faroese and English, with support for longer texts."
)
# Launch the Gradio app in the Space
iface.launch()
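
# Quick local sanity check (illustrative only; the Space itself just calls iface.launch()).
# Uncomment to translate a sample string directly, e.g. when running outside ZeroGPU:
# sample = "This is a short test. It should be translated into Faroese."
# print(translate_long_text(sample, "English to Faroese"))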