Spaces:

potsawee
/

mt5-translate-summ

Paused

File size: 2,901 Bytes

ada39d6
 
6acc418
ada39d6
 
 
 
 
 
 
 
 
 
 
6acc418
ada39d6
 
 
 
 
 
6acc418
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ada39d6
6acc418
 
 
 
 
 
 
ada39d6
 
 
 
6acc418
ada39d6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3bfc23d
ada39d6

import gradio as gr
import random
import spacy
import torch
from transformers import MT5Tokenizer, MT5ForConditionalGeneration
# Run on the GPU when one is visible to torch, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# One tokenizer (loaded from the translation checkpoint) is used for both models.
tokenizer = MT5Tokenizer.from_pretrained("potsawee/mt5-english-thai-large-translation")

# Each model is loaded, switched to eval mode and moved to `device` in one
# chain; `eval()` and `to()` both return the module itself.
translator = (
    MT5ForConditionalGeneration.from_pretrained("potsawee/mt5-english-thai-large-translation")
    .eval()
    .to(device)
)
summarizer = (
    MT5ForConditionalGeneration.from_pretrained("potsawee/mt5-english-thai-large-summarization")
    .eval()
    .to(device)
)

# spaCy English pipeline; generate_output uses it to split text into sentences.
nlp = spacy.load("en_core_web_sm")

def _run_model(model, text):
    """Tokenize *text*, generate with *model*, and return the decoded string.

    The input is truncated to 1024 tokens and at most 256 new tokens are
    generated. Uses the module-level ``tokenizer`` and ``device``.
    """
    inputs = tokenizer(
        [text],
        padding="longest",
        max_length=1024,
        truncation=True,
        return_tensors="pt",
    ).to(device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=256,
    )
    # The batch holds exactly one sequence, so only outputs[0] is decoded.
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


def generate_output(
    task,
    text,
):
    """Translate or summarize English *text* with the loaded MT5 models.

    Args:
        task: Either ``"Translation"`` or ``"Summarization"``.
        text: The input text. ``None``, empty, or whitespace-only input
            yields an empty string without invoking a model.

    Returns:
        For ``"Translation"``, the per-sentence outputs of ``translator``
        joined by single spaces; for ``"Summarization"``, the output of
        ``summarizer`` on the whole text.

    Raises:
        ValueError: If *task* is not one of the two supported tasks.
    """
    # Validate the task first so an unknown task always raises, regardless
    # of what the text contains.
    if task not in ("Translation", "Summarization"):
        raise ValueError("task undefined!")

    # Nothing to process: avoid feeding an empty sequence to the model.
    if not text or not text.strip():
        return ""

    if task == "Translation":
        # Translate sentence by sentence; segments that are blank after
        # stripping are skipped rather than sent to the model.
        sentences = (sent.text.strip() for sent in nlp(text).sents)
        return " ".join(
            _run_model(translator, sentence) for sentence in sentences if sentence
        )

    # Summarization runs on the whole text in a single pass.
    return _run_model(summarizer, text)

TASKS = ["Translation", "Summarization"]

# Input widgets, listed in the positional order generate_output(task, text) takes them.
_task_selector = gr.components.Radio(label="Task", choices=TASKS, value="Translation")
_source_box = gr.components.Textbox(label="Text (in English)", lines=10)

# Single output widget showing the generated text.
_result_box = gr.Textbox(label="Text (in Thai)", lines=4)

demo = gr.Interface(
    fn=generate_output,
    inputs=[_task_selector, _source_box],
    outputs=_result_box,
    # No examples are configured for this demo, so example caching stays off.
    cache_examples=False,
    title="English🇬🇧 to Thai🇹🇭 | Translation or Summarization",
    description="Provide some text (in English) & select one of the tasks (Translation or Summarization). Note that currently the model only supports text up to 1024 tokens. The base architecture is mt5-large with the embeddings filtered to only English and Thai tokens and fine-tuned to XSum (Eng2Thai) Dataset (https://huggingface.co/datasets/potsawee/xsum_eng2thai). This is only after training for 1 epoch of xsum (the quality is not production-ready), just a quick proof-of-concept about fine-tuning on translated texts.",
    allow_flagging="never",
)

# Start the Gradio server for the interface defined above.
demo.launch()