Spaces:

potsawee
/

mt5-translate-summ

Paused

App Files Files Community

mt5-translate-summ / app.py

potsawee

use sentence split for translation

6acc418 10 months ago

raw history blame contribute delete

No virus

2.9 kB

	import gradio as gr
	import random
	import spacy
	import torch
	from transformers import MT5Tokenizer, MT5ForConditionalGeneration
	# Use the GPU when PyTorch can see one; otherwise run everything on the CPU.
	device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

	# One tokenizer, loaded from the translation checkpoint, is used for every
	# model call in this script.
	tokenizer = MT5Tokenizer.from_pretrained("potsawee/mt5-english-thai-large-translation")

	# Load each model, switch it to evaluation mode and move it to `device`.
	translator = MT5ForConditionalGeneration.from_pretrained(
	    "potsawee/mt5-english-thai-large-translation"
	).eval().to(device)
	summarizer = MT5ForConditionalGeneration.from_pretrained(
	    "potsawee/mt5-english-thai-large-summarization"
	).eval().to(device)

	# spaCy pipeline for the English input text.
	nlp = spacy.load("en_core_web_sm")

	def _generate_text(model, text):
	    """Tokenize ``text``, run ``model.generate`` on it and decode the result.

	    The input is truncated to 1024 tokens and at most 256 new tokens are
	    generated. Uses the module-level ``tokenizer`` and ``device``.
	    """
	    inputs = tokenizer(
	        [text],
	        padding="longest",
	        max_length=1024,
	        truncation=True,
	        return_tensors="pt",
	    ).to(device)
	    outputs = model.generate(
	        **inputs,
	        max_new_tokens=256,
	    )
	    # A single text was submitted, so only the first output sequence exists.
	    return tokenizer.decode(outputs[0], skip_special_tokens=True)


	def generate_output(
	    task,
	    text,
	):
	    """Translate or summarize ``text`` according to ``task``.

	    Args:
	        task: ``'Translation'`` or ``'Summarization'``.
	        text: The input text to process.

	    Returns:
	        For ``'Translation'``: the text is split into sentences with the
	        module-level spaCy pipeline, each non-blank sentence is passed
	        through ``translator`` separately, and the outputs are joined with
	        a single space. For ``'Summarization'``: the output of
	        ``summarizer`` for the whole text. An empty string is returned when
	        ``text`` is empty or whitespace-only.

	    Raises:
	        ValueError: If ``task`` is neither of the two supported values.
	    """
	    # Validate the task first so an unknown task always raises, whatever the text.
	    if task not in ('Translation', 'Summarization'):
	        raise ValueError("task undefined!")

	    # Nothing to process: avoid asking the model to generate from an empty input.
	    if not text or not text.strip():
	        return ""

	    if task == 'Translation':
	        # Sentences are translated one at a time; blank ones are skipped.
	        sentences = (sent.text.strip() for sent in nlp(text).sents)
	        return " ".join(
	            _generate_text(translator, sentence)
	            for sentence in sentences
	            if sentence
	        )

	    return _generate_text(summarizer, text)

	# Task names offered in the UI; the selected one is passed to
	# `generate_output` as its first argument.
	TASKS = ["Translation", "Summarization"]

	# Gradio UI: a task selector and an English text box in, one Thai text box out.
	demo = gr.Interface(
	    fn=generate_output,
	    inputs=[
	        gr.Radio(label="Task", choices=TASKS, value="Translation"),
	        gr.Textbox(label="Text (in English)", lines=10),
	    ],
	    outputs=gr.Textbox(label="Text (in Thai)", lines=4),
	    cache_examples=False,
	    # Plain "|" separator: "\|" is an invalid escape sequence and would show
	    # a stray backslash in the title.
	    title="English🇬🇧 to Thai🇹🇭 | Translation or Summarization",
	    description="Provide some text (in English) & select one of the tasks (Translation or Summarization). Note that currently the model only supports text up to 1024 tokens. The base architecture is mt5-large with the embeddings filtered to only English and Thai tokens and fine-tuned to XSum (Eng2Thai) Dataset (https://huggingface.co/datasets/potsawee/xsum_eng2thai). This is only after training for 1 epoch of xsum (the quality is not production-ready), just a quick proof-of-concept about fine-tuning on translated texts.",
	    allow_flagging='never',
	)

	demo.launch()