paraphrase_de / app.py
milyiyo's picture
Update app.py
25ced57
raw
history blame
5.74 kB
import os
import gradio as gr
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
# Load the fine-tuned German mT5-small paraphraser once at import time so
# every request served by the Gradio handlers reuses the same weights.
tokenizer = AutoTokenizer.from_pretrained("milyiyo/paraphraser-german-mt5-small")
model = AutoModelForSeq2SeqLM.from_pretrained("milyiyo/paraphraser-german-mt5-small")
def generate_v1(inputs, count):
    """Generate ``count`` paraphrases with Beam Search plus a repetition penalty.

    Args:
        inputs: Tokenizer output (``BatchEncoding``) holding ``input_ids`` and
            ``attention_mask`` tensors for the prompt.
        count: Number of sequences to return (must not exceed ``num_beams``).

    Returns:
        list[str]: Decoded paraphrase candidates, special tokens stripped.
    """
    # Forward the attention mask explicitly, consistent with generate_v4;
    # for a single unpadded sequence the mask is all ones, so results for
    # the existing caller are unchanged.
    model_outputs = model.generate(inputs["input_ids"],
                                   attention_mask=inputs["attention_mask"],
                                   early_stopping=True,
                                   length_penalty=1.0,
                                   max_length=1024,
                                   no_repeat_ngram_size=2,
                                   num_beams=10,
                                   repetition_penalty=3.5,
                                   num_return_sequences=count)
    # Decode every returned beam into plain text.
    return [tokenizer.decode(output, skip_special_tokens=True)
            for output in model_outputs]
def generate_v2(inputs, count):
    """Generate ``count`` paraphrases with plain Beam Search.

    Args:
        inputs: Tokenizer output (``BatchEncoding``) holding ``input_ids`` and
            ``attention_mask`` tensors for the prompt.
        count: Number of sequences to return (must not exceed ``num_beams``).

    Returns:
        list[str]: Decoded paraphrase candidates, special tokens stripped.
    """
    # NOTE: the original passed temperature=1.5, but temperature only affects
    # sampling (do_sample=True); deterministic beam search ignores it, so the
    # misleading parameter is dropped. The attention mask is now forwarded
    # explicitly, consistent with generate_v4.
    model_outputs = model.generate(inputs["input_ids"],
                                   attention_mask=inputs["attention_mask"],
                                   early_stopping=True,
                                   length_penalty=2.0,
                                   max_length=1024,
                                   no_repeat_ngram_size=2,
                                   num_beams=5,
                                   num_return_sequences=count)
    return [tokenizer.decode(output, skip_special_tokens=True)
            for output in model_outputs]
def generate_v3(inputs, count):
    """Generate ``count`` paraphrases with Diverse Beam Search.

    Five beam groups with a diversity penalty push each returned sequence to
    differ from the others.

    Args:
        inputs: Tokenizer output (``BatchEncoding``) holding ``input_ids`` and
            ``attention_mask`` tensors for the prompt.
        count: Number of sequences to return (must not exceed ``num_beams``).

    Returns:
        list[str]: Decoded paraphrase candidates, special tokens stripped.
    """
    # NOTE: the original passed temperature=1.5, but diverse beam search
    # requires do_sample=False, under which temperature is ignored — dropped.
    # The attention mask is now forwarded explicitly, as in generate_v4.
    model_outputs = model.generate(inputs["input_ids"],
                                   attention_mask=inputs["attention_mask"],
                                   num_beams=5,
                                   max_length=1024,
                                   num_beam_groups=5,
                                   diversity_penalty=2.0,
                                   no_repeat_ngram_size=2,
                                   early_stopping=True,
                                   length_penalty=2.0,
                                   num_return_sequences=count)
    return [tokenizer.decode(output, skip_special_tokens=True)
            for output in model_outputs]
def generate_v4(encoding, count):
    """Generate ``count`` paraphrases via top-k / top-p (nucleus) sampling.

    NOTE: the previous docstring claimed "Diverse Beam Search", but this
    function actually samples (``do_sample=True``; ``num_beams`` left at 1).

    Args:
        encoding: Tokenizer output (``BatchEncoding``) holding ``input_ids``
            and ``attention_mask`` tensors for the prompt.
        count: Number of sampled sequences to return.

    Returns:
        list[str]: Decoded paraphrases with special tokens removed and
        tokenization spacing artifacts cleaned up.
    """
    input_ids = encoding["input_ids"]
    attention_mask = encoding["attention_mask"]
    # Leftover debug print() calls removed. early_stopping was dropped as
    # well: it only applies to beam search and had no effect here.
    outputs = model.generate(input_ids=input_ids,
                             attention_mask=attention_mask,
                             max_length=512,
                             do_sample=True,
                             top_k=120,
                             top_p=0.95,
                             num_return_sequences=count)
    return [tokenizer.decode(output,
                             skip_special_tokens=True,
                             clean_up_tokenization_spaces=True)
            for output in outputs]
def paraphrase(sentence: str, count: str):
    """Gradio handler: return ``count`` paraphrases of a German sentence.

    Args:
        sentence: The sentence to paraphrase.
        count: Requested number of paraphrases (numeric; coerced via int()).

    Returns:
        dict: ``{'result': {'generate_v4': [<paraphrases>]}}`` on success, or
        ``{'result': []}`` for blank input or a non-positive count.
    """
    p_count = int(count)
    # Guard clause: nothing to do for blank input or a non-positive count.
    if p_count <= 0 or not sentence.strip():
        return {'result': []}
    # Prompt format used when the mT5 model was fine-tuned; the explicit
    # "</s>" end-of-sequence marker is kept for parity with training data.
    text = f"paraphrase: {sentence} </s>"
    encoding = tokenizer(text, return_tensors="pt")
    # Delegate to the sampling-based generator instead of duplicating its
    # generate/decode logic inline (the previous body was a verbatim copy of
    # generate_v4, plus ~20 lines of commented-out dead code, now removed).
    return {
        'result': {
            'generate_v4': generate_v4(encoding, p_count)
        }
    }
def paraphrase_dummy(sentence: str, count: str):
    """No-op stand-in for `paraphrase`; always yields an empty result set."""
    empty_payload = {'result': []}
    return empty_payload
# Wire the handler into a simple Gradio UI: a text box for the sentence, a
# number field for how many paraphrases to produce, and a JSON panel showing
# the raw result dict. (Uses the legacy gr.inputs/gr.outputs component API.)
iface = gr.Interface(fn=paraphrase,
                     inputs=[
                         gr.inputs.Textbox(lines=2, placeholder=None, label='Sentence'),
                         gr.inputs.Number(default=3, label='Paraphrases count'),
                     ],
                     outputs=[gr.outputs.JSON(label=None)])
# Blocks and serves the app (on a Space this binds to the default host/port).
iface.launch()