# German paraphraser demo (milyiyo/paraphraser-german-mt5-small) served via Gradio.
# (Web-page extraction residue — Spaces header, commit hashes, line-number gutter —
#  removed from the top of this file.)
import os
import gradio as gr
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
# Hugging Face model id for the German mT5-small paraphraser; used for both
# the tokenizer and the seq2seq model so the two can never drift apart.
MODEL_NAME = "milyiyo/paraphraser-german-mt5-small"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
def generate_v1(inputs, count):
    """Generate paraphrases with beam search plus a repetition penalty.

    Args:
        inputs: tokenizer output (BatchEncoding) holding ``input_ids``.
        count: number of alternative sequences to return
            (must be <= ``num_beams``).

    Returns:
        list[str]: decoded paraphrases with special tokens stripped.
    """
    model_outputs = model.generate(
        inputs["input_ids"],
        early_stopping=True,
        length_penalty=1.0,
        max_length=1024,
        no_repeat_ngram_size=2,
        num_beams=10,
        repetition_penalty=3.5,  # discourage the model from echoing the input
        num_return_sequences=count,
    )
    # Decode each returned beam into plain text.
    return [tokenizer.decode(output, skip_special_tokens=True)
            for output in model_outputs]
def generate_v2(inputs, count):
    """Generate paraphrases with plain beam search.

    Args:
        inputs: tokenizer output (BatchEncoding) holding ``input_ids``.
        count: number of alternative sequences to return
            (must be <= ``num_beams``).

    Returns:
        list[str]: decoded paraphrases with special tokens stripped.

    Note:
        ``temperature`` has no effect here because ``do_sample`` is not
        enabled; it is kept to preserve the original call exactly.
    """
    model_outputs = model.generate(
        inputs["input_ids"],
        early_stopping=True,
        length_penalty=2.0,  # favor longer outputs
        max_length=1024,
        no_repeat_ngram_size=2,
        num_beams=5,
        temperature=1.5,
        num_return_sequences=count,
    )
    # Decode each returned beam into plain text.
    return [tokenizer.decode(output, skip_special_tokens=True)
            for output in model_outputs]
def generate_v3(inputs, count):
    """Generate paraphrases with diverse beam search (grouped beams).

    Args:
        inputs: tokenizer output (BatchEncoding) holding ``input_ids``.
        count: number of alternative sequences to return
            (must be <= ``num_beams``).

    Returns:
        list[str]: decoded paraphrases with special tokens stripped.
    """
    model_outputs = model.generate(
        inputs["input_ids"],
        num_beams=5,
        max_length=1024,
        temperature=1.5,
        num_beam_groups=5,       # one beam per group -> maximally diverse groups
        diversity_penalty=2.0,   # penalize tokens already chosen by other groups
        no_repeat_ngram_size=2,
        early_stopping=True,
        length_penalty=2.0,
        num_return_sequences=count,
    )
    # Decode each returned beam into plain text.
    return [tokenizer.decode(output, skip_special_tokens=True)
            for output in model_outputs]
def generate_v4(encoding, count):
    """Generate paraphrases with top-k / top-p (nucleus) sampling.

    (The original docstring said "Diverse Beam Search", but the call uses
    ``do_sample=True`` with ``top_k``/``top_p`` — this is sampling.)

    Args:
        encoding: tokenizer output (BatchEncoding) holding ``input_ids``
            and ``attention_mask``.
        count: number of sampled sequences to return.

    Returns:
        list[str]: decoded paraphrases with special tokens stripped and
        tokenization artifacts cleaned up.
    """
    # Debug print() calls that dumped full tensors to stdout per request
    # were removed here.
    input_ids = encoding["input_ids"]
    attention_mask = encoding["attention_mask"]
    outputs = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_length=512,
        do_sample=True,
        top_k=120,
        top_p=0.95,
        early_stopping=True,
        num_return_sequences=count,
    )
    return [
        tokenizer.decode(output, skip_special_tokens=True,
                         clean_up_tokenization_spaces=True)
        for output in outputs
    ]
def paraphrase(sentence: str, count: str):
    """Paraphrase *sentence* with four decoding strategies.

    Args:
        sentence: the German sentence to paraphrase.
        count: desired number of paraphrases per strategy; coerced with
            ``int()`` (Gradio's Number input supplies a float).

    Returns:
        dict: ``{'result': {...}}`` mapping each strategy name
        (``generate_v1`` .. ``generate_v4``) to its list of paraphrases,
        or ``{'result': []}`` for empty input / non-positive count.
    """
    p_count = int(count)
    # Guard: nothing to generate for blank input or a non-positive count.
    if p_count <= 0 or not sentence.strip():
        return {'result': []}
    # mT5 paraphrase prompt format; the trailing </s> marks end-of-sequence.
    text = f"paraphrase: {sentence} </s>"
    encoding = tokenizer(text, return_tensors="pt")
    # (A large blob of commented-out, superseded generation code was removed
    # here — it duplicated generate_v4.)
    return {
        'result': {
            'generate_v1': generate_v1(encoding, p_count),
            'generate_v2': generate_v2(encoding, p_count),
            'generate_v3': generate_v3(encoding, p_count),
            'generate_v4': generate_v4(encoding, p_count),
        }
    }
def paraphrase_dummy(sentence: str, count: str):
    """No-op stand-in for `paraphrase`: always yields an empty result set.

    Both arguments are accepted (to match the real signature) and ignored.
    """
    return {'result': []}
# Gradio UI: a sentence and a desired paraphrase count in, JSON of every
# strategy's outputs out.  (A stray trailing "|" — extraction residue that
# made the file a syntax error — was removed from the last line.)
iface = gr.Interface(
    fn=paraphrase,
    inputs=[
        gr.inputs.Textbox(lines=2, placeholder=None, label='Sentence'),
        gr.inputs.Number(default=3, label='Paraphrases count'),
    ],
    outputs=[gr.outputs.JSON(label=None)],
)
iface.launch()