File size: 5,081 Bytes
5a34301
 
 
 
edac3fb
 
 
 
 
 
 
eba4bed
edac3fb
 
 
 
 
 
 
 
 
 
 
eba4bed
 
5a34301
 
546da88
9c094ba
546da88
0353678
9c094ba
 
 
 
437058b
 
9c094ba
 
 
 
 
 
 
546da88
9c094ba
546da88
9c094ba
0353678
9c094ba
 
 
 
437058b
9c094ba
 
 
 
 
 
 
546da88
9c094ba
546da88
9c094ba
 
 
 
 
 
 
437058b
 
9c094ba
 
 
 
 
 
990ed62
9c094ba
990ed62
 
 
 
546da88
 
990ed62
546da88
 
 
 
 
5a34301
 
9c094ba
5a34301
9c094ba
 
 
 
 
 
 
 
 
d8be3b1
9c094ba
 
bf1499f
 
 
 
9c094ba
 
ee1eb9b
 
8ca86e2
5a34301
 
 
 
 
87938f5
 
5a34301
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
import os
import gradio as gr
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

# versions = {
#     'v1': {
#         'name': "milyiyo/paraphraser-german-mt5-small",
#         'tokenizer': None,
#         'model': None
#     },
#     'v2': {
#         'name':"milyiyo/paraphraser-german-mt5-small-v2",
#         'tokenizer': None,
#         'model': None
#     },
# }

# versions['v1']['tokenizer'] = AutoTokenizer.from_pretrained(versions['v1']['name'])
# versions['v1']['model'] = AutoModelForSeq2SeqLM.from_pretrained(versions['v1']['name'])

# versions['v2']['tokenizer'] = AutoTokenizer.from_pretrained(versions['v2']['name'])
# versions['v2']['model'] = AutoModelForSeq2SeqLM.from_pretrained(versions['v2']['name'])

# Checkpoint identifier handed to both `from_pretrained` calls below, kept in
# one place so tokenizer and model cannot drift apart.
MODEL_NAME = "milyiyo/paraphraser-german-mt5-small-v2"

# Loaded once at import time; the generate_* helpers read these module globals.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)


def generate_v1(inputs, count):
    """Generate text using a Beam Search strategy with repetition penalty.

    Args:
        inputs: Mapping produced by the tokenizer. Must contain
            ``"input_ids"``; ``"attention_mask"`` is forwarded when present.
        count: Number of sequences to return.

    Returns:
        A list with one decoded string per returned sequence, special
        tokens stripped.
    """
    # `generate` rejects num_return_sequences > num_beams, so widen the beam
    # beyond the default of 10 only when more candidates are requested.
    beam_width = max(10, count)
    model_outputs = model.generate(
        inputs["input_ids"],
        # Forward the mask like generate_v4 does; None lets `generate`
        # fall back to its own default handling.
        attention_mask=inputs.get("attention_mask"),
        early_stopping=True,
        length_penalty=1.0,
        max_length=1024,
        no_repeat_ngram_size=2,
        num_beams=beam_width,
        repetition_penalty=3.5,
        num_return_sequences=count,
    )
    return [
        tokenizer.decode(output, skip_special_tokens=True)
        for output in model_outputs
    ]


def generate_v2(inputs, count):
    """Generate text using a Beam Search strategy.

    Args:
        inputs: Mapping produced by the tokenizer. Must contain
            ``"input_ids"``; ``"attention_mask"`` is forwarded when present.
        count: Number of sequences to return.

    Returns:
        A list with one decoded string per returned sequence, special
        tokens stripped.
    """
    # `generate` rejects num_return_sequences > num_beams, so widen the beam
    # beyond the default of 5 only when more candidates are requested.
    beam_width = max(5, count)
    model_outputs = model.generate(
        inputs["input_ids"],
        # Forward the mask like generate_v4 does; None lets `generate`
        # fall back to its own default handling.
        attention_mask=inputs.get("attention_mask"),
        early_stopping=True,
        length_penalty=2.0,
        max_length=1024,
        no_repeat_ngram_size=2,
        num_beams=beam_width,
        # NOTE(review): no `do_sample` is set here, so confirm whether
        # `temperature` has any effect on this beam-search call.
        temperature=1.5,
        num_return_sequences=count,
    )
    return [
        tokenizer.decode(output, skip_special_tokens=True)
        for output in model_outputs
    ]


def generate_v3(inputs, count):
    """Generate text using a Diverse Beam Search strategy.

    Args:
        inputs: Mapping produced by the tokenizer. Must contain
            ``"input_ids"``; ``"attention_mask"`` is forwarded when present.
        count: Number of sequences to return.

    Returns:
        A list with one decoded string per returned sequence, special
        tokens stripped.
    """
    beam_groups = 5
    # `generate` needs num_beams >= num_return_sequences and num_beams
    # divisible by num_beam_groups. Round the requested count up to the next
    # multiple of the group size, never going below the original 5 beams.
    groups_needed = max(1, -(-count // beam_groups))
    beam_width = beam_groups * groups_needed
    model_outputs = model.generate(
        inputs["input_ids"],
        # Forward the mask like generate_v4 does; None lets `generate`
        # fall back to its own default handling.
        attention_mask=inputs.get("attention_mask"),
        num_beams=beam_width,
        max_length=1024,
        # NOTE(review): no `do_sample` is set here, so confirm whether
        # `temperature` has any effect on this beam-search call.
        temperature=1.5,
        num_beam_groups=beam_groups,
        diversity_penalty=2.0,
        no_repeat_ngram_size=2,
        early_stopping=True,
        length_penalty=2.0,
        num_return_sequences=count,
    )
    return [
        tokenizer.decode(output, skip_special_tokens=True)
        for output in model_outputs
    ]


def generate_v4(encoding, count):
    """Generate text by sampling with top-k / top-p filtering.

    Unlike the beam-search variants, this call sets ``do_sample=True``, so
    the returned sentences can differ from one invocation to the next.

    Args:
        encoding: Mapping produced by the tokenizer; must contain
            ``"input_ids"`` and ``"attention_mask"``.
        count: Number of sequences to sample.

    Returns:
        A list with one decoded string per sampled sequence, special tokens
        stripped and tokenization spaces cleaned up.
    """
    outputs = model.generate(
        input_ids=encoding["input_ids"],
        attention_mask=encoding["attention_mask"],
        max_length=512,
        do_sample=True,
        top_k=120,
        top_p=0.95,
        early_stopping=True,
        num_return_sequences=count,
    )
    return [
        tokenizer.decode(
            output,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=True,
        )
        for output in outputs
    ]


def paraphrase(sentence: str, count: str):
    """Paraphrase ``sentence`` with every generation strategy in this file.

    Args:
        sentence: Text to paraphrase.
        count: Number of paraphrases wanted per strategy. Anything ``int()``
            accepts works (the UI in this file feeds it from a Number
            widget); it is truncated to an integer.

    Returns:
        ``{'result': []}`` when there is nothing to do (missing or blank
        sentence, missing or non-positive count). Otherwise ``{'result':
        {...}}`` mapping each strategy name (``'generate_v1'`` ..
        ``'generate_v4'``) to its list of paraphrases.

    Raises:
        ValueError: If ``count`` is a non-``None`` value that ``int()``
            cannot parse.
    """
    # NOTE(review): a cleared UI field presumably arrives as None; treat it
    # as "nothing requested" rather than crashing in int() / .strip().
    if sentence is None or count is None:
        return {'result': []}

    p_count = int(count)
    if p_count <= 0 or not sentence.strip():
        return {'result': []}

    # Prompt format expected by the checkpoint: task prefix plus explicit
    # end-of-sequence marker. The text itself is passed through unmodified.
    text = f"paraphrase: {sentence} </s>"
    encoding = tokenizer(text, return_tensors="pt")
    return {
        'result': {
            'generate_v1': generate_v1(encoding, p_count),
            'generate_v2': generate_v2(encoding, p_count),
            'generate_v3': generate_v3(encoding, p_count),
            'generate_v4': generate_v4(encoding, p_count),
        }
    }


def paraphrase_dummy(sentence: str, count: str):
    """Stand-in for ``paraphrase`` that never runs the model.

    Both arguments are accepted for signature compatibility and ignored.

    Returns:
        A fresh ``{'result': []}`` dictionary on every call.
    """
    empty_payload = {'result': []}
    return empty_payload


# Input widgets, in the positional order `paraphrase` receives them.
sentence_box = gr.inputs.Textbox(lines=2, placeholder=None, label='Sentence')
count_box = gr.inputs.Number(default=3, label='Paraphrases count')

# The dictionary returned by `paraphrase` is shown as JSON.
json_view = gr.outputs.JSON(label=None)

iface = gr.Interface(
    fn=paraphrase,
    inputs=[sentence_box, count_box],
    outputs=[json_view],
)
# Starts the web app as soon as this module is executed.
iface.launch()