import gradio as gr
import spaces
from transformers import pipeline
import torch

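# Gradio demo for TURNA and its task-specific fine-tuned checkpoints.
# Each tab wires a text box and a few generation controls to a transformers
# pipeline loaded from the boun-tabi-LMG organization on the Hugging Face Hub.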
DESCRIPTION="""

### A Turkish encoder-decoder language model

Welcome to our Hugging Face Space, where you can explore the capabilities of TURNA.
			
**Key Features of TURNA:**

- **Powerful Architecture:** TURNA has 1.1B parameters and was pre-trained as an encoder-decoder model following the UL2 framework.
- **Diverse Training Data:** The pre-training corpus contains 43 billion tokens drawn from a wide range of domains.
- **Broad Applications:** TURNA is fine-tuned for a variety of generation and understanding tasks, including:
    - Summarization
    - Paraphrasing
    - News title generation
    - Sentiment classification
    - Text categorization
    - Named entity recognition
    - Part-of-speech tagging
    - Semantic textual similarity
    - Natural language inference

**Note:** The first inference for each task may take a while, since the models are downloaded on demand.

*TURNA can generate toxic content or produce erroneous information. Double-check its outputs before use.*

"""

CITATION = """
Refer to our [paper](https://arxiv.org/abs/2401.14373) for more details.

### Citation
```bibtex
@misc{uludogan2024turna,
	title={TURNA: A Turkish Encoder-Decoder Language Model for Enhanced Understanding and Generation}, 
	author={Gökçe Uludoğan and Zeynep Yirmibeşoğlu Balal and Furkan Akkurt and Melikşah Türker and Onur Güngör and Susan Üsküdarlı},
	year={2024},
	eprint={2401.14373},
	archivePrefix={arXiv},
	primaryClass={cs.CL}
}
```
"""


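# Example inputs pre-filled in the tabs below.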
sentiment_example = [["Bu üründen çok memnun kaldım."]]
long_text = [["Eyfel Kulesi (Fransızca: La tour Eiffel [la tuʀ ɛˈfɛl]), Paris'teki demir kule. Kule, aynı zamanda tüm dünyada Fransa'nın sembolü halini almıştır. İsmini, inşa ettiren Fransız inşaat mühendisi Gustave Eiffel'den alır.[1] En büyük turizm cazibelerinden biri olan Eyfel Kulesi, yılda 6 milyon turist çeker. 2002 yılında toplam ziyaretçi sayısı 200 milyona ulaşmıştır."], ["Kalp krizi geçirenlerin yaklaşık üçte birinin kısa bir süre önce grip atlattığı düşünülüyor. Peki grip virüsü ne yapıyor da kalp krizine yol açıyor? Karpuz şöyle açıkladı: Grip virüsü kanın yapışkanlığını veya pıhtılaşmasını artırıyor."]]
ner_example = [["Benim adım Turna."]]
t2t_example = [["Paraphrase: Bu üründen çok memnun kaldım."]]
nli_example = [["Bunu çok beğendim.", "Bunu çok sevdim."]]
text_category_example = [[" anadolu_efes e 18 lik star ! beko_basketbol_ligi nde iddialı bir kadroyla sezona giren anadolu_efes transfer harekatına devam ediyor"]]



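# Each task callback below is decorated with @spaces.GPU and builds its transformers
# pipeline per request, so a checkpoint is downloaded from the Hub on its first use
# and loaded onto the GPU (device=0) only when its task is invoked.


# NLI / STS: wraps the two sentences in the prompt format the fine-tuned checkpoints
# expect ("hipotez: ... önerme: ..." or "ilk cümle: ... ikinci cümle: ...").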
@spaces.GPU
def nli(first_input, second_input, model_choice="turna_nli_nli_tr"):
    
    if model_choice=="turna_nli_nli_tr":
        input = f"hipotez: {first_input} önerme: {second_input}"
        nli_model = pipeline(model="boun-tabi-LMG/turna_nli_nli_tr", device=0) 
        return nli_model(input)[0]["generated_text"]
    else:
        input = f"ilk cümle: {first_input} ikinci cümle: {second_input}"
        stsb_model = pipeline(model="boun-tabi-LMG/turna_semantic_similarity_stsb_tr", device=0)

        return stsb_model(input)[0]["generated_text"]


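# Sentiment analysis: the radio choice selects the tweet- or product-review-tuned checkpoint.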
@spaces.GPU
def sentiment_analysis(input, model_choice="turna_classification_17bintweet_sentiment"):
    sentiment_model = pipeline(model=f"boun-tabi-LMG/{model_choice}", device=0) 
    return sentiment_model(input, max_new_tokens = 4)[0]["generated_text"]

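# Part-of-speech tagging with either the turna_pos_imst or turna_pos_boun checkpoint.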
@spaces.GPU
def pos(input, model_choice, max_new_tokens, length_penalty, no_repeat_ngram_size): 
    if model_choice=="turna_pos_imst":
        pos_imst = pipeline(model="boun-tabi-LMG/turna_pos_imst", device=0) 
        return pos_imst(input, max_new_tokens = max_new_tokens, length_penalty=length_penalty, no_repeat_ngram_size=no_repeat_ngram_size)[0]["generated_text"]
    else:
        pos_boun = pipeline(model="boun-tabi-LMG/turna_pos_boun", device=0)
        return pos_boun(input, max_new_tokens = max_new_tokens, length_penalty=length_penalty, no_repeat_ngram_size=no_repeat_ngram_size)[0]["generated_text"]

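# Named entity recognition with either the turna_ner_wikiann or turna_ner_milliyet checkpoint.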
@spaces.GPU
def ner(input, model_choice, max_new_tokens, length_penalty, no_repeat_ngram_size):
    if model_choice=="turna_ner_wikiann":
        ner_wikiann = pipeline(model="boun-tabi-LMG/turna_ner_wikiann", device=0) 
        return ner_wikiann(input, max_new_tokens = max_new_tokens, length_penalty=length_penalty, no_repeat_ngram_size=no_repeat_ngram_size)[0]["generated_text"]
    else:
        ner_model = pipeline(model="boun-tabi-LMG/turna_ner_milliyet", device=0) 
        return ner_model(input, max_new_tokens = max_new_tokens, length_penalty=length_penalty, no_repeat_ngram_size=no_repeat_ngram_size)[0]["generated_text"]


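# Paraphrasing with either the Tatoeba- or OpenSubtitles-tuned checkpoint.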
@spaces.GPU
def paraphrase(input, model_choice, max_new_tokens):
    if model_choice=="turna_paraphrasing_tatoeba":
        paraphrasing = pipeline(model="boun-tabi-LMG/turna_paraphrasing_tatoeba", device=0) 
        return paraphrasing(input, max_new_tokens = max_new_tokens)[0]["generated_text"]
    else:
        paraphrasing_sub = pipeline(model="boun-tabi-LMG/turna_paraphrasing_opensubtitles", device=0) 

        return paraphrasing_sub(input, max_new_tokens = max_new_tokens)[0]["generated_text"]
        
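# Summarization: model_mapping resolves the radio choice to the full Hub model id.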
@spaces.GPU    
def summarize(input, model_choice, max_new_tokens, length_penalty, no_repeat_ngram_size):
    model_mapping = {"turna_summarization_tr_news": "boun-tabi-LMG/turna_summarization_tr_news",
                     "turna_summarization_mlsum": "boun-tabi-LMG/turna_summarization_mlsum"}
    summarization_model = pipeline(model=model_mapping[model_choice], device=0) 
    return summarization_model(input, max_new_tokens = max_new_tokens, length_penalty=length_penalty, no_repeat_ngram_size=no_repeat_ngram_size)[0]["generated_text"]

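# News title generation, structured the same way as summarization (choice -> Hub model id).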
@spaces.GPU    
def generate_title(input, model_choice, max_new_tokens, length_penalty, no_repeat_ngram_size):
    model_mapping = {"turna_title_generation_tr_news": "boun-tabi-LMG/turna_title_generation_tr_news",
                     "turna_title_generation_mlsum": "boun-tabi-LMG/turna_title_generation_mlsum"}
    summarization_model = pipeline(model=model_mapping[model_choice], device=0) 
    return summarization_model(input, max_new_tokens = max_new_tokens, length_penalty=length_penalty, no_repeat_ngram_size=no_repeat_ngram_size)[0]["generated_text"]


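# Text categorization with the turna_classification_ttc4900 checkpoint; the label fits
# in a few tokens, hence max_new_tokens=8.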
@spaces.GPU
def categorize(input):
    ttc = pipeline(model="boun-tabi-LMG/turna_classification_ttc4900", device=0) 

    return ttc(input, max_new_tokens = 8)[0]["generated_text"]

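# Free-form generation with the selected pre-trained TURNA checkpoint. The input is
# wrapped as "[S2S] ...<EOS>", the UL2-style sequence-to-sequence mode prefix/suffix
# used during pre-training.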
@spaces.GPU
def turna(input, max_new_tokens, length_penalty,
                                    top_k, top_p, temp, num_beams,
                                    do_sample, no_repeat_ngram_size, repetition_penalty, turna_model_version):
                                        
    turna = pipeline(model=f"boun-tabi-LMG/{turna_model_version}", device=0) 
    input = f"[S2S] {input}<EOS>"

    return turna(input, max_new_tokens = max_new_tokens, length_penalty=length_penalty,
                                    top_k=top_k, top_p=top_p, temperature=temp, num_beams=num_beams,
                                    do_sample = do_sample, no_repeat_ngram_size=no_repeat_ngram_size, repetition_penalty=repetition_penalty)[0]["generated_text"]


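# Gradio UI: one tab per task. Slider variable names (max_new_tokens, etc.) are reused
# across tabs; each tab registers its click handler before the names are rebound, so
# every tab keeps its own slider components.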
with gr.Blocks(theme="abidlabs/Lime") as demo:

    gr.Markdown("# TURNA")
    gr.Image("images/turna-logo.png", width=100, show_label=False, show_download_button=False, show_share_button=False)

    with gr.Tab("TURNA"):
        gr.Markdown(DESCRIPTION)
    
    with gr.Tab("Sentiment Analysis"):
        gr.Markdown("TURNA fine-tuned on sentiment analysis. Enter text to analyse sentiment and pick the model (tweets or product reviews).")
        with gr.Column():
            with gr.Row():
                with gr.Column():
                    sentiment_choice = gr.Radio(choices = ["turna_classification_17bintweet_sentiment", "turna_classification_tr_product_reviews"], label ="Model", value="turna_classification_17bintweet_sentiment")
                    sentiment_input = gr.Textbox(label="Sentiment Analysis Input")
                
                    sentiment_submit = gr.Button()
                sentiment_output = gr.Textbox(label="Sentiment Analysis Output")
                sentiment_submit.click(sentiment_analysis, inputs=[sentiment_input, sentiment_choice], outputs=sentiment_output)
            sentiment_examples = gr.Examples(examples = sentiment_example, inputs = [sentiment_input, sentiment_choice], outputs=sentiment_output, fn=sentiment_analysis)
        
    with gr.Tab("Text Categorization"):
        gr.Markdown("TURNA fine-tuned on text categorization. Enter text to categorize text or try the example.")
        with gr.Column():
            with gr.Row():
                with gr.Column():
                    text_input = gr.Textbox(label="Text Categorization Input")
                
                    text_submit = gr.Button()
                text_output = gr.Textbox(label="Text Categorization Output")
                text_submit.click(categorize, inputs=[text_input], outputs=text_output)
            text_examples = gr.Examples(examples = text_category_example,inputs=[text_input], outputs=text_output, fn=categorize)
        
    
    with gr.Tab("NLI & STS"):
        gr.Markdown("TURNA fine-tuned on natural language inference or semantic textual similarity. Enter text to infer entailment or measure semantic similarity. ")
        with gr.Column():
            with gr.Row():
                with gr.Column():
                    nli_choice = gr.Radio(choices = ["turna_nli_nli_tr", "turna_semantic_similarity_stsb_tr"], label ="Model", value="turna_nli_nli_tr")
                    nli_first_input = gr.Textbox(label="First Sentence")
                    nli_second_input = gr.Textbox(label="Second Sentence")

                    nli_submit = gr.Button()
                nli_output = gr.Textbox(label="NLI Output")
                nli_submit.click(nli, inputs=[nli_first_input, nli_second_input, nli_choice], outputs=nli_output)
            nli_examples = gr.Examples(examples = nli_example, inputs = [nli_first_input, nli_second_input, nli_choice], outputs=nli_output, fn=nli)
    
    with gr.Tab("POS"):
        gr.Markdown("TURNA fine-tuned on part-of-speech-tagging. Enter text to parse parts of speech and pick the model.")
        with gr.Column():
            with gr.Row():
                with gr.Column():
                    pos_choice = gr.Radio(choices = ["turna_pos_imst", "turna_pos_boun"], label ="Model", value="turna_pos_imst")
                    with gr.Accordion("Advanced Generation Parameters"):
                        max_new_tokens = gr.Slider(label = "Maximum length",
                        		minimum = 0,
                        		maximum = 64,
                        		value = 64)
                        length_penalty = gr.Slider(label = "Length penalty",
                                    minimum = -10,
                        		    maximum = 10,
                            		value=2.0)
                        no_repeat_ngram_size = gr.Slider(label="No Repeat N-Gram Size", minimum=0, value=3)
                with gr.Column():    
                    pos_input = gr.Textbox(label="POS Input")
                    pos_submit = gr.Button()
                pos_output = gr.Textbox(label="POS Output")
                pos_submit.click(pos, inputs=[pos_input, pos_choice, max_new_tokens, length_penalty, no_repeat_ngram_size], outputs=pos_output)
            pos_examples = gr.Examples(examples = ner_example, inputs = [pos_input, pos_choice, max_new_tokens, length_penalty, no_repeat_ngram_size], outputs=pos_output, fn=pos)
    
    with gr.Tab("NER"):
        gr.Markdown("TURNA fine-tuned on named entity recognition. Enter text to parse named entities and pick the model.")
        with gr.Column():
            with gr.Row():
                with gr.Column():
                    ner_choice = gr.Radio(choices = ["turna_ner_wikiann", "turna_ner_milliyet"], label ="Model", value="turna_ner_wikiann")
                    with gr.Accordion("Advanced Generation Parameters"):
                        max_new_tokens = gr.Slider(label = "Maximum length",
                        		minimum = 0,
                        		maximum = 64,
                        		value = 64)
                        length_penalty = gr.Slider(label = "Length penalty",
                                    minimum = -10,
                        		    maximum = 10,
                            		value=2.0)
                        no_repeat_ngram_size = gr.Slider(label="No Repeat N-Gram Size", minimum=0, value=3)
                with gr.Column():    
                    ner_input = gr.Textbox(label="NER Input")
                    ner_submit = gr.Button()
                ner_output = gr.Textbox(label="NER Output")
                
                ner_submit.click(ner, inputs=[ner_input, ner_choice, max_new_tokens, length_penalty, no_repeat_ngram_size], outputs=ner_output)
            ner_examples = gr.Examples(examples = ner_example, inputs = [ner_input, ner_choice, max_new_tokens, length_penalty, no_repeat_ngram_size], outputs=ner_output, fn=ner)
    with gr.Tab("Paraphrase"):
        gr.Markdown("TURNA fine-tuned on paraphrasing. Enter text to paraphrase and pick the model.")
        with gr.Column():
            with gr.Row():
                with gr.Column():
                    paraphrasing_choice = gr.Radio(choices = ["turna_paraphrasing_tatoeba", "turna_paraphrasing_opensubtitles"], label ="Model", value="turna_paraphrasing_tatoeba")
                    with gr.Accordion("Advanced Generation Parameters"):
                        max_new_tokens = gr.Slider(label = "Maximum length",
                        		minimum = 0,
                        		maximum = 20,
                        		value = 20)
                with gr.Column():
                    paraphrasing_input = gr.Textbox(label = "Paraphrasing Input")
                    paraphrasing_submit = gr.Button()
                paraphrasing_output = gr.Text(label="Paraphrasing Output")
                
            paraphrasing_submit.click(paraphrase, inputs=[paraphrasing_input, paraphrasing_choice, max_new_tokens], outputs=paraphrasing_output)
            paraphrase_examples = gr.Examples(examples = long_text, inputs = [paraphrasing_input, paraphrasing_choice, max_new_tokens], outputs=paraphrasing_output,  fn=paraphrase)
    with gr.Tab("Summarization"):
        gr.Markdown("TURNA fine-tuned on summarization. Enter text to summarize and pick the model.")
        with gr.Column():
            with gr.Row():
                with gr.Column():
                    sum_choice = gr.Radio(choices = ["turna_summarization_mlsum", "turna_summarization_tr_news"], label ="Model", value="turna_summarization_mlsum")
                    with gr.Accordion("Advanced Generation Parameters"):
                        max_new_tokens = gr.Slider(label = "Maximum length",
                        		minimum = 0,
                        		maximum = 512,
                        		value = 128)
                        length_penalty = gr.Slider(label = "Length penalty",
                                    minimum = -10,
                        		    maximum = 10,
                            		value=2.0)
                        no_repeat_ngram_size = gr.Slider(label="No Repeat N-Gram Size", minimum=0, value=3)
                with gr.Column():
                    sum_input = gr.Textbox(label = "Summarization Input")
                    sum_submit = gr.Button()
                sum_output = gr.Textbox(label = "Summarization Output")
                
                sum_submit.click(summarize, inputs=[sum_input, sum_choice, max_new_tokens, length_penalty, no_repeat_ngram_size], outputs=sum_output)
            sum_examples = gr.Examples(examples = long_text, inputs = [sum_input, sum_choice, max_new_tokens, length_penalty, no_repeat_ngram_size], outputs=sum_output,  fn=summarize)
    
    with gr.Tab("Title Generation"):
        gr.Markdown("TURNA fine-tuned on news title generation. Enter news text to generate a title.")
        with gr.Column():
            with gr.Row():
                with gr.Column():
                    title_choice = gr.Radio(choices = ["turna_title_generation_tr_news", "turna_title_generation_mlsum"], label ="Model", value="turna_title_generation_tr_news")
                    with gr.Accordion("Advanced Generation Parameters"):
                        max_new_tokens = gr.Slider(label = "Maximum length",
                        		minimum = 0,
                        		maximum = 64,
                        		value = 64)
                        length_penalty = gr.Slider(label = "Length penalty",
                                    minimum = -10,
                        		    maximum = 10,
                            		value=2.0)
                        no_repeat_ngram_size = gr.Slider(label="No Repeat N-Gram Size", minimum=0, value=3)
                with gr.Column():
                    title_input = gr.Textbox(label = "News Title Generation Input")
                    title_submit = gr.Button()
                title_output = gr.Textbox(label = "News Title Generation Output")
                
                title_submit.click(generate_title, inputs=[title_input, title_choice, max_new_tokens, length_penalty, no_repeat_ngram_size], outputs=title_output)
            title_examples = gr.Examples(examples = long_text, inputs = [title_input, title_choice, max_new_tokens, length_penalty, no_repeat_ngram_size], outputs=title_output,  fn=generate_title)

    with gr.Tab("Text Generation"):
        gr.Markdown("Pre-trained TURNA. Enter text to start generating.")
        with gr.Column():
            with gr.Row():
                with gr.Column():
                    with gr.Accordion("Advanced Generation Parameters"):
                        max_new_tokens = gr.Slider(label = "Maximum length",
                        		minimum = 0,
                        		maximum = 512,
                        		value = 128)
                        length_penalty = gr.Slider(label = "Length penalty",
                            		value=1.0)
                        top_k = gr.Slider(label = "Top-k", value=10)
                        top_p = gr.Slider(label = "Top-p", value=0.95)
                        temp = gr.Slider(label = "Temperature", value=1.0, minimum=0.1, maximum=100.0)
                        no_repeat_ngram_size = gr.Slider(label="No Repeat N-Gram Size", minimum=0, value=3)
                        repetition_penalty = gr.Slider(label = "Repetition Penalty", minimum=0.0, value=3.1, step=0.1)
                        num_beams = gr.Slider(label = "Number of beams", minimum=1,
                            		maximum=10, value=3)
                        do_sample = gr.Radio(choices = [True, False], value = True, label = "Sampling")
                        turna_model_version = gr.Radio(choices = ["TURNA", "TURNA-2850K", "TURNA-4350K"], value = "TURNA", label = "Choose TURNA model version")
                with gr.Column():
                    text_gen_input = gr.Textbox(label="Text Generation Input")
                
                    text_gen_submit = gr.Button()
                text_gen_output = gr.Textbox(label="Text Generation Output")
            text_gen_submit.click(turna, inputs=[text_gen_input, max_new_tokens, length_penalty,
                                    top_k, top_p, temp, num_beams,
                                    do_sample, no_repeat_ngram_size, repetition_penalty, turna_model_version], outputs=text_gen_output)
            text_gen_example = [["Bir varmış, bir yokmuş, evvel zaman içinde, kalbur saman içinde, uzak diyarların birinde bir turna"]]
            text_gen_examples = gr.Examples(examples = text_gen_example, inputs = [text_gen_input, max_new_tokens, length_penalty,
                                    top_k, top_p, temp, num_beams, do_sample, no_repeat_ngram_size, repetition_penalty, turna_model_version], outputs=text_gen_output, fn=turna)

    gr.Markdown(CITATION)

demo.launch()