File size: 5,010 Bytes
6004edc
 
 
 
 
 
 
c2fc6a6
 
0170397
 
c2fc6a6
6004edc
 
c2fc6a6
 
 
6004edc
 
 
c2fc6a6
 
 
 
 
 
 
6004edc
 
 
 
 
 
 
 
 
 
c2fc6a6
 
 
 
 
 
 
6004edc
c2fc6a6
 
 
6004edc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c2fc6a6
 
6004edc
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
import datetime
import gradio as gr
from huggingface_hub import hf_hub_download
from langdetect import detect, DetectorFactory, detect_langs
import fasttext
from transformers import pipeline

# Hugging Face checkpoint used for each supported language code.
models = {
    'en': 'Narsil/deberta-large-mnli-zero-cls',  # English
    'ru': 'DeepPavlov/xlm-roberta-large-en-ru-mnli',  # Russian
    # Uzbek. A previously tried checkpoint, kept for reference:
    #   'coppercitylabs/uzbek-news-category-classifier'
    # NOTE(review): the checkpoint name below suggests a passage re-ranker
    # rather than an NLI model -- confirm it behaves sensibly for zero-shot.
    'uz': 'amberoad/bert-multilingual-passage-reranking-msmarco',
}

# Per-language hypothesis template handed to the zero-shot pipeline; the "{}"
# slot is where a candidate label goes.
hypothesis_templates = {
    'en': 'This example is {}.',  # English
    'ru': 'Этот пример {}.',  # Russian
    'uz': 'Бу мисол {}.',  # Uzbek
}

# One zero-shot classification pipeline per language, all built at import
# time and keyed by the same language codes as `models`.
classifiers = {
    lang: pipeline(
        "zero-shot-classification",
        hypothesis_template=hypothesis_templates[lang],
        model=checkpoint,
    )
    for lang, checkpoint in models.items()
}

# fastText language-identification model, fetched from the Hugging Face Hub.
fasttext_model = fasttext.load_model(
    hf_hub_download("julien-c/fasttext-language-id", "lid.176.bin")
)

def prep_examples():
    """Return the demo rows offered in the Gradio interface.

    Returns:
        A list of ``[text, labels]`` pairs, one each for English, Russian and
        Uzbek. ``labels`` is a single comma-separated string, matching what
        the labels textbox expects.
    """
    # Built with implicit string concatenation rather than backslash
    # continuations inside the literal: a continuation inside a string keeps
    # the next line's indentation, which used to leave runs of spaces in the
    # middle of the example text.
    english_text = (
        "Coronavirus disease (COVID-19) is an infectious disease caused by "
        "the SARS-CoV-2 virus. Most people who fall sick with COVID-19 will "
        "experience mild to moderate symptoms and recover without special "
        "treatment. However, some will become seriously ill and require "
        "medical attention."
    )
    english_labels = "business,health related,politics,climate change"

    russian_text = "Том был невероятно рад встрече со своим другом, ученным из Китая, который занимается искусственным интелектом."
    russian_labels = "наука,политика"

    uzbek_text = "Алишер Навоий ўзбек классик шоири, буюк ижодкор ва ватанпарвар инсон бўлган."
    uzbek_labels = "шеърият,спорт, санъат"

    return [
        [english_text, english_labels],
        [russian_text, russian_labels],
        [uzbek_text, uzbek_labels],
    ]

def _fasttext_language(text):
    """Return the top fastText language code for *text* (e.g. ``'en'``).

    fastText's ``predict`` raises ``ValueError`` when the input contains a
    newline, so line breaks are replaced with spaces before predicting.
    """
    single_line = " ".join(text.splitlines())
    top_label = fasttext_model.predict(single_line, k=1)[0][0]
    # Labels come back as "__label__<code>"; keep only the code.
    return top_label.split("__label__")[1]


def detect_lang(sequence, labels):
    """Choose which language's classifier should handle *sequence*.

    The language of both the sequence and the candidate labels is detected
    with the fastText model. Only the sequence language decides the result;
    the label language is used solely to log a mismatch.

    Args:
        sequence: Text to be classified.
        labels: Comma-separated candidate labels, as typed by the user.

    Returns:
        A key of ``models``: the detected sequence language when it is
        supported, otherwise ``'en'`` (also used when detection fails).
    """
    # Left over from the langdetect-based detection; kept so langdetect stays
    # deterministic if it is used elsewhere.
    DetectorFactory.seed = 0

    seq_lang = 'en'
    lbl_lang = None  # stays None when label detection did not succeed

    try:
        seq_lang = _fasttext_language(sequence)
        lbl_lang = _fasttext_language(labels)
    except Exception as err:
        # Detection is best-effort: log the failure and carry on with whatever
        # was detected so far (English by default) instead of failing the
        # whole request.
        print("Language detection failed!",
              "Date:{}, Sequence:{}, Labels:{}".format(
                  str(datetime.datetime.now()),
                  sequence,
                  labels),
              repr(err))

    # Only report a mismatch when both languages were actually detected.
    if lbl_lang is not None and seq_lang != lbl_lang:
        print("Different languages detected for sequence and labels!",
              "Date:{}, Sequence:{}, Labels:{}, Sequence Language:{}, Label Language:{}".format(
                  str(datetime.datetime.now()),
                  sequence,
                  labels,
                  seq_lang,
                  lbl_lang))

    if seq_lang in models:
        print("Sequence Language detected.",
              "Date:{}, Sequence:{}, Sequence Language:{}".format(
                  str(datetime.datetime.now()),
                  sequence,
                  seq_lang))
    else:
        print("Language not supported. Defaulting to English!",
              "Date:{}, Sequence:{}, Sequence Language:{}".format(
                  str(datetime.datetime.now()),
                  sequence,
                  seq_lang))
        seq_lang = 'en'

    return seq_lang

def sequence_to_classify(sequence, labels):
    """Run multi-label zero-shot classification on *sequence*.

    Args:
        sequence: Text to classify.
        labels: Candidate labels as one comma-separated string.

    Returns:
        A dict mapping each candidate label to its score as a ``float``.

    Raises:
        ValueError: If *labels* contains no non-blank label.
    """
    # Trim whitespace around each label and drop blanks, so input such as
    # "sport, art," yields "sport" and "art" rather than " art" and "".
    candidate_labels = [
        label.strip()
        for label in str(labels or "").split(",")
        if label.strip()
    ]
    if not candidate_labels:
        raise ValueError("Please provide at least one candidate label.")

    classifier = classifiers[detect_lang(sequence, labels)]
    response = classifier(sequence, candidate_labels, multi_label=True)

    predicted_labels = response['labels']
    # Pair labels with scores positionally without mutating the response.
    clean_output = {
        label: float(score)
        for label, score in zip(predicted_labels, response['scores'])
    }
    print("Date:{}, Sequence:{}, Labels: {}".format(
        str(datetime.datetime.now()),
        sequence,
        predicted_labels))

    return clean_output

# Multi-line box for the text to be classified.
_sequence_box = gr.inputs.Textbox(
    lines=10,
    label="Please enter the text you would like to classify...",
    placeholder="Text here...",
)

# Short box for the comma-separated candidate labels.
_labels_box = gr.inputs.Textbox(
    lines=2,
    label="Please enter the candidate labels (separated by comma)...",
    placeholder="Labels here separated by comma...",
)

# Label output limited to the five highest-scoring classes.
_scores_output = gr.outputs.Label(num_top_classes=5)

# Wire the classifier function to the two inputs and the label output.
# The interpretation="default" option is not enabled.
iface = gr.Interface(
    fn=sequence_to_classify,
    inputs=[_sequence_box, _labels_box],
    outputs=_scores_output,
    title="En-Ru-Uz Multi-label Zero-shot Classification",
    description="Supported languages are: English, Russian and Uzbek",
    examples=prep_examples(),
)

iface.launch()