kaushikbar committed on
Commit
71775e2
1 Parent(s): 36c639f

Multiple language support added.

Browse files
Files changed (2) hide show
  1. app.py +74 -17
  2. requirements.txt +1 -0
app.py CHANGED
@@ -1,21 +1,78 @@
1
- import gradio as gr
2
  import datetime
 
 
3
  from transformers import pipeline
4
- classifier = pipeline("zero-shot-classification", model="NbAiLab/nb-bert-base-mnli")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
 
6
 
7
  def sequence_to_classify(sequence, labels):
8
- hypothesis_template = 'Dette eksempelet er {}.'
9
  label_clean = str(labels).split(",")
10
- response = classifier(sequence, label_clean, hypothesis_template=hypothesis_template, multi_class=True)
 
 
 
 
 
 
11
  predicted_labels = response['labels']
12
  predicted_scores = response['scores']
13
  clean_output = {idx: float(predicted_scores.pop(0)) for idx in predicted_labels}
14
- print("Date:{} , Sequece:{}, Labels: {}".format(
15
  str(datetime.datetime.now()),
16
  sequence,
17
- predicted_labels)
18
- )
19
  return clean_output
20
 
21
  example_text1="Folkehelseinstituttets mest optimistiske anslag er at alle voksne er ferdigvaksinert innen midten av september."
@@ -24,19 +81,19 @@ example_text2="Kutt smør i terninger, og la det temperere seg litt mens deigen
24
  example_labels2="helse,sport,religion, mat"
25
 
26
  iface = gr.Interface(
27
- title = "Zero-shot Classification of Norwegian Text",
28
- description = "Demo of zero-shot classification using NB-Bert base model (Norwegian).",
29
  fn=sequence_to_classify,
30
- inputs=[gr.inputs.Textbox(lines=2,
31
- label="Write a norwegian text you would like to classify...",
32
  placeholder="Text here..."),
33
- gr.inputs.Textbox(lines=10,
34
- label="Possible candidate labels",
35
- placeholder="labels here...")],
36
- outputs=gr.outputs.Label(num_top_classes=3),
37
  capture_session=True,
38
- interpretation="default"
39
- ,examples=[
40
  [example_text1, example_labels1],
41
  [example_text2, example_labels2]
42
  ])
 
 
1
  import datetime
2
+ import gradio as gr
3
+ from langdetect import detect, DetectorFactory, detect_langs
4
  from transformers import pipeline
5
+
6
+ models = {'en': 'Narsil/deberta-large-mnli-zero-cls', # English
7
+ 'de': 'Sahajtomar/German_Zeroshot', # German
8
+ 'es': 'Recognai/zeroshot_selectra_medium', # Spanish
9
+ 'it': 'joeddav/xlm-roberta-large-xnli', # Italian
10
+ 'ru': 'DeepPavlov/xlm-roberta-large-en-ru-mnli', # Russian
11
+ 'no': 'NbAiLab/nb-bert-base-mnli'} # Norsk
12
+
13
+ hypothesis_templates = {'en': 'This example is {}.', # English
14
+ 'de': 'Dieses beispiel ist {}.', # German
15
+ 'es': 'Este ejemplo es {}.', # Spanish
16
+ 'it': 'Questo esempio è {}.', # Italian
17
+ 'ru': 'Этот пример {}.', # Russian
18
+ 'no': 'Dette eksempelet er {}.'} # Norsk
19
+
20
def detect_lang(sequence, labels):
    """Choose the language code used to select a classification model.

    The language of ``sequence`` is detected with langdetect. When detection
    fails, or the detected language has no entry in ``models``, English
    (``'en'``) is returned instead. Every decision is logged with ``print``.

    Args:
        sequence: Text whose language decides which model is used.
        labels: Candidate labels as a single string. Its language is only
            detected in order to log a mismatch with ``sequence``.

    Returns:
        The detected language code when it is a key of ``models``,
        otherwise ``'en'``.
    """
    # A fixed seed is set before every detection so repeated calls on the
    # same text use the same detector seed.
    DetectorFactory.seed = 0
    seq_lang = 'en'
    # Stays None when label detection did not succeed, so the comparison
    # below never touches an unbound name.
    lbl_lang = None

    try:
        seq_lang = detect(sequence)
        lbl_lang = detect(labels)
    except Exception:
        # Best effort: keep whatever was detected so far (English when the
        # sequence itself could not be detected) and carry on.
        print("Language detection failed!",
              "Date:{}, Sequence:{}, Labels:{}".format(
                  str(datetime.datetime.now()),
                  sequence,
                  labels))

    # Only report a mismatch when a label language was actually detected.
    if lbl_lang is not None and seq_lang != lbl_lang:
        print("Different languages detected for sequence and labels!",
              "Date:{}, Sequence:{}, Labels:{}, Sequence Language:{}, Label Language:{}".format(
                  str(datetime.datetime.now()),
                  sequence,
                  labels,
                  seq_lang,
                  lbl_lang))

    if seq_lang in models:
        print("Sequence Language detected:",
              "Date:{}, Sequence:{}, Sequence Language:{}".format(
                  str(datetime.datetime.now()),
                  sequence,
                  seq_lang))
    else:
        print("Language not supported. Defaulting to English!",
              "Date:{}, Sequence:{}, Sequence Language:{}".format(
                  str(datetime.datetime.now()),
                  sequence,
                  seq_lang))
        seq_lang = 'en'

    return seq_lang
57
 
58
 
59
  def sequence_to_classify(sequence, labels):
 
60
  label_clean = str(labels).split(",")
61
+
62
+ lang = detect_lang(sequence, labels)
63
+ classifier = pipeline("zero-shot-classification",
64
+ #hypothesis_template=hypothesis_templates[lang],
65
+ model=models[lang])
66
+ response = classifier(sequence, label_clean, multi_class=True)
67
+
68
  predicted_labels = response['labels']
69
  predicted_scores = response['scores']
70
  clean_output = {idx: float(predicted_scores.pop(0)) for idx in predicted_labels}
71
+ print("Date:{}, Sequence:{}, Labels: {}".format(
72
  str(datetime.datetime.now()),
73
  sequence,
74
+ predicted_labels))
75
+
76
  return clean_output
77
 
78
  example_text1="Folkehelseinstituttets mest optimistiske anslag er at alle voksne er ferdigvaksinert innen midten av september."
 
81
  example_labels2="helse,sport,religion, mat"
82
 
83
  iface = gr.Interface(
84
+ title="Multilingual Multi-label Zero-shot Classification",
85
+ description="Currently supported languages are English, German, Spanish, Italian, Russian, Norsk.",
86
  fn=sequence_to_classify,
87
+ inputs=[gr.inputs.Textbox(lines=20,
88
+ label="Please enter the text you would like to classify...",
89
  placeholder="Text here..."),
90
+ gr.inputs.Textbox(lines=5,
91
+ label="Possible candidate labels (separated by comma)...",
92
+ placeholder="laLels here...")],
93
+ outputs=gr.outputs.Label(num_top_classes=5),
94
  capture_session=True,
95
+ #interpretation="default",
96
+ examples=[
97
  [example_text1, example_labels1],
98
  [example_text2, example_labels2]
99
  ])
requirements.txt CHANGED
@@ -2,3 +2,4 @@ transformers
2
  sentence-transformers
3
  torch
4
  langdetect
 
 
2
  sentence-transformers
3
  torch
4
  langdetect
5
+