Joe Davison committed
Commit e690399
1 Parent(s): a922691

update w/ pipelines and xnli model

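The gist of this change: the app drops its hand-rolled Bart MNLI scoring loop in favor of the `zero-shot-classification` pipeline, and adds an XNLI-trained XLM-R checkpoint for non-English input. A minimal, self-contained sketch of the pipeline call the new code wraps is below; the checkpoint choice, example sequence, and labels are illustrative only, and the `multi_class` argument follows the transformers API of this era (later renamed `multi_label`).

```python
# Not part of the commit: a minimal sketch of the pipeline API that app.py now wraps.
from transformers import pipeline

# One of the checkpoints registered in model_ids below (illustrative choice).
classifier = pipeline('zero-shot-classification', model='facebook/bart-large-mnli')

sequence = 'Who are you voting for in 2020?'          # example text
labels = ['politics', 'business', 'space & cosmos']   # candidate topics

# The hypothesis template mirrors the one hard-coded in the app; each label is
# slotted into it and scored as an NLI hypothesis against the sequence.
# multi_class=True scores labels independently instead of softmaxing across them.
result = classifier(sequence, labels,
                    hypothesis_template='This text is about {}.',
                    multi_class=True)

print(result['labels'])   # labels sorted by score
print(result['scores'])   # matching probabilities
```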
app.py CHANGED
@@ -1,5 +1,5 @@
 import streamlit as st
- from transformers import BartForSequenceClassification, BartTokenizer
+ from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
 import torch
 import numpy as np
 import contextlib
@@ -7,6 +7,8 @@ import plotly.express as px
 import pandas as pd
 from PIL import Image
 import datetime
+ import os
+ import psutil

 with open("hit_log.txt", mode='a') as file:
     file.write(str(datetime.datetime.now()) + '\n')
@@ -14,92 +16,57 @@ with open("hit_log.txt", mode='a') as file:
 MODEL_DESC = {
     'Bart MNLI': """Bart with a classification head trained on MNLI.\n\nSequences are posed as NLI premises and topic labels are turned into premises, i.e. `business` -> `This text is about business.`""",
     'Bart MNLI + Yahoo Answers': """Bart with a classification head trained on MNLI and then further fine-tuned on Yahoo Answers topic classification.\n\nSequences are posed as NLI premises and topic labels are turned into premises, i.e. `business` -> `This text is about business.`""",
+     'XLM Roberta XNLI (cross-lingual)': """XLM Roberta, a cross-lingual model, with a classification head trained on XNLI. Supported languages include: _English, French, Spanish, German, Greek, Bulgarian, Russian, Turkish, Arabic, Vietnamese, Thai, Chinese, Hindi, Swahili, and Urdu_.
+
+ Note that this model seems to be less reliable than the English-only models when classifying longer sequences.
+
+ Examples were automatically translated and may contain grammatical mistakes.
+
+ Sequences are posed as NLI premises and topic labels are turned into premises, i.e. `business` -> `This text is about business.`""",
 }

 ZSL_DESC = """Recently, the NLP science community has begun to pay increasing attention to zero-shot and few-shot applications, such as in the [paper from OpenAI](https://arxiv.org/abs/2005.14165) introducing GPT-3. This demo shows how 🤗 Transformers can be used for zero-shot topic classification, the task of predicting a topic that the model has not been trained on."""

 CODE_DESC = """```python
- # pose sequence as a NLI premise and label as a hypothesis
- from transformers import BartForSequenceClassification, BartTokenizer
- nli_model = BartForSequenceClassification.from_pretrained('bart-large-mnli')
- tokenizer = BartTokenizer.from_pretrained('bart-large-mnli')
-
- premise = sequence
- hypothesis = f'This text is about {label}.'
-
- # run through model pre-trained on MNLI
- x = tokenizer.encode(premise, hypothesis, return_tensors='pt',
-                      max_length=tokenizer.max_len,
-                      truncation_strategy='only_first')
- logits = nli_model(x.to(device))[0]
-
- # we throw away "neutral" (dim 1) and take the probability of
- # "entailment" (2) as the probability of the label being true
- entail_contradiction_logits = logits[:,[0,2]]
- probs = entail_contradiction_logits.softmax(1)
- prob_label_is_true = probs[:,1]
+ from transformers import pipeline
+ classifier = pipeline('zero-shot-classification',
+                       model='{}')
+ hypothesis_template = 'This text is about {{}}.' # the template used in this demo
+
+ classifier(sequence, labels,
+            hypothesis_template=hypothesis_template,
+            multi_class=multi_class)
+ # {{'sequence' ..., 'labels': ..., 'scores': ...}}
 ```"""

 model_ids = {
-     'Bart MNLI': 'bart-large-mnli',
-     'Bart MNLI + Yahoo Answers': './bart_mnli_topics'
+     'Bart MNLI': 'facebook/bart-large-mnli',
+     'Bart MNLI + Yahoo Answers': 'joeddav/bart-large-mnli-yahoo-answers',
+     'XLM Roberta XNLI (cross-lingual)': 'joeddav/xlm-roberta-large-xnli'
 }

- device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
+ device = 0 if torch.cuda.is_available() else -1

 @st.cache(allow_output_mutation=True)
 def load_models():
-     return {id: BartForSequenceClassification.from_pretrained(id).to(device) for id in model_ids.values()}
+     return {id: AutoModelForSequenceClassification.from_pretrained(id) for id in model_ids.values()}

 models = load_models()


- @st.cache(allow_output_mutation=True)
+ @st.cache(allow_output_mutation=True, show_spinner=False)
 def load_tokenizer(tok_id):
-     return BartTokenizer.from_pretrained(tok_id)
+     return AutoTokenizer.from_pretrained(tok_id)

 @st.cache(allow_output_mutation=True, show_spinner=False)
- def classify_candidate(nli_model_id, sequence, label, do_print_code):
-     nli_model = models[nli_model_id]
-     tokenizer = load_tokenizer('bart-large')
-
-     # pose sequence as a NLI premise and label as a hypothesis
-     premise = sequence
-     hypothesis = f'This text is about {label}.'
-
-     # run through model pre-trained on MNLI
-     x = tokenizer.encode(premise, hypothesis, return_tensors='pt',
-                          max_length=tokenizer.max_len,
-                          truncation_strategy='only_first')
-     with torch.no_grad():
-         logits = nli_model(x.to(device))[0]
-
-     # we throw away "neutral" (dim 1) and take the probability of
-     # "entailment" (2) as the probability of the label being true
-     entail_contradiction_logits = logits[:,[0,2]]
-     probs = entail_contradiction_logits.softmax(1)
-     prob_label_is_true = probs[:,1]
-
-     return prob_label_is_true.cpu()
-
- def get_most_likely(nli_model_id, sequence, labels, do_print_code):
-     predictions = []
-     for label in labels:
-         predictions.append(classify_candidate(nli_model_id, sequence, label, do_print_code))
-         do_print_code = False #only print code once per run
-     predictions = torch.cat(predictions)
-
-     most_likely = predictions.argsort().numpy()
-     top_topics = np.array(labels)[most_likely]
-     scores = predictions[most_likely].detach().numpy()
-     return top_topics, scores
-
- @st.cache(allow_output_mutation=True)
- def get_sentence_model(model_id):
-     return SentenceTransformer(model_id)
-
- def load_examples():
-     df = pd.read_json('texts.json')
+ def get_most_likely(nli_model_id, sequence, labels, hypothesis_template, multi_class, do_print_code):
+     classifier = pipeline('zero-shot-classification', model=models[nli_model_id], tokenizer=load_tokenizer(nli_model_id), device=device)
+     outputs = classifier(sequence, labels, hypothesis_template, multi_class)
+     return outputs['labels'], outputs['scores']
+
+ def load_examples(model_id):
+     model_id_stripped = model_id.split('/')[-1]
+     df = pd.read_json(f'texts-{model_id_stripped}.json')
     names = df.name.values.tolist()
     mapping = {df['name'].iloc[i]: (df['text'].iloc[i], df['labels'].iloc[i]) for i in range(len(names))}
     names.append('Custom')
@@ -107,6 +74,8 @@ def load_examples():
     return names, mapping

 def plot_result(top_topics, scores):
+     top_topics = np.array(top_topics)
+     scores = np.array(scores)
     scores *= 100
     fig = px.bar(x=scores, y=top_topics, orientation='h',
                  labels={'x': 'Confidence', 'y': 'Label'},
@@ -125,8 +94,6 @@ def main():
     with open("style.css") as f:
         st.markdown('<style>{}</style>'.format(f.read()), unsafe_allow_html=True)

-     ex_names, ex_map = load_examples()
-
     logo = Image.open('huggingface_logo.png')
     st.sidebar.image(logo, width=120)
     st.sidebar.markdown(ZSL_DESC)
@@ -136,11 +103,17 @@ def main():
     st.sidebar.markdown(MODEL_DESC[model_desc])
     st.sidebar.markdown('Originally proposed by [Yin et al. (2019)](https://arxiv.org/abs/1909.00161). Read more in our [blog post](https://joeddav.github.io/blog/2020/05/29/ZSL.html).')

+     model_id = model_ids[model_desc]
+     ex_names, ex_map = load_examples(model_id)
+
     st.title('Zero Shot Topic Classification')
     example = st.selectbox('Choose an example', ex_names)
     height = min((len(ex_map[example][0].split()) + 1) * 2, 200)
     sequence = st.text_area('Text', ex_map[example][0], key='sequence', height=height)
-     labels = st.text_input('Possible topics (comma-separated)', ex_map[example][1], max_chars=1000)
+     labels = st.text_input('Possible topics (separated by `,`)', ex_map[example][1], max_chars=1000)
+     multi_class = st.checkbox('Allow multiple correct topics', value=True)
+
+     hypothesis_template = "This text is about {}."

     labels = list(set([x.strip() for x in labels.strip().split(',') if len(x.strip()) > 0]))
     if len(labels) == 0 or len(sequence) == 0:
@@ -148,14 +121,15 @@
         return

     if do_print_code:
-         st.markdown(CODE_DESC)
-
-     model_id = model_ids[model_desc]
+         st.markdown(CODE_DESC.format(model_id))

     with st.spinner('Classifying...'):
-         top_topics, scores = get_most_likely(model_id, sequence, labels, do_print_code)
+         top_topics, scores = get_most_likely(model_id, sequence, labels, hypothesis_template, multi_class, do_print_code)
+
+     plot_result(top_topics[::-1][-10:], scores[::-1][-10:])

-     plot_result(top_topics[-10:], scores[-10:])
+     if "socat" not in [p.name() for p in psutil.process_iter()]:
+         os.system('socat tcp-listen:8000,reuseaddr,fork tcp:localhost:8001 &')
texts.json → texts-bart-large-mnli-yahoo-answers.json RENAMED
File without changes
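With this commit the example texts move from a single `texts.json` into one file per model, so the XNLI checkpoint can ship non-English demos. A small sketch of how the renamed and added files below are consumed, mirroring the `load_examples` change above (the model id picked here is just for illustration):

```python
import pandas as pd

# Mirrors load_examples() in app.py: strip the namespace from the model id
# and read the matching per-model example file.
model_id = 'joeddav/xlm-roberta-large-xnli'   # illustrative choice
model_id_stripped = model_id.split('/')[-1]
df = pd.read_json(f'texts-{model_id_stripped}.json')

# Each file holds parallel 'name', 'text', and 'labels' columns keyed "0".."4".
mapping = {df['name'].iloc[i]: (df['text'].iloc[i], df['labels'].iloc[i])
           for i in range(len(df))}

print(list(mapping))   # the example names shown in the demo's selectbox
```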
texts-bart-large-mnli.json ADDED
@@ -0,0 +1,21 @@
+ {
+ "name": {
+ "0":"\"Jupiter's Biggest Moons Started as Tiny Grains of Hail\"",
+ "1":"Who are you voting for in 2020?",
+ "2":"Attention is all you need",
+ "3":"IMDB Avengers Review",
+ "4":"Bose QuietComfort"
+ }, "text": {
+ "0":"Jupiter\u2019s Biggest Moons Started as Tiny Grains of Hail\n\nA new model offers an explanation for how the Galilean satellites formed around the solar system\u2019s largest world.\n\nKonstantin Batygin did not set out to solve one of the solar system\u2019s most puzzling mysteries when he went for a run up a hill in Nice, France. Dr. Batygin, a Caltech researcher, best known for his contributions to the search for the solar system\u2019s missing \u201cPlanet Nine,\u201d spotted a beer bottle. At a steep, 20 degree grade, he wondered why it wasn\u2019t rolling down the hill.\n\nHe realized there was a breeze at his back holding the bottle in place. Then he had a thought that would only pop into the mind of a theoretical astrophysicist: \u201cOh! This is how Europa formed.\u201d\n\nEuropa is one of Jupiter\u2019s four large Galilean moons. And in a paper published Monday in the Astrophysical Journal, Dr. Batygin and a co-author, Alessandro Morbidelli, a planetary scientist at the C\u00f4te d\u2019Azur Observatory in France, present a theory explaining how some moons form around gas giants like Jupiter and Saturn, suggesting that millimeter-sized grains of hail produced during the solar system\u2019s formation became trapped around these massive worlds, taking shape one at a time into the potentially habitable moons we know today.",
+ "1": "Who are you voting for in 2020?",
+ "2": "The dominant sequence transduction models are based on complex recurrent or convolutional neural networks in an encoder-decoder configuration. The best performing models also connect the encoder and decoder through an attention mechanism. We propose a new simple network architecture, the Transformer, based solely on attention mechanisms, dispensing with recurrence and convolutions entirely. Experiments on two machine translation tasks show these models to be superior in quality while being more parallelizable and requiring significantly less time to train. Our model achieves 28.4 BLEU on the WMT 2014 English-to-German translation task, improving over the existing best results, including ensembles by over 2 BLEU. On the WMT 2014 English-to-French translation task, our model establishes a new single-model state-of-the-art BLEU score of 41.8 after training for 3.5 days on eight GPUs, a small fraction of the training costs of the best models from the literature. We show that the Transformer generalizes well to other tasks by applying it successfully to English constituency parsing both with large and limited training data.",
+ "3": "Are you a fan of epic adventure movies? Then this is your dream come true! Truly this is the ultimate superhero mash-up and it's executed perfectly. Props to the filmmakers for taking the time to design it to be more than just a superhero film packed with action scenes and adding depth to each character.",
+ "4": "What happens when you clear away the noisy distractions of the world? Concentration goes to the next level. You get deeper into your music, your work, or whatever you want to focus on. That’s the power of Bose QuietComfort 35 wireless headphones II. Put them on and get closer to what you’re most passionate about. And that’s just the beginning. QuietComfort 35 wireless headphones II are now enabled with Bose AR — an innovative, audio-only take on augmented reality. Embedded inside your headphones is a multi-directional motion sensor. One that Bose AR can utilize to provide contextual audio based on where you are. Unlock Bose AR via a firmware update through the Bose Connect app. They’re Alexa-enabled, too, so you can enjoy entertainment, get information, and manage your day — all without looking at your phone. Adjust your level of noise cancelling between three settings using the Action button or the Bose Connect app. Volume-optimized EQ gives you balanced audio performance at any volume, and a noise-rejecting dual-microphone system provides clearer calls, even in noisy environments. And with easy Bluetooth pairing, 20 hours of battery life, and a durable, comfortable fit — you can keep the music or the quiet going all day long. Included: QuietComfort 35 II, carrying case, charging cable, audio cable for enjoying music without battery power."
+ }, "labels": {
+ "0":"space & cosmos, scientific discovery, microbiology, robots, archeology",
+ "1":"foreign policy, Europe, elections, business, 2020, outdoor recreation, politics",
+ "2":"machine learning, statistics, translation, vision",
+ "3":"films, action, superheroes, books",
+ "4":"electronics, headphones, health & wellness, furniture, software, pet supplies"
+ }
+ }
texts-xlm-roberta-large-xnli.json ADDED
@@ -0,0 +1,21 @@
+ {
+ "name": {
+ "0":"Who are you voting for in 2020? (Russian / French)",
+ "1":"Who are you voting for in 2020? (Arabic)",
+ "2":"Who are you voting for in 2020? (Turkish)",
+ "3":"IMDB Avengers Review (German / Spanish)",
+ "4":"Bose QuietComfort (Chinese)"
+ }, "text": {
+ "0": "За кого вы голосуете в 2020 году?",
+ "1": "لمن تصوت في 2020؟",
+ "2": "2020'de kime oy vereceksiniz?",
+ "3": "Bist du ein Fan von epischen Abenteuerfilmen? Dann wird dein Traum wahr! Dies ist wirklich das ultimative Superhelden-Mashup und es ist perfekt ausgeführt. Wir danken den Filmemachern, dass sie sich die Zeit genommen haben, es so zu gestalten, dass es mehr als nur ein Superheldenfilm ist, der voller Actionszenen ist und jedem Charakter Tiefe verleiht.",
+ "4": "当您清除世界上嘈杂的干扰时会发生什么?集中精力进入下一个层次。您可以更深入地了解音乐,作品或想要关注的任何事物。这就是Bose QuietComfort 35无线耳机II的强大功能。穿上它们,靠近您最热衷的事物。这仅仅是开始。现在,Bose AR启用了QuietComfort 35无线耳机II-一种创新的纯音频增强现实。耳机内部嵌入了一个多向运动传感器。 Bose AR可以利用它来根据您所在的位置提供上下文音频。通过Bose Connect应用程序通过固件更新解锁Bose AR。它们也支持Alexa,因此您无需看手机就可以享受娱乐,获取信息并管理一天。使用操作按钮或Bose Connect应用程序在三个设置之间调整消除噪音的水平。音量优化的均衡器可在任何音量下为您提供平衡的音频性能,即使在嘈杂的环境中,降噪双麦克风系统也可提供更清晰的通话。借助轻松的蓝牙配对,20小时的电池寿命以及持久,舒适的佩戴-您可以整天保持音乐或安静的氛围。包括:QuietComfort 35 II,手提箱,充电线,音频线,无需电池即可欣赏音乐。"
+ }, "labels": {
+ "0":"politique étrangère, Europe, élections, affaires, 2020, loisirs de plein air, politique",
+ "1":"السياسة الخارجية, أوروبا, الانتخابات, الأعمال التجارية, 2020, الترفيه في الهواء الطلق, السياسة",
+ "2":"dış politika, Avrupa, seçimler, iş, 2020, açık hava rekreasyonu, siyaset",
+ "3":"películas, acción, superhéroes, libros",
+ "4":"电子产品, 耳机, 健康与保健, 家具, 软件, 宠物用品"
+ }
+ }