Berbex committed on
Commit
fe33dd6
•
1 Parent(s): 2de0a92

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +124 -6
app.py CHANGED
@@ -1,5 +1,7 @@
1
- #!pip install -q transformers datasets torch gradio console_logging numpy
 
2
 
 
3
  import torch
4
  from datasets import load_dataset
5
  from console_logging.console import Console
@@ -9,8 +11,6 @@ from transformers import TrainingArguments, Trainer
9
  from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
10
  from transformers import EvalPrediction
11
  import torch
12
- import gradio as gr
13
-
14
  console = Console()
15
 
16
  dataset = load_dataset("zeroshot/twitter-financial-news-sentiment", )
@@ -112,7 +112,8 @@ def compute_metrics(p: EvalPrediction):
112
  labels=p.label_ids)
113
  return result
114
 
115
- """trainer = Trainer(
 
116
  model,
117
  args,
118
  train_dataset=encoded_dataset["train"],
@@ -120,9 +121,126 @@ def compute_metrics(p: EvalPrediction):
120
  tokenizer=tokenizer,
121
  compute_metrics=compute_metrics
122
  )
 
 
 
 
123
  """
124
 
125
- # REMOVE THIS IN COLAB #############
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
126
 
127
  text_ = "Bitcoin to the moon"
128
  model = torch.load("./model.pt", map_location=torch.device('cpu'))
@@ -168,7 +286,7 @@ with demo:
168
  """)
169
  inp = [gr.Textbox(label='Text or tweet text', placeholder="Insert text")]
170
  out = gr.Textbox(label='Output')
171
- text_button = gr.Button("Flip")
172
  text_button.click(predict, inputs=inp, outputs=out)
173
 
174
 
 
1
+ """ CODE TO TRY IN COLAB
2
+ !pip install -q transformers datasets torch gradio console_logging numpy
3
 
4
+ import gradio as gr
5
  import torch
6
  from datasets import load_dataset
7
  from console_logging.console import Console
 
11
  from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
12
  from transformers import EvalPrediction
13
  import torch
 
 
14
  console = Console()
15
 
16
  dataset = load_dataset("zeroshot/twitter-financial-news-sentiment", )
 
112
  labels=p.label_ids)
113
  return result
114
 
115
+
116
+ trainer = Trainer(
117
  model,
118
  args,
119
  train_dataset=encoded_dataset["train"],
 
121
  tokenizer=tokenizer,
122
  compute_metrics=compute_metrics
123
  )
124
+
125
+ trainer.train()
126
+
127
+ trainer.evaluate()
128
  """
129
 
130
+ # Version for Gradio and Hugging Face. It does not work like the Colab version: this version uses the exported model, possibly without the fine-tuning.
131
+
132
+ import torch
133
+ from datasets import load_dataset
134
+ from console_logging.console import Console
135
+ import numpy as np
136
+ from transformers import AutoModelForSequenceClassification, AutoTokenizer
137
+ from transformers import TrainingArguments, Trainer
138
+ from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
139
+ from transformers import EvalPrediction
140
+ import torch
141
+ import gradio as gr
142
+
143
console = Console()

# Load the sentiment dataset by name.
dataset = load_dataset("zeroshot/twitter-financial-news-sentiment")

# Only the tokenizer is needed at this point. The classification model is
# instantiated further down (with the label mappings and problem type), so
# building a default-configured copy here would only load the checkpoint
# a second time without ever being used.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Class names in label-id order: preprocess_data maps id 0 to "Bearish",
# id 1 to "Bullish" and every other id to "Neutral".
labels = ["Bearish", "Bullish", "Neutral"]
154
+
155
def preprocess_data(examples):
    """Tokenize a batch of texts and attach one-hot label rows.

    Args:
        examples: Batch mapping with a "text" column (the texts to encode)
            and a "label" column (one class id per text).

    Returns:
        The tokenizer output for the batch with an extra "labels" entry:
        a list with one row per text and one float column per name in the
        module-level ``labels`` list (1.0 for the example's class, else 0.0).
    """
    texts = examples["text"]
    # Pad/truncate every text to exactly 128 tokens.
    encoding = tokenizer(texts, padding="max_length", truncation=True, max_length=128)

    class_ids = examples["label"]
    # Per-class membership flags. Any id other than 0 or 1 counts as Neutral.
    flags_by_name = {
        "Bearish": [class_id == 0 for class_id in class_ids],
        "Bullish": [class_id == 1 for class_id in class_ids],
        "Neutral": [class_id not in (0, 1) for class_id in class_ids],
    }

    # Matrix of shape (batch_size, num_labels); columns follow `labels` order.
    labels_matrix = np.zeros((len(texts), len(labels)))
    for column, name in enumerate(labels):
        labels_matrix[:, column] = flags_by_name[name]

    encoding["labels"] = labels_matrix.tolist()
    return encoding
186
+
187
# Apply preprocess_data in batches and drop the original columns.
encoded_dataset = dataset.map(
    preprocess_data,
    batched=True,
    remove_columns=dataset["train"].column_names,
)

encoded_dataset.set_format("torch")

# Index <-> class-name mappings passed to the model below.
id2label = dict(enumerate(labels))
label2id = {label: idx for idx, label in id2label.items()}

model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    problem_type="multi_label_classification",
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id,
)

batch_size = 8
metric_name = "f1"

args = TrainingArguments(
    # Plain string: the original f-string had no placeholders.
    "bert-finetuned-sem_eval-english",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    # push_to_hub=True,
)
216
+
217
+ # source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/
218
def multi_label_metrics(predictions, labels, threshold=0.5):
    """Compute micro F1, micro ROC AUC and accuracy for multi-label output.

    Args:
        predictions: Raw model scores; converted to a tensor and passed
            through a sigmoid before thresholding.
        labels: Ground-truth label matrix handed to the metric functions.
        threshold: Probability at or above which a label counts as predicted.

    Returns:
        Dict with the keys 'f1', 'roc_auc' and 'accuracy'.
    """
    # Squash the raw scores into probabilities.
    probabilities = torch.sigmoid(torch.Tensor(predictions))
    # Binarise: 1.0 where the probability reaches the threshold, else 0.0.
    hard_predictions = (probabilities >= threshold).numpy().astype(float)
    # Note: all three metrics, ROC AUC included, use the thresholded values.
    return {
        'f1': f1_score(y_true=labels, y_pred=hard_predictions, average='micro'),
        'roc_auc': roc_auc_score(labels, hard_predictions, average='micro'),
        'accuracy': accuracy_score(labels, hard_predictions),
    }
235
+
236
def compute_metrics(p: EvalPrediction):
    """Return the multi-label metrics for an evaluation prediction.

    When ``p.predictions`` is a tuple, only its first element is scored;
    otherwise the value is used as is. Labels come from ``p.label_ids``.
    """
    scores = p.predictions
    if isinstance(scores, tuple):
        scores = scores[0]
    return multi_label_metrics(predictions=scores, labels=p.label_ids)
243
+
244
 
245
  text_ = "Bitcoin to the moon"
246
  model = torch.load("./model.pt", map_location=torch.device('cpu'))
 
286
  """)
287
  inp = [gr.Textbox(label='Text or tweet text', placeholder="Insert text")]
288
  out = gr.Textbox(label='Output')
289
+ text_button = gr.Button("Get the text sentiment")
290
  text_button.click(predict, inputs=inp, outputs=out)
291
 
292