Berbex committed on
Commit
71d11a0
•
1 Parent(s): 5a64574

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +27 -30
app.py CHANGED
@@ -3,16 +3,22 @@ import torch
3
  from datasets import load_dataset
4
  from console_logging.console import Console
5
  import numpy as np
 
 
 
 
 
6
  console = Console()
7
 
8
  dataset = load_dataset("zeroshot/twitter-financial-news-sentiment", )
9
 
10
- from transformers import AutoModelForSequenceClassification, AutoTokenizer
11
 
12
  model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased")
13
  tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
14
 
15
- labels = [label for label in dataset['train'].features.keys() if label not in ['text']]
 
 
16
 
17
  def preprocess_data(examples):
18
  # take a batch of texts
@@ -20,7 +26,22 @@ def preprocess_data(examples):
20
  # encode them
21
  encoding = tokenizer(text, padding="max_length", truncation=True, max_length=128)
22
  # add labels
23
- labels_batch = {k: examples[k] for k in examples.keys() if k in labels}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
  # create numpy array of shape (batch_size, num_labels)
25
  labels_matrix = np.zeros((len(text), len(labels)))
26
  # fill numpy array
@@ -38,7 +59,8 @@ encoded_dataset.set_format("torch")
38
  id2label = {idx:label for idx, label in enumerate(labels)}
39
  label2id = {label:idx for idx, label in enumerate(labels)}
40
 
41
- model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased",
 
42
  num_labels=len(labels),
43
  id2label=id2label,
44
  label2id=label2id)
@@ -46,8 +68,6 @@ model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased",
46
  batch_size = 8
47
  metric_name = "f1"
48
 
49
- from transformers import TrainingArguments, Trainer
50
-
51
  args = TrainingArguments(
52
  f"bert-finetuned-sem_eval-english",
53
  evaluation_strategy = "epoch",
@@ -62,11 +82,6 @@ args = TrainingArguments(
62
  #push_to_hub=True,
63
  )
64
 
65
-
66
- from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
67
- from transformers import EvalPrediction
68
- import torch
69
-
70
  # source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/
71
  def multi_label_metrics(predictions, labels, threshold=0.5):
72
  # first, apply sigmoid on predictions which are of shape (batch_size, num_labels)
@@ -106,22 +121,4 @@ trainer = Trainer(
106
 
107
  trainer.train()
108
 
109
- trainer.evaluate()
110
-
111
- """
112
-
113
- categories = ('Car in good condition','Damaged Car')
114
-
115
- def is_car(x) : return x[0].isupper()
116
-
117
- def image_classifier(img):
118
- pred,index,probs = learn.predict(img)
119
- return dict(zip(categories, map(float,probs)))
120
-
121
- # image = gr.inputs.Image(shape=(192,192))
122
- image = gr.components.Image(shape=(192,192))
123
- label = gr.components.Label()
124
- examples = ['./car.jpg','./crash.jpg','./carf.jpg']
125
-
126
- intf = gr.Interface(fn= image_classifier,inputs=image,outputs=label,examples=examples)
127
- intf.launch()"""
 
3
  from datasets import load_dataset
4
  from console_logging.console import Console
5
  import numpy as np
6
+ from transformers import AutoModelForSequenceClassification, AutoTokenizer
7
+ from transformers import TrainingArguments, Trainer
8
+ from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
9
+ from transformers import EvalPrediction
10
+ import torch
11
  console = Console()
12
 
13
  dataset = load_dataset("zeroshot/twitter-financial-news-sentiment", )
14
 
 
15
 
16
  model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased")
17
  tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
18
 
19
+ #labels = [label for label in dataset['train'].features.keys() if label not in ['text']]
20
+
21
+ labels = ["Bearish", "Bullish", "Neutral"]
22
 
23
  def preprocess_data(examples):
24
  # take a batch of texts
 
26
  # encode them
27
  encoding = tokenizer(text, padding="max_length", truncation=True, max_length=128)
28
  # add labels
29
+ #labels_batch = {k: examples[k] for k in examples.keys() if k in labels}
30
+ labels_batch = {'Bearish': [], 'Bullish': [], 'Neutral': []}
31
+ for i in range (len(examples['label'])):
32
+ labels_batch["Bearish"].append(False)
33
+ labels_batch["Bullish"].append(False)
34
+ labels_batch["Neutral"].append(False)
35
+
36
+ if examples['label'][i] == 0:
37
+ labels_batch["Bearish"][i] = True
38
+
39
+ elif examples['label'][i] == 1:
40
+ labels_batch["Bullish"][i] = True
41
+
42
+ else:
43
+ labels_batch["Neutral"][i] = True
44
+
45
  # create numpy array of shape (batch_size, num_labels)
46
  labels_matrix = np.zeros((len(text), len(labels)))
47
  # fill numpy array
 
59
  id2label = {idx:label for idx, label in enumerate(labels)}
60
  label2id = {label:idx for idx, label in enumerate(labels)}
61
 
62
+ model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased",
63
+ problem_type="multi_label_classification",
64
  num_labels=len(labels),
65
  id2label=id2label,
66
  label2id=label2id)
 
68
  batch_size = 8
69
  metric_name = "f1"
70
 
 
 
71
  args = TrainingArguments(
72
  f"bert-finetuned-sem_eval-english",
73
  evaluation_strategy = "epoch",
 
82
  #push_to_hub=True,
83
  )
84
 
 
 
 
 
 
85
  # source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/
86
  def multi_label_metrics(predictions, labels, threshold=0.5):
87
  # first, apply sigmoid on predictions which are of shape (batch_size, num_labels)
 
121
 
122
  trainer.train()
123
 
124
+ trainer.evaluate()