lucabadiali committed on
Commit 85c00b7 · 1 Parent(s): 5396065

Added env config file

.gitignore CHANGED
@@ -5,4 +5,4 @@ data/__pycache__
  data/dataset
  app/__pycache__
  *.pyc
- .env
+ .env

env_config.sh ADDED
@@ -0,0 +1,6 @@
+ export MODEL_SOURCE="hf"
+ export EVAL_SAMPLE_SIZE=100
+ export EVAL_PERIOD_MIN=30
+ export EVAL_BATCH_SIZE=64
+ export TRAIN_FRACTION_SIZE=0.2
+ export EVAL_FRACTION_SIZE=0.4
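
For these settings to reach the app, the file has to be exported into the process environment (e.g. sourced in the shell that launches the server). A minimal sketch of the read-back side, mirroring the names and defaults that src/app/config.py uses further down in this commit:

    # Sketch: how the exported variables are read back in Python.
    # Assumes `source env_config.sh` was run in the launching shell.
    import os

    MODEL_SOURCE = os.getenv("MODEL_SOURCE", "hf")
    EVAL_SAMPLE_SIZE = int(os.getenv("EVAL_SAMPLE_SIZE", "100"))   # samples per evaluation run
    EVAL_PERIOD_MIN = float(os.getenv("EVAL_PERIOD_MIN", "30"))    # minutes between runs
    EVAL_BATCH_SIZE = int(os.getenv("EVAL_BATCH_SIZE", "64"))      # inference batch size
    print(MODEL_SOURCE, EVAL_SAMPLE_SIZE, EVAL_PERIOD_MIN, EVAL_BATCH_SIZE)
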
prometheus.yml CHANGED
@@ -1,5 +1,5 @@
  global:
-   scrape_interval: 30s # gentler on HF
+   scrape_interval: 60s # gentler on HF

  scrape_configs:
    - job_name: "fastapi_hf"
@@ -7,9 +7,9 @@ scrape_configs:
      metrics_path: /metrics
      static_configs:
        - targets:
-         #- "host.docker.internal:8000"
-         - "lucabadiali-ml-ops-project.hf.space:443"
-         # If the Space is private, add:
+         - "host.docker.internal:8000"
+         #- "lucabadiali-ml-ops-project.hf.space:443"
+         # If the Space is private, add:
          # authorization:
          #   type: Bearer
          #   credentials: "hf_XXXXXXXXXXXX" # read-only token
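
With the scrape target flipped to the local container, a quick smoke test of the endpoint Prometheus will poll can save a debugging round trip. A sketch, assuming the FastAPI app is reachable on localhost:8000 as the new target implies:

    # Sketch: confirm the app serves /metrics before pointing Prometheus at it.
    import requests

    resp = requests.get("http://localhost:8000/metrics", timeout=5)
    resp.raise_for_status()
    # model_evaluation_accuracy is the gauge registered in src/app/app.py
    assert "model_evaluation_accuracy" in resp.text
    print("metrics endpoint OK")
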
src/app/__pycache__/config.cpython-311.pyc CHANGED
Binary files a/src/app/__pycache__/config.cpython-311.pyc and b/src/app/__pycache__/config.cpython-311.pyc differ
 
src/app/app.py CHANGED
@@ -1,84 +1,77 @@
+ ###### IMPORTS
+
+ ########
+ # Imports for app and model creation
  from fastapi import FastAPI, HTTPException
+ from pydantic import BaseModel
+ import requests
+ from typing import Union, List
+
+ ##########
+ # Imports for model creation/usage
+ import torch
  from transformers import AutoTokenizer, AutoModelForSequenceClassification
- from .utils import preprocess, load_model_and_tokenizer
  from scipy.special import softmax
  import numpy as np
- from pydantic import BaseModel
  import urllib.request
  import csv
- import requests
- from typing import Union, List
- import torch
- from .config import MODEL_SOURCE, ModelSource
- from prometheus_fastapi_instrumentator import Instrumentator
+
+ #################
+ # LOCAL IMPORTS
+ from .config import MODEL_SOURCE, ModelSource, EVAL_BATCH_SIZE, EVAL_SAMPLE_SIZE, DATASET_PATH, EVAL_PERIOD_MIN
+ from .utils import preprocess, load_model_and_tokenizer, load_dataset

  ##################
+ # Imports for app monitoring
+ from prometheus_fastapi_instrumentator import Instrumentator
  from prometheus_client import Counter, Gauge
  from apscheduler.schedulers.background import BackgroundScheduler
- from datetime import datetime
- import os
- import random
- import pandas as pd
+ from datetime import datetime, timedelta
+ import threading
  #################


- #############
- from .config import EVAL_BATCH_SIZE, N_SAMPLES, DATASET_PATH, EVAL_PERIOD_MIN
- from .utils import load_dataset
- ###########

+ #################
+ # App creation and metrics exposition
  app = FastAPI()
  Instrumentator().instrument(app).expose(app, endpoint="/metrics", include_in_schema=False)

- ###################
- # ---------- Metrics (custom) ----------
- # Production predictions distribution (unlabeled)
- # PRED_COUNTER = Counter(
- #     "sentiment_requests_total",
- #     "Total predictions served by label",
- #     ["label"]
- # )
-
-
- # EVAL_SAMPLE_SIZE = Gauge(
- #     "model_evaluation_sample_size",
- #     "Number of samples used in the latest periodic evaluation"
- # )
- # EVAL_COUNTER_DIST = Counter(
- #     "sentiment_test_distribution_total",
- #     "Cumulative predicted label counts on evaluation samples",
- #     ["label"]
- # )
- # EVAL_RUNS = Counter(
- #     "model_evaluations_total",
- #     "Total number of evaluation runs completed"
- # )
- ##################
-

+ #################
+ # class for transferring post request data
  class SentimentQuery(BaseModel):
      input_texts: Union[str, List[str]]

+
+ #################
+ # Retrieve the model either locally or via download
+ tokenizer, model = load_model_and_tokenizer(MODEL_SOURCE)
+ model.eval()
+
+ ##############
+ # retrieve the label-to-int mapping from the model repo
  mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/sentiment/mapping.txt"
  with urllib.request.urlopen(mapping_link) as f:
      html = f.read().decode('utf-8').split("\n")
      csvreader = csv.reader(html, delimiter='\t')
  labels = [row[1] for row in csvreader if len(row) > 1]
+ #############

- tokenizer, model = load_model_and_tokenizer(MODEL_SOURCE)
- model.eval()

  @app.get("/")
  def read_root():
      return {"status": "ok", "message": "Sentiment API is running"}

  @app.post("/predict")
- async def analyze_text(query:SentimentQuery):
+ async def analyze_text(query: SentimentQuery) -> dict:
+     """
+     Processes an input query containing one or more text messages and returns a response
+     with the prediction and the sentiment score for each message.
+     """

      if isinstance(query.input_texts, str):
          input_texts = [query.input_texts]
@@ -102,8 +95,6 @@ async def analyze_text(query:SentimentQuery):
      for i,text in enumerate(input_texts):

          predicted = labels[pred_labels[i]]
-         #PRED_COUNTER.labels(label=predicted).inc()
-

          response_body.append(
              {
@@ -124,16 +115,25 @@ async def analyze_text(query:SentimentQuery):


- def evaluate_accuracy():
+ # Evaluation metrics on labeled test set
+ EVAL_ACCURACY = Gauge(
+     "model_evaluation_accuracy",
+     "Accuracy on latest periodic evaluation of labeled test subset"
+ )
+
+ def evaluate_accuracy(N_SAMPLES: int, BATCH_SIZE: int) -> float:
+     """
+     Evaluates and returns the model accuracy on a random subset of the test dataset.
+     """
      dataset = load_dataset(DATASET_PATH).shuffle()["test"][:N_SAMPLES]
-     N_BATCHES = len(dataset["text"])//EVAL_BATCH_SIZE
+     N_BATCHES = len(dataset["text"])//BATCH_SIZE

      accuracy = 0
      for i in range(N_BATCHES+1):
          if i == N_BATCHES :
-             samples, labels = dataset["text"][i*EVAL_BATCH_SIZE:], dataset["label"][i*EVAL_BATCH_SIZE:]
+             samples, labels = dataset["text"][i*BATCH_SIZE:], dataset["label"][i*BATCH_SIZE:]
          else:
-             samples, labels = dataset["text"][i*EVAL_BATCH_SIZE:(i+1)*EVAL_BATCH_SIZE], dataset["label"][i*EVAL_BATCH_SIZE:(i+1)*EVAL_BATCH_SIZE]
+             samples, labels = dataset["text"][i*BATCH_SIZE:(i+1)*BATCH_SIZE], dataset["label"][i*BATCH_SIZE:(i+1)*BATCH_SIZE]

          model.eval()
          encoded_batch = tokenizer(
@@ -150,37 +150,88 @@ def evaluate_accuracy():
          scores = softmax(logits, axis=-1)
          pred_labels = scores.argmax(axis=-1)
          accuracy += sum(pred_labels==labels)
+
      accuracy/=N_SAMPLES
      return accuracy


- # Evaluation metrics (labeled test set)
- EVAL_ACCURACY = Gauge(
-     "model_evaluation_accuracy",
-     "Accuracy on latest periodic evaluation of labeled test subset"
+ # Sentiment distribution over the unlabelled set
+ SENTIMENT_BATCH_FRACTION = Gauge(
+     "sentiment_batch_fraction",
+     "Fraction of predictions in the latest monitored batch, by label (0..1).",
+     ["label"]
  )

- from apscheduler.schedulers.background import BackgroundScheduler
- from datetime import datetime, timedelta
- import threading
+ def evaluate_sentiment_distribution(N_SAMPLES: int, BATCH_SIZE: int) -> np.ndarray:
+     """
+     Evaluates and returns the sentiment distribution over a random subset of the test dataset.
+     """
+     dataset = load_dataset(DATASET_PATH).shuffle()["test"][:N_SAMPLES]
+     N_BATCHES = len(dataset["text"])//BATCH_SIZE
+
+     model.eval()
+
+     counts = np.array([0., 0., 0.])
+     for i in range(N_BATCHES+1):
+         if i == N_BATCHES :
+             samples = dataset["text"][i*BATCH_SIZE:]
+         else:
+             samples = dataset["text"][i*BATCH_SIZE:(i+1)*BATCH_SIZE]
+
+         encoded_batch = tokenizer(
+             [preprocess(t) for t in samples],
+             padding=True,       # pad to same length
+             truncation=True,    # truncate long texts
+             return_tensors="pt",
+         )
+
+         with torch.no_grad():
+             output = model(**encoded_batch)
+
+         logits = output[0].detach().cpu().numpy()
+         scores = softmax(logits, axis=-1)
+         pred_labels = scores.argmax(axis=-1)
+         counts += np.unique(pred_labels, return_counts=True)[1]
+
+     fractions = counts/N_SAMPLES
+     return fractions
+
+
+ ##################
+ # scheduler creation for managing the metric-creation jobs
+ scheduler = BackgroundScheduler(daemon=True)
+ # threading lock to handle possible concurrent requests
  _model_lock = threading.Lock()

- def _run_eval_and_set_gauge():
-     # If you expect concurrent requests to /predict, the lock prevents GPU/torch contention
+ ############
+ # jobs to be launched periodically
+
+ def _run_eval_and_send_data():
      with _model_lock:
-         acc = evaluate_accuracy()
+         acc = evaluate_accuracy(EVAL_SAMPLE_SIZE, EVAL_BATCH_SIZE)
          EVAL_ACCURACY.set(acc)

- scheduler = BackgroundScheduler(daemon=True)
+ def _run_sentiment_distr_and_send_data():
+     with _model_lock:
+         fractions = evaluate_sentiment_distribution(EVAL_SAMPLE_SIZE, EVAL_BATCH_SIZE)
+         for i, label in enumerate(labels):
+             SENTIMENT_BATCH_FRACTION.labels(label=label).set(fractions[i])

  @app.on_event("startup")
  def _start_scheduler():

      # run once soon after startup
-     scheduler.add_job(_run_eval_and_set_gauge, next_run_time=datetime.now() + timedelta(seconds=2))
+     scheduler.add_job(_run_eval_and_send_data, next_run_time=datetime.now() + timedelta(seconds=2))
      # then every EVAL_PERIOD_MIN minutes
-     scheduler.add_job(_run_eval_and_set_gauge, "interval", minutes=EVAL_PERIOD_MIN)
+     scheduler.add_job(_run_eval_and_send_data, "interval", minutes=EVAL_PERIOD_MIN)
+
+     # run once soon after startup
+     scheduler.add_job(_run_sentiment_distr_and_send_data, next_run_time=datetime.now() + timedelta(seconds=2))
+     # then every EVAL_PERIOD_MIN minutes
+     scheduler.add_job(_run_sentiment_distr_and_send_data, "interval", minutes=EVAL_PERIOD_MIN)
+
      scheduler.start()

  @app.on_event("shutdown")
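
One caveat in the new evaluate_sentiment_distribution: np.unique(pred_labels, return_counts=True)[1] returns counts only for the labels that actually occur in a batch, so a batch with zero predictions for some class shortens the array and misaligns the running totals (and the final short batch is empty whenever N_SAMPLES is an exact multiple of BATCH_SIZE). A sketch of a safer accumulation, offered as a suggestion rather than as part of the commit:

    # Sketch: np.bincount keeps one slot per class even when a class is absent.
    import numpy as np

    pred_labels = np.array([0, 2, 2, 0])   # hypothetical batch, no "neutral" (1)
    counts = np.zeros(3)
    # np.unique(...)[1] would give [2, 2] here and shift the totals;
    # bincount with minlength=3 gives [2, 0, 2], aligned with the label indices.
    counts += np.bincount(pred_labels, minlength=3)
    print(counts)   # [2. 0. 2.]
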
src/app/config.py CHANGED
@@ -2,7 +2,6 @@ import os
  from enum import Enum
  from pathlib import Path

-
  class ModelSource(str, Enum):
      HF = "hf"
      LOCAL = "local"
@@ -10,24 +9,9 @@ class ModelSource(str, Enum):
  MODEL_SOURCE = ModelSource(os.getenv("MODEL_SOURCE", "hf"))
  HF_MODEL = f"cardiffnlp/twitter-roberta-base-sentiment-latest"
  DATASET_PATH = Path("data/dataset")
+ EVAL_SAMPLE_SIZE = int(os.getenv("EVAL_SAMPLE_SIZE", "100"))
+ EVAL_PERIOD_MIN = float(os.getenv("EVAL_PERIOD_MIN", "30"))
+ EVAL_BATCH_SIZE = int(os.getenv("EVAL_BATCH_SIZE", "64"))
+ TRAIN_FRACTION_SIZE = float(os.getenv("TRAIN_FRACTION_SIZE", "0.2"))
+ EVAL_FRACTION_SIZE = float(os.getenv("EVAL_FRACTION_SIZE", "0.4"))

-
- EVAL_SAMPLE_SIZE = int(os.getenv("EVAL_SAMPLE_SIZE", "80"))
- EVAL_INTERVAL_HOURS = float(os.getenv("EVAL_INTERVAL_HOURS", "1"))
- RANDOM_SEED = int(os.getenv("RANDOM_SEED", "42"))
-
- EVAL_BATCH_SIZE = 64
- N_SAMPLES = 500
- EVAL_PERIOD_MIN = 1
-
-
- # def load_model_and_tokenizer(MODEL_SOURCE):
- #     if MODEL_SOURCE == ModelSource.HF: # use the latest model available in the HF hub
- #         tokenizer = AutoTokenizer.from_pretrained(HF_MODEL)
- #         model = AutoModelForSequenceClassification.from_pretrained(HF_MODEL)
- #     else: # use a locally fine tuned model
- #         local_model_path = Path("models/saved_model")
- #         assert local_model_path.exists(), """No local model was found. Run 'python3 src/train_model.py' first"""
- #         tokenizer = AutoTokenizer.from_pretrained("models/saved_tokenizer")
- #         model = AutoModelForSequenceClassification.from_pretrained("models/saved_model")
- #     return tokenizer, model
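
Because ModelSource is a str Enum, the ModelSource(os.getenv(...)) call at import time doubles as validation: a typo in the environment fails fast rather than silently selecting the wrong model path. A small sketch of the behaviour:

    # Sketch: the enum constructor validates the environment value at import time.
    import os
    from enum import Enum

    class ModelSource(str, Enum):
        HF = "hf"
        LOCAL = "local"

    os.environ["MODEL_SOURCE"] = "hf"
    print(ModelSource(os.getenv("MODEL_SOURCE", "hf")))   # ModelSource.HF
    # ModelSource("s3") would raise ValueError: 's3' is not a valid ModelSource
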
src/app/utils.py CHANGED
@@ -5,7 +5,10 @@ from datasets import load_dataset as hf_load_dataset
  from datasets import load_from_disk


- def preprocess(text):
+ def preprocess(text: str) -> str:
+     """
+     Returns an input text ready to be tokenized, masking user mentions and links.
+     """
      new_text = []
      for t in text.split(" "):
          t = '@user' if t.startswith('@') and len(t) > 1 else t
@@ -14,8 +17,11 @@ def preprocess(text):
      return " ".join(new_text)


-
- def load_model_and_tokenizer(MODEL_SOURCE):
+ def load_model_and_tokenizer(MODEL_SOURCE: str) -> (AutoTokenizer, AutoModelForSequenceClassification):
+     """
+     Loads a tokenizer and a sentiment-analysis model, either from local storage
+     or downloaded from the Hugging Face Hub.
+     """
      if MODEL_SOURCE == ModelSource.HF: # use the latest model available in the HF hub
          tokenizer = AutoTokenizer.from_pretrained(HF_MODEL)
          model = AutoModelForSequenceClassification.from_pretrained(HF_MODEL)
@@ -27,7 +33,11 @@ def load_model_and_tokenizer(MODEL_SOURCE):
      return tokenizer, model


- def load_dataset(dataset_path):
+ def load_dataset(dataset_path: str):
+     """
+     Loads the tweet_eval dataset for the sentiment-analysis task, either from
+     local disk or downloaded through the Hugging Face API.
+     """
      if dataset_path.exists():
          dataset = load_from_disk(dataset_path)
      else:
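
The hunk hides the middle of preprocess; assuming the standard cardiffnlp preprocessing it is based on (mentions become '@user' and links become 'http'), a quick check looks like:

    # Sketch: expected preprocess behaviour (the link-masking step is an
    # assumption, since that part of the body is outside the visible hunk).
    from app.utils import preprocess   # assumes imports resolve from src/

    print(preprocess("@elonmusk check https://example.com out"))
    # expected: "@user check http out"
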
src/nb.ipynb DELETED
@@ -1,208 +0,0 @@
- {
-  "cells": [
-   {
-    "cell_type": "code",
-    "execution_count": 48,
-    "id": "7aaceacb",
-    "metadata": {},
-    "outputs": [],
-    "source": [
-     "from pathlib import Path\n",
-     "from app.config import DATASET_PATH, MODEL_SOURCE\n",
-     "from app.utils import load_dataset, load_model_and_tokenizer, preprocess\n",
-     "from scipy.special import softmax\n"
-    ]
-   },
-   {
-    "cell_type": "code",
-    "execution_count": 49,
-    "id": "7defab3e",
-    "metadata": {},
-    "outputs": [
-     {
-      "name": "stderr",
-      "output_type": "stream",
-      "text": [
-       "Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']\n",
-       "- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
-       "- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n"
-      ]
-     }
-    ],
-    "source": [
-     "tokenizer, model = load_model_and_tokenizer(MODEL_SOURCE)\n"
-    ]
-   },
-   {
-    "cell_type": "code",
-    "execution_count": 24,
-    "id": "0a1dcfdd",
-    "metadata": {},
-    "outputs": [],
-    "source": [
-     "dataset = load_dataset(DATASET_PATH).shuffle()[\"test\"][:N_SAMPLES]\n"
-    ]
-   },
-   {
-    "cell_type": "code",
-    "execution_count": 33,
-    "id": "501e6728",
-    "metadata": {},
-    "outputs": [],
-    "source": [
-     "import torch"
-    ]
-   },
-   {
-    "cell_type": "code",
-    "execution_count": 47,
-    "id": "82b25de1",
-    "metadata": {},
-    "outputs": [
-     {
-      "data": {
-       "text/plain": [
-        "2"
-       ]
-      },
-      "execution_count": 47,
-      "metadata": {},
-      "output_type": "execute_result"
-     }
-    ],
-    "source": [
-     "N_BEVAL_BATCH_SIZE = 64\n",
-     "N_SAMPLES = 500\n",
-     "N_BATCHES = len(dataset[\"text\"])//EVAL_BATCH_SIZE\n",
-     "N_BATCHES"
-    ]
-   },
-   {
-    "cell_type": "code",
-    "execution_count": 54,
-    "id": "7dd5371b",
-    "metadata": {},
-    "outputs": [
-     {
-      "name": "stdout",
-      "output_type": "stream",
-      "text": [
-       "0 64\n",
-       "64 128\n",
-       "128 192\n",
-       "192 256\n",
-       "256 320\n",
-       "320 384\n",
-       "384 448\n",
-       "448 500\n"
-      ]
-     },
-     {
-      "data": {
-       "text/plain": [
-        "np.float64(0.71)"
-       ]
-      },
-      "execution_count": 54,
-      "metadata": {},
-      "output_type": "execute_result"
-     }
-    ],
-    "source": [
-     "EVAL_BATCH_SIZE = 64\n",
-     "N_SAMPLES = 500\n",
-     "def evaluate_accuracy():\n",
-     "\n",
-     "    dataset = load_dataset(DATASET_PATH).shuffle()[\"test\"][:N_SAMPLES]\n",
-     "    N_BATCHES = len(dataset[\"text\"])//EVAL_BATCH_SIZE\n",
-     "\n",
-     "    accuracy = 0\n",
-     "    for i in range(N_BATCHES+1):\n",
-     "\n",
-     "        start = i*EVAL_BATCH_SIZE\n",
-     "        end = min(N_SAMPLES, (i+1)*EVAL_BATCH_SIZE)\n",
-     "        print(start, end)\n",
-     "        samples, labels = dataset[\"text\"][start:end], dataset[\"label\"][start:end]\n",
-     "        \n",
-     "        model.eval()\n",
-     "        encoded_batch = tokenizer(\n",
-     "            [preprocess(t) for t in samples],\n",
-     "            padding=True,    # pad to same length\n",
-     "            truncation=True, # truncate long texts\n",
-     "            return_tensors=\"pt\",\n",
-     "        )\n",
-     "\n",
-     "        with torch.no_grad():\n",
-     "            output = model(**encoded_batch)\n",
-     "        \n",
-     "        logits = output[0].detach().cpu().numpy()\n",
-     "        scores = softmax(logits, axis=-1)\n",
-     "        pred_labels = scores.argmax(axis=-1)\n",
-     "        accuracy += sum(pred_labels==labels)\n",
-     "    accuracy/=N_SAMPLES\n",
-     "    return accuracy\n",
-     "evaluate_accuracy()"
-    ]
-   },
-   {
-    "cell_type": "code",
-    "execution_count": 1,
-    "id": "dbd3bb8c",
-    "metadata": {},
-    "outputs": [],
-    "source": [
-     "def _load_test_data():\n",
-     "    \"\"\"\n",
-     "    Expects CSV with columns: text,label\n",
-     "    label values must be one of labels (negative, neutral, positive) or their indices (0,1,2).\n",
-     "    \"\"\"\n",
-     "    df = pd.read_csv(TEST_DATA_PATH)\n",
-     "    # normalize label column to strings matching our 'labels' list\n",
-     "    if np.issubdtype(df[\"label\"].dtype, np.number):\n",
-     "        df[\"label\"] = df[\"label\"].astype(int).map(lambda i: labels[i])\n",
-     "    else:\n",
-     "        df[\"label\"] = df[\"label\"].str.lower().str.strip()\n",
-     "    # keep only supported labels\n",
-     "    df = df[df[\"label\"].isin(labels)].dropna(subset=[\"text\", \"label\"])\n",
-     "    return df"
-    ]
-   },
-   {
-    "cell_type": "code",
-    "execution_count": null,
-    "id": "ec0b086e",
-    "metadata": {},
-    "outputs": [],
-    "source": []
-   },
-   {
-    "cell_type": "code",
-    "execution_count": null,
-    "id": "800c8018",
-    "metadata": {},
-    "outputs": [],
-    "source": []
-   }
-  ],
-  "metadata": {
-   "kernelspec": {
-    "display_name": "ProjectEnv",
-    "language": "python",
-    "name": "python3"
-   },
-   "language_info": {
-    "codemirror_mode": {
-     "name": "ipython",
-     "version": 3
-    },
-    "file_extension": ".py",
-    "mimetype": "text/x-python",
-    "name": "python",
-    "nbconvert_exporter": "python",
-    "pygments_lexer": "ipython3",
-    "version": "3.11.10"
-   }
-  },
-  "nbformat": 4,
-  "nbformat_minor": 5
- }

src/train_model.py CHANGED
@@ -9,7 +9,13 @@ from transformers import (
      DataCollatorWithPadding
  )
  from pathlib import Path
- from app.config import DATASET_PATH
+ import os
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
+ from datasets import DatasetDict
+ from transformers import AutoTokenizer, DataCollatorWithPadding
+ from app.config import DATASET_PATH, TRAIN_FRACTION_SIZE, EVAL_FRACTION_SIZE
+
+


  # --- Device detection ---
@@ -77,18 +83,11 @@ model.config.use_cache = False
  dataset = load_dataset(DATASET_PATH)


- # ---- COPY-PASTE FROM HERE ----
- import os
- os.environ["TOKENIZERS_PARALLELISM"] = "false"
-
- from datasets import DatasetDict
- from transformers import AutoTokenizer, DataCollatorWithPadding
-
  def make_trainer_ready(
      raw_ds: DatasetDict,
      model_name: str = "cardiffnlp/twitter-roberta-base-sep2022",
      train_frac: float = 0.2,
-     val_frac: float = 0.2,
+     val_frac: float = 0.4,
      seed: int = 42,
      label_col: str = "label",
      text_col: str = "text",
@@ -156,8 +155,8 @@ def make_trainer_ready(
  train_ds, eval_ds, data_collator, tokenizer = make_trainer_ready(
      raw_ds=dataset,
      model_name="cardiffnlp/twitter-roberta-base-sep2022",
-     train_frac=0.2, # take 20% of train
-     val_frac=0.5, # take 50% of validation
+     train_frac=TRAIN_FRACTION_SIZE,  # take TRAIN_FRACTION_SIZE of the train split
+     val_frac=EVAL_FRACTION_SIZE,     # take EVAL_FRACTION_SIZE of the validation split
      seed=42,
      label_col="label",
      text_col="text",
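
The body of make_trainer_ready is outside the visible hunks, so the following is only a sketch of the fraction-based subsampling its train_frac/val_frac parameters suggest, not the committed implementation:

    # Sketch: take a seeded random fraction of a split, as train_frac implies.
    from datasets import load_dataset

    ds = load_dataset("tweet_eval", "sentiment")
    train_frac, seed = 0.2, 42
    n = int(len(ds["train"]) * train_frac)
    train_subset = ds["train"].shuffle(seed=seed).select(range(n))
    print(len(ds["train"]), "->", len(train_subset))
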
tests/test_data.py DELETED
@@ -1,4 +0,0 @@
- import pytest
-
- from datasets import load_dataset
-