Spaces:
Running
Running
lucabadiali
committed on
Commit
·
85c00b7
1
Parent(s):
5396065
Added env config file
Browse files- .gitignore +1 -1
- env_config.sh +6 -0
- prometheus.yml +4 -4
- src/app/__pycache__/config.cpython-311.pyc +0 -0
- src/app/app.py +115 -64
- src/app/config.py +5 -21
- src/app/utils.py +14 -4
- src/nb.ipynb +0 -208
- src/train_model.py +10 -11
- tests/test_data.py +0 -4
.gitignore
CHANGED
|
@@ -5,4 +5,4 @@ data/__pycache__
|
|
| 5 |
data/dataset
|
| 6 |
app/__pycache__
|
| 7 |
*.pyc
|
| 8 |
-
.env
|
|
|
|
| 5 |
data/dataset
|
| 6 |
app/__pycache__
|
| 7 |
*.pyc
|
| 8 |
+
.env
|
env_config.sh
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
export MODEL_SOURCE="hf"
|
| 2 |
+
export EVAL_SAMPLE_SIZE=100
|
| 3 |
+
export EVAL_PERIOD_MIN=30
|
| 4 |
+
export EVAL_BATCH_SIZE=64
|
| 5 |
+
export TRAIN_FRACTION_SIZE=0.2
|
| 6 |
+
export EVAL_FRACTION_SIZE=0.4
|
prometheus.yml
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
global:
|
| 2 |
-
scrape_interval:
|
| 3 |
|
| 4 |
scrape_configs:
|
| 5 |
- job_name: "fastapi_hf"
|
|
@@ -7,9 +7,9 @@ scrape_configs:
|
|
| 7 |
metrics_path: /metrics
|
| 8 |
static_configs:
|
| 9 |
- targets:
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
# Se
|
| 13 |
# authorization:
|
| 14 |
# type: Bearer
|
| 15 |
# credentials: "hf_XXXXXXXXXXXX" # token read-only
|
|
|
|
| 1 |
global:
|
| 2 |
+
scrape_interval: 60s # più gentile verso HF
|
| 3 |
|
| 4 |
scrape_configs:
|
| 5 |
- job_name: "fastapi_hf"
|
|
|
|
| 7 |
metrics_path: /metrics
|
| 8 |
static_configs:
|
| 9 |
- targets:
|
| 10 |
+
- "host.docker.internal:8000"
|
| 11 |
+
#- "lucabadiali-ml-ops-project.hf.space:443"
|
| 12 |
+
# Se la Space è privata, aggiungi:
|
| 13 |
# authorization:
|
| 14 |
# type: Bearer
|
| 15 |
# credentials: "hf_XXXXXXXXXXXX" # token read-only
|
src/app/__pycache__/config.cpython-311.pyc
CHANGED
|
Binary files a/src/app/__pycache__/config.cpython-311.pyc and b/src/app/__pycache__/config.cpython-311.pyc differ
|
|
|
src/app/app.py
CHANGED
|
@@ -1,84 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
from fastapi import FastAPI, HTTPException
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
from transformers import AutoTokenizer, AutoModelForSequenceClassification
|
| 3 |
-
from .utils import preprocess, load_model_and_tokenizer
|
| 4 |
from scipy.special import softmax
|
| 5 |
import numpy as np
|
| 6 |
-
from pydantic import BaseModel
|
| 7 |
import urllib.request
|
| 8 |
import csv
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
from .config import MODEL_SOURCE, ModelSource
|
| 13 |
-
from
|
| 14 |
|
| 15 |
##################
|
|
|
|
|
|
|
| 16 |
from prometheus_client import Counter, Gauge
|
| 17 |
from apscheduler.schedulers.background import BackgroundScheduler
|
| 18 |
-
from datetime import datetime
|
| 19 |
-
import
|
| 20 |
-
import random
|
| 21 |
-
import pandas as pd
|
| 22 |
#################
|
| 23 |
|
| 24 |
|
| 25 |
-
#############
|
| 26 |
-
from .config import EVAL_BATCH_SIZE, N_SAMPLES, DATASET_PATH, EVAL_PERIOD_MIN
|
| 27 |
-
from .utils import load_dataset
|
| 28 |
-
###########
|
| 29 |
|
|
|
|
|
|
|
| 30 |
app = FastAPI()
|
| 31 |
Instrumentator().instrument(app).expose(app, endpoint="/metrics", include_in_schema=False)
|
| 32 |
|
| 33 |
-
###################
|
| 34 |
-
# ---------- Metrics (custom) ----------
|
| 35 |
-
# Production predictions distribution (unlabeled)
|
| 36 |
-
# PRED_COUNTER = Counter(
|
| 37 |
-
# "sentiment_requests_total",
|
| 38 |
-
# "Total predictions served by label",
|
| 39 |
-
# ["label"]
|
| 40 |
-
# )
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
# EVAL_SAMPLE_SIZE = Gauge(
|
| 45 |
-
# "model_evaluation_sample_size",
|
| 46 |
-
# "Number of samples used in the latest periodic evaluation"
|
| 47 |
-
# )
|
| 48 |
-
# EVAL_COUNTER_DIST = Counter(
|
| 49 |
-
# "sentiment_test_distribution_total",
|
| 50 |
-
# "Cumulative predicted label counts on evaluation samples",
|
| 51 |
-
# ["label"]
|
| 52 |
-
# )
|
| 53 |
-
# EVAL_RUNS = Counter(
|
| 54 |
-
# "model_evaluations_total",
|
| 55 |
-
# "Total number of evaluation runs completed"
|
| 56 |
-
# )
|
| 57 |
-
##################
|
| 58 |
-
|
| 59 |
-
|
| 60 |
|
| 61 |
|
| 62 |
|
| 63 |
|
|
|
|
|
|
|
| 64 |
class SentimentQuery(BaseModel):
|
| 65 |
input_texts: Union[str, List[str]]
|
| 66 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 67 |
mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/sentiment/mapping.txt"
|
| 68 |
with urllib.request.urlopen(mapping_link) as f:
|
| 69 |
html = f.read().decode('utf-8').split("\n")
|
| 70 |
csvreader = csv.reader(html, delimiter='\t')
|
| 71 |
labels = [row[1] for row in csvreader if len(row) > 1]
|
|
|
|
| 72 |
|
| 73 |
-
tokenizer, model = load_model_and_tokenizer(MODEL_SOURCE)
|
| 74 |
-
model.eval()
|
| 75 |
|
| 76 |
@app.get("/")
|
| 77 |
def read_root():
|
| 78 |
return {"status": "ok", "message": "Sentiment API is running"}
|
| 79 |
|
| 80 |
@app.post("/predict")
|
| 81 |
-
async def analyze_text(query:SentimentQuery):
|
|
|
|
|
|
|
|
|
|
|
|
|
| 82 |
|
| 83 |
if isinstance(query.input_texts, str):
|
| 84 |
input_texts = [query.input_texts]
|
|
@@ -102,8 +95,6 @@ async def analyze_text(query:SentimentQuery):
|
|
| 102 |
for i,text in enumerate(input_texts):
|
| 103 |
|
| 104 |
predicted = labels[pred_labels[i]]
|
| 105 |
-
#PRED_COUNTER.labels(label=predicted).inc()
|
| 106 |
-
|
| 107 |
|
| 108 |
response_body.append(
|
| 109 |
{
|
|
@@ -124,16 +115,25 @@ async def analyze_text(query:SentimentQuery):
|
|
| 124 |
|
| 125 |
|
| 126 |
|
| 127 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 128 |
dataset = load_dataset(DATASET_PATH).shuffle()["test"][:N_SAMPLES]
|
| 129 |
-
N_BATCHES = len(dataset["text"])//
|
| 130 |
|
| 131 |
accuracy = 0
|
| 132 |
for i in range(N_BATCHES+1):
|
| 133 |
if i == N_BATCHES :
|
| 134 |
-
samples, labels = dataset["text"][i*
|
| 135 |
else:
|
| 136 |
-
samples, labels = dataset["text"][i*
|
| 137 |
|
| 138 |
model.eval()
|
| 139 |
encoded_batch = tokenizer(
|
|
@@ -150,37 +150,88 @@ def evaluate_accuracy():
|
|
| 150 |
scores = softmax(logits, axis=-1)
|
| 151 |
pred_labels = scores.argmax(axis=-1)
|
| 152 |
accuracy += sum(pred_labels==labels)
|
|
|
|
| 153 |
accuracy/=N_SAMPLES
|
| 154 |
return accuracy
|
| 155 |
|
| 156 |
|
| 157 |
-
#
|
| 158 |
-
|
| 159 |
-
"
|
| 160 |
-
"
|
|
|
|
| 161 |
)
|
| 162 |
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 166 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 167 |
_model_lock = threading.Lock()
|
| 168 |
|
| 169 |
-
|
| 170 |
-
|
|
|
|
|
|
|
|
|
|
| 171 |
with _model_lock:
|
| 172 |
-
acc = evaluate_accuracy()
|
| 173 |
EVAL_ACCURACY.set(acc)
|
| 174 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 175 |
|
| 176 |
-
scheduler = BackgroundScheduler(daemon=True)
|
| 177 |
|
| 178 |
@app.on_event("startup")
|
| 179 |
def _start_scheduler():
|
|
|
|
| 180 |
# run once soon after startup
|
| 181 |
-
scheduler.add_job(
|
| 182 |
# then every EVAL_PERIOD_MIN minutes
|
| 183 |
-
scheduler.add_job(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 184 |
scheduler.start()
|
| 185 |
|
| 186 |
@app.on_event("shutdown")
|
|
|
|
| 1 |
+
###### IMPORTS
|
| 2 |
+
|
| 3 |
+
########
|
| 4 |
+
# Imports for app creation and request handling
|
| 5 |
from fastapi import FastAPI, HTTPException
|
| 6 |
+
from pydantic import BaseModel
|
| 7 |
+
import requests
|
| 8 |
+
from typing import Union, List
|
| 9 |
+
|
| 10 |
+
##########
|
| 11 |
+
# Imports for model creation/usage
|
| 12 |
+
import torch
|
| 13 |
from transformers import AutoTokenizer, AutoModelForSequenceClassification
|
|
|
|
| 14 |
from scipy.special import softmax
|
| 15 |
import numpy as np
|
|
|
|
| 16 |
import urllib.request
|
| 17 |
import csv
|
| 18 |
+
|
| 19 |
+
# #################
|
| 20 |
+
# LOCAL IMPORTS
|
| 21 |
+
from .config import MODEL_SOURCE, ModelSource, EVAL_BATCH_SIZE, EVAL_SAMPLE_SIZE, DATASET_PATH, EVAL_PERIOD_MIN
|
| 22 |
+
from .utils import preprocess, load_model_and_tokenizer, load_dataset
|
| 23 |
|
| 24 |
##################
|
| 25 |
+
# Imports for app monitoring
|
| 26 |
+
from prometheus_fastapi_instrumentator import Instrumentator
|
| 27 |
from prometheus_client import Counter, Gauge
|
| 28 |
from apscheduler.schedulers.background import BackgroundScheduler
|
| 29 |
+
from datetime import datetime, timedelta
|
| 30 |
+
import threading
|
|
|
|
|
|
|
| 31 |
#################
|
| 32 |
|
| 33 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
|
| 35 |
+
#################
|
| 36 |
+
# App creation and metrics exposition
|
| 37 |
app = FastAPI()
|
| 38 |
Instrumentator().instrument(app).expose(app, endpoint="/metrics", include_in_schema=False)
|
| 39 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
|
| 41 |
|
| 42 |
|
| 43 |
|
| 44 |
+
#################
|
| 45 |
+
# class for transferring post request data
|
| 46 |
class SentimentQuery(BaseModel):
|
| 47 |
input_texts: Union[str, List[str]]
|
| 48 |
|
| 49 |
+
|
| 50 |
+
#################
|
| 51 |
+
# Retrieve model either locally or via download
|
| 52 |
+
tokenizer, model = load_model_and_tokenizer(MODEL_SOURCE)
|
| 53 |
+
model.eval()
|
| 54 |
+
|
| 55 |
+
##############
|
| 56 |
+
# retrieve the index-to-label-name mapping from the tweeteval dataset repo
|
| 57 |
mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/sentiment/mapping.txt"
|
| 58 |
with urllib.request.urlopen(mapping_link) as f:
|
| 59 |
html = f.read().decode('utf-8').split("\n")
|
| 60 |
csvreader = csv.reader(html, delimiter='\t')
|
| 61 |
labels = [row[1] for row in csvreader if len(row) > 1]
|
| 62 |
+
#############
|
| 63 |
|
|
|
|
|
|
|
| 64 |
|
| 65 |
@app.get("/")
|
| 66 |
def read_root():
|
| 67 |
return {"status": "ok", "message": "Sentiment API is running"}
|
| 68 |
|
| 69 |
@app.post("/predict")
|
| 70 |
+
async def analyze_text(query:SentimentQuery)->dict:
|
| 71 |
+
"""
|
| 72 |
+
Processes an input query containing one or more text messages and returns a response
|
| 73 |
+
containing the prediction and the sentiment score for each message
|
| 74 |
+
"""
|
| 75 |
|
| 76 |
if isinstance(query.input_texts, str):
|
| 77 |
input_texts = [query.input_texts]
|
|
|
|
| 95 |
for i,text in enumerate(input_texts):
|
| 96 |
|
| 97 |
predicted = labels[pred_labels[i]]
|
|
|
|
|
|
|
| 98 |
|
| 99 |
response_body.append(
|
| 100 |
{
|
|
|
|
| 115 |
|
| 116 |
|
| 117 |
|
| 118 |
+
# Evaluation metrics on labeled test set
|
| 119 |
+
EVAL_ACCURACY = Gauge(
|
| 120 |
+
"model_evaluation_accuracy",
|
| 121 |
+
"Accuracy on latest periodic evaluation of labeled test subset"
|
| 122 |
+
)
|
| 123 |
+
|
| 124 |
+
def evaluate_accuracy(N_SAMPLES:int, BATCH_SIZE:int)->float:
|
| 125 |
+
"""
|
| 126 |
+
Evaluates and returns the model accuracy on a random subset of the test dataset
|
| 127 |
+
"""
|
| 128 |
dataset = load_dataset(DATASET_PATH).shuffle()["test"][:N_SAMPLES]
|
| 129 |
+
N_BATCHES = len(dataset["text"])//BATCH_SIZE
|
| 130 |
|
| 131 |
accuracy = 0
|
| 132 |
for i in range(N_BATCHES+1):
|
| 133 |
if i == N_BATCHES :
|
| 134 |
+
samples, labels = dataset["text"][i*BATCH_SIZE:], dataset["label"][i*BATCH_SIZE:]
|
| 135 |
else:
|
| 136 |
+
samples, labels = dataset["text"][i*BATCH_SIZE:(i+1)*BATCH_SIZE], dataset["label"][i*BATCH_SIZE:(i+1)*BATCH_SIZE]
|
| 137 |
|
| 138 |
model.eval()
|
| 139 |
encoded_batch = tokenizer(
|
|
|
|
| 150 |
scores = softmax(logits, axis=-1)
|
| 151 |
pred_labels = scores.argmax(axis=-1)
|
| 152 |
accuracy += sum(pred_labels==labels)
|
| 153 |
+
|
| 154 |
accuracy/=N_SAMPLES
|
| 155 |
return accuracy
|
| 156 |
|
| 157 |
|
| 158 |
+
# Sentiment Distribution over unlabelled set
|
| 159 |
+
SENTIMENT_BATCH_FRACTION = Gauge(
|
| 160 |
+
"sentiment_batch_fraction",
|
| 161 |
+
"Fraction of predictions in the latest monitored batch, by label (0..1).",
|
| 162 |
+
["label"]
|
| 163 |
)
|
| 164 |
|
| 165 |
+
def evaluate_sentiment_distribution(N_SAMPLES:int, BATCH_SIZE:int)->np.ndarray:
|
| 166 |
+
"""
|
| 167 |
+
Evaluates and returns the sentiment distribution over a random subset of the test dataset
|
| 168 |
+
"""
|
| 169 |
+
dataset = load_dataset(DATASET_PATH).shuffle()["test"][:N_SAMPLES]
|
| 170 |
+
N_BATCHES = len(dataset["text"])//BATCH_SIZE
|
| 171 |
+
|
| 172 |
+
model.eval()
|
| 173 |
+
|
| 174 |
+
counts = np.array([0.,0.,0.])
|
| 175 |
+
for i in range(N_BATCHES+1):
|
| 176 |
+
if i == N_BATCHES :
|
| 177 |
+
samples = dataset["text"][i*BATCH_SIZE:]
|
| 178 |
+
else:
|
| 179 |
+
samples = dataset["text"][i*BATCH_SIZE:(i+1)*BATCH_SIZE]
|
| 180 |
+
|
| 181 |
+
encoded_batch = tokenizer(
|
| 182 |
+
[preprocess(t) for t in samples],
|
| 183 |
+
padding=True, # pad to same length
|
| 184 |
+
truncation=True, # truncate long texts
|
| 185 |
+
return_tensors="pt",
|
| 186 |
+
)
|
| 187 |
|
| 188 |
+
with torch.no_grad():
|
| 189 |
+
output = model(**encoded_batch)
|
| 190 |
+
|
| 191 |
+
logits = output[0].detach().cpu().numpy()
|
| 192 |
+
scores = softmax(logits, axis=-1)
|
| 193 |
+
pred_labels = scores.argmax(axis=-1)
|
| 194 |
+
counts += np.unique(pred_labels, return_counts=True)[1]
|
| 195 |
+
|
| 196 |
+
fractions=counts/N_SAMPLES
|
| 197 |
+
return fractions
|
| 198 |
+
|
| 199 |
+
|
| 200 |
+
##################
|
| 201 |
+
# scheduler creation for managing the metric creation jobs
|
| 202 |
+
scheduler = BackgroundScheduler(daemon=True)
|
| 203 |
+
# threading lock to handle possible concurrent use of the model
|
| 204 |
_model_lock = threading.Lock()
|
| 205 |
|
| 206 |
+
|
| 207 |
+
############
|
| 208 |
+
# jobs to be launched periodically
|
| 209 |
+
|
| 210 |
+
def _run_eval_and_send_data():
|
| 211 |
with _model_lock:
|
| 212 |
+
acc = evaluate_accuracy(EVAL_SAMPLE_SIZE, EVAL_BATCH_SIZE)
|
| 213 |
EVAL_ACCURACY.set(acc)
|
| 214 |
|
| 215 |
+
def _run_sentiment_distr_and_send_data():
|
| 216 |
+
with _model_lock:
|
| 217 |
+
fractions = evaluate_sentiment_distribution(EVAL_SAMPLE_SIZE, EVAL_BATCH_SIZE)
|
| 218 |
+
for i, label in enumerate(labels):
|
| 219 |
+
SENTIMENT_BATCH_FRACTION.labels(label=label).set(fractions[i])
|
| 220 |
|
|
|
|
| 221 |
|
| 222 |
@app.on_event("startup")
|
| 223 |
def _start_scheduler():
|
| 224 |
+
|
| 225 |
# run once soon after startup
|
| 226 |
+
scheduler.add_job(_run_eval_and_send_data, next_run_time=datetime.now() + timedelta(seconds=2))
|
| 227 |
# then every EVAL_PERIOD_MIN minutes
|
| 228 |
+
scheduler.add_job(_run_eval_and_send_data, "interval", minutes=EVAL_PERIOD_MIN)
|
| 229 |
+
|
| 230 |
+
# run once soon after startup
|
| 231 |
+
scheduler.add_job(_run_sentiment_distr_and_send_data, next_run_time=datetime.now() + timedelta(seconds=2))
|
| 232 |
+
# then every EVAL_PERIOD_MIN minutes
|
| 233 |
+
scheduler.add_job(_run_sentiment_distr_and_send_data, "interval", minutes=EVAL_PERIOD_MIN)
|
| 234 |
+
|
| 235 |
scheduler.start()
|
| 236 |
|
| 237 |
@app.on_event("shutdown")
|
src/app/config.py
CHANGED
|
@@ -2,7 +2,6 @@ import os
|
|
| 2 |
from enum import Enum
|
| 3 |
from pathlib import Path
|
| 4 |
|
| 5 |
-
|
| 6 |
class ModelSource(str, Enum):
|
| 7 |
HF = "hf"
|
| 8 |
LOCAL = "local"
|
|
@@ -10,24 +9,9 @@ class ModelSource(str, Enum):
|
|
| 10 |
MODEL_SOURCE = ModelSource(os.getenv("MODEL_SOURCE", "hf"))
|
| 11 |
HF_MODEL = f"cardiffnlp/twitter-roberta-base-sentiment-latest"
|
| 12 |
DATASET_PATH = Path("data/dataset")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
|
| 14 |
-
|
| 15 |
-
EVAL_SAMPLE_SIZE = int(os.getenv("EVAL_SAMPLE_SIZE", "80"))
|
| 16 |
-
EVAL_INTERVAL_HOURS = float(os.getenv("EVAL_INTERVAL_HOURS", "1"))
|
| 17 |
-
RANDOM_SEED = int(os.getenv("RANDOM_SEED", "42"))
|
| 18 |
-
|
| 19 |
-
EVAL_BATCH_SIZE = 64
|
| 20 |
-
N_SAMPLES = 500
|
| 21 |
-
EVAL_PERIOD_MIN = 1
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
# def load_model_and_tokenizer(MODEL_SOURCE):
|
| 25 |
-
# if MODEL_SOURCE == ModelSource.HF: # use the latest model available in the HF hub
|
| 26 |
-
# tokenizer = AutoTokenizer.from_pretrained(HF_MODEL)
|
| 27 |
-
# model = AutoModelForSequenceClassification.from_pretrained(HF_MODEL)
|
| 28 |
-
# else: # use a locally fine tuned model
|
| 29 |
-
# local_model_path = Path("models/saved_model")
|
| 30 |
-
# assert local_model_path.exists(), """No local model was found. Run 'python3 src/train_model.py' first"""
|
| 31 |
-
# tokenizer = AutoTokenizer.from_pretrained("models/saved_tokenizer")
|
| 32 |
-
# model = AutoModelForSequenceClassification.from_pretrained("models/saved_model")
|
| 33 |
-
# return tokenizer, model
|
|
|
|
| 2 |
from enum import Enum
|
| 3 |
from pathlib import Path
|
| 4 |
|
|
|
|
| 5 |
class ModelSource(str, Enum):
|
| 6 |
HF = "hf"
|
| 7 |
LOCAL = "local"
|
|
|
|
| 9 |
MODEL_SOURCE = ModelSource(os.getenv("MODEL_SOURCE", "hf"))
|
| 10 |
HF_MODEL = f"cardiffnlp/twitter-roberta-base-sentiment-latest"
|
| 11 |
DATASET_PATH = Path("data/dataset")
|
| 12 |
+
EVAL_SAMPLE_SIZE = int(os.getenv("EVAL_SAMPLE_SIZE", "100"))
|
| 13 |
+
EVAL_PERIOD_MIN = float(os.getenv("EVAL_PERIOD_MIN", "30"))
|
| 14 |
+
EVAL_BATCH_SIZE = int(os.getenv("EVAL_BATCH_SIZE", "64"))
|
| 15 |
+
TRAIN_FRACTION_SIZE = float(os.getenv("TRAIN_FRACTION_SIZE", "0.2"))
|
| 16 |
+
EVAL_FRACTION_SIZE = float(os.getenv("EVAL_FRACTION_SIZE", "0.4"))
|
| 17 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/app/utils.py
CHANGED
|
@@ -5,7 +5,10 @@ from datasets import load_dataset as hf_load_dataset
|
|
| 5 |
from datasets import load_from_disk
|
| 6 |
|
| 7 |
|
| 8 |
-
def preprocess(text):
|
|
|
|
|
|
|
|
|
|
| 9 |
new_text = []
|
| 10 |
for t in text.split(" "):
|
| 11 |
t = '@user' if t.startswith('@') and len(t) > 1 else t
|
|
@@ -14,8 +17,11 @@ def preprocess(text):
|
|
| 14 |
return " ".join(new_text)
|
| 15 |
|
| 16 |
|
| 17 |
-
|
| 18 |
-
|
|
|
|
|
|
|
|
|
|
| 19 |
if MODEL_SOURCE == ModelSource.HF: # use the latest model available in the HF hub
|
| 20 |
tokenizer = AutoTokenizer.from_pretrained(HF_MODEL)
|
| 21 |
model = AutoModelForSequenceClassification.from_pretrained(HF_MODEL)
|
|
@@ -27,7 +33,11 @@ def load_model_and_tokenizer(MODEL_SOURCE):
|
|
| 27 |
return tokenizer, model
|
| 28 |
|
| 29 |
|
| 30 |
-
def load_dataset(dataset_path):
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
if dataset_path.exists():
|
| 32 |
dataset = load_from_disk(dataset_path)
|
| 33 |
else:
|
|
|
|
| 5 |
from datasets import load_from_disk
|
| 6 |
|
| 7 |
|
| 8 |
+
def preprocess(text:str)->str:
|
| 9 |
+
"""
|
| 10 |
+
Returns an input text ready to be tokenized by removing special characters
|
| 11 |
+
"""
|
| 12 |
new_text = []
|
| 13 |
for t in text.split(" "):
|
| 14 |
t = '@user' if t.startswith('@') and len(t) > 1 else t
|
|
|
|
| 17 |
return " ".join(new_text)
|
| 18 |
|
| 19 |
|
| 20 |
+
def load_model_and_tokenizer(MODEL_SOURCE:str)->(AutoTokenizer,AutoModelForSequenceClassification):
|
| 21 |
+
"""
|
| 22 |
+
Loads a tokenizer and sentiment analysis model. These can be either loaded from local
|
| 23 |
+
or downloaded from Hugging Face API
|
| 24 |
+
"""
|
| 25 |
if MODEL_SOURCE == ModelSource.HF: # use the latest model available in the HF hub
|
| 26 |
tokenizer = AutoTokenizer.from_pretrained(HF_MODEL)
|
| 27 |
model = AutoModelForSequenceClassification.from_pretrained(HF_MODEL)
|
|
|
|
| 33 |
return tokenizer, model
|
| 34 |
|
| 35 |
|
| 36 |
+
def load_dataset(dataset_path:str):
|
| 37 |
+
"""
|
| 38 |
+
Loads the tweet_eval dataset for sentiment analysis task. The dataset
|
| 39 |
+
can be either loaded from local or downloaded through Hugging Face API
|
| 40 |
+
"""
|
| 41 |
if dataset_path.exists():
|
| 42 |
dataset = load_from_disk(dataset_path)
|
| 43 |
else:
|
src/nb.ipynb
DELETED
|
@@ -1,208 +0,0 @@
|
|
| 1 |
-
{
|
| 2 |
-
"cells": [
|
| 3 |
-
{
|
| 4 |
-
"cell_type": "code",
|
| 5 |
-
"execution_count": 48,
|
| 6 |
-
"id": "7aaceacb",
|
| 7 |
-
"metadata": {},
|
| 8 |
-
"outputs": [],
|
| 9 |
-
"source": [
|
| 10 |
-
"from pathlib import Path\n",
|
| 11 |
-
"from app.config import DATASET_PATH, MODEL_SOURCE\n",
|
| 12 |
-
"from app.utils import load_dataset, load_model_and_tokenizer, preprocess\n",
|
| 13 |
-
"from scipy.special import softmax\n"
|
| 14 |
-
]
|
| 15 |
-
},
|
| 16 |
-
{
|
| 17 |
-
"cell_type": "code",
|
| 18 |
-
"execution_count": 49,
|
| 19 |
-
"id": "7defab3e",
|
| 20 |
-
"metadata": {},
|
| 21 |
-
"outputs": [
|
| 22 |
-
{
|
| 23 |
-
"name": "stderr",
|
| 24 |
-
"output_type": "stream",
|
| 25 |
-
"text": [
|
| 26 |
-
"Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']\n",
|
| 27 |
-
"- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
|
| 28 |
-
"- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n"
|
| 29 |
-
]
|
| 30 |
-
}
|
| 31 |
-
],
|
| 32 |
-
"source": [
|
| 33 |
-
"tokenizer, model = load_model_and_tokenizer(MODEL_SOURCE)\n"
|
| 34 |
-
]
|
| 35 |
-
},
|
| 36 |
-
{
|
| 37 |
-
"cell_type": "code",
|
| 38 |
-
"execution_count": 24,
|
| 39 |
-
"id": "0a1dcfdd",
|
| 40 |
-
"metadata": {},
|
| 41 |
-
"outputs": [],
|
| 42 |
-
"source": [
|
| 43 |
-
"dataset = load_dataset(DATASET_PATH).shuffle()[\"test\"][:N_SAMPLES]\n"
|
| 44 |
-
]
|
| 45 |
-
},
|
| 46 |
-
{
|
| 47 |
-
"cell_type": "code",
|
| 48 |
-
"execution_count": 33,
|
| 49 |
-
"id": "501e6728",
|
| 50 |
-
"metadata": {},
|
| 51 |
-
"outputs": [],
|
| 52 |
-
"source": [
|
| 53 |
-
"import torch"
|
| 54 |
-
]
|
| 55 |
-
},
|
| 56 |
-
{
|
| 57 |
-
"cell_type": "code",
|
| 58 |
-
"execution_count": 47,
|
| 59 |
-
"id": "82b25de1",
|
| 60 |
-
"metadata": {},
|
| 61 |
-
"outputs": [
|
| 62 |
-
{
|
| 63 |
-
"data": {
|
| 64 |
-
"text/plain": [
|
| 65 |
-
"2"
|
| 66 |
-
]
|
| 67 |
-
},
|
| 68 |
-
"execution_count": 47,
|
| 69 |
-
"metadata": {},
|
| 70 |
-
"output_type": "execute_result"
|
| 71 |
-
}
|
| 72 |
-
],
|
| 73 |
-
"source": [
|
| 74 |
-
"N_BEVAL_BATCH_SIZE = 64\n",
|
| 75 |
-
"N_SAMPLES = 500\n",
|
| 76 |
-
"N_BATCHES = len(dataset[\"text\"])//EVAL_BATCH_SIZE\n",
|
| 77 |
-
"N_BATCHES"
|
| 78 |
-
]
|
| 79 |
-
},
|
| 80 |
-
{
|
| 81 |
-
"cell_type": "code",
|
| 82 |
-
"execution_count": 54,
|
| 83 |
-
"id": "7dd5371b",
|
| 84 |
-
"metadata": {},
|
| 85 |
-
"outputs": [
|
| 86 |
-
{
|
| 87 |
-
"name": "stdout",
|
| 88 |
-
"output_type": "stream",
|
| 89 |
-
"text": [
|
| 90 |
-
"0 64\n",
|
| 91 |
-
"64 128\n",
|
| 92 |
-
"128 192\n",
|
| 93 |
-
"192 256\n",
|
| 94 |
-
"256 320\n",
|
| 95 |
-
"320 384\n",
|
| 96 |
-
"384 448\n",
|
| 97 |
-
"448 500\n"
|
| 98 |
-
]
|
| 99 |
-
},
|
| 100 |
-
{
|
| 101 |
-
"data": {
|
| 102 |
-
"text/plain": [
|
| 103 |
-
"np.float64(0.71)"
|
| 104 |
-
]
|
| 105 |
-
},
|
| 106 |
-
"execution_count": 54,
|
| 107 |
-
"metadata": {},
|
| 108 |
-
"output_type": "execute_result"
|
| 109 |
-
}
|
| 110 |
-
],
|
| 111 |
-
"source": [
|
| 112 |
-
"EVAL_BATCH_SIZE = 64\n",
|
| 113 |
-
"N_SAMPLES = 500\n",
|
| 114 |
-
"def evaluate_accuracy():\n",
|
| 115 |
-
"\n",
|
| 116 |
-
" dataset = load_dataset(DATASET_PATH).shuffle()[\"test\"][:N_SAMPLES]\n",
|
| 117 |
-
" N_BATCHES = len(dataset[\"text\"])//EVAL_BATCH_SIZE\n",
|
| 118 |
-
"\n",
|
| 119 |
-
" accuracy = 0\n",
|
| 120 |
-
" for i in range(N_BATCHES+1):\n",
|
| 121 |
-
"\n",
|
| 122 |
-
" start = i*EVAL_BATCH_SIZE\n",
|
| 123 |
-
" end = min(N_SAMPLES, (i+1)*EVAL_BATCH_SIZE)\n",
|
| 124 |
-
" print(start, end)\n",
|
| 125 |
-
" samples, labels = dataset[\"text\"][start:end], dataset[\"label\"][start:end]\n",
|
| 126 |
-
" \n",
|
| 127 |
-
" model.eval()\n",
|
| 128 |
-
" encoded_batch = tokenizer(\n",
|
| 129 |
-
" [preprocess(t) for t in samples],\n",
|
| 130 |
-
" padding=True, # pad to same length\n",
|
| 131 |
-
" truncation=True, # truncate long texts\n",
|
| 132 |
-
" return_tensors=\"pt\",\n",
|
| 133 |
-
" )\n",
|
| 134 |
-
"\n",
|
| 135 |
-
" with torch.no_grad():\n",
|
| 136 |
-
" output = model(**encoded_batch)\n",
|
| 137 |
-
" \n",
|
| 138 |
-
" logits = output[0].detach().cpu().numpy()\n",
|
| 139 |
-
" scores = softmax(logits, axis=-1)\n",
|
| 140 |
-
" pred_labels = scores.argmax(axis=-1)\n",
|
| 141 |
-
" accuracy += sum(pred_labels==labels)\n",
|
| 142 |
-
" accuracy/=N_SAMPLES\n",
|
| 143 |
-
" return accuracy\n",
|
| 144 |
-
"evaluate_accuracy()"
|
| 145 |
-
]
|
| 146 |
-
},
|
| 147 |
-
{
|
| 148 |
-
"cell_type": "code",
|
| 149 |
-
"execution_count": 1,
|
| 150 |
-
"id": "dbd3bb8c",
|
| 151 |
-
"metadata": {},
|
| 152 |
-
"outputs": [],
|
| 153 |
-
"source": [
|
| 154 |
-
"def _load_test_data():\n",
|
| 155 |
-
" \"\"\"\n",
|
| 156 |
-
" Expects CSV with columns: text,label\n",
|
| 157 |
-
" label values must be one of labels (negative, neutral, positive) or their indices (0,1,2).\n",
|
| 158 |
-
" \"\"\"\n",
|
| 159 |
-
" df = pd.read_csv(TEST_DATA_PATH)\n",
|
| 160 |
-
" # normalize label column to strings matching our 'labels' list\n",
|
| 161 |
-
" if np.issubdtype(df[\"label\"].dtype, np.number):\n",
|
| 162 |
-
" df[\"label\"] = df[\"label\"].astype(int).map(lambda i: labels[i])\n",
|
| 163 |
-
" else:\n",
|
| 164 |
-
" df[\"label\"] = df[\"label\"].str.lower().str.strip()\n",
|
| 165 |
-
" # keep only supported labels\n",
|
| 166 |
-
" df = df[df[\"label\"].isin(labels)].dropna(subset=[\"text\", \"label\"])\n",
|
| 167 |
-
" return df"
|
| 168 |
-
]
|
| 169 |
-
},
|
| 170 |
-
{
|
| 171 |
-
"cell_type": "code",
|
| 172 |
-
"execution_count": null,
|
| 173 |
-
"id": "ec0b086e",
|
| 174 |
-
"metadata": {},
|
| 175 |
-
"outputs": [],
|
| 176 |
-
"source": []
|
| 177 |
-
},
|
| 178 |
-
{
|
| 179 |
-
"cell_type": "code",
|
| 180 |
-
"execution_count": null,
|
| 181 |
-
"id": "800c8018",
|
| 182 |
-
"metadata": {},
|
| 183 |
-
"outputs": [],
|
| 184 |
-
"source": []
|
| 185 |
-
}
|
| 186 |
-
],
|
| 187 |
-
"metadata": {
|
| 188 |
-
"kernelspec": {
|
| 189 |
-
"display_name": "ProjectEnv",
|
| 190 |
-
"language": "python",
|
| 191 |
-
"name": "python3"
|
| 192 |
-
},
|
| 193 |
-
"language_info": {
|
| 194 |
-
"codemirror_mode": {
|
| 195 |
-
"name": "ipython",
|
| 196 |
-
"version": 3
|
| 197 |
-
},
|
| 198 |
-
"file_extension": ".py",
|
| 199 |
-
"mimetype": "text/x-python",
|
| 200 |
-
"name": "python",
|
| 201 |
-
"nbconvert_exporter": "python",
|
| 202 |
-
"pygments_lexer": "ipython3",
|
| 203 |
-
"version": "3.11.10"
|
| 204 |
-
}
|
| 205 |
-
},
|
| 206 |
-
"nbformat": 4,
|
| 207 |
-
"nbformat_minor": 5
|
| 208 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/train_model.py
CHANGED
|
@@ -9,7 +9,13 @@ from transformers import (
|
|
| 9 |
DataCollatorWithPadding
|
| 10 |
)
|
| 11 |
from pathlib import Path
|
| 12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
|
| 14 |
|
| 15 |
# --- Device detection ---
|
|
@@ -77,18 +83,11 @@ model.config.use_cache = False
|
|
| 77 |
dataset = load_dataset(DATASET_PATH)
|
| 78 |
|
| 79 |
|
| 80 |
-
# ---- COPY-PASTE FROM HERE ----
|
| 81 |
-
import os
|
| 82 |
-
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
| 83 |
-
|
| 84 |
-
from datasets import DatasetDict
|
| 85 |
-
from transformers import AutoTokenizer, DataCollatorWithPadding
|
| 86 |
-
|
| 87 |
def make_trainer_ready(
|
| 88 |
raw_ds: DatasetDict,
|
| 89 |
model_name: str = "cardiffnlp/twitter-roberta-base-sep2022",
|
| 90 |
train_frac: float = 0.2,
|
| 91 |
-
val_frac: float = 0.
|
| 92 |
seed: int = 42,
|
| 93 |
label_col: str = "label",
|
| 94 |
text_col: str = "text",
|
|
@@ -156,8 +155,8 @@ def make_trainer_ready(
|
|
| 156 |
train_ds, eval_ds, data_collator, tokenizer = make_trainer_ready(
|
| 157 |
raw_ds=dataset,
|
| 158 |
model_name="cardiffnlp/twitter-roberta-base-sep2022",
|
| 159 |
-
train_frac=
|
| 160 |
-
val_frac=
|
| 161 |
seed=42,
|
| 162 |
label_col="label",
|
| 163 |
text_col="text",
|
|
|
|
| 9 |
DataCollatorWithPadding
|
| 10 |
)
|
| 11 |
from pathlib import Path
|
| 12 |
+
import os
|
| 13 |
+
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
| 14 |
+
from datasets import DatasetDict
|
| 15 |
+
from transformers import AutoTokenizer, DataCollatorWithPadding
|
| 16 |
+
from app.config import DATASET_PATH, TRAIN_FRACTION_SIZE, EVAL_FRACTION_SIZE
|
| 17 |
+
|
| 18 |
+
|
| 19 |
|
| 20 |
|
| 21 |
# --- Device detection ---
|
|
|
|
| 83 |
dataset = load_dataset(DATASET_PATH)
|
| 84 |
|
| 85 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 86 |
def make_trainer_ready(
|
| 87 |
raw_ds: DatasetDict,
|
| 88 |
model_name: str = "cardiffnlp/twitter-roberta-base-sep2022",
|
| 89 |
train_frac: float = 0.2,
|
| 90 |
+
val_frac: float = 0.4,
|
| 91 |
seed: int = 42,
|
| 92 |
label_col: str = "label",
|
| 93 |
text_col: str = "text",
|
|
|
|
| 155 |
train_ds, eval_ds, data_collator, tokenizer = make_trainer_ready(
|
| 156 |
raw_ds=dataset,
|
| 157 |
model_name="cardiffnlp/twitter-roberta-base-sep2022",
|
| 158 |
+
train_frac=TRAIN_FRACTION_SIZE, # take 20% of train
|
| 159 |
+
val_frac=EVAL_FRACTION_SIZE, # take EVAL_FRACTION_SIZE of validation (default 0.4)
|
| 160 |
seed=42,
|
| 161 |
label_col="label",
|
| 162 |
text_col="text",
|
tests/test_data.py
DELETED
|
@@ -1,4 +0,0 @@
|
|
| 1 |
-
import pytest
|
| 2 |
-
|
| 3 |
-
from datasets import load_dataset
|
| 4 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|