lucabadiali committed on
Commit 85c00b7 · 1 Parent(s): 5396065

Added env config file

.gitignore CHANGED
@@ -5,4 +5,4 @@ data/__pycache__
  data/dataset
  app/__pycache__
  *.pyc
- .env
+ .env

env_config.sh ADDED
@@ -0,0 +1,6 @@
+ export MODEL_SOURCE="hf"
+ export EVAL_SAMPLE_SIZE=100
+ export EVAL_PERIOD_MIN=30
+ export EVAL_BATCH_SIZE=64
+ export TRAIN_FRACTION_SIZE=0.2
+ export EVAL_FRACTION_SIZE=0.4
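
For these settings to reach the app, the file has to be exported into the process environment (e.g. sourced in the shell that launches the server). A minimal sketch of the read-back side, mirroring the names and defaults that src/app/config.py uses further down in this commit:

    # Sketch: how the exported variables are read back in Python.
    # Assumes `source env_config.sh` was run in the launching shell.
    import os

    MODEL_SOURCE = os.getenv("MODEL_SOURCE", "hf")
    EVAL_SAMPLE_SIZE = int(os.getenv("EVAL_SAMPLE_SIZE", "100"))   # samples per evaluation run
    EVAL_PERIOD_MIN = float(os.getenv("EVAL_PERIOD_MIN", "30"))    # minutes between runs
    EVAL_BATCH_SIZE = int(os.getenv("EVAL_BATCH_SIZE", "64"))      # inference batch size
    print(MODEL_SOURCE, EVAL_SAMPLE_SIZE, EVAL_PERIOD_MIN, EVAL_BATCH_SIZE)
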
prometheus.yml CHANGED
@@ -1,5 +1,5 @@
  global:
-   scrape_interval: 30s # gentler on HF
+   scrape_interval: 60s # gentler on HF

  scrape_configs:
    - job_name: "fastapi_hf"
@@ -7,9 +7,9 @@ scrape_configs:
      metrics_path: /metrics
      static_configs:
        - targets:
-         #- "host.docker.internal:8000"
-         - "lucabadiali-ml-ops-project.hf.space:443"
-         # If the Space is private, add:
+         - "host.docker.internal:8000"
+         #- "lucabadiali-ml-ops-project.hf.space:443"
+         # If the Space is private, add:
          # authorization:
          #   type: Bearer
          #   credentials: "hf_XXXXXXXXXXXX" # read-only token
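
With the scrape target flipped to the local container, a quick smoke test of the endpoint Prometheus will poll can save a debugging round trip. A sketch, assuming the FastAPI app is reachable on localhost:8000 as the new target implies:

    # Sketch: confirm the app serves /metrics before pointing Prometheus at it.
    import requests

    resp = requests.get("http://localhost:8000/metrics", timeout=5)
    resp.raise_for_status()
    # model_evaluation_accuracy is the gauge registered in src/app/app.py
    assert "model_evaluation_accuracy" in resp.text
    print("metrics endpoint OK")
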
src/app/__pycache__/config.cpython-311.pyc CHANGED
Binary files a/src/app/__pycache__/config.cpython-311.pyc and b/src/app/__pycache__/config.cpython-311.pyc differ
 
src/app/app.py CHANGED
@@ -1,84 +1,77 @@
+ ###### IMPORTS
+
+ ########
+ # Imports for app and model creation
  from fastapi import FastAPI, HTTPException
+ from pydantic import BaseModel
+ import requests
+ from typing import Union, List
+
+ ##########
+ # Imports for model creation/usage
+ import torch
  from transformers import AutoTokenizer, AutoModelForSequenceClassification
- from .utils import preprocess, load_model_and_tokenizer
  from scipy.special import softmax
  import numpy as np
- from pydantic import BaseModel
  import urllib.request
  import csv
- import requests
- from typing import Union, List
- import torch
- from .config import MODEL_SOURCE, ModelSource
- from prometheus_fastapi_instrumentator import Instrumentator
+
+ #################
+ # LOCAL IMPORTS
+ from .config import MODEL_SOURCE, ModelSource, EVAL_BATCH_SIZE, EVAL_SAMPLE_SIZE, DATASET_PATH, EVAL_PERIOD_MIN
+ from .utils import preprocess, load_model_and_tokenizer, load_dataset

  ##################
+ # Imports for app monitoring
+ from prometheus_fastapi_instrumentator import Instrumentator
  from prometheus_client import Counter, Gauge
  from apscheduler.schedulers.background import BackgroundScheduler
- from datetime import datetime
- import os
- import random
- import pandas as pd
+ from datetime import datetime, timedelta
+ import threading
  #################


- #############
- from .config import EVAL_BATCH_SIZE, N_SAMPLES, DATASET_PATH, EVAL_PERIOD_MIN
- from .utils import load_dataset
- ###########

+ #################
+ # App creation and metrics exposition
  app = FastAPI()
  Instrumentator().instrument(app).expose(app, endpoint="/metrics", include_in_schema=False)

- ###################
- # ---------- Metrics (custom) ----------
- # Production predictions distribution (unlabeled)
- # PRED_COUNTER = Counter(
- #     "sentiment_requests_total",
- #     "Total predictions served by label",
- #     ["label"]
- # )
-
-
- # EVAL_SAMPLE_SIZE = Gauge(
- #     "model_evaluation_sample_size",
- #     "Number of samples used in the latest periodic evaluation"
- # )
- # EVAL_COUNTER_DIST = Counter(
- #     "sentiment_test_distribution_total",
- #     "Cumulative predicted label counts on evaluation samples",
- #     ["label"]
- # )
- # EVAL_RUNS = Counter(
- #     "model_evaluations_total",
- #     "Total number of evaluation runs completed"
- # )
- ##################
-

+ #################
+ # class for transferring post request data
  class SentimentQuery(BaseModel):
      input_texts: Union[str, List[str]]

+
+ #################
+ # Retrieve the model either locally or via download
+ tokenizer, model = load_model_and_tokenizer(MODEL_SOURCE)
+ model.eval()
+
+ ##############
+ # retrieve the label-to-int mapping from the model repo
  mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/sentiment/mapping.txt"
  with urllib.request.urlopen(mapping_link) as f:
      html = f.read().decode('utf-8').split("\n")
      csvreader = csv.reader(html, delimiter='\t')
  labels = [row[1] for row in csvreader if len(row) > 1]
+ #############

- tokenizer, model = load_model_and_tokenizer(MODEL_SOURCE)
- model.eval()

  @app.get("/")
  def read_root():
      return {"status": "ok", "message": "Sentiment API is running"}

  @app.post("/predict")
- async def analyze_text(query:SentimentQuery):
+ async def analyze_text(query: SentimentQuery) -> dict:
+     """
+     Processes an input query containing one or more text messages and returns a response
+     with the prediction and the sentiment score for each message.
+     """

      if isinstance(query.input_texts, str):
          input_texts = [query.input_texts]
@@ -102,8 +95,6 @@ async def analyze_text(query:SentimentQuery):
      for i,text in enumerate(input_texts):

          predicted = labels[pred_labels[i]]
-         #PRED_COUNTER.labels(label=predicted).inc()
-

          response_body.append(
              {
@@ -124,16 +115,25 @@ async def analyze_text(query:SentimentQuery):


- def evaluate_accuracy():
+ # Evaluation metrics on labeled test set
+ EVAL_ACCURACY = Gauge(
+     "model_evaluation_accuracy",
+     "Accuracy on latest periodic evaluation of labeled test subset"
+ )
+
+ def evaluate_accuracy(N_SAMPLES: int, BATCH_SIZE: int) -> float:
+     """
+     Evaluates and returns the model accuracy on a random subset of the test dataset.
+     """
      dataset = load_dataset(DATASET_PATH).shuffle()["test"][:N_SAMPLES]
-     N_BATCHES = len(dataset["text"])//EVAL_BATCH_SIZE
+     N_BATCHES = len(dataset["text"])//BATCH_SIZE

      accuracy = 0
      for i in range(N_BATCHES+1):
          if i == N_BATCHES :
-             samples, labels = dataset["text"][i*EVAL_BATCH_SIZE:], dataset["label"][i*EVAL_BATCH_SIZE:]
+             samples, labels = dataset["text"][i*BATCH_SIZE:], dataset["label"][i*BATCH_SIZE:]
          else:
-             samples, labels = dataset["text"][i*EVAL_BATCH_SIZE:(i+1)*EVAL_BATCH_SIZE], dataset["label"][i*EVAL_BATCH_SIZE:(i+1)*EVAL_BATCH_SIZE]
+             samples, labels = dataset["text"][i*BATCH_SIZE:(i+1)*BATCH_SIZE], dataset["label"][i*BATCH_SIZE:(i+1)*BATCH_SIZE]

          model.eval()
          encoded_batch = tokenizer(
@@ -150,37 +150,88 @@ def evaluate_accuracy():
          scores = softmax(logits, axis=-1)
          pred_labels = scores.argmax(axis=-1)
          accuracy += sum(pred_labels==labels)
+
      accuracy/=N_SAMPLES
      return accuracy


- # Evaluation metrics (labeled test set)
- EVAL_ACCURACY = Gauge(
-     "model_evaluation_accuracy",
-     "Accuracy on latest periodic evaluation of labeled test subset"
+ # Sentiment distribution over the unlabelled set
+ SENTIMENT_BATCH_FRACTION = Gauge(
+     "sentiment_batch_fraction",
+     "Fraction of predictions in the latest monitored batch, by label (0..1).",
+     ["label"]
  )

- from apscheduler.schedulers.background import BackgroundScheduler
- from datetime import datetime, timedelta
- import threading
+ def evaluate_sentiment_distribution(N_SAMPLES: int, BATCH_SIZE: int) -> np.ndarray:
+     """
+     Evaluates and returns the sentiment distribution over a random subset of the test dataset.
+     """
+     dataset = load_dataset(DATASET_PATH).shuffle()["test"][:N_SAMPLES]
+     N_BATCHES = len(dataset["text"])//BATCH_SIZE
+
+     model.eval()
+
+     counts = np.array([0., 0., 0.])
+     for i in range(N_BATCHES+1):
+         if i == N_BATCHES :
+             samples = dataset["text"][i*BATCH_SIZE:]
+         else:
+             samples = dataset["text"][i*BATCH_SIZE:(i+1)*BATCH_SIZE]
+
+         encoded_batch = tokenizer(
+             [preprocess(t) for t in samples],
+             padding=True,       # pad to same length
+             truncation=True,    # truncate long texts
+             return_tensors="pt",
+         )
+
+         with torch.no_grad():
+             output = model(**encoded_batch)
+
+         logits = output[0].detach().cpu().numpy()
+         scores = softmax(logits, axis=-1)
+         pred_labels = scores.argmax(axis=-1)
+         counts += np.unique(pred_labels, return_counts=True)[1]
+
+     fractions = counts/N_SAMPLES
+     return fractions
+
+
+ ##################
+ # scheduler creation for managing the metric-creation jobs
+ scheduler = BackgroundScheduler(daemon=True)
+ # threading lock to handle possible concurrent requests
  _model_lock = threading.Lock()

- def _run_eval_and_set_gauge():
-     # If you expect concurrent requests to /predict, the lock prevents GPU/torch contention
+ ############
+ # jobs to be launched periodically
+
+ def _run_eval_and_send_data():
      with _model_lock:
-         acc = evaluate_accuracy()
+         acc = evaluate_accuracy(EVAL_SAMPLE_SIZE, EVAL_BATCH_SIZE)
          EVAL_ACCURACY.set(acc)

- scheduler = BackgroundScheduler(daemon=True)
+ def _run_sentiment_distr_and_send_data():
+     with _model_lock:
+         fractions = evaluate_sentiment_distribution(EVAL_SAMPLE_SIZE, EVAL_BATCH_SIZE)
+         for i, label in enumerate(labels):
+             SENTIMENT_BATCH_FRACTION.labels(label=label).set(fractions[i])

  @app.on_event("startup")
  def _start_scheduler():

      # run once soon after startup
-     scheduler.add_job(_run_eval_and_set_gauge, next_run_time=datetime.now() + timedelta(seconds=2))
+     scheduler.add_job(_run_eval_and_send_data, next_run_time=datetime.now() + timedelta(seconds=2))
      # then every EVAL_PERIOD_MIN minutes
-     scheduler.add_job(_run_eval_and_set_gauge, "interval", minutes=EVAL_PERIOD_MIN)
+     scheduler.add_job(_run_eval_and_send_data, "interval", minutes=EVAL_PERIOD_MIN)
+
+     # run once soon after startup
+     scheduler.add_job(_run_sentiment_distr_and_send_data, next_run_time=datetime.now() + timedelta(seconds=2))
+     # then every EVAL_PERIOD_MIN minutes
+     scheduler.add_job(_run_sentiment_distr_and_send_data, "interval", minutes=EVAL_PERIOD_MIN)
+
      scheduler.start()

  @app.on_event("shutdown")
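
One caveat in the new evaluate_sentiment_distribution: np.unique(pred_labels, return_counts=True)[1] returns counts only for the labels that actually occur in a batch, so a batch with zero predictions for some class shortens the array and misaligns the running totals (and the final short batch is empty whenever N_SAMPLES is an exact multiple of BATCH_SIZE). A sketch of a safer accumulation, offered as a suggestion rather than as part of the commit:

    # Sketch: np.bincount keeps one slot per class even when a class is absent.
    import numpy as np

    pred_labels = np.array([0, 2, 2, 0])   # hypothetical batch, no "neutral" (1)
    counts = np.zeros(3)
    # np.unique(...)[1] would give [2, 2] here and shift the totals;
    # bincount with minlength=3 gives [2, 0, 2], aligned with the label indices.
    counts += np.bincount(pred_labels, minlength=3)
    print(counts)   # [2. 0. 2.]
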
src/app/config.py CHANGED
@@ -2,7 +2,6 @@ import os
  from enum import Enum
  from pathlib import Path

-
  class ModelSource(str, Enum):
      HF = "hf"
      LOCAL = "local"
@@ -10,24 +9,9 @@ class ModelSource(str, Enum):
  MODEL_SOURCE = ModelSource(os.getenv("MODEL_SOURCE", "hf"))
  HF_MODEL = f"cardiffnlp/twitter-roberta-base-sentiment-latest"
  DATASET_PATH = Path("data/dataset")
+ EVAL_SAMPLE_SIZE = int(os.getenv("EVAL_SAMPLE_SIZE", "100"))
+ EVAL_PERIOD_MIN = float(os.getenv("EVAL_PERIOD_MIN", "30"))
+ EVAL_BATCH_SIZE = int(os.getenv("EVAL_BATCH_SIZE", "64"))
+ TRAIN_FRACTION_SIZE = float(os.getenv("TRAIN_FRACTION_SIZE", "0.2"))
+ EVAL_FRACTION_SIZE = float(os.getenv("EVAL_FRACTION_SIZE", "0.4"))

-
- EVAL_SAMPLE_SIZE = int(os.getenv("EVAL_SAMPLE_SIZE", "80"))
- EVAL_INTERVAL_HOURS = float(os.getenv("EVAL_INTERVAL_HOURS", "1"))
- RANDOM_SEED = int(os.getenv("RANDOM_SEED", "42"))
-
- EVAL_BATCH_SIZE = 64
- N_SAMPLES = 500
- EVAL_PERIOD_MIN = 1
-
-
- # def load_model_and_tokenizer(MODEL_SOURCE):
- #     if MODEL_SOURCE == ModelSource.HF: # use the latest model available in the HF hub
- #         tokenizer = AutoTokenizer.from_pretrained(HF_MODEL)
- #         model = AutoModelForSequenceClassification.from_pretrained(HF_MODEL)
- #     else: # use a locally fine tuned model
- #         local_model_path = Path("models/saved_model")
- #         assert local_model_path.exists(), """No local model was found. Run 'python3 src/train_model.py' first"""
- #         tokenizer = AutoTokenizer.from_pretrained("models/saved_tokenizer")
- #         model = AutoModelForSequenceClassification.from_pretrained("models/saved_model")
- #     return tokenizer, model
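
Because ModelSource is a str Enum, the ModelSource(os.getenv(...)) call at import time doubles as validation: a typo in the environment fails fast rather than silently selecting the wrong model path. A small sketch of the behaviour:

    # Sketch: the enum constructor validates the environment value at import time.
    import os
    from enum import Enum

    class ModelSource(str, Enum):
        HF = "hf"
        LOCAL = "local"

    os.environ["MODEL_SOURCE"] = "hf"
    print(ModelSource(os.getenv("MODEL_SOURCE", "hf")))   # ModelSource.HF
    # ModelSource("s3") would raise ValueError: 's3' is not a valid ModelSource
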
src/app/utils.py CHANGED
@@ -5,7 +5,10 @@ from datasets import load_dataset as hf_load_dataset
  from datasets import load_from_disk


- def preprocess(text):
+ def preprocess(text: str) -> str:
+     """
+     Returns an input text ready to be tokenized, masking user mentions and links.
+     """
      new_text = []
      for t in text.split(" "):
          t = '@user' if t.startswith('@') and len(t) > 1 else t
@@ -14,8 +17,11 @@ def preprocess(text):
      return " ".join(new_text)


-
- def load_model_and_tokenizer(MODEL_SOURCE):
+ def load_model_and_tokenizer(MODEL_SOURCE: str) -> (AutoTokenizer, AutoModelForSequenceClassification):
+     """
+     Loads a tokenizer and a sentiment-analysis model, either from local storage
+     or downloaded from the Hugging Face Hub.
+     """
      if MODEL_SOURCE == ModelSource.HF: # use the latest model available in the HF hub
          tokenizer = AutoTokenizer.from_pretrained(HF_MODEL)
          model = AutoModelForSequenceClassification.from_pretrained(HF_MODEL)
@@ -27,7 +33,11 @@ def load_model_and_tokenizer(MODEL_SOURCE):
      return tokenizer, model


- def load_dataset(dataset_path):
+ def load_dataset(dataset_path: str):
+     """
+     Loads the tweet_eval dataset for the sentiment-analysis task, either from
+     local disk or downloaded through the Hugging Face API.
+     """
      if dataset_path.exists():
          dataset = load_from_disk(dataset_path)
      else:
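
The hunk hides the middle of preprocess; assuming the standard cardiffnlp preprocessing it is based on (mentions become '@user' and links become 'http'), a quick check looks like:

    # Sketch: expected preprocess behaviour (the link-masking step is an
    # assumption, since that part of the body is outside the visible hunk).
    from app.utils import preprocess   # assumes imports resolve from src/

    print(preprocess("@elonmusk check https://example.com out"))
    # expected: "@user check http out"
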
src/nb.ipynb DELETED
@@ -1,208 +0,0 @@
- {
-  "cells": [
-   {
-    "cell_type": "code",
-    "execution_count": 48,
-    "id": "7aaceacb",
-    "metadata": {},
-    "outputs": [],
-    "source": [
-     "from pathlib import Path\n",
-     "from app.config import DATASET_PATH, MODEL_SOURCE\n",
-     "from app.utils import load_dataset, load_model_and_tokenizer, preprocess\n",
-     "from scipy.special import softmax\n"
-    ]
-   },
-   {
-    "cell_type": "code",
-    "execution_count": 49,
-    "id": "7defab3e",
-    "metadata": {},
-    "outputs": [
-     {
-      "name": "stderr",
-      "output_type": "stream",
-      "text": [
-       "Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']\n",
-       "- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
-       "- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n"
-      ]
-     }
-    ],
-    "source": [
-     "tokenizer, model = load_model_and_tokenizer(MODEL_SOURCE)\n"
-    ]
-   },
-   {
-    "cell_type": "code",
-    "execution_count": 24,
-    "id": "0a1dcfdd",
-    "metadata": {},
-    "outputs": [],
-    "source": [
-     "dataset = load_dataset(DATASET_PATH).shuffle()[\"test\"][:N_SAMPLES]\n"
-    ]
-   },
-   {
-    "cell_type": "code",
-    "execution_count": 33,
-    "id": "501e6728",
-    "metadata": {},
-    "outputs": [],
-    "source": [
-     "import torch"
-    ]
-   },
-   {
-    "cell_type": "code",
-    "execution_count": 47,
-    "id": "82b25de1",
-    "metadata": {},
-    "outputs": [
-     {
-      "data": {
-       "text/plain": [
-        "2"
-       ]
-      },
-      "execution_count": 47,
-      "metadata": {},
-      "output_type": "execute_result"
-     }
-    ],
-    "source": [
-     "N_BEVAL_BATCH_SIZE = 64\n",
-     "N_SAMPLES = 500\n",
-     "N_BATCHES = len(dataset[\"text\"])//EVAL_BATCH_SIZE\n",
-     "N_BATCHES"
-    ]
-   },
-   {
-    "cell_type": "code",
-    "execution_count": 54,
-    "id": "7dd5371b",
-    "metadata": {},
-    "outputs": [
-     {
-      "name": "stdout",
-      "output_type": "stream",
-      "text": [
-       "0 64\n",
-       "64 128\n",
-       "128 192\n",
-       "192 256\n",
-       "256 320\n",
-       "320 384\n",
-       "384 448\n",
-       "448 500\n"
-      ]
-     },
-     {
-      "data": {
-       "text/plain": [
-        "np.float64(0.71)"
-       ]
-      },
-      "execution_count": 54,
-      "metadata": {},
-      "output_type": "execute_result"
-     }
-    ],
-    "source": [
-     "EVAL_BATCH_SIZE = 64\n",
-     "N_SAMPLES = 500\n",
-     "def evaluate_accuracy():\n",
-     "\n",
-     "    dataset = load_dataset(DATASET_PATH).shuffle()[\"test\"][:N_SAMPLES]\n",
-     "    N_BATCHES = len(dataset[\"text\"])//EVAL_BATCH_SIZE\n",
-     "\n",
-     "    accuracy = 0\n",
-     "    for i in range(N_BATCHES+1):\n",
-     "\n",
-     "        start = i*EVAL_BATCH_SIZE\n",
-     "        end = min(N_SAMPLES, (i+1)*EVAL_BATCH_SIZE)\n",
-     "        print(start, end)\n",
-     "        samples, labels = dataset[\"text\"][start:end], dataset[\"label\"][start:end]\n",
-     "        \n",
-     "        model.eval()\n",
-     "        encoded_batch = tokenizer(\n",
-     "            [preprocess(t) for t in samples],\n",
-     "            padding=True,    # pad to same length\n",
-     "            truncation=True, # truncate long texts\n",
-     "            return_tensors=\"pt\",\n",
-     "        )\n",
-     "\n",
-     "        with torch.no_grad():\n",
-     "            output = model(**encoded_batch)\n",
-     "        \n",
-     "        logits = output[0].detach().cpu().numpy()\n",
-     "        scores = softmax(logits, axis=-1)\n",
-     "        pred_labels = scores.argmax(axis=-1)\n",
-     "        accuracy += sum(pred_labels==labels)\n",
-     "    accuracy/=N_SAMPLES\n",
-     "    return accuracy\n",
-     "evaluate_accuracy()"
-    ]
-   },
-   {
-    "cell_type": "code",
-    "execution_count": 1,
-    "id": "dbd3bb8c",
-    "metadata": {},
-    "outputs": [],
-    "source": [
-     "def _load_test_data():\n",
-     "    \"\"\"\n",
-     "    Expects CSV with columns: text,label\n",
-     "    label values must be one of labels (negative, neutral, positive) or their indices (0,1,2).\n",
-     "    \"\"\"\n",
-     "    df = pd.read_csv(TEST_DATA_PATH)\n",
-     "    # normalize label column to strings matching our 'labels' list\n",
-     "    if np.issubdtype(df[\"label\"].dtype, np.number):\n",
-     "        df[\"label\"] = df[\"label\"].astype(int).map(lambda i: labels[i])\n",
-     "    else:\n",
-     "        df[\"label\"] = df[\"label\"].str.lower().str.strip()\n",
-     "    # keep only supported labels\n",
-     "    df = df[df[\"label\"].isin(labels)].dropna(subset=[\"text\", \"label\"])\n",
-     "    return df"
-    ]
-   },
-   {
-    "cell_type": "code",
-    "execution_count": null,
-    "id": "ec0b086e",
-    "metadata": {},
-    "outputs": [],
-    "source": []
-   },
-   {
-    "cell_type": "code",
-    "execution_count": null,
-    "id": "800c8018",
-    "metadata": {},
-    "outputs": [],
-    "source": []
-   }
-  ],
-  "metadata": {
-   "kernelspec": {
-    "display_name": "ProjectEnv",
-    "language": "python",
-    "name": "python3"
-   },
-   "language_info": {
-    "codemirror_mode": {
-     "name": "ipython",
-     "version": 3
-    },
-    "file_extension": ".py",
-    "mimetype": "text/x-python",
-    "name": "python",
-    "nbconvert_exporter": "python",
-    "pygments_lexer": "ipython3",
-    "version": "3.11.10"
-   }
-  },
-  "nbformat": 4,
-  "nbformat_minor": 5
- }

src/train_model.py CHANGED
@@ -9,7 +9,13 @@ from transformers import (
      DataCollatorWithPadding
  )
  from pathlib import Path
- from app.config import DATASET_PATH
+ import os
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
+ from datasets import DatasetDict
+ from transformers import AutoTokenizer, DataCollatorWithPadding
+ from app.config import DATASET_PATH, TRAIN_FRACTION_SIZE, EVAL_FRACTION_SIZE
+
+


  # --- Device detection ---
@@ -77,18 +83,11 @@ model.config.use_cache = False
  dataset = load_dataset(DATASET_PATH)


- # ---- COPY-PASTE FROM HERE ----
- import os
- os.environ["TOKENIZERS_PARALLELISM"] = "false"
-
- from datasets import DatasetDict
- from transformers import AutoTokenizer, DataCollatorWithPadding
-
  def make_trainer_ready(
      raw_ds: DatasetDict,
      model_name: str = "cardiffnlp/twitter-roberta-base-sep2022",
      train_frac: float = 0.2,
-     val_frac: float = 0.2,
+     val_frac: float = 0.4,
      seed: int = 42,
      label_col: str = "label",
      text_col: str = "text",
@@ -156,8 +155,8 @@ def make_trainer_ready(
  train_ds, eval_ds, data_collator, tokenizer = make_trainer_ready(
      raw_ds=dataset,
      model_name="cardiffnlp/twitter-roberta-base-sep2022",
-     train_frac=0.2, # take 20% of train
-     val_frac=0.5, # take 50% of validation
+     train_frac=TRAIN_FRACTION_SIZE,  # take TRAIN_FRACTION_SIZE of the train split
+     val_frac=EVAL_FRACTION_SIZE,     # take EVAL_FRACTION_SIZE of the validation split
      seed=42,
      label_col="label",
      text_col="text",
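
The body of make_trainer_ready is outside the visible hunks, so the following is only a sketch of the fraction-based subsampling its train_frac/val_frac parameters suggest, not the committed implementation:

    # Sketch: take a seeded random fraction of a split, as train_frac implies.
    from datasets import load_dataset

    ds = load_dataset("tweet_eval", "sentiment")
    train_frac, seed = 0.2, 42
    n = int(len(ds["train"]) * train_frac)
    train_subset = ds["train"].shuffle(seed=seed).select(range(n))
    print(len(ds["train"]), "->", len(train_subset))
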
tests/test_data.py DELETED
@@ -1,4 +0,0 @@
- import pytest
-
- from datasets import load_dataset
-