donato11's picture
Fixed Dataset Configs
fd01bb3
from pathlib import Path
from dotenv import load_dotenv
# Load environment variables from a .env file, if one exists. This runs as an
# import-time side effect, so importing this config module triggers it.
# NOTE(review): with python-dotenv's defaults, variables already present in
# the process environment are presumably not overridden — confirm if relevant.
load_dotenv()
# DIRECTORY PATHS
# Every location below is derived from PROJ_ROOT, so the layout is independent
# of the current working directory.

# Project root: the parent of the directory that contains this file.
PROJ_ROOT = Path(__file__).resolve().parents[1]

# Data tree.
DATA_DIR = PROJ_ROOT.joinpath("data")
RAW_DATA_DIR = DATA_DIR.joinpath("raw")
INTERIM_DATA_DIR = DATA_DIR.joinpath("interim")
ISSUE_REPORT_DIR = INTERIM_DATA_DIR.joinpath("issue-report-classification")
SOFT_CLEANED_DATA_DIR = ISSUE_REPORT_DIR.joinpath("soft-cleaned")
PROCESSED_DATA_DIR = DATA_DIR.joinpath("processed")
EXTERNAL_DATA_DIR = DATA_DIR.joinpath("external")
SAMPLES_DIR = DATA_DIR.joinpath("samples")
EMBEDDING_DIR = DATA_DIR.joinpath("embeddings")

# Model artifacts.
MODELS_DIR = PROJ_ROOT.joinpath("models")

# Reports and the figures nested inside them.
REPORTS_DIR = PROJ_ROOT.joinpath("reports")
FIGURES_DIR = REPORTS_DIR.joinpath("figures")

# Fixed seed value shared through this config module.
RANDOM_SEED = 42
# DATASETS CONFIGURATION
# Maps a dataset identifier to the information needed to read it:
#   data_path - location of the CSV file (presumably relative to a data
#               directory; confirm against the loader that consumes it)
#   label_col - name of the column holding the label
#   title_col - name of the title column, or None when the entry declares none
#   body_col  - name of the column holding the text
#   sep       - optional field separator; only the pySenti4SD entries set it
DATASET_CONFIGS = {
    # --- Training splits ---
    "nasa_cfs_train": {
        "data_path": "nasa/cfs_train.csv",
        "label_col": "label",
        "title_col": None,
        "body_col": "issue",
    },
    "nasa_fprime_train": {
        "data_path": "nasa/fprime_train.csv",
        "label_col": "label",
        "title_col": "title",
        "body_col": "body",
    },
    "nasa_train": {
        "data_path": "nasa/nasa_train_sample.csv",
        "label_col": "label",
        "title_col": None,
        "body_col": "text",
    },
    "nlbse23_train": {
        "data_path": "nlbse23/nlbse23-issue-classification-train.csv",
        "label_col": "labels",
        "title_col": "title",
        "body_col": "body",
    },
    "nlbse24_train": {
        "data_path": "nlbse24/issues_train.csv",
        "label_col": "label",
        "title_col": None,
        "body_col": "issue",
    },
    # NOTE(review): this entry points at the same file as "pySenti4SD_test"
    # ("pySenti4SD/test_stackoverflow.csv"). If a dedicated training split
    # exists, reference it here so training and evaluation do not share data.
    "pySenti4SD_train": {
        "data_path": "pySenti4SD/test_stackoverflow.csv",
        "label_col": "Polarity",
        "title_col": None,
        "body_col": "Text",
        "sep": ";",
    },
    # --- Test splits ---
    "nasa_cfs_test": {
        "data_path": "nasa/cfs_test.csv",
        "label_col": "label",
        "title_col": None,
        "body_col": "issue",
    },
    "nasa_fprime_test": {
        "data_path": "nasa/fprime_test.csv",
        "label_col": "label",
        "title_col": "title",
        "body_col": "body",
    },
    "nasa_test": {
        "data_path": "nasa/nasa_test_sample.csv",
        "label_col": "label",
        "title_col": None,
        "body_col": "text",
    },
    "nlbse23_test": {
        "data_path": "nlbse23/nlbse23-issue-classification-test.csv",
        "label_col": "labels",
        "title_col": "title",
        "body_col": "body",
    },
    "nlbse24_test": {
        "data_path": "nlbse24/issues_test.csv",
        "label_col": "label",
        "title_col": None,
        "body_col": "issue",
    },
    "pySenti4SD_test": {
        "data_path": "pySenti4SD/test_stackoverflow.csv",
        "label_col": "Polarity",
        "title_col": None,
        "body_col": "Text",
        "sep": ";",
    },
    # --- Generic entry keyed simply as "test" ---
    "test": {
        "data_path": "test/test.csv",
        "label_col": "label",
        "title_col": "title",
        "body_col": "body",
    },
}

# Backward-compatible alias: the mapping was originally exposed under the
# mixed-case name `DATASET_CONFIGs`. Existing imports keep working (both names
# refer to the same dict object); new code should prefer `DATASET_CONFIGS`,
# which matches the UPPER_SNAKE_CASE naming of `MODEL_CONFIGS`.
DATASET_CONFIGs = DATASET_CONFIGS
# MODELS CONFIGURATION


def _setfit_params():
    """Return a fresh copy of the `params` dict shared by the SetFit entries."""
    return {
        "batch_size": 16,
        "num_epochs": 1,
        "num_iterations": 20,
        "learning_rate": 2e-5,
    }


def _transformer_params(num_train_epochs):
    """Return a fresh `params` dict for a standard Transformers entry.

    Only ``num_train_epochs`` differs between those entries; every other value
    is shared. A new dict is built on each call so entries never share state.
    """
    return {
        "per_device_train_batch_size": 16,
        "per_device_eval_batch_size": 32,
        "gradient_accumulation_steps": 4,
        "num_train_epochs": num_train_epochs,
        "learning_rate": 2e-5,
        "weight_decay": 0.01,
        "warmup_steps": 500,
    }


# Maps a model identifier to its checkpoint name and its `params` dict.
MODEL_CONFIGS = {
    # SetFit models
    "setfit-minilm": {
        "model_checkpoint": "sentence-transformers/all-MiniLM-L6-v2",
        "params": _setfit_params(),
    },
    "setfit-distilroberta": {
        "model_checkpoint": "sentence-transformers/all-distilroberta-v1",
        "params": _setfit_params(),
    },
    # Standard Transformers models
    "modernbert-base": {
        "model_checkpoint": "answerdotai/ModernBERT-base",
        "params": _transformer_params(num_train_epochs=10),
    },
    "roberta-base": {
        "model_checkpoint": "roberta-base",
        "params": _transformer_params(num_train_epochs=15),
    },
}
# --- MLFLOW SETTINGS ---
# URI of the MLflow tracking server and the experiment name to use with it.
MLFLOW_TRACKING_URI: str = "https://dagshub.com/se4ai2526-uniba/Capibara.mlflow"
MLFLOW_EXPERIMENT_NAME: str = "Baselines_SetFit"