"""Project-wide configuration: filesystem layout, dataset and model registries.

Importing this module calls ``load_dotenv()`` once, as a module-level side
effect, before any of the constants below are defined.
"""

from pathlib import Path

from dotenv import load_dotenv

# Load variables from a .env file (if one is found) into the process
# environment at import time.
load_dotenv()

# --- Filesystem layout -------------------------------------------------------
# Project root: the parent of the directory that contains this file
# (parents[0] is this file's directory, parents[1] is one level above it).
PROJ_ROOT = Path(__file__).resolve().parents[1]

# Data directories, all rooted at <PROJ_ROOT>/data.
DATA_DIR = PROJ_ROOT / "data"
RAW_DATA_DIR = DATA_DIR / "raw"
INTERIM_DATA_DIR = DATA_DIR / "interim"
ISSUE_REPORT_DIR = INTERIM_DATA_DIR / "issue-report-classification"
SOFT_CLEANED_DATA_DIR = ISSUE_REPORT_DIR / "soft-cleaned"
PROCESSED_DATA_DIR = DATA_DIR / "processed"
EXTERNAL_DATA_DIR = DATA_DIR / "external"
SAMPLES_DIR = DATA_DIR / "samples"
EMBEDDING_DIR = DATA_DIR / "embeddings"

# Model directory, rooted at <PROJ_ROOT>/models.
MODELS_DIR = PROJ_ROOT / "models"

# Report directories, rooted at <PROJ_ROOT>/reports.
REPORTS_DIR = PROJ_ROOT / "reports"
FIGURES_DIR = REPORTS_DIR / "figures"

# Single shared seed value exposed to the rest of the project.
RANDOM_SEED = 42

# --- Dataset registry --------------------------------------------------------
# Maps a dataset name to the information needed to read its CSV file:
#   data_path : file path as a string (presumably relative to one of the data
#               directories -- verify against the loader)
#   label_col : name of the column holding the label
#   title_col : name of the title column, or None when the dataset has none
#   body_col  : name of the column holding the main text
#   sep       : optional field delimiter; only present on entries that set one
DATASET_CONFIGs = {
    # Training splits
    "nasa_cfs_train": {
        "data_path": "nasa/cfs_train.csv",
        "label_col": "label",
        "title_col": None,
        "body_col": "issue",
    },
    "nasa_fprime_train": {
        "data_path": "nasa/fprime_train.csv",
        "label_col": "label",
        "title_col": "title",
        "body_col": "body",
    },
    "nasa_train": {
        "data_path": "nasa/nasa_train_sample.csv",
        "label_col": "label",
        "title_col": None,
        "body_col": "text",
    },
    "nlbse23_train": {
        "data_path": "nlbse23/nlbse23-issue-classification-train.csv",
        "label_col": "labels",
        "title_col": "title",
        "body_col": "body",
    },
    "nlbse24_train": {
        "data_path": "nlbse24/issues_train.csv",
        "label_col": "label",
        "title_col": None,
        "body_col": "issue",
    },
    "pySenti4SD_train": {
        # NOTE(review): this is the same file as "pySenti4SD_test" below.
        # Confirm it is intentional; training and evaluating on the same
        # file would leak evaluation data.
        "data_path": "pySenti4SD/test_stackoverflow.csv",
        "label_col": "Polarity",
        "title_col": None,
        "body_col": "Text",
        "sep": ";",
    },
    # Test splits
    "nasa_cfs_test": {
        "data_path": "nasa/cfs_test.csv",
        "label_col": "label",
        "title_col": None,
        "body_col": "issue",
    },
    "nasa_fprime_test": {
        "data_path": "nasa/fprime_test.csv",
        "label_col": "label",
        "title_col": "title",
        "body_col": "body",
    },
    "nasa_test": {
        "data_path": "nasa/nasa_test_sample.csv",
        "label_col": "label",
        "title_col": None,
        "body_col": "text",
    },
    "nlbse23_test": {
        "data_path": "nlbse23/nlbse23-issue-classification-test.csv",
        "label_col": "labels",
        "title_col": "title",
        "body_col": "body",
    },
    "nlbse24_test": {
        "data_path": "nlbse24/issues_test.csv",
        "label_col": "label",
        "title_col": None,
        "body_col": "issue",
    },
    "pySenti4SD_test": {
        "data_path": "pySenti4SD/test_stackoverflow.csv",
        "label_col": "Polarity",
        "title_col": None,
        "body_col": "Text",
        "sep": ";",
    },
    "test": {
        "data_path": "test/test.csv",
        "label_col": "label",
        "title_col": "title",
        "body_col": "body",
    },
}

# Conventionally-cased alias (matches MODEL_CONFIGS). The original mixed-case
# name is kept so existing imports keep working; both names refer to the same
# dict object.
DATASET_CONFIGS = DATASET_CONFIGs

# --- Model registry ----------------------------------------------------------
# Maps a model name to:
#   model_checkpoint : checkpoint identifier string
#   params           : training hyperparameters for that model
# Two families of "params" keys are used; which trainer consumes each family is
# not visible in this file.
MODEL_CONFIGS = {
    # "setfit-*" entries: batch_size / num_epochs / num_iterations keys.
    # NOTE(review): presumably consumed by a SetFit trainer -- confirm.
    "setfit-minilm": {
        "model_checkpoint": "sentence-transformers/all-MiniLM-L6-v2",
        "params": {
            "batch_size": 16,
            "num_epochs": 1,
            "num_iterations": 20,
            "learning_rate": 2e-5,
        },
    },
    "setfit-distilroberta": {
        "model_checkpoint": "sentence-transformers/all-distilroberta-v1",
        "params": {
            "batch_size": 16,
            "num_epochs": 1,
            "num_iterations": 20,
            "learning_rate": 2e-5,
        },
    },
    # Remaining entries: per_device_* / gradient_accumulation_steps keys.
    # NOTE(review): these names look like Hugging Face TrainingArguments
    # fields -- confirm they are passed through unchanged.
    "modernbert-base": {
        "model_checkpoint": "answerdotai/ModernBERT-base",
        "params": {
            "per_device_train_batch_size": 16,
            "per_device_eval_batch_size": 32,
            "gradient_accumulation_steps": 4,
            "num_train_epochs": 10,
            "learning_rate": 2e-5,
            "weight_decay": 0.01,
            "warmup_steps": 500,
        },
    },
    "roberta-base": {
        "model_checkpoint": "roberta-base",
        "params": {
            "per_device_train_batch_size": 16,
            "per_device_eval_batch_size": 32,
            "gradient_accumulation_steps": 4,
            "num_train_epochs": 15,
            "learning_rate": 2e-5,
            "weight_decay": 0.01,
            "warmup_steps": 500,
        },
    },
}

# --- MLflow settings ---------------------------------------------------------
# Tracking server URI and experiment name exposed to the rest of the project.
MLFLOW_TRACKING_URI = "https://dagshub.com/se4ai2526-uniba/Capibara.mlflow"
MLFLOW_EXPERIMENT_NAME = "Baselines_SetFit"