Spaces:

turing-team
/

turing-space

Runtime error

App Files Files Community

turing-space / turing /modeling /models /randomForestTfIdf.py

papri-ka

Deploy FastAPI ML service to Hugging Face Spaces

5fc6e5d 5 days ago

raw

history blame contribute delete

4.86 kB

	import warnings
	from typing import Any

	from loguru import logger
	from numpy import ndarray
	from sklearn.ensemble import RandomForestClassifier
	from sklearn.feature_extraction.text import TfidfVectorizer
	from sklearn.metrics import (
	    accuracy_score,
	    classification_report,
	    f1_score,
	    precision_score,
	    recall_score,
	)
	from sklearn.model_selection import GridSearchCV
	from sklearn.multioutput import MultiOutputClassifier
	from sklearn.pipeline import Pipeline

	from ..baseModel import BaseModel

	warnings.filterwarnings("ignore")


	class RandomForestTfIdf(BaseModel):
	    """
	    Sklearn implementation of BaseModel with integrated Grid Search.
	    Builds a TF-IDF + RandomForest pipeline for multi-output text classification.
	    """

	    def __init__(self, language: str, path=None) -> None:
	        """
	        Initialize the RandomForestTfIdf model with configuration parameters.

	        Args:
	            language (str): Language for the model.
	            path (str, optional): Path to load a pre-trained model. Defaults to None.
	                If None, a new model is initialized.
	        """

	        # Configuration is assigned before delegating to the base class.
	        # NOTE(review): presumably BaseModel.__init__ calls setup_model() (which
	        # reads self.params) or loads from `path` -- confirm against BaseModel.
	        self.params = {"stop_words": "english", "random_state": 42, "cv_folds": 5}

	        # Search space for GridSearchCV; keys follow the pipeline step names
	        # ("tfidf", "clf") and the MultiOutputClassifier's wrapped "estimator".
	        self.grid_params = {
	            "clf__estimator__n_estimators": [50, 100, 200],
	            "clf__estimator__max_depth": [None, 10, 20],
	            "tfidf__max_features": [3000, 5000, 8000],
	        }

	        super().__init__(language, path)

	    def setup_model(self) -> None:
	        """
	        Initialize the scikit-learn pipeline with TF-IDF vectorizer and RandomForest classifier.

	        Stores the unfitted pipeline in both ``self.pipeline`` and ``self.model``.
	        """

	        base_estimator = RandomForestClassifier(
	            random_state=self.params["random_state"], n_jobs=-1
	        )

	        self.pipeline = Pipeline(
	            [
	                (
	                    "tfidf",
	                    TfidfVectorizer(ngram_range=(1, 2), stop_words=self.params["stop_words"]),
	                ),
	                # One RandomForest is fitted per output column.
	                ("clf", MultiOutputClassifier(base_estimator, n_jobs=-1)),
	            ]
	        )

	        self.model = self.pipeline
	        logger.info("Scikit-learn pipeline initialized.")

	    def train(self, X_train, y_train) -> dict[str, Any]:
	        """
	        Train the model using Grid Search to find the best hyperparameters.

	        After the search, ``self.model`` is replaced by the best estimator found.

	        Args:
	            X_train: Input training data.
	            y_train: True labels for training data.

	        Returns:
	            dict: The selected hyperparameters, under the keys ``max_features``,
	            ``n_estimators`` and ``max_depth``.

	        Raises:
	            ValueError: If no model pipeline has been initialized.
	        """

	        if self.model is None:
	            raise ValueError(
	                "Model pipeline is not initialized. Call setup_model() before training."
	            )

	        # Tune the pipeline built by setup_model() when there is one. An instance
	        # that only has self.model set (e.g. populated without setup_model())
	        # would otherwise pass the guard above and then fail on a missing
	        # `pipeline` attribute, so fall back to the model that was validated.
	        estimator = getattr(self, "pipeline", None)
	        if estimator is None:
	            estimator = self.model

	        logger.info(f"Starting training for: {self.language.upper()}")
	        logger.info("Performing Grid Search for best hyperparameters...")
	        grid_search = GridSearchCV(
	            estimator,
	            param_grid=self.grid_params,
	            cv=self.params["cv_folds"],
	            scoring="f1_weighted",
	            n_jobs=-1,
	            verbose=1,
	        )
	        grid_search.fit(X_train, y_train)

	        best_params = grid_search.best_params_
	        logger.success(f"Best params found: {best_params}")

	        # Expose the winning values under short names for the caller to log.
	        parameters_to_log = {
	            "max_features": best_params["tfidf__max_features"],
	            "n_estimators": best_params["clf__estimator__n_estimators"],
	            "max_depth": best_params["clf__estimator__max_depth"],
	        }

	        self.model = grid_search.best_estimator_
	        logger.success(f"Training for {self.language.upper()} completed.")

	        return parameters_to_log

	    def evaluate(self, X_test, y_test) -> dict[str, Any]:
	        """
	        Evaluate model on test data and return metrics.

	        Prints a classification report to stdout and logs a short summary.

	        Args:
	            X_test: Input test data.
	            y_test: True labels for test data.

	        Returns:
	            dict: ``accuracy``, macro-averaged ``precision`` and ``recall``, and
	            weighted-average ``f1_score``.

	        Raises:
	            ValueError: If the model is not available (raised by ``predict``).
	        """

	        y_pred = self.predict(X_test)

	        report = classification_report(y_test, y_pred, zero_division=0)
	        print("\n" + "=" * 50)
	        print("CLASSIFICATION REPORT")
	        print(report)
	        print("=" * 50 + "\n")

	        metrics = {
	            "accuracy": accuracy_score(y_test, y_pred),
	            "precision": precision_score(y_test, y_pred, average="macro", zero_division=0),
	            "recall": recall_score(y_test, y_pred, average="macro", zero_division=0),
	            # zero_division=0 matches the other metrics; labels with no predicted
	            # or true samples score 0 without emitting a warning.
	            "f1_score": f1_score(y_test, y_pred, average="weighted", zero_division=0),
	        }

	        logger.info(
	            f"Evaluation completed — Accuracy: {metrics['accuracy']:.3f}, F1: {metrics['f1_score']:.3f}"
	        )
	        return metrics

	    def predict(self, X) -> ndarray:
	        """
	        Make predictions using the trained model.

	        Args:
	            X: Input data for prediction.

	        Returns:
	            Predictions made by the model.

	        Raises:
	            ValueError: If ``self.model`` is None.
	        """

	        if self.model is None:
	            raise ValueError("Model is not trained. Call train() or load() before prediction.")

	        return self.model.predict(X)