Spaces:

Subi003
/

ToxicTweet-Tagger

Sleeping

App Files Files Community

ToxicTweet-Tagger / components /feature_engineering.py

Subi003

Upload folder using huggingface_hub

4c01182 verified 9 days ago

raw

history blame contribute delete

3.82 kB

	import sys
	from src.core.constants import PARAMS_FILE
	from src.core.configuration import AppConfiguration
	from src.core.logger import logging
	from src.core.exception import AppException
	from src.utils import read_yaml, save_obj
	import gc
	import pandas as pd
	from scipy.sparse import csr_matrix
	from pathlib import Path
	from sklearn.feature_extraction.text import TfidfVectorizer


	class FeatureEngineering:
	    """Extract TF-IDF features from a dataframe and persist the results.

	    The feature engineering configuration obtained from the application
	    configuration is kept on ``eng_config``; ``perform_feature_engineering``
	    reads ``models_dir`` and ``training_data_path`` from it.
	    """

	    def __init__(self, config=None):
	        """
	        Initializes the FeatureEngineering object by creating a feature engineering configuration.

	        Args:
	            config (AppConfiguration, optional): The configuration object containing the
	                application configuration. When omitted, a new AppConfiguration is
	                created for this instance.

	        Raises:
	            AppException: If the feature engineering configuration cannot be created.
	        """
	        try:
	            # Build the default lazily. Writing ``config=AppConfiguration()`` in the
	            # signature evaluates it once, when the class is defined (import time),
	            # and shares that single object between all default-constructed instances.
	            if config is None:
	                config = AppConfiguration()
	            self.eng_config = config.feature_engineering_config()

	        except Exception as e:
	            logging.error(f"Failed to create feature engineering Configuration: {e}", exc_info=True)
	            raise AppException(e, sys)

	    def perform_feature_engineering(self, df: pd.DataFrame):
	        """
	        Performs feature engineering on the given dataframe by extracting features with TF-IDF vectorization.
	        Saves the fitted vectorizer object, a small metadata text file and the training dataset.

	        Args:
	            df (pd.DataFrame): Data with a 'Content' column, which is vectorized, and a
	                'Label' column, which is copied unchanged into the training dataset.

	        Raises:
	            AppException: If any step of the operation fails.
	        """
	        try:
	            config_params = read_yaml(PARAMS_FILE)
	            params = config_params.feature_engineering
	            vectorizer_name = params.vectorizer

	            vectorizer = TfidfVectorizer(
	                max_features=params.max_features,
	                min_df=params.min_df,
	                ngram_range=(params.ngrams.min, params.ngrams.max),
	            )

	            logging.info("Performing TF-IDF vectorization")
	            # fit_transform already returns a sparse matrix, so it can be densified
	            # directly without re-wrapping it first.
	            X_tfidf = vectorizer.fit_transform(df['Content'])

	            # One column per TF-IDF feature (integer column names) plus the label column.
	            # NOTE(review): toarray() materializes the full dense matrix in memory.
	            training_data = pd.DataFrame(X_tfidf.toarray())
	            training_data['Label'] = df['Label'].values

	            save_model_path = self.eng_config.models_dir
	            save_obj(location_path=save_model_path, obj_name="vectorizer.joblib", obj=vectorizer)

	            # Explicit encoding keeps the metadata file independent of the platform default.
	            with open(Path(save_model_path, "vectorizer_meta.txt"), 'w', encoding="utf-8") as f:
	                f.write(f"{vectorizer_name} has been created and fitted on the training data\n\n {params}")

	            logging.info("Saving training dataset")
	            training_data.to_feather(self.eng_config.training_data_path)

	            logging.info("Feature engineering operation done")

	        except Exception as e:
	            logging.error(f"Error - feature engineering operation terminated: {e}", exc_info=True)
	            raise AppException(e, sys)


	def initiate_feature_engineering():
	    """
	    Main function to initiate the feature engineering workflow. It reads the preprocessed
	    data from the configured path, drops rows containing missing values, and performs
	    feature engineering on the remaining data.

	    Raises:
	        AppException: If an error occurs during feature engineering, including a missing
	            preprocessed dataset path.
	    """
	    obj = FeatureEngineering()
	    try:
	        logging.info(f"{'=' * 20}Feature Engineering{'=' * 20}")
	        data_path = obj.eng_config.preprocessed_data_path
	        if not data_path:
	            # Stop here instead of passing an empty path on to read_feather; the
	            # handler below logs this message and wraps it in AppException.
	            raise FileNotFoundError("Dataset path after preprocessing stage not found")

	        df = pd.read_feather(data_path)
	        df.dropna(how='any', inplace=True)
	        obj.perform_feature_engineering(df)

	        # Drop the references to the dataframe and the engineering object, then
	        # request a garbage collection pass.
	        del df, obj
	        gc.collect()
	        logging.info(f"{'=' * 20}Feature Engineering Completed Successfully{'=' * 20} \n\n")

	    except Exception as e:
	        logging.error(f"Error during Feature Engineering: {e}", exc_info=True)
	        raise AppException(e, sys)


	if __name__ == "__main__":
	    # Run the feature engineering stage when this module is executed as a script.
	    initiate_feature_engineering()