Spaces:

alaahussein
/

Winemakers-Diemma

Sleeping

App Files Files Community

Winemakers-Diemma / data_preparation_3.py

alaahussein

Upload folder using huggingface_hub

a17642c verified 2 months ago

raw history blame contribute delete

No virus

2.92 kB

	import pandas as pd
	from sklearn.model_selection import train_test_split
	from sklearn.preprocessing import StandardScaler
	from sklearn.ensemble import RandomForestClassifier
	from sklearn.metrics import classification_report, confusion_matrix
	from imblearn.over_sampling import SMOTE
	from imblearn.pipeline import Pipeline as imbpipeline
	import mlflow
	import dagshub

	# Train a RandomForest classifier that flags weeks whose total precipitation
	# reaches PRCP_THRESHOLD, and record the run (parameters, accuracy,
	# classification report, confusion matrix) in MLflow via DagsHub.
	#
	# Features for a given week are that week's mean TMAX/TMIN plus precipitation
	# history from the *preceding* weeks of the same year. The precipitation of
	# the week being classified is never part of a feature, because the label is
	# derived from it.

	# Initialize DagsHub and MLflow for tracking
	dagshub.init(repo_owner='alaa.hussein401', repo_name='Winemakers-Dilemma', mlflow=True)
	mlflow.set_tracking_uri('https://dagshub.com/alaa.hussein401/Winemakers-Dilemma.mlflow')

	# ISO weeks that are modelled (inclusive) and the number of earlier weeks
	# needed to build the lag / moving-average features.
	FIRST_WEEK = 35
	LAST_WEEK = 40
	HISTORY_WEEKS = 3
	# A week is labelled positive when its summed PRCP is at least this value.
	PRCP_THRESHOLD = 0.35

	# Everything below runs inside the MLflow run so that all logged parameters,
	# metrics and artifacts are attached to the run started here.
	with mlflow.start_run():
	    # Load the dataset
	    df = pd.read_csv('vineyard_weather_1948-2017.csv')

	    # Preprocess and feature engineering
	    df['DATE'] = pd.to_datetime(df['DATE'], format='%Y-%m-%d')
	    df['year'] = df['DATE'].dt.year
	    df['week_of_year'] = df['DATE'].dt.isocalendar().week

	    # Keep the modelled weeks plus HISTORY_WEEKS lead-in weeks, so that the
	    # first modelled week of every year has real history behind it.
	    df_filtered = df[
	        (df['week_of_year'] >= FIRST_WEEK - HISTORY_WEEKS)
	        & (df['week_of_year'] <= LAST_WEEK)
	    ]

	    # Aggregate data by year and week number (groupby sorts by both keys, so
	    # rows within a year are in ascending week order).
	    weekly_data = df_filtered.groupby(['year', 'week_of_year']).agg({
	        'PRCP': 'sum',
	        'TMAX': 'mean',
	        'TMIN': 'mean',
	    }).reset_index()

	    # Lag features are computed per year: shifting the concatenated series
	    # would feed the last week of the previous year into the first week of
	    # the next one.
	    # NOTE(review): assumes every year has a row for each consecutive week in
	    # the window; a missing week would silently widen the lag -- confirm.
	    prcp_by_year = weekly_data.groupby('year')['PRCP']
	    weekly_data['PRCP_lag1'] = prcp_by_year.shift(1)
	    weekly_data['PRCP_lag2'] = prcp_by_year.shift(2)
	    # Mean of the three *previous* weeks. The shift(1) keeps the current
	    # week's PRCP (the source of the label) out of the window; without it the
	    # label is exactly recoverable as 3 * MA3 - lag1 - lag2.
	    weekly_data['PRCP_MA3'] = prcp_by_year.transform(
	        lambda prcp: prcp.shift(1).rolling(window=3).mean()
	    )

	    # Drop the lead-in weeks, then any rows still holding NaN values created
	    # by the shifts and rolling means.
	    weekly_data = weekly_data[weekly_data['week_of_year'] >= FIRST_WEEK].dropna()

	    # Select features for training
	    # NOTE(review): TMAX/TMIN are aggregates of the same week as the label;
	    # confirm they are available at prediction time.
	    features = ['TMAX', 'TMIN', 'PRCP_lag1', 'PRCP_lag2', 'PRCP_MA3']
	    X = weekly_data[features]
	    y = (weekly_data['PRCP'] >= PRCP_THRESHOLD).astype(int)

	    # Split into train and test sets
	    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

	    # Create a pipeline that first standardizes the data, then applies SMOTE, and finally fits the model.
	    # The imblearn pipeline applies the SMOTE resampling step during fit only.
	    pipeline = imbpipeline([
	        ('scaler', StandardScaler()),
	        ('smote', SMOTE(random_state=42)),
	        ('classifier', RandomForestClassifier(n_estimators=200, random_state=42)),
	    ])

	    # Fit the pipeline
	    pipeline.fit(X_train, y_train)

	    # Make predictions
	    y_pred = pipeline.predict(X_test)

	    # Log parameters, metrics, and artifacts
	    mlflow.log_params(pipeline.named_steps['classifier'].get_params())
	    mlflow.log_metric('accuracy', pipeline.score(X_test, y_test))

	    # Generate and log classification report
	    report = classification_report(y_test, y_pred)
	    mlflow.log_text(report, 'classification_report_3.txt')

	    # Generate and log confusion matrix
	    conf_matrix = confusion_matrix(y_test, y_pred)
	    mlflow.log_text(str(conf_matrix), 'confusion_matrix_3.txt')

	    # Print the classification report and confusion matrix
	    print(report)
	    print(conf_matrix)