film_mlmodule / src /data_preprocessing.py
root
first hf commit
6defa3d
raw
history blame
No virus
1.99 kB
import logging
import os
from abc import ABC, abstractmethod
from typing import Union

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
# Absolute path of the directory that contains this module; it is echoed to
# stdout once, when the module is first imported.
current_directory = os.path.split(os.path.abspath(__file__))[0]
print(current_directory)
class DataStrategy(ABC):
    """
    Interface for pluggable data-handling strategies.

    Subclasses must override ``handle_data``; until they do, the class
    cannot be instantiated.
    """

    @abstractmethod
    def handle_data(
        self,
        df_movie: pd.DataFrame,
        df_user: pd.DataFrame,
        df_rating: pd.DataFrame,
    ) -> Union[pd.DataFrame, pd.Series]:
        """
        Turn the movie, user and rating frames into the strategy's output.

        The base implementation does no work and returns None.
        """
        return None
class DataCombiningBaseline(DataStrategy):
    """
    Baseline strategy: reshape the ratings into a movie-by-user table.
    """

    def handle_data(
        self,
        df_movie: pd.DataFrame,
        df_user: pd.DataFrame,
        df_rating: pd.DataFrame,
    ) -> Union[pd.DataFrame, pd.Series]:
        """
        Pivot ``df_rating`` into one row per ``movie_id`` and one column per ``user_id``.

        Only ``df_rating`` is read; ``df_movie`` and ``df_user`` are accepted
        for interface compatibility and are not used here. The frame must
        provide the ``movie_id``, ``user_id`` and ``rating`` columns.
        """
        # Wide layout: movie_id on the index, user_id across the columns,
        # each cell holding the matching rating.
        ratings_wide = df_rating.pivot(index='movie_id', columns='user_id', values='rating')
        # Promote movie_id from the index back to an ordinary column.
        ratings_wide = ratings_wide.reset_index()
        # Cells with no rating stay NaN (swap in another filler such as 0 if preferred).
        ratings_wide = ratings_wide.fillna(np.nan)
        # Drop the leftover name on the columns index so the header is plain.
        ratings_wide.columns.name = None
        return ratings_wide
class DataPreprocessing:
    """
    Apply a data-handling strategy to the movie, user and rating frames.

    The class only stores its inputs and delegates the actual work to the
    supplied strategy; it does not itself split the data into train and test.
    """

    def __init__(self, df_movie: pd.DataFrame, df_user: pd.DataFrame, df_rating: pd.DataFrame, strategy: "DataStrategy"):
        """
        Store the three input frames and the strategy to run on them.

        Args:
            df_movie: Movie frame, passed through to the strategy unchanged.
            df_user: User frame, passed through to the strategy unchanged.
            df_rating: Rating frame, passed through to the strategy unchanged.
            strategy: Object whose ``handle_data(df_movie, df_user, df_rating)``
                method performs the processing.
        """
        self.df_movie = df_movie
        self.df_user = df_user
        self.df_rating = df_rating
        self.strategy = strategy

    def handle_data(self) -> Union[pd.DataFrame, pd.Series]:
        """
        Run the configured strategy and return whatever it produces.

        Raises:
            Exception: Any error raised by the strategy is logged and then
                re-raised unchanged.
        """
        try:
            return self.strategy.handle_data(self.df_movie, self.df_user, self.df_rating)
        except Exception as e:
            # Relies on the module-level `import logging`; without it this
            # handler would fail with NameError and hide the real error.
            logging.error("Error in handling data: %s", e)
            # Bare raise re-raises the active exception with its traceback intact.
            raise