Spaces:
Running
Running
| import os | |
| import joblib | |
| from scipy.sparse import save_npz, load_npz | |
| import pandas as pd | |
| from app.utils.data_preprocessing import preprocess_data | |
| from app.utils.feature_engineering import create_feature_matrices | |
def load_or_create_data(csv_file_path, precomputed_dir, feature_weights):
    """Return the recommender data, from the on-disk cache when it is complete.

    The cache counts as complete only if every expected ``.joblib`` artifact
    and ``combined_matrix.npz`` exist inside ``precomputed_dir``. If any one
    of them is missing, everything is recomputed from the CSV and saved again.

    Note: the existence check looks at file names only. An existing cache is
    returned as-is and ``feature_weights`` is not consulted in that case.

    Args:
        csv_file_path: Path of the source CSV, used only when recomputing.
        precomputed_dir: Directory holding (or receiving) the cached artifacts.
        feature_weights: Forwarded to ``compute_and_save_data`` when the cache
            is incomplete.

    Returns:
        Whatever ``load_precomputed_data`` or ``compute_and_save_data`` returns.
    """
    joblib_stems = (
        'df',
        'tfidf_vectorizer_ingredients',
        'tfidf_vectorizer_keywords',
        'tfidf_vectorizer_keywords_name',
        'category_dummies',
        'scaler',
    )

    # Same probing order as before: the joblib artifacts first, the matrix last.
    required_paths = [
        os.path.join(precomputed_dir, f'{stem}.joblib') for stem in joblib_stems
    ]
    required_paths.append(os.path.join(precomputed_dir, 'combined_matrix.npz'))

    for artifact_path in required_paths:
        if not os.path.exists(artifact_path):
            # A single missing artifact invalidates the whole cache.
            return compute_and_save_data(csv_file_path, precomputed_dir, feature_weights)

    return load_precomputed_data(precomputed_dir)
def load_precomputed_data(precomputed_dir):
    """Load every cached artifact from ``precomputed_dir`` into a dict.

    Each of the six ``<name>.joblib`` files is read with ``joblib.load`` and
    ``combined_matrix.npz`` is read with ``scipy.sparse.load_npz``.

    NOTE(review): ``joblib.load`` unpickles its input, so this presumably
    expects ``precomputed_dir`` to be a trusted location -- confirm.

    Args:
        precomputed_dir: Directory containing the cached artifacts.

    Returns:
        dict keyed by artifact name (the six joblib names followed by
        ``'combined_matrix'``).
    """
    joblib_stems = (
        'df',
        'tfidf_vectorizer_ingredients',
        'tfidf_vectorizer_keywords',
        'tfidf_vectorizer_keywords_name',
        'category_dummies',
        'scaler',
    )

    loaded = {
        stem: joblib.load(os.path.join(precomputed_dir, f'{stem}.joblib'))
        for stem in joblib_stems
    }

    matrix_path = os.path.join(precomputed_dir, 'combined_matrix.npz')
    loaded['combined_matrix'] = load_npz(matrix_path)
    return loaded
def compute_and_save_data(csv_file_path, precomputed_dir, feature_weights):
    """Build all artifacts from the CSV, save them to disk, and return them.

    Steps: read the CSV with pandas, run it through ``preprocess_data``, pass
    the result and ``feature_weights`` to ``create_feature_matrices``, then
    write each artifact into ``precomputed_dir`` (created if it is missing).

    Args:
        csv_file_path: Path of the CSV file to read.
        precomputed_dir: Directory the artifacts are written to.
        feature_weights: Passed through unchanged to ``create_feature_matrices``.

    Returns:
        dict with keys ``'df'``, ``'tfidf_vectorizer_ingredients'``,
        ``'tfidf_vectorizer_keywords'``, ``'tfidf_vectorizer_keywords_name'``,
        ``'category_dummies'``, ``'scaler'`` and ``'combined_matrix'``.
    """
    raw_frame = pd.read_csv(csv_file_path)
    df = preprocess_data(raw_frame)

    # create_feature_matrices is unpacked as exactly six values, matrix first.
    (
        combined_matrix,
        ingredients_vectorizer,
        keywords_vectorizer,
        keywords_name_vectorizer,
        category_dummies,
        scaler,
    ) = create_feature_matrices(df, feature_weights)

    os.makedirs(precomputed_dir, exist_ok=True)

    data = {
        'df': df,
        'tfidf_vectorizer_ingredients': ingredients_vectorizer,
        'tfidf_vectorizer_keywords': keywords_vectorizer,
        'tfidf_vectorizer_keywords_name': keywords_name_vectorizer,
        'category_dummies': category_dummies,
        'scaler': scaler,
        'combined_matrix': combined_matrix,
    }

    for name in data:
        artifact = data[name]
        if name != 'combined_matrix':
            joblib.dump(artifact, os.path.join(precomputed_dir, f'{name}.joblib'))
            continue
        # The combined matrix goes through scipy's .npz writer instead of
        # joblib (presumably because it is a scipy sparse matrix -- confirm).
        save_npz(os.path.join(precomputed_dir, f'{name}.npz'), artifact)

    return data