GMARTINEZMILLA committed on
Commit
e00db24
1 Parent(s): 42f830f

Update utils.py

Browse files
Files changed (1) hide show
  1. utils.py +40 -42
utils.py CHANGED
@@ -9,57 +9,56 @@ from sklearn.metrics.pairwise import cosine_similarity
9
  from joblib import dump, load
10
  from sklearn.preprocessing import normalize
11
  import re
 
12
 
13
- def get_next_version(file_prefix, folder='RecommendationFiles/'):
14
- """Find the latest version of a file and return the next version's filename."""
15
- if not os.path.exists(folder):
16
- os.makedirs(folder) # Ensure the folder exists
17
-
18
- # Regular expression to match files like 'file_0001.joblib'
19
- pattern = re.compile(rf"{file_prefix}_(\d+)\.joblib")
20
- files = [f for f in os.listdir(folder) if pattern.match(f)]
21
 
22
- # Extract version numbers from matching files
23
- versions = [int(pattern.match(f).group(1)) for f in files]
 
24
 
25
- # Determine the next version number
26
- next_version = max(versions) + 1 if versions else 1
 
 
 
27
 
28
- # Return the next version filename with the folder path
29
- return os.path.join(folder, f"{file_prefix}_{next_version:04d}.joblib")
30
 
31
- def get_latest_version(file_prefix, folder='RecommendationFiles/'):
32
- """Find the latest version of a file to load."""
33
- if not os.path.exists(folder):
34
- raise FileNotFoundError(f"Folder '{folder}' does not exist")
35
-
36
- # Regular expression to match files like 'file_0001.joblib'
37
- pattern = re.compile(rf"{file_prefix}_(\d+)\.joblib")
38
- files = [f for f in os.listdir(folder) if pattern.match(f)]
39
-
40
- # Extract version numbers from matching files
41
- versions = [int(pattern.match(f).group(1)) for f in files]
 
42
 
43
- if versions:
44
- latest_version = max(versions)
45
- return os.path.join(folder, f"{file_prefix}_{latest_version:04d}.joblib")
46
- else:
47
- raise FileNotFoundError(f"No versions found for {file_prefix} in folder '{folder}'")
48
 
 
 
 
 
49
 
50
  def recomienda_tf(new_basket, cestas, productos):
51
-
52
- # Get the latest versions of the matrix and vectorizer from the folder
53
- tf_matrix_file = get_latest_version('count_matrix')
54
- count_vectorizer_file = get_latest_version('count_vectorizer')
55
 
56
- # Load the matrix TF and the vectorizer
57
  tf_matrix = load(tf_matrix_file)
58
- count = load(count_vectorizer_file)
59
 
60
  # Convert the new basket into TF (Term Frequency) format
61
  new_basket_str = ' '.join(new_basket)
62
- new_basket_vector = count.transform([new_basket_str])
63
  new_basket_tf = normalize(new_basket_vector, norm='l1') # Normalize the count matrix for the current basket
64
 
65
  # Compare the new basket with previous ones
@@ -122,11 +121,10 @@ def retroalimentacion(cestas, cesta_nueva):
122
  cestas.loc[len(cestas)] = cesta_unida
123
  st.success("✓ Cesta añadida al DataFrame.")
124
 
125
- # Re-save the updated baskets DataFrame
126
- file_path = 'RecommendationFiles/cestas_final.csv'
127
- cestas.to_csv(file_path, index=False)
128
 
129
- st.write(f"DEBUG: Se ha guardado la nueva cesta en {file_path}")
130
  else:
131
  st.warning("⚠️ La cesta ya existe en el DataFrame.")
132
 
@@ -138,7 +136,7 @@ def retroalimentacion(cestas, cesta_nueva):
138
 
139
  # Save new versions of the vectorizer and matrix
140
  count_vectorizer_file = get_next_version('count_vectorizer')
141
- tf_matrix_file = get_next_version('tf_matrix')
142
 
143
  dump(count_vectorizer, count_vectorizer_file)
144
  dump(tf_matrix, tf_matrix_file)
 
9
  from joblib import dump, load
10
  from sklearn.preprocessing import normalize
11
  import re
12
+ from datasets import load_dataset, Dataset
13
 
14
# Load the dataset from Hugging Face Datasets
def load_files_from_huggingface():
    """Fetch the recommendation artifacts from the Hugging Face dataset repo.

    Returns:
        tuple: (cestas, tf_matrix, count_vectorizer) where
            - cestas: pandas DataFrame read from 'cestas_final.csv'
            - tf_matrix: object deserialized via joblib from 'count_matrix_0001.joblib'
            - count_vectorizer: object deserialized via joblib from 'count_vectorizer_0001.joblib'

    NOTE(review): indexing a `datasets.Dataset` with a string returns a COLUMN,
    not a file path — `dataset['cestas_final.csv']` only works if the split
    literally has columns named after these files. If the intent is to download
    repo files, `huggingface_hub.hf_hub_download` is the usual API — TODO confirm.
    """
    # Download/load the 'train' split of the remote dataset repository.
    dataset = load_dataset("GMARTINEZMILLA/deepsinisghtz_dataset", split="train")

    # Load CSV file
    # assumes the value obtained here is a path/buffer pandas can read — TODO confirm
    cestas_file = dataset['cestas_final.csv']
    cestas = pd.read_csv(cestas_file)

    # Load joblib files
    # Hard-coded to version 0001 of both artifacts; keep in sync with
    # save_files_to_huggingface(), which writes version 0002.
    count_matrix_file = dataset['count_matrix_0001.joblib']
    count_vectorizer_file = dataset['count_vectorizer_0001.joblib']
    tf_matrix = load(count_matrix_file)
    count_vectorizer = load(count_vectorizer_file)

    return cestas, tf_matrix, count_vectorizer
 
29
 
30
# Save updated files back to Hugging Face Datasets
def save_files_to_huggingface(cestas, tf_matrix, count_vectorizer):
    """Persist the updated baskets DataFrame and model artifacts.

    Args:
        cestas: pandas DataFrame of baskets; written to 'cestas_final.csv'
            locally and pushed to the Hugging Face hub as a new dataset revision.
        tf_matrix: term-frequency matrix; dumped locally via joblib.
        count_vectorizer: fitted vectorizer; dumped locally via joblib.

    NOTE(review): only the DataFrame is pushed to the hub; the two joblib
    files are written to the local working directory with a hard-coded
    '_0002' version suffix and are NOT uploaded (see comment below).
    """
    # Save updated CSV file
    cestas.to_csv('cestas_final.csv', index=False)

    # Create new dataset and push to Hugging Face
    # push_to_hub requires valid authentication in the environment — TODO confirm.
    dataset = Dataset.from_pandas(cestas)
    dataset.push_to_hub("GMARTINEZMILLA/deepsinisghtz_dataset")

    # Save updated joblib files
    dump(tf_matrix, 'count_matrix_0002.joblib')  # Increment version
    dump(count_vectorizer, 'count_vectorizer_0002.joblib')  # Increment version

    # Optionally, push joblib files back to Hugging Face Datasets (if supported)
    # You can manually add these files to the dataset in the Hugging Face interface if needed
 
 
 
45
 
46
def get_next_version(file_prefix):
    """Build the filename for the next artifact version.

    Currently pinned to version '0002' (see save_files_to_huggingface,
    which writes the same version); generalize here if more versions
    are ever needed.
    """
    next_name = "{prefix}_0002.joblib".format(prefix=file_prefix)
    return next_name
50
 
51
  def recomienda_tf(new_basket, cestas, productos):
52
+ # Load the latest versions of the matrix and vectorizer
53
+ tf_matrix_file = 'count_matrix_0001.joblib'
54
+ count_vectorizer_file = 'count_vectorizer_0001.joblib'
 
55
 
 
56
  tf_matrix = load(tf_matrix_file)
57
+ count_vectorizer = load(count_vectorizer_file)
58
 
59
  # Convert the new basket into TF (Term Frequency) format
60
  new_basket_str = ' '.join(new_basket)
61
+ new_basket_vector = count_vectorizer.transform([new_basket_str])
62
  new_basket_tf = normalize(new_basket_vector, norm='l1') # Normalize the count matrix for the current basket
63
 
64
  # Compare the new basket with previous ones
 
121
  cestas.loc[len(cestas)] = cesta_unida
122
  st.success("✓ Cesta añadida al DataFrame.")
123
 
124
+ # Save the updated DataFrame and joblib files back to Hugging Face Datasets
125
+ save_files_to_huggingface(cestas, tf_matrix, count_vectorizer)
 
126
 
127
+ st.write("DEBUG: Los archivos se han guardado en Hugging Face Datasets.")
128
  else:
129
  st.warning("⚠️ La cesta ya existe en el DataFrame.")
130
 
 
136
 
137
  # Save new versions of the vectorizer and matrix
138
  count_vectorizer_file = get_next_version('count_vectorizer')
139
+ tf_matrix_file = get_next_version('count_matrix')
140
 
141
  dump(count_vectorizer, count_vectorizer_file)
142
  dump(tf_matrix, tf_matrix_file)