GMARTINEZMILLA committed on
Commit
e00db24
1 Parent(s): 42f830f

Update utils.py

Browse files
Files changed (1) hide show
  1. utils.py +40 -42
utils.py CHANGED
@@ -9,57 +9,56 @@ from sklearn.metrics.pairwise import cosine_similarity
9
  from joblib import dump, load
10
  from sklearn.preprocessing import normalize
11
  import re
 
12
 
13
- def get_next_version(file_prefix, folder='RecommendationFiles/'):
14
- """Find the latest version of a file and return the next version's filename."""
15
- if not os.path.exists(folder):
16
- os.makedirs(folder) # Ensure the folder exists
17
-
18
- # Regular expression to match files like 'file_0001.joblib'
19
- pattern = re.compile(rf"{file_prefix}_(\d+)\.joblib")
20
- files = [f for f in os.listdir(folder) if pattern.match(f)]
21
 
22
- # Extract version numbers from matching files
23
- versions = [int(pattern.match(f).group(1)) for f in files]
 
24
 
25
- # Determine the next version number
26
- next_version = max(versions) + 1 if versions else 1
 
 
 
27
 
28
- # Return the next version filename with the folder path
29
- return os.path.join(folder, f"{file_prefix}_{next_version:04d}.joblib")
30
 
31
- def get_latest_version(file_prefix, folder='RecommendationFiles/'):
32
- """Find the latest version of a file to load."""
33
- if not os.path.exists(folder):
34
- raise FileNotFoundError(f"Folder '{folder}' does not exist")
35
-
36
- # Regular expression to match files like 'file_0001.joblib'
37
- pattern = re.compile(rf"{file_prefix}_(\d+)\.joblib")
38
- files = [f for f in os.listdir(folder) if pattern.match(f)]
39
-
40
- # Extract version numbers from matching files
41
- versions = [int(pattern.match(f).group(1)) for f in files]
 
42
 
43
- if versions:
44
- latest_version = max(versions)
45
- return os.path.join(folder, f"{file_prefix}_{latest_version:04d}.joblib")
46
- else:
47
- raise FileNotFoundError(f"No versions found for {file_prefix} in folder '{folder}'")
48
 
 
 
 
 
49
 
50
  def recomienda_tf(new_basket, cestas, productos):
51
-
52
- # Get the latest versions of the matrix and vectorizer from the folder
53
- tf_matrix_file = get_latest_version('count_matrix')
54
- count_vectorizer_file = get_latest_version('count_vectorizer')
55
 
56
- # Load the matrix TF and the vectorizer
57
  tf_matrix = load(tf_matrix_file)
58
- count = load(count_vectorizer_file)
59
 
60
  # Convert the new basket into TF (Term Frequency) format
61
  new_basket_str = ' '.join(new_basket)
62
- new_basket_vector = count.transform([new_basket_str])
63
  new_basket_tf = normalize(new_basket_vector, norm='l1') # Normalize the count matrix for the current basket
64
 
65
  # Compare the new basket with previous ones
@@ -122,11 +121,10 @@ def retroalimentacion(cestas, cesta_nueva):
122
  cestas.loc[len(cestas)] = cesta_unida
123
  st.success("✓ Cesta añadida al DataFrame.")
124
 
125
- # Re-save the updated baskets DataFrame
126
- file_path = 'RecommendationFiles/cestas_final.csv'
127
- cestas.to_csv(file_path, index=False)
128
 
129
- st.write(f"DEBUG: Se ha guardado la nueva cesta en {file_path}")
130
  else:
131
  st.warning("⚠️ La cesta ya existe en el DataFrame.")
132
 
@@ -138,7 +136,7 @@ def retroalimentacion(cestas, cesta_nueva):
138
 
139
  # Save new versions of the vectorizer and matrix
140
  count_vectorizer_file = get_next_version('count_vectorizer')
141
- tf_matrix_file = get_next_version('tf_matrix')
142
 
143
  dump(count_vectorizer, count_vectorizer_file)
144
  dump(tf_matrix, tf_matrix_file)
 
9
  from joblib import dump, load
10
  from sklearn.preprocessing import normalize
11
  import re
12
+ from datasets import load_dataset, Dataset
13
 
14
# Load the dataset from Hugging Face Datasets
def load_files_from_huggingface():
    """Fetch the recommendation artifacts from the Hugging Face dataset repo.

    Returns:
        tuple: (cestas, tf_matrix, count_vectorizer) where
            - cestas: pandas DataFrame read from 'cestas_final.csv'
            - tf_matrix: object deserialized via joblib from 'count_matrix_0001.joblib'
            - count_vectorizer: object deserialized via joblib from 'count_vectorizer_0001.joblib'

    NOTE(review): indexing a `datasets.Dataset` with a string returns a COLUMN,
    not a file path — `dataset['cestas_final.csv']` only works if the split
    literally has columns named after these files. If the intent is to download
    repo files, `huggingface_hub.hf_hub_download` is the usual API — TODO confirm.
    """
    # Download/load the 'train' split of the remote dataset repository.
    dataset = load_dataset("GMARTINEZMILLA/deepsinisghtz_dataset", split="train")

    # Load CSV file
    # assumes the value obtained here is a path/buffer pandas can read — TODO confirm
    cestas_file = dataset['cestas_final.csv']
    cestas = pd.read_csv(cestas_file)

    # Load joblib files
    # Hard-coded to version 0001 of both artifacts; keep in sync with
    # save_files_to_huggingface(), which writes version 0002.
    count_matrix_file = dataset['count_matrix_0001.joblib']
    count_vectorizer_file = dataset['count_vectorizer_0001.joblib']
    tf_matrix = load(count_matrix_file)
    count_vectorizer = load(count_vectorizer_file)

    return cestas, tf_matrix, count_vectorizer
 
29
 
30
# Save updated files back to Hugging Face Datasets
def save_files_to_huggingface(cestas, tf_matrix, count_vectorizer):
    """Persist the updated baskets DataFrame and model artifacts.

    Args:
        cestas: pandas DataFrame of baskets; written to 'cestas_final.csv'
            locally and pushed to the Hugging Face hub as a new dataset revision.
        tf_matrix: term-frequency matrix; dumped locally via joblib.
        count_vectorizer: fitted vectorizer; dumped locally via joblib.

    NOTE(review): only the DataFrame is pushed to the hub; the two joblib
    files are written to the local working directory with a hard-coded
    '_0002' version suffix and are NOT uploaded (see comment below).
    """
    # Save updated CSV file
    cestas.to_csv('cestas_final.csv', index=False)

    # Create new dataset and push to Hugging Face
    # push_to_hub requires valid authentication in the environment — TODO confirm.
    dataset = Dataset.from_pandas(cestas)
    dataset.push_to_hub("GMARTINEZMILLA/deepsinisghtz_dataset")

    # Save updated joblib files
    dump(tf_matrix, 'count_matrix_0002.joblib')  # Increment version
    dump(count_vectorizer, 'count_vectorizer_0002.joblib')  # Increment version

    # Optionally, push joblib files back to Hugging Face Datasets (if supported)
    # You can manually add these files to the dataset in the Hugging Face interface if needed
 
 
 
45
 
46
def get_next_version(file_prefix):
    """Build the filename for the next artifact version.

    Currently pinned to version '0002' (see save_files_to_huggingface,
    which writes the same version); generalize here if more versions
    are ever needed.
    """
    next_name = "{prefix}_0002.joblib".format(prefix=file_prefix)
    return next_name
50
 
51
  def recomienda_tf(new_basket, cestas, productos):
52
+ # Load the latest versions of the matrix and vectorizer
53
+ tf_matrix_file = 'count_matrix_0001.joblib'
54
+ count_vectorizer_file = 'count_vectorizer_0001.joblib'
 
55
 
 
56
  tf_matrix = load(tf_matrix_file)
57
+ count_vectorizer = load(count_vectorizer_file)
58
 
59
  # Convert the new basket into TF (Term Frequency) format
60
  new_basket_str = ' '.join(new_basket)
61
+ new_basket_vector = count_vectorizer.transform([new_basket_str])
62
  new_basket_tf = normalize(new_basket_vector, norm='l1') # Normalize the count matrix for the current basket
63
 
64
  # Compare the new basket with previous ones
 
121
  cestas.loc[len(cestas)] = cesta_unida
122
  st.success("✓ Cesta añadida al DataFrame.")
123
 
124
+ # Save the updated DataFrame and joblib files back to Hugging Face Datasets
125
+ save_files_to_huggingface(cestas, tf_matrix, count_vectorizer)
 
126
 
127
+ st.write("DEBUG: Los archivos se han guardado en Hugging Face Datasets.")
128
  else:
129
  st.warning("⚠️ La cesta ya existe en el DataFrame.")
130
 
 
136
 
137
  # Save new versions of the vectorizer and matrix
138
  count_vectorizer_file = get_next_version('count_vectorizer')
139
+ tf_matrix_file = get_next_version('count_matrix')
140
 
141
  dump(count_vectorizer, count_vectorizer_file)
142
  dump(tf_matrix, tf_matrix_file)