GMARTINEZMILLA committed on
Commit
e0fb16c
·
verified ·
1 Parent(s): afadb57

Update utils.py

Browse files
Files changed (1) hide show
  1. utils.py +46 -36
utils.py CHANGED
@@ -11,6 +11,9 @@ import re
11
 
12
  def get_next_version(file_prefix, folder='RecommendationFiles/'):
13
  """Find the latest version of a file and return the next version's filename."""
 
 
 
14
  # Regular expression to match files like 'file_0001.joblib'
15
  pattern = re.compile(rf"{file_prefix}_(\d+)\.joblib")
16
  files = [f for f in os.listdir(folder) if pattern.match(f)]
@@ -19,16 +22,16 @@ def get_next_version(file_prefix, folder='RecommendationFiles/'):
19
  versions = [int(pattern.match(f).group(1)) for f in files]
20
 
21
  # Determine the next version number
22
- if versions:
23
- next_version = max(versions) + 1
24
- else:
25
- next_version = 1 # If no versions exist, start with 1
26
 
27
- # Return the next version filename
28
- return f"{file_prefix}_{next_version:04d}.joblib"
29
 
30
  def get_latest_version(file_prefix, folder='RecommendationFiles/'):
31
  """Find the latest version of a file to load."""
 
 
 
32
  # Regular expression to match files like 'file_0001.joblib'
33
  pattern = re.compile(rf"{file_prefix}_(\d+)\.joblib")
34
  files = [f for f in os.listdir(folder) if pattern.match(f)]
@@ -38,61 +41,68 @@ def get_latest_version(file_prefix, folder='RecommendationFiles/'):
38
 
39
  if versions:
40
  latest_version = max(versions)
41
- return f"{file_prefix}_{latest_version:04d}.joblib"
42
  else:
43
- raise FileNotFoundError(f"No versions found for {file_prefix}")
44
 
45
 
46
  def recomienda_tf(new_basket, cestas, productos):
47
 
 
48
  tf_matrix_file = get_latest_version('count_matrix')
49
  count_vectorizer_file = get_latest_version('count_vectorizer')
50
 
51
- # Cargar la matriz TF y el modelo
52
  tf_matrix = load(tf_matrix_file)
53
  count = load(count_vectorizer_file)
54
 
55
- # Convertir la nueva cesta en formato TF (Term Frequency)
56
  new_basket_str = ' '.join(new_basket)
57
  new_basket_vector = count.transform([new_basket_str])
58
- new_basket_tf = normalize(new_basket_vector, norm='l1') # Normalizamos la matriz count de la cesta actual
59
- # Comparar la nueva cesta con las anteriores
 
60
  similarities = cosine_similarity(new_basket_tf, tf_matrix)
61
- # Obtener los índices de las cestas más similares
62
- similar_indices = similarities.argsort()[0][-4:] # Las 4 más similares
63
- # Crear un diccionario para contar las recomendaciones
 
 
64
  recommendations_count = {}
65
  total_similarity = 0
66
- # Recomendar productos de cestas similares
 
67
  for idx in similar_indices:
68
  sim_score = similarities[0][idx]
69
- total_similarity += sim_score # Suma de las similitudes
70
  products = cestas.iloc[idx]['Cestas'].split()
71
- # Usar un conjunto para evitar contar productos múltiples veces en la misma cesta
72
- unique_products = set(products) # Usar un conjunto para obtener productos únicos
73
- # Con esto evitamos que la importancia crezca por las unidades
74
  for product in unique_products:
75
- if product.strip() not in new_basket: # Evitar recomendar lo que ya está en la cesta
76
  recommendations_count[product.strip()] = recommendations_count.get(product.strip(), 0) + sim_score
77
- # Almacena el conteo de la relevancia de cada producto basado en cuántas veces aparece en las cestas similares, ponderado por la similitud de cada cesta.
78
- # Calcular la probabilidad relativa de cada producto recomendado
79
  recommendations_with_prob = []
80
- if total_similarity > 0: # Verificar que total_similarity no sea cero
81
  recommendations_with_prob = [(product, score / total_similarity) for product, score in recommendations_count.items()]
82
  else:
83
  print("No se encontraron similitudes suficientes para calcular probabilidades.")
84
 
85
- recommendations_with_prob.sort(key=lambda x: x[1], reverse=True) # Ordenar por puntuación
86
- # Crear un nuevo DataFrame para almacenar las recomendaciones
 
 
87
  recommendations_data = []
88
 
89
  for product, score in recommendations_with_prob:
90
- # Buscar la descripción en el DataFrame de productos
91
  description = productos.loc[productos['ARTICULO'] == product, 'DESCRIPCION']
92
  if not description.empty:
93
  recommendations_data.append({
94
  'ARTICULO': product,
95
- 'DESCRIPCION': description.values[0], # Obtener el primer valor encontrado
96
  'RELEVANCIA': score
97
  })
98
  recommendations_df = pd.DataFrame(recommendations_data)
@@ -100,30 +110,30 @@ def recomienda_tf(new_basket, cestas, productos):
100
  return recommendations_df.head(5)
101
 
102
  def retroalimentacion(cestas, cesta_nueva):
103
- # Pasamos de lista a cadena de texto
104
  cesta_unida = ' '.join(cesta_nueva)
105
- # Añadimos la cesta nueva al histórico de cestas. Primero comprobamos si la cesta nueva ya está
 
106
  if not cestas['Cestas'].isin([cesta_unida]).any():
107
- # Añadir la nueva cesta si no existe
108
  cestas.loc[len(cestas)] = cesta_unida
109
  print("Cesta añadida.")
110
- # Reescribimos la nueva cesta
111
- cestas.to_csv('cestas_final.csv')
 
112
  else:
113
  print("La cesta ya existe en el DataFrame.")
114
 
115
- # Vectorizamos de nuevo el df de cestas
116
  count_vectorizer = CountVectorizer()
117
  count_vectorizer.fit(cestas['Cestas'])
118
  count_matrix = count_vectorizer.transform(cestas['Cestas'])
119
  tf_matrix = normalize(count_matrix, norm='l1')
120
 
121
- # Guardar con nueva versión
122
  count_vectorizer_file = get_next_version('count_vectorizer')
123
  tf_matrix_file = get_next_version('tf_matrix')
124
 
125
  dump(count_vectorizer, count_vectorizer_file)
126
  dump(tf_matrix, tf_matrix_file)
127
-
128
 
129
  return None
 
11
 
12
def get_next_version(file_prefix, folder='RecommendationFiles/'):
    """Find the latest version of a file and return the next version's filename.

    Args:
        file_prefix: Base name of the versioned artifact, e.g. 'count_matrix'.
        folder: Directory holding the versioned .joblib files; created if missing.

    Returns:
        Path inside ``folder`` for the next version, e.g.
        ``RecommendationFiles/count_matrix_0003.joblib``.
    """
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(folder, exist_ok=True)

    # Match files like '<prefix>_0001.joblib'. re.escape guards against regex
    # metacharacters in the prefix; fullmatch rejects stray suffixes such as
    # 'count_matrix_0001.joblib.bak'.
    pattern = re.compile(rf"{re.escape(file_prefix)}_(\d+)\.joblib")
    versions = [
        int(pattern.fullmatch(name).group(1))
        for name in os.listdir(folder)
        if pattern.fullmatch(name)
    ]

    # Start at 1 when no previous versions exist.
    next_version = max(versions) + 1 if versions else 1

    # Return the next version filename with the folder path.
    return os.path.join(folder, f"{file_prefix}_{next_version:04d}.joblib")
29
 
30
def get_latest_version(file_prefix, folder='RecommendationFiles/'):
    """Find the latest version of a file to load.

    Returns the path of the highest-numbered ``<prefix>_NNNN.joblib`` file in
    ``folder``. Raises FileNotFoundError when the folder is missing or no
    versioned file for ``file_prefix`` exists.
    """
    # Guard clause: a missing folder means nothing was ever saved.
    if not os.path.exists(folder):
        raise FileNotFoundError(f"Folder '{folder}' does not exist")

    # Files look like 'file_0001.joblib'; capture the numeric version part.
    version_re = re.compile(rf"{file_prefix}_(\d+)\.joblib")
    candidates = (version_re.match(name) for name in os.listdir(folder))
    versions = [int(m.group(1)) for m in candidates if m]

    if not versions:
        raise FileNotFoundError(f"No versions found for {file_prefix} in folder '{folder}'")
    return os.path.join(folder, f"{file_prefix}_{max(versions):04d}.joblib")
47
 
48
 
49
def recomienda_tf(new_basket, cestas, productos):
    """Recommend up to five products based on TF similarity to past baskets.

    Args:
        new_basket: List of product identifiers in the current basket.
        cestas: DataFrame with a 'Cestas' column of space-separated baskets.
        productos: DataFrame with 'ARTICULO' and 'DESCRIPCION' columns.

    Returns:
        DataFrame with columns 'ARTICULO', 'DESCRIPCION', 'RELEVANCIA'
        holding at most the five most relevant recommendations.
    """
    # Load the most recently persisted TF matrix and vectorizer.
    matrix_path = get_latest_version('count_matrix')
    vectorizer_path = get_latest_version('count_vectorizer')
    tf_matrix = load(matrix_path)
    count = load(vectorizer_path)

    # Vectorize the incoming basket and L1-normalize it (term frequencies).
    basket_vector = count.transform([' '.join(new_basket)])
    new_basket_tf = normalize(basket_vector, norm='l1')

    # Cosine similarity against every historical basket; keep the top 4.
    similarities = cosine_similarity(new_basket_tf, tf_matrix)
    top_indices = similarities.argsort()[0][-4:]

    # Weight each candidate product by the similarity of the baskets it
    # appears in; total_similarity is used later to normalize the weights.
    weighted_scores = {}
    total_similarity = 0
    for idx in top_indices:
        sim_score = similarities[0][idx]
        total_similarity += sim_score
        # A set ignores repeated units of the same product within one basket,
        # so relevance does not grow with quantity.
        for raw_product in set(cestas.iloc[idx]['Cestas'].split()):
            name = raw_product.strip()
            if name not in new_basket:  # don't recommend what is already there
                weighted_scores[name] = weighted_scores.get(name, 0) + sim_score

    # Turn the weighted counts into relative probabilities.
    ranked = []
    if total_similarity > 0:
        ranked = [(product, weight / total_similarity) for product, weight in weighted_scores.items()]
    else:
        print("No se encontraron similitudes suficientes para calcular probabilidades.")

    # Highest relevance first.
    ranked.sort(key=lambda item: item[1], reverse=True)

    # Attach descriptions from the product catalogue; products without a
    # matching description are dropped.
    rows = []
    for product, score in ranked:
        description = productos.loc[productos['ARTICULO'] == product, 'DESCRIPCION']
        if not description.empty:
            rows.append({
                'ARTICULO': product,
                'DESCRIPCION': description.values[0],  # first match wins
                'RELEVANCIA': score,
            })
    recommendations_df = pd.DataFrame(rows)

    return recommendations_df.head(5)
111
 
112
def retroalimentacion(cestas, cesta_nueva):
    """Add a new basket to the history and re-train/persist the TF model.

    Args:
        cestas: DataFrame with a 'Cestas' column of space-separated baskets.
        cesta_nueva: New basket as a list of product identifiers.

    Returns:
        None. Side effects: may append a row to ``cestas`` and rewrite the
        CSV history; always saves a new versioned vectorizer and TF matrix.
    """
    # Baskets are stored as a single space-separated string.
    cesta_unida = ' '.join(cesta_nueva)

    # Add the new basket to the history only if it is not already present.
    if not cestas['Cestas'].isin([cesta_unida]).any():
        cestas.loc[len(cestas)] = cesta_unida
        print("Cesta añadida.")
        # Persist the updated basket history.
        cestas.to_csv('RecommendationFiles/cestas_final.csv', index=False)
    else:
        print("La cesta ya existe en el DataFrame.")

    # Re-vectorize the whole history and L1-normalize into term frequencies.
    count_vectorizer = CountVectorizer()
    count_vectorizer.fit(cestas['Cestas'])
    count_matrix = count_vectorizer.transform(cestas['Cestas'])
    tf_matrix = normalize(count_matrix, norm='l1')

    # BUG FIX: the matrix used to be saved under the 'tf_matrix' prefix, but
    # recomienda_tf loads it via get_latest_version('count_matrix'), so a
    # retrained matrix was never picked up. Save it under 'count_matrix' to
    # match the loader.
    count_vectorizer_file = get_next_version('count_vectorizer')
    tf_matrix_file = get_next_version('count_matrix')

    dump(count_vectorizer, count_vectorizer_file)
    dump(tf_matrix, tf_matrix_file)

    return None