AryanRajSaxena commited on
Commit
376f4d8
·
verified ·
1 Parent(s): d87307d

Upload 4 files

Browse files
Files changed (4) hide show
  1. app.py +482 -0
  2. requirements.txt +4 -0
  3. similarity_pipeline.py +66 -0
  4. utils.py +261 -0
app.py ADDED
@@ -0,0 +1,482 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import pandas as pd
3
+ import numpy as np
4
+ from typing import Dict, Any, Tuple
5
+
6
+ from utils import (
7
+ match_by_material_code,
8
+ process_specifications,
9
+ gower_similarity,
10
+ )
11
+
12
+
13
# Columns every uploaded workbook must contain; _ensure_required_columns()
# raises a gr.Error listing whichever of these are absent.
REQUIRED_COLUMNS = {
    "Material_Code",
    "Material_Group",
    "Base_Type",
    "Moulding_Type",
    "Product_Type",
    "components_Specifications",
}

# Sort rank for the attribute comparison table: mismatches first so the
# biggest differences surface at the top of the view.
STATUS_ORDER = {"Mismatch": 0, "Partial Match": 1, "Match": 2}
23
+
24
+
25
# Whitelist of columns load_dataset() keeps from the uploaded workbook;
# anything not listed here is dropped before searching.
# NOTE(review): "Packaging_Info_Palletss" looks like a typo but must match the
# source data's column name exactly — confirm against the workbook schema
# before renaming.
ALLOWED_COLUMNS = [
    "Material_Code",
    "Legislation",
    "Min_Dry_Cocoa_Solids",
    "Dry_Milk_Solids",
    "MilkFat",
    "SKU_Tag_Expanded",
    "Packaging_Info_Bag_Box",
    "Packaging_Info_Palletss",
    "Dry_Fat_Free_Cocoa_Solids",
    "Material_Group",
    "components_Specifications",
    "Sugars_g",
    "Protein_g",
    "Total_Fat_g",
    "Contains_Milk_Proteins",
    "Contains_Egg_Products",
    "Contains_Soy_Proteins",
    "Contains_Wheat",
    "Contains_Rye",
    "Contains_Fish",
    "Contains_Crustacean_And_Shell_Fish",
    "Contains_Hazelnuts_Almonds",
    "Contains_Peanuts",
    "Contains_Sulphite_E220_E227",
    "Contains_Celery",
    "Contains_Sesame_Products",
    "Suitable_For_Vegetarians",
    "Suitable_For_Vegans",
    "Contains_Peanut_Oil",
    "Contains_Mustard",
    "Contains_Molluscs",
    "Contains_Lupin",
    "Contains_Buckwheat",
    "Base_Type",
    "Moulding_Type",
    "Product_Type",
    "Colour_TF",
    "Kosher_Certificate",
    "Country_Claim",
    "Shelflife",
    "Packaging_Info",
    "Brand",
    "Commercial_Name",
    "Contains_Hydrogenated",
    "Hydrogenated",
    "Smallest_Unit_Weight_In_Kg",
    "Units_Per_Pallet",
    "Certification_Tag",
    "Colour_Type_Tag",
    "Flavor_Type_Tag",
    "Shape",
    "SKU_Material_Tag",
    "Origin",
    "Sku_Ingredient_Tag",
    "Is_Organic",
    "pH",
    "Normalised_Yield_Pa",
    "Normalised_Linear_Viscosity_mPaS",
    "Normalised_Casson_Mpa_S",
    "Brookfield_40C_S27_20_RPM",
    "Fineness_Micrometer",
    "Dimensions_Length",
    "Dimensions_Width",
    "Dimensions_Count_lb",
]
91
+
92
+
93
def _ensure_required_columns(df: pd.DataFrame) -> None:
    """Raise a gr.Error naming any REQUIRED_COLUMNS absent from *df*."""
    absent = sorted(REQUIRED_COLUMNS - set(df.columns))
    if not absent:
        return
    raise gr.Error(
        "The uploaded file is missing required columns: " + ", ".join(absent)
    )
100
+
101
+
102
+ def _format_value(value: Any) -> str:
103
+ if isinstance(value, (float, np.floating)):
104
+ if np.isnan(value):
105
+ return "-"
106
+ return f"{value:.4g}"
107
+ if isinstance(value, (int, np.integer)):
108
+ return str(value)
109
+ if value is None:
110
+ return "-"
111
+ text = str(value).strip()
112
+ return text if text else "-"
113
+
114
+
115
+ def _classify_match(anchor: Any, candidate: Any) -> str:
116
+ anchor_missing = pd.isna(anchor)
117
+ candidate_missing = pd.isna(candidate)
118
+
119
+ if anchor_missing and candidate_missing:
120
+ return "Match"
121
+ if anchor_missing or candidate_missing:
122
+ return "Partial Match"
123
+
124
+ if isinstance(anchor, (float, np.floating, int, np.integer)) and isinstance(
125
+ candidate, (float, np.floating, int, np.integer)
126
+ ):
127
+ if np.isclose(float(anchor), float(candidate), atol=1e-6):
128
+ return "Match"
129
+ return "Mismatch"
130
+
131
+ if str(anchor).strip().lower() == str(candidate).strip().lower():
132
+ return "Match"
133
+ return "Mismatch"
134
+
135
+
136
def load_dataset(file_path) -> Tuple[pd.DataFrame, Any, str]:
    """Read the uploaded Excel workbook and prime the UI.

    Returns the column-filtered DataFrame, a dropdown update carrying the
    distinct Legislation values (prefixed with "All"), and a status message.
    Raises gr.Error when no file is supplied, the file cannot be parsed, or
    required columns are absent.
    """
    if not file_path:
        raise gr.Error("Please upload an Excel data file.")
    if isinstance(file_path, (list, tuple)):
        if not file_path:
            raise gr.Error("Please upload an Excel data file.")
        file_path = file_path[0]

    try:
        df = pd.read_excel(file_path, engine="openpyxl")
    except Exception as exc:
        raise gr.Error(f"Unable to read the uploaded file: {exc}") from exc

    allowed_cols = ALLOWED_COLUMNS
    if allowed_cols:
        present_allowed = [col for col in allowed_cols if col in df.columns]
        if not present_allowed:
            raise gr.Error(
                "None of the expected columns were found in the uploaded file."
            )
        df = df[present_allowed]
        missing_allowed = [col for col in allowed_cols if col not in df.columns]
    else:
        missing_allowed = []

    _ensure_required_columns(df)

    # The Legislation filter dropdown needs the column to exist even when the
    # uploaded file lacks it.
    if "Legislation" not in df.columns:
        df["Legislation"] = "Unknown"

    distinct_laws = {str(v).strip() for v in df["Legislation"].dropna().unique()} - {""}
    legislation_options = ["All"] + sorted(distinct_laws)

    message = f"Loaded {len(df):,} rows with {df.shape[1]} columns."
    if allowed_cols:
        message += f" Using {len(present_allowed)} allowed column(s)."
    if missing_allowed:
        message += f" {len(missing_allowed)} expected column(s) were not found."
    return df, gr.update(choices=legislation_options, value=legislation_options[0]), message
180
+
181
+
182
def _prepare_similarity(
    df: pd.DataFrame,
    material_code: str,
    top_n: int,
    legislation_filter: str,
) -> Tuple[pd.DataFrame, Dict[str, Any], Any, str]:
    """Run the similarity search for one anchor SKU.

    Returns (display table, state dict consumed by _build_comparison, dropdown
    update listing candidate codes, status message). Raises gr.Error when no
    data is loaded, the material code is blank/unknown, or no rows share the
    anchor's grouping attributes.
    """
    if df is None:
        raise gr.Error("Please load a data file before searching.")

    material_code = material_code.strip()
    if not material_code:
        raise gr.Error("Enter a material code to search.")

    if material_code not in df["Material_Code"].values:
        raise gr.Error(f"Material code '{material_code}' was not found in the dataset.")

    # Candidates are restricted to rows sharing the anchor's grouping columns.
    matches = match_by_material_code(df, material_code)
    if matches.empty:
        raise gr.Error(
            "No comparable SKUs share the required grouping attributes with the anchor material."
        )

    # Spec columns = whatever process_specifications added beyond the columns
    # that existed before expansion (excluding the key column).
    base_non_spec_cols = [c for c in matches.columns if c != "components_Specifications"]
    matches_expanded = process_specifications(matches, material_code, df)
    spec_columns = [
        c
        for c in matches_expanded.columns
        if c not in base_non_spec_cols and c != "Material_Code"
    ]

    # Index label of the anchor row inside the expanded candidate frame.
    anchor_idx = matches_expanded.index[
        matches_expanded["Material_Code"] == material_code
    ][0]

    # Normalize text columns (strip + lowercase) so the Gower comparison is
    # case/whitespace-insensitive.
    gower_input = matches_expanded.copy()
    obj_cols = gower_input.select_dtypes(include="object").columns
    for col in obj_cols:
        gower_input[col] = gower_input[col].apply(
            lambda v: v.strip().lower() if isinstance(v, str) else v
        )

    # Identifier and filter columns must not influence the similarity itself.
    scores = gower_similarity(
        gower_input,
        query_idx=anchor_idx,
        boost="count",
        normalize=True,
        exclude_cols=["Material_Code", "Legislation"],
    )

    # Re-attach display attributes from the original frame (indices align
    # because matches/scores preserve df's index labels).
    results = scores.join(
        df[
            [
                "Material_Code",
                "Legislation",
                "Material_Group",
                "Base_Type",
                "Moulding_Type",
                "Product_Type",
            ]
        ],
        how="left",
    )

    # Drop the anchor both by index and by code (belt and braces against
    # duplicate material codes).
    results = results.loc[results.index != anchor_idx]
    results = results[results["Material_Code"].astype(str) != material_code]

    if legislation_filter and legislation_filter != "All":
        results = results[results["Legislation"].astype(str) == legislation_filter]

    # Best first: primary key is the coverage-boosted score, tie-break on raw
    # similarity.
    results = results.sort_values(
        ["score", "similarity"], ascending=[False, False]
    ).head(int(top_n))

    if results.empty:
        empty_message = "No similar SKUs found for the selected criteria."
        empty_dropdown = gr.update(choices=[], value=None)
        return pd.DataFrame(), {}, empty_dropdown, empty_message

    display_df = results[
        [
            "Material_Code",
            "Legislation",
            "distance",
            "similarity",
            "score",
            "used_count",
        ]
    ].copy()
    display_df[["distance", "similarity", "score"]] = display_df[
        ["distance", "similarity", "score"]
    ].round(4)

    # Everything _build_comparison needs to render an anchor-vs-candidate view.
    state = {
        "scores": scores,
        "matches_expanded": matches_expanded,
        "anchor_idx": anchor_idx,
        "anchor_code": material_code,
        "result_indices": results.index.tolist(),
        "spec_columns": spec_columns,
    }

    candidate_codes = results["Material_Code"].tolist()
    spec_msg = f" with {len(spec_columns)} component field(s)" if spec_columns else ""
    message = f"Found {len(display_df)} similar SKUs{spec_msg}."
    return (
        display_df.reset_index(drop=True),
        state,
        gr.update(choices=candidate_codes, value=candidate_codes[0]),
        message,
    )
292
+
293
+
294
def _build_comparison(
    search_state: Dict[str, Any], selected_code: str
) -> Tuple[str, pd.DataFrame]:
    """Build the anchor-vs-candidate summary line and attribute table.

    Returns a Markdown summary and a DataFrame with one row per compared
    attribute, sorted so mismatches appear first. Falls back to an
    instructional message and an empty frame when state or selection is
    missing.
    """
    if not search_state:
        return "Load results to compare SKUs.", pd.DataFrame()
    if not selected_code:
        return "Select a SKU to compare against the anchor.", pd.DataFrame()

    matches_expanded: pd.DataFrame = search_state["matches_expanded"]
    scores: pd.DataFrame = search_state["scores"]
    anchor_idx = search_state["anchor_idx"]
    anchor_code = search_state["anchor_code"]
    spec_columns = search_state.get("spec_columns", [])

    selected_rows = matches_expanded[
        matches_expanded["Material_Code"] == selected_code
    ]
    if selected_rows.empty:
        return "Selected SKU is not available for comparison.", pd.DataFrame()
    candidate_idx = selected_rows.index[0]

    anchor_row = matches_expanded.loc[anchor_idx]
    candidate_row = matches_expanded.loc[candidate_idx]

    # Grouping attributes first, then component specs, then everything else.
    base_columns = [
        "Material_Group",
        "Base_Type",
        "Moulding_Type",
        "Product_Type",
        "Legislation",
    ]
    skip = set(base_columns) | {"Material_Code"} | set(spec_columns)
    other_columns = [c for c in matches_expanded.columns if c not in skip]
    comparison_columns = base_columns + spec_columns + other_columns

    rows = []
    for col in comparison_columns:
        anchor_value = anchor_row.get(col, np.nan)
        candidate_value = candidate_row.get(col, np.nan)
        rows.append(
            {
                "Attribute": col,
                "Anchor Value": _format_value(anchor_value),
                "Candidate Value": _format_value(candidate_value),
                "Status": _classify_match(anchor_value, candidate_value),
            }
        )

    comparison_df = pd.DataFrame(rows)
    comparison_df["Status"] = pd.Categorical(
        comparison_df["Status"],
        categories=["Mismatch", "Partial Match", "Match"],
        ordered=True,
    )
    # Mismatches sort to the top so the biggest differences are seen first.
    comparison_df = comparison_df.sort_values("Status", key=lambda s: s.map(STATUS_ORDER))

    metrics = scores.loc[candidate_idx]
    score = metrics["score"]
    similarity = metrics["similarity"]
    distance = metrics["distance"]
    used = metrics["used_count"]

    spec_note = " (no component specs detected)" if not spec_columns else ""
    summary = (
        f"**{anchor_code} vs {selected_code}**{spec_note} \n"
        f"Score: {score:.4f} • Similarity: {similarity:.4f} • Distance: {distance:.4f} \n"
        f"Evidence Columns Used: {int(used)}"
    )

    return summary, comparison_df.reset_index(drop=True)
369
+
370
+
371
def build_interface() -> gr.Blocks:
    """Assemble the Gradio Blocks UI for the SKU similarity explorer.

    Wires three interactions: loading the Excel file, running the similarity
    search (which also auto-renders the top candidate's comparison), and
    re-rendering the comparison whenever the candidate dropdown changes.
    """
    with gr.Blocks(title="SKU Similarity Explorer", theme=gr.themes.Soft()) as demo:
        gr.Markdown(
            """
            ## SKU Similarity Explorer
            Upload a master data file, choose an anchor SKU, and explore the most similar alternatives.
            Use the Legislation filter to focus your results, then drill into any candidate for a side-by-side comparison
            with the anchor SKU to understand alignment across attributes and component specifications.
            """
        )

        # Cross-event state: the loaded DataFrame and the last search results.
        data_state = gr.State()
        search_state = gr.State()

        with gr.Column():
            with gr.Row():
                data_file = gr.File(
                    label="Master Data File (Excel)",
                    file_types=[".xlsx"],
                    type="filepath",
                    file_count="single",
                )
                load_button = gr.Button("Load Data", variant="primary")
            load_status = gr.Markdown("Upload your data file to begin.")

            # Choices are repopulated by load_dataset() after a file is read.
            legislation_filter = gr.Dropdown(
                label="Legislation Filter",
                choices=["All"],
                value="All",
            )

            with gr.Row():
                material_code_input = gr.Textbox(
                    label="Anchor Material Code",
                    placeholder="Enter the SKU to compare against",
                )
                topn_slider = gr.Slider(
                    label="Number of Similar SKUs",
                    minimum=1,
                    maximum=50,
                    value=10,
                    step=1,
                )
            find_button = gr.Button("Find Similar SKUs", variant="primary")

            results_status = gr.Markdown()
            results_table = gr.Dataframe(
                headers=[
                    "Material_Code",
                    "Legislation",
                    "distance",
                    "similarity",
                    "score",
                    "used_count",
                ],
                datatype=["str", "str", "number", "number", "number", "number"],
                interactive=False,
                label="Similar SKUs",
            )

            # Populated with candidate codes by _prepare_similarity().
            candidate_selector = gr.Dropdown(
                label="Compare Candidate",
                choices=[],
                interactive=True,
            )

            comparison_summary = gr.Markdown("Select a candidate SKU to review the comparison.")
            comparison_table = gr.Dataframe(
                headers=["Attribute", "Anchor Value", "Candidate Value", "Status"],
                interactive=False,
                label="Attribute-Level Comparison",
            )

        # Load the workbook and refresh the Legislation filter choices.
        load_button.click(
            fn=load_dataset,
            inputs=data_file,
            outputs=[data_state, legislation_filter, load_status],
        )

        # Run the search, then immediately render the comparison for the
        # auto-selected top candidate.
        find_event = find_button.click(
            fn=_prepare_similarity,
            inputs=[data_state, material_code_input, topn_slider, legislation_filter],
            outputs=[results_table, search_state, candidate_selector, results_status],
        )

        find_event.then(
            fn=_build_comparison,
            inputs=[search_state, candidate_selector],
            outputs=[comparison_summary, comparison_table],
        )

        # Re-render the comparison whenever the user picks another candidate.
        candidate_selector.change(
            fn=_build_comparison,
            inputs=[search_state, candidate_selector],
            outputs=[comparison_summary, comparison_table],
        )

        gr.Markdown(
            """
            #### Tips
            - Ensure the uploaded file contains the required attributes listed in the documentation.
            - Use the Legislation filter to focus on products compliant with specific regions or standards.
            - Scores combine similarity with evidence coverage, so higher scores indicate both alignment and stronger data backing.
            """
        )

    return demo
478
+
479
+
480
if __name__ == "__main__":
    # Build and serve the Gradio UI when executed as a script.
    build_interface().launch()
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ pandas>=1.3.0
2
+ numpy>=1.20.0
3
+ openpyxl>=3.0.0 # Required for reading Excel files
4
+ gradio>=4.0.0
similarity_pipeline.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd # pip install pandas openpyxl
2
+ import numpy as np
3
+ from utils import (
4
+ match_by_material_code,
5
+ process_specifications,
6
+ gower_similarity
7
+ )
8
+
9
def find_similar_materials(material_code: str, data_path: str, top_n: int = 10) -> pd.DataFrame:
    """Return the *top_n* SKUs most similar to *material_code*.

    Reads the master-data workbook at *data_path*, restricts candidates to
    rows sharing the anchor's grouping attributes, expands component
    specifications, and ranks candidates by Gower-style similarity. The
    returned frame carries the source columns plus 'distance', 'similarity',
    'score' and 'used_count'.

    Raises ValueError when no rows share the anchor's grouping attributes;
    any read/processing error is logged and re-raised.
    """
    # Only the columns the pipeline actually needs.
    active_cols = [
        'Material_Code', 'Material_Group', 'Base_Type', 'Moulding_Type',
        'Product_Type', 'components_Specifications', 'Legislation'
    ]

    try:
        df = pd.read_excel(data_path, usecols=active_cols)

        # Candidates must share the anchor's grouping attributes.
        matches = match_by_material_code(df, material_code)
        if matches.empty:
            raise ValueError(f"No matches found for material code: {material_code}")

        # Expand the component-specification dicts into real columns.
        matches_expanded = process_specifications(matches, material_code, df)

        # Score every candidate against the anchor row.
        q_idx = df.index[df['Material_Code'] == material_code][0]
        scores = gower_similarity(
            matches_expanded,
            query_idx=q_idx,
            boost='count',
            normalize=True,
            exclude_cols=['Material_Code', 'Legislation']
        )

        # Bug fix: the anchor always scores highest, so drop it — otherwise
        # the query SKU is returned as its own best match (the app-side
        # search already excludes it).
        scores = scores.drop(index=q_idx, errors='ignore')

        # scores is already sorted best-first; take the top N.
        top_indices = scores.head(top_n).index
        similar_materials = df.loc[top_indices].copy()

        # Attach the similarity metrics to the result rows.
        return similar_materials.join(
            scores[['distance', 'similarity', 'score', 'used_count']]
        )

    except Exception as e:
        print(f"Error processing material {material_code}: {str(e)}")
        raise
51
+
52
if __name__ == "__main__":
    # Example usage: run the pipeline against a local master-data workbook.
    # NOTE(review): hard-coded developer path — parameterize before sharing.
    data_file = "/Users/aryanrajsaxena/Desktop/BarryC/data_analysis/data-files/Master Data - Part 1.xlsx"
    material_code = "YYW-PN-G300297-E15"

    try:
        similar_materials = find_similar_materials(material_code, data_file)
        print(f"\nTop similar materials for {material_code}:")
        print(similar_materials[['Material_Code', 'Material_Group', 'similarity', 'score', 'used_count']])
    except FileNotFoundError:
        print(f"Error: Data file not found at {data_file}")
    except ValueError as e:
        # Raised when no rows share the anchor's grouping attributes.
        print(f"Error: {str(e)}")
    except Exception as e:
        print(f"Unexpected error processing material {material_code}: {str(e)}")
utils.py ADDED
@@ -0,0 +1,261 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ import ast
4
+ from typing import Optional, Iterable, Union
5
+
6
+ def _parse_dict_cell(x):
7
+ if isinstance(x, dict):
8
+ return x
9
+ if pd.isna(x):
10
+ return {}
11
+ try:
12
+ return ast.literal_eval(str(x))
13
+ except Exception:
14
+ return {}
15
+
16
+ def _flatten(d, parent=''):
17
+ out = {}
18
+ if not isinstance(d, dict):
19
+ return out
20
+ for k, v in d.items():
21
+ key = f"{parent}.{k}" if parent else f"{k}"
22
+ if isinstance(v, dict):
23
+ out.update(_flatten(v, key))
24
+ else:
25
+ # normalize list/tuple/set to a string so it's usable as a single value
26
+ if isinstance(v, (list, tuple, set)):
27
+ try:
28
+ v = ";".join(map(str, v))
29
+ except Exception:
30
+ v = str(v)
31
+ out[key] = v
32
+ return out
33
+
34
+ def _strip_percent_to_float(df: pd.DataFrame) -> pd.DataFrame:
35
+ out = df.copy()
36
+ obj_cols = out.select_dtypes(include=['object']).columns
37
+ for c in obj_cols:
38
+ s = out[c]
39
+ has_pct = s.astype(str).str.contains('%', na=False)
40
+ if not has_pct.any():
41
+ continue
42
+ # strip %, commas, spaces; convert to numeric
43
+ cleaned = s.astype(str).str.replace('%', '', regex=False).str.replace(',', '', regex=False).str.strip()
44
+ out[c] = pd.to_numeric(cleaned, errors='coerce')
45
+ return out
46
+
47
def get_spec_keys_from_material(df, material_code, spec_col='components_Specifications'):
    """Return the flattened spec keys of the first row matching *material_code*.

    Raises IndexError when the material code is not present in *df*.
    """
    first_idx = df.index[df['Material_Code'] == material_code][0]
    raw_specs = df.loc[first_idx, spec_col]
    return list(_flatten(_parse_dict_cell(raw_specs)).keys())
53
+
54
def match_by_material_code(df: pd.DataFrame, material_code, code_col='Material_Code'):
    """
    Return rows whose (Material_Group, Base_Type, Moulding_Type, Product_Type)
    exactly match those of *material_code*'s first occurrence in *df*.

    NaN grouping values match other NaNs. An unknown material code yields an
    empty frame with the same columns; missing columns raise ValueError.
    """
    group_cols = ['Material_Group', 'Base_Type', 'Moulding_Type', 'Product_Type']
    missing = [c for c in [code_col] + group_cols if c not in df.columns]
    if missing:
        raise ValueError(f"Missing required columns: {missing}")

    reference = df.loc[df[code_col] == material_code, group_cols]
    if reference.empty:
        # Unknown material code: empty result, schema preserved.
        return df.iloc[0:0].copy()

    anchor = reference.iloc[0]  # first occurrence wins
    mask = pd.Series(True, index=df.index)
    for col in group_cols:
        target = anchor[col]
        if pd.isna(target):
            mask &= df[col].isna()
        else:
            mask &= df[col].eq(target)

    return df.loc[mask].copy()
78
+
79
def process_specifications(matches, material_code, df, spec_col='components_Specifications'):
    """Expand the spec-dict column of *matches* into one column per key.

    Only keys present on the reference *material_code* row are kept; rows
    lacking a key get NaN. Whole columns of numeric-looking strings are
    coerced to numbers, and percentage strings become floats.
    """
    # The anchor material's keys define the spec schema.
    spec_keys = get_spec_keys_from_material(df, material_code)

    # Parse every row's dict, flatten it, and keep only the anchor's keys.
    flattened = matches[spec_col].apply(_parse_dict_cell).apply(_flatten)
    spec_df = pd.DataFrame(
        [{key: row.get(key, np.nan) for key in spec_keys} for row in flattened],
        index=matches.index,
    )

    def _to_numeric_if_possible(col: pd.Series) -> pd.Series:
        # All-or-nothing coercion: leave the column alone if any value resists.
        try:
            return pd.to_numeric(col)
        except (TypeError, ValueError):
            return col

    spec_df = spec_df.apply(_to_numeric_if_possible)

    # Replace the raw dict column with the expanded columns, then convert
    # any percentage strings to floats.
    expanded = matches.drop(columns=[spec_col]).join(spec_df)
    return _strip_percent_to_float(expanded)
107
+
108
def gower_similarity(
    df: pd.DataFrame,
    query_idx,
    weights: Optional[Union[dict, pd.Series]] = None,
    boost: str = 'count',          # 'count' or 'weight'
    normalize: bool = True,        # True -> final score kept in [0,1]
    exclude_cols: Optional[Iterable[str]] = None
) -> pd.DataFrame:
    """
    Weighted Gower-like similarity of every row of *df* against row *query_idx*.

    Missing values are handled anchor-centrically:

    Case 1: Anchor NaN, candidate has value -> column counts as used, distance 0
    Case 2: Anchor has value, candidate NaN -> column counts as not used
    Case 3: Both NaN                        -> column counts as used, distance 0
    Case 4: Both have values                -> standard Gower distance term

    Returns a DataFrame (indexed like *df*, sorted best-first) with columns
    'distance', 'similarity', 'score', 'used_count', 'used_weight', where
    score = similarity * coverage factor (controlled by *boost*/*normalize*).

    Raises ValueError if *exclude_cols* removes every column, and TypeError
    for an unsupported *weights* type.
    """
    # Defensive copy so the caller's frame is never mutated.
    X = df.copy()

    # Drop excluded columns (identifiers etc. that must not drive similarity).
    if exclude_cols:
        exclude = [c for c in exclude_cols if c in X.columns]
        X = X.drop(columns=exclude)

    cols = X.columns.tolist()
    n = len(X)
    if len(cols) == 0:
        raise ValueError("No columns left after excluding columns.")

    # Numeric and categorical columns use different distance terms.
    num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
    cat_cols = [c for c in cols if c not in num_cols]

    # Per-column weight series, default 1.0; unknown keys are ignored.
    if weights is None:
        w = pd.Series(1.0, index=cols, dtype='float64')
    elif isinstance(weights, (dict, pd.Series)):
        w = pd.Series(1.0, index=cols, dtype='float64')
        for k, v in weights.items():
            if k in w.index:
                w[k] = float(v)
    else:
        raise TypeError("weights must be None, dict, or pd.Series")

    # Anchor row, selected by index label.
    q = X.loc[query_idx]

    # NUMERIC PART
    if num_cols:
        A = X[num_cols].to_numpy(dtype='float64')    # shape (n, m_num)
        qA = q[num_cols].to_numpy(dtype='float64')   # shape (m_num,)

        anchor_nan = np.isnan(qA)   # True where the anchor is NaN
        data_nan = np.isnan(A)      # True where a candidate is NaN

        # Cases 1, 3 and 4 count as used; Case 2 does not.
        used_num = (~anchor_nan & ~data_nan) | anchor_nan
        # Distance is only computable where both sides have values (Case 4).
        valid_compare = ~anchor_nan & ~data_nan

        # Column ranges, robust to all-NaN and constant columns.
        col_max = np.nanmax(A, axis=0)
        col_min = np.nanmin(A, axis=0)
        ranges = col_max - col_min
        ranges = np.where(np.isnan(ranges) | (ranges == 0), 1.0, ranges)

        diff = np.abs(A - qA)            # broadcast (n, m_num)
        comp_num = diff / ranges         # scaled numeric difference
        comp_num[~valid_compare] = 0.0   # zero distance for Cases 1-3

        w_num = w[num_cols].to_numpy(dtype='float64')
        num_sum = (comp_num * w_num).sum(axis=1)
        num_used_w = (used_num * w_num).sum(axis=1)
        num_used_cnt = used_num.sum(axis=1)
    else:
        num_sum = np.zeros(n, dtype='float64')
        num_used_w = np.zeros(n, dtype='float64')
        num_used_cnt = np.zeros(n, dtype='int64')

    # CATEGORICAL PART
    if cat_cols:
        B = X[cat_cols].astype(object)
        qB = q[cat_cols].astype(object)

        anchor_miss = pd.isna(qB.values)   # True where the anchor is missing
        data_miss = B.isna().values        # True where a candidate is missing

        # Same anchor-centric bookkeeping as the numeric part.
        used_cat = (~anchor_miss & ~data_miss) | anchor_miss
        valid_compare = ~anchor_miss & ~data_miss

        # Equality only counts where both sides have values.
        equal = (B.values == qB.values) & valid_compare
        comp_cat = (~equal).astype('float64')   # 1.0 where values differ
        # Bug fix: cells with a missing side previously contributed a full
        # mismatch (1.0) to the distance numerator — Case-2 cells even did so
        # without entering the used-weight denominator, inflating distances.
        # Zero them out exactly like the numeric part does.
        comp_cat[~valid_compare] = 0.0

        w_cat = w[cat_cols].to_numpy(dtype='float64')
        cat_sum = (comp_cat * w_cat).sum(axis=1)
        cat_used_w = (used_cat * w_cat).sum(axis=1)
        cat_used_cnt = used_cat.sum(axis=1)
    else:
        cat_sum = np.zeros(n, dtype='float64')
        cat_used_w = np.zeros(n, dtype='float64')
        cat_used_cnt = np.zeros(n, dtype='int64')

    used_w = num_used_w + cat_used_w
    used_cnt = num_used_cnt + cat_used_cnt
    comp_sum = num_sum + cat_sum

    # Weighted mean distance over the used columns; no overlap -> NaN.
    with np.errstate(invalid='ignore', divide='ignore'):
        dist = comp_sum / used_w
    dist = np.where(used_w == 0, np.nan, dist)
    dist = np.clip(dist, 0.0, 1.0)

    similarity = 1.0 - dist

    # Coverage factor: rewards candidates with more usable evidence.
    total_weight = w.sum()
    total_count = len(cols)

    if boost == 'weight':
        if normalize:
            factor = np.where(total_weight > 0, used_w / total_weight, 0.0)
        else:
            factor = used_w.copy()
    else:  # 'count'
        if normalize:
            factor = used_cnt / total_count   # e.g. 14 of 15 columns used
        else:
            factor = used_cnt.astype(float)

    score = similarity * factor

    out = pd.DataFrame({
        'distance': dist,
        'similarity': similarity,
        'score': score,
        'used_count': used_cnt,
        'used_weight': used_w
    }, index=X.index)

    return out.sort_values(['score', 'similarity'], ascending=[False, False])