Spaces:

AryanRajSaxena
/

BarryC

Sleeping

App Files Files Community

AryanRajSaxena committed 29 days ago

Commit

376f4d8

verified ·

1 Parent(s): d87307d

Upload 4 files

Browse files

Files changed (4) hide show

app.py +482 -0
requirements.txt +4 -0
similarity_pipeline.py +66 -0
utils.py +261 -0

app.py ADDED Viewed

	@@ -0,0 +1,482 @@

+import gradio as gr
+import pandas as pd
+import numpy as np
+from typing import Dict, Any, Tuple
+from utils import (
+    match_by_material_code,
+    process_specifications,
+    gower_similarity,
+)
# Columns an uploaded dataset must contain; enforced by
# _ensure_required_columns before any similarity search runs.
REQUIRED_COLUMNS = set(
    (
        "Material_Code",
        "Material_Group",
        "Base_Type",
        "Moulding_Type",
        "Product_Type",
        "components_Specifications",
    )
)

# Sort rank of each comparison status (lower rank is listed first); used as
# the sort key when the attribute-level comparison table is built.
STATUS_ORDER = dict(zip(("Mismatch", "Partial Match", "Match"), range(3)))
# Whitelist of dataset columns kept after loading. The order of this list is
# the column order of the loaded frame, so it must not be rearranged.
# NOTE(review): "Packaging_Info_Palletss" looks like a typo, but it has to
# match the header in the source workbook -- confirm before renaming.
ALLOWED_COLUMNS = [
    "Material_Code", "Legislation", "Min_Dry_Cocoa_Solids", "Dry_Milk_Solids",
    "MilkFat", "SKU_Tag_Expanded", "Packaging_Info_Bag_Box",
    "Packaging_Info_Palletss", "Dry_Fat_Free_Cocoa_Solids", "Material_Group",
    "components_Specifications", "Sugars_g", "Protein_g", "Total_Fat_g",
    "Contains_Milk_Proteins", "Contains_Egg_Products", "Contains_Soy_Proteins",
    "Contains_Wheat", "Contains_Rye", "Contains_Fish",
    "Contains_Crustacean_And_Shell_Fish", "Contains_Hazelnuts_Almonds",
    "Contains_Peanuts", "Contains_Sulphite_E220_E227", "Contains_Celery",
    "Contains_Sesame_Products", "Suitable_For_Vegetarians",
    "Suitable_For_Vegans", "Contains_Peanut_Oil", "Contains_Mustard",
    "Contains_Molluscs", "Contains_Lupin", "Contains_Buckwheat",
    "Base_Type", "Moulding_Type", "Product_Type", "Colour_TF",
    "Kosher_Certificate", "Country_Claim", "Shelflife", "Packaging_Info",
    "Brand", "Commercial_Name", "Contains_Hydrogenated", "Hydrogenated",
    "Smallest_Unit_Weight_In_Kg", "Units_Per_Pallet", "Certification_Tag",
    "Colour_Type_Tag", "Flavor_Type_Tag", "Shape", "SKU_Material_Tag",
    "Origin", "Sku_Ingredient_Tag", "Is_Organic", "pH",
    "Normalised_Yield_Pa", "Normalised_Linear_Viscosity_mPaS",
    "Normalised_Casson_Mpa_S", "Brookfield_40C_S27_20_RPM",
    "Fineness_Micrometer", "Dimensions_Length", "Dimensions_Width",
    "Dimensions_Count_lb",
]
def _ensure_required_columns(df: pd.DataFrame) -> None:
    """Raise a ``gr.Error`` listing every required column absent from ``df``.

    Returns ``None`` without side effects when all names in
    ``REQUIRED_COLUMNS`` are present.
    """
    absent = sorted(REQUIRED_COLUMNS.difference(df.columns))
    if not absent:
        return
    raise gr.Error(
        "The uploaded file is missing required columns: " + ", ".join(absent)
    )
+def _format_value(value: Any) -> str:
+    if isinstance(value, (float, np.floating)):
+        if np.isnan(value):
+            return "-"
+        return f"{value:.4g}"
+    if isinstance(value, (int, np.integer)):
+        return str(value)
+    if value is None:
+        return "-"
+    text = str(value).strip()
+    return text if text else "-"
+def _classify_match(anchor: Any, candidate: Any) -> str:
+    anchor_missing = pd.isna(anchor)
+    candidate_missing = pd.isna(candidate)
+    if anchor_missing and candidate_missing:
+        return "Match"
+    if anchor_missing or candidate_missing:
+        return "Partial Match"
+    if isinstance(anchor, (float, np.floating, int, np.integer)) and isinstance(
+        candidate, (float, np.floating, int, np.integer)
+    ):
+        if np.isclose(float(anchor), float(candidate), atol=1e-6):
+            return "Match"
+        return "Mismatch"
+    if str(anchor).strip().lower() == str(candidate).strip().lower():
+        return "Match"
+    return "Mismatch"
def load_dataset(file_path) -> Tuple[pd.DataFrame, Any, str]:
    """Read the uploaded Excel workbook and prepare it for searching.

    Args:
        file_path: Path of the uploaded file, or a list/tuple of paths of
            which only the first is used.

    Returns:
        A tuple ``(df, legislation_update, message)``: the loaded frame
        restricted to ``ALLOWED_COLUMNS``, a ``gr.update`` carrying the
        Legislation dropdown choices, and a status message.

    Raises:
        gr.Error: If no file was supplied, the file cannot be read, none of
            the allowed columns are present, or a required column is missing.
    """
    # Normalise the list/tuple form first so one emptiness check covers both.
    if isinstance(file_path, (list, tuple)):
        file_path = file_path[0] if file_path else None
    if not file_path:
        raise gr.Error("Please upload an Excel data file.")

    try:
        df = pd.read_excel(file_path, engine="openpyxl")
    except Exception as exc:
        raise gr.Error(f"Unable to read the uploaded file: {exc}") from exc

    allowed_cols = ALLOWED_COLUMNS
    if allowed_cols:
        present_allowed = [c for c in allowed_cols if c in df.columns]
        if not present_allowed:
            raise gr.Error(
                "None of the expected columns were found in the uploaded file."
            )
        missing_allowed = [c for c in allowed_cols if c not in df.columns]
        # Explicit copy: a column may be added below, and assigning into a
        # column subset of another frame triggers SettingWithCopyWarning.
        df = df[present_allowed].copy()
    else:
        missing_allowed = []

    _ensure_required_columns(df)

    if "Legislation" not in df.columns:
        df["Legislation"] = "Unknown"

    legislation_values = {str(v).strip() for v in df["Legislation"].dropna().unique()}
    legislation_options = ["All"] + sorted(legislation_values - {""})

    message = f"Loaded {len(df):,} rows with {df.shape[1]} columns."
    if allowed_cols:
        message += f" Using {len(present_allowed)} allowed column(s)."
        if missing_allowed:
            message += f" {len(missing_allowed)} expected column(s) were not found."
    return df, gr.update(choices=legislation_options, value=legislation_options[0]), message
def _prepare_similarity(
    df: pd.DataFrame,
    material_code: str,
    top_n: int,
    legislation_filter: str,
) -> Tuple[pd.DataFrame, Dict[str, Any], Any, str]:
    """Rank the SKUs most similar to the anchor material.

    Args:
        df: Dataset returned by ``load_dataset`` (``None`` before loading).
        material_code: Anchor material code typed by the user.
        top_n: Maximum number of candidates to return.
        legislation_filter: ``"All"`` or a Legislation value to keep.

    Returns:
        A tuple ``(table, state, dropdown_update, message)``: the ranked
        results, the state dict consumed by ``_build_comparison``, a
        ``gr.update`` for the candidate dropdown, and a status message. When
        nothing matches, the table and state are empty.

    Raises:
        gr.Error: If no dataset is loaded, the code is blank or unknown, or no
            row shares the anchor's grouping attributes.
    """
    if df is None:
        raise gr.Error("Please load a data file before searching.")
    # Tolerate None so a cleared textbox yields the friendly error below.
    material_code = (material_code or "").strip()
    if not material_code:
        raise gr.Error("Enter a material code to search.")
    if material_code not in df["Material_Code"].values:
        raise gr.Error(f"Material code '{material_code}' was not found in the dataset.")

    matches = match_by_material_code(df, material_code)
    if matches.empty:
        raise gr.Error(
            "No comparable SKUs share the required grouping attributes with the anchor material."
        )

    # Any column that appears only after expansion came from the spec dict.
    known_columns = {c for c in matches.columns if c != "components_Specifications"}
    known_columns.add("Material_Code")
    matches_expanded = process_specifications(matches, material_code, df)
    spec_columns = [c for c in matches_expanded.columns if c not in known_columns]

    anchor_idx = matches_expanded.index[
        matches_expanded["Material_Code"] == material_code
    ][0]

    # Compare text case-insensitively without touching the frame kept for display.
    gower_input = matches_expanded.copy()
    for col in gower_input.select_dtypes(include="object").columns:
        gower_input[col] = gower_input[col].apply(
            lambda v: v.strip().lower() if isinstance(v, str) else v
        )

    scores = gower_similarity(
        gower_input,
        query_idx=anchor_idx,
        boost="count",
        normalize=True,
        exclude_cols=["Material_Code", "Legislation"],
    )

    results = scores.join(
        df[
            [
                "Material_Code",
                "Legislation",
                "Material_Group",
                "Base_Type",
                "Moulding_Type",
                "Product_Type",
            ]
        ],
        how="left",
    )
    # Drop the anchor itself and any duplicate rows carrying its code.
    results = results.loc[results.index != anchor_idx]
    results = results[results["Material_Code"].astype(str) != material_code]

    if legislation_filter and legislation_filter != "All":
        # Dropdown options were stripped when the dataset was loaded, so strip
        # the cell values too; otherwise padded cells never match.
        legislation_text = results["Legislation"].astype(str).str.strip()
        results = results[legislation_text == legislation_filter]

    results = results.sort_values(
        ["score", "similarity"], ascending=[False, False]
    ).head(int(top_n))

    if results.empty:
        empty_message = "No similar SKUs found for the selected criteria."
        empty_dropdown = gr.update(choices=[], value=None)
        return pd.DataFrame(), {}, empty_dropdown, empty_message

    metric_columns = ["distance", "similarity", "score"]
    display_df = results[
        ["Material_Code", "Legislation"] + metric_columns + ["used_count"]
    ].copy()
    display_df[metric_columns] = display_df[metric_columns].round(4)

    state = {
        "scores": scores,
        "matches_expanded": matches_expanded,
        "anchor_idx": anchor_idx,
        "anchor_code": material_code,
        "result_indices": results.index.tolist(),
        "spec_columns": spec_columns,
    }
    candidate_codes = results["Material_Code"].tolist()
    spec_msg = f" with {len(spec_columns)} component field(s)" if spec_columns else ""
    message = f"Found {len(display_df)} similar SKUs{spec_msg}."
    return (
        display_df.reset_index(drop=True),
        state,
        gr.update(choices=candidate_codes, value=candidate_codes[0]),
        message,
    )
def _build_comparison(
    search_state: Dict[str, Any], selected_code: str
) -> Tuple[str, pd.DataFrame]:
    """Build the anchor-versus-candidate summary and attribute table.

    Args:
        search_state: State dict produced by ``_prepare_similarity``.
        selected_code: Material code of the candidate chosen in the dropdown.

    Returns:
        A tuple ``(summary_markdown, comparison_df)``. The table has one row
        per attribute, ordered Mismatch, Partial Match, Match. When there is
        nothing to compare, a hint message and an empty frame are returned.
    """
    if not search_state:
        return "Load results to compare SKUs.", pd.DataFrame()
    if not selected_code:
        return "Select a SKU to compare against the anchor.", pd.DataFrame()

    matches_expanded: pd.DataFrame = search_state["matches_expanded"]
    scores: pd.DataFrame = search_state["scores"]
    anchor_idx = search_state["anchor_idx"]
    anchor_code = search_state["anchor_code"]
    spec_columns = search_state.get("spec_columns", [])

    candidate_rows = matches_expanded[
        matches_expanded["Material_Code"] == selected_code
    ]
    if candidate_rows.empty:
        return "Selected SKU is not available for comparison.", pd.DataFrame()
    candidate_idx = candidate_rows.index[0]

    anchor_row = matches_expanded.loc[anchor_idx]
    candidate_row = matches_expanded.loc[candidate_idx]

    base_columns = [
        "Material_Group",
        "Base_Type",
        "Moulding_Type",
        "Product_Type",
        "Legislation",
    ]
    # Build the exclusion set once instead of concatenating lists per column.
    already_listed = set(base_columns) | {"Material_Code"} | set(spec_columns)
    other_columns = [c for c in matches_expanded.columns if c not in already_listed]
    comparison_columns = base_columns + spec_columns + other_columns

    rows = []
    for col in comparison_columns:
        anchor_value = anchor_row.get(col, np.nan)
        candidate_value = candidate_row.get(col, np.nan)
        rows.append(
            {
                "Attribute": col,
                "Anchor Value": _format_value(anchor_value),
                "Candidate Value": _format_value(candidate_value),
                "Status": _classify_match(anchor_value, candidate_value),
            }
        )

    # A stable sort keeps the base -> spec -> other attribute order inside each
    # status group; the default algorithm does not guarantee that.
    comparison_df = pd.DataFrame(rows).sort_values(
        "Status", key=lambda s: s.map(STATUS_ORDER), kind="stable"
    )

    score = scores.loc[candidate_idx, "score"]
    similarity = scores.loc[candidate_idx, "similarity"]
    distance = scores.loc[candidate_idx, "distance"]
    used = scores.loc[candidate_idx, "used_count"]
    spec_note = " (no component specs detected)" if not spec_columns else ""
    summary = (
        f"**{anchor_code} vs {selected_code}**{spec_note}  \n"
        f"Score: {score:.4f} • Similarity: {similarity:.4f} • Distance: {distance:.4f} \n"
        f"Evidence Columns Used: {int(used)}"
    )
    return summary, comparison_df.reset_index(drop=True)
def build_interface() -> gr.Blocks:
    """Assemble the Gradio Blocks UI and wire its event handlers.

    The layout holds, top to bottom: the upload row, the Legislation filter,
    the search row, the results table, the candidate selector, the comparison
    view, and a tips section. Returns the un-launched ``gr.Blocks`` object.
    """
    result_headers = [
        "Material_Code",
        "Legislation",
        "distance",
        "similarity",
        "score",
        "used_count",
    ]

    with gr.Blocks(title="SKU Similarity Explorer", theme=gr.themes.Soft()) as demo:
        gr.Markdown(
            """
            ## SKU Similarity Explorer
            Upload a master data file, choose an anchor SKU, and explore the most similar alternatives.
            Use the Legislation filter to focus your results, then drill into any candidate for a side-by-side comparison
            with the anchor SKU to understand alignment across attributes and component specifications.
            """
        )

        # Per-session state: the loaded dataset and the latest search output.
        data_state = gr.State()
        search_state = gr.State()

        with gr.Column():
            with gr.Row():
                upload = gr.File(
                    label="Master Data File (Excel)",
                    file_types=[".xlsx"],
                    type="filepath",
                    file_count="single",
                )
                load_button = gr.Button("Load Data", variant="primary")
            load_status = gr.Markdown("Upload your data file to begin.")

        legislation_filter = gr.Dropdown(
            label="Legislation Filter", choices=["All"], value="All"
        )

        with gr.Row():
            material_code_input = gr.Textbox(
                label="Anchor Material Code",
                placeholder="Enter the SKU to compare against",
            )
            topn_slider = gr.Slider(
                label="Number of Similar SKUs", minimum=1, maximum=50, value=10, step=1
            )
            find_button = gr.Button("Find Similar SKUs", variant="primary")

        results_status = gr.Markdown()
        results_table = gr.Dataframe(
            headers=result_headers,
            datatype=["str", "str", "number", "number", "number", "number"],
            interactive=False,
            label="Similar SKUs",
        )

        candidate_selector = gr.Dropdown(
            label="Compare Candidate", choices=[], interactive=True
        )
        comparison_summary = gr.Markdown("Select a candidate SKU to review the comparison.")
        comparison_table = gr.Dataframe(
            headers=["Attribute", "Anchor Value", "Candidate Value", "Status"],
            interactive=False,
            label="Attribute-Level Comparison",
        )

        def _wire_comparison(register):
            # Register the comparison refresh on the given event hook.
            register(
                fn=_build_comparison,
                inputs=[search_state, candidate_selector],
                outputs=[comparison_summary, comparison_table],
            )

        load_button.click(
            fn=load_dataset,
            inputs=upload,
            outputs=[data_state, legislation_filter, load_status],
        )
        search_event = find_button.click(
            fn=_prepare_similarity,
            inputs=[data_state, material_code_input, topn_slider, legislation_filter],
            outputs=[results_table, search_state, candidate_selector, results_status],
        )
        # Refresh the comparison after a search and on every dropdown change.
        _wire_comparison(search_event.then)
        _wire_comparison(candidate_selector.change)

        gr.Markdown(
            """
            #### Tips
            - Ensure the uploaded file contains the required attributes listed in the documentation.
            - Use the Legislation filter to focus on products compliant with specific regions or standards.
            - Scores combine similarity with evidence coverage, so higher scores indicate both alignment and stronger data backing.
            """
        )
    return demo
if __name__ == "__main__":
    # Build the Blocks app and start it when this file is run as a script.
    build_interface().launch()

requirements.txt ADDED Viewed

	@@ -0,0 +1,4 @@

+pandas>=1.3.0
+numpy>=1.20.0
+openpyxl>=3.0.0  # Required for reading Excel files
+gradio>=4.0.0

similarity_pipeline.py ADDED Viewed

	@@ -0,0 +1,66 @@

+import pandas as pd  # pip install pandas openpyxl
+import numpy as np
+from utils import (
+    match_by_material_code,
+    process_specifications,
+    gower_similarity
+)
def find_similar_materials(material_code: str, data_path: str, top_n: int = 10) -> pd.DataFrame:
    """Return the ``top_n`` materials most similar to ``material_code``.

    Args:
        material_code: Anchor material code to compare against.
        data_path: Path of the Excel workbook holding the master data.
        top_n: Maximum number of similar materials to return.

    Returns:
        Rows of the source data for the best-scoring candidates, joined with
        their ``distance``, ``similarity``, ``score`` and ``used_count``.
        Rows carrying the anchor's own code are excluded.

    Raises:
        ValueError: If no row shares the anchor's grouping attributes, which
            includes the case of an unknown material code.
    """
    active_cols = [
        'Material_Code', 'Material_Group', 'Base_Type', 'Moulding_Type',
        'Product_Type', 'components_Specifications', 'Legislation'
    ]
    df = pd.read_excel(data_path, usecols=active_cols)

    # Candidates are the rows sharing the anchor's grouping attributes.
    matches = match_by_material_code(df, material_code)
    if matches.empty:
        raise ValueError(f"No matches found for material code: {material_code}")

    matches_expanded = process_specifications(matches, material_code, df)

    # Look the anchor up in the frame that is actually scored.
    is_anchor = matches_expanded['Material_Code'] == material_code
    q_idx = matches_expanded.index[is_anchor][0]
    scores = gower_similarity(
        matches_expanded,
        query_idx=q_idx,
        boost='count',
        normalize=True,
        exclude_cols=['Material_Code', 'Legislation']
    )

    # The anchor always scores highest against itself; drop it (and duplicate
    # rows with the same code) so only real alternatives are returned.
    scores = scores.drop(index=matches_expanded.index[is_anchor])

    top_indices = scores.head(top_n).index
    similar_materials = df.loc[top_indices].copy()
    return similar_materials.join(scores[['distance', 'similarity', 'score', 'used_count']])
if __name__ == "__main__":
    # Example usage: python similarity_pipeline.py [DATA_FILE] [MATERIAL_CODE]
    # Both arguments are optional; the defaults below are the original
    # developer-machine values and will not exist elsewhere.
    import sys

    cli_args = sys.argv[1:]
    data_file = (
        cli_args[0]
        if len(cli_args) > 0
        else "/Users/aryanrajsaxena/Desktop/BarryC/data_analysis/data-files/Master Data - Part 1.xlsx"
    )
    material_code = cli_args[1] if len(cli_args) > 1 else "YYW-PN-G300297-E15"
    try:
        similar_materials = find_similar_materials(material_code, data_file)
        print(f"\nTop similar materials for {material_code}:")
        print(similar_materials[['Material_Code', 'Material_Group', 'similarity', 'score', 'used_count']])
    except FileNotFoundError:
        print(f"Error: Data file not found at {data_file}")
    except ValueError as e:
        print(f"Error: {str(e)}")
    except Exception as e:
        print(f"Unexpected error processing material {material_code}: {str(e)}")

utils.py ADDED Viewed

	@@ -0,0 +1,261 @@

+import pandas as pd
+import numpy as np
+import ast
+from typing import Optional, Iterable, Union
+def _parse_dict_cell(x):
+    if isinstance(x, dict):
+        return x
+    if pd.isna(x):
+        return {}
+    try:
+        return ast.literal_eval(str(x))
+    except Exception:
+        return {}
+def _flatten(d, parent=''):
+    out = {}
+    if not isinstance(d, dict):
+        return out
+    for k, v in d.items():
+        key = f"{parent}.{k}" if parent else f"{k}"
+        if isinstance(v, dict):
+            out.update(_flatten(v, key))
+        else:
+            # normalize list/tuple/set to a string so it's usable as a single value
+            if isinstance(v, (list, tuple, set)):
+                try:
+                    v = ";".join(map(str, v))
+                except Exception:
+                    v = str(v)
+            out[key] = v
+    return out
+def _strip_percent_to_float(df: pd.DataFrame) -> pd.DataFrame:
+    out = df.copy()
+    obj_cols = out.select_dtypes(include=['object']).columns
+    for c in obj_cols:
+        s = out[c]
+        has_pct = s.astype(str).str.contains('%', na=False)
+        if not has_pct.any():
+            continue
+        # strip %, commas, spaces; convert to numeric
+        cleaned = s.astype(str).str.replace('%', '', regex=False).str.replace(',', '', regex=False).str.strip()
+        out[c] = pd.to_numeric(cleaned, errors='coerce')
+    return out
def get_spec_keys_from_material(df, material_code, spec_col='components_Specifications'):
    """Return the flattened specification keys of one material.

    The first row whose ``Material_Code`` equals ``material_code`` is used;
    its ``spec_col`` cell is parsed and flattened, and the resulting keys are
    returned as a list. Raises ``IndexError`` when no row has that code.
    """
    matching_labels = df.index[df['Material_Code'] == material_code]
    raw_cell = df.loc[matching_labels[0], spec_col]
    flattened = _flatten(_parse_dict_cell(raw_cell))
    return list(flattened)
def match_by_material_code(df: pd.DataFrame, material_code, code_col='Material_Code'):
    """Select the rows that share the anchor material's grouping attributes.

    The grouping attributes are Material_Group, Base_Type, Moulding_Type and
    Product_Type. The reference values come from the first row whose
    ``code_col`` equals ``material_code``; a missing reference value matches
    only rows where that attribute is also missing.

    Returns a copy of the matching rows, or an empty frame with the same
    columns when the material code does not occur. Raises ``ValueError`` if
    ``code_col`` or a grouping column is absent from ``df``.
    """
    group_cols = ['Material_Group', 'Base_Type', 'Moulding_Type', 'Product_Type']
    absent = [name for name in [code_col] + group_cols if name not in df.columns]
    if absent:
        raise ValueError(f"Missing required columns: {absent}")

    anchor_rows = df.loc[df[code_col] == material_code, group_cols]
    if anchor_rows.empty:
        # Unknown material code: nothing to match against.
        return df.iloc[0:0].copy()
    anchor = anchor_rows.iloc[0]

    keep = pd.Series(True, index=df.index)
    for name in group_cols:
        wanted = anchor[name]
        if pd.isna(wanted):
            keep = keep & df[name].isna()
        else:
            keep = keep & df[name].eq(wanted)
    return df.loc[keep].copy()
def process_specifications(matches, material_code, df, spec_col='components_Specifications'):
    """Expand the specification dicts of ``matches`` into one column per key.

    Only the keys present on the reference material (``material_code``,
    looked up in ``df``) become columns; rows lacking a key get NaN there.

    Args:
        matches: Candidate rows, including the ``spec_col`` column.
        material_code: Reference material whose keys define the new columns.
        df: Full dataset in which the reference material is looked up.
        spec_col: Name of the column holding the specification dicts.

    Returns:
        ``matches`` without ``spec_col``, joined with the expanded columns,
        after percentage-style text columns have been converted to floats.
    """
    # Forward spec_col so a non-default column name is honoured for the
    # reference keys as well as for the candidate rows.
    spec_keys = get_spec_keys_from_material(df, material_code, spec_col)

    parsed = matches[spec_col].apply(_parse_dict_cell).apply(_flatten)
    spec_df = pd.DataFrame(
        [{k: d.get(k, np.nan) for k in spec_keys} for d in parsed],
        index=matches.index,
    )

    def _convert_numeric(col: pd.Series) -> pd.Series:
        # Best effort: leave a column untouched if any value is not a number.
        try:
            return pd.to_numeric(col)
        except (TypeError, ValueError):
            return col

    spec_df = spec_df.apply(_convert_numeric)

    matches_expanded = matches.drop(columns=[spec_col]).join(spec_df)
    return _strip_percent_to_float(matches_expanded)
+def gower_similarity(
+    df: pd.DataFrame,
+    query_idx,
+    weights: Optional[Union[dict, pd.Series]] = None,
+    boost: str = 'count',            # 'count' or 'weight'
+    normalize: bool = True,          # True -> final score kept in [0,1]
+    exclude_cols: Optional[Iterable[str]] = None
+) -> pd.DataFrame:
+    """
+    Weighted Gower-like similarity with anchor-centric missing value handling:
+    Case 1: Anchor NaN, candidate has value -> Column counts as used (15/15)
+    Case 2: Anchor has value, candidate NaN -> Column counts as not used (14/15)
+    Case 3: Both NaN -> Column counts as used (15/15)
+    Case 4: Both have values -> Standard distance calculation
+    """
+    # Defensive copy
+    X = df.copy()
+    # Drop excluded columns
+    if exclude_cols:
+        exclude = [c for c in exclude_cols if c in X.columns]
+        X = X.drop(columns=exclude)
+    cols = X.columns.tolist()
+    n = len(X)
+    if len(cols) == 0:
+        raise ValueError("No columns left after excluding columns.")
+    # split numeric / categorical
+    num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
+    cat_cols = [c for c in cols if c not in num_cols]
+    # build weight series (default 1.0)
+    if weights is None:
+        w = pd.Series(1.0, index=cols, dtype='float64')
+    else:
+        if isinstance(weights, pd.Series):
+            w = pd.Series(1.0, index=cols, dtype='float64')
+            for k, v in weights.items():
+                if k in w.index:
+                    w[k] = float(v)
+        elif isinstance(weights, dict):
+            w = pd.Series(1.0, index=cols, dtype='float64')
+            for k, v in weights.items():
+                if k in w.index:
+                    w[k] = float(v)
+        else:
+            raise TypeError("weights must be None, dict, or pd.Series")
+    # pick query row (by index label)
+    q = X.loc[query_idx]
+    # NUMERIC PART
+    if num_cols:
+        A = X[num_cols].to_numpy(dtype='float64')          # shape (n, m_num)
+        qA = q[num_cols].to_numpy(dtype='float64')         # shape (m_num,)
+        # Anchor-centric missing value handling
+        anchor_nan = np.isnan(qA)                          # True where anchor is NaN
+        data_nan = np.isnan(A)                             # True where data is NaN
+        # Cases 1 & 3: Anchor NaN and (candidate has value OR candidate NaN) -> count as used
+        # Case 2: Anchor has value, candidate NaN -> count as not used
+        # Case 4: Both have values -> standard comparison
+        used_num = (~anchor_nan & ~data_nan) | anchor_nan  # Case 4 OR (Case 1 & 3)
+        # For distance calculation, only use where both have values (Case 4)
+        valid_compare = ~anchor_nan & ~data_nan
+        # ranges robust to all-NaN columns:
+        col_max = np.nanmax(A, axis=0)
+        col_min = np.nanmin(A, axis=0)
+        ranges = col_max - col_min
+        ranges = np.where(np.isnan(ranges) | (ranges == 0), 1.0, ranges)
+        diff = np.abs(A - qA)                              # broadcast (n, m_num)
+        comp_num = diff / ranges                           # scaled numeric difference
+        comp_num[~valid_compare] = 0.0                     # zero distance for Case 1,2,3
+        w_num = w[num_cols].to_numpy(dtype='float64')
+        num_sum = (comp_num * w_num).sum(axis=1)
+        num_used_w = (used_num * w_num).sum(axis=1)       # weight sum reflects anchor-centric logic
+        num_used_cnt = used_num.sum(axis=1)               # count reflects anchor-centric logic
+    else:
+        num_sum = np.zeros(n, dtype='float64')
+        num_used_w = np.zeros(n, dtype='float64')
+        num_used_cnt = np.zeros(n, dtype='int64')
+    # CATEGORICAL PART
+    if cat_cols:
+        B = X[cat_cols].astype(object)
+        qB = q[cat_cols].astype(object)
+        # Anchor-centric missing value handling for categorical
+        anchor_miss = pd.isna(qB.values)                   # True where anchor is missing
+        data_miss = B.isna().values                        # True where data is missing
+        # Same logic as numeric part
+        used_cat = (~anchor_miss & ~data_miss) | anchor_miss
+        valid_compare = ~anchor_miss & ~data_miss
+        # equality check only where both have values
+        equal = (B.values == qB.values) & valid_compare
+        comp_cat = (~equal).astype('float64')             # 1.0 if different, 0.0 if same or any NaN
+        w_cat = w[cat_cols].to_numpy(dtype='float64')
+        cat_sum = (comp_cat * w_cat).sum(axis=1)
+        cat_used_w = (used_cat * w_cat).sum(axis=1)      # weight sum reflects anchor-centric logic
+        cat_used_cnt = used_cat.sum(axis=1)              # count reflects anchor-centric logic
+    else:
+        cat_sum = np.zeros(n, dtype='float64')
+        cat_used_w = np.zeros(n, dtype='float64')
+        cat_used_cnt = np.zeros(n, dtype='int64')
+    used_w = num_used_w + cat_used_w
+    used_cnt = num_used_cnt + cat_used_cnt
+    comp_sum = num_sum + cat_sum
+    # distance calculation (now safer since we zero-out invalid comparisons)
+    with np.errstate(invalid='ignore', divide='ignore'):
+        dist = comp_sum / used_w
+    dist = np.where(used_w == 0, np.nan, dist)    # no overlap -> NaN
+    dist = np.clip(dist, 0.0, 1.0)                # clamp to [0,1]
+    similarity = 1.0 - dist
+    # compute boost factor (now properly accounts for anchor-centric logic)
+    total_weight = w.sum()
+    total_count = len(cols)
+    if boost == 'weight':
+        if normalize:
+            factor = np.where(total_weight > 0, used_w / total_weight, 0.0)
+        else:
+            factor = used_w.copy()
+    else:  # 'count'
+        if normalize:
+            factor = used_cnt / total_count    # This now implements the 15/15, 14/15 logic
+        else:
+            factor = used_cnt.astype(float)
+    score = similarity * factor
+    out = pd.DataFrame({
+        'distance': dist,
+        'similarity': similarity,
+        'score': score,
+        'used_count': used_cnt,
+        'used_weight': used_w
+    }, index=X.index)
+    out = out.sort_values(['score', 'similarity'], ascending=[False, False])
+    return out