Spaces:
Sleeping
Sleeping
| import pandas as pd | |
| from typing import List, Tuple, Dict, Any | |
| from langchain_community.vectorstores import faiss | |
def multi_column(db: faiss.FAISS, df: pd.DataFrame, qc_pairs: Dict[str, str], threshold: float) -> List[Tuple[int, float, Dict[str, Any]]]:
    """Perform semantic search across multiple columns and return aggregated results.

    Each (column, query) pair is searched separately, restricted to documents
    whose ``column`` metadata equals that column. Hits scoring below
    ``threshold`` are dropped, and every surviving row's scores are averaged
    over the columns in which it matched.

    Args:
        db: FAISS vector database for search. Documents are read through
            their ``column`` (filter) and ``row`` (row id) metadata entries.
        df: Original DataFrame containing the data; ``row`` metadata values
            are used as labels for ``df.loc``.
        qc_pairs: Dictionary mapping columns to query fragments.
        threshold: Minimum score a hit must reach to be included.

    Returns:
        List of tuples (row_id, avg_score, row_dict), sorted by avg_score in
        descending order. Empty when there are no queries, the index is
        empty, or nothing reaches the threshold.

    Raises:
        KeyError: If a hit's ``row`` value is not a label in ``df``'s index.
    """
    if not qc_pairs:
        return []

    total = db.index.ntotal
    if total == 0:
        # Nothing indexed: avoid asking the index for zero neighbours.
        return []

    per_column_scores = []
    for column, query in qc_pairs.items():
        hits = db.similarity_search_with_score(
            query,
            k=total,
            filter={'column': column},
            # With a metadata filter the store fetches only `fetch_k`
            # candidates (default 20) *before* filtering, so k alone does not
            # widen the search. Fetch everything so each column sees all of
            # its own documents.
            fetch_k=total,
            # NOTE(review): the distance strategy is normally fixed when the
            # store is built; confirm this per-call argument has any effect.
            distance_strategy=faiss.DistanceStrategy.COSINE
        )
        # NOTE(review): `>=` treats larger scores as better; confirm the store
        # returns similarities rather than distances.
        score_map = {
            doc.metadata['row']: score
            for doc, score in hits
            if score >= threshold
        }
        per_column_scores.append(score_map)

    # Union of every row id that passed the threshold in at least one column.
    all_rows = set()
    for score_map in per_column_scores:
        all_rows.update(score_map.keys())

    results = []
    for rid in all_rows:
        # Average only over the columns where this row actually matched.
        scores = [score_map[rid] for score_map in per_column_scores if rid in score_map]
        avg_score = sum(scores) / len(scores)
        row_dict = df.loc[rid].to_dict()
        results.append((rid, avg_score, row_dict))

    results.sort(key=lambda x: x[1], reverse=True)
    return results