Hopsakee's picture
Upload folder using huggingface_hub
5fe3652 verified
"""Deduplication module for fabric-to-espanso."""
import logging
from typing import List, Dict, Any, Tuple, Set
import difflib
from qdrant_client import QdrantClient
from qdrant_client.http.models import Filter, PointIdsList
from .config import config
from .database import get_dense_vector_name, get_sparse_vector_name
logger = logging.getLogger('fabric_to_espanso')
def calculate_text_difference_percentage(text1: str, text2: str) -> float:
"""
Calculate the percentage difference between two text strings.
Args:
text1: First text string
text2: Second text string
Returns:
Percentage difference as a float between 0.0 (identical) and 1.0 (completely different)
"""
# Use difflib's SequenceMatcher to calculate similarity ratio
similarity = difflib.SequenceMatcher(None, text1, text2).ratio()
# Convert similarity to difference percentage
difference_percentage = 1.0 - similarity
return difference_percentage
# TODO: Consider moving the vector similarity search functionality to database_query.py and import it here
# This would create a more structured codebase with search functionality centralized in one place
def find_duplicates(client: QdrantClient, collection_name: str = config.embedding.collection_name) -> List[Tuple[str, List[str]]]:
"""
Find duplicate entries in the database based on semantic similarity and text difference.
Args:
client: Initialized Qdrant client
collection_name: Name of the collection to query
Returns:
List of tuples containing (kept_point_id, [duplicate_point_ids])
"""
# Constants for duplicate detection
SIMILARITY_THRESHOLD = 0.85 # Minimum semantic similarity to consider as potential duplicate
DIFFERENCE_THRESHOLD = 0.1 # Maximum text difference (10%) to consider as duplicate
# Get all points from the database
all_points = client.scroll(
collection_name=collection_name,
with_vectors=True, # Include vector data, else no vector will be available
limit=10000 # Adjust based on expected file count
)[0]
logger.info(f"Checking {len(all_points)} entries for duplicates")
# Track processed points to avoid redundant comparisons
processed_points = set()
# Store duplicates as (kept_id, [duplicate_ids])
duplicates = []
# For each point, find semantically similar points
for i, point in enumerate(all_points):
if point.id in processed_points:
continue
point_id = point.id
point_content = point.payload.get('content', '')
logger.debug(f"Checking point {point_id} for duplicates")
logger.debug(f"Content: {point_content}")
# Skip if no content
if not point_content:
logger.debug(f"Skipping point {point_id} as it has no content")
continue
# Get the actual vector names from the collection configuration
dense_vector_name = get_dense_vector_name(client, collection_name)
# Skip points without vector or without the required vector type
if not point.vector or dense_vector_name not in point.vector:
logger.debug(f"Skipping point {point_id} as it has no valid vector")
continue
# Find semantically similar points using Qdrant's search
similar_points = client.search(
collection_name=collection_name,
query_vector=(dense_vector_name, point.vector.get(dense_vector_name)),
limit=100,
score_threshold=SIMILARITY_THRESHOLD # Only consider points with similarity > threshold
)
# Skip the first result (which is the point itself)
similar_points = [p for p in similar_points if p.id != point_id]
if not similar_points:
continue
logger.debug(f"Found {len(similar_points)} semantically similar points for {point.payload.get('filename', 'unknown')}")
# Check text difference for each similar point
duplicate_ids = []
for similar_point in similar_points:
similar_id = similar_point.id
# Skip if already processed
if similar_id in processed_points:
continue
# Get content of similar point
similar_content = None
for p in all_points:
if p.id == similar_id:
similar_content = p.payload.get('content', '')
break
if not similar_content:
continue
# Calculate text difference percentage
diff_percentage = calculate_text_difference_percentage(point_content, similar_content)
# If difference is less than threshold, consider it a duplicate
if diff_percentage <= DIFFERENCE_THRESHOLD:
duplicate_ids.append(similar_id)
processed_points.add(similar_id)
logger.debug(f"Found duplicate: {similar_id} (diff: {diff_percentage:.2%})")
if duplicate_ids:
duplicates.append((point_id, duplicate_ids))
processed_points.add(point_id)
logger.info(f"Found {sum(len(dups) for _, dups in duplicates)} duplicate entries in {len(duplicates)} groups")
return duplicates
def remove_duplicates(client: QdrantClient, collection_name: str = config.embedding.collection_name) -> int:
"""
Remove duplicate entries from the database based on semantic similarity and text difference.
Uses a two-step verification process:
1. Find entries with semantic similarity > 0.9 (using vector search)
2. For those entries, keep only those with text difference <= 5%
Args:
client: Initialized Qdrant client
collection_name: Name of the collection to query
Returns:
Number of removed duplicate entries
"""
# Find duplicates
duplicate_groups = find_duplicates(client, collection_name)
if not duplicate_groups:
logger.info("No duplicates found")
return 0
# Count total duplicates
total_duplicates = sum(len(dups) for _, dups in duplicate_groups)
# Remove duplicates
for _, duplicate_ids in duplicate_groups:
if duplicate_ids:
client.delete(
collection_name=collection_name,
points_selector=PointIdsList(points=duplicate_ids)
)
logger.info(f"Removed {total_duplicates} duplicate entries from the database")
return total_duplicates