"""Deduplication module for fabric-to-espanso."""
import difflib
import logging
from typing import List, Tuple

from qdrant_client import QdrantClient
from qdrant_client.http.models import PointIdsList

from .config import config
from .database import get_dense_vector_name
logger = logging.getLogger('fabric_to_espanso')


def calculate_text_difference_percentage(text1: str, text2: str) -> float:
"""
Calculate the percentage difference between two text strings.
Args:
text1: First text string
text2: Second text string
Returns:
Percentage difference as a float between 0.0 (identical) and 1.0 (completely different)
"""
# Use difflib's SequenceMatcher to calculate similarity ratio
similarity = difflib.SequenceMatcher(None, text1, text2).ratio()
# Convert similarity to difference percentage
difference_percentage = 1.0 - similarity
return difference_percentage
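
# A quick worked example of the function above: SequenceMatcher.ratio() returns
# 2*M/T, where M is the number of matching characters and T is the combined
# length of both strings. For "hello world" vs "hello worlds" that gives
# 2*11/23 ~ 0.957, so calculate_text_difference_percentage() reports roughly
# 0.043, about a 4% difference, well under the 10% duplicate threshold below.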


# TODO: Consider moving the vector similarity search functionality to database_query.py and import it here
# This would create a more structured codebase with search functionality centralized in one place
def find_duplicates(client: QdrantClient, collection_name: str = config.embedding.collection_name) -> List[Tuple[str, List[str]]]:
"""
Find duplicate entries in the database based on semantic similarity and text difference.
Args:
client: Initialized Qdrant client
collection_name: Name of the collection to query
Returns:
List of tuples containing (kept_point_id, [duplicate_point_ids])
"""
# Constants for duplicate detection
SIMILARITY_THRESHOLD = 0.85 # Minimum semantic similarity to consider as potential duplicate
DIFFERENCE_THRESHOLD = 0.1 # Maximum text difference (10%) to consider as duplicate
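    # With SequenceMatcher, a difference of 0.1 means the matching blocks must
    # cover at least 90% of the combined length of the two texts compared.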
# Get all points from the database
all_points = client.scroll(
collection_name=collection_name,
        with_vectors=True,  # Include vector data; required for the similarity search below
limit=10000 # Adjust based on expected file count
)[0]
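    # Note: scroll() returns a (points, next_page_offset) tuple, so only the
    # first page is read here. If the collection can outgrow the limit above,
    # keep calling scroll() with the returned offset until it is None.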
logger.info(f"Checking {len(all_points)} entries for duplicates")
    # Resolve the dense vector name once; it is identical for every point
    dense_vector_name = get_dense_vector_name(client, collection_name)
    # Index each point's content by id for constant-time lookup below
    content_by_id = {p.id: p.payload.get('content', '') for p in all_points}
    # Track processed points to avoid redundant comparisons
    processed_points = set()
    # Store duplicates as (kept_id, [duplicate_ids])
    duplicates = []
# For each point, find semantically similar points
    for point in all_points:
if point.id in processed_points:
continue
point_id = point.id
point_content = point.payload.get('content', '')
logger.debug(f"Checking point {point_id} for duplicates")
logger.debug(f"Content: {point_content}")
# Skip if no content
if not point_content:
logger.debug(f"Skipping point {point_id} as it has no content")
continue
# Skip points without vector or without the required vector type
if not point.vector or dense_vector_name not in point.vector:
logger.debug(f"Skipping point {point_id} as it has no valid vector")
continue
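        # query_vector accepts a (vector_name, vector) tuple, which targets the
        # named dense vector configured for this collection.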
# Find semantically similar points using Qdrant's search
similar_points = client.search(
collection_name=collection_name,
query_vector=(dense_vector_name, point.vector.get(dense_vector_name)),
limit=100,
            score_threshold=SIMILARITY_THRESHOLD  # Only return points scoring at or above the threshold
)
        # Exclude the point itself, which comes back as its own closest match
        similar_points = [p for p in similar_points if p.id != point_id]
if not similar_points:
continue
logger.debug(f"Found {len(similar_points)} semantically similar points for {point.payload.get('filename', 'unknown')}")
# Check text difference for each similar point
duplicate_ids = []
for similar_point in similar_points:
similar_id = similar_point.id
# Skip if already processed
if similar_id in processed_points:
continue
            # Get content of the similar point from the prebuilt index
            similar_content = content_by_id.get(similar_id, '')
            if not similar_content:
                continue
# Calculate text difference percentage
diff_percentage = calculate_text_difference_percentage(point_content, similar_content)
# If difference is less than threshold, consider it a duplicate
if diff_percentage <= DIFFERENCE_THRESHOLD:
duplicate_ids.append(similar_id)
processed_points.add(similar_id)
logger.debug(f"Found duplicate: {similar_id} (diff: {diff_percentage:.2%})")
if duplicate_ids:
duplicates.append((point_id, duplicate_ids))
processed_points.add(point_id)
logger.info(f"Found {sum(len(dups) for _, dups in duplicates)} duplicate entries in {len(duplicates)} groups")
return duplicates
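
# Example of the structure find_duplicates() returns, assuming three
# near-identical prompt files where the first point scanned is kept:
#   [("point-id-1", ["point-id-7", "point-id-42"])]
# The ids shown are placeholders; Qdrant point ids are typically UUIDs or ints.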


def remove_duplicates(client: QdrantClient, collection_name: str = config.embedding.collection_name) -> int:
    """
    Remove duplicate entries from the database based on semantic similarity and text difference.

    Uses the two-step check implemented in find_duplicates():
    1. Find entries with semantic similarity at or above SIMILARITY_THRESHOLD (0.85)
    2. Of those, treat as duplicates only entries whose text difference is at
       most DIFFERENCE_THRESHOLD (10%)

    Args:
        client: Initialized Qdrant client
        collection_name: Name of the collection to query

    Returns:
        Number of removed duplicate entries
    """
# Find duplicates
duplicate_groups = find_duplicates(client, collection_name)
if not duplicate_groups:
logger.info("No duplicates found")
return 0
# Count total duplicates
total_duplicates = sum(len(dups) for _, dups in duplicate_groups)
# Remove duplicates
for _, duplicate_ids in duplicate_groups:
if duplicate_ids:
client.delete(
collection_name=collection_name,
points_selector=PointIdsList(points=duplicate_ids)
)
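            # PointIdsList deletes by explicit id; the kept point of each group
            # is never listed, so one representative always survives.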
logger.info(f"Removed {total_duplicates} duplicate entries from the database")
return total_duplicates
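

# A minimal usage sketch, assuming a Qdrant instance reachable at the default
# local URL. In the real pipeline the client is constructed elsewhere from
# `config`, so the connection details below are illustrative only.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    client = QdrantClient(url="http://localhost:6333")
    removed = remove_duplicates(client)
    print(f"Removed {removed} duplicate entries")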