File size: 6,750 Bytes
5fe3652
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
"""Deduplication module for fabric-to-espanso."""
import logging
from typing import List, Dict, Any, Tuple, Set
import difflib
from qdrant_client import QdrantClient
from qdrant_client.http.models import Filter, PointIdsList

from .config import config
from .database import get_dense_vector_name, get_sparse_vector_name

logger = logging.getLogger('fabric_to_espanso')

def calculate_text_difference_percentage(text1: str, text2: str) -> float:
    """
    Compute how different two strings are, as a fraction.

    Args:
        text1: First text string
        text2: Second text string

    Returns:
        A float in [0.0, 1.0]: 0.0 means the strings are identical,
        1.0 means they share nothing in common.
    """
    # SequenceMatcher yields a similarity ratio in [0, 1];
    # the difference is simply its complement.
    matcher = difflib.SequenceMatcher(None, text1, text2)
    return 1.0 - matcher.ratio()

# TODO: Consider moving the vector similarity search functionality to database_query.py and import it here
# This would create a more structured codebase with search functionality centralized in one place
def find_duplicates(client: QdrantClient, collection_name: str = config.embedding.collection_name) -> List[Tuple[str, List[str]]]:
    """
    Find duplicate entries in the database based on semantic similarity and text difference.

    Two-step detection: a vector search narrows candidates to points with
    semantic similarity above SIMILARITY_THRESHOLD, then a textual diff keeps
    only candidates whose content differs by at most DIFFERENCE_THRESHOLD.

    Args:
        client: Initialized Qdrant client
        collection_name: Name of the collection to query

    Returns:
        List of tuples containing (kept_point_id, [duplicate_point_ids])
    """
    # Constants for duplicate detection
    SIMILARITY_THRESHOLD = 0.85  # Minimum semantic similarity to consider as potential duplicate
    DIFFERENCE_THRESHOLD = 0.1  # Maximum text difference (10%) to consider as duplicate

    # Get all points from the database
    all_points = client.scroll(
        collection_name=collection_name,
        with_vectors=True,  # Include vector data, else no vector will be available
        limit=10000  # Adjust based on expected file count
    )[0]

    logger.info(f"Checking {len(all_points)} entries for duplicates")

    # The dense vector name is a property of the collection, not of a point:
    # resolve it once instead of once per point (it may hit the server).
    dense_vector_name = get_dense_vector_name(client, collection_name)

    # Index content by point id once, so candidate lookups below are O(1)
    # instead of a linear scan over all points for every similar point.
    content_by_id: Dict[Any, str] = {
        p.id: p.payload.get('content', '') for p in all_points
    }

    # Track processed points to avoid redundant comparisons
    processed_points = set()
    # Store duplicates as (kept_id, [duplicate_ids])
    duplicates = []

    # For each point, find semantically similar points
    for point in all_points:
        if point.id in processed_points:
            continue

        point_id = point.id
        point_content = point.payload.get('content', '')
        logger.debug(f"Checking point {point_id} for duplicates")
        logger.debug(f"Content: {point_content}")

        # Skip if no content
        if not point_content:
            logger.debug(f"Skipping point {point_id} as it has no content")
            continue

        # Skip points without vector or without the required vector type
        if not point.vector or dense_vector_name not in point.vector:
            logger.debug(f"Skipping point {point_id} as it has no valid vector")
            continue

        # Find semantically similar points using Qdrant's search
        similar_points = client.search(
            collection_name=collection_name,
            query_vector=(dense_vector_name, point.vector.get(dense_vector_name)),
            limit=100,
            score_threshold=SIMILARITY_THRESHOLD  # Only consider points with similarity > threshold
        )

        # Skip the first result (which is the point itself)
        similar_points = [p for p in similar_points if p.id != point_id]

        if not similar_points:
            continue

        logger.debug(f"Found {len(similar_points)} semantically similar points for {point.payload.get('filename', 'unknown')}")

        # Check text difference for each similar point
        duplicate_ids = []
        for similar_point in similar_points:
            similar_id = similar_point.id

            # Skip if already processed
            if similar_id in processed_points:
                continue

            # Get content of similar point (empty/missing content is skipped)
            similar_content = content_by_id.get(similar_id)
            if not similar_content:
                continue

            # Calculate text difference percentage
            diff_percentage = calculate_text_difference_percentage(point_content, similar_content)

            # If difference is less than threshold, consider it a duplicate
            if diff_percentage <= DIFFERENCE_THRESHOLD:
                duplicate_ids.append(similar_id)
                processed_points.add(similar_id)
                logger.debug(f"Found duplicate: {similar_id} (diff: {diff_percentage:.2%})")

        if duplicate_ids:
            duplicates.append((point_id, duplicate_ids))
            processed_points.add(point_id)

    logger.info(f"Found {sum(len(dups) for _, dups in duplicates)} duplicate entries in {len(duplicates)} groups")
    return duplicates

def remove_duplicates(client: QdrantClient, collection_name: str = config.embedding.collection_name) -> int:
    """
    Remove duplicate entries from the database based on semantic similarity and text difference.

    Uses the two-step verification performed by find_duplicates():
    1. Find entries whose semantic similarity exceeds its SIMILARITY_THRESHOLD
       (vector search).
    2. Of those, keep as duplicates only entries whose text difference is at
       most its DIFFERENCE_THRESHOLD.
    For each duplicate group, the first point is kept and the rest are deleted.

    Args:
        client: Initialized Qdrant client
        collection_name: Name of the collection to query

    Returns:
        Number of removed duplicate entries
    """
    # Find duplicates
    duplicate_groups = find_duplicates(client, collection_name)

    if not duplicate_groups:
        logger.info("No duplicates found")
        return 0

    # Count total duplicates
    total_duplicates = sum(len(dups) for _, dups in duplicate_groups)

    # Remove duplicates (the kept id in each group is left untouched)
    for _, duplicate_ids in duplicate_groups:
        if duplicate_ids:
            client.delete(
                collection_name=collection_name,
                points_selector=PointIdsList(points=duplicate_ids)
            )

    logger.info(f"Removed {total_duplicates} duplicate entries from the database")
    return total_duplicates