File size: 1,306 Bytes
3fcddc3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
"""Compute near duplicates for a dataset."""
from typing import Iterable, Optional, cast

from pydantic import Field as PydanticField
from typing_extensions import override

from ..schema import Field, Item, RichData, SignalInputType, field
from .minhash_dup import find_clusters
from .signal import TextSignal

# Key under which the cluster id is stored, both in the signal's field schema and in each
# item it yields.
CLUSTER_KEY = 'cluster_id'


class NearDuplicateSignal(TextSignal):
  """Find near duplicate documents in a dataset using n-grams.

  <br/>

  Documents are fingerprinted using n-grams with
  [minhash LSH](https://en.wikipedia.org/wiki/MinHash). Documents are assigned the same cluster id
  if their Jaccard similarity is above the provided threshold.
  """
  name = 'near_dup'
  display_name = 'Near duplicate documents'

  # Input and compute types are both declared as TEXT.
  input_type = SignalInputType.TEXT
  compute_type = SignalInputType.TEXT

  # Forwarded unchanged to `find_clusters` as its `threshold` keyword argument.
  threshold: float = PydanticField(
    description='The similarity threshold for detecting a near duplicate.',
    default=0.75,
  )

  @override
  def fields(self) -> Field:
    """Return the output schema: a categorical `uint32` field stored under `CLUSTER_KEY`."""
    cluster_id_field = field('uint32', categorical=True)
    return field(fields={CLUSTER_KEY: cluster_id_field})

  @override
  def compute(self, data: Iterable[RichData]) -> Iterable[Optional[Item]]:
    """Yield one `{CLUSTER_KEY: cluster_id}` item per id produced by `find_clusters`.

    Args:
      data: The inputs to cluster; they are handed to `find_clusters` typed as strings.

    Yields:
      A dict holding a single cluster id, in the order `find_clusters` produces them.
    """
    # The cast only narrows the static type for the type checker; nothing is converted at runtime.
    documents = cast(Iterable[str], data)
    yield from (
      {CLUSTER_KEY: found_id} for found_id in find_clusters(documents, threshold=self.threshold)
    )