lhoestq HF staff committed on
Commit
f12f776
1 Parent(s): 8ffc0c7

add max text length

Browse files
Files changed (1) hide show
  1. analyze.py +2 -1
analyze.py CHANGED
@@ -10,6 +10,7 @@ from presidio_analyzer import AnalyzerEngine, BatchAnalyzerEngine, RecognizerRes
10
  Row = dict[str, Any]
11
  T = TypeVar("T")
12
  BATCH_SIZE = 1
 
13
  batch_analyzer: Optional[BatchAnalyzerEngine] = None
14
 
15
 
@@ -124,7 +125,7 @@ def presidio_scan_entities(
124
  if batch_analyzer is None:
125
  batch_analyser = BatchAnalyzerEngine(AnalyzerEngine())
126
  rows_with_scanned_columns_only = (
127
- {column_name: get_strings(row[column_name]) for column_name in scanned_columns} for row in rows
128
  )
129
  for indices, batch in batched(rows_with_scanned_columns_only, BATCH_SIZE, with_indices=True):
130
  yield from analyze(
 
10
  Row = dict[str, Any]
11
  T = TypeVar("T")
12
  BATCH_SIZE = 1
13
+ MAX_TEXT_LENGTH = 3000
14
  batch_analyzer: Optional[BatchAnalyzerEngine] = None
15
 
16
 
 
125
  if batch_analyzer is None:
126
  batch_analyser = BatchAnalyzerEngine(AnalyzerEngine())
127
  rows_with_scanned_columns_only = (
128
+ {column_name: get_strings(row[column_name])[:MAX_TEXT_LENGTH] for column_name in scanned_columns} for row in rows
129
  )
130
  for indices, batch in batched(rows_with_scanned_columns_only, BATCH_SIZE, with_indices=True):
131
  yield from analyze(