lhoestq (HF staff) committed
Commit 0d3784b
1 Parent(s): 09a5205

fix analyze bug

Files changed (1)
  1. analyze.py (+6 -8)
analyze.py CHANGED
@@ -72,15 +72,12 @@ def _simple_analyze_iterator_cache(
     score_threshold: float,
     cache: dict[str, list[RecognizerResult]],
 ) -> list[list[RecognizerResult]]:
-    print(cache)
-    print(texts)
     not_cached_results = iter(
         batch_analyzer.analyze_iterator(
             (text for text in texts if text not in cache), language=language, score_threshold=score_threshold
         )
     )
     results = [cache[text] if text in cache else next(not_cached_results) for text in texts]
-    print(results)
     # cache the last results
     cache.clear()
     cache.update(dict(zip(texts, results)))
@@ -103,19 +100,20 @@ def analyze(
     ]
     return [
         PresidioEntity(
-            text=mask(texts[i][recognizer_result.start : recognizer_result.end]),
+            text=mask(texts[i * len(scanned_columns) + j][recognizer_result.start : recognizer_result.end]),
             type=recognizer_result.entity_type,
             row_idx=row_idx,
             column_name=column_name,
         )
-        for i, row_idx, recognizer_results in zip(
+        for i, row_idx, recognizer_row_results in zip(
             count(),
             indices,
-            _simple_analyze_iterator_cache(batch_analyzer, texts, language="en", score_threshold=0.8, cache=cache),
+            batched(_simple_analyze_iterator_cache(batch_analyzer, texts, language="en", score_threshold=0.8, cache=cache), len(scanned_columns)),
         )
-        for column_name, columns_description, recognizer_result in zip(
-            scanned_columns, columns_descriptions, recognizer_results
+        for j, column_name, columns_description, recognizer_results in zip(
+            count(), scanned_columns, columns_descriptions, recognizer_row_results
         )
+        for recognizer_result in recognizer_results
         if recognizer_result.start >= len(f"The following is {columns_description} data:\n\n")
     ]
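The first hunk only removes leftover debugging print() calls; the helper itself is a small cache-and-merge pattern: only the texts missing from the cache are sent to the analyzer, the fresh results are consumed in order while rebuilding one result list per input text, and the cache is then reset to hold just the latest batch. Below is a minimal, self-contained sketch of that pattern; analyze_stub and cached_analyze are illustrative names standing in for Presidio's BatchAnalyzerEngine and the repository helper, not its actual code.

from typing import Callable

def analyze_stub(texts: list[str]) -> list[list[str]]:
    # Stand-in for batch_analyzer.analyze_iterator: pretend every word
    # longer than six characters is a detected "entity".
    return [[word for word in text.split() if len(word) > 6] for text in texts]

def cached_analyze(
    texts: list[str],
    cache: dict[str, list[str]],
    analyze: Callable[[list[str]], list[list[str]]] = analyze_stub,
) -> list[list[str]]:
    # Run the (expensive) analyzer only on texts that are not cached yet.
    not_cached_results = iter(analyze([text for text in texts if text not in cache]))
    # Rebuild one result list per input text, in the original order,
    # taking cached entries where available and fresh ones otherwise.
    results = [cache[text] if text in cache else next(not_cached_results) for text in texts]
    # Keep only the most recent batch in the cache, as the helper does.
    cache.clear()
    cache.update(dict(zip(texts, results)))
    return results

cache: dict[str, list[str]] = {}
print(cached_analyze(["short text", "an unnecessarily verbose sentence"], cache))
print(cached_analyze(["an unnecessarily verbose sentence"], cache))  # served from the cache

In the actual module the analyzer is Presidio's BatchAnalyzerEngine and the cached values are lists of RecognizerResult, but the ordering and cache-refresh logic are the same.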
 
 
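The second hunk is the actual fix. Judging from the diff, texts holds one prompt per (row, scanned column) pair in row-major order, each prefixed with "The following is {columns_description} data:\n\n" (hence the offset check at the end). The old comprehension indexed that flat list with the row counter alone and zipped each cell's result list against the column names, so entities were misattributed and matches beyond the first few per cell were dropped. The fix regroups the flat per-cell result lists into rows with itertools.batched (available since Python 3.12) and recovers the flat position as i * len(scanned_columns) + j. The sketch below illustrates the regrouping and the index arithmetic with made-up sample data, using the column name in place of the real columns_description; it is not the repository's code.

from itertools import batched, count

scanned_columns = ["name", "email"]
rows = [
    {"name": "Alice", "email": "alice@example.com"},
    {"name": "Bob", "email": "bob@example.com"},
]
# Flat, row-major list of prompts: one entry per (row, scanned column).
texts = [
    f"The following is {column} data:\n\n{row[column]}"
    for row in rows
    for column in scanned_columns
]
# One (fake) result list per cell, in the same flat order; the real code
# gets these from _simple_analyze_iterator_cache.
flat_results = [[text.upper()] for text in texts]

# batched() turns the flat per-cell results back into one chunk per row,
# so each chunk can be zipped against scanned_columns again.
for i, row_results in zip(count(), batched(flat_results, len(scanned_columns))):
    for j, (column, cell_results) in enumerate(zip(scanned_columns, row_results)):
        flat_index = i * len(scanned_columns) + j
        # The prompt at the recovered flat index is the one for this row and column.
        assert texts[flat_index].endswith(rows[i][column])
        print(i, column, len(cell_results))

The added "for recognizer_result in recognizer_results" level in the fixed comprehension then walks the individual matches inside each cell's result list, instead of zipping them against the column names as before.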