lhoestq HF staff committed on
Commit
d180662
•
1 Parent(s): d0ca54b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +6 -18
app.py CHANGED
@@ -8,11 +8,15 @@ import requests
8
  import pandas as pd
9
  from datasets import Features
10
  from gradio_huggingfacehub_search import HuggingfaceHubSearch
 
11
 
12
  from analyze import PresidioEntity, analyzer, get_column_description, get_columns_with_strings, mask, presidio_scan_entities
13
 
14
  MAX_ROWS = 100
15
  T = TypeVar("T")
 
 
 
16
  DEFAULT_PRESIDIO_ENTITIES = sorted([
17
  'PERSON',
18
  'CREDIT_CARD',
@@ -26,27 +30,11 @@ DEFAULT_PRESIDIO_ENTITIES = sorted([
26
  'IBAN_CODE',
27
  'EMAIL',
28
  ])
29
- WARNING_PRESIDIO_ENTITIES = sorted([
30
- 'PHONE_NUMBER',
31
- 'US_PASSPORT',
32
- 'EMAIL_ADDRESS',
33
- 'IP_ADDRESS',
34
- 'US_BANK_NUMBER',
35
- 'IBAN_CODE',
36
- 'EMAIL',
37
- ])
38
- ALERT_PRESIDIO_ENTITIES = sorted([
39
- 'CREDIT_CARD',
40
- 'US_SSN',
41
- 'US_PASSPORT',
42
- 'US_BANK_NUMBER',
43
- 'IBAN_CODE',
44
- ])
45
 
46
  def stream_rows(dataset: str, config: str, split: str) -> Iterable[dict[str, Any]]:
47
  batch_size = 100
48
  for i in count():
49
- rows_resp = requests.get(f"https://datasets-server.huggingface.co/rows?dataset={dataset}&config={config}&split={split}&offset={i * batch_size}&length={batch_size}", timeout=20).json()
50
  if "error" in rows_resp:
51
  raise RuntimeError(rows_resp["error"])
52
  if not rows_resp["rows"]:
@@ -75,7 +63,7 @@ def presidio_report(presidio_entities: list[PresidioEntity], next_row_idx: int,
75
 
76
 
77
  def analyze_dataset(dataset: str, enabled_presidio_entities: list[str] = DEFAULT_PRESIDIO_ENTITIES, show_texts_without_masks: bool = False) -> pd.DataFrame:
78
- info_resp = requests.get(f"https://datasets-server.huggingface.co/info?dataset={dataset}", timeout=3).json()
79
  if "error" in info_resp:
80
  yield "❌ " + info_resp["error"], pd.DataFrame()
81
  return
 
8
  import pandas as pd
9
  from datasets import Features
10
  from gradio_huggingfacehub_search import HuggingfaceHubSearch
11
+ from requests.adapters import HTTPAdapter, Retry
12
 
13
  from analyze import PresidioEntity, analyzer, get_column_description, get_columns_with_strings, mask, presidio_scan_entities
14
 
15
  MAX_ROWS = 100
16
  T = TypeVar("T")
17
+ session = requests.Session()
18
+ retries = Retry(total=5, backoff_factor=1, status_forcelist=[502, 503, 504])
19
+ session.mount('http://', HTTPAdapter(max_retries=retries))
20
  DEFAULT_PRESIDIO_ENTITIES = sorted([
21
  'PERSON',
22
  'CREDIT_CARD',
 
30
  'IBAN_CODE',
31
  'EMAIL',
32
  ])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
 
34
  def stream_rows(dataset: str, config: str, split: str) -> Iterable[dict[str, Any]]:
35
  batch_size = 100
36
  for i in count():
37
+ rows_resp = session.get(f"https://datasets-server.huggingface.co/rows?dataset={dataset}&config={config}&split={split}&offset={i * batch_size}&length={batch_size}", timeout=10).json()
38
  if "error" in rows_resp:
39
  raise RuntimeError(rows_resp["error"])
40
  if not rows_resp["rows"]:
 
63
 
64
 
65
  def analyze_dataset(dataset: str, enabled_presidio_entities: list[str] = DEFAULT_PRESIDIO_ENTITIES, show_texts_without_masks: bool = False) -> pd.DataFrame:
66
+ info_resp = session.get(f"https://datasets-server.huggingface.co/info?dataset={dataset}", timeout=3).json()
67
  if "error" in info_resp:
68
  yield "❌ " + info_resp["error"], pd.DataFrame()
69
  return