lhoestq HF staff committed on
Commit
ede461a
1 Parent(s): 1d02824

again + ignore image/audio

Browse files
Files changed (1) hide show
  1. analyze.py +3 -0
analyze.py CHANGED
@@ -1,3 +1,4 @@
 
1
  from itertools import count, islice
2
  from typing import Any, Iterable, Literal, Optional, TypedDict, TypeVar, Union, overload
3
 
@@ -53,6 +54,8 @@ def get_strings(row_content: Any) -> str:
53
  if isinstance(row_content, str):
54
  return row_content
55
  if isinstance(row_content, dict):
 
 
56
  row_content = list(row_content.values())
57
  if isinstance(row_content, list):
58
  str_items = (get_strings(row_content_item) for row_content_item in row_content)
 
1
+ import re
2
  from itertools import count, islice
3
  from typing import Any, Iterable, Literal, Optional, TypedDict, TypeVar, Union, overload
4
 
 
54
  if isinstance(row_content, str):
55
  return row_content
56
  if isinstance(row_content, dict):
57
+ if "src" in row_content:
58
+ return "" # could be image or audio
59
  row_content = list(row_content.values())
60
  if isinstance(row_content, list):
61
  str_items = (get_strings(row_content_item) for row_content_item in row_content)