davanstrien HF staff committed on
Commit
1f7ca14
1 Parent(s): fe9092a

Add card_processing.py for markdown parsing and text loading

Browse files
Files changed (1) hide show
  1. card_processing.py +72 -0
card_processing.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+
3
+ from huggingface_hub import DatasetCard
4
+
5
+
6
def parse_markdown(markdown_text):
    """Strip template boilerplate from the markdown body of a card.

    The following are removed from the text:

    * the table of contents: any line containing ``Table of Contents`` and
      every line after it up to (not including) the next heading;
    * placeholder content: a line that starts with
      ``[More Information Needed](<http(s) url>)`` and the remainder of its
      section, i.e. every line up to the next heading;
    * lines that start with a single-line HTML comment (``<!-- ... -->``);
    * headings whose section ends up empty, i.e. no non-blank line is kept
      before the next heading of the same or a higher level, or before the
      end of the text.  A parent heading is kept when one of its
      sub-sections has content.

    A "heading" is any line that starts with ``#``.
    NOTE(review): this also matches ``#`` comment lines inside fenced code
    blocks; fences are not tracked here.

    Args:
        markdown_text: The markdown source as a single string.

    Returns:
        The remaining lines joined with newline characters.  Blank lines
        between kept lines are preserved.
    """
    more_info_pattern = re.compile(r"\[More Information Needed\]\(https?://\S+\)")
    html_comment_pattern = re.compile(r"<!--.*?-->")

    parsed_lines = []
    # Headings that have been seen but not yet followed by real content,
    # outermost first.  Each entry is (level, lines) where ``lines`` holds the
    # heading itself plus any blank lines that directly follow it, so they can
    # be emitted together or discarded together.
    pending_headings = []
    skip_section = False
    table_of_contents = False

    for line in markdown_text.split("\n"):
        if "Table of Contents" in line:
            table_of_contents = True
            continue

        is_heading = line.startswith("#")

        if table_of_contents:
            if not is_heading:
                continue
            table_of_contents = False

        if is_heading:
            # A new heading always ends a skipped placeholder section;
            # without this reset everything after the first placeholder
            # would be dropped.
            skip_section = False
            level = len(line) - len(line.lstrip("#"))
            # Buffered headings at the same or a deeper level closed without
            # any content, so they are discarded.
            while pending_headings and pending_headings[-1][0] >= level:
                pending_headings.pop()
            pending_headings.append((level, [line]))
            continue

        if skip_section:
            continue

        stripped = line.strip()

        if more_info_pattern.match(stripped):
            skip_section = True
            continue

        if html_comment_pattern.match(stripped):
            continue

        if not stripped:
            # Blank lines do not count as content: keep them attached to a
            # still-pending heading so they vanish if that heading does.
            if pending_headings:
                pending_headings[-1][1].append(line)
            else:
                parsed_lines.append(line)
            continue

        # Real content: every heading still pending now has a non-empty
        # section (directly or through a sub-section), so emit them first.
        for _, buffered_lines in pending_headings:
            parsed_lines.extend(buffered_lines)
        pending_headings.clear()
        parsed_lines.append(line)

    # Headings still pending here never received content and are dropped.
    return "\n".join(parsed_lines)
53
+
54
+
55
def is_empty_template(text):
    """Tell whether ``text`` holds nothing except template placeholders.

    Every ``[More Information Needed]`` occurrence is deleted first, then
    every ``[optional]`` occurrence (in that order), and the result is
    stripped of surrounding whitespace.

    Args:
        text: The card text to inspect.

    Returns:
        True when nothing is left after the removal, False otherwise.
    """
    remaining = text
    # Applied one after the other, so a placeholder that only appears once an
    # earlier one has been removed is still caught.
    for pattern in (r"\[More Information Needed\]", r"\[optional\]"):
        remaining = re.sub(pattern, "", remaining)
    return remaining.strip() == ""
65
+
66
+
67
def try_load_text(row):
    """Best-effort extraction of the card text stored under ``row["card"]``.

    Builds a ``DatasetCard`` from ``row["card"]`` and reads its ``text``
    attribute (presumably the card body without the YAML metadata block;
    confirm against the huggingface_hub documentation).

    Args:
        row: A mapping with a ``"card"`` entry holding the raw card content.

    Returns:
        The card's ``text``, or None when anything goes wrong; in that case
        the exception is printed rather than raised.
    """
    try:
        card = DatasetCard(row["card"])
        text = card.text
    except Exception as error:
        # Deliberately broad: one malformed card must not abort the caller.
        print(error)
        text = None
    return text