Spaces:
Paused
Paused
File size: 1,948 Bytes
1f7ca14 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 |
import re
from huggingface_hub import DatasetCard
def parse_markdown(markdown_text):
    """Strip template boilerplate from markdown text, line by line.

    A heading is any line that starts with ``#``. The following are removed:

    * the table of contents: every line containing ``Table of Contents`` and
      all lines after it up to (not including) the next heading;
    * headings that occur before any line has been kept;
    * placeholder sections: a line that starts with a
      ``[More Information Needed](<http(s) url>)`` link and every following
      line up to the next heading; when the last kept line is that section's
      heading, the heading is removed too;
    * lines that start with a complete ``<!-- ... -->`` HTML comment;
    * blank lines.

    Args:
        markdown_text: Markdown source as one string.

    Returns:
        The kept lines joined with newline characters.
    """
    more_info_pattern = re.compile(r"\[More Information Needed\]\(https?://\S+\)")
    html_comment_pattern = re.compile(r"<!--.*?-->")

    parsed_lines = []
    skip_section = False
    table_of_contents = False

    for line in markdown_text.split("\n"):
        if "Table of Contents" in line:
            table_of_contents = True
            continue

        is_heading = line.startswith("#")

        if table_of_contents:
            if not is_heading:
                continue
            # The first heading after the table of contents ends it.
            table_of_contents = False

        if is_heading:
            if skip_section:
                # A placeholder section ends at the next heading. The flag
                # must be cleared here, before any early exit; otherwise it
                # stays set and the rest of the document is discarded.
                # Only the single heading directly above the placeholder is
                # dropped so that earlier headings are left alone.
                if parsed_lines and parsed_lines[-1].startswith("#"):
                    parsed_lines.pop()
                skip_section = False
            if not parsed_lines:
                # NOTE(review): headings are dropped until the first content
                # line has been kept, so a leading title never reaches the
                # output. Kept as-is; confirm this is intended.
                continue
        elif skip_section:
            continue

        stripped = line.strip()
        if more_info_pattern.match(stripped):
            skip_section = True
            continue
        if html_comment_pattern.match(stripped):
            continue
        if stripped:
            # NOTE(review): a kept heading counts as content, so a heading
            # followed directly by another heading is not removed.
            parsed_lines.append(line)

    if skip_section:
        # The document ended inside a placeholder section: remove the
        # headings that lead to nothing.
        while parsed_lines and parsed_lines[-1].startswith("#"):
            parsed_lines.pop()

    return "\n".join(parsed_lines)
def is_empty_template(text):
    """Report whether *text* holds nothing but template placeholders.

    All occurrences of ``[More Information Needed]`` are removed first, then
    all occurrences of ``[optional]``. The text counts as an empty template
    when only whitespace is left.

    Args:
        text: The string to inspect.

    Returns:
        True if nothing but whitespace remains after removing the
        placeholders, otherwise False.
    """
    remainder = text
    # Patterns are applied one after the other, in this order.
    for pattern in (r"\[More Information Needed\]", r"\[optional\]"):
        remainder = re.sub(pattern, "", remainder)
    return not remainder.strip()
def try_load_text(row):
    """Return the text of the card stored under ``row["card"]``.

    ``row["card"]`` is passed to ``DatasetCard`` and the resulting object's
    ``text`` attribute is returned. This is best-effort: any exception raised
    along the way is printed and ``None`` is returned instead.

    Args:
        row: Mapping with a ``"card"`` entry.

    Returns:
        The card text, or None if loading failed.
    """
    try:
        card_text = DatasetCard(row["card"]).text
    except Exception as error:
        print(error)
        card_text = None
    return card_text
|