# dataset-tldr / card_processing.py
# davanstrien (HF staff)
# Add card_processing.py for markdown parsing and text loading
# 1f7ca14
import re
from huggingface_hub import DatasetCard
def _drop_dangling_headings(kept_lines):
    """Remove trailing headings, and the blank lines after them, in place.

    A heading is "dangling" when only blank lines follow it in *kept_lines*,
    i.e. every line of its section body was filtered out.
    """
    while True:
        end = len(kept_lines)
        # Look past trailing blank lines without removing them yet: they are
        # only discarded when a heading turns out to sit right before them.
        while end and not kept_lines[end - 1].strip():
            end -= 1
        if not end or not kept_lines[end - 1].startswith("#"):
            return
        del kept_lines[end - 1:]


def parse_markdown(markdown_text):
    """Strip boilerplate from a dataset card's markdown body.

    The text is processed line by line; lines starting with ``#`` are
    treated as section headings. The following is removed:

    * any line containing ``Table of Contents`` and everything after it up
      to (not including) the next heading;
    * a ``[More Information Needed](<url>)`` placeholder line and the rest
      of its section, up to the next heading. If that leaves the section's
      heading with nothing but blank lines under it, the heading is removed
      as well;
    * lines that start with an HTML comment (``<!-- ... -->``);
    * headings that appear before the first kept content line.

    Args:
        markdown_text: The markdown source as a single string.

    Returns:
        The remaining lines joined with ``"\\n"``.
    """
    more_info_pattern = re.compile(r"\[More Information Needed\]\(https?://\S+\)")
    html_comment_pattern = re.compile(r"<!--.*?-->")

    parsed_lines = []
    skip_section = False
    # True until the first non-blank content line (or a heading that follows
    # a skipped section) has been kept, and again after a placeholder line.
    empty_section = True
    table_of_contents = False

    for line in markdown_text.split("\n"):
        # NOTE: this is a substring test, so any line mentioning the phrase
        # starts table-of-contents skipping, not only a heading.
        if "Table of Contents" in line:
            table_of_contents = True
            continue

        is_heading = line.startswith("#")

        if table_of_contents:
            if not is_heading:
                continue
            table_of_contents = False

        if is_heading:
            if skip_section:
                # A new heading ends the skipped section. Previously the
                # heading itself was discarded here and skip_section was
                # never cleared, so the remainder of the card was lost.
                skip_section = False
                _drop_dangling_headings(parsed_lines)
            elif empty_section:
                # Nothing has been kept yet: drop leading headings.
                continue
            parsed_lines.append(line)
            empty_section = False
            continue

        if skip_section:
            continue

        stripped = line.strip()
        if more_info_pattern.match(stripped):
            skip_section = True
            empty_section = True
            continue
        # NOTE: match() anchors at the start of the line, so a line that
        # begins with a comment is dropped entirely, trailing text included.
        if html_comment_pattern.match(stripped):
            continue
        if stripped:
            empty_section = False
        parsed_lines.append(line)

    if skip_section or empty_section:
        _drop_dangling_headings(parsed_lines)
    return "\n".join(parsed_lines)
def is_empty_template(text):
    """Return True when *text* holds nothing except template placeholders.

    Every ``[More Information Needed]`` and ``[optional]`` marker is removed
    (in that order); the text counts as empty if only whitespace remains.

    Args:
        text: The card text to inspect. ``None`` is treated as empty instead
            of raising ``TypeError`` inside ``re.sub``.

    Returns:
        True if nothing but placeholders and whitespace is present.
    """
    if text is None:
        return True

    # Regex patterns for the placeholder phrases; brackets are escaped so
    # they match literally.
    placeholders = (r"\[More Information Needed\]", r"\[optional\]")
    for placeholder in placeholders:
        text = re.sub(placeholder, "", text)

    # Only whitespace left means the card was an unfilled template.
    return not text.strip()
def try_load_text(row):
    """Best-effort extraction of the card text from a dataset row.

    Builds a ``DatasetCard`` from ``row["card"]`` and returns its ``text``
    attribute. Any exception raised along the way is printed and ``None``
    is returned instead of propagating the error.
    """
    try:
        card = DatasetCard(row["card"])
        card_text = card.text
    except Exception as error:
        print(error)
        return None
    return card_text