# Scraped page banner -- Spaces: Paused
import re

from huggingface_hub import DatasetCard
# Compiled once at import time; both are applied to whitespace-stripped lines.
_MORE_INFO_PATTERN = re.compile(r"\[More Information Needed\]\(https?://\S+\)")
_HTML_COMMENT_PATTERN = re.compile(r"<!--.*?-->")


def _heading_level(line):
    """Return the number of leading ``#`` characters in *line*."""
    return len(line) - len(line.lstrip("#"))


def _drop_empty_headings(parsed_lines, min_level):
    """Remove trailing content-free headings of depth >= *min_level*, in place.

    Walks backwards over blank lines and headings at the end of
    *parsed_lines*. Everything from the earliest removable heading onward is
    deleted; blank lines that precede it are kept so spacing after the last
    real content is not disturbed. Headings shallower than *min_level* stop
    the walk, so a parent heading survives when only a subsection is empty.
    """
    cut = len(parsed_lines)
    index = cut
    while index > 0:
        candidate = parsed_lines[index - 1]
        if not candidate.strip():
            # Blank line: step over it, but only delete it if a heading
            # further back turns out to be removable.
            index -= 1
        elif candidate.startswith("#") and _heading_level(candidate) >= min_level:
            index -= 1
            cut = index
        else:
            break
    del parsed_lines[cut:]


def parse_markdown(markdown_text: str) -> str:
    """Strip template boilerplate from a markdown card and return what is left.

    The text is processed line by line (split on ``"\\n"``):

    * A line containing ``"Table of Contents"`` and every following line up
      to the next heading are dropped.
    * Lines that start (after stripping) with a single-line HTML comment
      ``<!-- ... -->`` are dropped.
    * A line that starts (after stripping) with
      ``[More Information Needed](<http url>)`` marks its section as a
      placeholder: that line and the rest of the section, up to the next
      heading, are dropped.
    * A heading whose section ends up with no content (empty, or only a
      placeholder) is removed, together with the blank lines after it.

    A "heading" is any line that starts with ``#``; fenced code blocks are
    not recognised, so a ``#`` comment inside one is treated as a heading.

    Args:
        markdown_text: Markdown source of the card.

    Returns:
        The remaining lines joined with ``"\\n"``.
    """
    parsed_lines = []
    skip_section = False       # inside a section flagged by a placeholder line
    empty_section = True       # no real content seen since the last heading
    table_of_contents = False  # inside the table-of-contents block

    for line in markdown_text.split("\n"):
        is_heading = line.startswith("#")

        if "Table of Contents" in line:
            table_of_contents = True
            continue
        if table_of_contents:
            if not is_heading:
                continue
            table_of_contents = False

        if is_heading:
            # The section that just ended had nothing worth keeping: remove
            # its heading (and empty siblings/children), but not the parents
            # of the heading we are about to add.
            if skip_section or empty_section:
                _drop_empty_headings(parsed_lines, _heading_level(line))
            # A heading always ends a skipped section and starts a fresh one.
            skip_section = False
            empty_section = True
            parsed_lines.append(line)
            continue

        if skip_section:
            continue

        stripped = line.strip()
        if _MORE_INFO_PATTERN.match(stripped):
            skip_section = True
            continue
        if _HTML_COMMENT_PATTERN.match(stripped):
            continue
        if stripped:
            empty_section = False
        parsed_lines.append(line)

    # The final section has no following heading to trigger the clean-up.
    if skip_section or empty_section:
        _drop_empty_headings(parsed_lines, 0)

    return "\n".join(parsed_lines)
def is_empty_template(text):
    """Return ``True`` when *text* holds nothing but template placeholders.

    Every occurrence of ``[More Information Needed]`` and ``[optional]`` is
    removed (in that order); the text counts as empty if only whitespace
    remains.

    Args:
        text: Card text to inspect. ``None`` is treated as empty.

    Returns:
        ``True`` if no content other than placeholders and whitespace is
        present, ``False`` otherwise.
    """
    if text is None:
        return True
    # The placeholders are fixed literals, so plain replacement is enough.
    for placeholder in ("[More Information Needed]", "[optional]"):
        text = text.replace(placeholder, "")
    return not text.strip()
def try_load_text(row):
    """Return the ``text`` of the dataset card stored under ``row["card"]``.

    This is a best-effort loader: any exception raised while reading the
    ``"card"`` key or building the ``DatasetCard`` is printed and swallowed.

    Args:
        row: Mapping with a ``"card"`` entry that is passed to
            ``DatasetCard``.

    Returns:
        The ``text`` attribute of the constructed ``DatasetCard``, or
        ``None`` if anything went wrong.
    """
    try:
        return DatasetCard(row["card"]).text
    except Exception as e:  # deliberate catch-all: one bad row must not abort a batch
        print(e)
        return None