import re

try:
    from huggingface_hub import DatasetCard
except ImportError:
    # The text helpers below only need the standard library; only
    # try_load_text needs the hub client, so a missing install is reported
    # there instead of making the whole module unimportable.
    DatasetCard = None

# Template filler line, e.g. "[More Information Needed](https://...)".
_MORE_INFO_PATTERN = re.compile(r"\[More Information Needed\]\(https?://\S+\)")
# A complete single-line HTML comment, e.g. "<!-- Provide a summary. -->".
_HTML_COMMENT_PATTERN = re.compile(r"<!--.*?-->")


def _heading_level(line):
    """Return the number of leading ``#`` characters in *line*."""
    return len(line) - len(line.lstrip("#"))


def _drop_empty_trailing_sections(kept, min_level):
    """Remove headings at the end of *kept* that never received content.

    Looks past trailing blank lines. While the last non-blank line is a
    heading whose level is ``>= min_level``, that heading and the blank
    lines after it are deleted in place. Headings with a smaller level
    (parents of the section about to start) are left alone.
    """
    while True:
        last = len(kept) - 1
        while last >= 0 and not kept[last].strip():
            last -= 1
        if last < 0 or not kept[last].startswith("#"):
            return
        if _heading_level(kept[last]) < min_level:
            return
        del kept[last:]


def parse_markdown(markdown_text):
    """Strip template boilerplate from a markdown card.

    The text is processed line by line; a line starting with ``#`` is
    treated as a heading. The following are removed:

    * any line containing ``Table of Contents`` and every line after it up
      to (not including) the next heading;
    * lines consisting solely of single-line HTML comments;
    * a ``[More Information Needed](http...)`` placeholder line and the
      rest of its section, up to the next heading;
    * headings that end up with no content before the next heading of the
      same or a shallower level, or before the end of the text.

    Multi-line HTML comments and ``#`` lines inside fenced code blocks are
    not given special treatment.

    Args:
        markdown_text: The markdown source as a single string.

    Returns:
        The remaining lines joined with ``"\\n"``.
    """
    kept = []
    in_table_of_contents = False
    skip_section = False

    for line in markdown_text.split("\n"):
        if "Table of Contents" in line:
            in_table_of_contents = True
            continue

        is_heading = line.startswith("#")

        if in_table_of_contents:
            if not is_heading:
                continue
            in_table_of_contents = False

        if is_heading:
            # A new heading always ends placeholder skipping; sections it
            # closes that never received content are discarded first.
            skip_section = False
            _drop_empty_trailing_sections(kept, _heading_level(line))
            kept.append(line)
            continue

        if skip_section:
            continue

        stripped = line.strip()
        if _MORE_INFO_PATTERN.match(stripped):
            skip_section = True
            continue

        # Drop the line only when nothing but comments is on it, so real
        # text sharing a line with a comment is never lost.
        if "<!--" in stripped and not _HTML_COMMENT_PATTERN.sub("", stripped).strip():
            continue

        kept.append(line)

    _drop_empty_trailing_sections(kept, 1)
    return "\n".join(kept)


def is_empty_template(text):
    """Return True when *text* holds nothing but template placeholders.

    Every ``[More Information Needed]`` and ``[optional]`` marker is
    removed; the text counts as empty if only whitespace remains.
    """
    placeholders = [r"\[More Information Needed\]", r"\[optional\]"]
    for placeholder in placeholders:
        text = re.sub(placeholder, "", text)
    return not text.strip()


def try_load_text(row):
    """Best-effort extraction of the card text from *row*.

    Builds a ``DatasetCard`` from ``row["card"]`` and returns its ``text``
    attribute. Any exception raised while doing so is printed and ``None``
    is returned instead.

    Raises:
        ImportError: If ``huggingface_hub`` could not be imported. This is
            a setup problem rather than a per-row failure, so it is not
            swallowed.
    """
    if DatasetCard is None:
        raise ImportError("huggingface_hub is required to load dataset cards")
    try:
        return DatasetCard(row["card"]).text
    except Exception as e:
        print(e)
        return None