# dataset-tldr
#
# Paused
#
# File size: 1,948 Bytes
#
# 1f7ca14

import re

from huggingface_hub import DatasetCard


def parse_markdown(markdown_text: str) -> str:
    """Strip template boilerplate from card markdown, keeping real content.

    The text is processed line by line. The following are removed:

    * the table of contents: any line containing ``"Table of Contents"`` and
      every line after it up to (not including) the next heading;
    * blank lines;
    * lines that start with a single-line HTML comment (``<!-- ... -->``);
    * placeholder lines that start with
      ``[More Information Needed](http...)``, together with the remainder of
      the section they appear in (up to the next heading);
    * headings whose section, including its subsections, ends up with no
      kept content.

    A heading is any line that starts with ``#``; its level is the number of
    leading ``#`` characters. Note that this simple test also treats lines
    starting with ``#`` inside fenced code blocks as headings.

    Args:
        markdown_text: The markdown source to clean.

    Returns:
        The kept lines joined with ``"\\n"`` (an empty string if nothing
        is kept).
    """
    more_info_pattern = re.compile(r"\[More Information Needed\]\(https?://\S+\)")
    html_comment_pattern = re.compile(r"<!--.*?-->")

    parsed_lines = []
    # Headings seen but not yet emitted, as (level, line) pairs. They are
    # written out only once a content line shows their section is non-empty.
    pending_headings = []
    skip_section = False
    table_of_contents = False

    for line in markdown_text.split("\n"):
        is_heading = line.startswith("#")

        if "Table of Contents" in line:
            table_of_contents = True
            continue
        if table_of_contents:
            if not is_heading:
                continue
            # The next heading ends the table of contents and is processed
            # like any other heading below.
            table_of_contents = False

        if is_heading:
            # A new heading always ends placeholder-skipping; previously this
            # reset was unreachable, so everything after the first
            # placeholder was lost.
            skip_section = False
            level = len(line) - len(line.lstrip("#"))
            # Pending headings at the same or a deeper level belong to
            # sections that closed without content: discard them.
            while pending_headings and pending_headings[-1][0] >= level:
                pending_headings.pop()
            pending_headings.append((level, line))
            continue

        if skip_section:
            continue

        stripped = line.strip()
        if not stripped:
            continue

        if more_info_pattern.match(stripped):
            skip_section = True
            # If the current section has produced no content yet, its heading
            # is the most recent pending one; drop it with the placeholder.
            if pending_headings:
                pending_headings.pop()
            continue

        if html_comment_pattern.match(stripped):
            continue

        # Real content: emit the headings leading to it, then the line itself.
        parsed_lines.extend(heading for _, heading in pending_headings)
        pending_headings.clear()
        parsed_lines.append(line)

    # Headings still pending here never received content and are dropped.
    return "\n".join(parsed_lines)


def is_empty_template(text):
    """Report whether ``text`` consists solely of template placeholders.

    Every literal ``[More Information Needed]`` and ``[optional]`` is removed
    (in that order, one pattern after the other); the text counts as an empty
    template when only whitespace is left.

    Returns:
        ``True`` if nothing but whitespace remains, ``False`` otherwise.
    """
    placeholder_patterns = (r"\[More Information Needed\]", r"\[optional\]")
    remainder = text
    # Applied sequentially so a later pattern sees the result of earlier ones.
    for pattern in placeholder_patterns:
        remainder = re.sub(pattern, "", remainder)
    return len(remainder.strip()) == 0


def try_load_text(row):
    """Best-effort extraction of the card text from ``row``.

    Builds a ``DatasetCard`` from ``row["card"]`` and returns its ``text``
    attribute. If anything in that process raises an ``Exception``, the
    exception is printed and ``None`` is returned instead of propagating.
    """
    try:
        card = DatasetCard(row["card"])
        return card.text
    except Exception as error:
        # Deliberately broad: a bad row is reported and skipped, not fatal.
        print(error)
    return None