Omar Solano committed on
Commit f798896 • 1 Parent(s): 8c29627

add scraping hf scripts

.gitignore CHANGED
@@ -165,6 +165,8 @@ notebooks/mini-llama-articles/
  scripts/ai-tutor-db
  .huggingface

+ .DS_Store
+
  *.csv
  *.json
  *.jsonl
data/scraping/huggingface_docs/parse_hf_html.py ADDED
@@ -0,0 +1,163 @@
import io
import json
import os
from pathlib import Path
from urllib.parse import urljoin

import pandas as pd
from bs4 import BeautifulSoup
from tqdm import tqdm


class HuggingfaceParser:
    def __init__(self, html, url):
        self.soup = BeautifulSoup(html, "html.parser")
        self.url = url

    def find_sections(self):
        sections = []
        main_content = self.soup.find("article", class_="md-content__inner")
        if not main_content:
            main_content = self.soup.find(
                "div", class_="main-container"
            )  # Look for main container
        if not main_content:
            main_content = self.soup.find(
                "body"
            )  # Fallback to body if nothing else found

        if not main_content:
            print(f"Error: No main content found for {self.url}")
            return sections

        # Try to find headers
        headers = main_content.find_all(["h1", "h2", "h3", "h4", "h5", "h6"])

        if not headers:
            # If no headers, look for other structural elements
            headers = main_content.find_all(
                ["div", "p"], class_=["docstring", "section"]
            )

        if not headers:
            print(f"Warning: No headers or sections found in {self.url}")
            # If still no headers, treat the whole content as one section
            title = self.soup.title.string if self.soup.title else "Untitled"
            sections.append(
                {
                    "name": title,
                    "url": self.url,
                    "content": main_content.get_text(strip=True),
                    "level": 1,
                }
            )
            return sections

        for i, header in enumerate(headers):
            name = header.text.strip()
            header_id = header.get("id", "")
            if header_id:
                section_url = f"{self.url}#{header_id}"
            else:
                section_url = self.url

            content = self.extract_content(
                header, headers[i + 1] if i + 1 < len(headers) else None
            )
            sections.append(
                {
                    "name": name,
                    "url": section_url,
                    "content": content,
                    "level": self.get_header_level(header),
                }
            )

        return sections

    def extract_content(self, start_tag, end_tag):
        content = []
        current = start_tag.next_sibling
        while current and current != end_tag:
            if isinstance(current, str):
                content.append(current.strip())
            elif current.name == "table":
                table_html = io.StringIO(str(current))
                content.append(
                    pd.read_html(table_html)[0].to_markdown(
                        index=False, tablefmt="github"
                    )
                )
            elif current.name not in ["script", "style"]:
                content.append(current.get_text(strip=True, separator=" "))
            current = current.next_sibling
        return "\n".join(filter(None, content))

    def get_header_level(self, tag):
        if tag.name in ["h1", "h2", "h3", "h4", "h5", "h6"]:
            return int(tag.name[1])
        elif "class" in tag.attrs:
            if "docstring" in tag["class"]:
                return 1
            elif "section" in tag["class"]:
                return 2
        return 1  # Default level


def is_likely_html_file(file_path):
    excluded_extensions = {".css", ".js", ".png", ".jpg", ".jpeg", ".gif", ".svg"}
    return file_path.suffix == "" or file_path.suffix.lower() not in excluded_extensions


def parse_saved_html_files(html_dir, base_url):
    all_sections = []
    html_files = [
        f for f in Path(html_dir).rglob("*") if f.is_file() and is_likely_html_file(f)
    ]
    print(f"Found {len(html_files)} HTML files")

    for html_file in tqdm(html_files, desc="Parsing HTML files"):
        try:
            with open(html_file, "r", encoding="utf-8") as file:
                html_content = file.read()

            relative_path = html_file.relative_to(html_dir)
            url = urljoin(base_url, str(relative_path).replace(os.path.sep, "/"))

            parser = HuggingfaceParser(html_content, url)
            sections = parser.find_sections()

            if not sections:
                print(f"Warning: No sections found in {html_file}")
                # exit(0)
                # break
            all_sections.extend(sections)
        except Exception as e:
            print(f"Error parsing {html_file}: {str(e)}")
            # exit(0)

    return all_sections


def save_to_jsonl(data, output_file):
    with open(output_file, "w", encoding="utf-8") as f:
        for item in data:
            json.dump(item, f, ensure_ascii=False)
            f.write("\n")


def main():
    # html_dir = "huggingface_docs"  # Directory where HTML files are saved
    html_dir = "transformers_docs_v4.42.0"  # Directory where HTML files are saved
    base_url = "https://huggingface.co/docs/transformers/"
    output_file = "hf_transformers_v4_42_0.jsonl"

    all_sections = parse_saved_html_files(html_dir, base_url)
    save_to_jsonl(all_sections, output_file)

    print(f"Parsed content saved to {output_file}")
    print(f"Total sections parsed: {len(all_sections)}")


if __name__ == "__main__":
    main()
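
For reference, a minimal sketch of how HuggingfaceParser can be exercised on an inline snippet; the HTML, URL, and import path here are illustrative assumptions, not part of the commit. Each returned dict has the same name/url/content/level keys that save_to_jsonl writes out, one JSON object per line.

from parse_hf_html import HuggingfaceParser

# Illustrative HTML using the "main-container" fallback the parser looks for.
html = (
    "<html><head><title>Demo</title></head><body>"
    '<div class="main-container">'
    '<h1 id="intro">Intro</h1><p>Hello docs.</p>'
    '<h2 id="usage">Usage</h2><p>More text.</p>'
    "</div></body></html>"
)

sections = HuggingfaceParser(
    html, "https://huggingface.co/docs/transformers/demo"
).find_sections()
for section in sections:
    print(section["level"], section["name"], section["url"])
# Each section matches the records written to the .jsonl output, e.g.:
# {"name": "Intro", "url": "https://huggingface.co/docs/transformers/demo#intro",
#  "content": "Hello docs.", "level": 1}
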
data/scraping/huggingface_docs/scrape_hf_docs_from_repo.py ADDED
@@ -0,0 +1,57 @@
import os

import requests

# GitHub repository information
owner = "huggingface"

# repo = "peft"
# path = "docs/source"

repo = "transformers"
path = "docs/source/en"

# GitHub API endpoint for the repository contents
api_url = f"https://api.github.com/repos/{owner}/{repo}/contents/{path}"


def get_files_in_directory(api_url):
    response = requests.get(api_url)
    if response.status_code == 200:
        return response.json()
    else:
        print(f"Failed to fetch directory contents: {response.status_code}")
        return []


def download_file(file_url, file_path):
    response = requests.get(file_url)
    if response.status_code == 200:
        with open(file_path, "wb") as file:
            file.write(response.content)
    else:
        print(f"Failed to download file: {response.status_code}")


def fetch_md_files(api_url, local_dir):
    files = get_files_in_directory(api_url)
    for file in files:
        if file["type"] == "file" and file["name"].endswith(".md"):
            file_url = file["download_url"]
            file_path = os.path.join(local_dir, file["name"])
            print(f'Downloading {file["name"]}...')
            download_file(file_url, file_path)
        elif file["type"] == "dir":
            subdir = os.path.join(local_dir, file["name"])
            os.makedirs(subdir, exist_ok=True)
            fetch_md_files(file["url"], subdir)


# Local directory to save the files
local_dir = f"data/{repo}_docs"
os.makedirs(local_dir, exist_ok=True)

# Start fetching files
fetch_md_files(api_url, local_dir)

print("All files have been downloaded.")
data/scraping/huggingface_docs/scrape_hf_docs_from_web.py ADDED
@@ -0,0 +1,133 @@
import logging
from pathlib import Path
from urllib.parse import unquote, urljoin, urlparse

import scrapy
from scrapy.crawler import CrawlerProcess
from tqdm import tqdm

logging.basicConfig(format="%(levelname)s: %(message)s", level=logging.INFO)


def is_valid_url(url, domain, base_path):
    parsed = urlparse(url)
    return (
        parsed.scheme in ["http", "https"]
        and parsed.netloc == domain
        and parsed.path.startswith(base_path)
        and "#" not in url
    )  # Exclude URLs with fragments


def clean_url(url):
    # Replace &amp; with &, and &num; with #
    url = url.replace("&amp;", "&").replace("&num;", "#")
    # Decode URL-encoded characters
    return unquote(url)


class DocsSpider(scrapy.Spider):
    name = "docs"

    def __init__(
        self,
        homepage_url: str,
        domain: str,
        base_path: str,
        save_dir="outputs/",
        target_version=None,
        *args,
        **kwargs,
    ):
        super(DocsSpider, self).__init__(*args, **kwargs)
        self.homepage_url = homepage_url
        self.domain = domain
        self.base_path = base_path
        self.allowed_domains = [domain]
        self.start_urls = [self.homepage_url]
        self.base_dir = Path(save_dir)
        self.target_version = target_version
        self.pages = []
        self.progress_bar = None

    def start_requests(self):
        self.progress_bar = tqdm(desc="Crawling pages", unit="page")
        yield scrapy.Request(self.homepage_url, self.parse)

    def parse(self, response):
        if not is_valid_url(response.url, self.domain, self.base_path):
            return

        parsed_uri = urlparse(response.url)
        relative_path = parsed_uri.path.removeprefix(self.base_path).strip("/")
        if relative_path:
            filepath = self.base_dir / relative_path
        else:
            filepath = self.base_dir / "index.html"

        filepath.parent.mkdir(parents=True, exist_ok=True)
        with open(filepath, "wb") as f:
            f.write(response.body)

        self.pages.append({"url": response.url, "html": response.body})
        self.progress_bar.update(1)

        for href in response.css("a::attr(href)").getall():
            full_url = response.urljoin(clean_url(href))
            if is_valid_url(full_url, self.domain, self.base_path):
                if self.target_version:
                    if self.target_version in full_url:
                        yield response.follow(full_url, self.parse)
                else:
                    yield response.follow(full_url, self.parse)

    def closed(self, reason):
        if self.progress_bar:
            self.progress_bar.close()


def crawl_docs(start_url, domain, base_path, save_dir="outputs/", target_version=None):
    process = CrawlerProcess(
        settings={
            "USER_AGENT": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3",
            "DOWNLOAD_DELAY": 2,
            "RANDOMIZE_DOWNLOAD_DELAY": True,
            "CONCURRENT_REQUESTS": 1,
            "RETRY_TIMES": 5,
            "RETRY_HTTP_CODES": [429, 500, 502, 503, 504, 522, 524, 408, 400],
            "HTTPERROR_ALLOWED_CODES": [404],  # Allow 404 errors to be logged
        }
    )

    process.crawl(
        DocsSpider,
        homepage_url=start_url,
        domain=domain,
        base_path=base_path,
        save_dir=save_dir,
        target_version=target_version,
    )
    process.start()

    spider = next(s for s in process.crawlers if s.spider.name == "docs").spider

    print(f"Total pages crawled and parsed: {len(spider.pages)}")


if __name__ == "__main__":
    # https://huggingface.co/docs/peft/v0.11.0/en/index
    # Customizable parameters
    domain = "huggingface.co"
    version = "v0.11.0"
    library = "peft"
    language = "en"

    # Construct URL and paths
    base_path = f"/docs/{library}/{version}/{language}"
    start_url = f"https://{domain}{base_path}/index"
    save_dir = f"{library}_docs_{version}"

    # Optional: Set target_version to None if you want to crawl all versions
    target_version = None

    crawl_docs(start_url, domain, base_path, save_dir, target_version)
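
The __main__ block above crawls the peft v0.11.0 docs, while parse_hf_html.py expects a transformers_docs_v4.42.0 directory. A sketch of the corresponding call, assuming the versioned transformers docs follow the same /docs/{library}/{version}/{language} URL pattern as the peft example:

from scrape_hf_docs_from_web import crawl_docs

domain = "huggingface.co"
base_path = "/docs/transformers/v4.42.0/en"  # assumed to mirror the peft URL layout

crawl_docs(
    start_url=f"https://{domain}{base_path}/index",
    domain=domain,
    base_path=base_path,
    save_dir="transformers_docs_v4.42.0",  # the directory parse_hf_html.py reads
    target_version=None,
)
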
data/scraping/huggingface_docs/validate_jsonl.py ADDED
@@ -0,0 +1,51 @@
import json
from typing import Any, Dict, List


def load_and_validate_jsonl(file_path: str) -> Dict[int, Any]:
    """
    Load a .jsonl file into a dictionary and validate each line.

    Args:
        file_path (str): Path to the .jsonl file

    Returns:
        Dict[int, Any]: A dictionary where keys are line numbers (1-indexed) and values are the parsed JSON objects

    Raises:
        ValueError: If any line in the file is not valid JSON
    """
    result = {}
    with open(file_path, "r") as file:
        for line_number, line in enumerate(file, 1):
            try:
                # Strip whitespace and check if the line is empty
                stripped_line = line.strip()
                if not stripped_line:
                    print(f"Warning: Line {line_number} is empty.")
                    continue

                # Attempt to parse the JSON
                parsed_json = json.loads(stripped_line)
                result[line_number] = parsed_json
            except json.JSONDecodeError as e:
                raise ValueError(f"Invalid JSON on line {line_number}: {e}")

    return result


if __name__ == "__main__":
    file_path = "hf_transformers_v4_42_0.jsonl"
    try:
        loaded_data = load_and_validate_jsonl(file_path)
        print(f"Successfully loaded {len(loaded_data)} valid JSON objects.")

        # Optional: Print the first few items
        print("\nFirst few items:")
        for line_number, data in list(loaded_data.items())[:5]:
            print(f"Line {line_number}: {data}")

    except ValueError as e:
        print(f"Error: {e}")
    except FileNotFoundError:
        print(f"Error: File '{file_path}' not found.")