Spaces
Omar Solano committed
Commit • f798896
1 Parent(s): 8c29627
add scraping hf scripts
.gitignore
CHANGED
@@ -165,6 +165,8 @@ notebooks/mini-llama-articles/
 scripts/ai-tutor-db
 .huggingface

+.DS_Store
+
 *.csv
 *.json
 *.jsonl
data/scraping/huggingface_docs/parse_hf_html.py
ADDED
@@ -0,0 +1,163 @@
import io
import json
import os
from pathlib import Path
from urllib.parse import urljoin

import pandas as pd
from bs4 import BeautifulSoup
from tqdm import tqdm


class HuggingfaceParser:
    def __init__(self, html, url):
        self.soup = BeautifulSoup(html, "html.parser")
        self.url = url

    def find_sections(self):
        sections = []
        main_content = self.soup.find("article", class_="md-content__inner")
        if not main_content:
            main_content = self.soup.find(
                "div", class_="main-container"
            )  # Look for main container
        if not main_content:
            main_content = self.soup.find(
                "body"
            )  # Fallback to body if nothing else found

        if not main_content:
            print(f"Error: No main content found for {self.url}")
            return sections

        # Try to find headers
        headers = main_content.find_all(["h1", "h2", "h3", "h4", "h5", "h6"])

        if not headers:
            # If no headers, look for other structural elements
            headers = main_content.find_all(
                ["div", "p"], class_=["docstring", "section"]
            )

        if not headers:
            print(f"Warning: No headers or sections found in {self.url}")
            # If still no headers, treat the whole content as one section
            title = self.soup.title.string if self.soup.title else "Untitled"
            sections.append(
                {
                    "name": title,
                    "url": self.url,
                    "content": main_content.get_text(strip=True),
                    "level": 1,
                }
            )
            return sections

        for i, header in enumerate(headers):
            name = header.text.strip()
            header_id = header.get("id", "")
            if header_id:
                section_url = f"{self.url}#{header_id}"
            else:
                section_url = self.url

            content = self.extract_content(
                header, headers[i + 1] if i + 1 < len(headers) else None
            )
            sections.append(
                {
                    "name": name,
                    "url": section_url,
                    "content": content,
                    "level": self.get_header_level(header),
                }
            )

        return sections

    def extract_content(self, start_tag, end_tag):
        content = []
        current = start_tag.next_sibling
        while current and current != end_tag:
            if isinstance(current, str):
                content.append(current.strip())
            elif current.name == "table":
                table_html = io.StringIO(str(current))
                content.append(
                    pd.read_html(table_html)[0].to_markdown(
                        index=False, tablefmt="github"
                    )
                )
            elif current.name not in ["script", "style"]:
                content.append(current.get_text(strip=True, separator=" "))
            current = current.next_sibling
        return "\n".join(filter(None, content))

    def get_header_level(self, tag):
        if tag.name in ["h1", "h2", "h3", "h4", "h5", "h6"]:
            return int(tag.name[1])
        elif "class" in tag.attrs:
            if "docstring" in tag["class"]:
                return 1
            elif "section" in tag["class"]:
                return 2
        return 1  # Default level


def is_likely_html_file(file_path):
    excluded_extensions = {".css", ".js", ".png", ".jpg", ".jpeg", ".gif", ".svg"}
    return file_path.suffix == "" or file_path.suffix.lower() not in excluded_extensions


def parse_saved_html_files(html_dir, base_url):
    all_sections = []
    html_files = [
        f for f in Path(html_dir).rglob("*") if f.is_file() and is_likely_html_file(f)
    ]
    print(f"Found {len(html_files)} HTML files")

    for html_file in tqdm(html_files, desc="Parsing HTML files"):
        try:
            with open(html_file, "r", encoding="utf-8") as file:
                html_content = file.read()

            relative_path = html_file.relative_to(html_dir)
            url = urljoin(base_url, str(relative_path).replace(os.path.sep, "/"))

            parser = HuggingfaceParser(html_content, url)
            sections = parser.find_sections()

            if not sections:
                print(f"Warning: No sections found in {html_file}")
                # exit(0)
                # break
            all_sections.extend(sections)
        except Exception as e:
            print(f"Error parsing {html_file}: {str(e)}")
            # exit(0)

    return all_sections


def save_to_jsonl(data, output_file):
    with open(output_file, "w", encoding="utf-8") as f:
        for item in data:
            json.dump(item, f, ensure_ascii=False)
            f.write("\n")


def main():
    # html_dir = "huggingface_docs"  # Directory where HTML files are saved
    html_dir = "transformers_docs_v4.42.0"  # Directory where HTML files are saved
    base_url = "https://huggingface.co/docs/transformers/"
    output_file = "hf_transformers_v4_42_0.jsonl"

    all_sections = parse_saved_html_files(html_dir, base_url)
    save_to_jsonl(all_sections, output_file)

    print(f"Parsed content saved to {output_file}")
    print(f"Total sections parsed: {len(all_sections)}")


if __name__ == "__main__":
    main()
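Each line written by save_to_jsonl is a standalone JSON object with the keys name, url, content, and level, mirroring the dicts built in find_sections. As a quick sanity check (this snippet is not part of the commit), the output can be read back with the standard json module:

import json

# Read the JSONL file produced by main() above; adjust the filename if needed.
with open("hf_transformers_v4_42_0.jsonl", encoding="utf-8") as f:
    sections = [json.loads(line) for line in f if line.strip()]

print(f"{len(sections)} sections")
print(sections[0]["name"], sections[0]["url"], sections[0]["level"])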
data/scraping/huggingface_docs/scrape_hf_docs_from_repo.py
ADDED
@@ -0,0 +1,57 @@
import os

import requests

# GitHub repository information
owner = "huggingface"

# repo = "peft"
# path = "docs/source"

repo = "transformers"
path = "docs/source/en"

# GitHub API endpoint for the repository contents
api_url = f"https://api.github.com/repos/{owner}/{repo}/contents/{path}"


def get_files_in_directory(api_url):
    response = requests.get(api_url)
    if response.status_code == 200:
        return response.json()
    else:
        print(f"Failed to fetch directory contents: {response.status_code}")
        return []


def download_file(file_url, file_path):
    response = requests.get(file_url)
    if response.status_code == 200:
        with open(file_path, "wb") as file:
            file.write(response.content)
    else:
        print(f"Failed to download file: {response.status_code}")


def fetch_md_files(api_url, local_dir):
    files = get_files_in_directory(api_url)
    for file in files:
        if file["type"] == "file" and file["name"].endswith(".md"):
            file_url = file["download_url"]
            file_path = os.path.join(local_dir, file["name"])
            print(f'Downloading {file["name"]}...')
            download_file(file_url, file_path)
        elif file["type"] == "dir":
            subdir = os.path.join(local_dir, file["name"])
            os.makedirs(subdir, exist_ok=True)
            fetch_md_files(file["url"], subdir)


# Local directory to save the files
local_dir = f"data/{repo}_docs"
os.makedirs(local_dir, exist_ok=True)

# Start fetching files
fetch_md_files(api_url, local_dir)

print("All files have been downloaded.")
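Unauthenticated calls to the GitHub contents API are rate-limited, so fetching a large docs tree can fail partway through. One optional tweak, not part of this commit, is to send a personal access token in the standard Authorization header; GITHUB_TOKEN below is a hypothetical environment variable name:

import os

import requests

# Optional authentication to raise the GitHub API rate limit.
# GITHUB_TOKEN is a hypothetical env var; if unset, requests stay anonymous.
_token = os.environ.get("GITHUB_TOKEN")
_headers = {"Authorization": f"token {_token}"} if _token else {}


def get_files_in_directory(api_url):
    response = requests.get(api_url, headers=_headers)
    if response.status_code == 200:
        return response.json()
    print(f"Failed to fetch directory contents: {response.status_code}")
    return []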
data/scraping/huggingface_docs/scrape_hf_docs_from_web.py
ADDED
@@ -0,0 +1,133 @@
import logging
from pathlib import Path
from urllib.parse import unquote, urljoin, urlparse

import scrapy
from scrapy.crawler import CrawlerProcess
from tqdm import tqdm

logging.basicConfig(format="%(levelname)s: %(message)s", level=logging.INFO)


def is_valid_url(url, domain, base_path):
    parsed = urlparse(url)
    return (
        parsed.scheme in ["http", "https"]
        and parsed.netloc == domain
        and parsed.path.startswith(base_path)
        and "#" not in url
    )  # Exclude URLs with fragments


def clean_url(url):
    # Replace &amp; with &, and &num; with #
    url = url.replace("&amp;", "&").replace("&num;", "#")
    # Decode URL-encoded characters
    return unquote(url)


class DocsSpider(scrapy.Spider):
    name = "docs"

    def __init__(
        self,
        homepage_url: str,
        domain: str,
        base_path: str,
        save_dir="outputs/",
        target_version=None,
        *args,
        **kwargs,
    ):
        super(DocsSpider, self).__init__(*args, **kwargs)
        self.homepage_url = homepage_url
        self.domain = domain
        self.base_path = base_path
        self.allowed_domains = [domain]
        self.start_urls = [self.homepage_url]
        self.base_dir = Path(save_dir)
        self.target_version = target_version
        self.pages = []
        self.progress_bar = None

    def start_requests(self):
        self.progress_bar = tqdm(desc="Crawling pages", unit="page")
        yield scrapy.Request(self.homepage_url, self.parse)

    def parse(self, response):
        if not is_valid_url(response.url, self.domain, self.base_path):
            return

        parsed_uri = urlparse(response.url)
        relative_path = parsed_uri.path.removeprefix(self.base_path).strip("/")
        if relative_path:
            filepath = self.base_dir / relative_path
        else:
            filepath = self.base_dir / "index.html"

        filepath.parent.mkdir(parents=True, exist_ok=True)
        with open(filepath, "wb") as f:
            f.write(response.body)

        self.pages.append({"url": response.url, "html": response.body})
        self.progress_bar.update(1)

        for href in response.css("a::attr(href)").getall():
            full_url = response.urljoin(clean_url(href))
            if is_valid_url(full_url, self.domain, self.base_path):
                if self.target_version:
                    if self.target_version in full_url:
                        yield response.follow(full_url, self.parse)
                else:
                    yield response.follow(full_url, self.parse)

    def closed(self, reason):
        if self.progress_bar:
            self.progress_bar.close()


def crawl_docs(start_url, domain, base_path, save_dir="outputs/", target_version=None):
    process = CrawlerProcess(
        settings={
            "USER_AGENT": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3",
            "DOWNLOAD_DELAY": 2,
            "RANDOMIZE_DOWNLOAD_DELAY": True,
            "CONCURRENT_REQUESTS": 1,
            "RETRY_TIMES": 5,
            "RETRY_HTTP_CODES": [429, 500, 502, 503, 504, 522, 524, 408, 400],
            "HTTPERROR_ALLOWED_CODES": [404],  # Allow 404 errors to be logged
        }
    )

    process.crawl(
        DocsSpider,
        homepage_url=start_url,
        domain=domain,
        base_path=base_path,
        save_dir=save_dir,
        target_version=target_version,
    )
    process.start()

    spider = next(s for s in process.crawlers if s.spider.name == "docs").spider

    print(f"Total pages crawled and parsed: {len(spider.pages)}")


if __name__ == "__main__":
    # https://huggingface.co/docs/peft/v0.11.0/en/index
    # Customizable parameters
    domain = "huggingface.co"
    version = "v0.11.0"
    library = "peft"
    language = "en"

    # Construct URL and paths
    base_path = f"/docs/{library}/{version}/{language}"
    start_url = f"https://{domain}{base_path}/index"
    save_dir = f"{library}_docs_{version}"

    # Optional: Set target_version to None if you want to crawl all versions
    target_version = None

    crawl_docs(start_url, domain, base_path, save_dir, target_version)
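The transformers_docs_v4.42.0 directory that parse_hf_html.py reads was presumably produced by this same crawler with different parameters. A sketch of that configuration, with values inferred from the directory name and base URL used in parse_hf_html.py rather than taken from the commit itself:

# Hypothetical parameters for crawling the transformers v4.42.0 docs so that
# save_dir matches the html_dir expected by parse_hf_html.py.
domain = "huggingface.co"
version = "v4.42.0"
library = "transformers"
language = "en"

base_path = f"/docs/{library}/{version}/{language}"
start_url = f"https://{domain}{base_path}/index"
save_dir = f"{library}_docs_{version}"  # -> "transformers_docs_v4.42.0"

crawl_docs(start_url, domain, base_path, save_dir, target_version=None)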
data/scraping/huggingface_docs/validate_jsonl.py
ADDED
@@ -0,0 +1,51 @@
import json
from typing import Any, Dict, List


def load_and_validate_jsonl(file_path: str) -> Dict[int, Any]:
    """
    Load a .jsonl file into a dictionary and validate each line.

    Args:
        file_path (str): Path to the .jsonl file

    Returns:
        Dict[int, Any]: A dictionary where keys are line numbers (1-indexed) and values are the parsed JSON objects

    Raises:
        ValueError: If any line in the file is not valid JSON
    """
    result = {}
    with open(file_path, "r") as file:
        for line_number, line in enumerate(file, 1):
            try:
                # Strip whitespace and check if the line is empty
                stripped_line = line.strip()
                if not stripped_line:
                    print(f"Warning: Line {line_number} is empty.")
                    continue

                # Attempt to parse the JSON
                parsed_json = json.loads(stripped_line)
                result[line_number] = parsed_json
            except json.JSONDecodeError as e:
                raise ValueError(f"Invalid JSON on line {line_number}: {e}")

    return result


if __name__ == "__main__":
    file_path = "hf_transformers_v4_42_0.jsonl"
    try:
        loaded_data = load_and_validate_jsonl(file_path)
        print(f"Successfully loaded {len(loaded_data)} valid JSON objects.")

        # Optional: Print the first few items
        print("\nFirst few items:")
        for line_number, data in list(loaded_data.items())[:5]:
            print(f"Line {line_number}: {data}")

    except ValueError as e:
        print(f"Error: {e}")
    except FileNotFoundError:
        print(f"Error: File '{file_path}' not found.")
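Taken together, the scripts form a small pipeline: crawl the rendered docs to disk, parse the saved HTML into sections, then validate the resulting JSONL. A minimal driver sketch, assuming the three files above sit in the same directory and are importable as modules (this wrapper is not part of the commit):

# Hypothetical driver tying the committed scripts together.
from parse_hf_html import parse_saved_html_files, save_to_jsonl
from validate_jsonl import load_and_validate_jsonl

sections = parse_saved_html_files(
    "transformers_docs_v4.42.0", "https://huggingface.co/docs/transformers/"
)
save_to_jsonl(sections, "hf_transformers_v4_42_0.jsonl")
print(f"{len(load_and_validate_jsonl('hf_transformers_v4_42_0.jsonl'))} valid lines")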