Omar Solano committed on
Commit f798896 • 1 Parent(s): 8c29627

add scraping hf scripts

.gitignore CHANGED
@@ -165,6 +165,8 @@ notebooks/mini-llama-articles/
  scripts/ai-tutor-db
  .huggingface

+ .DS_Store
+
  *.csv
  *.json
  *.jsonl
data/scraping/huggingface_docs/parse_hf_html.py ADDED
@@ -0,0 +1,163 @@
import io
import json
import os
from pathlib import Path
from urllib.parse import urljoin

import pandas as pd
from bs4 import BeautifulSoup
from tqdm import tqdm


class HuggingfaceParser:
    def __init__(self, html, url):
        self.soup = BeautifulSoup(html, "html.parser")
        self.url = url

    def find_sections(self):
        sections = []
        main_content = self.soup.find("article", class_="md-content__inner")
        if not main_content:
            main_content = self.soup.find(
                "div", class_="main-container"
            )  # Look for main container
        if not main_content:
            main_content = self.soup.find(
                "body"
            )  # Fallback to body if nothing else found

        if not main_content:
            print(f"Error: No main content found for {self.url}")
            return sections

        # Try to find headers
        headers = main_content.find_all(["h1", "h2", "h3", "h4", "h5", "h6"])

        if not headers:
            # If no headers, look for other structural elements
            headers = main_content.find_all(
                ["div", "p"], class_=["docstring", "section"]
            )

        if not headers:
            print(f"Warning: No headers or sections found in {self.url}")
            # If still no headers, treat the whole content as one section
            title = self.soup.title.string if self.soup.title else "Untitled"
            sections.append(
                {
                    "name": title,
                    "url": self.url,
                    "content": main_content.get_text(strip=True),
                    "level": 1,
                }
            )
            return sections

        for i, header in enumerate(headers):
            name = header.text.strip()
            header_id = header.get("id", "")
            if header_id:
                section_url = f"{self.url}#{header_id}"
            else:
                section_url = self.url

            content = self.extract_content(
                header, headers[i + 1] if i + 1 < len(headers) else None
            )
            sections.append(
                {
                    "name": name,
                    "url": section_url,
                    "content": content,
                    "level": self.get_header_level(header),
                }
            )

        return sections

    def extract_content(self, start_tag, end_tag):
        content = []
        current = start_tag.next_sibling
        while current and current != end_tag:
            if isinstance(current, str):
                content.append(current.strip())
            elif current.name == "table":
                table_html = io.StringIO(str(current))
                content.append(
                    pd.read_html(table_html)[0].to_markdown(
                        index=False, tablefmt="github"
                    )
                )
            elif current.name not in ["script", "style"]:
                content.append(current.get_text(strip=True, separator=" "))
            current = current.next_sibling
        return "\n".join(filter(None, content))

    def get_header_level(self, tag):
        if tag.name in ["h1", "h2", "h3", "h4", "h5", "h6"]:
            return int(tag.name[1])
        elif "class" in tag.attrs:
            if "docstring" in tag["class"]:
                return 1
            elif "section" in tag["class"]:
                return 2
        return 1  # Default level


def is_likely_html_file(file_path):
    excluded_extensions = {".css", ".js", ".png", ".jpg", ".jpeg", ".gif", ".svg"}
    return file_path.suffix == "" or file_path.suffix.lower() not in excluded_extensions


def parse_saved_html_files(html_dir, base_url):
    all_sections = []
    html_files = [
        f for f in Path(html_dir).rglob("*") if f.is_file() and is_likely_html_file(f)
    ]
    print(f"Found {len(html_files)} HTML files")

    for html_file in tqdm(html_files, desc="Parsing HTML files"):
        try:
            with open(html_file, "r", encoding="utf-8") as file:
                html_content = file.read()

            relative_path = html_file.relative_to(html_dir)
            url = urljoin(base_url, str(relative_path).replace(os.path.sep, "/"))

            parser = HuggingfaceParser(html_content, url)
            sections = parser.find_sections()

            if not sections:
                print(f"Warning: No sections found in {html_file}")
                # exit(0)
                # break
            all_sections.extend(sections)
        except Exception as e:
            print(f"Error parsing {html_file}: {str(e)}")
            # exit(0)

    return all_sections


def save_to_jsonl(data, output_file):
    with open(output_file, "w", encoding="utf-8") as f:
        for item in data:
            json.dump(item, f, ensure_ascii=False)
            f.write("\n")


def main():
    # html_dir = "huggingface_docs"  # Directory where HTML files are saved
    html_dir = "transformers_docs_v4.42.0"  # Directory where HTML files are saved
    base_url = "https://huggingface.co/docs/transformers/"
    output_file = "hf_transformers_v4_42_0.jsonl"

    all_sections = parse_saved_html_files(html_dir, base_url)
    save_to_jsonl(all_sections, output_file)

    print(f"Parsed content saved to {output_file}")
    print(f"Total sections parsed: {len(all_sections)}")


if __name__ == "__main__":
    main()
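
For reference, a minimal sketch of how HuggingfaceParser can be exercised on an inline snippet; the HTML, URL, and import path here are illustrative assumptions, not part of the commit. Each returned dict has the same name/url/content/level keys that save_to_jsonl writes out, one JSON object per line.

from parse_hf_html import HuggingfaceParser

# Illustrative HTML using the "main-container" fallback the parser looks for.
html = (
    "<html><head><title>Demo</title></head><body>"
    '<div class="main-container">'
    '<h1 id="intro">Intro</h1><p>Hello docs.</p>'
    '<h2 id="usage">Usage</h2><p>More text.</p>'
    "</div></body></html>"
)

sections = HuggingfaceParser(
    html, "https://huggingface.co/docs/transformers/demo"
).find_sections()
for section in sections:
    print(section["level"], section["name"], section["url"])
# Each section matches the records written to the .jsonl output, e.g.:
# {"name": "Intro", "url": "https://huggingface.co/docs/transformers/demo#intro",
#  "content": "Hello docs.", "level": 1}
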
data/scraping/huggingface_docs/scrape_hf_docs_from_repo.py ADDED
@@ -0,0 +1,57 @@
import os

import requests

# GitHub repository information
owner = "huggingface"

# repo = "peft"
# path = "docs/source"

repo = "transformers"
path = "docs/source/en"

# GitHub API endpoint for the repository contents
api_url = f"https://api.github.com/repos/{owner}/{repo}/contents/{path}"


def get_files_in_directory(api_url):
    response = requests.get(api_url)
    if response.status_code == 200:
        return response.json()
    else:
        print(f"Failed to fetch directory contents: {response.status_code}")
        return []


def download_file(file_url, file_path):
    response = requests.get(file_url)
    if response.status_code == 200:
        with open(file_path, "wb") as file:
            file.write(response.content)
    else:
        print(f"Failed to download file: {response.status_code}")


def fetch_md_files(api_url, local_dir):
    files = get_files_in_directory(api_url)
    for file in files:
        if file["type"] == "file" and file["name"].endswith(".md"):
            file_url = file["download_url"]
            file_path = os.path.join(local_dir, file["name"])
            print(f'Downloading {file["name"]}...')
            download_file(file_url, file_path)
        elif file["type"] == "dir":
            subdir = os.path.join(local_dir, file["name"])
            os.makedirs(subdir, exist_ok=True)
            fetch_md_files(file["url"], subdir)


# Local directory to save the files
local_dir = f"data/{repo}_docs"
os.makedirs(local_dir, exist_ok=True)

# Start fetching files
fetch_md_files(api_url, local_dir)

print("All files have been downloaded.")
data/scraping/huggingface_docs/scrape_hf_docs_from_web.py ADDED
@@ -0,0 +1,133 @@
import logging
from pathlib import Path
from urllib.parse import unquote, urljoin, urlparse

import scrapy
from scrapy.crawler import CrawlerProcess
from tqdm import tqdm

logging.basicConfig(format="%(levelname)s: %(message)s", level=logging.INFO)


def is_valid_url(url, domain, base_path):
    parsed = urlparse(url)
    return (
        parsed.scheme in ["http", "https"]
        and parsed.netloc == domain
        and parsed.path.startswith(base_path)
        and "#" not in url
    )  # Exclude URLs with fragments


def clean_url(url):
    # Replace &amp; with &, and &num; with #
    url = url.replace("&amp;", "&").replace("&num;", "#")
    # Decode URL-encoded characters
    return unquote(url)


class DocsSpider(scrapy.Spider):
    name = "docs"

    def __init__(
        self,
        homepage_url: str,
        domain: str,
        base_path: str,
        save_dir="outputs/",
        target_version=None,
        *args,
        **kwargs,
    ):
        super(DocsSpider, self).__init__(*args, **kwargs)
        self.homepage_url = homepage_url
        self.domain = domain
        self.base_path = base_path
        self.allowed_domains = [domain]
        self.start_urls = [self.homepage_url]
        self.base_dir = Path(save_dir)
        self.target_version = target_version
        self.pages = []
        self.progress_bar = None

    def start_requests(self):
        self.progress_bar = tqdm(desc="Crawling pages", unit="page")
        yield scrapy.Request(self.homepage_url, self.parse)

    def parse(self, response):
        if not is_valid_url(response.url, self.domain, self.base_path):
            return

        parsed_uri = urlparse(response.url)
        relative_path = parsed_uri.path.removeprefix(self.base_path).strip("/")
        if relative_path:
            filepath = self.base_dir / relative_path
        else:
            filepath = self.base_dir / "index.html"

        filepath.parent.mkdir(parents=True, exist_ok=True)
        with open(filepath, "wb") as f:
            f.write(response.body)

        self.pages.append({"url": response.url, "html": response.body})
        self.progress_bar.update(1)

        for href in response.css("a::attr(href)").getall():
            full_url = response.urljoin(clean_url(href))
            if is_valid_url(full_url, self.domain, self.base_path):
                if self.target_version:
                    if self.target_version in full_url:
                        yield response.follow(full_url, self.parse)
                else:
                    yield response.follow(full_url, self.parse)

    def closed(self, reason):
        if self.progress_bar:
            self.progress_bar.close()


def crawl_docs(start_url, domain, base_path, save_dir="outputs/", target_version=None):
    process = CrawlerProcess(
        settings={
            "USER_AGENT": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3",
            "DOWNLOAD_DELAY": 2,
            "RANDOMIZE_DOWNLOAD_DELAY": True,
            "CONCURRENT_REQUESTS": 1,
            "RETRY_TIMES": 5,
            "RETRY_HTTP_CODES": [429, 500, 502, 503, 504, 522, 524, 408, 400],
            "HTTPERROR_ALLOWED_CODES": [404],  # Allow 404 errors to be logged
        }
    )

    process.crawl(
        DocsSpider,
        homepage_url=start_url,
        domain=domain,
        base_path=base_path,
        save_dir=save_dir,
        target_version=target_version,
    )
    process.start()

    spider = next(s for s in process.crawlers if s.spider.name == "docs").spider

    print(f"Total pages crawled and parsed: {len(spider.pages)}")


if __name__ == "__main__":
    # https://huggingface.co/docs/peft/v0.11.0/en/index
    # Customizable parameters
    domain = "huggingface.co"
    version = "v0.11.0"
    library = "peft"
    language = "en"

    # Construct URL and paths
    base_path = f"/docs/{library}/{version}/{language}"
    start_url = f"https://{domain}{base_path}/index"
    save_dir = f"{library}_docs_{version}"

    # Optional: Set target_version to None if you want to crawl all versions
    target_version = None

    crawl_docs(start_url, domain, base_path, save_dir, target_version)
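
The __main__ block above crawls the peft v0.11.0 docs, while parse_hf_html.py expects a transformers_docs_v4.42.0 directory. A sketch of the corresponding call, assuming the versioned transformers docs follow the same /docs/{library}/{version}/{language} URL pattern as the peft example:

from scrape_hf_docs_from_web import crawl_docs

domain = "huggingface.co"
base_path = "/docs/transformers/v4.42.0/en"  # assumed to mirror the peft URL layout

crawl_docs(
    start_url=f"https://{domain}{base_path}/index",
    domain=domain,
    base_path=base_path,
    save_dir="transformers_docs_v4.42.0",  # the directory parse_hf_html.py reads
    target_version=None,
)
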
data/scraping/huggingface_docs/validate_jsonl.py ADDED
@@ -0,0 +1,51 @@
import json
from typing import Any, Dict, List


def load_and_validate_jsonl(file_path: str) -> Dict[int, Any]:
    """
    Load a .jsonl file into a dictionary and validate each line.

    Args:
        file_path (str): Path to the .jsonl file

    Returns:
        Dict[int, Any]: A dictionary where keys are line numbers (1-indexed) and values are the parsed JSON objects

    Raises:
        ValueError: If any line in the file is not valid JSON
    """
    result = {}
    with open(file_path, "r") as file:
        for line_number, line in enumerate(file, 1):
            try:
                # Strip whitespace and check if the line is empty
                stripped_line = line.strip()
                if not stripped_line:
                    print(f"Warning: Line {line_number} is empty.")
                    continue

                # Attempt to parse the JSON
                parsed_json = json.loads(stripped_line)
                result[line_number] = parsed_json
            except json.JSONDecodeError as e:
                raise ValueError(f"Invalid JSON on line {line_number}: {e}")

    return result


if __name__ == "__main__":
    file_path = "hf_transformers_v4_42_0.jsonl"
    try:
        loaded_data = load_and_validate_jsonl(file_path)
        print(f"Successfully loaded {len(loaded_data)} valid JSON objects.")

        # Optional: Print the first few items
        print("\nFirst few items:")
        for line_number, data in list(loaded_data.items())[:5]:
            print(f"Line {line_number}: {data}")

    except ValueError as e:
        print(f"Error: {e}")
    except FileNotFoundError:
        print(f"Error: File '{file_path}' not found.")