anpigon committed on
Commit
e05a89c
•
1 Parent(s): a905ab0

feat: Add script to download content from Wikidocs

Files changed (2)
  1. download_wikidocs.py +74 -0
  2. requirements.txt +4 -1
download_wikidocs.py ADDED
@@ -0,0 +1,74 @@
+import time
+import requests
+from bs4 import BeautifulSoup
+import re
+from markdownify import markdownify as md
+import pandas as pd
+import argparse
+
+
+def extract_content(url: str):
+    response = requests.get(url)
+    soup = BeautifulSoup(response.content, "html.parser")
+
+    page_subject = soup.select_one("#load_content .page-subject")
+    page_content = soup.select_one("#load_content .page-content")
+    markdown_content = md(
+        str(page_subject) + str(page_content),
+        heading_style="ATX",
+        bullets="-",
+        strong_em_symbol="*",
+        code_language="python",
+        escape_asterisks=False,
+        escape_underscores=False,
+    )
+    normalized_text = re.sub(r"\n{2}", "\n", markdown_content)
+
+    return normalized_text
+
+
+def main(ebook_url):
+    base_url = "https://wikidocs.net"
+
+    # Extract the book_id from the URL
+    book_id = ebook_url.split("/")[-1]
+
+    # Fetch the page source
+    response = requests.get(ebook_url)
+    response.raise_for_status()  # Raise an exception on HTTP errors
+    soup = BeautifulSoup(response.content, "html.parser")
+
+    # Grab only the 'a' tags from the table of contents
+    toc = soup.select(".list-group-toc a[href^='javascript:page(']")
+
+    # List to hold the extracted data
+    data_list = []
+    for item in toc:
+        title = item.get_text(strip=True)
+        page_id = item.get("href").split("page(")[-1].rstrip(")")
+        link = f"{base_url}/{page_id}"
+        data_list.append({"title": title, "link": link})
+
+    # Walk the data list and extract each page's content
+    for item in data_list[1:]:
+        item["content"] = extract_content(item["link"])
+        time.sleep(1)  # Wait for the page to load
+
+    # Convert to a DataFrame
+    df = pd.DataFrame(data_list)
+    df = df.dropna(subset=["content"])
+
+    # Save the DataFrame as a parquet file
+    parquet_filename = f"wikidocs_{book_id}.parquet"
+    df.to_parquet(parquet_filename, index=False)
+
+    print(f"File saved successfully: {parquet_filename}")
+
+
+if __name__ == "__main__":
+    # Parse command-line arguments
+    parser = argparse.ArgumentParser(description="Enter the Wikidocs ebook URL.")
+    parser.add_argument("ebook_url", type=str, help="Wikidocs ebook URL")
+    args = parser.parse_args()
+
+    main(args.ebook_url)
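
The script takes the ebook's URL as its single positional argument (python download_wikidocs.py <ebook_url>) and writes wikidocs_<book_id>.parquet to the working directory. A minimal sketch of inspecting that output with pandas; the book id 14314 below is purely illustrative, substitute whichever book you downloaded:

    import pandas as pd

    # Hypothetical filename; the script emits wikidocs_<book_id>.parquet
    df = pd.read_parquet("wikidocs_14314.parquet")

    # One row per TOC entry that had content: title, page link, Markdown body
    print(df.columns.tolist())  # ['title', 'link', 'content']
    print(df.head())

Note that the first TOC entry (the book's cover page) is skipped by data_list[1:], so it is dropped by dropna and does not appear in the file.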
requirements.txt CHANGED
@@ -12,4 +12,7 @@ langchain-openai
 langchain-core
 langchain-groq
 langchain_cohere
-chromadb
+chromadb
+markdownify
+pandas
+beautifulsoup4
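
After pulling this commit, the new dependencies install with the usual pip install -r requirements.txt. A quick import check, if you want to verify the environment; note that beautifulsoup4 installs as the bs4 module:

    import markdownify
    import pandas
    import bs4  # the beautifulsoup4 package is imported as bs4

    print(pandas.__version__, bs4.__version__)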