feat: Add script to download content from Wikidocs
- download_wikidocs.py +74 -0
- requirements.txt +4 -1
download_wikidocs.py
ADDED
@@ -0,0 +1,74 @@
+import time
+import requests
+from bs4 import BeautifulSoup
+import re
+from markdownify import markdownify as md
+import pandas as pd
+import argparse
+
+
+def extract_content(url: str):
+    response = requests.get(url)
+    soup = BeautifulSoup(response.content, "html.parser")
+
+    page_subject = soup.select_one("#load_content .page-subject")
+    page_content = soup.select_one("#load_content .page-content")
+    markdown_content = md(
+        str(page_subject) + str(page_content),
+        heading_style="ATX",
+        bullets="-",
+        strong_em_symbol="*",
+        code_language="python",
+        escape_asterisks=False,
+        escape_underscores=False,
+    )
+    normalized_text = re.sub(r"\n{2}", "\n", markdown_content)
+
+    return normalized_text
+
+
+def main(ebook_url):
+    base_url = "https://wikidocs.net"
+
+    # Extract the book_id from the ebook URL
+    book_id = ebook_url.split("/")[-1]
+
+    # Fetch the page source
+    response = requests.get(ebook_url)
+    response.raise_for_status()  # raise on HTTP errors
+    soup = BeautifulSoup(response.content, "html.parser")
+
+    # Collect only the 'a' tags from the table of contents
+    toc = soup.select(".list-group-toc a[href^='javascript:page(']")
+
+    # List for the extracted data
+    data_list = []
+    for item in toc:
+        title = item.get_text(strip=True)
+        page_id = item.get("href").split("page(")[-1].rstrip(")")
+        link = f"{base_url}/{page_id}"
+        data_list.append({"title": title, "link": link})
+
+    # Walk the data list (skipping the first TOC entry) and extract each page's content
+    for item in data_list[1:]:
+        item["content"] = extract_content(item["link"])
+        time.sleep(1)  # wait between requests so each page can load
+
+    # Convert to a DataFrame
+    df = pd.DataFrame(data_list)
+    df = df.dropna(subset=["content"])
+
+    # Save the DataFrame as a parquet file
+    parquet_filename = f"wikidocs_{book_id}.parquet"
+    df.to_parquet(parquet_filename, index=False)
+
+    print(f"File saved successfully: {parquet_filename}")
+
+
+if __name__ == "__main__":
+    # Handle the command-line argument
+    parser = argparse.ArgumentParser(description="Enter a Wikidocs ebook URL.")
+    parser.add_argument("ebook_url", type=str, help="Wikidocs ebook URL")
+    args = parser.parse_args()
+
+    main(args.ebook_url)
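For reference, a minimal sketch of checking the script's output, assuming it was run as python download_wikidocs.py https://wikidocs.net/book/1234 (the book id 1234 is hypothetical); the parquet file name follows the f-string in main():

import pandas as pd

# Load the parquet produced by the script; "wikidocs_1234.parquet" assumes the
# hypothetical book id above.
df = pd.read_parquet("wikidocs_1234.parquet")
print(df.columns.tolist())  # expected: ['title', 'link', 'content']
print(df.head())            # one row per TOC entry, content as Markdown text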
requirements.txt
CHANGED
@@ -12,4 +12,7 @@ langchain-openai
 langchain-core
 langchain-groq
 langchain_cohere
-chromadb
+chromadb
+markdownify
+pandas
+beautifulsoup4