Create web_scrape_and_pdf_loader.py

web_scrape_and_pdf_loader.py  (ADDED, +229 -0)
import os, os.path

# to search the web for results
from urllib.parse import urlparse, quote
import requests
from duckduckgo_search import DDGS

# to present web search results in a table
import pandas as pd

# to get document chunks, embed them and build the vector databases
# 2 vector databases are built: one for keyword search (BM25), one for semantic search (Chroma)
from langchain.document_loaders import WebBaseLoader, PyPDFLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.retrievers import BM25Retriever

# for saving the bm25 retriever
# pickle is not for production, just for the prototype
import pickle

# this is for returning the top n search results using DuckDuckGo
top_n_results = 10

# a chroma vector store will be set up for every combination of the chunk sizes and overlaps below
# building all of these vector stores takes a very long time
chunk_sizes = [500, 600, 700, 800, 900, 1000, 1250, 1500, 1750, 2000, 2250, 2500, 2750, 3000]
chunk_overlaps = [50, 100, 150, 200]

# embedding model used when building the Chroma vector stores
# (the default HuggingFace sentence-transformers model; swap in a specific model name if one is preferred)
hf_embeddings = HuggingFaceEmbeddings()


################################ Search the Web ################################
## Use DuckDuckGo to Search for Top N Results for Each Country's ESG Policies

# Use DuckDuckGo search to loop through each country and save the top N results by searching for
# "{country} sustainability esg newest updated public policy document government"
# After some experimentation the search phrase above seems to give the best results
# for the most recent ESG policies as it contains all the necessary keywords, but it can be changed.
# Store the relevant links in a dictionary.
# Links are mostly HTML or PDF.

def duckduckgo_scrape(country, search_term, n_search_results):
    all_links = []

    with DDGS() as ddgs:
        results = ddgs.text(f"{search_term}", max_results=n_search_results)
        for result in results:
            result['country'] = country
            all_links.append(result)

    # collect the scraped links into a dataframe
    df_links = pd.DataFrame(all_links).rename(columns={
        'title': 'Title',
        'href': 'url',
        'body': 'Summarized Body',
        'country': 'Country'
    })
    # save scraped links into csv
    df_links.to_csv("duck_duck_go_scraped_links.csv")

    return all_links, df_links

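# Illustrative usage sketch (an assumption, not part of the original pipeline): scrape links for a
# hypothetical list of countries using the search phrase described above. The `countries` default
# and the helper name `example_scrape_links` are made up for this example.
def example_scrape_links(countries=("Singapore", "Malaysia")):
    all_links = []
    for country in countries:
        search_term = f"{country} sustainability esg newest updated public policy document government"
        links, _ = duckduckgo_scrape(country, search_term, top_n_results)
        all_links.extend(links)
    return all_links
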
################################ Load the Documents ################################
## For every search result returned by DuckDuckGo for each country above, scrape the web using the url and convert it to documents using Langchain loaders:
# PDF Documents: If the link from the search result points to a PDF document,
# save the PDF permanently in local storage in the folder called 'pdfs', then use PyPDFLoader to convert it to raw documents.
# HTML Documents: If the link is just an HTML page, use Langchain WebBaseLoader to convert it to raw documents.
# Metadata: Add the country to the metadata; this is an important step as it is needed by RetrievalQA for filtering later on.
# For PDFs, langchain will use the local path as the source, so we need to change it back to the online path.
# Save all the documents into a list called "all_documents".

# for adding country metadata
def add_country_metadata(docs, country):
    for doc in docs:
        doc.metadata['country'] = country
    return docs

# for adding source url metadata
def add_url_metadata(docs, url):
    for doc in docs:
        doc.metadata['source'] = url
    return docs

# If the link from the search result points to a PDF document,
# save the PDF permanently in local storage in the folder called 'pdfs',
# then use PyPDFLoader to convert it to raw documents.
def pdf_loader(url, country):
    try:
        try:
            response = requests.get(url)
        except:
            # sometimes there is an ssl error, and the page is actually http://
            url = url.replace("https://", "http://")
            response = requests.get(url)
        # create pdf directory to save pdfs locally
        pdf_dir = f"pdfs/{country}"
        if not os.path.exists(pdf_dir):
            os.makedirs(pdf_dir)
        pdf_filename = f"{pdf_dir}/{url.split('/')[-1]}"
        with open(pdf_filename, 'wb') as f:  # save the pdf locally first
            f.write(response.content)
        loader = PyPDFLoader(pdf_filename)  # then use the langchain loader to load it
        raw_pdf_documents = loader.load()
        raw_pdf_documents = add_country_metadata(raw_pdf_documents, country)
        # the pdf source metadata will be populated by Langchain as the local path;
        # we do not want this, so we change it back to the original path on the web instead
        raw_pdf_documents = add_url_metadata(raw_pdf_documents, url)
        return raw_pdf_documents
    except Exception as e:
        print(f"Failed to load for {url}")

# Same as above but for a pdf that is already in a local directory
def pdf_loader_local(pdf_filename, country):
    try:
        loader = PyPDFLoader(pdf_filename)  # use the langchain loader on the local file directly
        raw_pdf_documents = loader.load()
        raw_pdf_documents = add_country_metadata(raw_pdf_documents, country)
        return raw_pdf_documents
    except Exception as e:
        print(f"Failed to load for {pdf_filename}")
        return False

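# Brief illustrative sketch (hypothetical URL, not from the original file): calling pdf_loader
# directly on a single link; the returned documents carry the country and the original web url
# as 'source' in their metadata.
def example_load_single_pdf():
    docs = pdf_loader("http://example.gov/esg_policy.pdf", "Singapore")
    if docs:
        print(docs[0].metadata)  # expected keys include 'source', 'country' and 'page'
    return docs
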
# If the link is just an HTML page, use Langchain WebBaseLoader to convert it to raw documents.
def html_loader(url, country):
    try:
        loader = WebBaseLoader(url)
        raw_html_documents = loader.load()
        raw_html_documents = add_country_metadata(raw_html_documents, country)
        return raw_html_documents
    except:
        print(f"Failed to load for {url}")

def process_links_load_documents(all_links):
    all_documents = []  # store all the documents

    for link in all_links:
        country = link['country']
        title = link['title']
        url = link['href']
        url = url.replace(" ", "%20")  # replace spaces with their encoded version, e.g. %20

        # If the url points to a PDF document
        if url.endswith('.pdf') or (('.pdf' in url) and ('blob' in url)):
            print(f"{country}: Loading PDF from {url}")
            docs = pdf_loader(url, country)
            if docs is not None:  # if there was an error, docs will be None
                if isinstance(docs, list):
                    all_documents.extend(docs)
                else:
                    all_documents.append(docs)
            #print(docs)

        # If the url is just an HTML page
        else:
            print(f"{country}: Loading HTML from {url}")
            docs = html_loader(url, country)
            if docs is not None:  # if there was an error, docs will be None
                if isinstance(docs, list):
                    all_documents.extend(docs)
                else:
                    all_documents.append(docs)
            #print(docs)

    # the documents contain a lot of \n, so perform some cleaning
    for document in all_documents:
        document.page_content = document.page_content.replace('\n', '')

    return all_documents

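# Illustrative sketch (an assumption, not part of the original file): feed the scraped links from
# example_scrape_links above into the loader to get cleaned Langchain documents ready for chunking.
def example_scrape_and_load(countries=("Singapore", "Malaysia")):
    all_links = example_scrape_links(countries)
    all_documents = process_links_load_documents(all_links)
    return all_documents
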
################################ Set Up Chroma Vector Store ################################
# This is for semantic search.
# In the configuration cell at the top, we define all the chunk sizes and overlaps that we are interested in.
# A Chroma vector store is set up for each configuration and persisted in its own directory.
# These vector stores can be accessed in the main app later.
# The time taken to get the embeddings for every document chunk can be very long.
# Note: If we are using a lot more data than can fit in RAM, or when running in production,
# it is better to initialize a separate vector store on a server (Postgres, or online solutions like Pinecone) and push the document chunks to it bit by bit.

def setup_chromadb_vectorstore(all_documents, chunk_size, chunk_overlap, country):
    chromadb_dir = "chromadb"
    if not os.path.exists(chromadb_dir):
        os.makedirs(chromadb_dir)

    print(f"Processing Chunk Size: {chunk_size}, Chunk Overlap: {chunk_overlap}")
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    split_documents = text_splitter.split_documents(all_documents)
    persist_directory = f"{chromadb_dir}/new_{country}_chunk_{chunk_size}_overlap_{chunk_overlap}"

    # Build the vector database using Chroma and persist it in a local directory
    chroma_db = Chroma.from_documents(split_documents,
                                      hf_embeddings,
                                      persist_directory=persist_directory)
    chroma_db.persist()

    return True  # to let the user know this process is done

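# Illustrative sketch (an assumption, not part of the original file): build the persisted Chroma
# stores for every chunk size / overlap combination defined at the top, then reload one of them as
# a retriever that filters on the country metadata. The helper name and `countries` are hypothetical.
def example_build_and_reload_chroma(all_documents, countries=("Singapore",)):
    for country in countries:
        for chunk_size in chunk_sizes:
            for chunk_overlap in chunk_overlaps:
                setup_chromadb_vectorstore(all_documents, chunk_size, chunk_overlap, country)

    # reload one persisted store and filter retrieval by the country metadata added earlier
    persist_directory = f"chromadb/new_{countries[0]}_chunk_{chunk_sizes[0]}_overlap_{chunk_overlaps[0]}"
    chroma_db = Chroma(persist_directory=persist_directory, embedding_function=hf_embeddings)
    retriever = chroma_db.as_retriever(search_kwargs={"k": 4, "filter": {"country": countries[0]}})
    return retriever
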
################################ Set Up BM25 Retriever ################################
# This is for keyword search.

# BM25 is a keyword-based algorithm that performs well on queries containing keywords, without capturing the semantic meaning of the query terms,
# hence there is no need to embed the text with HuggingFaceEmbeddings and it is relatively fast to set up.
# We will use it in combination with the chroma_db vector store retriever in our application later, with an ensemble retriever to re-rank the results.
# The retriever is just a small file, so we store it using pickle, although this is still not recommended for production.

def setup_bm25_retriever(all_documents, chunk_size, chunk_overlap, country):
    bm25_dir = "bm25"
    if not os.path.exists(bm25_dir):
        os.makedirs(bm25_dir)

    print(f"Processing Chunk Size: {chunk_size}, Chunk Overlap: {chunk_overlap}, Country: {country}")
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    split_documents = text_splitter.split_documents(all_documents)
    split_documents = [doc for doc in split_documents if doc.metadata['country'] == country]
    bm25_retriever = BM25Retriever.from_documents(split_documents)
    filename = f"{bm25_dir}/new_{country}_chunk_{chunk_size}_overlap_{chunk_overlap}.pickle"

    with open(filename, 'wb') as handle:
        pickle.dump(bm25_retriever, handle)
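
# Illustrative sketch (an assumption, not part of the original file): load a pickled BM25 retriever
# and the matching persisted Chroma store, then combine them with an EnsembleRetriever as described
# in the comments above. The weights and the chosen chunk size / overlap are hypothetical.
from langchain.retrievers import EnsembleRetriever

def example_hybrid_retriever(country="Singapore", chunk_size=1000, chunk_overlap=100):
    # keyword retriever (BM25) restored from pickle
    with open(f"bm25/new_{country}_chunk_{chunk_size}_overlap_{chunk_overlap}.pickle", 'rb') as handle:
        bm25_retriever = pickle.load(handle)

    # semantic retriever restored from the persisted Chroma directory, filtered to the same country
    chroma_db = Chroma(
        persist_directory=f"chromadb/new_{country}_chunk_{chunk_size}_overlap_{chunk_overlap}",
        embedding_function=hf_embeddings,
    )
    chroma_retriever = chroma_db.as_retriever(search_kwargs={"filter": {"country": country}})

    # blend keyword and semantic results; equal weights as a starting point
    return EnsembleRetriever(retrievers=[bm25_retriever, chroma_retriever], weights=[0.5, 0.5])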