bohmian committed on
Commit 71339bd · verified · 1 Parent(s): dccdb1f

Create web_scrape_and_pdf_loader.py

Files changed (1)
  1. web_scrape_and_pdf_loader.py +229 -0
web_scrape_and_pdf_loader.py ADDED
@@ -0,0 +1,229 @@
import os, os.path

# to search the web for results
from urllib.parse import urlparse, quote
import requests
from duckduckgo_search import DDGS

# to present web search results in a table
import pandas as pd

# to get document chunks, embed them and build the vector databases
# two vector databases are built: one for keyword search (BM25), one for semantic search (Chroma)
from langchain.document_loaders import WebBaseLoader, PyPDFLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.retrievers import BM25Retriever

# for saving the bm25 retriever
# pickle is not for production, just for this prototype
import pickle


# number of top search results to return from DuckDuckGo
top_n_results = 10

# a Chroma vector store will be set up for every combination of the chunk sizes and overlaps below
# building all of these vector stores will take a very long time
chunk_sizes = [500, 600, 700, 800, 900, 1000, 1250, 1500, 1750, 2000, 2250, 2500, 2750, 3000]
chunk_overlaps = [50, 100, 150, 200]

# embeddings model used by setup_chromadb_vectorstore below
# (the default sentence-transformers model is assumed here; swap in another model name if needed)
hf_embeddings = HuggingFaceEmbeddings()


################################ Search the Web ################################
## Use DuckDuckGo to Search for Top N Results for Each Country's ESG Policies

# Use DuckDuckGo search to loop through each country and save the top N results by searching for
# "{country} sustainability esg newest updated public policy document government"
# After some experimentation the search phrase above seems to give the best results
# for the most recent ESG policies, as it contains all the necessary keywords, but it can be changed.
# Store the relevant links in a list of dictionaries; the links are mostly HTML or PDF.

def duckduckgo_scrape(country, search_term, n_search_results):
    all_links = []

    with DDGS() as ddgs:
        results = ddgs.text(search_term, max_results=n_search_results)
        for result in results:
            result['country'] = country
            all_links.append(result)

    # save the scraped links into a csv
    df_links = pd.DataFrame(all_links).rename(columns={
        'title': 'Title',
        'href': 'url',
        'body': 'Summarized Body',
        'country': 'Country'
    })
    df_links.to_csv("duck_duck_go_scraped_links.csv")

    return all_links, df_links

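# A minimal usage sketch of the scraper above: loop over a list of countries with the search
# phrase described in the comments and collect all the links. The helper name and the
# country list passed in are illustrative assumptions.
def scrape_all_countries(countries, n_search_results=top_n_results):
    all_links = []
    for country in countries:
        search_term = f"{country} sustainability esg newest updated public policy document government"
        links, _df_links = duckduckgo_scrape(country, search_term, n_search_results)
        all_links.extend(links)
    return all_links
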
################################ Load the Documents ################################
## For every search result returned by DuckDuckGo for each country above, scrape the page at the url and convert it to documents using Langchain loaders:
# PDF documents: if the link from the search result points to a PDF document,
#   save the PDF permanently in local storage in the folder called 'pdfs', then use PyPDFLoader to convert it to raw documents.
# HTML documents: if the link is just a HTML page, use Langchain WebBaseLoader to convert it to raw documents.
# Metadata: add the country to the metadata; this is an important step as it is needed by RetrievalQA for filtering later.
#   For PDFs, langchain will use the local path as the source, so we need to change it back to the online path.
# All the documents are saved into a list called "all_documents".

# for adding country metadata
def add_country_metadata(docs, country):
    for doc in docs:
        doc.metadata['country'] = country
    return docs

# for adding source url metadata
def add_url_metadata(docs, url):
    for doc in docs:
        doc.metadata['source'] = url
    return docs

# If the link from the search result points to a PDF document,
# save the PDF permanently in local storage in the folder called 'pdfs',
# then use PyPDFLoader to convert it to raw documents.
def pdf_loader(url, country):
    try:
        try:
            response = requests.get(url)
        except Exception:
            # sometimes there is an ssl error and the page is actually served over http://
            url = url.replace("https://", "http://")
            response = requests.get(url)
        # create the pdf directory to save pdfs locally
        pdf_dir = f"pdfs/{country}"
        if not os.path.exists(pdf_dir):
            os.makedirs(pdf_dir)
        pdf_filename = f"{pdf_dir}/{url.split('/')[-1]}"
        with open(pdf_filename, 'wb') as f:  # save the pdf locally first
            f.write(response.content)
        loader = PyPDFLoader(pdf_filename)  # then use the langchain loader to load it
        raw_pdf_documents = loader.load()
        raw_pdf_documents = add_country_metadata(raw_pdf_documents, country)
        # langchain populates the pdf source metadata with the local path;
        # we do not want this, so we change it back to the original path on the web
        raw_pdf_documents = add_url_metadata(raw_pdf_documents, url)
        return raw_pdf_documents
    except Exception as e:
        print(f"Failed to load {url}: {e}")

# Same as above but for a pdf already in a local directory
def pdf_loader_local(pdf_filename, country):
    try:
        loader = PyPDFLoader(pdf_filename)  # use the langchain loader on the local file
        raw_pdf_documents = loader.load()
        raw_pdf_documents = add_country_metadata(raw_pdf_documents, country)
        return raw_pdf_documents

    except Exception as e:
        print(f"Failed to load {pdf_filename}: {e}")
        return False

# If the link is just a HTML page, use Langchain WebBaseLoader to convert it to raw documents.
def html_loader(url, country):
    try:
        loader = WebBaseLoader(url)
        raw_html_documents = loader.load()
        raw_html_documents = add_country_metadata(raw_html_documents, country)
        return raw_html_documents
    except Exception as e:
        print(f"Failed to load {url}: {e}")

def process_links_load_documents(all_links):
    all_documents = []  # store all the documents

    for link in all_links:
        country = link['country']
        title = link['title']
        url = link['href']
        url = url.replace(" ", "%20")  # encode spaces in the url, e.g. %20

        # if the url points to a PDF document
        if url.endswith('.pdf') or (('.pdf' in url) and ('blob' in url)):
            print(f"{country}: Loading PDF from {url}")
            docs = pdf_loader(url, country)
            if docs is not None:  # if there was an error, docs will be None
                if isinstance(docs, list):
                    all_documents.extend(docs)
                else:
                    all_documents.append(docs)
            #print(docs)

        # if the url is just a HTML page
        else:
            print(f"{country}: Loading HTML from {url}")
            docs = html_loader(url, country)
            if docs is not None:  # if there was an error, docs will be None
                if isinstance(docs, list):
                    all_documents.extend(docs)
                else:
                    all_documents.append(docs)
            #print(docs)

    # the documents contain a lot of '\n', so perform some cleaning
    for document in all_documents:
        document.page_content = document.page_content.replace('\n', '')

    return all_documents

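# A minimal sketch chaining the scraping and loading steps, assuming `all_links` was produced
# by duckduckgo_scrape (or a loop such as scrape_all_countries above); the metadata printout
# is just illustrative.
def load_all_documents(all_links):
    all_documents = process_links_load_documents(all_links)
    # each document should now carry the 'country' and 'source' metadata used for filtering later
    for doc in all_documents[:3]:
        print(doc.metadata.get('country'), doc.metadata.get('source'))
    return all_documents
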
################################ Set Up Chroma Vector Store ################################
# This is for semantic search.
# In the configuration cell at the top we define all the chunk sizes and overlaps that we are interested in.
# A Chroma vector store is set up for each configuration and persisted in a different directory.
# These vector stores can be accessed in the main app later.
# The time taken to get the embeddings for every document chunk can be very long.
# Note: if we are using much more data than fits in RAM, or when in production, it is better to
# initialize a separate vector store on a server (Postgres, or online solutions like Pinecone)
# and push the document chunks to it bit by bit.

def setup_chromadb_vectorstore(all_documents, chunk_size, chunk_overlap, country):
    chromadb_dir = "chromadb"
    if not os.path.exists(chromadb_dir):
        os.makedirs(chromadb_dir)

    print(f"Processing Chunk Size: {chunk_size}, Chunk Overlap: {chunk_overlap}")
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    split_documents = text_splitter.split_documents(all_documents)
    persist_directory = f"{chromadb_dir}/new_{country}_chunk_{chunk_size}_overlap_{chunk_overlap}"

    # build the vector database using Chroma and persist it in a local directory
    chroma_db = Chroma.from_documents(split_documents,
                                      hf_embeddings,
                                      persist_directory=persist_directory)
    chroma_db.persist()

    return True  # to let the user know this process is done

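# A minimal sketch of building the full grid of Chroma stores for one country's documents, using
# the chunk_sizes and chunk_overlaps defined in the configuration at the top. The function name is
# illustrative; running the full grid can take a very long time.
def build_all_chroma_stores(all_documents, country):
    for chunk_size in chunk_sizes:
        for chunk_overlap in chunk_overlaps:
            setup_chromadb_vectorstore(all_documents, chunk_size, chunk_overlap, country)
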
################################ Set Up BM25 Retriever ################################
# This is for keyword search.

# BM25 is a keyword-based algorithm that performs well on queries containing keywords,
# without capturing the semantic meaning of the query terms. Hence there is no need to embed
# the text with HuggingFaceEmbeddings, and it is relatively faster to set up.
# We will use it in combination with the chroma_db vector store retriever in our application later,
# with an ensemble retriever to re-rank the results.
# The retriever is just a small file, so we store it using pickle; for production this is still not recommended.

def setup_bm25_retriever(all_documents, chunk_size, chunk_overlap, country):
    bm25_dir = "bm25"
    if not os.path.exists(bm25_dir):
        os.makedirs(bm25_dir)

    print(f"Processing Chunk Size: {chunk_size}, Chunk Overlap: {chunk_overlap}, Country: {country}")
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    split_documents = text_splitter.split_documents(all_documents)
    split_documents = [doc for doc in split_documents if doc.metadata['country'] == country]
    bm25_retriever = BM25Retriever.from_documents(split_documents)
    filename = f"{bm25_dir}/new_{country}_chunk_{chunk_size}_overlap_{chunk_overlap}.pickle"

    with open(filename, 'wb') as handle:
        pickle.dump(bm25_retriever, handle)
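

# A minimal sketch of how these stores are meant to be used in the app later: load the pickled
# BM25 retriever, reopen the matching Chroma store, and combine them with langchain's
# EnsembleRetriever to re-rank results. The function name, weights, k and the country filter
# usage are illustrative assumptions.
def load_ensemble_retriever(country, chunk_size, chunk_overlap, k=4):
    from langchain.retrievers import EnsembleRetriever

    with open(f"bm25/new_{country}_chunk_{chunk_size}_overlap_{chunk_overlap}.pickle", 'rb') as handle:
        bm25_retriever = pickle.load(handle)
    bm25_retriever.k = k

    chroma_db = Chroma(
        persist_directory=f"chromadb/new_{country}_chunk_{chunk_size}_overlap_{chunk_overlap}",
        embedding_function=hf_embeddings,
    )
    # the country metadata added during document loading enables this filter
    chroma_retriever = chroma_db.as_retriever(search_kwargs={'k': k, 'filter': {'country': country}})

    return EnsembleRetriever(retrievers=[bm25_retriever, chroma_retriever], weights=[0.5, 0.5])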
+