import os, os.path

# to search web for results
from urllib.parse import urlparse, quote
import requests
from duckduckgo_search import DDGS

# to present web search results in a table
import pandas as pd

# to get document chunks, embed them and build the vector database
# 2 retrieval indexes are built: one for keyword search (BM25) and one for semantic search (Chroma)
from langchain.document_loaders import WebBaseLoader, PyPDFLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.retrievers import BM25Retriever

# for saving the BM25 retriever
# pickle is not for production use, just for prototyping
import pickle


# number of top search results to return from DuckDuckGo
top_n_results = 10

# Chroma vector stores will be set up for every combination of the chunk sizes and overlaps below
# building all of these vector stores can take a very long time
chunk_sizes = [500, 600, 700, 800, 900, 1000, 1250, 1500, 1750, 2000, 2250, 2500, 2750, 3000]
chunk_overlaps = [50, 100, 150, 200]



################################ Search the Web ################################
## Use DuckDuckGo to Search for Top N Results for Each Country's ESG Policies

# Use DuckDuckGo search to loop through each country and save the top N results for the query
# "{country} sustainability esg newest updated public policy document government"
# After some experimentation, the search phrase above seems to give the best results
# for the most recent ESG policies as it contains all the necessary keywords, but it can be changed
# Store the relevant links in a list of dictionaries
# Links are mostly HTML or PDF

def duckduckgo_scrape(country, search_term, n_search_results):
    all_links = []

    with DDGS() as ddgs:
        results = ddgs.text(search_term, max_results=n_search_results)
        for result in results:
            result['country'] = country
            all_links.append(result)

    # Collect the scraped links into a DataFrame
    df_links = pd.DataFrame(all_links).rename(columns = {
        'title': 'Title',
        'href': 'url',
        'body': 'Summarized Body',
        'country': 'Country'
    })
    # save scraped links into csv
    df_links.to_csv("duck_duck_go_scraped_links.csv")

    return all_links, df_links
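
# Illustrative usage (a sketch, not the actual driver code from the main app; the country
# name below is an assumption, the search phrase follows the template described above):
#
#   links, df_links = duckduckgo_scrape(
#       country="Singapore",
#       search_term="Singapore sustainability esg newest updated public policy document government",
#       n_search_results=top_n_results,
#   )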

################################ Load the Documents ################################
## For every search result returned by DuckDuckGo for each country above, scrape the web using the url and convert it to documents using Langchain loaders:
# PDF documents: if the link from the search result points to a PDF document,
# save the PDF permanently in local storage in the folder called 'pdfs', then use PyPDFLoader to convert it to raw documents.
# HTML documents: if the link is just an HTML page, use the Langchain WebBaseLoader to convert it to raw documents.
# Metadata: add the country to the metadata; this is an important step as it is needed by RetrievalQA for filtering later.
# For PDFs, Langchain populates the source with the local file path; we do not want this, so we change it back to the original online url.
# Save all the documents into a list called "all_documents".

# for adding country metadata
def add_country_metadata(docs, country):
    for doc in docs:
        doc.metadata['country'] = country
    return docs

# for adding source url metadata
def add_url_metadata(docs, url):
    for doc in docs:
        doc.metadata['source'] = url
    return docs

# If link from search result points to PDF document,
# save the PDF permanently in local storage in the folder called 'pdfs',
# then use PyPDFLoader to convert it to raw documents.
def pdf_loader(url, country):
    try:
        try:
            response = requests.get(url)
        except requests.exceptions.RequestException:
            # sometimes there is ssl error, and the page is actually http://
            url = url.replace("https://", "http://")
            response = requests.get(url)
        # create pdf directory to save pdfs locally
        pdf_dir = f"pdfs/{country}"
        if not os.path.exists(pdf_dir):
            os.makedirs(pdf_dir)
        pdf_filename = f"{pdf_dir}/{url.split('/')[-1]}"
        with open(pdf_filename, 'wb') as f: # save the pdf locally first
            f.write(response.content)
        loader = PyPDFLoader(pdf_filename) # then use langchain loader to load it
        raw_pdf_documents = loader.load()
        raw_pdf_documents = add_country_metadata(raw_pdf_documents, country)
        # pdf source data will be populated by Langchain as the local path
        # we do not want this, we change it back to the original path on the web instead
        raw_pdf_documents = add_url_metadata(raw_pdf_documents, url)
        return raw_pdf_documents
    except Exception as e:
        print(f"Failed to load PDF from {url}: {e}")
        return None

# Same as above but for pdf in local directory
def pdf_loader_local(pdf_filename, country):
    try:
        loader = PyPDFLoader(pdf_filename) # then use langchain loader to load it
        raw_pdf_documents = loader.load()
        raw_pdf_documents = add_country_metadata(raw_pdf_documents, country)
        return raw_pdf_documents

    except Exception as e:
        print(f"Failed to load for {pdf_filename} {e}")
        return False

# If link is just a HTML page, use Langchain WebBaseLoader to convert it to raw documents.
def html_loader(url, country):
    try:
        loader = WebBaseLoader(url)
        raw_html_documents = loader.load()
        raw_html_documents = add_country_metadata(raw_html_documents, country)
        return raw_html_documents
    except Exception as e:
        print(f"Failed to load HTML from {url}: {e}")

def process_links_load_documents(all_links):
    all_documents = [] # store all the documents

    for link in all_links:
        country = link['country']
        title = link['title']
        url = link['href']
        url = url.replace(" ", "%20")  # replace spaces with the URL-encoded version, e.g. %20

        # If url points to PDF documents
        if url.endswith('.pdf') or ('.pdf' in url and 'blob' in url):
            print(f"{country}: Loading PDF from {url}")
            docs = pdf_loader(url, country)
            if docs is not None: # if error, docs will be None
                if isinstance(docs, list):
                    all_documents.extend(docs)
                else:
                    all_documents.append(docs)
            #print(docs)

        # If url is just a HTML page
        else:
            print(f"{country}: Loading HTML from {url}")
            docs = html_loader(url, country)
            if docs is not None: # if error, docs will be None
                if isinstance(docs, list):
                    all_documents.extend(docs)
                else:
                    all_documents.append(docs)
            #print(docs)

    # the loaded documents contain a lot of '\n' characters, so perform some cleaning
    for document in all_documents:
        document.page_content = document.page_content.replace('\n', '')

    return all_documents
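
# Illustrative usage (a sketch; assumes `all_links` was produced by duckduckgo_scrape above):
#
#   all_documents = process_links_load_documents(all_links)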


################################ Set Up Chroma Vector Store ################################
# This is for semantic search.
# In the configuration section at the top, we define all the chunk sizes and overlaps that we are interested in.
# A Chroma vector store will be set up for each configuration and persisted in its own directory.
# These vector stores can be accessed in the main app later.
# Getting the embeddings for every document chunk can take a very long time.
# Note: if we are using far more data than fits in RAM, or when in production,
# it is better to initialize a separate vector store on a server (Postgres, or a hosted solution like Pinecone) and push the document chunks to it in batches.

def setup_chromadb_vectorstore(hf_embeddings, all_documents, chunk_size, chunk_overlap, country):
    chromadb_dir = "chromadb"
    if not os.path.exists(chromadb_dir):
        os.makedirs(chromadb_dir)

    print(f"Processing Chunk Size: {chunk_size}, Chunk Overlap: {chunk_overlap}")
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
        )
    split_documents = text_splitter.split_documents(all_documents)
    persist_directory = f"{chromadb_dir}/new_{country}_chunk_{chunk_size}_overlap_{chunk_overlap}_"

    # Build the vector database using Chroma and persist it in a local directory
    chroma_db = Chroma.from_documents(split_documents,
                                      hf_embeddings,
                                      persist_directory=persist_directory)
    chroma_db.persist()

    return True # to let user know this process is done
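
# Sketch of a driver loop that builds a Chroma store for every chunk size / overlap combination
# defined in the configuration at the top. The embedding model name below is an assumption
# (any HuggingFace sentence-transformers model can be substituted), and `country` stands for
# whichever country's documents are being indexed:
#
#   hf_embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
#   for chunk_size in chunk_sizes:
#       for chunk_overlap in chunk_overlaps:
#           setup_chromadb_vectorstore(hf_embeddings, all_documents,
#                                      chunk_size, chunk_overlap, country)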

################################ Set Up BM25 Retriever ################################
# This is for keyword search.

# BM25 is a keyword-based algorithm that performs well on queries containing keywords, without capturing the semantic meaning of the query terms,
# hence there is no need to embed the text with HuggingFaceEmbeddings and it is relatively fast to set up.
# We will use it in combination with the chroma_db vector store retriever in our application later, with an ensemble retriever to re-rank the results.
# The retriever is just a small file, so we store it using pickle, although this is not recommended for production.

def setup_bm25_retriever(all_documents, chunk_size, chunk_overlap, country):
    bm25_dir = "bm25"
    if not os.path.exists(bm25_dir):
        os.makedirs(bm25_dir)

    print(f"Processing Chunk Size: {chunk_size}, Chunk Overlap: {chunk_overlap}, Country: {country}")
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
        )
    split_documents = text_splitter.split_documents(all_documents)
    split_documents = [doc for doc in split_documents if doc.metadata['country']==country]
    bm25_retriever = BM25Retriever.from_documents(split_documents)
    filename = f"{bm25_dir}/new_{country}_chunk_{chunk_size}_overlap_{chunk_overlap}_.pickle"

    with open(filename, 'wb') as handle:
        pickle.dump(bm25_retriever, handle)

    return True # to let user know this process is done
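
# Sketch of how the persisted retrievers might be loaded back and combined in the main app
# (the paths follow the naming convention above; the 50/50 weights and the use of
# EnsembleRetriever are assumptions based on the comments, not the actual app code):
#
#   from langchain.retrievers import EnsembleRetriever
#
#   with open(f"bm25/new_{country}_chunk_{chunk_size}_overlap_{chunk_overlap}_.pickle", 'rb') as handle:
#       bm25_retriever = pickle.load(handle)
#
#   chroma_db = Chroma(persist_directory=f"chromadb/new_{country}_chunk_{chunk_size}_overlap_{chunk_overlap}_",
#                      embedding_function=hf_embeddings)
#
#   ensemble_retriever = EnsembleRetriever(retrievers=[bm25_retriever, chroma_db.as_retriever()],
#                                          weights=[0.5, 0.5])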