Update web_scrape_and_pdf_loader.py
web_scrape_and_pdf_loader.py
CHANGED
```diff
@@ -114,15 +114,13 @@ def pdf_loader(url, country):
 # Same as above but for pdf in local directory
 def pdf_loader_local(pdf_filename, country):
     try:
-        with open(pdf_filename, 'wb') as f: # save the pdf locally first
-            f.write(response.content)
         loader = PyPDFLoader(pdf_filename) # then use langchain loader to load it
         raw_pdf_documents = loader.load()
         raw_pdf_documents = add_country_metadata(raw_pdf_documents, country)
         return raw_pdf_documents
 
     except Exception as e:
-        print(f"Failed to load for {
+        print(f"Failed to load for {pdf_filename} {e}")
         return False
 
 # If link is just a HTML page, use Langchain WebBaseLoader to convert it to raw documents.
```
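The removed `open`/`f.write(response.content)` pair was left over from the URL-based `pdf_loader` above it: a local file needs no download step, and `response` is not defined in this function, so the old code raised a `NameError` on every call. The corrected error message also reports which file failed and why. A minimal usage sketch (the file path and country value are hypothetical):

```python
from web_scrape_and_pdf_loader import pdf_loader_local

# Hypothetical inputs; pdf_loader_local returns a list of LangChain Documents
# tagged with country metadata, or False if the PDF could not be parsed.
docs = pdf_loader_local("data/singapore_budget.pdf", "Singapore")
if docs:
    print(f"Loaded {len(docs)} pages; country tag: {docs[0].metadata['country']}")
```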
```diff
@@ -182,7 +180,7 @@ def process_links_load_documents(all_links):
 # Note: If we are using a lot more data than can be stored in the RAM or when in production,
 # better to initialize a separate vector store in a server (Postgres or online solutions like Pinecone) before pushing the document chunks to it bit by bit.
 
-def setup_chromadb_vectorstore(all_documents, chunk_size, chunk_overlap, country):
+def setup_chromadb_vectorstore(hf_embeddings, all_documents, chunk_size, chunk_overlap, country):
     chromadb_dir = "chromadb"
     if not os.path.exists(chromadb_dir):
         os.makedirs(chromadb_dir)
```
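The new leading `hf_embeddings` parameter implies the embedding model is now constructed once by the caller and shared across setup calls instead of being created inside the function. A sketch of what the call site might look like, assuming LangChain's `HuggingFaceEmbeddings` wrapper (the model name and argument values are assumptions, not taken from this repo):

```python
from langchain_community.embeddings import HuggingFaceEmbeddings

# Build the embeddings object once and reuse it for every country/chunking run.
hf_embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# all_documents: the list of LangChain Documents produced by the module's loaders.
setup_chromadb_vectorstore(hf_embeddings, all_documents,
                           chunk_size=1000, chunk_overlap=100,
                           country="Singapore")
```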
```diff
@@ -192,7 +190,7 @@ def setup_chromadb_vectorstore(all_documents, chunk_size, chunk_overlap, country
         chunk_size=chunk_size, chunk_overlap=chunk_overlap
     )
     split_documents = text_splitter.split_documents(all_documents)
-    persist_directory = f"{chromadb_dir}/new_{country}_chunk_{chunk_size}_overlap_{chunk_overlap}"
+    persist_directory = f"{chromadb_dir}/new_{country}_chunk_{chunk_size}_overlap_{chunk_overlap}_"
 
     # Build the vector database using Chroma and persist it in a local directory
     chroma_db = Chroma.from_documents(split_documents,
```
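The hunk ends mid-call, so the remaining arguments to `Chroma.from_documents` are not shown here. A plausible completion, assuming the passed-in `hf_embeddings` serves as the embedding function and the directory computed above as the persistence location; the trailing underscore added to `persist_directory` presumably keeps this commit's stores from colliding with previously persisted ones:

```python
from langchain_community.vectorstores import Chroma

# Assumed completion of the truncated call: embed each chunk with the shared
# HuggingFace model and persist the resulting collection on disk.
chroma_db = Chroma.from_documents(split_documents,
                                  embedding=hf_embeddings,
                                  persist_directory=persist_directory)
```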
```diff
@@ -222,8 +220,9 @@ def setup_bm25_retriever(all_documents, chunk_size, chunk_overlap, country):
     split_documents = text_splitter.split_documents(all_documents)
     split_documents = [doc for doc in split_documents if doc.metadata['country']==country]
     bm25_retriever = BM25Retriever.from_documents(split_documents)
-    filename = f"{bm25_dir}/new_{country}_chunk_{chunk_size}_overlap_{chunk_overlap}.pickle"
+    filename = f"{bm25_dir}/new_{country}_chunk_{chunk_size}_overlap_{chunk_overlap}_.pickle"
 
     with open(filename, 'wb') as handle:
         pickle.dump(bm25_retriever, handle)
 
+    return True # to let user know this process is done
```
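Because the BM25 retriever is pickled rather than persisted through a vector store, reading it back is symmetric with the write above. A sketch, using a hypothetical directory and parameter values that follow the new `_`-suffixed naming scheme:

```python
import pickle

# Hypothetical path; mirrors the filename written by setup_bm25_retriever.
# Unpickling requires langchain's BM25Retriever class to be importable.
filename = "bm25/new_Singapore_chunk_1000_overlap_100_.pickle"
with open(filename, "rb") as handle:
    bm25_retriever = pickle.load(handle)

results = bm25_retriever.invoke("What are the current income tax rates?")
```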