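"""Populate a local Chroma vector store from a directory of PDFs.

Loads every PDF in DATA_PATH, splits the pages into overlapping chunks,
embeds them via DeepInfra's hosted BAAI/bge-large-en-v1.5 endpoint, and
upserts only the chunks whose deterministic source:page:index IDs are not
already in the store.
"""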
import argparse
import os
import shutil
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.schema.document import Document
from langchain_community.vectorstores import Chroma
import json
import requests
from chromadb import Documents, EmbeddingFunction, Embeddings
CHROMA_PATH = "chroma"
DATA_PATH = "pdfs"
class MyEmbeddingFunction(EmbeddingFunction):
    def embed_documents(self, input: Documents) -> Embeddings:
        # Embed a batch of texts with DeepInfra's hosted
        # BAAI/bge-large-en-v1.5 model, retrying up to 5 times on
        # transient failures before giving up.
        url = "https://api.deepinfra.com/v1/inference/BAAI/bge-large-en-v1.5"
        payload = json.dumps({"inputs": input})
        headers = {
            'Accept': 'application/json, text/plain, */*',
            'Accept-Language': 'en-US,en;q=0.9,gu;q=0.8,ru;q=0.7,hi;q=0.6',
            'Connection': 'keep-alive',
            'Content-Type': 'application/json',
            'Origin': 'https://deepinfra.com',
            'Referer': 'https://deepinfra.com/',
            'Sec-Fetch-Dest': 'empty',
            'Sec-Fetch-Mode': 'cors',
            'Sec-Fetch-Site': 'same-site',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36',
            'sec-ch-ua': '"Not)A;Brand";v="99", "Google Chrome";v="127", "Chromium";v="127"',
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': '"Windows"'
        }
        last_error = None
        for _ in range(5):
            try:
                response = requests.post(url, headers=headers, data=payload, timeout=60)
                response.raise_for_status()
                return response.json()["embeddings"]
            except Exception as error:
                last_error = error
        raise RuntimeError(f"Embedding request failed after 5 attempts: {last_error}")
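
    def embed_query(self, text: str) -> list[float]:
        # LangChain's Chroma wrapper calls embed_query() when the store is
        # searched; the original class only defines embed_documents(), so this
        # minimal sketch assumes queries should go through the same DeepInfra
        # endpoint and simply embeds the query text as a one-element batch.
        return self.embed_documents([text])[0]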
def main():
    # Check if the database should be cleared (using the --reset flag).
    parser = argparse.ArgumentParser()
    parser.add_argument("--reset", action="store_true", help="Reset the database.")
    args = parser.parse_args()
    if args.reset:
        print("✨ Clearing Database")
        clear_database()

    # Create (or update) the data store.
    documents = load_documents()
    chunks = split_documents(documents)
    add_to_chroma(chunks)
def load_documents():
print("π Loading Documents")
document_loader = PyPDFDirectoryLoader(DATA_PATH)
return document_loader.load()
def split_documents(documents: list[Document]):
print("πͺ Splitting Documents")
text_splitter = RecursiveCharacterTextSplitter(
chunk_size=4000,
chunk_overlap=100,
length_function=len,
is_separator_regex=True
)
return text_splitter.split_documents(documents)
def add_to_chroma(chunks: list[Document]):
print("π Adding to Chroma")
# Load the existing database.
custom_embeddings = MyEmbeddingFunction()
db = Chroma(
persist_directory=CHROMA_PATH, embedding_function=custom_embeddings
)
# Calculate Page IDs.
chunks_with_ids = calculate_chunk_ids(chunks)
# Add or Update the documents.
existing_items = db.get(include=[]) # IDs are always included by default
existing_ids = set(existing_items["ids"])
print(f"Number of existing documents in DB: {len(existing_ids)}")
# Only add documents that don't exist in the DB.
new_chunks = []
for chunk in chunks_with_ids:
if chunk.metadata["id"] not in existing_ids:
new_chunks.append(chunk)
if len(new_chunks):
print(f"π Adding new documents: {len(new_chunks)}")
new_chunk_ids = [chunk.metadata["id"] for chunk in new_chunks]
for i in range(0, len(new_chunks), 100):
try:
db.add_documents(new_chunks[i:i+100], ids=new_chunk_ids[i:i+100])
db.persist()
print(f"Added {i+100} documents")
except:
pass
else:
print("β
No new documents to add")
def calculate_chunk_ids(chunks):
    # Create IDs like "pdfs/report.pdf:6:2" (source : page : chunk index).
    last_page_id = None
    current_chunk_index = 0

    for chunk in chunks:
        source = chunk.metadata.get("source")
        page = chunk.metadata.get("page")
        current_page_id = f"{source}:{page}"

        # If the page ID is the same as the last one, increment the index.
        if current_page_id == last_page_id:
            current_chunk_index += 1
        else:
            current_chunk_index = 0

        # Calculate the chunk ID.
        chunk_id = f"{current_page_id}:{current_chunk_index}"
        last_page_id = current_page_id

        # Add it to the page meta-data.
        chunk.metadata["id"] = chunk_id

    return chunks
def clear_database():
    if os.path.exists(CHROMA_PATH):
        shutil.rmtree(CHROMA_PATH)
if __name__ == "__main__":
    main()
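
# Typical usage, assuming this script is saved as populate_database.py and the
# PDFs to index live in ./pdfs:
#
#   python populate_database.py          # index any new chunks incrementally
#   python populate_database.py --reset  # wipe ./chroma and rebuild from scratch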