# Ingestion script: parse a 10-K filing PDF and index it into a Chroma vector store.
import os

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import UnstructuredFileLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_core.documents import Document
def ingest_documents() -> None:
    """
    Ingest the first PDF in the data/ directory (Edgar 10-K filing
    data for Nike) into a persistent Chroma vector store.

    The document is split into overlapping chunks, embedded with a
    sentence-transformers model, and written to the "xeon-rag"
    collection persisted under /tmp/xeon_rag_db.

    Raises:
        FileNotFoundError: if the data/ directory contains no files.
    """
    data_path = "data/"
    # Sort for determinism — os.listdir returns entries in arbitrary order.
    entries = sorted(os.listdir(data_path))
    if not entries:
        raise FileNotFoundError(f"No documents found in {data_path!r}")
    doc_path = os.path.join(data_path, entries[0])
    print("Parsing 10k filing doc for NIKE", doc_path)

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1500, chunk_overlap=100, add_start_index=True
    )
    loader = UnstructuredFileLoader(doc_path, mode="single", strategy="fast")
    chunks = loader.load_and_split(text_splitter)
    print("Done preprocessing. Created", len(chunks), "chunks of the original pdf")

    # Create vectorstore
    embedder = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )
    # Re-wrap each chunk as a plain Document, keeping only text + metadata.
    documents = [
        Document(page_content=chunk.page_content, metadata=chunk.metadata)
        for chunk in chunks
    ]
    # Add to vectorDB (embeds and persists in one call).
    _ = Chroma.from_documents(
        documents=documents,
        collection_name="xeon-rag",
        embedding=embedder,
        persist_directory="/tmp/xeon_rag_db",
    )
# Script entry point: run the ingestion when executed directly.
if __name__ == "__main__":
    ingest_documents()