Spaces:
Runtime error
Runtime error
File size: 1,299 Bytes
42a588a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 |
import os
from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Redis
from rag_redis.config import INDEX_NAME, INDEX_SCHEMA, REDIS_URL
from langchain.embeddings import OpenAIEmbeddings
def ingest_documents():
    """
    Ingest the files in the data/ directory into a Redis vector index.

    Every regular file directly inside ``data/`` is loaded with
    ``UnstructuredFileLoader`` and split into chunks of up to 1500
    characters with a 100 character overlap. The chunk texts and their
    metadata are then embedded with ``OpenAIEmbeddings`` and stored through
    ``Redis.from_texts`` under ``INDEX_NAME`` at ``REDIS_URL``; finally the
    index schema is written out via ``write_schema(INDEX_SCHEMA)``.

    Sub-directories of ``data/`` are skipped. When there is nothing to
    ingest (no files, or no chunks produced) a message is printed and the
    function returns without contacting Redis.

    Returns:
        None.

    Raises:
        FileNotFoundError: if the ``data/`` directory does not exist
            (propagated from ``os.listdir``).
    """
    # Collect the files to ingest. Sorting makes the ingestion order (and
    # therefore the "Chunk 0" preview below) reproducible across runs, and
    # the isfile() filter keeps sub-directories away from the file loader.
    data_path = "data/"
    docs = [
        os.path.join(data_path, file)
        for file in sorted(os.listdir(data_path))
        if os.path.isfile(os.path.join(data_path, file))
    ]
    print("Parsing docs", docs)
    if not docs:
        print("No files found in", data_path, "- nothing to ingest")
        return

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1500, chunk_overlap=100, add_start_index=True
    )
    chunks = []
    for doc in docs:
        loader = UnstructuredFileLoader(doc, mode="single", strategy="fast")
        chunks.extend(loader.load_and_split(text_splitter))

    # Guard against files that produced no text at all: indexing chunks[0]
    # below would otherwise fail with an IndexError.
    if not chunks:
        print("No chunks were produced from", docs, "- nothing to ingest")
        return

    print("Chunk 0:", chunks[0])
    print("Done preprocessing. Created", len(chunks), "chunks of the original pdf")

    rds = Redis.from_texts(
        texts=[chunk.page_content for chunk in chunks],
        metadatas=[chunk.metadata for chunk in chunks],
        embedding=OpenAIEmbeddings(),
        index_name=INDEX_NAME,
        redis_url=REDIS_URL,
    )
    rds.write_schema(INDEX_SCHEMA)
# Script entry point: run the ingestion only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    ingest_documents()
|