File size: 1,299 Bytes
42a588a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
import os

from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Redis
from rag_redis.config import INDEX_NAME, INDEX_SCHEMA, REDIS_URL
from langchain.embeddings import OpenAIEmbeddings


def ingest_documents(data_path: str = "data/") -> None:
    """
    Ingest the documents found in a directory into a Redis vector index.

    Every regular file directly inside ``data_path`` is loaded with
    ``UnstructuredFileLoader``, split into overlapping text chunks, embedded
    with ``OpenAIEmbeddings`` and stored in the Redis index ``INDEX_NAME`` at
    ``REDIS_URL``. Afterwards ``write_schema(INDEX_SCHEMA)`` is called on the
    resulting vector store.

    If the directory contains no files, or the files yield no chunks, a
    message is printed and the function returns without touching Redis.

    Args:
        data_path: Directory to read documents from. Defaults to ``"data/"``,
            which is resolved against the current working directory.

    Raises:
        OSError: If ``data_path`` cannot be listed (for example
            ``FileNotFoundError`` when it does not exist).
    """
    # Only regular files are handed to the loader; sub-directories returned by
    # os.listdir are skipped. Sorting makes the ingestion order reproducible,
    # since os.listdir returns entries in arbitrary order.
    docs = [
        path
        for path in (
            os.path.join(data_path, name) for name in sorted(os.listdir(data_path))
        )
        if os.path.isfile(path)
    ]

    print("Parsing docs", docs)

    if not docs:
        print("No documents found in", data_path)
        return

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1500, chunk_overlap=100, add_start_index=True
    )
    chunks = []
    for doc in docs:
        loader = UnstructuredFileLoader(doc, mode="single", strategy="fast")
        chunks.extend(loader.load_and_split(text_splitter))

    # Guard before indexing chunks[0]: files with no extractable text would
    # otherwise surface as an unrelated-looking IndexError.
    if not chunks:
        print("No chunks were produced; nothing to ingest")
        return

    print("Chunk 0:", chunks[0])

    print("Done preprocessing. Created", len(chunks), "chunks of the original pdf")

    rds = Redis.from_texts(
        texts=[chunk.page_content for chunk in chunks],
        metadatas=[chunk.metadata for chunk in chunks],
        embedding=OpenAIEmbeddings(),
        index_name=INDEX_NAME,
        redis_url=REDIS_URL,
    )
    # NOTE(review): INDEX_SCHEMA is presumably a file path the schema is
    # written to - confirm against rag_redis.config.
    rds.write_schema(INDEX_SCHEMA)


# Run the ingestion only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    ingest_documents()