Spaces:
Runtime error
Runtime error
Commit
·
3c0fc42
1
Parent(s):
22c11b2
Update app.py
Browse files
app.py
CHANGED
@@ -66,4 +66,22 @@ tokenizer = initialize_tokenizer(model_name)
|
|
66 |
# specify stop token ids
|
67 |
stop_token_ids = [0]
|
68 |
|
|
|
|
|
|
|
69 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
66 |
# specify stop token ids
|
67 |
stop_token_ids = [0]
|
68 |
|
69 |
+
# load pdf files
|
70 |
+
loader = PyPDFDirectoryLoader(pdf_files)
|
71 |
+
documents = loader.load()
|
72 |
|
73 |
+
# split the documents in small chunks
|
74 |
+
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100) # Change the chunk_size and chunk_overlap as needed
|
75 |
+
all_splits = text_splitter.split_documents(documents)
|
76 |
+
|
77 |
+
# specify embedding model (using huggingface sentence transformer)
|
78 |
+
embedding_model_name = "sentence-transformers/all-mpnet-base-v2"
|
79 |
+
#model_kwargs = {"device": "cuda"}
|
80 |
+
#embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name, model_kwargs=model_kwargs)
|
81 |
+
embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name)
|
82 |
+
|
83 |
+
#embed document chunks
|
84 |
+
vectordb = Chroma.from_documents(documents=all_splits, embedding=embeddings, persist_directory="chroma_db")
|
85 |
+
|
86 |
+
# specify the retriever
|
87 |
+
retriever = vectordb.as_retriever()
|