Spaces:

binqiangliu
/

Zephyr7BAlpha

Runtime error

binqiangliu committed on Oct 23, 2023

Commit

3c0fc42

1 Parent(s): 22c11b2

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -66,4 +66,22 @@ tokenizer = initialize_tokenizer(model_name)
 # specify stop token ids
 stop_token_ids = [0]

 # specify stop token ids
 stop_token_ids = [0]
+# Load the PDF files referenced by `pdf_files` (defined outside this hunk; presumably a directory path -- confirm)
+loader = PyPDFDirectoryLoader(pdf_files)
+documents = loader.load()
+# Split the loaded documents into smaller, overlapping chunks (chunk_size=1000, chunk_overlap=100)
+text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100) # Change chunk_size and chunk_overlap as needed
+all_splits = text_splitter.split_documents(documents)
+# Specify the embedding model (a Hugging Face sentence-transformers model)
+embedding_model_name = "sentence-transformers/all-mpnet-base-v2"
+#model_kwargs = {"device": "cuda"}
+#embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name, model_kwargs=model_kwargs)
+embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name)
+# Embed the document chunks into a Chroma vector store, using "chroma_db" as its persist directory
+vectordb = Chroma.from_documents(documents=all_splits, embedding=embeddings, persist_directory="chroma_db")
+# Expose the vector store as a retriever (no arguments passed, so library defaults apply)
+retriever = vectordb.as_retriever()