binqiangliu committed on
Commit
3c0fc42
·
1 Parent(s): 22c11b2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +18 -0
app.py CHANGED
@@ -66,4 +66,22 @@ tokenizer = initialize_tokenizer(model_name)
66
  # specify stop token ids
67
  stop_token_ids = [0]
68
 
 
 
 
69
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66
  # specify stop token ids
67
  stop_token_ids = [0]
68
 
69
+ # load pdf files
70
+ loader = PyPDFDirectoryLoader(pdf_files)
71
+ documents = loader.load()
72
 
73
+ # split the documents into small chunks
74
+ text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100) # Change the chunk_size and chunk_overlap as needed
75
+ all_splits = text_splitter.split_documents(documents)
76
+
77
+ # specify embedding model (using huggingface sentence transformer)
78
+ embedding_model_name = "sentence-transformers/all-mpnet-base-v2"
79
+ #model_kwargs = {"device": "cuda"}
80
+ #embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name, model_kwargs=model_kwargs)
81
+ embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name)
82
+
83
+ # embed the document chunks and store them in a Chroma vector store persisted under "chroma_db"
84
+ vectordb = Chroma.from_documents(documents=all_splits, embedding=embeddings, persist_directory="chroma_db")
85
+
86
+ # specify the retriever
87
+ retriever = vectordb.as_retriever()