danicafisher committed on
Commit
768b51c
·
verified ·
1 Parent(s): 2a571e8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +8 -1
app.py CHANGED
@@ -1,6 +1,6 @@
1
  import os
2
  from langchain_community.document_loaders import PyMuPDFLoader
3
- from langchain.text_splitter import RecursiveCharacterTextSplitter
4
  from langchain_qdrant import QdrantVectorStore
5
  from langchain_community.vectorstores import Qdrant
6
  from langchain.prompts import ChatPromptTemplate
@@ -35,6 +35,13 @@ text_splitter = RecursiveCharacterTextSplitter(
35
  )
36
  rag_documents = text_splitter.split_documents(documents)
37
 
 
 
 
 
 
 
 
38
  embedding = OpenAIEmbeddings(model="text-embedding-3-small")
39
 
40
  # Create the vector store
 
1
  import os
2
  from langchain_community.document_loaders import PyMuPDFLoader
3
+ from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
4
  from langchain_qdrant import QdrantVectorStore
5
  from langchain_community.vectorstores import Qdrant
6
  from langchain.prompts import ChatPromptTemplate
 
35
  )
36
  rag_documents = text_splitter.split_documents(documents)
37
 
38
+ # Alternative chunking: Tokens (more accurate for OpenAI models)
39
+ token_text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
40
+ encoding="cl100k_base", chunk_size=100, chunk_overlap=0
41
+ )
42
+ token_rag_documents = token_text_splitter.split_documents(documents)
43
+ # TO DO ^^ test
44
+
45
  embedding = OpenAIEmbeddings(model="text-embedding-3-small")
46
 
47
  # Create the vector store