3gg committed on
Commit
047da88
1 Parent(s): 074f5a4

Improve search results with tokenizer length function and by removing TOC from the pdf.

Browse files
app.py CHANGED
@@ -6,27 +6,30 @@ from langchain.embeddings import HuggingFaceEmbeddings
6
  from langchain.llms import HuggingFacePipeline
7
  from langchain.text_splitter import RecursiveCharacterTextSplitter
8
  from langchain.vectorstores import FAISS
 
9
 
10
 
11
  # Number of search results to query from the vector database.
12
- SIMILARITY_SEARCH_COUNT = 3
13
 
14
- # Size of each document chunk in number of characters.
15
- CHUNK_SIZE = 800
16
 
17
- # Chunk overlap in number of characters.
18
- CHUNK_OVERLAP = 50
19
 
20
  # Maximum number of output tokens.
21
  MODEL_MAX_LENGTH = 500
22
 
23
 
24
  print("Loading documents")
25
- loader = PyMuPDFLoader("rdna3-shader-instruction-set-architecture-feb-2023_0.pdf")
26
  documents = loader.load()
27
 
28
  print("Creating chunks")
29
- splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
 
 
30
  chunks = splitter.split_documents(documents)
31
 
32
  print("Creating database")
 
6
  from langchain.llms import HuggingFacePipeline
7
  from langchain.text_splitter import RecursiveCharacterTextSplitter
8
  from langchain.vectorstores import FAISS
9
+ from transformers import GPT2TokenizerFast
10
 
11
 
12
  # Number of search results to query from the vector database.
13
+ SIMILARITY_SEARCH_COUNT = 8
14
 
15
+ # Size of each document chunk in number of tokens.
16
+ CHUNK_SIZE = 100
17
 
18
+ # Chunk overlap in number of tokens.
19
+ CHUNK_OVERLAP = 10
20
 
21
  # Maximum number of output tokens.
22
  MODEL_MAX_LENGTH = 500
23
 
24
 
25
  print("Loading documents")
26
+ loader = PyMuPDFLoader("rdna3.pdf")
27
  documents = loader.load()
28
 
29
  print("Creating chunks")
30
+ tokenizer = GPT2TokenizerFast.from_pretrained('gpt2')
31
+ splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
32
+ tokenizer, chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
33
  chunks = splitter.split_documents(documents)
34
 
35
  print("Creating database")
rdna3-shader-instruction-set-architecture-feb-2023_0.pdf → rdna3.pdf RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bff84b0bc818446356e73ba894149b8c810549fa240a9872b46179f412fcd13b
3
- size 3246429
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:81348dff61fc7b4912d86102a709a5a620819ee6d3e773a1da7a8d4c433fa45c
3
+ size 3053930
requirements.txt CHANGED
@@ -1,6 +1,7 @@
1
  langchain==0.0.162
2
  faiss-cpu==1.7.4
3
  huggingface-hub==0.14.1
4
- sentence-transformers==2.2.2
5
  protobuf==3.20.1
6
  pymupdf==1.22.2
 
 
 
1
  langchain==0.0.162
2
  faiss-cpu==1.7.4
3
  huggingface-hub==0.14.1
 
4
  protobuf==3.20.1
5
  pymupdf==1.22.2
6
+ sentence-transformers==2.2.2
7
+ transformers==4.28.1