Improve search results with a tokenizer length function and by removing the TOC from the PDF.
Browse files
app.py
CHANGED
|
@@ -6,27 +6,30 @@ from langchain.embeddings import HuggingFaceEmbeddings
|
|
| 6 |
from langchain.llms import HuggingFacePipeline
|
| 7 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 8 |
from langchain.vectorstores import FAISS
|
|
|
|
| 9 |
|
| 10 |
|
| 11 |
# Number of search results to query from the vector database.
|
| 12 |
-
SIMILARITY_SEARCH_COUNT =
|
| 13 |
|
| 14 |
-
# Size of each document chunk in number of
|
| 15 |
-
CHUNK_SIZE =
|
| 16 |
|
| 17 |
-
# Chunk overlap in number of
|
| 18 |
-
CHUNK_OVERLAP =
|
| 19 |
|
| 20 |
# Maximum number of output tokens.
|
| 21 |
MODEL_MAX_LENGTH = 500
|
| 22 |
|
| 23 |
|
| 24 |
print("Loading documents")
|
| 25 |
-
loader = PyMuPDFLoader("rdna3
|
| 26 |
documents = loader.load()
|
| 27 |
|
| 28 |
print("Creating chunks")
|
| 29 |
-
|
|
|
|
|
|
|
| 30 |
chunks = splitter.split_documents(documents)
|
| 31 |
|
| 32 |
print("Creating database")
|
|
|
|
from langchain.llms import HuggingFacePipeline
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from transformers import GPT2TokenizerFast

# Number of search results to query from the vector database.
SIMILARITY_SEARCH_COUNT = 8

# Size of each document chunk in number of tokens.
CHUNK_SIZE = 100

# Chunk overlap in number of tokens.
CHUNK_OVERLAP = 10

# Maximum number of output tokens.
MODEL_MAX_LENGTH = 500


print("Loading documents")
# NOTE(review): PyMuPDFLoader is imported above this visible region — confirm.
loader = PyMuPDFLoader("rdna3.pdf")
documents = loader.load()

print("Creating chunks")
# Use a GPT-2 tokenizer as the length function so CHUNK_SIZE / CHUNK_OVERLAP
# are measured in tokens rather than characters, keeping chunks aligned with
# the model's token budget.
tokenizer = GPT2TokenizerFast.from_pretrained('gpt2')
splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
    tokenizer, chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
chunks = splitter.split_documents(documents)

print("Creating database")
rdna3-shader-instruction-set-architecture-feb-2023_0.pdf → rdna3.pdf
RENAMED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:81348dff61fc7b4912d86102a709a5a620819ee6d3e773a1da7a8d4c433fa45c
|
| 3 |
+
size 3053930
|
requirements.txt
CHANGED
|
@@ -1,6 +1,7 @@
|
|
| 1 |
langchain==0.0.162
|
| 2 |
faiss-cpu==1.7.4
|
| 3 |
huggingface-hub==0.14.1
|
| 4 |
-
sentence-transformers==2.2.2
|
| 5 |
protobuf==3.20.1
|
| 6 |
pymupdf==1.22.2
|
|
|
|
|
|
|
|
|
| 1 |
langchain==0.0.162
|
| 2 |
faiss-cpu==1.7.4
|
| 3 |
huggingface-hub==0.14.1
|
|
|
|
| 4 |
protobuf==3.20.1
|
| 5 |
pymupdf==1.22.2
|
| 6 |
+
sentence-transformers==2.2.2
|
| 7 |
+
transformers==4.28.1
|