switch from Unstructured Loader to PyPDF as its results have page numbers
Browse files
- .env.example +2 -0
- README.md +1 -1
- app_modules/qa_chain.py +12 -1
- data/chromadb_1024_512/chroma-collections.parquet +1 -1
- data/chromadb_1024_512/chroma-embeddings.parquet +2 -2
- data/chromadb_1024_512/index/{id_to_uuid_67de6665-0585-4559-85bd-e044c61f64df.pkl → id_to_uuid_44a39155-bdc7-450c-8532-01db0e4b66cc.pkl} +2 -2
- data/chromadb_1024_512/index/{uuid_to_id_67de6665-0585-4559-85bd-e044c61f64df.pkl → index_44a39155-bdc7-450c-8532-01db0e4b66cc.bin} +2 -2
- data/chromadb_1024_512/index/{index_metadata_67de6665-0585-4559-85bd-e044c61f64df.pkl → index_metadata_44a39155-bdc7-450c-8532-01db0e4b66cc.pkl} +1 -1
- data/chromadb_1024_512/index/{index_67de6665-0585-4559-85bd-e044c61f64df.bin → uuid_to_id_44a39155-bdc7-450c-8532-01db0e4b66cc.pkl} +2 -2
- ingest.py +2 -4
.env.example
CHANGED
@@ -18,6 +18,8 @@ HF_PIPELINE_DEVICE_TYPE=
|
|
18 |
|
19 |
CHAT_HISTORY_ENABLED=true
|
20 |
|
|
|
|
|
21 |
# if unset, default to "hkunlp/instructor-xl"
|
22 |
HF_EMBEDDINGS_MODEL_NAME="hkunlp/instructor-large"
|
23 |
|
|
|
18 |
|
19 |
CHAT_HISTORY_ENABLED=true
|
20 |
|
21 |
+
PDF_FILE_BASE_URL=
|
22 |
+
|
23 |
# if unset, default to "hkunlp/instructor-xl"
|
24 |
HF_EMBEDDINGS_MODEL_NAME="hkunlp/instructor-large"
|
25 |
|
README.md
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
---
|
2 |
-
title: Chat
|
3 |
emoji: π
|
4 |
colorFrom: indigo
|
5 |
colorTo: blue
|
|
|
1 |
---
|
2 |
+
title: Chat with AI Books
|
3 |
emoji: π
|
4 |
colorFrom: indigo
|
5 |
colorTo: blue
|
app_modules/qa_chain.py
CHANGED
@@ -1,5 +1,6 @@
|
|
1 |
import os
|
2 |
import sys
|
|
|
3 |
from queue import Queue
|
4 |
from typing import Any, Optional
|
5 |
|
@@ -528,4 +529,14 @@ class QAChain:
|
|
528 |
self.streamer.reset(q)
|
529 |
|
530 |
qa = self.get_chain(tracing)
|
531 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import os
|
2 |
import sys
|
3 |
+
import urllib
|
4 |
from queue import Queue
|
5 |
from typing import Any, Optional
|
6 |
|
|
|
529 |
self.streamer.reset(q)
|
530 |
|
531 |
qa = self.get_chain(tracing)
|
532 |
+
result = qa(inputs)
|
533 |
+
|
534 |
+
base_url = os.environ.get("PDF_FILE_BASE_URL")
|
535 |
+
if base_url is not None:
|
536 |
+
documents = result["source_documents"]
|
537 |
+
for doc in documents:
|
538 |
+
source = doc.metadata["source"]
|
539 |
+
title = source.split("/")[-1]
|
540 |
+
doc.metadata["url"] = f"{base_url}{urllib.parse.quote(title)}"
|
541 |
+
|
542 |
+
return result
|
data/chromadb_1024_512/chroma-collections.parquet
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 557
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:1d0e4364f9a67d91e3185cc597297b8651ca02bdfddb8467767c8a71cbb89d4e
|
3 |
size 557
|
data/chromadb_1024_512/chroma-embeddings.parquet
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8b050c60c5fd263355aabc3cc35e6308930cb4b8a1929e7209b6777da0782d59
|
3 |
+
size 7513430
|
data/chromadb_1024_512/index/{id_to_uuid_67de6665-0585-4559-85bd-e044c61f64df.pkl → id_to_uuid_44a39155-bdc7-450c-8532-01db0e4b66cc.pkl}
RENAMED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:4eb4fe05362f4052e3af173b0915e9758bb7bc7f9f681850e765cbde35d8783f
|
3 |
+
size 47652
|
data/chromadb_1024_512/index/{uuid_to_id_67de6665-0585-4559-85bd-e044c61f64df.pkl → index_44a39155-bdc7-450c-8532-01db0e4b66cc.bin}
RENAMED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:5a26db7cd65749049856321b4aef559a0ffbef7f4286131c1bcd5f5dc4cc3849
|
3 |
+
size 4743996
|
data/chromadb_1024_512/index/{index_metadata_67de6665-0585-4559-85bd-e044c61f64df.pkl → index_metadata_44a39155-bdc7-450c-8532-01db0e4b66cc.pkl}
RENAMED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 105
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ae5e0c780f18efa625dc2d0ad2d60328b51d2842cac144446196e4032e7c2c43
|
3 |
size 105
|
data/chromadb_1024_512/index/{index_67de6665-0585-4559-85bd-e044c61f64df.bin → uuid_to_id_44a39155-bdc7-450c-8532-01db0e4b66cc.pkl}
RENAMED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:49aab2d749c5650688e4b1b566d8773889ca59d92ea2083d04fd5882a626ecc0
|
3 |
+
size 55737
|
ingest.py
CHANGED
@@ -3,9 +3,7 @@ import os
|
|
3 |
from timeit import default_timer as timer
|
4 |
from typing import List
|
5 |
|
6 |
-
import
|
7 |
-
from dotenv import load_dotenv
|
8 |
-
from langchain.document_loaders.directory import DirectoryLoader
|
9 |
from langchain.embeddings import HuggingFaceInstructEmbeddings
|
10 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
11 |
from langchain.vectorstores.chroma import Chroma
|
@@ -14,7 +12,7 @@ from app_modules.utils import *
|
|
14 |
|
15 |
|
16 |
def load_documents(source_pdfs_path) -> List:
|
17 |
-
loader =
|
18 |
documents = loader.load()
|
19 |
return documents
|
20 |
|
|
|
3 |
from timeit import default_timer as timer
|
4 |
from typing import List
|
5 |
|
6 |
+
from langchain.document_loaders import PyPDFDirectoryLoader
|
|
|
|
|
7 |
from langchain.embeddings import HuggingFaceInstructEmbeddings
|
8 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
9 |
from langchain.vectorstores.chroma import Chroma
|
|
|
12 |
|
13 |
|
14 |
def load_documents(source_pdfs_path) -> List:
|
15 |
+
loader = PyPDFDirectoryLoader(source_pdfs_path, silent_errors=True)
|
16 |
documents = loader.load()
|
17 |
return documents
|
18 |
|