added embeddings for new books
Browse files
- Makefile +5 -1
- data/ai_books/index.faiss +2 -2
- data/ai_books/index.pkl +2 -2
- ingest.py +7 -6
Makefile
CHANGED
@@ -50,7 +50,11 @@ format:
|
|
50 |
black .
|
51 |
|
52 |
install:
|
53 |
-
pip install -
|
|
|
|
|
|
|
|
|
54 |
pip show langchain transformers
|
55 |
|
56 |
install-extra:
|
|
|
50 |
black .
|
51 |
|
52 |
install:
|
53 |
+
pip install -r requirements.txt
|
54 |
+
pip show langchain transformers
|
55 |
+
|
56 |
+
install:
|
57 |
+
pip install -r requirements-mac.txt
|
58 |
pip show langchain transformers
|
59 |
|
60 |
install-extra:
|
data/ai_books/index.faiss
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:57cf906d0a49d48c53ef8bfe9c107d035d2f0a15bd4e57a2d8f5560960db239f
|
3 |
+
size 110456877
|
data/ai_books/index.pkl
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:5091df974d4a7c0c832619b0acaae195fa69ab37f7cd18873459c11c3a537494
|
3 |
+
size 37484917
|
ingest.py
CHANGED
@@ -3,8 +3,8 @@ import os
|
|
3 |
from timeit import default_timer as timer
|
4 |
from typing import List
|
5 |
|
6 |
-
from
|
7 |
-
from
|
8 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
9 |
from langchain.vectorstores.base import VectorStore
|
10 |
from langchain.vectorstores.chroma import Chroma
|
@@ -81,7 +81,7 @@ if not os.path.isdir(index_path):
|
|
81 |
)
|
82 |
os.mkdir(index_path)
|
83 |
|
84 |
-
if source_urls
|
85 |
# Open the file for reading
|
86 |
file = open(source_urls, "r")
|
87 |
|
@@ -93,10 +93,11 @@ if not os.path.isdir(index_path):
|
|
93 |
|
94 |
# Remove the newline characters from each string
|
95 |
source_urls = [line.strip() for line in lines]
|
|
|
|
|
|
|
|
|
96 |
|
97 |
-
print(
|
98 |
-
f"Loading {'' if source_urls is None else str(len(source_urls)) + ' '}PDF files from {source_pdfs_path}"
|
99 |
-
)
|
100 |
sources = load_documents(source_pdfs_path, source_urls)
|
101 |
|
102 |
print(f"Splitting {len(sources)} PDF pages in to chunks ...")
|
|
|
3 |
from timeit import default_timer as timer
|
4 |
from typing import List
|
5 |
|
6 |
+
from langchain_community.document_loaders import PyPDFDirectoryLoader
|
7 |
+
from langchain_community.embeddings import HuggingFaceInstructEmbeddings
|
8 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
9 |
from langchain.vectorstores.base import VectorStore
|
10 |
from langchain.vectorstores.chroma import Chroma
|
|
|
81 |
)
|
82 |
os.mkdir(index_path)
|
83 |
|
84 |
+
if source_urls:
|
85 |
# Open the file for reading
|
86 |
file = open(source_urls, "r")
|
87 |
|
|
|
93 |
|
94 |
# Remove the newline characters from each string
|
95 |
source_urls = [line.strip() for line in lines]
|
96 |
+
print(f"Loading {len(source_urls)} PDF files from {source_pdfs_path}")
|
97 |
+
else:
|
98 |
+
source_urls = None
|
99 |
+
print(f"Loading PDF files from {source_pdfs_path}")
|
100 |
|
|
|
|
|
|
|
101 |
sources = load_documents(source_pdfs_path, source_urls)
|
102 |
|
103 |
print(f"Splitting {len(sources)} PDF pages in to chunks ...")
|