inflaton committed on
Commit
c6b090e
·
1 Parent(s): a28a4f8

added embeddings for new books

Browse files
Files changed (4) hide show
  1. Makefile +5 -1
  2. data/ai_books/index.faiss +2 -2
  3. data/ai_books/index.pkl +2 -2
  4. ingest.py +7 -6
Makefile CHANGED
@@ -50,7 +50,11 @@ format:
50
  black .
51
 
52
  install:
53
- pip install -U -r requirements.txt
 
 
 
 
54
  pip show langchain transformers
55
 
56
  install-extra:
 
50
  black .
51
 
52
  install:
53
+ pip install -r requirements.txt
54
+ pip show langchain transformers
55
+
56
+ install:
57
+ pip install -r requirements-mac.txt
58
  pip show langchain transformers
59
 
60
  install-extra:
data/ai_books/index.faiss CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:27612af17ee2ee1c73a4dc91772122c112c1f14ae280649e3b7ca5afeea1e7b9
3
- size 91047981
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:57cf906d0a49d48c53ef8bfe9c107d035d2f0a15bd4e57a2d8f5560960db239f
3
+ size 110456877
data/ai_books/index.pkl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9df9fa6831ab9736f93a877822bdaf3e472dea2ba6701ea63598a4447bfab463
3
- size 30845062
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5091df974d4a7c0c832619b0acaae195fa69ab37f7cd18873459c11c3a537494
3
+ size 37484917
ingest.py CHANGED
@@ -3,8 +3,8 @@ import os
3
  from timeit import default_timer as timer
4
  from typing import List
5
 
6
- from langchain.document_loaders import PyPDFDirectoryLoader
7
- from langchain.embeddings import HuggingFaceInstructEmbeddings
8
  from langchain.text_splitter import RecursiveCharacterTextSplitter
9
  from langchain.vectorstores.base import VectorStore
10
  from langchain.vectorstores.chroma import Chroma
@@ -81,7 +81,7 @@ if not os.path.isdir(index_path):
81
  )
82
  os.mkdir(index_path)
83
 
84
- if source_urls is not None:
85
  # Open the file for reading
86
  file = open(source_urls, "r")
87
 
@@ -93,10 +93,11 @@ if not os.path.isdir(index_path):
93
 
94
  # Remove the newline characters from each string
95
  source_urls = [line.strip() for line in lines]
 
 
 
 
96
 
97
- print(
98
- f"Loading {'' if source_urls is None else str(len(source_urls)) + ' '}PDF files from {source_pdfs_path}"
99
- )
100
  sources = load_documents(source_pdfs_path, source_urls)
101
 
102
  print(f"Splitting {len(sources)} PDF pages in to chunks ...")
 
3
  from timeit import default_timer as timer
4
  from typing import List
5
 
6
+ from langchain_community.document_loaders import PyPDFDirectoryLoader
7
+ from langchain_community.embeddings import HuggingFaceInstructEmbeddings
8
  from langchain.text_splitter import RecursiveCharacterTextSplitter
9
  from langchain.vectorstores.base import VectorStore
10
  from langchain.vectorstores.chroma import Chroma
 
81
  )
82
  os.mkdir(index_path)
83
 
84
+ if source_urls:
85
  # Open the file for reading
86
  file = open(source_urls, "r")
87
 
 
93
 
94
  # Remove the newline characters from each string
95
  source_urls = [line.strip() for line in lines]
96
+ print(f"Loading {len(source_urls)} PDF files from {source_pdfs_path}")
97
+ else:
98
+ source_urls = None
99
+ print(f"Loading PDF files from {source_pdfs_path}")
100
 
 
 
 
101
  sources = load_documents(source_pdfs_path, source_urls)
102
 
103
  print(f"Splitting {len(sources)} PDF pages in to chunks ...")