Zaman
committed on
Commit
•
444dc2c
1
Parent(s):
748323a
added index for more files
Browse files- .gitattributes +2 -0
- bot.py +4 -2
- db_index/chroma-collections.parquet +3 -0
- db_index/chroma-embeddings.parquet +3 -0
- db_index/index/id_to_uuid_c058f88c-340a-4016-92e2-428db1eee8d3.pkl +3 -0
- db_index/index/index_c058f88c-340a-4016-92e2-428db1eee8d3.bin +3 -0
- db_index/index/index_metadata_c058f88c-340a-4016-92e2-428db1eee8d3.pkl +3 -0
- db_index/index/uuid_to_id_c058f88c-340a-4016-92e2-428db1eee8d3.pkl +3 -0
- index.py +41 -0
.gitattributes
CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
34 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
35 |
apple_amazon_intel_db/* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
33 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
34 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
35 |
apple_amazon_intel_db/* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
db_index/* filter=lfs diff=lfs merge=lfs -text
|
37 |
+
db_index filter=lfs diff=lfs merge=lfs -text
|
bot.py
CHANGED
@@ -11,13 +11,15 @@ from langchain.memory import ConversationBufferWindowMemory
|
|
11 |
# ConversationalRetrievalChain()
|
12 |
|
13 |
|
14 |
-
persist_direcory = "
|
15 |
# persist_directory = "db"
|
16 |
|
17 |
embeddings = OpenAIEmbeddings()
|
18 |
# db = FAISS.load_local(persist_directory, embeddings)
|
19 |
chroma = Chroma(embedding_function=embeddings, persist_directory=persist_direcory)
|
20 |
-
retriever = chroma.as_retriever(search_type="mmr", search_kwargs={"k": 10})
|
|
|
|
|
21 |
|
22 |
memory = ConversationBufferWindowMemory(
|
23 |
memory_key="chat_history", return_messages=False
|
|
|
11 |
# ConversationalRetrievalChain()
|
12 |
|
13 |
|
14 |
+
persist_direcory = "db_index"
|
15 |
# persist_directory = "db"
|
16 |
|
17 |
embeddings = OpenAIEmbeddings()
|
18 |
# db = FAISS.load_local(persist_directory, embeddings)
|
19 |
chroma = Chroma(embedding_function=embeddings, persist_directory=persist_direcory)
|
20 |
+
# retriever = chroma.as_retriever(search_type="mmr", search_kwargs={"k": 10})
|
21 |
+
retriever = chroma.as_retriever(search_kwargs={"k": 10})
|
22 |
+
|
23 |
|
24 |
memory = ConversationBufferWindowMemory(
|
25 |
memory_key="chat_history", return_messages=False
|
db_index/chroma-collections.parquet
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:4a93b6f7695acd8c6266ae93fd7006576543ae90070656b3f14f139fe9ac1d96
|
3 |
+
size 557
|
db_index/chroma-embeddings.parquet
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ee9c8a25473ff3aeee5af2cca3ef440f0b80c6a3f4fb605928c61bedd1ffae71
|
3 |
+
size 3671568
|
db_index/index/id_to_uuid_c058f88c-340a-4016-92e2-428db1eee8d3.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:1bfd2bd74cb66b0f5bc62c6f2a0f082782d9cdc4cb04feb89f122e4c8e945352
|
3 |
+
size 5824
|
db_index/index/index_c058f88c-340a-4016-92e2-428db1eee8d3.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:54ee98ef0848ead7a6e67b4cbe425c3fd7556eeab854d1040e0d333cdf16a4ec
|
3 |
+
size 1158108
|
db_index/index/index_metadata_c058f88c-340a-4016-92e2-428db1eee8d3.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f1476eae771b3d8b048407afe33482c83b9c0ac90067139e9a45f9cde061e4a5
|
3 |
+
size 103
|
db_index/index/uuid_to_id_c058f88c-340a-4016-92e2-428db1eee8d3.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e243f6aab9a67f825380ca1d8bd14685c4d6f09d2e1a75ec8c6884b06903a72d
|
3 |
+
size 6824
|
index.py
ADDED
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from langchain.vectorstores import Chroma
|
2 |
+
from langchain.embeddings import HuggingFaceEmbeddings
|
3 |
+
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
4 |
+
from langchain.document_loaders import DirectoryLoader, UnstructuredPDFLoader
|
5 |
+
import tiktoken
|
6 |
+
|
7 |
+
|
8 |
+
# Recursively collect every PDF under ./apple_amazon_intel and parse each
# one with the Unstructured PDF loader.
loader = DirectoryLoader(
    "./apple_amazon_intel",
    glob="**/*.pdf",
    loader_cls=UnstructuredPDFLoader,
)
documents = loader.load()

# Earlier experiment with PyPDFLoader over ./data/, kept for reference:
# loader = DirectoryLoader("./data/", glob="**/*.pdf", loader_cls=PyPDFLoader)
# documents = loader.load()
# print(documents)
16 |
+
|
17 |
+
|
18 |
+
def tiktoken_len(text):
    """Return the number of GPT-4 tokens in *text*.

    Passed to RecursiveCharacterTextSplitter as ``length_function``, so it
    runs once per candidate chunk. The encoder is cached on the function
    object because ``tiktoken.encoding_for_model()`` rebuilds the BPE
    encoding on every call — far too slow to repeat per chunk.
    """
    tokenizer = getattr(tiktoken_len, "_encoding", None)
    if tokenizer is None:
        tokenizer = tiktoken.encoding_for_model("gpt-4")
        tiktoken_len._encoding = tokenizer
    # disallowed_special=() encodes special-token text (e.g. "<|endoftext|>")
    # as ordinary text instead of raising — same behavior as before.
    return len(tokenizer.encode(text, disallowed_special=()))
|
22 |
+
|
23 |
+
|
24 |
+
# Chunk the loaded documents with ~10% overlap, measuring size in GPT-4
# tokens (via tiktoken_len) rather than characters, and preferring to
# break on paragraph, then line, then word boundaries.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],
    chunk_size=4000,
    chunk_overlap=400,
    length_function=tiktoken_len,
)
texts = text_splitter.split_documents(documents)
|
31 |
+
|
32 |
+
# Embed every chunk and write the resulting Chroma index to disk.
# bot.py reads the index back from this same path.
persist_directory = "db_index"

# embeddings = OpenAIEmbeddings()  # earlier OpenAI-embeddings variant
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

db = Chroma.from_documents(
    texts,
    embedding=embeddings,
    persist_directory=persist_directory,
)
db.persist()
print("done")
|