Zaman committed on
Commit
444dc2c
1 Parent(s): 748323a

added index for more files

Browse files
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zst filter=lfs diff=lfs merge=lfs -text
34
  *tfevents* filter=lfs diff=lfs merge=lfs -text
35
  apple_amazon_intel_db/* filter=lfs diff=lfs merge=lfs -text
 
 
 
33
  *.zst filter=lfs diff=lfs merge=lfs -text
34
  *tfevents* filter=lfs diff=lfs merge=lfs -text
35
  apple_amazon_intel_db/* filter=lfs diff=lfs merge=lfs -text
36
+ db_index/* filter=lfs diff=lfs merge=lfs -text
37
+ db_index filter=lfs diff=lfs merge=lfs -text
bot.py CHANGED
@@ -11,13 +11,15 @@ from langchain.memory import ConversationBufferWindowMemory
11
  # ConversationalRetrievalChain()
12
 
13
 
14
- persist_direcory = "apple_amazon_intel_db"
15
  # persist_directory = "db"
16
 
17
  embeddings = OpenAIEmbeddings()
18
  # db = FAISS.load_local(persist_directory, embeddings)
19
  chroma = Chroma(embedding_function=embeddings, persist_directory=persist_direcory)
20
- retriever = chroma.as_retriever(search_type="mmr", search_kwargs={"k": 10})
 
 
21
 
22
  memory = ConversationBufferWindowMemory(
23
  memory_key="chat_history", return_messages=False
 
11
  # ConversationalRetrievalChain()
12
 
13
 
14
+ persist_direcory = "db_index"
15
  # persist_directory = "db"
16
 
17
  embeddings = OpenAIEmbeddings()
18
  # db = FAISS.load_local(persist_directory, embeddings)
19
  chroma = Chroma(embedding_function=embeddings, persist_directory=persist_direcory)
20
+ # retriever = chroma.as_retriever(search_type="mmr", search_kwargs={"k": 10})
21
+ retriever = chroma.as_retriever(search_kwargs={"k": 10})
22
+
23
 
24
  memory = ConversationBufferWindowMemory(
25
  memory_key="chat_history", return_messages=False
db_index/chroma-collections.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4a93b6f7695acd8c6266ae93fd7006576543ae90070656b3f14f139fe9ac1d96
3
+ size 557
db_index/chroma-embeddings.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ee9c8a25473ff3aeee5af2cca3ef440f0b80c6a3f4fb605928c61bedd1ffae71
3
+ size 3671568
db_index/index/id_to_uuid_c058f88c-340a-4016-92e2-428db1eee8d3.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1bfd2bd74cb66b0f5bc62c6f2a0f082782d9cdc4cb04feb89f122e4c8e945352
3
+ size 5824
db_index/index/index_c058f88c-340a-4016-92e2-428db1eee8d3.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:54ee98ef0848ead7a6e67b4cbe425c3fd7556eeab854d1040e0d333cdf16a4ec
3
+ size 1158108
db_index/index/index_metadata_c058f88c-340a-4016-92e2-428db1eee8d3.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f1476eae771b3d8b048407afe33482c83b9c0ac90067139e9a45f9cde061e4a5
3
+ size 103
db_index/index/uuid_to_id_c058f88c-340a-4016-92e2-428db1eee8d3.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e243f6aab9a67f825380ca1d8bd14685c4d6f09d2e1a75ec8c6884b06903a72d
3
+ size 6824
index.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Build a persistent Chroma vector index ("db_index") from a PDF corpus.

Loads every PDF under ./apple_amazon_intel, splits the documents into
token-bounded chunks, embeds them with a local sentence-transformers model,
and persists the resulting Chroma database to the "db_index" directory.
"""
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import DirectoryLoader, UnstructuredPDFLoader
import tiktoken


# Recursively pick up every PDF in the corpus directory.
loader = DirectoryLoader(
    "./apple_amazon_intel", glob="**/*.pdf", loader_cls=UnstructuredPDFLoader
)
documents = loader.load()

# Hoisted out of tiktoken_len: encoding_for_model() is comparatively expensive
# and the length function is invoked once per candidate split by the splitter.
_tokenizer = tiktoken.encoding_for_model("gpt-4")


def tiktoken_len(text):
    """Return the number of gpt-4 tokens in *text*.

    Used as the splitter's length metric so chunk_size/chunk_overlap are
    measured in tokens rather than characters.
    """
    return len(_tokenizer.encode(text, disallowed_special=()))


text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=4000,       # measured in tokens via tiktoken_len
    chunk_overlap=400,
    length_function=tiktoken_len,
    separators=["\n\n", "\n", " ", ""],
)
texts = text_splitter.split_documents(documents)

# Fixed typo: was "persist_direcory" (local name only; the on-disk path
# "db_index" is unchanged).
persist_directory = "db_index"

# NOTE(review): bot.py opens this same "db_index" store with OpenAIEmbeddings,
# but it is built here with a 384-dim MiniLM model. Index-time and query-time
# embedding models must match — confirm which embedding is intended.
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

db = Chroma.from_documents(
    texts, embedding=embeddings, persist_directory=persist_directory
)
db.persist()
print("done")