aaromosshf2424 committed on
Commit
168dc64
·
1 Parent(s): 75d1e33

pulled solution from main

Browse files
Files changed (1) hide show
  1. app.py +53 -20
app.py CHANGED
@@ -11,6 +11,9 @@ from langchain_core.prompts import PromptTemplate
11
  from langchain.schema.output_parser import StrOutputParser
12
  from langchain.schema.runnable import RunnablePassthrough
13
  from langchain.schema.runnable.config import RunnableConfig
 
 
 
14
 
15
  # GLOBAL SCOPE - ENTIRE APPLICATION HAS ACCESS TO VALUES SET IN THIS SCOPE #
16
  # ---- ENV VARIABLES ---- #
@@ -37,8 +40,6 @@ HF_TOKEN = os.environ["HF_TOKEN"]
37
  3. Load HuggingFace Embeddings (remember to use the URL we set above)
38
  4. Index Files if they do not exist, otherwise load the vectorstore
39
  """
40
- ### 1. CREATE TEXT LOADER AND LOAD DOCUMENTS
41
- ### NOTE: PAY ATTENTION TO THE PATH THEY ARE IN.
42
  document_loader = TextLoader("./data/paul_graham_essays.txt")
43
  documents = document_loader.load()
44
 
@@ -51,25 +52,57 @@ hf_embeddings = HuggingFaceEndpointEmbeddings(
51
  huggingfacehub_api_token=HF_TOKEN,
52
  )
53
 
54
- if os.path.exists("./data/vectorstore"):
55
- vectorstore = FAISS.load_local(
56
- "./data/vectorstore",
57
- hf_embeddings,
58
- allow_dangerous_deserialization=True # this is necessary to load the vectorstore from disk as it's stored as a `.pkl` file.
59
- )
60
- hf_retriever = vectorstore.as_retriever()
61
- print("Loaded Vectorstore")
62
- else:
 
 
 
 
63
  print("Indexing Files")
64
- os.makedirs("./data/vectorstore", exist_ok=True)
65
- for i in range(0, len(split_documents), 32):
66
- if i == 0:
67
- vectorstore = FAISS.from_documents(split_documents[i:i+32], hf_embeddings)
68
- continue
69
- vectorstore.add_documents(split_documents[i:i+32])
70
- vectorstore.save_local("./data/vectorstore")
71
-
72
- hf_retriever = vectorstore.as_retriever()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
 
74
  # -- AUGMENTED -- #
75
  """
 
11
  from langchain.schema.output_parser import StrOutputParser
12
  from langchain.schema.runnable import RunnablePassthrough
13
  from langchain.schema.runnable.config import RunnableConfig
14
+ from tqdm.asyncio import tqdm_asyncio
15
+ import asyncio
16
+ from tqdm.asyncio import tqdm
17
 
18
  # GLOBAL SCOPE - ENTIRE APPLICATION HAS ACCESS TO VALUES SET IN THIS SCOPE #
19
  # ---- ENV VARIABLES ---- #
 
40
  3. Load HuggingFace Embeddings (remember to use the URL we set above)
41
  4. Index Files if they do not exist, otherwise load the vectorstore
42
  """
 
 
43
  document_loader = TextLoader("./data/paul_graham_essays.txt")
44
  documents = document_loader.load()
45
 
 
52
  huggingfacehub_api_token=HF_TOKEN,
53
  )
54
 
55
+ async def add_documents_async(vectorstore, documents):
56
+ await vectorstore.aadd_documents(documents)
57
+
58
+ async def process_batch(vectorstore, batch, is_first_batch, pbar):
59
+ if is_first_batch:
60
+ result = await FAISS.afrom_documents(batch, hf_embeddings)
61
+ else:
62
+ await add_documents_async(vectorstore, batch)
63
+ result = vectorstore
64
+ pbar.update(len(batch))
65
+ return result
66
+
67
+ async def main():
68
  print("Indexing Files")
69
+
70
+ vectorstore = None
71
+ batch_size = 32
72
+
73
+ batches = [split_documents[i:i+batch_size] for i in range(0, len(split_documents), batch_size)]
74
+
75
+ async def process_all_batches():
76
+ nonlocal vectorstore
77
+ tasks = []
78
+ pbars = []
79
+
80
+ for i, batch in enumerate(batches):
81
+ pbar = tqdm(total=len(batch), desc=f"Batch {i+1}/{len(batches)}", position=i)
82
+ pbars.append(pbar)
83
+
84
+ if i == 0:
85
+ vectorstore = await process_batch(None, batch, True, pbar)
86
+ else:
87
+ tasks.append(process_batch(vectorstore, batch, False, pbar))
88
+
89
+ if tasks:
90
+ await asyncio.gather(*tasks)
91
+
92
+ for pbar in pbars:
93
+ pbar.close()
94
+
95
+ await process_all_batches()
96
+
97
+ hf_retriever = vectorstore.as_retriever()
98
+ print("\nIndexing complete. Vectorstore is ready for use.")
99
+ return hf_retriever
100
+
101
+ async def run():
102
+ retriever = await main()
103
+ return retriever
104
+
105
+ hf_retriever = asyncio.run(run())
106
 
107
  # -- AUGMENTED -- #
108
  """