AFischer1985 committed on
Commit
5c691a5
1 Parent(s): d8b9fc9

Update run.py

Browse files
Files changed (1) hide show
  1. run.py +4 -4
run.py CHANGED
@@ -2,7 +2,7 @@
2
  # Title: Gradio Interface to LLM-chatbot with dynamic RAG-functionality and ChromaDB
3
  # Author: Andreas Fischer
4
  # Date: October 10th, 2024
5
- # Last update: October 14th, 2024
6
  ##########################################################################################
7
 
8
  import os
@@ -203,18 +203,18 @@ def add_doc(path, session):
203
  print(len(x))
204
  if(len(x)==0):
205
  chunkSize=40000
206
- for i in range(round(len(corpus)/chunkSize+0.5)): #0 is first batch, 3 is last (incomplete) batch given 133497 texts
207
  print("embed batch "+str(i)+" of "+str(round(len(corpus)/chunkSize+0.5)))
208
  ids=list(range(i*chunkSize,(i*chunkSize+chunkSize)))
209
  batch=corpus[i*chunkSize:(i*chunkSize+chunkSize)]
210
  textIDs=[str(id) for id in ids[0:len(batch)]]
211
  ids=[str(id+len(x)+1) for id in ids[0:len(batch)]] # id refers to chromadb-unique ID
212
  collection.add(documents=batch, ids=ids,
213
- metadatas=[{"date": str("2024-10-10")} for b in batch]) #"textID":textIDs, "id":ids,
214
  print("finished batch "+str(i)+" of "+str(round(len(corpus)/40000+0.5)))
215
  now = datetime.now()
216
  gr.Info(f"Indexing complete!")
217
- print(now-then) #zu viel GB für sentences (GPU), bzw. 0:00:10.375087 für chunks
218
  return(collection)
219
 
220
  #split_with_overlap("test me if you can",2,1)
 
2
  # Title: Gradio Interface to LLM-chatbot with dynamic RAG-functionality and ChromaDB
3
  # Author: Andreas Fischer
4
  # Date: October 10th, 2024
5
+ # Last update: October 15th, 2024
6
  ##########################################################################################
7
 
8
  import os
 
203
  print(len(x))
204
  if(len(x)==0):
205
  chunkSize=40000
206
+ for i in range(round(len(corpus)/chunkSize+0.5)):
207
  print("embed batch "+str(i)+" of "+str(round(len(corpus)/chunkSize+0.5)))
208
  ids=list(range(i*chunkSize,(i*chunkSize+chunkSize)))
209
  batch=corpus[i*chunkSize:(i*chunkSize+chunkSize)]
210
  textIDs=[str(id) for id in ids[0:len(batch)]]
211
  ids=[str(id+len(x)+1) for id in ids[0:len(batch)]] # id refers to chromadb-unique ID
212
  collection.add(documents=batch, ids=ids,
213
+ metadatas=[{"date": str("2024-10-10")} for b in batch])
214
  print("finished batch "+str(i)+" of "+str(round(len(corpus)/40000+0.5)))
215
  now = datetime.now()
216
  gr.Info(f"Indexing complete!")
217
+ print(now-then)
218
  return(collection)
219
 
220
  #split_with_overlap("test me if you can",2,1)