AFischer1985 committed • Commit 0182410 • Parent(s): de6ea12

Update run.py
run.py CHANGED

@@ -111,41 +111,49 @@ def split_with_overlap(text,chunk_size=3500, overlap=700):
 def add_doc(path):
     print("def add_doc!")
     print(path)
-
+    anhang=False
+    if(str.lower(path).endswith(".pdf") and os.path.exists(path)):
         doc=convertPDF(path)
-
-
-
-
+        if(len(doc[0])>5):
+            gr.Info("PDF uploaded, start Indexing excerpt (first 5 pages)!")
+        else:
+            gr.Info("PDF uploaded, start Indexing!")
+        doc="\n\n".join(doc[0][0:5])
+        anhang=True
+
     client = chromadb.PersistentClient(path="output/general_knowledge")
     print(str(client.list_collections()))
     #global collection
     dbName="test"
-    if("name="+dbName in str(client.list_collections())):
-        client.delete_collection(name=dbName)
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    if(not "name="+dbName in str(client.list_collections())):
+        # client.delete_collection(name=dbName)
+        collection = client.create_collection(
+            name=dbName,
+            embedding_function=embeddingModel,
+            metadata={"hnsw:space": "cosine"})
+    else:
+        collection = client.get_collection(
+            name=dbName, embedding_function=embeddingModel)
+    if(anhang==True):
+        corpus=split_with_overlap(doc,3500,700)
+        print(len(corpus))
+        then = datetime.now()
+        x=collection.get(include=[])["ids"]
+        print(len(x))
+        if(len(x)==0):
+            chunkSize=40000
+            for i in range(round(len(corpus)/chunkSize+0.5)): #0 is first batch, 3 is last (incomplete) batch given 133497 texts
+                print("embed batch "+str(i)+" of "+str(round(len(corpus)/chunkSize+0.5)))
+                ids=list(range(i*chunkSize,(i*chunkSize+chunkSize)))
+                batch=corpus[i*chunkSize:(i*chunkSize+chunkSize)]
+                textIDs=[str(id) for id in ids[0:len(batch)]]
+                ids=[str(id+len(x)+1) for id in ids[0:len(batch)]] # id refers to chromadb-unique ID
+                collection.add(documents=batch, ids=ids,
+                    metadatas=[{"date": str("2024-10-10")} for b in batch]) #"textID":textIDs, "id":ids,
+                print("finished batch "+str(i)+" of "+str(round(len(corpus)/40000+0.5)))
+        now = datetime.now()
+        gr.Info(f"Indexing complete!")
+        print(now-then) # too many GB for sentences (GPU); 0:00:10.375087 for chunks
     return(collection)
 
 #split_with_overlap("test me if you can",2,1)
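The hunk header and the commented-out call at the bottom reference split_with_overlap(text, chunk_size=3500, overlap=700), whose body lies outside this hunk. For reference, a minimal sketch of a character-based sliding-window splitter with that signature could look as follows; this is an assumption about the helper's behaviour, not code taken from run.py:

# Hypothetical sketch; run.py's actual split_with_overlap is not shown in this diff.
def split_with_overlap(text, chunk_size=3500, overlap=700):
    """Split text into chunk_size-character windows that overlap by `overlap` characters."""
    step = chunk_size - overlap          # how far each window advances
    chunks = []
    for start in range(0, len(text), step):
        chunks.append(text[start:start + chunk_size])
        if start + chunk_size >= len(text):
            break                        # the last window already reaches the end of the text
    return chunks

Under this sketch, the commented-out test call split_with_overlap("test me if you can", 2, 1) would return two-character windows that each share one character with their neighbour: ["te", "es", "st", ...].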
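The substantive change in this commit is that the Chroma collection is no longer deleted and rebuilt on every upload: it is created only if missing, reused otherwise, and new chunks are appended in batches with IDs offset by the number of entries already stored. The same pattern, condensed into a standalone sketch; the sentence-transformers embedding function is a placeholder for the Space's embeddingModel (defined elsewhere in run.py), and get_or_create_collection is chromadb's shortcut for the create-or-get branch in the diff:

# Hedged sketch of the create-or-reuse plus batched-add pattern introduced above.
# The embedding function and model name are placeholders, not taken from run.py.
import math
from datetime import datetime

import chromadb
from chromadb.utils import embedding_functions

# Stand-in for run.py's `embeddingModel` (defined outside this hunk).
embedding_fn = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name="sentence-transformers/all-MiniLM-L6-v2")

client = chromadb.PersistentClient(path="output/general_knowledge")
# Equivalent to the new if/else: create the collection once, reuse it afterwards.
collection = client.get_or_create_collection(
    name="test",
    embedding_function=embedding_fn,
    metadata={"hnsw:space": "cosine"})

def index_chunks(corpus, batch_size=40000):
    """Add a list of text chunks to the collection in fixed-size batches."""
    existing = len(collection.get(include=[])["ids"])  # offset so IDs stay unique
    n_batches = math.ceil(len(corpus) / batch_size)    # cleaner than round(x + 0.5)
    then = datetime.now()
    for i in range(n_batches):
        batch = corpus[i * batch_size:(i + 1) * batch_size]
        ids = [str(existing + i * batch_size + j + 1) for j in range(len(batch))]
        collection.add(
            documents=batch,
            ids=ids,
            metadatas=[{"date": "2024-10-10"} for _ in batch])
        print("finished batch " + str(i + 1) + " of " + str(n_batches))
    print(datetime.now() - then)

Unlike the committed code, this sketch indexes the corpus whether or not the collection already holds documents; in the diff the embedding loop only runs when collection.get(include=[]) returns no IDs.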