AFischer1985 committed
Commit
b7f29b3
1 Parent(s): a380202

Update run.py

Files changed (1): run.py +184 -87
run.py CHANGED
@@ -1,70 +1,51 @@
- #########################################################################################
- # Title: Gradio Interface to LLM-chatbot with RAG-functionality and ChromaDB on HF-Hub
  # Author: Andreas Fischer
- # Date: December 29th, 2023
- # Last update: December 31st, 2023
  ##########################################################################################
 
-
- # Chroma-DB
- #-----------
  import os
  import chromadb
- dbPath="/home/af/Schreibtisch/gradio/Chroma/db"
- if(os.path.exists(dbPath)==False):
-     dbPath="/home/user/app/db"
  print(dbPath)
- #client = chromadb.Client()
  path=dbPath
  client = chromadb.PersistentClient(path=path)
  print(client.heartbeat())
  print(client.get_version())
  print(client.list_collections())
- from chromadb.utils import embedding_functions
- default_ef = embedding_functions.DefaultEmbeddingFunction()
- sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="T-Systems-onsite/cross-en-de-roberta-sentence-transformer")
- #instructor_ef = embedding_functions.InstructorEmbeddingFunction(model_name="hkunlp/instructor-large", device="cuda")
- print(str(client.list_collections()))
-
- global collection
- if("name=ChromaDB1" in str(client.list_collections())):
-     print("ChromaDB1 found!")
-     collection = client.get_collection(name="ChromaDB1", embedding_function=sentence_transformer_ef)
- else:
-     print("ChromaDB1 created!")
-     collection = client.create_collection(
-         "ChromaDB1",
-         embedding_function=sentence_transformer_ef,
-         metadata={"hnsw:space": "cosine"})
-
-     collection.add(
-         documents=["The meaning of life is to love.", "This is a sentence", "This is a sentence too"],
-         metadatas=[{"source": "notion"}, {"source": "google-docs"}, {"source": "google-docs"}],
-         ids=["doc1", "doc2", "doc3"],
-     )
-
- print("Database ready!")
- print(collection.count())
 
 
- # Model
- #-------
-
  from huggingface_hub import InferenceClient
  import gradio as gr
-
- client = InferenceClient(
      "mistralai/Mixtral-8x7B-Instruct-v0.1"
      #"mistralai/Mistral-7B-Instruct-v0.1"
  )
-
-
- # Gradio-GUI
- #------------
-
- import gradio as gr
- import json
-
  def format_prompt(message, history):
      prompt = "<s>"
      #for user_prompt, bot_response in history:
@@ -73,45 +54,161 @@ def format_prompt(message, history):
      prompt += f"[INST] {message} [/INST]"
      return prompt
 
- def response(
-     prompt, history, temperature=0.9, max_new_tokens=500, top_p=0.95, repetition_penalty=1.0,
- ):
-     temperature = float(temperature)
-     if temperature < 1e-2: temperature = 1e-2
-     top_p = float(top_p)
-     generate_kwargs = dict(
-         temperature=temperature,
-         max_new_tokens=max_new_tokens,
-         top_p=top_p,
-         repetition_penalty=repetition_penalty,
          do_sample=True,
          seed=42,
-     )
-     addon=""
-     results=collection.query(
-         query_texts=[prompt],
-         n_results=2,
-         #where={"source": "google-docs"}
-         #where_document={"$contains":"search_string"}
-     )
-     dists=["<small>(relevance: "+str(round((1-d)*100)/100)+";" for d in results['distances'][0]]
-     sources=["source: "+s["source"]+")</small>" for s in results['metadatas'][0]]
-     results=results['documents'][0]
-     combination = zip(results,dists,sources)
-     combination = [' '.join(triplets) for triplets in combination]
-     print(combination)
-     if(len(results)>1):
-         addon=" Bitte berücksichtige bei deiner Antwort ggf. folgende Auszüge aus unserer Datenbank, sofern sie für die Antwort erforderlich sind. Beantworte die Frage knapp und präzise. Ignoriere unpassende Datenbank-Auszüge OHNE sie zu kommentieren, zu erwähnen oder aufzulisten:\n"+"\n".join(results)
-     system="Du bist ein KI-basiertes Assistenzsystem."+addon+"\n\nUser-Anliegen:"
-     #body={"prompt":system+"### Instruktion:\n"+message+"\n\n### Antwort:","max_tokens":500, "echo":"False","stream":"True"} #e.g. SauerkrautLM
-     formatted_prompt = format_prompt(system+"\n"+prompt, history)
-     stream = client.text_generation(formatted_prompt, **generate_kwargs, stream=True, details=True, return_full_text=False)
-     output = ""
-     for response in stream:
-         output += response.token.text
-         yield output
-     output=output+"\n\n<br><details open><summary><strong>Sources</strong></summary><br><ul>"+ "".join(["<li>" + s + "</li>" for s in combination])+"</ul></details>"
      yield output
 
- gr.ChatInterface(response, chatbot=gr.Chatbot(render_markdown=True),title="German RAG-Interface to the Hugging Face Hub").queue().launch(share=True) #False, server_name="0.0.0.0", server_port=7864)
- print("Interface up and running!")
 
+ ###########################################################################################
+ # Title: Gradio Interface to LLM-chatbot with dynamic RAG-functionality and ChromaDB
  # Author: Andreas Fischer
+ # Date: October 10th, 2024
+ # Last update: October 10th, 2024
  ##########################################################################################
 
  import os
  import chromadb
+ from datetime import datetime
+ from chromadb import Documents, EmbeddingFunction, Embeddings
+ from chromadb.utils import embedding_functions
+ from transformers import AutoTokenizer, AutoModel
+ import torch
+ jina = AutoModel.from_pretrained('jinaai/jina-embeddings-v2-base-de', trust_remote_code=True, torch_dtype=torch.bfloat16)
+ #jina.save_pretrained("jinaai_jina-embeddings-v2-base-de")
+ device='cuda' if torch.cuda.is_available() else 'cpu'
+ #device='cpu' # uncomment to force CPU
+ jina.to(device) #cuda:0
+ print(device)
+
+ class JinaEmbeddingFunction(EmbeddingFunction):
+     def __call__(self, input: Documents) -> Embeddings:
+         embeddings = jina.encode(input) #max_length=2048
+         return embeddings.tolist()
+
+ dbPath = "/home/af/Schreibtisch/Code/gradio/Chroma/db"
+ onPrem = os.path.exists(dbPath)
+ if(onPrem==False): dbPath="/home/user/app/db"
+
+ #onPrem=True # uncomment to override automatic detection
  print(dbPath)
  path=dbPath
  client = chromadb.PersistentClient(path=path)
  print(client.heartbeat())
  print(client.get_version())
  print(client.list_collections())
+ jina_ef=JinaEmbeddingFunction()
+ embeddingModel=jina_ef
 
 
  from huggingface_hub import InferenceClient
  import gradio as gr
+ import json
+ inferenceClient = InferenceClient(
      "mistralai/Mixtral-8x7B-Instruct-v0.1"
      #"mistralai/Mistral-7B-Instruct-v0.1"
  )
  def format_prompt(message, history):
      prompt = "<s>"
      #for user_prompt, bot_response in history:
      prompt += f"[INST] {message} [/INST]"
      return prompt
 
+
+
+ from pypdf import PdfReader
+ import ocrmypdf
+ def convertPDF(pdf_file, allow_ocr=False):
+     reader = PdfReader(pdf_file)
+     full_text = ""
+     page_list = []
+     def extract_text_from_pdf(reader):
+         full_text = ""
+         page_list = []
+         page_count = 1
+         for idx, page in enumerate(reader.pages):
+             text = page.extract_text()
+             if len(text) > 0:
+                 page_list.append(text)
+                 #full_text += f"---- Page {idx} ----\n" + text + "\n\n"
+             page_count += 1
+         return full_text.strip(), page_count, page_list
+     # Check if there are any images
+     image_count = sum(len(page.images) for page in reader.pages)
+     # If there are images and not much content, perform OCR on the document
+     if allow_ocr:
+         print(f"{image_count} Images")
+         if image_count > 0 and len(full_text) < 1000:
+             out_pdf_file = pdf_file.replace(".pdf", "_ocr.pdf")
+             ocrmypdf.ocr(pdf_file, out_pdf_file, force_ocr=True)
+             reader = PdfReader(out_pdf_file)
+     # Extract text:
+     full_text, page_count, page_list = extract_text_from_pdf(reader)
+     l = len(page_list)
+     print(f"{l} Pages")
+     # Extract metadata
+     metadata = {
+         "author": reader.metadata.author,
+         "creator": reader.metadata.creator,
+         "producer": reader.metadata.producer,
+         "subject": reader.metadata.subject,
+         "title": reader.metadata.title,
+         "image_count": image_count,
+         "page_count": page_count,
+         "char_count": len(full_text),
+     }
+     return page_list, full_text, metadata
+
+ def split_with_overlap(text, chunk_size=3500, overlap=700):
+     chunks=[]
+     step=max(1,chunk_size-overlap)
+     for i in range(0,len(text),step):
+         end=min(i+chunk_size,len(text))
+         #chunk = text[i:i+chunk_size]
+         chunks.append(text[i:end])
+     return chunks
+
+ def add_doc(path):
+     print("def add_doc!")
+     print(path)
+     if(str.lower(path).endswith(".pdf")):
+         doc=convertPDF(path)
+         doc="\n\n".join(doc[0])
+         gr.Info("PDF uploaded, starting indexing!")
+     else:
+         gr.Info("Error: Only PDFs are accepted!")
+         return None
+     client = chromadb.PersistentClient(path="output/general_knowledge")
+     print(str(client.list_collections()))
+     #global collection
+     dbName="test"
+     if("name="+dbName in str(client.list_collections())):
+         client.delete_collection(name=dbName)
+     collection = client.create_collection(
+         dbName,
+         embedding_function=embeddingModel,
+         metadata={"hnsw:space": "cosine"})
+     corpus=split_with_overlap(doc,3500,700)
+     print(len(corpus))
+     then = datetime.now()
+     x=collection.get(include=[])["ids"]
+     print(len(x))
+     if(len(x)==0):
+         chunkSize=40000
+         for i in range(round(len(corpus)/chunkSize+0.5)): # 0 is the first batch, 3 the last (incomplete) batch given 133497 texts
+             print("embed batch "+str(i)+" of "+str(round(len(corpus)/chunkSize+0.5)))
+             ids=list(range(i*chunkSize,(i*chunkSize+chunkSize)))
+             batch=corpus[i*chunkSize:(i*chunkSize+chunkSize)]
+             textIDs=[str(id) for id in ids[0:len(batch)]]
+             ids=[str(id+len(x)+1) for id in ids[0:len(batch)]] # id refers to the chromadb-unique ID
+             collection.add(documents=batch, ids=ids,
+                 metadatas=[{"date": str("2024-10-10")} for b in batch]) #"textID":textIDs, "id":ids,
+             print("finished batch "+str(i)+" of "+str(round(len(corpus)/40000+0.5)))
+     now = datetime.now()
+     gr.Info("Indexing complete!")
+     print(now-then) # too much GPU memory for sentence-wise embedding; 0:00:10.375087 for chunk-wise
+     return collection
+
+ #split_with_overlap("test me if you can",2,1)
+
+ import gradio as gr
+ import re
+ def multimodalResponse(message, history, headerPattern=None, sentenceWiseSplitting=None): # additional_inputs arrive positionally; defaults keep the call robust
+     print("def multimodal response!")
+     length=str(len(history))
+     query=message["text"]
+     client = chromadb.PersistentClient(path="output/general_knowledge")
+     print(str(client.list_collections()))
+     if(len(message["files"])>0): # is there at least one file attached?
+         collection=add_doc(message["files"][0])
+     else: # no new upload: reuse the previously indexed collection
+         collection=client.get_collection(name="test", embedding_function=embeddingModel)
+     x=collection.get(include=[])["ids"]
+     context=collection.query(query_texts=[query], n_results=1)
+     print(str(context))
+     #context=["<context "+str(i+1)+">\n"+c+"\n</context "+str(i+1)+">" for i, c in enumerate(retrievedTexts)]
+     #context="\n\n".join(context)
+     #return context
+     generate_kwargs = dict(
+         temperature=float(0.9),
+         max_new_tokens=5000,
+         top_p=0.95,
+         repetition_penalty=1.0,
          do_sample=True,
          seed=42,
+     )
+     system="Given the following conversation, relevant context, and a follow up question, "+\
+         "reply with an answer to the current question the user is asking. "+\
+         "Return only your response to the question given the above information "+\
+         "following the user's instructions as needed.\n\nContext:"+\
+         str(context)
+     print(system)
+     formatted_prompt = format_prompt(system+"\n"+query, history)
+     stream = inferenceClient.text_generation(formatted_prompt, **generate_kwargs, stream=True, details=True, return_full_text=False)
+     output = ""
+     for response in stream:
+         output += response.token.text
          yield output
+     #output=output+"\n\n<br><details open><summary><strong>Sources</strong></summary><br><ul>"+ "".join(["<li>" + s + "</li>" for s in combination])+"</ul></details>"
+     yield output
+
+ i=gr.ChatInterface(multimodalResponse,
+     title="pdfChatbot",
+     multimodal=True,
+     additional_inputs=[
+         gr.Dropdown(
+             info="select retrieval version",
+             choices=["1","2","3"],
+             value="1",
+             label="Retrieval Version")])
+ i.launch() #allowed_paths=["."])
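
A quick sanity check of the chunking logic used above (plain Python, no app dependencies): each chunk is at most chunk_size characters long, and consecutive chunks share overlap characters, because the stride between chunk starts is chunk_size - overlap.

def split_with_overlap(text, chunk_size=3500, overlap=700):
    chunks = []
    step = max(1, chunk_size - overlap)  # stride between chunk starts
    for i in range(0, len(text), step):
        chunks.append(text[i:min(i + chunk_size, len(text))])
    return chunks

print(split_with_overlap("abcdefghij", chunk_size=4, overlap=2))
# ['abcd', 'cdef', 'efgh', 'ghij', 'ij']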
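
The prompt formatting is unchanged between the two versions: the history loop is commented out, so each call wraps only the system text plus the current message in Mixtral's instruction tags. For example:

def format_prompt(message, history):
    prompt = "<s>"
    prompt += f"[INST] {message} [/INST]"
    return prompt

print(format_prompt("What is in the PDF?", []))
# <s>[INST] What is in the PDF? [/INST]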
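
A minimal sketch of how the custom embedding function plugs into ChromaDB, assuming the jinaai/jina-embeddings-v2-base-de weights can be downloaded; the in-memory client and the demo texts are illustrative, not part of the commit:

import chromadb
from chromadb import Documents, EmbeddingFunction, Embeddings
from transformers import AutoModel

jina = AutoModel.from_pretrained('jinaai/jina-embeddings-v2-base-de', trust_remote_code=True)

class JinaEmbeddingFunction(EmbeddingFunction):
    def __call__(self, input: Documents) -> Embeddings:
        return jina.encode(input).tolist()  # encode() is provided by the model's remote code

client = chromadb.Client()  # in-memory client, sufficient for a quick test
collection = client.create_collection("demo", embedding_function=JinaEmbeddingFunction(),
    metadata={"hnsw:space": "cosine"})
collection.add(documents=["Der Himmel ist blau.", "Katzen schlafen gern."], ids=["0", "1"])
print(collection.query(query_texts=["Welche Farbe hat der Himmel?"], n_results=1)["documents"])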
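
The streaming loop relies on huggingface_hub's text-generation API: with stream=True and details=True, text_generation yields one object per generated token, whose text fields are concatenated incrementally. A minimal sketch, assuming a valid Hugging Face token is configured in the environment:

from huggingface_hub import InferenceClient

client = InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1")
stream = client.text_generation("<s>[INST] Say hi [/INST]", max_new_tokens=20,
                                stream=True, details=True, return_full_text=False)
output = ""
for response in stream:
    output += response.token.text  # each item carries one generated token
print(output)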