AFischer1985 committed on
Commit 0ad705b
1 Parent(s): e00a35e

Update run.py

Files changed (1)
  1. run.py +85 -41
run.py CHANGED
@@ -2,50 +2,70 @@
  # Title: Gradio Interface to LLM-chatbot with dynamic RAG-functionality and ChromaDB
  # Author: Andreas Fischer
  # Date: October 10th, 2024
- # Last update: October 12th, 2024
  ##########################################################################################
  
  import os
- import chromadb
- from datetime import datetime
- from chromadb import Documents, EmbeddingFunction, Embeddings
- from chromadb.utils import embedding_functions
- from transformers import AutoTokenizer, AutoModel
  import torch
  jina = AutoModel.from_pretrained('jinaai/jina-embeddings-v2-base-de', trust_remote_code=True, torch_dtype=torch.bfloat16)
  #jina.save_pretrained("jinaai_jina-embeddings-v2-base-de")
- device='cuda' if torch.cuda.is_available() else 'cpu'
- #device='cpu' #'cuda' if torch.cuda.is_available() else 'cpu'
  jina.to(device) #cuda:0
  print(device)
  
  class JinaEmbeddingFunction(EmbeddingFunction):
      def __call__(self, input: Documents) -> Embeddings:
          embeddings = jina.encode(input) #max_length=2048
          return(embeddings.tolist())
  
- dbPath = "/home/af/Schreibtisch/Code/gradio/Chroma/db"
  onPrem = True if(os.path.exists(dbPath)) else False
- if(onPrem==False): dbPath="/home/user/app/db"
  
- #onPrem=True # uncomment to override automatic detection
  print(dbPath)
- path=dbPath
- client = chromadb.PersistentClient(path=path)
  print(client.heartbeat())
  print(client.get_version())
  print(client.list_collections())
  jina_ef=JinaEmbeddingFunction()
  embeddingModel=jina_ef
  
- myModel="mistralai/Mixtral-8x7b-instruct-v0.1"
- #mod="mistralai/Mixtral-8x7b-instruct-v0.1"
- #tok=AutoTokenizer.from_pretrained(mod) #,token="hf_...")
- #cha=[{"role":"system","content":"A"},{"role":"user","content":"B"},{"role":"assistant","content":"C"}]
- #cha=[{"role":"user","content":"U1"},{"role":"assistant","content":"A1"},{"role":"user","content":"U2"},{"role":"assistant","content":"A2"}]
- #res=tok.apply_chat_template(cha)
- #print(tok.decode(res))
  
  def format_prompt0(message, history):
      prompt = "<s>"
@@ -56,6 +76,10 @@ def format_prompt0(message, history):
      return prompt
  
  def format_prompt(message, history, system=None, RAGAddon=None, system2=None, zeichenlimit=None,historylimit=4, removeHTML=False):
      if zeichenlimit is None: zeichenlimit=1000000000 # :-)
      startOfString="<s>" #<s> [INST] U1 [/INST] A1</s> [INST] U2 [/INST] A2</s>
@@ -71,8 +95,8 @@ def format_prompt(message, history, system=None, RAGAddon=None, system2=None, ze
      for user_message, bot_response in history[-historylimit:]:
          if user_message is None: user_message = ""
          if bot_response is None: bot_response = ""
-         #bot_response = re.sub("\n\n<details>((.|\n)*?)</details>","", bot_response) # remove RAG-components
-         if removeHTML==True: bot_response = re.sub("<(.*?)>","\n", bot_response) # remove HTML-components in general (may cause bugs with markdown-rendering)
          if user_message is not None: prompt += template1.format(message=user_message[:zeichenlimit])
          if bot_response is not None: prompt += template2.format(response=bot_response[:zeichenlimit])
      if message is not None: prompt += template1.format(message=message[:zeichenlimit])
@@ -81,8 +105,10 @@ def format_prompt(message, history, system=None, RAGAddon=None, system2=None, ze
      return startOfString+prompt
  
- from pypdf import PdfReader
- import ocrmypdf
  def convertPDF(pdf_file, allow_ocr=False):
      reader = PdfReader(pdf_file)
      full_text = ""
@@ -100,7 +126,7 @@ def convertPDF(pdf_file, allow_ocr=False):
          return full_text.strip(), page_count, page_list
      # Check if there are any images
      image_count = sum(len(page.images) for page in reader.pages)
-     # If there are images and not much content, perform OCR on the document
      if allow_ocr:
          print(f"{image_count} Images")
          if image_count > 0 and len(full_text) < 1000:
@@ -124,16 +150,24 @@ def convertPDF(pdf_file, allow_ocr=False):
      }
      return page_list, full_text, metadata
  
  def split_with_overlap(text,chunk_size=3500, overlap=700):
      chunks=[]
      step=max(1,chunk_size-overlap)
      for i in range(0,len(text),step):
          end=min(i+chunk_size,len(text))
-         #chunk = text[i:i+chunk_size]
          chunks.append(text[i:end])
      return chunks
  
  def add_doc(path, session):
      print("def add_doc!")
      print(path)
@@ -148,9 +182,8 @@ def add_doc(path, session):
          anhang=True
      else:
          gr.Info("No PDF attached - answer based on DB_"+str(session)+".")
-     client = chromadb.PersistentClient(path="output/general_knowledge")
      print(str(client.list_collections()))
-     #global collection
      print(str(session))
      dbName="DB_"+str(session)
      if(not "name="+dbName in str(client.list_collections())):
@@ -184,15 +217,14 @@ def add_doc(path, session):
      print(now-then) # too many GB for sentences (GPU); about 0:00:10.375087 for chunks
      return(collection)
  
- 
  #split_with_overlap("test me if you can",2,1)
- from datetime import date
- databases=[(date.today(),"0")] # list of all databases
  
- from huggingface_hub import InferenceClient
- import gradio as gr
- import re
- def multimodalResponse(message, history, dropdown, hfToken, request: gr.Request):
      print("def multimodal response!")
      if(hfToken.startswith("hf_")): # use HF-hub with custom token if token is provided
          inferenceClient = InferenceClient(model=myModel, token=hfToken)
@@ -213,7 +245,7 @@ def multimodalResponse(message, history, dropdown, hfToken, request: gr.Request)
          collection=add_doc(message["files"][0], session)
      else: # otherwise, you still want to get the collection with the session-based db
          collection=add_doc(message["text"], session)
-     client = chromadb.PersistentClient(path="output/general_knowledge")
      print(str(client.list_collections()))
      x=collection.get(include=[])["ids"]
      context=collection.query(query_texts=[query], n_results=1)
@@ -238,15 +270,27 @@ def multimodalResponse(message, history, dropdown, hfToken, request: gr.Request)
      #formatted_prompt = format_prompt0(system+"\n"+query, history)
      formatted_prompt = format_prompt(query, history,system=system)
      print(formatted_prompt)
-     stream = inferenceClient.text_generation(formatted_prompt, **generate_kwargs, stream=True, details=True, return_full_text=False)
      output = ""
-     for response in stream:
-         output += response.token.text
          yield output
-     #output=output+"\n\n<br><details open><summary><strong>Sources</strong></summary><br>"+str(context)+"</details>"
      yield output
  
- i=gr.ChatInterface(multimodalResponse,
      title="Frag dein PDF",
      multimodal=True,
      additional_inputs=[
 
run.py (resulting version):

  # Title: Gradio Interface to LLM-chatbot with dynamic RAG-functionality and ChromaDB
  # Author: Andreas Fischer
  # Date: October 10th, 2024
+ # Last update: October 14th, 2024
  ##########################################################################################
  
  import os
+ 
  import torch
+ from transformers import AutoTokenizer, AutoModel # embedding model (chromaDB)
+ from datetime import datetime, date # add_doc
+ import chromadb # chromaDB
+ from chromadb import Documents, EmbeddingFunction, Embeddings # chromaDB
+ from chromadb.utils import embedding_functions # chromaDB
+ import ocrmypdf # convertPDF
+ from pypdf import PdfReader # convertPDF
+ import re # format_prompt
+ import gradio as gr # multimodal_response
+ from huggingface_hub import InferenceClient # multimodal_response
+ 
+ 
+ #---------------------------------------------------
+ # Specify models for text generation and embeddings
+ #---------------------------------------------------
+ 
+ myModel="mistralai/Mixtral-8x7b-instruct-v0.1"
+ #mod="mistralai/Mixtral-8x7b-instruct-v0.1"
+ #tok=AutoTokenizer.from_pretrained(mod) #,token="hf_...")
+ #cha=[{"role":"system","content":"A"},{"role":"user","content":"B"},{"role":"assistant","content":"C"}]
+ #cha=[{"role":"user","content":"U1"},{"role":"assistant","content":"A1"},{"role":"user","content":"U2"},{"role":"assistant","content":"A2"}]
+ #res=tok.apply_chat_template(cha)
+ #print(tok.decode(res))
+ 
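Note: the commented-out block above probes Mixtral's chat template. A minimal sketch of that probe (assuming access to the gated Mixtral repo; passing tokenize=False returns the rendered template as a string, so the decode step is unnecessary):

```python
from transformers import AutoTokenizer

# Assumes the gated repo is accessible; pass token="hf_..." if needed.
tok = AutoTokenizer.from_pretrained("mistralai/Mixtral-8x7B-Instruct-v0.1")
cha = [{"role": "user", "content": "U1"}, {"role": "assistant", "content": "A1"},
       {"role": "user", "content": "U2"}, {"role": "assistant", "content": "A2"}]
print(tok.apply_chat_template(cha, tokenize=False))
# Expected shape: <s>[INST] U1 [/INST]A1</s>[INST] U2 [/INST]A2</s>
```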
  jina = AutoModel.from_pretrained('jinaai/jina-embeddings-v2-base-de', trust_remote_code=True, torch_dtype=torch.bfloat16)
  #jina.save_pretrained("jinaai_jina-embeddings-v2-base-de")
+ device='cuda:0' if torch.cuda.is_available() else 'cpu'
  jina.to(device) #cuda:0
  print(device)
  
+ 
+ #-----------------
+ # ChromaDB-client
+ #-----------------
+ 
  class JinaEmbeddingFunction(EmbeddingFunction):
      def __call__(self, input: Documents) -> Embeddings:
          embeddings = jina.encode(input) #max_length=2048
          return(embeddings.tolist())
  
+ dbPath = "/home/af/Schreibtisch/Code/gradio/Chroma/db/"
  onPrem = True if(os.path.exists(dbPath)) else False
+ if(onPrem==False): dbPath="/home/user/app/db/"
  
  print(dbPath)
+ client = chromadb.PersistentClient(path=dbPath)
  print(client.heartbeat())
  print(client.get_version())
  print(client.list_collections())
+ 
  jina_ef=JinaEmbeddingFunction()
  embeddingModel=jina_ef
+ databases=[(date.today(),"0")] # start a list of databases
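A quick smoke test for the embedding function (illustrative, not part of the commit): ChromaDB calls it with a list of documents and expects one vector per document.

```python
# Illustrative check: jina-embeddings-v2-base-de produces 768-dimensional vectors.
vecs = jina_ef(["Das ist ein Test.", "Noch ein Satz."])
print(len(vecs), len(vecs[0]))  # -> 2 768
```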
  
+ #---------------------------------------------------------------------
+ # Function for formatting single message according to prompt template
+ #---------------------------------------------------------------------
+ 
  def format_prompt0(message, history):
      prompt = "<s>"
  
      return prompt
  
+ #-------------------------------------------------------------------------
+ # Function for formatting multiturn-dialogue according to prompt template
+ #-------------------------------------------------------------------------
+ 
  def format_prompt(message, history, system=None, RAGAddon=None, system2=None, zeichenlimit=None,historylimit=4, removeHTML=False):
      if zeichenlimit is None: zeichenlimit=1000000000 # :-)
      startOfString="<s>" #<s> [INST] U1 [/INST] A1</s> [INST] U2 [/INST] A2</s>
  
      for user_message, bot_response in history[-historylimit:]:
          if user_message is None: user_message = ""
          if bot_response is None: bot_response = ""
+         bot_response = re.sub("\n\n<details>((.|\n)*?)</details>","", bot_response) # remove RAG-components
+         if removeHTML==True: bot_response = re.sub("<(.*?)>","\n", bot_response) # remove HTML-components in general (may cause bugs with markdown-rendering)
          if user_message is not None: prompt += template1.format(message=user_message[:zeichenlimit])
          if bot_response is not None: prompt += template2.format(response=bot_response[:zeichenlimit])
      if message is not None: prompt += template1.format(message=message[:zeichenlimit])
  
      return startOfString+prompt
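template1 and template2 are assigned in lines this hunk elides; the inline comment documents the intended Mixtral layout. A hedged reconstruction of what a one-turn history plus a new message should produce (the template strings are assumptions based on that comment):

```python
# Assumed templates, reconstructed from the comment above; not the commit's literal code.
startOfString = "<s>"
template1 = " [INST] {message} [/INST]"
template2 = " {response}</s>"

prompt = ""
for user_message, bot_response in [("U1", "A1")]:
    prompt += template1.format(message=user_message)
    prompt += template2.format(response=bot_response)
prompt += template1.format(message="U2")
print(startOfString + prompt)
# -> <s> [INST] U1 [/INST] A1</s> [INST] U2 [/INST]
```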
  
+ #--------------------------------------------
+ # Function for converting pdf-files to text
+ #--------------------------------------------
+ 
  def convertPDF(pdf_file, allow_ocr=False):
      reader = PdfReader(pdf_file)
      full_text = ""
  
          return full_text.strip(), page_count, page_list
      # Check if there are any images
      image_count = sum(len(page.images) for page in reader.pages)
+     # If there are images and not much content, you may want to perform OCR on the document
      if allow_ocr:
          print(f"{image_count} Images")
          if image_count > 0 and len(full_text) < 1000:
  
      }
      return page_list, full_text, metadata
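A minimal usage sketch (the filename is a placeholder): the function returns per-page texts, the concatenated text, and a metadata dict, and only attempts OCR when allow_ocr=True and the extracted text layer looks thin.

```python
# Hypothetical call; "example.pdf" is a placeholder.
page_list, full_text, metadata = convertPDF("example.pdf", allow_ocr=False)
print(metadata)            # metadata dict assembled above
print(page_list[0][:200])  # first 200 characters of page 1
```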
  
+ 
+ #------------------------------------------
+ # Function for splitting text with overlap
+ #------------------------------------------
+ 
  def split_with_overlap(text,chunk_size=3500, overlap=700):
      chunks=[]
      step=max(1,chunk_size-overlap)
      for i in range(0,len(text),step):
          end=min(i+chunk_size,len(text))
          chunks.append(text[i:end])
      return chunks
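Worked example: with chunk_size=4 and overlap=2 the step is max(1, 4-2)=2, so consecutive chunks share their last two characters.

```python
print(split_with_overlap("abcdefghij", chunk_size=4, overlap=2))
# -> ['abcd', 'cdef', 'efgh', 'ghij', 'ij']
```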
  
+ #---------------------------------------------------------------
+ # Function for adding docs to ChromaDB and/or return collection
+ #---------------------------------------------------------------
+ 
  def add_doc(path, session):
      print("def add_doc!")
      print(path)
  
          anhang=True
      else:
          gr.Info("No PDF attached - answer based on DB_"+str(session)+".")
+     client = chromadb.PersistentClient(path=dbPath)
      print(str(client.list_collections()))
      print(str(session))
      dbName="DB_"+str(session)
      if(not "name="+dbName in str(client.list_collections())):
  
      print(now-then) # too many GB for sentences (GPU); about 0:00:10.375087 for chunks
      return(collection)
  
  #split_with_overlap("test me if you can",2,1)
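The collection creation and chunk insertion happen in lines elided between the hunks above. A hedged sketch of that step, assuming the helpers shown in this file (the id scheme is an assumption):

```python
# Assumed shape of the elided body; not the commit's literal code.
collection = client.get_or_create_collection(name=dbName, embedding_function=jina_ef)
page_list, full_text, metadata = convertPDF(path)
chunks = split_with_overlap(full_text, chunk_size=3500, overlap=700)
collection.add(documents=chunks, ids=[str(i) for i in range(len(chunks))])
```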
  
+ 
+ #--------------------------------------------------------------
+ # Function for response to user queries and potential addenda
+ #--------------------------------------------------------------
+ 
+ def multimodal_response(message, history, dropdown, hfToken, request: gr.Request):
      print("def multimodal response!")
      if(hfToken.startswith("hf_")): # use HF-hub with custom token if token is provided
          inferenceClient = InferenceClient(model=myModel, token=hfToken)
  
          collection=add_doc(message["files"][0], session)
      else: # otherwise, you still want to get the collection with the session-based db
          collection=add_doc(message["text"], session)
+     client = chromadb.PersistentClient(path=dbPath)
      print(str(client.list_collections()))
      x=collection.get(include=[])["ids"]
      context=collection.query(query_texts=[query], n_results=1)
  
      #formatted_prompt = format_prompt0(system+"\n"+query, history)
      formatted_prompt = format_prompt(query, history,system=system)
      print(formatted_prompt)
      output = ""
+     try:
+         stream = inferenceClient.text_generation(formatted_prompt, **generate_kwargs, stream=True, details=True, return_full_text=False)
+         for response in stream:
+             output += response.token.text
+             yield output
+     except Exception as e:
+         output = "Für weitere Antworten von der KI gebe bitte einen gültigen HuggingFace-Token an."
+         if(len(context)>0):
+             output += "\nBis dahin helfen dir hoffentlich die folgenden Quellen weiter:"
          yield output
+         print(str(e))
+     if(len(context)>0):
+         output=output+"\n\n<br><details open><summary><strong>Quellen</strong></summary><br><ul>"+ "".join(["<li>" + c + "</li>" for c in context])+"</ul></details>"
      yield output
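generate_kwargs is defined in lines this diff elides; a plausible configuration for streaming Mixtral output via text_generation (values are assumptions, not the commit's settings):

```python
# Assumed example values; the actual generate_kwargs are not shown in this diff.
generate_kwargs = dict(
    temperature=0.7,         # sampling temperature
    max_new_tokens=512,      # cap on generated tokens
    top_p=0.95,              # nucleus sampling
    repetition_penalty=1.2,  # discourage loops
    do_sample=True,
)
```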
  
+ 
+ #------------------------------
+ # Launch Gradio-ChatInterface
+ #------------------------------
+ 
+ i=gr.ChatInterface(multimodal_response,
      title="Frag dein PDF",
      multimodal=True,
      additional_inputs=[