AFischer1985 committed on
Commit beaf90e
1 Parent(s): 7b16373

Update run.py

Files changed (1)
  1. run.py +48 -28
run.py CHANGED
@@ -2,7 +2,7 @@
  # Title: German AI-Interface with advanced RAG
  # Author: Andreas Fischer
  # Date: January 31st, 2023
- # Last update: February 25th, 2024
+ # Last update: February 26th, 2024
  ##########################################################################################
 
  #https://github.com/abetlen/llama-cpp-python/issues/306
@@ -30,7 +30,7 @@ dbPath = "/home/af/Schreibtisch/Code/gradio/Chroma/db"
  onPrem = True if(os.path.exists(dbPath)) else False
  if(onPrem==False): dbPath="/home/user/app/db"
 
- onPrem=False
+ #onPrem=True # uncomment to override automatic detection
  print(dbPath)
 
  #client = chromadb.Client()
@@ -164,12 +164,11 @@ else:
  import os
  import requests
  import subprocess
- modelPath="/home/af/gguf/models/Discolm_german_7b_v1.Q4_0.gguf"
+ #modelPath="/home/af/gguf/models/Discolm_german_7b_v1.Q4_0.gguf"
+ modelPath="/home/af/gguf/models/Mixtral-8x7b-instruct-v0.1.Q4_0.gguf"
  if(os.path.exists(modelPath)==False):
- #url="https://huggingface.co/TheBloke/WizardLM-13B-V1.2-GGUF/resolve/main/wizardlm-13b-v1.2.Q4_0.gguf"
- url="https://huggingface.co/TheBloke/Mixtral-8x7B-Instruct-v0.1-GGUF/resolve/main/mixtral-8x7b-instruct-v0.1.Q4_0.gguf?download=true"
- #url="https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.2-GGUF/resolve/main/mistral-7b-instruct-v0.2.Q4_0.gguf?download=true"
  #url="https://huggingface.co/TheBloke/DiscoLM_German_7b_v1-GGUF/resolve/main/discolm_german_7b_v1.Q4_0.gguf?download=true"
+ url="https://huggingface.co/TheBloke/Mixtral-8x7B-Instruct-v0.1-GGUF/resolve/main/mixtral-8x7b-instruct-v0.1.Q4_0.gguf?download=true"
  response = requests.get(url)
  with open("./Mixtral-8x7b-instruct.gguf", mode="wb") as file:
  file.write(response.content)
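Side note on the download step in this hunk: `requests.get(url)` followed by `response.content` buffers the whole Mixtral Q4_0 file (roughly 25 GB) in memory before writing it. A chunked, streaming download is one way around that; the following is only a sketch using the same `requests` library (the helper name and chunk size are assumptions, not part of the commit):

import requests

def download_gguf(url, target="./Mixtral-8x7b-instruct.gguf", chunk_size=16 * 1024 * 1024):
    # Stream the GGUF file to disk chunk by chunk instead of holding it all in RAM.
    with requests.get(url, stream=True, timeout=60) as r:
        r.raise_for_status()
        with open(target, mode="wb") as file:
            for chunk in r.iter_content(chunk_size=chunk_size):
                file.write(chunk)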
@@ -183,10 +182,15 @@ else:
  print("Server ready!")
 
 
+ #import llama_cpp
+ #llama_cpp.llama_backend_init(numa=False)
+ #params=llama_cpp.llama_context_default_params()
+ #params.n_ctx
+
  # Gradio-GUI
  #------------
-
- def extend_prompt(message="", history=None, system=None, RAGAddon=None, system2=None, zeichenlimit=None,historylimit=4): #float("Inf")
+ import re
+ def extend_prompt(message="", history=None, system=None, RAGAddon=None, system2=None, zeichenlimit=None,historylimit=4, removeHTML=True):
  startOfString=""
  if zeichenlimit is None: zeichenlimit=1000000000 # :-)
  template0=" [INST]{system}\n [/INST] </s>"
@@ -229,13 +233,18 @@ def extend_prompt(message="", history=None, system=None, RAGAddon=None, system2=
  prompt += template0.format(system=system) #"<s>"
  if history is not None:
  for user_message, bot_response in history[-historylimit:]:
- if user_message is not None: prompt += template1.format(message=user_message[:zeichenlimit]) #"[INST] {user_prompt} [/INST] "
- if bot_response is not None: prompt += template2.format(response=bot_response[:zeichenlimit]) #"{bot_response}</s> "
- if message is not None: prompt += template1.format(message=message[:zeichenlimit]) #"[INST] {message} [/INST]"
+ if user_message is None: user_message = ""
+ if bot_response is None: bot_response = ""
+ bot_response = re.sub("\n\n<details>((.|\n)*?)</details>","", bot_response) # remove RAG-components
+ if removeHTML==True: bot_response = re.sub("<(.*?)>","\n", bot_response) # remove HTML-components in general (may cause bugs with markdown-rendering)
+ if user_message is not None: prompt += template1.format(message=user_message[:zeichenlimit])
+ if bot_response is not None: prompt += template2.format(response=bot_response[:zeichenlimit])
+ if message is not None: prompt += template1.format(message=message[:zeichenlimit])
  if system2 is not None:
  prompt += system2
  return startOfString+prompt
 
+
  import gradio as gr
  import requests
  import json
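For illustration, the two new `re.sub` calls in this hunk strip the appended "Sources" `<details>` block and any remaining HTML tags from a stored bot response before it is templated back into the prompt. A minimal standalone sketch with an invented sample string:

import re

bot_response = "Dafür eignet sich Modell X.\n\n<details><summary><strong>Sources</strong></summary><br><ul><li>doc1</li></ul></details>"
bot_response = re.sub("\n\n<details>((.|\n)*?)</details>", "", bot_response)  # drop the appended RAG sources block
bot_response = re.sub("<(.*?)>", "\n", bot_response)                          # replace any remaining HTML tags with newlines
print(bot_response)  # -> Dafür eignet sich Modell X.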
@@ -244,7 +253,8 @@ import os
  import re
 
  def response(message, history):
- settings="Temporär"
+ settings="Memory Off"
+ removeHTML=True
 
  # Preprocessing to prevent simple forms of prompt injection:
  #----------------------------------------------------------
@@ -253,12 +263,12 @@ def response(message, history):
  message=message.replace("[/INST]","")
  message=re.sub("<[|](im_start|im_end|end_of_turn)[|]>", '', message)
 
- # Load Memory if settings=="Permanent"
+ # Load Memory if memory is turned on
  #-------------------------------------
- if (settings=="Permanent"):
+ if (settings=="Memory On"):
  if((len(history)==0)&(os.path.isfile(filename))): history=json.load(open(filename,'r',encoding="utf-8")) # retrieve history (if available)
 
- system="Du bist ein deutschsprachiges wortkarges KI-basiertes Assistenzsystem. Fasse dich kurz und verzichte auf Codebeispiele."
+ system="Du bist ein deutschsprachiges wortkarges KI-basiertes Assistenzsystem. Antworte kurz, in deutsche Sprache und verzichte auf HTML und Code jeder Art."
 
  #RAG-layer 0: Intention-RAG
  #---------------------------
@@ -354,7 +364,13 @@ def response(message, history):
  rag, # RAG-component added to the system prompt
  system2, # fictive first words of the AI (neither displayed nor stored)
  historylimit=historylimit # number of past messages to consider for response to current message
+ removeHTML=removeHTML # remove HTML-components from History (to prevent bugs with Markdown)
  )
+ #print("\n\nMESSAGE:"+str(message))
+ #print("\n\nHISTORY:"+str(history))
+ #print("\n\nSYSTEM:"+str(system))
+ #print("\n\nRAG:"+str(rag))
+ #print("\n\nSYSTEM2:"+str(system2))
  print("\n\n*** Prompt:\n"+prompt+"\n***\n\n")
 
  ## Request response from model
@@ -383,13 +399,14 @@ def response(message, history):
  part=text.token.text
  #print(part, end="", flush=True)
  response += part
+ if removeHTML==True: response = re.sub("<(.*?)>","\n", response) # remove HTML-components in general (may cause bugs with markdown-rendering)
  yield response
  if((myType=="1a")): #add RAG-results to chat-output if appropriate
- response2=response+"\n\n<br><details><summary><strong>Sources</strong></summary><br><ul>"+ "".join(["<li>" + s + "</li>" for s in combination])+"</ul></details>"
- yield response2
+ response=response+"\n\n<details><summary><strong>Sources</strong></summary><br><ul>"+ "".join(["<li>" + s + "</li>" for s in combination])+"</ul></details>"
+ yield response
  history.append((message, response)) # add current dialog to history
- # Store current state in DB if settings=="Permanent"
- if (settings=="Permanent"):
+ # Store current state in DB if memory is turned on
+ if (settings=="Memory On"):
  x=collection.get(include=[])["ids"] # add current dialog to db
  collection.add(
  documents=[message,response],
@@ -405,7 +422,8 @@ def response(message, history):
  # url="https://afischer1985-wizardlm-13b-v1-2-q4-0-gguf.hf.space/v1/completions"
  url="http://0.0.0.0:2600/v1/completions"
  body={"prompt":prompt,"max_tokens":None, "echo":"False","stream":"True"} # e.g. Mixtral-Instruct
- if("discolm_german_7b" in modelPath): body.update({"stop": ["<|im_end|>"]}) # fix stop-token of DiscoLM
+ if("Discolm_german_7b" in modelPath): body.update({"stop": ["<|im_end|>"]}) # fix stop-token of DiscoLM
+ if("Gemma-" in modelPath): body.update({"stop": ["<|im_end|>","</end_of_turn>"]}) # fix stop-token of Gemma
  response="" #+"("+myType+")\n"
  buffer=""
  #print("URL: "+url)
@@ -432,13 +450,13 @@ def response(message, history):
  except Exception as e:
  print("Exception:"+str(e))
  pass
+ if removeHTML==True: response = re.sub("<(.*?)>","\n", response) # remove HTML-components in general (may cause bugs with markdown-rendering)
  yield response
  if((myType=="1a")): #add RAG-results to chat-output if appropriate
- response2=response+"\n\n<br><details><summary><strong>Sources</strong></summary><br><ul>"+ "".join(["<li>" + s + "</li>" for s in combination])+"</ul></details>"
- yield response2
- history.append((message, response)) # add current dialog to history
- # Store current state in DB if settings=="Permanent"
- if (settings=="Permanent"):
+ response=response+"\n\n<details><summary><strong>Sources</strong></summary><br><ul>"+ "".join(["<li>" + s + "</li>" for s in combination])+"</ul></details>"
+ yield response
+ # Store current state in DB if memory is turned on
+ if (settings=="Memory On"):
  x=collection.get(include=[])["ids"] # add current dialog to db
  collection.add(
  documents=[message,response],
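The "Memory On" branch stores each dialog turn in the Chroma collection; the hunk cuts off after `documents=[message,response],`, so the remaining arguments are not visible here. A hedged, self-contained sketch of a `collection.add` call of that shape (collection name, id scheme and metadata are assumptions, not taken from the commit):

import chromadb

client = chromadb.PersistentClient(path="/home/user/app/db")   # dbPath fallback used earlier in run.py
collection = client.get_or_create_collection(name="dialog")    # collection name is an assumption
message, response = "Hallo!", "Hallo! Wie kann ich helfen?"     # invented sample turn

x = collection.get(include=[])["ids"]                          # ids already stored in the collection
collection.add(
    documents=[message, response],
    ids=[str(len(x)), str(len(x) + 1)],                        # hypothetical id scheme
    metadatas=[{"role": "user"}, {"role": "assistant"}],       # hypothetical metadata
)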
@@ -453,9 +471,11 @@ def response(message, history):
 
  gr.ChatInterface(
  response,
- chatbot=gr.Chatbot(value=[[None,"Herzlich willkommen! Ich bin ein KI-basiertes Assistenzsystem, das für jede Anfrage die am besten geeigneten KI-Tools empfiehlt.<br>Aktuell bin ich wenig mehr als eine Tech-Demo und kenne nur 7 KI-Modelle - also sei bitte nicht zu streng mit mir.<ul><li>wenn du ein KI-Modell suchst, antworte ich auf Basis der Liste</li><li>wenn du Fragen zur Benutzung eines KI-Modells hast, verweise ich an andere Stellen</li><li>wenn du andre Fragen hast, antworte ich frei und berücksichtige dabei Relevantes aus dem gesamten bisherigen Dialog.</li></ul><br>Was ist dein Anliegen?"]],render_markdown=True),
- title="German AI-Interface with advanced RAG",
- #additional_inputs=[gr.Dropdown(["Permanent","Temporär"],value="Temporär",label="Dialog sichern?")]
+ chatbot=gr.Chatbot(value=[[None,"Herzlich willkommen! Ich bin ein KI-basiertes Assistenzsystem, das für jede Anfrage die am besten geeigneten KI-Tools empfiehlt.\nAktuell bin ich wenig mehr als eine Tech-Demo und kenne nur 7 KI-Modelle - also sei bitte nicht zu streng mit mir.<ul><li>Wenn du ein KI-Modell suchst, antworte ich auf Basis der Liste</li><li>Wenn du Fragen zur Benutzung eines KI-Modells hast, verweise ich an andere Stellen</li><li>Wenn du andre Fragen hast, antworte ich frei und berücksichtige dabei Relevantes aus dem gesamten bisherigen Dialog.</li></ul>\nWas ist dein Anliegen?"]],render_markdown=True),
+ title="German AI-Interface with advanced RAG (on prem)" if onPrem else "German AI-Interface with advanced RAG (HFHub)",
+ #additional_inputs=[gr.Dropdown(["Memory On","Memory Off"],value="Memory Off",label="Memory")]
  ).queue().launch(share=True) #False, server_name="0.0.0.0", server_port=7864)
  print("Interface up and running!")
 
+
+