import json
import os
import subprocess

import requests
import gradio as gr

# Load the system prompt from a secret environment variable
# (default to an empty string so prompt concatenation never fails)
SYSTEM_PROMPT_SECRET = os.environ.get("HF_SYSTEM_PROMPT_SECRET", "")

# Download the GGUF model from Hugging Face, streaming it to disk so the
# multi-gigabyte file is never held in memory at once
url = "https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.2-GGUF/resolve/main/mistral-7b-instruct-v0.2.Q4_0.gguf?download=true"
with requests.get(url, stream=True) as download:
    download.raise_for_status()
    with open("./model.gguf", mode="wb") as file:
        for chunk in download.iter_content(chunk_size=1 << 20):
            file.write(chunk)
print("Model downloaded.")

# Start the llama-cpp-python server as a background process
command = [
    "python3", "-m", "llama_cpp.server",
    "--model", "./model.gguf",
    "--host", "0.0.0.0",
    "--port", "2600",
    "--n_threads", "2",
]
subprocess.Popen(command)
print("Model server starting...")


# Chat handler: sends the prompt to the local server and collects the
# streamed completion
def response(message, history):
    # Local server URL (OpenAI-compatible completions endpoint)
    url = "http://0.0.0.0:2600/v1/completions"
    body = {
        "prompt": SYSTEM_PROMPT_SECRET + message,  # prepend the system prompt
        "max_tokens": 1500,
        "echo": False,
        "stream": True,
    }
    response_text = ""
    # The server streams server-sent events: lines of the form
    # "data: {json}", keep-alive comments starting with ": ping",
    # and finally "data: [DONE]"
    with requests.post(url, json=body, stream=True) as stream:
        for line in stream.iter_lines():
            if not line:
                continue
            line = line.decode("utf-8")
            if line.startswith(": ping"):
                continue  # skip keep-alive pings
            if not line.startswith("data: "):
                continue
            data = line[len("data: "):]
            if data.strip() == "[DONE]":
                break
            try:
                part = json.loads(data)["choices"][0]["text"]
            except Exception as e:
                print("Exception: " + str(e))
                continue
            print(part, end="", flush=True)
            response_text += part
    return response_text


# Gradio interface with the specified theme
# (title corrected to match the model actually downloaded above)
gr_interface = gr.ChatInterface(
    fn=response,
    title="Mistral-7B-Instruct-v0.2-GGUF Chatbot",
    theme="ParityError/Anime",
)

# Launch the Gradio interface
gr_interface.queue().launch(share=True)