import subprocess

import requests
import gradio as gr
from llama_cpp import Llama

# Download the quantized GGUF model from the Hugging Face Hub
url = "https://huggingface.co/TheBloke/WizardLM-13B-V1.2-GGUF/resolve/main/wizardlm-13b-v1.2.Q4_0.gguf"
resp = requests.get(url)
with open("./model.gguf", mode="wb") as file:
    file.write(resp.content)

# Load the model with llama-cpp-python
llm = Llama(model_path="./model.gguf")

# Chat handler: Gradio passes the latest message and the conversation history
def response(input_text, history):
    output = llm(f"Q: {input_text} A:", max_tokens=256, stop=["Q:", "\n"], echo=True)
    return output['choices'][0]['text']

# Launch the chat UI; share=True creates a temporary public link.
# Note: launch() typically blocks the main thread in a script, so the lines
# below may not run until the UI is shut down.
gr.ChatInterface(response).queue().launch(share=True)  # False, server_name="0.0.0.0", server_port=7864

# Start the llama-cpp-python server to expose the model over HTTP
command = ["python3", "-m", "llama_cpp.server", "--model", "./model.gguf", "--host", "0.0.0.0", "--port", "2600"]
subprocess.Popen(command)
print("Hello world")
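Once the llama_cpp.server process is running, the model can also be queried over HTTP. The sketch below is a minimal example of such a request; it assumes the server exposes its usual OpenAI-compatible /v1/completions route on the host and port chosen above (the prompt text and timeout are illustrative, not from the original script).

# Minimal sketch: query the llama_cpp.server instance started above.
# Assumes an OpenAI-compatible /v1/completions endpoint at 0.0.0.0:2600.
import requests

payload = {
    "prompt": "Q: What is a GGUF file? A:",  # hypothetical example prompt
    "max_tokens": 256,
    "stop": ["Q:", "\n"],
}
resp = requests.post("http://0.0.0.0:2600/v1/completions", json=payload, timeout=120)
print(resp.json()["choices"][0]["text"])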