import os # to check if file exists
import sys # to flush stdout
import markdown # to render answer
import gradio as gr
#import transformers
#from transformers import pipeline
from llama_cpp import Llama
from huggingface_hub import hf_hub_download

model_repo = "TheBloke/Nous-Hermes-13B-GGML"
model_filename = "nous-hermes-13b.ggmlv3.q4_K_S.bin"
#model="TheBloke/Nous-Hermes-13B-GGML"
#model="https://huggingface.co/TheBloke/Nous-Hermes-13B-GGML/resolve/main/nous-hermes-13b.ggmlv3.q4_K_S.bin"

def download_model():
    # See https://github.com/OpenAccess-AI-Collective/ggml-webui/blob/main/tabbed.py
    # Fast path: reuse the snapshot if a previous run already downloaded it.
    file_path = "/home/user/.cache/huggingface/hub/models--TheBloke--Nous-Hermes-13B-GGML/snapshots/f1a48f90a07550e1ba30e347b2be69d4fa5e393b/nous-hermes-13b.ggmlv3.q4_K_S.bin"
    if os.path.exists(file_path):
        return file_path
    print("Downloading model...")
    sys.stdout.flush()
    file = hf_hub_download(repo_id=model_repo, filename=model_filename)
    print("Downloaded " + file)
    return file
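
# Note: hf_hub_download() already caches files in the Hugging Face cache dir,
# so the hardcoded snapshot path above only saves the hub round-trip on warm
# starts. A minimal sketch relying on the built-in cache alone (assumption:
# network access is available on first run):
#
# def download_model():
#     return hf_hub_download(repo_id=model_repo, filename=model_filename)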

def question_answer(context, question, max_tokens):
    mfile = download_model()
    # Structure the prompt to make it easier for the model: fence the question
    # in triple quotes and ask for a markdown-formatted answer.
    question1 = "\"\"\"\n" + question + "\n\"\"\"\n"
    text = context + "\n\nQuestion: " + question1 + "\nPlease use markdown formatting for answer. \nAnswer:\n"
    llm = Llama(model_path=mfile)
    output = llm(text, max_tokens=max_tokens, stop=["### Response"], echo=True)
    print(output)  # llama-cpp-python returns an OpenAI-style dict (sample below)
    # echo=True echoes the prompt back, so strip it to leave only the answer
    answer = output['choices'][0]['text']
    answer = answer.replace(text, "", 1)
    # render the markdown and return the HTML alongside the question
    html_answer = markdown.markdown(answer)
    return question, html_answer
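
# Example direct call, bypassing the Gradio UI (a minimal sketch with
# hypothetical inputs; not part of the app flow):
#
# q, html = question_answer(
#     context="Paris is the capital of France.",
#     question="What is the capital of France?",
#     max_tokens=64,
# )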

'''
Output is of the form:

{
  "id": "cmpl-xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx",
  "object": "text_completion",
  "created": 1679561337,
  "model": "./models/7B/ggml-model.bin",
  "choices": [
    {
      "text": "Q: Name the planets in the solar system? A: Mercury, Venus, Earth, Mars, Jupiter, Saturn, Uranus, Neptune and Pluto.",
      "index": 0,
      "logprobs": None,
      "finish_reason": "stop"
    }
  ],
  "usage": {
    "prompt_tokens": 14,
    "completion_tokens": 28,
    "total_tokens": 42
  }
}
'''
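
# For reference, pulling fields out of a completion dict shaped like the sample
# above (hypothetical `output` variable; mirrors what question_answer() does):
#
# completion = output["choices"][0]["text"]
# reason = output["choices"][0]["finish_reason"]  # "stop" or "length"
# total = output["usage"]["total_tokens"]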
# old transformers code
#generator = pipeline(model=model, device_map="auto")
#return generator(text)

app = gr.Interface(
    fn=question_answer,
    inputs=["text", "text", gr.Slider(33, 2333)],
    outputs=["textbox", "html"],
)
app.launch()