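# Simple question-answering demo built with Gradio: feeds a user-supplied
# context and question to the Nous-Hermes-13B GGML model (run locally via
# llama-cpp-python) and renders the model's markdown answer as HTML.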
import os   # to check if file exists
import sys  # to flush stdout
import markdown # to render answer

import gradio as gr
#import transformers
#from transformers import pipeline
from llama_cpp import Llama
from huggingface_hub import hf_hub_download

model_repo="TheBloke/Nous-Hermes-13B-GGML"
model_filename="nous-hermes-13b.ggmlv3.q4_K_S.bin"
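# q4_K_S is a 4-bit "k-quant small" GGML quantization, trading a little
# quality for a much smaller memory footprint than the fp16 weights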

#model="TheBloke/Nous-Hermes-13B-GGML"
#model="https://huggingface.co/TheBloke/Nous-Hermes-13B-GGML/resolve/main/nous-hermes-13b.ggmlv3.q4_K_S.bin"

def download_model():
    # See https://github.com/OpenAccess-AI-Collective/ggml-webui/blob/main/tabbed.py
    file_path="/home/user/.cache/huggingface/hub/models--TheBloke--Nous-Hermes-13B-GGML/snapshots/f1a48f90a07550e1ba30e347b2be69d4fa5e393b/nous-hermes-13b.ggmlv3.q4_K_S.bin"
    if os.path.exists(file_path):
        return file_path
    else:
        print("Downloading model...")
        sys.stdout.flush()
        file = hf_hub_download(
                repo_id=model_repo, filename=model_filename
        )
        print("Downloaded " + file)
        return file
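# hf_hub_download also caches files under ~/.cache/huggingface/hub, but checking
# the snapshot path directly (as above) avoids a network round-trip to the Hub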

def question_answer(context, question, max_tokens):
    mfile=download_model()
    max_tokens=int(max_tokens)  # gr.Slider can deliver a float; llama_cpp expects an int
    # structure the prompt to make it easier for the ai
    question1="\"\"\"\n" + question + "\n\"\"\"\n"
    text=context + "\n\nQuestion: " + question1 + "\nPlease use markdown formatting for answer. \nAnswer:\n"
    llm = Llama(model_path=mfile)
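    # note: Llama() re-loads the whole model from disk on every request; for a
    # long-running app the instance could be cached in a module-level variable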
    output = llm(text, max_tokens=max_tokens, stop=["### Response"], echo=True)
    print(output)

    '''
    Output is of the form:
    {
      "id": "cmpl-xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx",
      "object": "text_completion",
      "created": 1679561337,
      "model": "./models/7B/ggml-model.bin",
      "choices": [
        {
          "text": "Q: Name the planets in the solar system? A: Mercury, Venus, Earth, Mars, Jupiter, Saturn, Uranus, Neptune and Pluto.",
          "index": 0,
          "logprobs": None,
          "finish_reason": "stop"
        }
      ],
      "usage": {
        "prompt_tokens": 14,
        "completion_tokens": 28,
        "total_tokens": 42
      }
    }
    '''

    # remove the context and leave only the answer
    answer=output['choices'][0]['text']
    answer = answer.replace(text, "", 1)

    # render the markdown and return the html and question
    html_answer = markdown.markdown(answer)
    return question, html_answer

    # old transformers code
    #generator = pipeline(model=model, device_map="auto")
    #return generator(text)


app=gr.Interface(fn=question_answer, inputs=["text", "text", gr.Slider(33, 2333, label="Max tokens")], outputs=["textbox", "html"])
app.launch()
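# launch() starts the Gradio server (by default on http://127.0.0.1:7860 when
# run locally; on Hugging Face Spaces the hosting is handled by the platform)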