"""Minimal Gradio demo serving a GGML Llama-2 model via ctransformers."""

import gradio as gr
from ctransformers import AutoModelForCausalLM

# Model choice: the 7B chat variant. Other TheBloke GGML builds
# (7B base, 13B base, 13B chat) are drop-in replacements.
model_id = "TheBloke/Llama-2-7B-chat-GGML"

# GPU offload heuristic: 13B models get more layers offloaded than 7B.
# NOTE(review): 110/130 appear chosen to exceed the models' layer counts so
# everything is offloaded — confirm against available VRAM.
gpu_layers = 130 if '13B' in model_id else 110

# Generation settings forwarded to the model constructor.
config = {
    'max_new_tokens': 256,
    'repetition_penalty': 1.1,
    'temperature': 0.1,
    'stream': True,
}

# Load the model once at import time. For CPU-only use, pass lib='avx2'
# instead of gpu_layers.
llm = AutoModelForCausalLM.from_pretrained(
    model_id,
    model_type="llama",
    gpu_layers=gpu_layers,
    **config,
)


def predict(prompt):
    """Return the model's completion for *prompt* as a single string.

    ``stream=False`` deliberately overrides the ``'stream': True`` default in
    ``config``: the call then returns the full generated text rather than a
    token iterator, which is what gr.Interface's text output expects.
    """
    return llm(prompt, stream=False)


demo = gr.Interface(
    fn=predict,
    inputs='text',
    outputs='text',
)

if __name__ == "__main__":
    demo.launch()