"""Gradio chat UI for a GGUF doctor-consultation model run with llama.cpp.

Running this script downloads the quantized model file from the Hugging Face
Hub, loads it with ``llama_cpp.Llama`` and launches a ``gr.ChatInterface``
that streams the model's answer to each user question.
"""

## Imports
import copy  # not used below; kept so no existing file-level import is dropped

import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

## Download the GGUF model
model_name = "kazuma313/lora_model_dokter_consultasi_q4_k_m"
# This is the specific model file used in this example. It is a 4-bit quant;
# other quantization levels are available in the model repo if preferred.
model_file = "lora_model_dokter_consultasi_q4_k_m-unsloth.Q4_K_M.gguf"
model_path = hf_hub_download(model_name, filename=model_file)

llm = Llama(
    model_path=model_path,
    n_ctx=2048,  # Context length to use
    n_threads=4,  # Number of CPU threads to use
    verbose=False,
)

# The prompt text is sent to the model verbatim, so it is reproduced exactly
# (including the doubled "sesuai" and the trailing space): editing it would
# change the model input.
# NOTE(review): the separators between the sections are single spaces here;
# confirm whether the format the model expects uses line breaks instead.
prompt_template = (
    "<|begin_of_text|>Dibawah ini adalah percakapan antara dokter dengan "
    "pasiennya yang ingin berkonsultasi terkait kesehatan. Tuliskan jawaban "
    "yang tepat dan lengkap sesuai sesuai pertanyaan dari pasien."
    "<|end_of_text|> ### Pertanyaan: {ask} ### Jawaban: "
)


def output_inference(tanya, history):
    """Stream the model's answer to a single user question.

    Args:
        tanya: The user's question; substituted into ``prompt_template``.
        history: Previous chat turns passed in by ``gr.ChatInterface``.
            Unused: each question is answered without earlier context.

    Yields:
        The answer text accumulated so far, growing with each streamed chunk.
    """
    prompt = prompt_template.format(ask=tanya)
    chunks = llm(
        prompt,
        stop=["<|end_of_text|>", "Pertanyaan:", "Jawaban:", "###"],
        max_tokens=512,
        temperature=0.2,
        top_p=0.95,
        top_k=40,
        min_p=0.05,
        typical_p=1.0,
        repeat_penalty=1.1,
        stream=True,
    )

    answer = ""
    for chunk in chunks:
        # Each chunk is only read, so no defensive copy is needed.
        answer += chunk["choices"][0]["text"]
        # Yield the cumulative text rather than only the newest piece.
        yield answer


demo = gr.ChatInterface(
    output_inference,
    chatbot=gr.Chatbot(height=300),
    textbox=gr.Textbox(
        placeholder="Tanya saya kesehatan anda",
        container=False,
        scale=7,
    ),
    title="Konsultasi dokter",
    description="Tanya saja semua keluhan mu",
    theme="soft",
    examples=[
        "apa saja tips agar badan sehat?",
        "apa efek samping dari minum alkohol berlebihan?",
        "berapa hasil dari 10 + 5?",
    ],
    cache_examples=True,
    retry_btn=None,
    undo_btn="Delete Previous",
    clear_btn="Clear",
)
demo.launch()