from ctransformers import AutoModelForCausalLM, AutoTokenizer
from transformers import pipeline
import streamlit as st
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate

# Simple inference example
# output = llm(
#     "<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant",  # Prompt
#     max_tokens=512,  # Generate up to 512 tokens
#     stop=[""],  # Example stop token - not necessarily correct for this specific model! Please check before using.
#     echo=True  # Whether to echo the prompt
# )

# ChatML-style prompt format (only referenced by commented-out code below).
prompt_format = "<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant"

# Prompt template used by the LangChain chain below.
template = """Question: {question}

Answer:"""


def get_llm_response(repo, filename, model_type, gpu_layers, prompt):
    """Load a GGUF/GGML model with ctransformers and answer `prompt` via an LLMChain."""
    print("Loading model")
    llm = AutoModelForCausalLM.from_pretrained(
        repo, model_file=filename, model_type=model_type, gpu_layers=gpu_layers
    )
    print("Model loaded")
    # llm_prompt = prompt_format.format(system_message=system_prompt, prompt=prompt)
    print(f"LLM prompt: {prompt}")
    # Build the template under its own name so the user's question in `prompt`
    # is not overwritten before it is passed to the chain.
    prompt_template = PromptTemplate(template=template, input_variables=["question"])
    # NOTE: LLMChain expects a LangChain-compatible LLM object; the raw ctransformers
    # model may be rejected here. An alternative using LangChain's CTransformers
    # wrapper is sketched below.
    llm_chain = LLMChain(prompt=prompt_template, llm=llm)
    response = llm_chain.run(prompt)
    return response
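
# Minimal alternative sketch, assuming LangChain's CTransformers integration is
# installed alongside ctransformers: LLMChain validates that `llm` is a LangChain
# LLM object, so here the quantized weights are loaded through that wrapper
# instead of ctransformers' AutoModelForCausalLM. The function name below is
# illustrative and not taken from the original code.
from langchain.llms import CTransformers


def get_llm_response_via_wrapper(repo, filename, model_type, gpu_layers, question):
    # Load the model through LangChain's ctransformers wrapper so it satisfies
    # LLMChain's type checks; gpu_layers is forwarded via the `config` dict.
    llm = CTransformers(
        model=repo,
        model_file=filename,
        model_type=model_type,
        config={"gpu_layers": gpu_layers},
    )
    prompt_template = PromptTemplate(template=template, input_variables=["question"])
    llm_chain = LLMChain(prompt=prompt_template, llm=llm)
    # Run the chain on the user's question and return the generated answer.
    return llm_chain.run(question)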