from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import streamlit as st


def gptq_model_options():
    # GPTQ-quantized checkpoints offered in the model selector.
    return [
        "TheBloke/Llama-2-7B-Chat-GPTQ",
        "TheBloke/Llama-2-13B-chat-GPTQ",
        "TheBloke/meditron-7B-GPTQ",
        "TheBloke/meditron-70B-GPTQ",
    ]


# Module-level cache so the checkpoint is only reloaded when the selection changes.
loaded_model = None
loaded_model_name = ""


def get_llm_response(model_name_or_path, temperature, do_sample, top_p, top_k,
                     max_new_tokens, repetition_penalty, formatted_prompt):
    global loaded_model
    global loaded_model_name

    # Compare against the cached model *name* rather than the model object,
    # so a different checkpoint is loaded only when the selection changes.
    if loaded_model is None or loaded_model_name != model_name_or_path:
        loaded_model = AutoModelForCausalLM.from_pretrained(
            model_name_or_path,
            device_map="auto",
            trust_remote_code=False,
            revision="main",
        )
        loaded_model_name = model_name_or_path

    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)

    print("Formatted prompt:")
    print(formatted_prompt)
    st.session_state["llm_messages"].append(formatted_prompt)

    # Alternative: call model.generate() directly instead of using a pipeline.
    # input_ids = tokenizer(formatted_prompt, return_tensors="pt").input_ids.cuda()
    # output = loaded_model.generate(inputs=input_ids, temperature=temperature, do_sample=do_sample,
    #                                top_p=top_p, top_k=top_k, max_new_tokens=max_new_tokens)
    # print(tokenizer.decode(output[0], skip_special_tokens=True))

    print("*** Pipeline:")
    pipe = pipeline(
        "text-generation",
        model=loaded_model,
        tokenizer=tokenizer,
        max_new_tokens=max_new_tokens,
        do_sample=do_sample,
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        repetition_penalty=repetition_penalty,
        return_full_text=False,
    )

    pipe_response = pipe(formatted_prompt)
    st.session_state["llm_messages"].append(pipe_response)
    print(pipe_response)
    return pipe_response[0]["generated_text"]
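

# --- Usage sketch (assumption, not part of the original module) -------------
# A minimal example of how a Streamlit page might wire a model picker and a
# prompt box to get_llm_response(). The widget labels, the sampling defaults,
# and the "llm_messages" initialization are hypothetical; the caller only needs
# to ensure st.session_state["llm_messages"] exists before the call.
if __name__ == "__main__":
    if "llm_messages" not in st.session_state:
        st.session_state["llm_messages"] = []

    selected_model = st.selectbox("Model", gptq_model_options())
    user_prompt = st.text_area("Prompt")

    if st.button("Generate") and user_prompt:
        answer = get_llm_response(
            selected_model,
            temperature=0.7,
            do_sample=True,
            top_p=0.95,
            top_k=40,
            max_new_tokens=512,
            repetition_penalty=1.1,
            formatted_prompt=user_prompt,
        )
        st.write(answer)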