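"""Gradio chat demo for Alpaca-LoRA: LLaMA-7B (decapoda-research/llama-7b-hf) with the
tloen/alpaca-lora-7b adapter, served through a simple chatbot UI on CPU."""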
from transformers import LlamaTokenizer, LlamaForCausalLM, GenerationConfig
from peft import PeftModel
import torch
import transformers
import gradio as gr
import time
import logging
logging.basicConfig(level=logging.INFO)
# Dump logs to a file
logging.getLogger().addHandler(logging.FileHandler("app_chat.log"))
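
# Base LLaMA-7B checkpoint and the Alpaca LoRA adapter applied on top of it.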
MODEL = "decapoda-research/llama-7b-hf"
LORA_WEIGHTS = "tloen/alpaca-lora-7b"
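
# Inference runs on CPU in this Space, so generation is slow but requires no GPU.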
device = "cpu"
print(f"Model device = {device}", flush=True)
def load_model():
logging.info("Loading model...")
tokenizer = LlamaTokenizer.from_pretrained(MODEL)
model = LlamaForCausalLM.from_pretrained(MODEL, device_map={"": device}, low_cpu_mem_usage=True)
model = PeftModel.from_pretrained(model, LORA_WEIGHTS, device_map={"": device}, torch_dtype=torch.float16)
model.eval()
logging.info("Model loaded.")
return model, tokenizer
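
# Prompt template for the chat demo: a short instruction header followed by the user's turn;
# the model's reply is everything generated after "### Response:".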
def generate_prompt(input):
return f""" Below A dialog, where User interacts with you - the AI.
### Instruction: AI is helpful, kind, obedient, honest, and knows its own limits.
### User: {input}
### Response:
"""
def eval_prompt(
    model,
    tokenizer,
    input: str,
    temperature=0.7,
    top_p=0.75,
    top_k=40,
    num_beams=1,
    max_new_tokens=128,
    **kwargs,
):
    prompt = generate_prompt(input)
    inputs = tokenizer(prompt, return_tensors="pt")
    input_ids = inputs["input_ids"]
    # Generation settings; repetition_penalty > 1 discourages repeated text.
    generation_config = GenerationConfig(
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        num_beams=num_beams,
        repetition_penalty=1.17,
        **kwargs,
    )
    # with torch.inference_mode():
    with torch.no_grad():
        generation_output = model.generate(
            input_ids=input_ids,
            generation_config=generation_config,
            return_dict_in_generate=True,
            output_scores=True,
            max_new_tokens=max_new_tokens,
        )
    s = generation_output.sequences[0]
    response = tokenizer.decode(s)
    # Keep only the text after the final "### Response:" marker.
    bot_response = response.split("### Response:")[-1].strip()
    print(f"Bot response: {bot_response}")
    return bot_response
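
# Wire up the Gradio Blocks UI: a Chatbot pane, a textbox for user input, and a Clear button.
# Submitting the textbox first appends the user turn, then fills in the bot's reply in the history.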
def run_app(model, tokenizer):
logging.info("Starting chat app...")
with gr.Blocks(theme=gr.themes.Soft(), analytics_enabled=True) as chat:
chatbot = gr.Chatbot(label = "Alpaca Demo")
msg = gr.Textbox(show_label = False, placeholder = "Enter your text here")
clear = gr.Button("Clear")
def user(user_msg, history):
logging.info("User input received.")
return "", history + [[user_msg, None]]
def bot(history):
logging.info("Processing user input for Alpaca response...")
last_input = history[-1][0]
logging.info(f"User input = {last_input}")
tick = time.time()
bot_response = eval_prompt(model, tokenizer, last_input)
logging.info(f"Inference time = {time.time() - tick} seconds")
history[-1][1] = bot_response
logging.info("Response generated and added to history.\n")
return history
msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(
bot, chatbot, chatbot
)
clear.click(lambda: None, None, chatbot, queue=False)
chat.queue()
chat.launch(share=True)
if __name__ == "__main__":
    model, tokenizer = load_model()
    # Run the actual gradio app
    run_app(model, tokenizer)