Spaces:

MudassirFayaz
/

testing

Runtime error

App Files Files Community

testing / app.py

MudassirFayaz

Update app.py

4431147 verified 5 months ago

raw

history blame

5.55 kB

	import os
	from threading import Thread
	from typing import Iterator, List, Tuple

	import torch
	from fastapi import FastAPI, HTTPException
	from pydantic import BaseModel
	from transformers import AutoTokenizer, AutoModelForCausalLM
	from peft import PeftModel
	import gradio as gr
	from gradio import Blocks
	from transformers import TextIteratorStreamer

	# Hub identifiers: the base chat checkpoint and the fine-tuned PEFT adapter.
	_BASE_CHECKPOINT = 'meta-llama/Llama-2-7b-chat-hf'
	_ADAPTER_CHECKPOINT = 'FinGPT/fingpt-forecaster_dow30_llama2-7b_lora'

	# The base weights are loaded in float16; the tokenizer comes from the same
	# checkpoint so its vocabulary and chat template match the model.
	_base_load_options = dict(
	    trust_remote_code=True,
	    device_map="auto",
	    torch_dtype=torch.float16,
	)
	base_model = AutoModelForCausalLM.from_pretrained(_BASE_CHECKPOINT, **_base_load_options)
	tokenizer = AutoTokenizer.from_pretrained(_BASE_CHECKPOINT)

	# Wrap the base model with the fine-tuned adapter and put it in eval mode
	# (this app only runs inference).
	model = PeftModel.from_pretrained(base_model, _ADAPTER_CHECKPOINT).eval()

	# Generation limits.
	# Upper bound and initial value of the "Max new tokens" slider in the UI.
	MAX_MAX_NEW_TOKENS = 2048
	DEFAULT_MAX_NEW_TOKENS = 1024
	# Prompts longer than this many tokens are truncated before generation;
	# overridable through the environment variable of the same name.
	MAX_INPUT_TOKEN_LENGTH = int(os.environ.get("MAX_INPUT_TOKEN_LENGTH", "4096"))

	# FastAPI setup: the application object on which the POST /chat/ route is registered.
	# NOTE(review): nothing in this file passes `app` to a server or mounts it
	# into the Gradio demo; the __main__ guard only launches `demo`. Confirm how
	# this API is meant to be served.
	app = FastAPI()

	# Request body accepted by the POST /chat/ endpoint. Only `message` is
	# required; every other field falls back to the default shown here.
	class ChatRequest(BaseModel):
	    # Conversation content.
	    message: str  # the new user turn
	    chat_history: List[Tuple[str, str]] = []  # earlier (user, assistant) turns, oldest first
	    system_prompt: str = ""  # an empty string means no system message is sent

	    # Generation settings, forwarded unchanged to generate_response().
	    max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS
	    temperature: float = 0.6
	    top_p: float = 0.9
	    top_k: int = 50
	    repetition_penalty: float = 1.2

	@app.post("/chat/")
	async def chat(request: ChatRequest):
	    """Generate one assistant reply for the conversation in ``request``.

	    Args:
	        request: Parsed request body carrying the new message, the prior
	            turns, the optional system prompt and the generation settings.

	    Returns:
	        A dict of the form ``{"response": <generated text>}``.

	    Raises:
	        HTTPException: With status 500 when generation fails for any reason.
	            The original exception is attached as ``__cause__``.
	    """
	    try:
	        reply = await generate_response(
	            request.message,
	            request.chat_history,
	            request.system_prompt,
	            request.max_new_tokens,
	            request.temperature,
	            request.top_p,
	            request.top_k,
	            request.repetition_penalty,
	        )
	    except Exception as exc:
	        # Some exceptions carry no message (e.g. ``queue.Empty`` when the
	        # streamer times out), which would otherwise yield an empty detail.
	        detail = str(exc) or type(exc).__name__
	        raise HTTPException(status_code=500, detail=detail) from exc
	    return {"response": reply}

	async def generate_response(
	    message: str,
	    chat_history: List[Tuple[str, str]],
	    system_prompt: str,
	    max_new_tokens: int = 1024,
	    temperature: float = 0.6,
	    top_p: float = 0.9,
	    top_k: int = 50,
	    repetition_penalty: float = 1.2,
	) -> str:
	    """Run one sampled generation and return the complete reply text.

	    The conversation is rendered with the tokenizer's chat template. If the
	    prompt exceeds ``MAX_INPUT_TOKEN_LENGTH`` tokens, only the most recent
	    tokens are kept. The blocking tokenisation and generation run in the
	    event loop's default executor so the server stays responsive while the
	    model is working.

	    Args:
	        message: The new user turn.
	        chat_history: Earlier ``(user, assistant)`` turns, oldest first.
	        system_prompt: Optional system message; skipped when empty.
	        max_new_tokens: Requested generation length, capped at
	            ``MAX_MAX_NEW_TOKENS`` (the same ceiling the UI slider uses).
	        temperature: Sampling temperature.
	        top_p: Nucleus-sampling probability mass.
	        top_k: Number of highest-probability tokens considered per step.
	        repetition_penalty: Penalty applied to already generated tokens.

	    Returns:
	        The generated text, without the prompt and without special tokens.

	    Raises:
	        queue.Empty: If the streamer receives no text for 10 seconds.
	    """
	    import asyncio  # local import: keeps this block self-contained

	    conversation = []
	    if system_prompt:
	        conversation.append({"role": "system", "content": system_prompt})
	    for user, assistant in chat_history:
	        conversation.append({"role": "user", "content": user})
	        conversation.append({"role": "assistant", "content": assistant})
	    conversation.append({"role": "user", "content": message})

	    # API callers are not limited by the slider, so apply its ceiling here too.
	    max_new_tokens = min(int(max_new_tokens), MAX_MAX_NEW_TOKENS)

	    def _generate_blocking() -> str:
	        """Tokenise, generate and collect the streamed text (blocking)."""
	        input_ids = tokenizer.apply_chat_template(conversation, return_tensors="pt")
	        if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
	            # Keep the tail of the prompt, i.e. the most recent context.
	            input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
	        input_ids = input_ids.to(model.device)

	        streamer = TextIteratorStreamer(
	            tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True
	        )
	        generate_kwargs = {
	            "input_ids": input_ids,
	            "streamer": streamer,
	            "max_new_tokens": max_new_tokens,
	            "do_sample": True,
	            "top_p": top_p,
	            "top_k": top_k,
	            "temperature": temperature,
	            "num_beams": 1,
	            "repetition_penalty": repetition_penalty,
	        }
	        # generate() feeds the streamer from its own thread while this
	        # function drains it.
	        worker = Thread(target=model.generate, kwargs=generate_kwargs)
	        worker.start()
	        text = "".join(streamer)
	        # The stream has ended, so generate() is finishing; wait for it so
	        # no thread is left dangling. On a streamer timeout we deliberately
	        # do not wait, so the error surfaces immediately.
	        worker.join()
	        return text

	    # Draining the streamer blocks, so it must not run on the event loop.
	    loop = asyncio.get_running_loop()
	    return await loop.run_in_executor(None, _generate_blocking)

	# Gradio setup
	async def generate(
	    message: str,
	    chat_history: List[Tuple[str, str]],
	    system_prompt: str,
	    max_new_tokens: int = 1024,
	    temperature: float = 0.6,
	    top_p: float = 0.9,
	    top_k: int = 50,
	    repetition_penalty: float = 1.2,
	) -> str:
	    """Chat callback for the Gradio interface.

	    ``generate_response`` is a coroutine function, so this callback must be
	    ``async`` and await it. A plain function would hand Gradio an un-awaited
	    coroutine object instead of the reply text.

	    Args:
	        message: The new user turn typed into the chat box.
	        chat_history: Earlier ``(user, assistant)`` turns, oldest first.
	        system_prompt: Value of the "System prompt" textbox.
	        max_new_tokens: Value of the "Max new tokens" slider.
	        temperature: Value of the "Temperature" slider.
	        top_p: Value of the "Top-p (nucleus sampling)" slider.
	        top_k: Value of the "Top-k" slider.
	        repetition_penalty: Value of the "Repetition penalty" slider.

	    Returns:
	        The complete generated reply (not streamed incrementally).
	    """
	    return await generate_response(
	        message,
	        chat_history,
	        system_prompt,
	        max_new_tokens,
	        temperature,
	        top_p,
	        top_k,
	        repetition_penalty,
	    )

	# Extra controls for the chat UI, listed in the same order as the
	# parameters that follow (message, chat_history) in generate().
	_generation_controls = [
	    gr.Textbox(label="System prompt", lines=6),
	    gr.Slider(
	        label="Max new tokens",
	        minimum=1,
	        maximum=MAX_MAX_NEW_TOKENS,
	        step=1,
	        value=DEFAULT_MAX_NEW_TOKENS,
	    ),
	    gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6),
	    gr.Slider(
	        label="Top-p (nucleus sampling)",
	        minimum=0.05,
	        maximum=1.0,
	        step=0.05,
	        value=0.9,
	    ),
	    gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50),
	    gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2),
	]

	# Sample prompts offered in the UI; each becomes a one-element example row.
	_example_prompts = [
	    "Hello there! How are you doing?",
	    "Can you explain briefly to me what is the Python programming language?",
	    "Explain the plot of Cinderella in a sentence.",
	    "How many hours does it take a man to eat a Helicopter?",
	    "Write a 100-word article on 'Benefits of Open-Source in AI research'",
	]

	chat_interface = gr.ChatInterface(
	    fn=generate,
	    additional_inputs=_generation_controls,
	    stop_btn=None,
	    examples=[[prompt] for prompt in _example_prompts],
	)

	# Page layout: title, short description, the chat interface, then a
	# license footer.
	# NOTE(review): the page text describes plain Llama-2 7B Chat, while the
	# model loaded in this file has the FinGPT forecaster adapter applied.
	# Confirm the wording is still intended.
	with gr.Blocks(css="style.css") as demo:
	    gr.Markdown("# Llama-2 7B Chat")
	    gr.Markdown("""
	This Space demonstrates the Llama-2 7B Chat model by Meta, fine-tuned for chat instructions.
	Feel free to chat with the model here or use the API to integrate it into your applications.
	""")
	    # The interface was built outside this context, so it has to be
	    # rendered explicitly at this position in the page.
	    chat_interface.render()
	    gr.Markdown("---")
	    gr.Markdown(
	        "This demo is governed by the original "
	        "[license](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf/blob/main/LICENSE.txt)."
	    )

	if __name__ == "__main__":
	    # Enable request queuing with max_size=20, then start the Gradio server.
	    demo.queue(max_size=20)
	    demo.launch()