"""FastAPI endpoint and Gradio chat UI for a PEFT-finetuned Llama-2 chat model.

The module loads the base model, tokenizer and LoRA adapter at import time,
exposes a ``POST /chat/`` endpoint through ``app`` and builds a Gradio
``demo`` that is launched when the file is run as a script.
"""

import asyncio
import os
from threading import Thread
from typing import Iterator, List, Tuple

import torch
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
import gradio as gr
from gradio import Blocks
from transformers import TextIteratorStreamer

# Load the base model and tokenizer
base_model = AutoModelForCausalLM.from_pretrained(
    'meta-llama/Llama-2-7b-chat-hf',
    trust_remote_code=True,
    device_map="auto",
    torch_dtype=torch.float16,
)
tokenizer = AutoTokenizer.from_pretrained('meta-llama/Llama-2-7b-chat-hf')

# Load the finetuned model
model = PeftModel.from_pretrained(base_model, 'FinGPT/fingpt-forecaster_dow30_llama2-7b_lora')
model = model.eval()

# Define constants
MAX_MAX_NEW_TOKENS = 2048
DEFAULT_MAX_NEW_TOKENS = 1024
MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))

# FastAPI setup
app = FastAPI()


class ChatRequest(BaseModel):
    """Request body accepted by ``POST /chat/``."""

    # Newest user message to answer.
    message: str
    # Earlier (user, assistant) turns, oldest first.
    chat_history: List[Tuple[str, str]] = []
    # Optional system prompt; an empty string means "no system message".
    system_prompt: str = ""
    # Sampling parameters forwarded to ``model.generate``.
    max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS
    temperature: float = 0.6
    top_p: float = 0.9
    top_k: int = 50
    repetition_penalty: float = 1.2


@app.post("/chat/")
async def chat(request: ChatRequest):
    """Generate one complete reply for ``request`` and return it as JSON.

    Returns:
        ``{"response": <generated text>}``.

    Raises:
        HTTPException: status 500 carrying the error text if generation fails.
    """
    try:
        response = await generate_response(
            request.message,
            request.chat_history,
            request.system_prompt,
            request.max_new_tokens,
            request.temperature,
            request.top_p,
            request.top_k,
            request.repetition_penalty,
        )
    except Exception as exc:
        # Top-level API boundary: report the failure to the client and keep
        # the original exception chained for server-side tracebacks.
        raise HTTPException(status_code=500, detail=str(exc)) from exc
    return {"response": response}


def _build_input_ids(
    message: str,
    chat_history: List[Tuple[str, str]],
    system_prompt: str,
) -> torch.Tensor:
    """Render the conversation with the chat template and return its token ids."""
    conversation = []
    if system_prompt:
        conversation.append({"role": "system", "content": system_prompt})
    for user, assistant in chat_history:
        conversation.append({"role": "user", "content": user})
        conversation.append({"role": "assistant", "content": assistant})
    conversation.append({"role": "user", "content": message})

    input_ids = tokenizer.apply_chat_template(conversation, return_tensors="pt")
    if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
        # Keep only the most recent tokens when the prompt is too long.
        input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
    return input_ids.to(model.device)


def _stream_response(
    message: str,
    chat_history: List[Tuple[str, str]],
    system_prompt: str,
    max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS,
    temperature: float = 0.6,
    top_p: float = 0.9,
    top_k: int = 50,
    repetition_penalty: float = 1.2,
) -> Iterator[str]:
    """Yield decoded text chunks as the model produces them.

    This is a blocking generator: ``model.generate`` runs in a background
    thread and this function waits on the streamer it feeds.
    """
    # The UI slider already caps this value; enforce the same bound for
    # callers (such as the HTTP API) that bypass the UI.
    max_new_tokens = max(1, min(max_new_tokens, MAX_MAX_NEW_TOKENS))

    streamer = TextIteratorStreamer(
        tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True
    )
    generate_kwargs = {
        "input_ids": _build_input_ids(message, chat_history, system_prompt),
        "streamer": streamer,
        "max_new_tokens": max_new_tokens,
        "do_sample": True,
        "top_p": top_p,
        "top_k": top_k,
        "temperature": temperature,
        "num_beams": 1,
        "repetition_penalty": repetition_penalty,
    }
    worker = Thread(target=model.generate, kwargs=generate_kwargs)
    worker.start()
    yield from streamer


async def generate_response(
    message: str,
    chat_history: List[Tuple[str, str]],
    system_prompt: str,
    max_new_tokens: int = 1024,
    temperature: float = 0.6,
    top_p: float = 0.9,
    top_k: int = 50,
    repetition_penalty: float = 1.2,
) -> str:
    """Return the model's full reply to ``message`` as a single string.

    Args:
        message: Newest user message.
        chat_history: Earlier (user, assistant) turns, oldest first.
        system_prompt: Optional system prompt; skipped when empty.
        max_new_tokens: Upper bound on generated tokens; values outside
            ``[1, MAX_MAX_NEW_TOKENS]`` are clamped into that range.
        temperature: Sampling temperature.
        top_p: Nucleus-sampling probability mass.
        top_k: Top-k sampling cutoff.
        repetition_penalty: Penalty passed through to ``model.generate``.

    Returns:
        The concatenation of every text chunk emitted by the streamer.
    """

    def _collect() -> str:
        return "".join(
            _stream_response(
                message,
                chat_history,
                system_prompt,
                max_new_tokens,
                temperature,
                top_p,
                top_k,
                repetition_penalty,
            )
        )

    # Draining the streamer blocks until generation finishes, so do it on a
    # worker thread instead of stalling the event loop for other requests.
    return await asyncio.get_running_loop().run_in_executor(None, _collect)


# Gradio setup
def generate(
    message: str,
    chat_history: List[Tuple[str, str]],
    system_prompt: str,
    max_new_tokens: int = 1024,
    temperature: float = 0.6,
    top_p: float = 0.9,
    top_k: int = 50,
    repetition_penalty: float = 1.2,
) -> Iterator[str]:
    """Stream the reply to the Gradio chat UI.

    Yields the reply accumulated so far after every new chunk, so each
    yielded value is the complete text to display at that moment.
    """
    # NOTE: this must be a real generator. Returning the coroutine produced by
    # ``generate_response`` would hand Gradio an un-awaited coroutine object
    # instead of text.
    pieces: List[str] = []
    for chunk in _stream_response(
        message,
        chat_history,
        system_prompt,
        max_new_tokens,
        temperature,
        top_p,
        top_k,
        repetition_penalty,
    ):
        pieces.append(chunk)
        yield "".join(pieces)


chat_interface = gr.ChatInterface(
    fn=generate,
    additional_inputs=[
        gr.Textbox(label="System prompt", lines=6),
        gr.Slider(
            label="Max new tokens",
            minimum=1,
            maximum=MAX_MAX_NEW_TOKENS,
            step=1,
            value=DEFAULT_MAX_NEW_TOKENS,
        ),
        gr.Slider(
            label="Temperature",
            minimum=0.1,
            maximum=4.0,
            step=0.1,
            value=0.6,
        ),
        gr.Slider(
            label="Top-p (nucleus sampling)",
            minimum=0.05,
            maximum=1.0,
            step=0.05,
            value=0.9,
        ),
        gr.Slider(
            label="Top-k",
            minimum=1,
            maximum=1000,
            step=1,
            value=50,
        ),
        gr.Slider(
            label="Repetition penalty",
            minimum=1.0,
            maximum=2.0,
            step=0.05,
            value=1.2,
        ),
    ],
    stop_btn=None,
    examples=[
        ["Hello there! How are you doing?"],
        ["Can you explain briefly to me what is the Python programming language?"],
        ["Explain the plot of Cinderella in a sentence."],
        ["How many hours does it take a man to eat a Helicopter?"],
        ["Write a 100-word article on 'Benefits of Open-Source in AI research'"],
    ],
)

with Blocks(css="style.css") as demo:
    gr.Markdown("# Llama-2 7B Chat")
    gr.Markdown(
        " This Space demonstrates the Llama-2 7B Chat model by Meta, "
        "fine-tuned for chat instructions. Feel free to chat with the model "
        "here or use the API to integrate it into your applications. "
    )
    chat_interface.render()
    gr.Markdown("---")
    gr.Markdown("This demo is governed by the original [license](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf/blob/main/LICENSE.txt).")

if __name__ == "__main__":
    demo.queue(max_size=20).launch()