# chat_llm_v3 / app.py
# Gradio chat UI for tiiuae/falcon-7b-instruct via the Hugging Face
# Inference API. Duplicated from daniloedu/chat_llm_v2.
import os
import requests
import gradio as gr
from dotenv import load_dotenv
from transformers import AutoTokenizer
load_dotenv()
model_name = "tiiuae/falcon-7b-instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
API_URL = "https://api-inference.huggingface.co/models/tiiuae/falcon-7b-instruct"
headers = {"Authorization": f"Bearer {os.getenv('HF_API_KEY')}"}
def format_chat_prompt(message, instruction):
prompt = f"System:{instruction}\nUser: {message}\nAssistant:"
return prompt
def query(payload):
    """POST *payload* to the hosted inference endpoint and return the parsed JSON.

    Args:
        payload: JSON-serializable request body, e.g. ``{"inputs": prompt}``.

    Returns:
        The decoded JSON response — a list of generations on success, or an
        ``{"error": ...}`` dict when the Inference API reports a failure
        (e.g. while the model is loading).
    """
    # A timeout keeps the Gradio worker from hanging forever if the API stalls;
    # the original call had none, which blocks indefinitely on a dead endpoint.
    response = requests.post(API_URL, headers=headers, json=payload, timeout=60)
    return response.json()
def respond(message, instruction="A conversation between a user and an AI assistant. The assistant gives helpful and honest answers."):
    """Generate an assistant reply for *message* via the hosted inference API.

    Args:
        message: The user's question.
        instruction: System prompt placed at the top of the conversation.

    Returns:
        The assistant's reply text, or a human-readable error string when the
        Inference API reports a failure.
    """
    MAX_TOKENS = 1024  # prompt-length budget for the model
    prompt = format_chat_prompt(message, instruction)
    # Encode once and reuse the ids — the original encoded the prompt twice
    # (once to count, once to truncate).
    token_ids = tokenizer.encode(prompt)
    if len(token_ids) > MAX_TOKENS:
        # Keep the most recent tokens so the trailing "Assistant:" cue survives
        # (NOTE(review): this can drop the "System:" prefix — confirm acceptable).
        prompt = tokenizer.decode(token_ids[-MAX_TOKENS:])
    response = query({"inputs": prompt})
    # On failure the Inference API returns an {"error": ...} dict, not a list;
    # the original code crashed with a TypeError/KeyError in that case.
    if isinstance(response, dict) and "error" in response:
        return f"Error: {response['error']}"
    generated_text = response[0]['generated_text']
    assistant_message = generated_text.split("Assistant:")[-1]
    # Keep only the text before the model starts inventing the next "User:" turn.
    assistant_message = assistant_message.split("User:")[0].strip()
    return assistant_message
# NOTE(review): gr.inputs.Textbox / gr.outputs.Textbox and the `default=` kwarg
# were deprecated in Gradio 3.0 and removed in later releases; the top-level
# gr.Textbox component with `value=` is the supported equivalent.
iface = gr.Interface(
    fn=respond,
    inputs=[
        gr.Textbox(label="Your question"),
        gr.Textbox(
            label="System message",
            lines=2,
            value="A conversation between a user and an AI assistant. The assistant gives helpful and honest answers.",
        ),
    ],
    outputs=[
        gr.Textbox(label="AI's response"),
    ],
)
iface.launch()