Spaces:
Sleeping
Sleeping
import os | |
import requests | |
import gradio as gr | |
from dotenv import load_dotenv | |
from transformers import AutoTokenizer | |
# Load environment variables (HF_API_KEY is read below) via python-dotenv.
load_dotenv()

model_name = "tiiuae/falcon-7b-instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Derived from model_name so the tokenizer and the endpoint cannot drift apart
# when the model id is changed.
API_URL = f"https://api-inference.huggingface.co/models/{model_name}"

# Only send an Authorization header when a key is actually configured;
# formatting a missing key would otherwise send the literal "Bearer None".
_hf_api_key = os.getenv("HF_API_KEY")
headers = {"Authorization": f"Bearer {_hf_api_key}"} if _hf_api_key else {}
def format_chat_prompt(message: str, instruction: str) -> str:
    """Build the single-turn prompt text sent to the model.

    The result is a ``System:`` line carrying *instruction*, a ``User:`` line
    carrying *message*, and a trailing ``Assistant:`` cue for the model to
    continue from.

    Args:
        message: The user's question.
        instruction: The system message placed at the top of the prompt.

    Returns:
        The formatted prompt string.
    """
    return f"System:{instruction}\nUser: {message}\nAssistant:"
def query(payload, timeout=120):
    """POST *payload* as JSON to ``API_URL`` and return the decoded JSON reply.

    The HTTP status is not inspected: whatever JSON body the endpoint sends
    back (including an error payload) is returned to the caller as-is.

    Args:
        payload: JSON-serialisable request body.
        timeout: Seconds to wait for the server before giving up. Pass
            ``None`` to wait indefinitely.

    Returns:
        The parsed JSON response.

    Raises:
        requests.exceptions.RequestException: On connection failure or
            timeout.
        ValueError: If the response body is not valid JSON.
    """
    # requests applies no timeout by default, so a stalled endpoint would
    # otherwise block the caller forever.
    response = requests.post(
        API_URL, headers=headers, json=payload, timeout=timeout
    )
    return response.json()
def respond(message, instruction="A conversation between a user and an AI assistant. The assistant gives helpful and honest answers."):
    """Ask the model *message* under *instruction* and return its reply text.

    The prompt is built with ``format_chat_prompt``, trimmed to the last
    ``MAX_TOKENS`` tokens if it is too long, and sent through ``query``. The
    assistant's reply is then cut out of the generated text.

    Args:
        message: The user's question.
        instruction: The system message that prefixes the prompt.

    Returns:
        The assistant's reply with surrounding whitespace stripped.

    Raises:
        RuntimeError: If the API response is an error payload or does not
            have the expected ``[{"generated_text": ...}]`` shape.
    """
    MAX_TOKENS = 1024  # limit for the model

    prompt = format_chat_prompt(message, instruction)

    # Tokenize once; if the prompt is too long keep only its tail so the
    # trailing "Assistant:" cue survives the truncation.
    token_ids = tokenizer.encode(prompt)
    if len(token_ids) > MAX_TOKENS:
        prompt = tokenizer.decode(token_ids[-MAX_TOKENS:])

    response = query({"inputs": prompt})

    # A dict with an "error" key is treated as a failure report (presumably
    # what the API sends while the model is loading or when rate limited --
    # confirm against the Inference API docs). Surface it rather than failing
    # on the indexing below with an opaque KeyError.
    if isinstance(response, dict) and "error" in response:
        raise RuntimeError(f"Inference API error: {response['error']}")
    try:
        generated_text = response[0]["generated_text"]
    except (KeyError, IndexError, TypeError) as err:
        raise RuntimeError(
            f"Unexpected Inference API response: {response!r}"
        ) from err

    # If the generated text echoes the prompt, drop that exact prefix.
    # Splitting on the last "Assistant:" instead would return a later,
    # hallucinated turn whenever the model keeps writing the dialogue.
    if generated_text.startswith(prompt):
        completion = generated_text[len(prompt):]
    else:
        completion = generated_text.split("Assistant:")[-1]

    # Only keep the text before the first "User:".
    return completion.split("User:")[0].strip()
# Web UI: a question box plus an editable system message feed respond(), and
# the reply is shown in a single text box. Components use the top-level
# gr.Textbox class with ``value=``; the gr.inputs / gr.outputs namespaces and
# the ``default=`` keyword are deprecated.
iface = gr.Interface(
    fn=respond,
    inputs=[
        gr.Textbox(label="Your question"),
        gr.Textbox(
            label="System message",
            lines=2,
            value="A conversation between a user and an AI assistant. The assistant gives helpful and honest answers.",
        ),
    ],
    outputs=[
        gr.Textbox(label="AI's response"),
    ],
)

iface.launch()