"""Gradio front-end that forwards a question to a hosted Falcon-7B-Instruct model.

The prompt is assembled from a system instruction and the user's message,
trimmed to a token budget, sent to the Hugging Face Inference API, and the
assistant's first reply is extracted from the returned text.
"""

import os
from typing import Any

import gradio as gr
import requests
from dotenv import load_dotenv
from transformers import AutoTokenizer

load_dotenv()

model_name = "tiiuae/falcon-7b-instruct"
# The tokenizer is only used locally to measure and trim the prompt.
tokenizer = AutoTokenizer.from_pretrained(model_name)

API_URL = "https://api-inference.huggingface.co/models/tiiuae/falcon-7b-instruct"

# Only send an Authorization header when a key is configured; formatting a
# missing key would otherwise produce the literal credential "Bearer None".
_api_key = os.getenv("HF_API_KEY")
headers = {"Authorization": f"Bearer {_api_key}"} if _api_key else {}

DEFAULT_INSTRUCTION = (
    "A conversation between a user and an AI assistant. "
    "The assistant gives helpful and honest answers."
)
MAX_PROMPT_TOKENS = 1024  # prompt token budget
REQUEST_TIMEOUT_SECONDS = 120  # avoid hanging the UI on a stalled request

_ASSISTANT_TAG = "Assistant:"
_USER_TAG = "User:"


def format_chat_prompt(message: str, instruction: str) -> str:
    """Return the System/User/Assistant prompt for a single-turn exchange."""
    return f"System:{instruction}\nUser: {message}\nAssistant:"


def query(payload: dict) -> Any:
    """POST *payload* as JSON to the inference endpoint and return the parsed body.

    The body is returned as-is (no status check) so that callers can inspect
    error payloads. Raises ``requests.exceptions.Timeout`` if the request
    exceeds ``REQUEST_TIMEOUT_SECONDS``.
    """
    response = requests.post(
        API_URL,
        headers=headers,
        json=payload,
        timeout=REQUEST_TIMEOUT_SECONDS,
    )
    return response.json()


def _extract_reply(prompt: str, generated_text: str) -> str:
    """Return the assistant's first reply contained in *generated_text*.

    Taking the text after the *last* "Assistant:" (as a naive split does)
    returns the wrong turn whenever the model goes on to invent further
    "User:"/"Assistant:" exchanges, so the reply is located relative to the
    prompt instead.
    """
    if generated_text.startswith(prompt):
        # Usual case: the prompt is echoed back verbatim before the completion.
        completion = generated_text[len(prompt):]
    else:
        # Best effort when the echo is not byte-identical: skip exactly as many
        # assistant tags as the prompt contained (at least one).
        skip = max(prompt.count(_ASSISTANT_TAG), 1)
        completion = generated_text.split(_ASSISTANT_TAG, skip)[-1]
    # Drop any follow-up user turn the model may have generated.
    return completion.split(_USER_TAG)[0].strip()


def respond(message: str, instruction: str = DEFAULT_INSTRUCTION) -> str:
    """Generate the assistant's answer to *message*.

    Args:
        message: The user's question.
        instruction: System message placed at the start of the prompt.

    Returns:
        The assistant's reply with surrounding whitespace removed.

    Raises:
        RuntimeError: If the API returns an error payload or a response that
            does not have the expected ``[{"generated_text": ...}]`` shape.
    """
    prompt = format_chat_prompt(message, instruction)

    # Encode once; when over budget keep the trailing tokens so the prompt
    # still ends with the user's message and the "Assistant:" cue.
    token_ids = tokenizer.encode(prompt)
    if len(token_ids) > MAX_PROMPT_TOKENS:
        prompt = tokenizer.decode(token_ids[-MAX_PROMPT_TOKENS:])

    result = query({"inputs": prompt})

    if isinstance(result, dict) and "error" in result:
        raise RuntimeError(f"Inference API error: {result['error']}")
    try:
        generated_text = result[0]["generated_text"]
    except (IndexError, KeyError, TypeError) as err:
        raise RuntimeError(
            f"Unexpected Inference API response: {result!r}"
        ) from err

    return _extract_reply(prompt, generated_text)


# NOTE(review): the gr.inputs / gr.outputs namespaces are the legacy Gradio
# component API; newer Gradio releases are understood to expect
# gr.Textbox(value=...) instead - confirm against the installed version
# before upgrading.
iface = gr.Interface(
    respond,
    inputs=[
        gr.inputs.Textbox(label="Your question"),
        gr.inputs.Textbox(
            label="System message",
            lines=2,
            default=DEFAULT_INSTRUCTION,
        ),
    ],
    outputs=[
        gr.outputs.Textbox(label="AI's response"),
    ],
)

iface.launch()