|
from flask import Flask, request, jsonify, Response |
|
import json |
|
from huggingface_hub import InferenceClient |
|
import threading |
|
import gradio as gr |
|
|
|
# Flask application object; serves the OpenAI-style /completions endpoint below.
app = Flask(__name__)
|
|
|
|
|
def generate_response(model_name, messages, stream=False):
    """Run text generation on the HF Inference API and wrap it as a Flask response.

    Args:
        model_name: HuggingFace model repo id passed to ``InferenceClient``.
        messages: OpenAI-style chat messages; only the first ``system`` and
            first ``user`` message are used to build the prompt.
        stream: when True, return a streaming ``Response`` of newline-delimited
            JSON delta chunks; otherwise a single jsonify'd completion.

    Returns:
        A Flask ``Response`` (streaming) or jsonify'd dict mimicking the
        OpenAI chat-completions shape; on failure, ``{"error": ...}``.
    """
    # Stop markers the model may emit; checked in this order (longest first).
    stop_markers = ("<|assistant|>", "</s>")
    try:
        client = InferenceClient(model_name)
        system_instructions = next(
            (msg["content"] for msg in messages if msg["role"] == "system"),
            "You are a helpful assistant.",
        )
        user_prompt = next(
            (msg["content"] for msg in messages if msg["role"] == "user"), ""
        )

        formatted_prompt = f"[SYSTEM] {system_instructions}[QUESTION]{user_prompt}[ANSWER]"
        generate_kwargs = dict(
            max_new_tokens=100,
            do_sample=True,
        )

        if stream:

            def stream_response():
                accumulated = ""
                for chunk in client.text_generation(
                    formatted_prompt,
                    **generate_kwargs,
                    stream=True,
                    details=True,
                    return_full_text=False,
                ):
                    piece = chunk.token.text
                    accumulated += piece
                    # Trim stop markers from the running transcript.
                    # NOTE(review): only `accumulated` is trimmed; the raw
                    # token text is still streamed to the client unmodified,
                    # matching the original behavior.
                    for marker in stop_markers:
                        if accumulated.endswith(marker):
                            accumulated = accumulated[: -len(marker)]
                            break
                    yield json.dumps(
                        {"choices": [{"delta": {"content": piece}}]}
                    ) + "\n"

            return Response(stream_response(), content_type="application/json")

        else:
            # Bug fix: with stream=False and details=True the client returns a
            # single TextGenerationOutput object, NOT an iterable of token
            # chunks — the old `for response in ...` loop raised TypeError.
            result = client.text_generation(
                formatted_prompt,
                **generate_kwargs,
                stream=False,
                details=True,
                return_full_text=False,
            )
            output = result.generated_text
            # Strip a trailing stop marker for parity with the streaming path.
            for marker in stop_markers:
                if output.endswith(marker):
                    output = output[: -len(marker)]
                    break
            return jsonify(
                {"choices": [{"message": {"role": "assistant", "content": output}}]}
            )

    except Exception as e:
        # Boundary handler: surface the failure to the caller as JSON rather
        # than a 500 HTML page. (Errors raised inside the streaming generator
        # occur after the response has started and are not caught here.)
        return jsonify({"error": str(e)})
|
|
|
|
|
@app.route("/completions", methods=["POST"])
def completions():
    """OpenAI-style chat-completions endpoint.

    Reads the JSON body (``model``, ``messages``, ``stream``) and delegates
    to :func:`generate_response`, which builds the Flask response.
    """
    payload = request.json
    return generate_response(
        payload.get("model", "microsoft/Phi-3-mini-4k-instruct"),
        payload.get("messages", []),
        payload.get("stream", False),
    )
|
|
|
|
|
# Gradio UI: a minimal prompt -> response demo backed by the same
# generate_response helper the Flask endpoint uses.
with gr.Blocks() as demo:
    gr.Markdown("## AI Text Generation")
    with gr.Row():
        user_input = gr.Textbox(label="Enter your prompt")
        generate_button = gr.Button("Generate")
    output_display = gr.Textbox(label="Generated Response")

    def gradio_generate(user_prompt):
        """Generate a completion for the UI and return it as plain text.

        Bug fix: generate_response uses jsonify, which requires an active
        Flask application context — calling it bare from the Gradio thread
        raised RuntimeError, and the resulting Response object was not a
        string the Textbox could display. Run inside app.app_context() and
        unwrap the JSON payload.
        """
        with app.app_context():
            resp = generate_response(
                "microsoft/Phi-3-mini-4k-instruct",
                [{"role": "user", "content": user_prompt}],
            )
        data = json.loads(resp.get_data(as_text=True))
        if "error" in data:
            return f"Error: {data['error']}"
        return data["choices"][0]["message"]["content"]

    generate_button.click(
        fn=gradio_generate,
        inputs=[user_input],
        outputs=output_display,
    )
|
|
|
|
|
def start_gradio():
    """Launch the Gradio UI (blocking) with a public share link.

    Bug fix: Gradio's default server port is 7860 — the same port the Flask
    server binds in __main__. Because this runs on a thread started first,
    the two servers raced for the port and Flask could fail with "Address
    already in use". Pin Gradio to 7861 so both can bind.
    """
    demo.launch(share=True, server_port=7861)
|
|
|
if __name__ == "__main__":
    # Run the Gradio UI on a background thread; Flask owns the main thread.
    ui_thread = threading.Thread(target=start_gradio)
    ui_thread.start()

    app.run(port=7860)
|
|