import gradio as gr
import requests
import re
# -----------------------------
# 1. Configure the open-source LLM API endpoint
# For demonstration, we use the hosted Inference API on Hugging Face,
# which is free to use up to a rate limit.
# -----------------------------
# Example: we'll use an OpenAssistant model endpoint on HF. Many models
# listed under the Hugging Face "Models" section expose this free
# Inference API.
API_URL = "https://api-inference.huggingface.co/models/OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5"
# API_URL = "https://api-inference.huggingface.co/models/OpenAssistant/falcon-7b-sft-mix-2000"
headers = {}  # anonymous requests work, but are heavily rate-limited
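# If you hit the anonymous rate limit, an access token raises it. A minimal
# sketch, assuming the token lives in an HF_TOKEN environment variable (the
# variable name is our choice, not something the API requires):
import os

if os.environ.get("HF_TOKEN"):
    headers = {"Authorization": f"Bearer {os.environ['HF_TOKEN']}"}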
# -----------------------------
# 2. Define a function to query the model
# -----------------------------
def query_model(prompt: str) -> str:
    """
    Sends the prompt to the Hugging Face Inference API and returns the
    model's response.
    """
    # The payload format for text generation can vary by model. We'll try
    # the common shape for text-generation endpoints:
    payload = {
        "inputs": prompt,
        "parameters": {
            "max_new_tokens": 200,  # limit response length
            "temperature": 0.7,     # moderate creativity
        },
    }
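    # Hypothetical tweak (an assumption, not in the original): the HF
    # text-generation API also accepts "return_full_text": False inside
    # "parameters", which strips the echoed prompt on models that honor it.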
    response = requests.post(API_URL, headers=headers, json=payload, timeout=60)
    if response.status_code == 200:
        model_output = response.json()
        # The response key ("generated_text" or "text") varies by model.
        if isinstance(model_output, dict) and "generated_text" in model_output:
            return model_output["generated_text"]
        elif isinstance(model_output, list) and len(model_output) > 0:
            # Some endpoints return a list of dicts
            return model_output[0].get("generated_text", "")
        else:
            return "Error: Unexpected model output format."
    else:
        return f"Error {response.status_code}: {response.text}"
# -----------------------------
# 3. Define a simple evaluation function
# This is a naive "keyword and structure" based scoring for demonstration.
# -----------------------------
def evaluate_response(response: str) -> dict:
    """
    Rates the response on a 0-5 scale for:
    1) Relevance (R)
    2) Depth (D)
    3) Clarity (C)
    4) References (E)
    5) Overall Quality (Q)
    Returns a dict with the individual scores and the total.
    """
    # We'll take a very simplistic approach:
    # Relevance: presence of 'remote work' and a mention of 'software engineer(s)'
    relevance = 5 if ("remote work" in response.lower() and "software engineer" in response.lower()) else 0
    # Depth: reward longer answers
    word_count = len(response.split())
    depth = 5 if word_count > 150 else (4 if word_count > 80 else 0)
    # Clarity: reward answers structured into multiple paragraphs
    paragraphs = response.strip().split("\n\n")
    clarity = 5 if len(paragraphs) >= 2 else 0
    # References: look for 'reference', 'source', 'citation', or a URL
    if re.search(r"reference|source|citation|http", response, re.IGNORECASE):
        references = 5
    else:
        references = 0
    # Overall Quality: a naive combination.
    # We penalize text that is too short or obviously incomplete.
    if "..." in response[-10:]:
        # A trailing ellipsis suggests the generation was cut off.
        overall = 0
    else:
        overall = 5 if (relevance >= 4 and depth >= 4 and references >= 4) else 0
    # Summation
    total_score = relevance + depth + clarity + references + overall
    return {
        "Relevance": relevance,
        "Depth": depth,
        "Clarity": clarity,
        "References": references,
        "Overall": overall,
        "Total": total_score
    }
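# Example of the rubric on a toy answer (illustrative, hand-computed against
# the rules above):
#
#   evaluate_response(
#       "Remote work helps software engineers focus.\n\nSource: https://example.com"
#   )
#   # -> {'Relevance': 5, 'Depth': 0, 'Clarity': 5, 'References': 5,
#   #     'Overall': 0, 'Total': 15}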
# -----------------------------
# 4. Define the Gradio interface function
# This is the function that runs when the user clicks "Generate & Evaluate".
# -----------------------------
def generate_and_evaluate(prompt: str):
    if not prompt.strip():
        return "Please enter a prompt.", {}
    # 1) Get the LLM response
    llm_response = query_model(prompt)
    # 2) Evaluate it
    scores = evaluate_response(llm_response)
    return llm_response, scores
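# The returned tuple maps positionally onto the outputs wired to the button
# below: llm_response -> response_output (Textbox), scores -> score_output (JSON).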
# -----------------------------
# 5. Build the Gradio UI
# -----------------------------
with gr.Blocks() as demo:
    gr.Markdown("# Prompt Evaluator")
    gr.Markdown(
        "Enter a prompt (the demo rubric expects one about remote work for software engineers). "
        "The model will generate a response and our auto-evaluator will score it."
    )
    prompt_input = gr.Textbox(
        label="Enter your prompt here",
        placeholder="E.g., 'Write a short report on the benefits of remote work for software engineers...'",
        lines=3
    )
    generate_button = gr.Button("Generate & Evaluate")
    response_output = gr.Textbox(
        label="LLM Response",
        lines=10
    )
    score_output = gr.JSON(
        label="Evaluation Scores",
        visible=True
    )
    generate_button.click(
        fn=generate_and_evaluate,
        inputs=[prompt_input],
        outputs=[response_output, score_output]
    )
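# Optional: calling demo.queue() before launching serializes concurrent
# requests, which helps here because each model call can take several seconds.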
# -----------------------------
# 6. Launch
# -----------------------------
if __name__ == "__main__":
    demo.launch()
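    # Optional (assumption: running locally rather than on Spaces, where
    # hosting is automatic): demo.launch(share=True) would instead create a
    # temporary public link.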