Create app.py
app.py
ADDED
import os
import re

import gradio as gr
import requests

# -----------------------------
# 1. Configure the open-source LLM API endpoint
# For demonstration, we can use a hosted inference API on Hugging Face
# that is free to use (up to a certain rate limit).
# -----------------------------
# Example: We'll use an OpenAssistant model endpoint on HF.
# You can find many such endpoints in the Hugging Face "Spaces" or "Models"
# sections that provide a free Inference API.

API_URL = "https://api-inference.huggingface.co/models/OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5"

# Read the token from the environment (e.g., a Space secret). If the model
# doesn't require a token, the Authorization header is simply omitted.
HF_API_TOKEN = os.environ.get("HF_API_TOKEN", "")
headers = {"Authorization": f"Bearer {HF_API_TOKEN}"} if HF_API_TOKEN else {}
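
# Illustrative sanity check (hypothetical token value): the same endpoint can
# be exercised directly with curl, which helps when debugging auth or
# rate-limit errors before involving the UI:
#   curl -X POST "$API_URL" \
#        -H "Authorization: Bearer $HF_API_TOKEN" \
#        -d '{"inputs": "Hello"}'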

# -----------------------------
# 2. Define a function to query the model
# -----------------------------
def query_model(prompt: str) -> str:
    """
    Sends the prompt to the Hugging Face Inference API and returns the model's response.
    """
    # The payload format for text generation can vary by model. We'll try a general approach:
    payload = {
        "inputs": prompt,
        "parameters": {
            "max_new_tokens": 200,  # limit response length
            "temperature": 0.7,     # moderate creativity
        },
    }
    # A timeout guards against the request hanging indefinitely.
    response = requests.post(API_URL, headers=headers, json=payload, timeout=60)
    if response.status_code == 200:
        model_output = response.json()
        # The output key ("generated_text" or "text") can vary depending on the model.
        if isinstance(model_output, dict) and "generated_text" in model_output:
            return model_output["generated_text"]
        elif isinstance(model_output, list) and len(model_output) > 0:
            # Some endpoints return a list of dicts.
            return model_output[0].get("generated_text", "")
        else:
            return "Error: Unexpected model output format."
    else:
        return f"Error {response.status_code}: {response.text}"
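
# Note: a successful text-generation call to this endpoint typically returns a
# JSON list such as [{"generated_text": "..."}], which is why the branches
# above check both dict and list shapes.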

# -----------------------------
# 3. Define a simple evaluation function
# This is a naive "keyword and structure" based scoring for demonstration.
# -----------------------------
def evaluate_response(response: str) -> dict:
    """
    Rates the response on a scale of 1–5 for:
    1) Relevance (R)
    2) Depth (D)
    3) Clarity (C)
    4) References (E)
    5) Overall Quality (Q)
    Returns a dict with the individual scores and the total.
    """
    # We'll take a very simplistic approach:

    # Relevance: presence of "remote work" plus a mention of "software engineers".
    relevance = 5 if ("remote work" in response.lower() and "software engineer" in response.lower()) else 3

    # Depth: reward longer answers (> 150 words scores 5, > 80 words scores 4).
    word_count = len(response.split())
    depth = 5 if word_count > 150 else (4 if word_count > 80 else 3)

    # Clarity: check whether the text is split into multiple paragraphs.
    paragraphs = response.strip().split("\n\n")
    clarity = 5 if len(paragraphs) >= 2 else 3

    # References: look for something like 'reference', 'source', 'citation', or a URL.
    if re.search(r"reference|source|citation|http", response, re.IGNORECASE):
        references = 5
    else:
        references = 2

    # Overall Quality: a naive combination.
    # We'll penalize if the text is obviously incomplete.
    if "..." in response[-10:]:
        # If it ends with "...", it may have been cut off mid-sentence.
        overall = 3
    else:
        overall = 5 if (relevance >= 4 and depth >= 4 and references >= 4) else 4

    # Sum of all five scores.
    total_score = relevance + depth + clarity + references + overall

    return {
        "Relevance": relevance,
        "Depth": depth,
        "Clarity": clarity,
        "References": references,
        "Overall": overall,
        "Total": total_score,
    }
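
# Illustrative example (hypothetical input): for a short, single-paragraph
# answer that mentions "remote work" and "software engineers" but cites no
# sources, this returns:
#   {"Relevance": 5, "Depth": 3, "Clarity": 3, "References": 2,
#    "Overall": 4, "Total": 17}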

# -----------------------------
# 4. Define the Gradio interface function
# This is the function that runs when the user clicks "Generate & Evaluate".
# -----------------------------
def generate_and_evaluate(prompt: str):
    if not prompt.strip():
        return "Please enter a prompt.", {}

    # 1) Get the LLM response.
    llm_response = query_model(prompt)

    # 2) Evaluate it.
    scores = evaluate_response(llm_response)

    return llm_response, scores

# -----------------------------
# 5. Build the Gradio UI
# -----------------------------
with gr.Blocks() as demo:
    gr.Markdown("# Remote Work Benefits Generator & Evaluator")
    gr.Markdown(
        "Enter a prompt about the key benefits of remote work for software engineers. "
        "The model will generate a response and our auto-evaluator will score it."
    )

    prompt_input = gr.Textbox(
        label="Enter your prompt here",
        placeholder="E.g., 'Write a short report on the benefits of remote work for software engineers...'",
        lines=3
    )

    generate_button = gr.Button("Generate & Evaluate")

    response_output = gr.Textbox(
        label="LLM Response",
        lines=10
    )

    score_output = gr.JSON(
        label="Evaluation Scores",
        visible=True
    )

    generate_button.click(
        fn=generate_and_evaluate,
        inputs=[prompt_input],
        outputs=[response_output, score_output]
    )
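
# Note: generate_and_evaluate returns a (str, dict) tuple, which lines up with
# the [response_output, score_output] components wired to the click event.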

# -----------------------------
# 6. Launch
# -----------------------------
if __name__ == "__main__":
    demo.launch()
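
A minimal requirements.txt sketch for running this Space (assuming the Gradio SDK; os and re are from the standard library):

gradio
requests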