# Spaces: Running
import os
import re
from datetime import datetime

import gradio as gr
import gspread
import openai
from oauth2client.service_account import ServiceAccountCredentials
# Read the OpenAI API key from the environment. Indexing os.environ directly
# (instead of .get) means a missing OPENAI_API_KEY raises KeyError right here,
# at import time.
openai.api_key = os.environ["OPENAI_API_KEY"]
def get_evaluation_questions():
    """
    Collect the evaluation question/answer pairs defined in the environment.

    Pairs are read from consecutively numbered variables, starting at 1:
    TEST_QUESTION_1 / TEST_EXPECTED_1, TEST_QUESTION_2 / TEST_EXPECTED_2, ...
    Reading stops at the first index where either variable is missing or
    empty, so pairs after a gap in the numbering are not loaded.

    Returns a list of {"question": ..., "expected": ...} dicts, in index order.
    """
    collected = []
    index = 1
    while True:
        prompt_text = os.environ.get(f"TEST_QUESTION_{index}")
        answer_text = os.environ.get(f"TEST_EXPECTED_{index}")
        # Both halves of the pair must be present and non-empty to continue.
        if not (prompt_text and answer_text):
            return collected
        collected.append({"question": prompt_text, "expected": answer_text})
        index += 1
# Snapshot the evaluation set a single time, when the module is imported;
# evaluate_prompt iterates over this list on every submission.
EVALUATION_QUESTIONS = get_evaluation_questions()
def init_sheet():
    """
    Open the results spreadsheet and return its first worksheet.

    Credentials are loaded from the service-account key file
    "credentials.json" (resolved relative to the working directory).
    The spreadsheet title comes from the SHEET_NAME environment variable and
    falls back to "Prompt Evaluations" when that variable is not set.
    The spreadsheet must be shared with the service account's email.
    """
    credentials = ServiceAccountCredentials.from_json_keyfile_name(
        "credentials.json",
        [
            "https://spreadsheets.google.com/feeds",
            "https://www.googleapis.com/auth/spreadsheets",
            "https://www.googleapis.com/auth/drive",
        ],
    )
    workbook_title = os.environ.get("SHEET_NAME", "Prompt Evaluations")
    return gspread.authorize(credentials).open(workbook_title).sheet1
# Matches every character that is NOT an ASCII letter, digit, whitespace,
# or one of . , ! ? @ : -   Compiled once because it is applied to every
# submitted field.
_DISALLOWED_CHARS = re.compile(r"[^a-zA-Z0-9\s.,!?@:\-]")


def sanitize_input(text, max_length=500):
    """
    Strip disallowed characters from *text* and cap its length.

    Only ASCII letters, digits, whitespace and the punctuation . , ! ? @ : -
    are kept; everything else (including `_`, `+`, quotes, brackets and
    non-ASCII characters) is removed. The result is stripped of surrounding
    whitespace and then truncated to *max_length* characters.

    Args:
        text: The raw value to clean. ``None`` is treated as an empty string
            and non-string values are converted with ``str()``, so an empty
            or unexpected form field cannot raise inside the regex call.
        max_length: Maximum number of characters to keep (default 500).

    Returns:
        The cleaned, truncated string.
    """
    if text is None:
        return ""
    cleaned = _DISALLOWED_CHARS.sub("", str(text))
    return cleaned.strip()[:max_length]
def _ask_model(system_prompt, question):
    """
    Ask the model one question under *system_prompt*.

    Returns a ``(answer, succeeded)`` tuple. On any failure the answer is an
    error description and ``succeeded`` is False, so the caller can tell a
    real model answer from an error message.
    """
    try:
        # NOTE(review): `openai.ChatCompletion` is the pre-1.0 SDK interface;
        # confirm the pinned openai version still provides it.
        response = openai.ChatCompletion.create(
            model="gpt-4o-mini",  # Ensure this identifier matches the deployed model.
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": question},
            ],
        )
        return response.choices[0].message["content"].strip(), True
    except Exception as e:
        # Best-effort: one failed call must not abort the whole evaluation.
        return f"Error during OpenAI API call: {str(e)}", False


def _log_submission(email, name, score, system_prompt):
    """Append one result row to the Google Sheet; failures are printed, not raised."""
    try:
        sheet = init_sheet()
        timestamp = datetime.now().isoformat()
        sheet.append_row([timestamp, email, name, score, system_prompt])
    except Exception as err:
        # Logging is best-effort: the user still gets their score.
        print("Error writing to Google Sheet:", err)


def evaluate_prompt(email, name, system_prompt):
    """
    Score a user-supplied system prompt against the evaluation questions.

    For each entry in EVALUATION_QUESTIONS:
      - Uses the provided system prompt to generate a response with GPT-4o Mini.
      - Counts the question as correct when the expected text appears in the
        answer (case-insensitive substring match).
    A question whose API call failed is always counted as incorrect; the
    error text is shown as the answer but is never matched against the
    expected text.

    All three inputs are passed through sanitize_input before use. The
    timestamp, email, name, score and sanitized system prompt are then
    appended to the Google Sheet; a failure to write is printed and ignored.

    Args:
        email: Submitter's email address.
        name: Submitter's name.
        system_prompt: The system prompt to evaluate.

    Returns:
        A human-readable report: the score summary followed by the question,
        answer, expected text and verdict for every question.
    """
    # Sanitize the inputs.
    email = sanitize_input(email)
    name = sanitize_input(name)
    system_prompt = sanitize_input(system_prompt)

    score = 0
    responses = []
    for item in EVALUATION_QUESTIONS:
        question = item["question"]
        expected = item["expected"]
        answer, succeeded = _ask_model(system_prompt, question)

        # Only a real model answer can earn a point; an error message that
        # happens to contain the expected text must not.
        is_correct = succeeded and expected.lower() in answer.lower()
        if is_correct:
            score += 1
        verdict = "Correct" if is_correct else "Incorrect"

        responses.append(
            f"Question: {question}\n"
            f"Answer: {answer}\n"
            f"Expected: {expected}\n"
            f"Result: {verdict}\n"
        )
    result_details = "\n".join(responses)

    _log_submission(email, name, score, system_prompt)

    return f"Your evaluation score is {score} out of {len(EVALUATION_QUESTIONS)}.\n\nDetails:\n{result_details}"
def build_interface():
    """
    Assemble the Gradio Blocks UI and return it (without launching).

    The page shows a heading, three text inputs (email, name, system prompt),
    an "Evaluate" button and a results box. Clicking the button passes the
    three inputs, in that order, to evaluate_prompt and writes the returned
    text into the results box.
    """
    with gr.Blocks() as demo:
        gr.Markdown("# GPT-4o Mini Prompt Evaluation")
        gr.Markdown("Enter your email, name, and a system prompt below:")

        # Components are created in display order; the list order matches
        # evaluate_prompt's positional parameters.
        form_fields = [
            gr.Textbox(label="Email", placeholder="your.email@example.com"),
            gr.Textbox(label="Name", placeholder="Your name"),
            gr.Textbox(
                label="System Prompt",
                placeholder="Enter your system prompt here...",
                lines=6,
            ),
        ]
        submit = gr.Button("Evaluate")
        results_box = gr.Textbox(label="Results", lines=15)

        submit.click(fn=evaluate_prompt, inputs=form_fields, outputs=results_box)
    return demo
if __name__ == "__main__":
    # Bind to every network interface (0.0.0.0) on port 7860 so the app can be
    # reached from outside the host, e.g. when running in a container.
    build_interface().launch(server_name="0.0.0.0", server_port=7860)