Spaces:

DeepJudge
/

Applicant-Task-Submission

Running

App Files Files

Applicant-Task-Submission / app.py

Timothy-Vinzent

Update app.py

821e9b3 verified 3 months ago

raw

history blame

5.1 kB

	import os
	import re
	from datetime import datetime

	import gradio as gr
	import openai
	import gspread
	from oauth2client.service_account import ServiceAccountCredentials

	# Set the OpenAI API key from the OPENAI_API_KEY environment variable.
	# Indexing os.environ directly raises KeyError at import time if it is unset.
	openai.api_key = os.environ["OPENAI_API_KEY"]

	def get_evaluation_questions():
	    """
	    Collect the numbered question/answer pairs defined in the environment.

	    Reads TEST_QUESTION_<n> and TEST_EXPECTED_<n> for n = 1, 2, 3, ... and
	    stops at the first n for which either variable is missing or empty.
	    Returns a list of {"question": ..., "expected": ...} dicts in numeric order.
	    """
	    pairs = []
	    while True:
	        # The next index to probe is always one past the pairs found so far.
	        number = len(pairs) + 1
	        question_text = os.environ.get(f"TEST_QUESTION_{number}")
	        expected_text = os.environ.get(f"TEST_EXPECTED_{number}")
	        if not (question_text and expected_text):
	            return pairs
	        pairs.append({"question": question_text, "expected": expected_text})

	# Load the evaluation questions once, when this module is first executed.
	# The TEST_QUESTION_*/TEST_EXPECTED_* variables are not re-read afterwards.
	EVALUATION_QUESTIONS = get_evaluation_questions()

	def init_sheet():
	    """
	    Open the results spreadsheet and return its first worksheet.

	    Credentials are loaded from the service-account key file
	    "credentials.json" (a relative path). The spreadsheet is looked up by
	    title, taken from the SHEET_NAME environment variable and falling back to
	    "Prompt Evaluations" when that variable is not set.

	    The spreadsheet must be shared with the service account's email address.
	    """
	    spreadsheet_title = os.environ.get("SHEET_NAME", "Prompt Evaluations")
	    credentials = ServiceAccountCredentials.from_json_keyfile_name(
	        "credentials.json",
	        [
	            "https://spreadsheets.google.com/feeds",
	            "https://www.googleapis.com/auth/spreadsheets",
	            "https://www.googleapis.com/auth/drive",
	        ],
	    )
	    return gspread.authorize(credentials).open(spreadsheet_title).sheet1

	def sanitize_input(text, max_length=500):
	    """
	    Reduce free-form user input to a small allow-list of characters.

	    Every character other than ASCII letters, digits, whitespace and the
	    punctuation . , ! ? @ : - is removed. The result is stripped of leading
	    and trailing whitespace and then cut to at most max_length characters.

	    Args:
	        text: The raw input. None yields an empty string; any other
	            non-string value is converted with str() first.
	        max_length: Maximum number of characters to keep (default 500);
	            expected to be non-negative.

	    Returns:
	        The sanitized, length-limited string.
	    """
	    # A missing value must not crash the handler: re.sub raises TypeError on None.
	    if text is None:
	        return ""
	    if not isinstance(text, str):
	        text = str(text)
	    clean_text = re.sub(r"[^a-zA-Z0-9\s.,!?@:\-]", "", text)
	    return clean_text.strip()[:max_length]

	def evaluate_prompt(email, name, system_prompt):
	    """
	    Score a submitted system prompt against the configured test questions.

	    For each entry in EVALUATION_QUESTIONS, the sanitized system prompt and the
	    question are sent to the chat model. An answer counts as correct when it
	    contains the expected text (case-insensitive substring match). A failed
	    API call is always scored as incorrect, whatever the error text says.

	    Afterwards the timestamp, email, name, score and system prompt are appended
	    as one row to the Google Sheet. A failure to write that row is printed and
	    does not change the returned result.

	    Args:
	        email: Submitter's email address (passed through sanitize_input).
	        name: Submitter's name (passed through sanitize_input).
	        system_prompt: System prompt to evaluate (passed through sanitize_input).

	    Returns:
	        A text report: the total score, followed by the question, answer,
	        expected text and verdict for every test question.
	    """
	    # Sanitize the inputs.
	    email = sanitize_input(email)
	    name = sanitize_input(name)
	    system_prompt = sanitize_input(system_prompt)

	    score = 0
	    responses = []
	    for item in EVALUATION_QUESTIONS:
	        question = item["question"]
	        expected = item["expected"]
	        try:
	            # NOTE(review): ChatCompletion.create is the pre-1.0 openai interface;
	            # confirm the installed openai version still provides it.
	            response = openai.ChatCompletion.create(
	                model="gpt-4o-mini",  # Ensure this identifier matches the deployed model.
	                messages=[
	                    {"role": "system", "content": system_prompt},
	                    {"role": "user", "content": question},
	                ],
	            )
	            answer = response.choices[0].message["content"].strip()
	        except Exception as e:
	            # Best-effort boundary: report the failure in the details and keep
	            # going. The error text must never be matched against the expected
	            # answer, otherwise a failure could be counted as a correct reply.
	            answer = f"Error during OpenAI API call: {str(e)}"
	            is_correct = False
	        else:
	            # Simple evaluation: check if the answer contains the expected substring.
	            is_correct = expected.lower() in answer.lower()

	        if is_correct:
	            score += 1
	        verdict = "Correct" if is_correct else "Incorrect"

	        # NOTE(review): the report shows the expected text to the submitter;
	        # confirm that revealing it is intended.
	        responses.append(
	            f"Question: {question}\n"
	            f"Answer: {answer}\n"
	            f"Expected: {expected}\n"
	            f"Result: {verdict}\n"
	        )

	    result_details = "\n".join(responses)

	    # Logging is best-effort: the submitter still gets the result if it fails.
	    try:
	        sheet = init_sheet()
	        timestamp = datetime.now().isoformat()
	        row = [timestamp, email, name, score, system_prompt]
	        sheet.append_row(row)
	    except Exception as err:
	        print("Error writing to Google Sheet:", err)

	    return f"Your evaluation score is {score} out of {len(EVALUATION_QUESTIONS)}.\n\nDetails:\n{result_details}"

	def build_interface():
	    """
	    Build the Gradio Blocks app used to submit a system prompt.

	    The page holds a heading, an instruction line, Email, Name and System
	    Prompt text boxes, an "Evaluate" button and a Results text box. Clicking
	    the button calls evaluate_prompt with the three inputs and writes its
	    return value into the Results box.

	    Returns the Blocks object; launching it is left to the caller.
	    """
	    with gr.Blocks() as demo:
	        gr.Markdown("# GPT-4o Mini Prompt Evaluation")
	        gr.Markdown("Enter your email, name, and a system prompt below:")

	        # Listed in the positional order evaluate_prompt expects its arguments.
	        form_fields = [
	            gr.Textbox(label="Email", placeholder="your.email@example.com"),
	            gr.Textbox(label="Name", placeholder="Your name"),
	            gr.Textbox(
	                label="System Prompt",
	                placeholder="Enter your system prompt here...",
	                lines=6,
	            ),
	        ]
	        submit = gr.Button("Evaluate")
	        results_box = gr.Textbox(label="Results", lines=15)

	        submit.click(fn=evaluate_prompt, inputs=form_fields, outputs=results_box)
	    return demo

	if __name__ == "__main__":
	    # Listen on 0.0.0.0:7860 so the app is accessible externally (e.g., in a container).
	    build_interface().launch(server_name="0.0.0.0", server_port=7860)