Shiyu Zhao committed
Commit 680cbe9 • Parent: 53e6c12

Update space
Files changed:
- README.md +1 -0
- app.py +293 -41
- requirements.txt +3 -1
README.md
CHANGED
@@ -8,6 +8,7 @@ app_file: app.py
 pinned: true
 license: mit
 short_description: leaderboard of Semi-structured Retrieval Benchmark (STaRK)
+hf_oauth: write
 ---

 # Start the configuration
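The new hf_oauth: write line pairs with the gradio[oauth] entry already present in requirements.txt. As a rough sketch of how a Space typically consumes this setting (gr.LoginButton and gr.OAuthProfile are gradio's OAuth API; none of this code is part of the commit):

import gradio as gr

with gr.Blocks() as demo:
    # With hf_oauth enabled in the README metadata, the Space can offer a
    # Hugging Face sign-in; gradio injects the OAuth profile automatically.
    gr.LoginButton()
    status = gr.Markdown()

    def whoami(profile: gr.OAuthProfile | None) -> str:
        return "Not signed in." if profile is None else f"Signed in as {profile.username}."

    demo.load(whoami, inputs=None, outputs=status)

demo.launch()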
app.py
CHANGED
@@ -8,6 +8,9 @@ import json
 import torch
 from tqdm import tqdm
 from concurrent.futures import ProcessPoolExecutor, as_completed
+import smtplib
+from email.mime.multipart import MIMEMultipart
+from email.mime.text import MIMEText

 from stark_qa import load_qa
 from stark_qa.evaluator import Evaluator
@@ -283,62 +286,311 @@ def update_leaderboard_data(submission_data):
     # Add new row
     df_to_update.loc[len(df_to_update)] = new_row

+# Function to get emails from meta_data
+def get_emails_from_metadata(meta_data):
+    """
+    Extracts emails from the meta_data dictionary.
+
+    Args:
+        meta_data (dict): The metadata dictionary that contains the 'Contact Email(s)' field.
+
+    Returns:
+        list: A list of email addresses.
+    """
+    return [email.strip() for email in meta_data.get("Contact Email(s)", "").split(";")]
+
+# Function to format meta_data as an HTML table (without Prediction CSV)
+def format_metadata_as_table(meta_data):
+    """
+    Formats metadata dictionary into an HTML table for the email.
+    Handles multiple contact emails separated by a semicolon.
+
+    Args:
+        meta_data (dict): Dictionary containing submission metadata.
+
+    Returns:
+        str: HTML string representing the metadata table.
+    """
+    table_rows = ""
+
+    for key, value in meta_data.items():
+        if key == "Contact Email(s)":
+            # Ensure that contact emails are split by semicolon
+            emails = value.split(';')
+            formatted_emails = "; ".join([email.strip() for email in emails])
+            table_rows += f"<tr><td><b>{key}</b></td><td>{formatted_emails}</td></tr>"
+        elif key != "Prediction CSV":  # Exclude the Prediction CSV field
+            table_rows += f"<tr><td><b>{key}</b></td><td>{value}</td></tr>"
+
+    table_html = f"""
+    <table border="1" cellpadding="5" cellspacing="0">
+        {table_rows}
+    </table>
+    """
+    return table_html
+
+# Function to get emails from meta_data
+def get_emails_from_metadata(meta_data):
+    """
+    Extracts emails from the meta_data dictionary.
+
+    Args:
+        meta_data (dict): The metadata dictionary that contains the 'Contact Email(s)' field.
+
+    Returns:
+        list: A list of email addresses.
+    """
+    return [email.strip() for email in meta_data.get("Contact Email(s)", "").split(";")]
+
+def send_error_notification(meta_data, error_info):
+    """
+    Sends an email notification about an error during the evaluation process.
+
+    Args:
+        meta_data (dict): Submission metadata to be included in the email.
+        error_info (str): Error message or notification content to be included in the email.
+
+    Returns:
+        None
+    """
+    emails_to_send = get_emails_from_metadata(meta_data)
+    send_from = 'stark-qa@cs.stanford.edu'
+    recipients_str = ', '.join(emails_to_send)
+
+    # Create the email container
+    msg = MIMEMultipart('alternative')
+    msg['Subject'] = 'STaRK Leaderboard Submission - Error Notification'
+    msg['From'] = send_from
+    msg['To'] = recipients_str
+
+    # Format the metadata table
+    metadata_table = format_metadata_as_table(meta_data)
+
+    # Email body content with metadata table
+    body = f"""
+    <p>Dear STaRK Leaderboard Participant,</p>
+
+    <p>We encountered an issue during the evaluation of your recent submission:</p>
+
+    <p><i>{error_info}</i></p>
+
+    <p>Please verify your inputs and resubmit. If the issue persists, feel free to contact us at stark-qa@cs.stanford.edu with the error details and your dataset information.</p>
+
+    <p>Submitted Metadata:</p>
+    {metadata_table}
+
+    <p>Thank you for your participation.</p>
+
+    <p>Best regards,<br>The STaRK QA Team</p>
+    """
+
+    msg.attach(MIMEText(body, 'html'))
+
+    # Send the email
+    try:
+        with smtplib.SMTP('localhost') as server:
+            server.sendmail(send_from, emails_to_send, msg.as_string())  # No CC for error notification
+        print("Error notification sent successfully.")
+    except Exception as e:
+        print(f"Failed to send error notification: {e}")
+
+# Function to send a submission confirmation with evaluation results and metadata, CCing the sender
+def send_submission_confirmation(meta_data, eval_results):
+    """
+    Sends an email notification confirming submission and including evaluation results and metadata,
+    with an option to CC the sender.
+
+    Args:
+        meta_data (dict): Submission metadata to be included in the email.
+        eval_results (dict): Dictionary of evaluation results to include in the email.
+
+    Returns:
+        None
+    """
+    emails_to_send = get_emails_from_metadata(meta_data)
+    send_from = 'stark-qa@cs.stanford.edu'
+    recipients_str = ', '.join(emails_to_send)
+
+    # Create the email container
+    msg = MIMEMultipart('alternative')
+    msg['Subject'] = 'STaRK Leaderboard Submission - Evaluation Results'
+    msg['From'] = send_from
+    msg['To'] = recipients_str
+    msg['Cc'] = send_from  # CC the sender only for success notification
+
+    # Format the evaluation results and metadata table
+    formatted_results = format_evaluation_results(eval_results)
+    metadata_table = format_metadata_as_table(meta_data)
+
+    # Email body content with evaluation results and metadata table
+    body = f"""
+    <p>Dear STaRK Leaderboard Participant,</p>
+
+    <p>Thank you for your submission to the STaRK leaderboard. We are pleased to inform you that the evaluation has been completed. Below are the results of your submission:</p>
+
+    <pre>{formatted_results}</pre>
+
+    <p>Submitted Metadata:</p>
+    {metadata_table}
+
+    <p>Your submission will be reviewed. Once approved, the results will be updated on the leaderboard within the next 48 business hours. If there are problems in the metadata that you submitted, one of our team members will reach out to you.</p>
+
+    <p>If you would like to withdraw your submission, simply reply to this email with "withdrawn."</p>
+
+    <p>We appreciate your participation and look forward to sharing your results on our leaderboard.</p>
+
+    <p>Best regards,<br>The STaRK QA Team</p>
+    """
+
+    msg.attach(MIMEText(body, 'html'))
+
+    # Send the email
+    try:
+        with smtplib.SMTP('localhost') as server:
+            server.sendmail(send_from, emails_to_send + [send_from], msg.as_string())  # Include sender in recipients for CC
+        print("Submission confirmation sent successfully.")
+    except Exception as e:
+        print(f"Failed to send submission confirmation: {e}")
+
+
456 |
def process_submission(
|
457 |
method_name, team_name, dataset, split, contact_email,
|
458 |
code_repo, csv_file, model_description, hardware, paper_link
|
459 |
):
|
460 |
"""Process and validate submission"""
|
461 |
try:
|
462 |
+
# Input validation
|
463 |
+
if not all([method_name, team_name, dataset, split, contact_email, code_repo, csv_file]):
|
464 |
+
return "Error: Please fill in all required fields"
|
|
|
|
|
|
|
|
|
|
|
|
|
465 |
|
466 |
+
# Length validation
|
467 |
+
if len(method_name) > 25:
|
468 |
+
return "Error: Method name must be 25 characters or less"
|
469 |
+
if len(team_name) > 25:
|
470 |
+
return "Error: Team name must be 25 characters or less"
|
471 |
+
if not validate_email(contact_email):
|
472 |
+
return "Error: Invalid email format"
|
473 |
+
if not validate_github_url(code_repo):
|
474 |
+
return "Error: Invalid GitHub repository URL"
|
475 |
|
476 |
+
# Prepare metadata for email
|
477 |
+
meta_data = {
|
478 |
+
"Method Name": method_name,
|
479 |
+
"Team Name": team_name,
|
480 |
+
"Dataset": dataset,
|
481 |
+
"Split": split,
|
482 |
+
"Contact Email(s)": contact_email,
|
483 |
+
"Code Repository": code_repo,
|
484 |
+
"Model Description": model_description,
|
485 |
+
"Hardware": hardware,
|
486 |
+
"(Optional) Paper link": paper_link
|
|
|
|
|
|
|
487 |
}
|
488 |
|
489 |
+
# Save CSV file
|
490 |
+
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
491 |
+
model_name_clean = sanitize_name(method_name)
|
492 |
+
team_name_clean = sanitize_name(team_name)
|
|
|
493 |
|
494 |
+
# Create directory structure in the HuggingFace space
|
495 |
+
base_dir = "submissions" # This will be in the HF space root
|
496 |
+
submission_dir = os.path.join(base_dir, f"{model_name_clean}_{team_name_clean}")
|
497 |
+
os.makedirs(submission_dir, exist_ok=True)
|
498 |
|
499 |
+
# Save CSV file
|
500 |
+
csv_filename = f"predictions_{timestamp}.csv"
|
501 |
+
csv_path = os.path.join(submission_dir, csv_filename)
|
502 |
+
if hasattr(csv_file, 'name'):
|
503 |
+
with open(csv_file.name, 'rb') as source, open(csv_path, 'wb') as target:
|
504 |
+
target.write(source.read())
|
505 |
|
506 |
+
# Validate CSV file
|
507 |
+
csv_valid, csv_message = validate_csv(csv_file)
|
508 |
+
if not csv_valid:
|
509 |
+
error_message = f"Error with CSV file: {csv_message}"
|
510 |
+
send_error_notification(meta_data, error_message)
|
511 |
+
return error_message
|
512 |
|
513 |
+
# Process CSV file through evaluation pipeline
|
514 |
+
try:
|
515 |
+
results = compute_metrics(
|
516 |
+
csv_file.name,
|
517 |
+
dataset=dataset.lower(),
|
518 |
+
split=split,
|
519 |
+
num_workers=4
|
520 |
+
)
|
521 |
+
|
522 |
+
if isinstance(results, str) and results.startswith("Error"):
|
523 |
+
send_error_notification(meta_data, results)
|
524 |
+
return f"Evaluation error: {results}"
|
525 |
+
|
526 |
+
# Multiply results by 100 and round to 2 decimal places
|
527 |
+
processed_results = {
|
528 |
+
"hit@1": round(results['hit@1'] * 100, 2),
|
529 |
+
"hit@5": round(results['hit@5'] * 100, 2),
|
530 |
+
"recall@20": round(results['recall@20'] * 100, 2),
|
531 |
+
"mrr": round(results['mrr'] * 100, 2)
|
532 |
+
}
|
533 |
+
|
534 |
+
# Prepare submission data
|
535 |
+
submission_data = {
|
536 |
+
"method_name": method_name,
|
537 |
+
"team_name": team_name,
|
538 |
+
"dataset": dataset,
|
539 |
+
"split": split,
|
540 |
+
"contact_email": contact_email,
|
541 |
+
"code_repo": code_repo,
|
542 |
+
"model_description": model_description,
|
543 |
+
"hardware": hardware,
|
544 |
+
"paper_link": paper_link,
|
545 |
+
"results": processed_results,
|
546 |
+
"status": "pending_review",
|
547 |
+
"submission_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
|
548 |
+
"csv_path": csv_path
|
549 |
+
}
|
550 |
+
|
551 |
+
# Save metadata
|
552 |
+
metadata_path = os.path.join(submission_dir, f"metadata_{timestamp}.json")
|
553 |
+
with open(metadata_path, 'w') as f:
|
554 |
+
json.dump(submission_data, f, indent=4)
|
555 |
+
|
556 |
+
# Save latest.json
|
557 |
+
latest_path = os.path.join(submission_dir, "latest.json")
|
558 |
+
with open(latest_path, 'w') as f:
|
559 |
+
json.dump({
|
560 |
+
"latest_submission": timestamp,
|
561 |
+
"status": "pending_review",
|
562 |
+
"method_name": method_name
|
563 |
+
}, f, indent=4)
|
564 |
+
|
565 |
+
# Send email confirmation
|
566 |
+
send_submission_confirmation(meta_data, processed_results)
|
567 |
+
|
568 |
+
# Update leaderboard data
|
569 |
+
update_leaderboard_data(submission_data)
|
570 |
+
|
571 |
+
return f"""
|
572 |
+
Submission successful!
|
573 |
+
|
574 |
+
Evaluation Results:
|
575 |
+
Hit@1: {processed_results['hit@1']:.2f}%
|
576 |
+
Hit@5: {processed_results['hit@5']:.2f}%
|
577 |
+
Recall@20: {processed_results['recall@20']:.2f}%
|
578 |
+
MRR: {processed_results['mrr']:.2f}%
|
579 |
+
|
580 |
+
Your submission has been saved and is pending review.
|
581 |
+
A confirmation email has been sent to {contact_email}.
|
582 |
+
Once approved, your results will appear in the leaderboard under the method name: {method_name}
|
583 |
+
"""
|
584 |
+
|
585 |
+
except Exception as e:
|
586 |
+
error_message = f"Error processing submission: {str(e)}"
|
587 |
+
send_error_notification(meta_data, error_message)
|
588 |
+
return error_message
|
589 |
+
|
590 |
except Exception as e:
|
591 |
+
error_message = f"Error processing submission: {str(e)}"
|
592 |
+
send_error_notification(meta_data, error_message)
|
593 |
+
return error_message
|

 def filter_by_model_type(df, selected_types):
     if not selected_types:
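Two quick notes on the helpers added in this hunk: get_emails_from_metadata is defined twice with identical bodies (the second definition harmlessly shadows the first), and both it and format_metadata_as_table are easy to sanity-check in isolation. A minimal sketch with made-up sample data:

# Made-up sample metadata for a quick local check of the new helpers.
meta = {
    "Method Name": "MyRetriever",
    "Contact Email(s)": "a@example.com; b@example.com",
    "Prediction CSV": "predictions.csv",  # excluded from the HTML table
}
print(get_emails_from_metadata(meta))  # ['a@example.com', 'b@example.com']
print(format_metadata_as_table(meta))  # <table>...</table> rows for every key except the CSV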
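Both mailer functions open smtplib.SMTP('localhost'), i.e., they assume a mail transfer agent on the host; on a machine without one (a stock Hugging Face Space, for instance) delivery falls into the except branch and is only printed. Note also that the Cc header is display-only: actual delivery is driven by the recipient list passed to sendmail(), which is why the confirmation path both sets msg['Cc'] and appends send_from to that list. If a real relay were required, a sketch along these lines could replace the localhost connection (SMTP_HOST, SMTP_USER, and SMTP_PASS are hypothetical settings, not part of this commit):

import os
import smtplib

def open_smtp_connection() -> smtplib.SMTP:
    # Hypothetical relay configured via environment variables.
    server = smtplib.SMTP(os.environ.get("SMTP_HOST", "localhost"), 587)
    server.starttls()  # upgrade the connection before authenticating
    server.login(os.environ["SMTP_USER"], os.environ["SMTP_PASS"])
    return server

Separately, the outer except in the new process_submission calls send_error_notification(meta_data, ...), but meta_data is only bound partway into the try; an exception raised before that assignment surfaces as UnboundLocalError from the handler itself. Binding meta_data = {} before the try would avoid this. A minimal illustration of the hazard (not code from this commit):

def demo():
    try:
        raise ValueError("fails before meta_data is bound")
        meta_data = {"Contact Email(s)": "a@example.com"}
    except Exception:
        return meta_data  # UnboundLocalError: local variable referenced before assignment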
requirements.txt
CHANGED
@@ -1,6 +1,7 @@
 APScheduler
 black
 datasets
+email
 gradio
 gradio[oauth]
 gradio_leaderboard==0.0.9
@@ -15,4 +16,5 @@ transformers
 torch
 tokenizers>=0.15.0
 sentencepiece
-stark_qa
+stark_qa
+smtplib