|
import csv
import json
import os
import re
import shutil

import gradio as gr
import pandas as pd
from huggingface_hub import Repository
|
|
|
# Hugging Face access token taken from the environment; None when the
# variable is not set.
HF_TOKEN = os.environ.get("HF_TOKEN")


# Column headers of the leaderboard table, in display order.
MODEL_INFO = [
    "Models", "Model Size(B)", "Data Source",
    "DP Acc", "DP False Positive Rate", "DP False Negative Score", "DP MCC",
    "CoT Acc", "CoT False Positive Rate", "CoT False Negative Score", "CoT MCC",
]


# One type tag per MODEL_INFO entry (11 each).
# NOTE(review): presumably passed to the Gradio table as its column datatypes
# -- confirm against the UI code.
DATA_TITLE_TYPE = [
    'markdown', 'str', 'markdown',
    'number', 'number', 'number', 'number',
    'number', 'number', 'number', 'number',
]

SUBMISSION_NAME = "Chumor-submissions"
# Built with "/" explicitly: os.path.join uses the OS path separator, which
# would put a backslash into the URL on Windows.
SUBMISSION_URL = f"https://huggingface.co/datasets/dnaihao/{SUBMISSION_NAME}"
# Results file inside the local checkout of the submissions repository.
CSV_DIR = "./Chumor-submissions/result.csv"

# Alias (same list object) for the columns shown in the table.
COLUMN_NAMES = MODEL_INFO
|
|
|
# Markdown shown at the top of the leaderboard page.
LEADERBOARD_INTRODUCTION = """# Chumor Leaderboard

## Introduction
We construct Chumor, the first Chinese humor explanation dataset that exceeds the size of existing humor datasets. Chumor is sourced from Ruo Zhi Ba (弱智吧), a Chinese Reddit-like platform known for sharing intellectually challenging and culturally specific jokes.


## What's new about Chumor

Unlike existing datasets that focus on tasks such as humor detection, punchline identification, or humor generation, Chumor addresses the challenge of humor explanation. This involves not just identifying humor but understanding the reasoning behind it, a task that requires both linguistic and cultural knowledge. Specifically, Chumor tasks the LLMs with determining whether an explanation fully explains the joke. We source the explanations from GPT-4o and ERNIE-4-turbo, and have the entire dataset manually annotated by five native Chinese speakers.

For detailed information about the dataset, visit our page on Hugging Face: https://huggingface.co/datasets/dnaihao/Chumor.

If you are interested in replicating these results or wish to evaluate your models using our dataset, access our evaluation scripts available on GitHub: https://github.com/dnaihao/Chumor-dataset.

If you would like to learn more details about our dataset, please check out our paper: https://arxiv.org/pdf/2406.12754; https://arxiv.org/pdf/2412.17729.

Below you can find the accuracies of different models tested on this dataset.

### Acknowledgements

We construct the leaderboard based on the template by https://huggingface.co/spaces/TIGER-Lab/MMLU-Pro.

"""
|
|
|
# Text placed with the results table; currently just a single newline.
TABLE_INTRODUCTION = "\n"
|
|
|
# Markdown summary of the dataset: the task, where the jokes and explanations
# come from, and how the gold labels were annotated.
LEADERBOARD_INFO = """
## Dataset Summary
- **Questions and Labels:** The task is to decide whether the provided explanation fully explains the joke (good) or does not fully explain the joke (bad).
- **Sources:**
- **Jokes:** We construct our dataset by including RZB jokes from "Best Annual Threads" between 2018 and 2021 that have been previously crawled (https://github.com/Leymore/ruozhiba). In addition, we directly collect all threads in the "Moderator's Recommendation" section from RZB.
- **Explanations:** We source the explanations from GPT-4o and ERNIE-4-turbo.
- **Annotations:** We manually annotate the generated explanations as either "fully explain the joke" (good) or "partially explain or not explain the joke" (bad). The gold label is determined by the majority vote among five native Chinese speakers.
"""
|
|
|
# Caption for the citation widget.
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
# BibTeX entries for the two Chumor papers (arXiv 2406.12754 and 2412.17729).
CITATION_BUTTON_TEXT = r"""
@article{he2024chumor,
title={Chumor 1.0: A Truly Funny and Challenging Chinese Humor Understanding Dataset from Ruo Zhi Ba},
author={He, Ruiqi and He, Yushu and Bai, Longju and Liu, Jiarui and Sun, Zhenjie and Tang, Zenghao and Wang, He and Xia, Hanchen and Deng, Naihao},
journal={arXiv preprint arXiv:2406.12754},
year={2024}
}

@misc{he2024chumor20benchmarkingchinese,
title={Chumor 2.0: Towards Benchmarking Chinese Humor Understanding},
author={Ruiqi He and Yushu He and Longju Bai and Jiarui Liu and Zhenjie Sun and Zenghao Tang and He Wang and Hanchen Xia and Rada Mihalcea and Naihao Deng},
year={2024},
eprint={2412.17729},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2412.17729},
}
"""
|
|
|
# Markdown instructions for submitting results. The heading names the Chumor
# leaderboard; it previously still carried the "MMLU-Pro" name of the
# template this page was derived from.
SUBMIT_INTRODUCTION = """# Submit on Chumor Leaderboard Introduction

## ⚠ Please note that you need to submit the CSV file with the following format:

```csv
labels
good
good
bad
...
```

You can generate an output file in the above format using the evaluation script provided in our GitHub repository. For your convenience, the script and detailed instructions are available at GitHub: https://github.com/dnaihao/Chumor-dataset. After generating the file, please send us an email at dnaihao@umich.edu, attaching the output file.
"""
|
|
|
|
|
def get_df():
    """Load the leaderboard results as a DataFrame sorted by "DP Acc".

    Creates a ``Repository`` for the submissions repo (local directory
    ``SUBMISSION_NAME``), pulls it, and reads ``CSV_DIR``. The
    "Model Size(B)" column is passed through ``process_model_size``.

    Returns:
        The results sorted by "DP Acc", highest first.
    """
    submissions = Repository(
        local_dir=SUBMISSION_NAME,
        clone_from=SUBMISSION_URL,
        use_auth_token=HF_TOKEN,
    )
    submissions.git_pull()

    results = pd.read_csv(CSV_DIR)
    results['Model Size(B)'] = results['Model Size(B)'].apply(process_model_size)
    return results.sort_values(by=['DP Acc'], ascending=False)
|
|
|
|
|
def add_new_eval(
    input_file,
):
    """Append an uploaded result to the submissions CSV and push it to the Hub.

    Args:
        input_file: JSON text of the submission (anything ``json.loads``
            accepts). The decoded object must provide the keys ``"Model"``
            and ``"DP Acc"``. ``None`` means nothing was uploaded.

    Returns:
        ``"Error! Empty file!"`` when ``input_file`` is ``None``; otherwise
        ``None`` -- the outcome is only printed.
    """
    if input_file is None:
        return "Error! Empty file!"

    upload_data = json.loads(input_file)
    print("upload_data:\n", upload_data)
    data_row = [f'{upload_data["Model"]}', upload_data['DP Acc']]
    print("data_row:\n", data_row)
    submission_repo = Repository(local_dir=SUBMISSION_NAME, clone_from=SUBMISSION_URL,
                                 use_auth_token=HF_TOKEN, repo_type="dataset")
    submission_repo.git_pull()

    # Explicit UTF-8 so the result does not depend on the platform's default
    # encoding; newline='' is what the csv module expects for file objects.
    # Blank lines parse to [] and are dropped, so row[0] below cannot raise.
    with open(CSV_DIR, mode='r', newline='', encoding='utf-8') as file:
        existing_rows = [row for row in csv.reader(file, delimiter=',') if row]

    # The first field of every row (header included) is treated as the model
    # name for the duplicate check.
    already_submitted = {row[0] for row in existing_rows}
    if data_row[0] in already_submitted:
        print('The entry already exists')
        return None

    header = existing_rows[0] if existing_rows else []
    if 'DP Acc' in header:
        # Place each value under the CSV column of the same name; writing the
        # bare two-field row would put the accuracy in the second column.
        # Columns the upload does not provide are left empty.
        new_row = [upload_data.get(column, '') for column in header]
        new_row[0] = data_row[0]
    else:
        # No recognisable header: keep the plain two-field row.
        new_row = data_row

    with open(CSV_DIR, mode='a', newline='', encoding='utf-8') as file:
        csv.writer(file).writerow(new_row)

    submission_repo.push_to_hub()
    print('Submission Successful')
    return None
|
|
|
def refresh_data():
    """Reload the results via ``get_df`` and return only ``COLUMN_NAMES``."""
    return get_df()[COLUMN_NAMES]
|
|
|
|
|
def search_and_filter_models(df, query, min_size, max_size):
    """Filter the leaderboard by model name and by model size.

    Args:
        df: DataFrame with at least the columns in ``COLUMN_NAMES``.
        query: Text to look for in the "Models" column, case-insensitively.
            It is used as a regular expression when it compiles as one and as
            plain text otherwise. Falsy values skip the name filter.
        min_size: Lower bound (inclusive) for "Model Size(B)".
        max_size: Upper bound (inclusive) for "Model Size(B)".

    Returns:
        A new DataFrame with the matching rows, restricted to
        ``COLUMN_NAMES``. ``df`` itself is not modified.
    """
    filtered_df = df.copy()

    if query:
        # The query is free-form text; something like "(" is not a valid
        # pattern and would otherwise raise inside str.contains.
        try:
            re.compile(query)
        except re.error:
            use_regex = False
        else:
            use_regex = True
        name_mask = filtered_df['Models'].str.contains(
            query, case=False, na=False, regex=use_regex)
        filtered_df = filtered_df[name_mask]

    def _size_in_range(size):
        # Rows without a known size are placed at 1000.0 for the comparison.
        value = 1000.0 if size in ('unknown', '-', 'unk') else size
        return min_size <= value <= max_size

    # astype(bool) matters when no row is left: apply() on an empty column
    # keeps the column's dtype, and indexing with a non-boolean Series selects
    # columns instead of rows, which makes the final lookup fail.
    size_mask = filtered_df['Model Size(B)'].apply(_size_in_range).astype(bool)
    filtered_df = filtered_df[size_mask]

    return filtered_df[COLUMN_NAMES]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def search_models(df, query):
    """Return the rows of ``df`` whose "Models" value contains ``query``.

    Args:
        df: DataFrame with a "Models" column.
        query: Text to look for, case-insensitively. It is used as a regular
            expression when it compiles as one and as plain text otherwise.
            Falsy values disable filtering.

    Returns:
        ``df`` itself when ``query`` is falsy, otherwise the matching rows.
        Rows with a missing "Models" value never match.
    """
    if not query:
        return df

    # The query is free-form text; something like "(" is not a valid pattern
    # and would otherwise raise inside str.contains.
    try:
        re.compile(query)
    except re.error:
        use_regex = False
    else:
        use_regex = True

    return df[df['Models'].str.contains(query, case=False, na=False, regex=use_regex)]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_size_range(df):
    """Return ``(min, max)`` of the "Model Size(B)" column as floats.

    Entries equal to 'unknown', '-' or 'unk' count as 1000.0.
    """
    def _as_number(size):
        if size == 'unknown' or size == '-' or size == 'unk':
            return 1000.0
        return size

    numeric_sizes = df['Model Size(B)'].apply(_as_number)
    smallest = numeric_sizes.min()
    largest = numeric_sizes.max()
    return float(smallest), float(largest)
|
|
|
|
|
def process_model_size(size):
    """Convert a raw model-size cell to a float, or to ``'unknown'``.

    Missing values, the placeholders 'unk' and "-", and anything ``float()``
    rejects all become ``'unknown'``.
    """
    if pd.isna(size) or size in ('unk', "-"):
        return 'unknown'

    try:
        parsed = float(size)
    except (ValueError, TypeError):
        return 'unknown'
    else:
        return parsed
|
|