import gradio as gr
import pandas as pd
import os
import itertools
from constants import metric_dict, tags, columns
# Download from github and load the data
# TODO: Download every x hours
def download_data(url = "https://github.com/Pranjal2041/GEO/GEO-Bench/leaderboard/leaderboard.jsonl", path = "leaderboard.jsonl"):
    """Download the leaderboard file from ``url`` and store it at ``path``.

    The file is first fetched into ``<path>_tmp`` and only moved over ``path``
    once ``wget`` reports success, so a failed download never clobbers an
    existing copy.

    Args:
        url: Location of the leaderboard file.
        path: Destination file name.

    Returns:
        0 on success, otherwise a non-zero code: the ``wget`` exit status, or
        127 when the ``wget`` executable cannot be found.
    """
    # Imported locally so the function does not depend on a file-level import.
    import subprocess

    tmp_path = f'{path}_tmp'
    try:
        # Argument list instead of a shell string: the URL and path are never
        # interpreted by a shell. `--` stops option parsing before the URL.
        ret_code = subprocess.run(['wget', '-O', tmp_path, '--', url]).returncode
    except FileNotFoundError:
        # Same code a shell reports for "command not found".
        ret_code = 127
    if ret_code != 0:
        # Do not leave a partial download lying around.
        try:
            os.remove(tmp_path)
        except OSError:
            pass
        return ret_code
    # Replace the destination in one step (both files live side by side).
    os.replace(tmp_path, path)
    return 0
def search_leaderboard(df, queries):
    """Return the rows of ``df`` whose ``Method`` contains any of ``queries``.

    Matching is a case-insensitive, literal substring test: characters such as
    ``(`` or ``.`` in a query are matched as-is rather than interpreted as a
    regular expression. Rows with a missing ``Method`` never match.

    Args:
        df: DataFrame with a ``Method`` column.
        queries: Iterable of search strings.

    Returns:
        The matching rows, grouped in the order of ``queries`` with duplicate
        rows removed. An empty frame with the columns of ``df`` is returned
        when ``queries`` is empty.
    """
    matches = [
        # regex=False: queries come from a free-text search box, so they must
        # not be able to raise on (or be widened by) regex metacharacters.
        df[df["Method"].str.contains(query, case=False, na=False, regex=False)]
        for query in queries
    ]
    if not matches:
        # pd.concat refuses an empty list; return "no rows" instead.
        return df.iloc[0:0]
    return pd.concat(matches).drop_duplicates()
def search_tags_leaderboard(df, tag_blocks, queries):
    """Filter ``df`` by tags first, then search the result by method name.

    ``df`` and ``tag_blocks`` are handed to ``filter_tags``; its result is then
    searched with ``search_leaderboard`` using ``queries``.
    """
    tag_filtered = filter_tags(df, tag_blocks)
    return search_leaderboard(tag_filtered, queries)
def filter_tags(df, tag_blocks):
    """Keep the rows of ``df`` that satisfy every tag block, then aggregate.

    A row satisfies a block when the block contains the option ``any``
    (case-insensitive), or when at least one of the row's tags is a substring
    of one of the block's options (also case-insensitive).

    Args:
        df: Per-record DataFrame with a ``tags`` column holding an iterable of
            tag strings for each row.
        tag_blocks: One iterable of selected options per tag category.

    Returns:
        The result of ``prepare_complete_dt`` applied to the matching rows.
    """
    def fuzzy_in(x, y_set):
        # True when x occurs inside any of the options in y_set.
        return any(x in z for z in y_set)

    selected_sets = [{option.lower() for option in block} for block in tag_blocks]

    def row_matches(row_tags):
        lowered = [row_tag.lower() for row_tag in row_tags]
        return all(
            'any' in tag_set or any(fuzzy_in(row_tag, tag_set) for row_tag in lowered)
            for tag_set in selected_sets
        )

    # Read the tags from the frame that is actually being filtered, so the
    # positions below always refer to rows of `df`.
    kept_positions = [
        position
        for position, row_tags in enumerate(df['tags'])
        if row_matches(row_tags)
    ]
    return prepare_complete_dt(df.iloc[kept_positions])
def prepare_complete_dt(complete_dt):
    """Aggregate per-record results into one leaderboard row per method.

    Each output row is built as ``[Method, <mean of every metric_dict column>,
    <first 'source' value of that method>]`` and the frame is labelled with
    ``columns``, so ``columns`` has to list the names in exactly that order.

    Args:
        complete_dt: DataFrame with a ``Method`` column, a ``source`` column
            and one column per value of ``metric_dict``.

    Returns:
        A DataFrame with one row per method, sorted by ``'WordPos Overall'``
        in descending order when that column exists (left unsorted otherwise).
        Methods appear in order of first occurrence before sorting; rows whose
        ``Method`` is missing are skipped.
    """
    metric_columns = list(metric_dict.values())
    rows = []
    # A single pass over the groups instead of re-filtering the whole frame
    # for every method and metric; sort=False keeps first-appearance order.
    for method, method_rows in complete_dt.groupby('Method', sort=False):
        row = [method]
        row.extend(method_rows[column].mean() for column in metric_columns)
        row.append(method_rows['source'].iloc[0])
        rows.append(row)
    overall = pd.DataFrame(rows, columns=columns)
    # Sorting is best-effort: only skip it when the sort column is absent,
    # rather than hiding every possible error.
    if 'WordPos Overall' in overall.columns:
        overall = overall.sort_values(by=['WordPos Overall'], ascending=False)
    return overall
def format_df_for_leaderboard(df):
    """Return a display-ready copy of a leaderboard frame.

    The ``Method`` values are converted to strings, every ``metric_dict``
    column is scaled by 100 and rounded to one decimal, and the ``source``
    column is removed. ``df`` itself is left untouched.

    Args:
        df: DataFrame with ``Method`` and ``source`` columns plus one column
            per key of ``metric_dict``.

    Returns:
        A new DataFrame without the ``source`` column.
    """
    # Work on a copy from the start so the caller's frame (often the shared
    # leaderboard, or a slice of it) is never modified.
    df_copy = df.copy()
    # NOTE(review): the source column is meant to be embedded into the Method
    # cell as a link, but only the method name is emitted here — confirm the
    # intended markup before adding it.
    # A column-wise map also works for an empty frame, where a row-wise
    # apply would hand back a DataFrame instead of a Series.
    df_copy['Method'] = df_copy['Method'].map(str)
    # Convert all float metrics to percentages with 1 decimal
    for metric in metric_dict:
        df_copy[metric] = df_copy[metric].apply(lambda x: float(f'{(100*x):.1f}'))
    # drop the source column
    return df_copy.drop(columns=['source'])
# Downloading at startup is switched off: ret_code is fixed to 0 and the local
# copy of the leaderboard is used. To fetch a fresh copy instead, use
#     ret_code = download_data()
ret_code = 0
if ret_code != 0:
    print("Leaderboard Download failed")

# The leaderboard file holds one JSON record per line.
complete_dt = pd.read_json('leaderboard.jsonl', orient='records', lines=True)
# Aggregated table that backs the leaderboard views.
DATA_OVERALL = prepare_complete_dt(complete_dt)
with gr.Blocks() as demo:
    # Page header. gr.HTML does not interpret Markdown, so the title and the
    # bullet points are written as real HTML elements.
    demo_content = """
<h1>GEO-Bench Leaderboard</h1>
<ul>
<li>For benchmarking content optimization Methods for Generative Engines.</li>
<li>GEO-Bench evaluates Methods for optimizing website content to improve visibility in generative engine responses. Benchmark contains 10K queries across 9 datasets covering diverse domains and intents.</li>
<li>Refer to GEO paper for more details</li>
</ul>
"""
    gr.HTML(demo_content)

    # One type per displayed column: the Method cell, then one number per
    # metric. format_df_for_leaderboard drops the 'source' column, so it gets
    # no entry here.
    leaderboard_datatypes = ["markdown"] + ["number"] * (len(DATA_OVERALL.columns) - 2)

    def _parse_queries(raw_query):
        """Split a comma-separated search string into stripped search terms.

        Empty terms (e.g. from a trailing comma) are dropped because an empty
        term matches every row. When nothing is left, a single empty term is
        returned so that a cleared search box shows the full table again.
        """
        terms = [term.strip() for term in raw_query.split(',')]
        return [term for term in terms if term] or ['']

    with gr.Tabs():
        with gr.TabItem('Overall 📊'):
            with gr.Row():
                gr.Markdown('## Overall Leaderboard')
            with gr.Row():
                data_overall = gr.components.Dataframe(
                    format_df_for_leaderboard(DATA_OVERALL),
                    datatype=leaderboard_datatypes,
                    type="pandas",
                    wrap=True,
                    interactive=False,
                )
            with gr.Row():
                search_bar = gr.Textbox(
                    placeholder=" 🔍 Search for your Method (separate multiple queries with `,`) and press ENTER...",
                    show_label=False,
                    elem_id="search-bar",
                )

            def search_button_click(query):
                """Search the overall leaderboard and return the display table."""
                filtered_data = search_leaderboard(DATA_OVERALL, _parse_queries(query))
                return format_df_for_leaderboard(filtered_data)

        with gr.TabItem('Tag-Wise Results 📊'):
            with gr.Row():
                gr.Markdown("""
## Tag-Wise Results
- The following table shows the results for each tag.
- The tags are sorted in the order of their performance.
- The table is sorted in the order of the overall score.
""")
            with gr.Row():
                # NOTE(review): this reuses the elem_id of the search box on
                # the Overall tab — confirm nothing relies on a unique id.
                search_bar_tag = gr.Textbox(
                    placeholder=" 🔍 Search for your Method (separate multiple queries with `,`) and press ENTER...",
                    show_label=False,
                    elem_id="search-bar",
                )

            # Maps each tag category to its checkbox group.
            boxes = dict()

            def _add_tag_filter(tag_name):
                """Create the checkbox group for one tag category, all options selected."""
                with gr.Box(elem_id="box-filter"):
                    boxes[tag_name] = gr.CheckboxGroup(
                        label=tag_name,
                        choices=tags[tag_name],
                        value=tags[tag_name],
                        interactive=True,
                        elem_id=f"filter-{tag_name}",
                    )

            tag_names = list(tags)
            # Layout: the first three categories in the left column, the ones
            # after the fourth in the right column, and the fourth category in
            # a row of its own below them.
            with gr.Row():
                with gr.Column(min_width=320):
                    for tag in tag_names[:3]:
                        _add_tag_filter(tag)
                with gr.Column(min_width=320):
                    for tag in tag_names[4:]:
                        _add_tag_filter(tag)
            with gr.Row():
                _add_tag_filter(tag_names[3])
            with gr.Row():
                data_tag_wise = gr.components.Dataframe(
                    format_df_for_leaderboard(DATA_OVERALL),
                    datatype=leaderboard_datatypes,
                    type="pandas",
                    wrap=True,
                    interactive=False,
                )

            def filter_tag_click(*selections):
                """Rebuild the tag-wise table from the current checkbox selections."""
                return format_df_for_leaderboard(filter_tags(complete_dt, list(selections)))

            def search_tag_click(query, *selections):
                """Apply the checkbox selections, then the method-name search."""
                return format_df_for_leaderboard(
                    search_tags_leaderboard(complete_dt, list(selections), _parse_queries(query))
                )

            # Every checkbox group refreshes the table, and each refresh needs
            # the state of all groups, not only the one that changed.
            filter_inputs = list(boxes.values())
            for checkbox_group in filter_inputs:
                checkbox_group.change(fn=filter_tag_click, inputs=filter_inputs, outputs=data_tag_wise)
            search_bar_tag.submit(fn=search_tag_click, inputs=[search_bar_tag] + filter_inputs, outputs=data_tag_wise)

        with gr.TabItem('About GEO-bench 📖'):
            with gr.Row():
                # Links carry an explicit scheme; without it the browser
                # resolves them relative to the app's own address.
                gr.Markdown("""
## About GEO-bench
- GEO-bench is a benchmarking platform for content optimization Methods for generative engines.
- It is a part of the work released under [GEO](https://arxiv.org/abs/2311.09735)
- The benchmark comprises of 9 datasets, 7 of which were publicly available, while 2 have been released by us.
- Dataset can be downloaded from [here](https://huggingface.co/datasets/GEO-optim/geo-bench)""")
            with gr.Row():
                # Key highlights as an HTML list (gr.HTML does not render
                # Markdown bullets).
                gr.HTML("""
<h3>Key-Highlights of GEO-bench</h3>
<ul>
<li>Goal of benchmarking content optimization for generative engines</li>
<li>Contains 10K carefully curated queries</li>
<li>Queries are diverse and cover many domains/intents</li>
<li>Annotated with tags/dimensions like domain, difficulty, etc.</li>
</ul>
""")
            # TODO: Add a benchmark link, info about tags and other statistics

        with gr.TabItem('Submit 📝'):
            with gr.Row():
                gr.Markdown("""
## Submit
- To submit your Method, please check [here](https://github.com/Pranjal2041/GEO/GEO-Bench/leaderboard/Readme.md)""")
            # TODO: Create a form to submit, the response should be sent to a google form

    search_bar.submit(fn=search_button_click, inputs=search_bar, outputs=data_overall)
if __name__ == "__main__":
    # Launch the app only when this file is run as a script; share=True is
    # forwarded to Gradio's launch().
    demo.launch(share=True)