|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import gradio as gr |
|
import polars as pl |
|
from huggingface_hub import HfApi |
|
import re |
|
|
|
# Hugging Face organization whose members and datasets this portal manages.
org_name = "hugging-science"
# Shared Hub client used for all API calls (member listing).
api = HfApi()
|
|
|
def fetch_members():
    """Return the usernames of all members of the configured organization."""
    return [m.username for m in api.list_organization_members(org_name)]
|
|
|
# Snapshot of org members taken at startup; refreshed via the "Refresh All" button.
member_list = fetch_members()
|
|
|
|
|
COMBINED_PARQUET_PATH = "datasetcards_new.parquet"
# NOTE: saves are written to the same file that is read at startup, so the
# tracking columns below must survive a restart (see add-if-missing logic).
UPDATED_PARQUET_PATH = "datasetcards_new.parquet"
ROWS_PER_PAGE = 50


df = pl.read_parquet(COMBINED_PARQUET_PATH)

# Add the tracking columns only when absent; when the parquet already carries
# them (i.e. after a save + restart, since UPDATED_PARQUET_PATH is the same
# file) keep the stored values and just backfill nulls. The previous code
# unconditionally overwrote both columns with defaults, wiping saved progress,
# and its later "if col not in df.columns" loop was therefore dead code.
for _col, _default in (("status", "todo"), ("assigned_to", "")):
    if _col not in df.columns:
        df = df.with_columns(pl.lit(_default).alias(_col))
    else:
        df = df.with_columns(pl.col(_col).fill_null(_default))

# Most-downloaded / most-recent / largest datasets first.
df = df.sort(by=["downloads", "last_modified", "usedStorage"], descending=[True, True, True])

# Normalize free-text "reason" values: any variant mentioning
# "short description" collapses to the canonical label; nulls become "".
if "reason" in df.columns:
    df = df.with_columns([
        pl.Series(
            "reason",
            ["short description" if x and "short description" in x.lower() else (x if x is not None else "") for x in df["reason"]]
        )
    ])


# Columns searched via exact-match dropdown; everything else uses substring search.
DROPDOWN_COLUMNS = ["reason", "category", "field", "keyword", "assigned_to", "status"]
STATUS_OPTIONS = ["todo", "inprogress", "PR submitted", "PR merged"]


# Choices offered in the search dropdown, per dropdown-searchable column.
unique_values = {col: sorted(df[col].drop_nulls().unique().to_list()) for col in DROPDOWN_COLUMNS}
unique_values['assigned_to'] = sorted(member_list)
unique_values['status'] = STATUS_OPTIONS
|
|
|
|
|
def get_page(df, page, column=None, query=None):
    """Return one page of `df` (optionally filtered) plus the total page count.

    Args:
        df: polars DataFrame to page through.
        page: zero-based page index.
        column: optional column name to filter on.
        query: exact value for DROPDOWN_COLUMNS; otherwise a case-insensitive
            substring to search for.

    Returns:
        (page_df, total_pages): a pandas DataFrame of up to ROWS_PER_PAGE rows
        (NaNs replaced by "") and the number of pages in the filtered result
        (always at least 1).
    """
    filtered_df = df
    if column and query:
        if column in DROPDOWN_COLUMNS:
            filtered_df = filtered_df.filter(pl.col(column) == query)
        else:
            q = query.lower().strip()
            # Filter on a lowercased *expression* instead of rewriting the
            # column, so the returned page keeps the original casing (the old
            # code lowercased the displayed data). Cast to Utf8 so numeric
            # columns are searchable, and match literally so user input with
            # regex metacharacters (e.g. "(") cannot raise.
            filtered_df = filtered_df.filter(
                pl.col(column).cast(pl.Utf8).str.to_lowercase().str.contains(q, literal=True)
            )
    start = page * ROWS_PER_PAGE
    page_df = filtered_df[start:start + ROWS_PER_PAGE].to_pandas().fillna("")
    total_rows = filtered_df.height
    total_pages = (total_rows - 1) // ROWS_PER_PAGE + 1 if total_rows > 0 else 1
    return page_df, total_pages
|
|
|
# First unfiltered page shown at startup; also fixes the table's column order.
initial_df, total_pages = get_page(df, 0)
columns = list(initial_df.columns)
|
|
|
# --- UI definition: all components live inside this Blocks context. ---
with gr.Blocks() as demo:
    # Static usage instructions shown above the table.
    gr.Markdown("""
# Dataset Insight Portal

Welcome! This portal helps you explore and manage datasets from our Hugging Face organization.

## What is this space for?
This space provides a table of datasets along with metadata. You can:
- Browse datasets with pagination.
- Search datasets by various fields.
- Assign responsibility for reviewing datasets (`assigned_to`).
- Track progress using `status`.

## Why the table?
The table gives a structured view of all datasets, making it easy to sort, filter, and update information for each dataset.

## What does the table contain?
Each row represents a dataset. Columns include:
- **dataset_id**: Unique identifier of the dataset.
- **dataset_url**: Link to the dataset page on Hugging Face.
- **downloads**: Number of downloads.
- **author**: Dataset author.
- **license**: License type.
- **tags**: Tags describing the dataset. Obtained from the dataset card.
- **task_categories**: Categories of tasks the dataset is useful for. Obtained from the dataset card.
- **last_modified**: Date of last update.
- **field, keyword**: Metadata columns describing dataset purpose based on heuristics. Use the `field` and `keyword` to filter for science based datasets.
- **category**: Category of the dataset (`rich` means it is good dataset card. `minimal` means it needs improvement for the reasons below).
- **reason**: Reason why the dataset is classified as `minimal`. Options: `Failed to load card`, `No metadata and no description`, `No metadata and has description`, `Short description`.
- **usedStorage**: Storage used by the dataset (bytes).
- **assigned_to**: Person responsible for the dataset (editable).
- **status**: Progress status (editable). Options: `todo`, `inprogress`, `PR submitted`, `PR merged`.

## How to use search
- Select a **column** from the dropdown.
- If the column is textual, type your query in the text box.
- If the column is a dropdown (like `assigned_to` or `status`), select the value from the dropdown.
- Click **Search** to filter the table.

## How to add or update `assigned_to` and `status`
1. Search for the **dataset_id** initially.
2. Then, select the **dataset_id** from the dropdown below the table.
3. Choose the person responsible in **Assigned To**. If you are a member of the organization, your username should appear in the list. Else refresh and try again.
4. Select the current status in **Status**.
5. Click **Save Changes** to update the table and persist the changes.
6. Use **Refresh All** to reload the table and the latest members list.

This portal makes it easy to keep track of dataset reviews, assignments, and progress all in one place.
""")

    # Pagination controls.
    with gr.Row():
        prev_btn = gr.Button("Previous")
        next_btn = gr.Button("Next")
        page_number = gr.Number(value=0, label="Page", precision=0)
        total_pages_display = gr.Label(value=f"Total Pages: {total_pages}")

    # Read-only view of the current page of datasets.
    data_table = gr.Dataframe(
        value=initial_df,
        headers=columns,
        datatype="str",
        interactive=False,
        row_count=ROWS_PER_PAGE
    )

    # Search controls: search_text and search_dropdown are mutually exclusive;
    # which one is visible depends on the selected column (see update_search_input).
    with gr.Row():
        col_dropdown = gr.Dropdown(choices=columns, label="Column to Search")
        search_text = gr.Textbox(label="Search Text")
        search_dropdown = gr.Dropdown(choices=[], label="Select Value", visible=False)
        search_btn = gr.Button("Search")
        reset_btn = gr.Button("Reset")

    # Editing widgets: pick a dataset from the current page, then set owner/status.
    selected_dataset_id = gr.Dropdown(label="Select dataset_id", choices=initial_df['dataset_id'].tolist())
    assigned_to_input = gr.Dropdown(choices=member_list, label="Assigned To")

    status_input = gr.Dropdown(choices=STATUS_OPTIONS, label="Status", value="todo")

    save_btn = gr.Button("Save Changes")
    refresh_btn = gr.Button("Refresh All")
    save_message = gr.Textbox(label="Save Status", interactive=False)
|
def update_search_input(column): |
|
if column in DROPDOWN_COLUMNS: |
|
return gr.update(choices=unique_values[column], visible=True), gr.update(visible=False) |
|
else: |
|
return gr.update(visible=False), gr.update(visible=True) |
|
|
|
col_dropdown.change(update_search_input, col_dropdown, [search_dropdown, search_text]) |
|
|
|
|
|
def prefill_fields(dataset_id): |
|
if not dataset_id: |
|
return "", "todo" |
|
dataset_id = str(dataset_id) |
|
filtered = [row for row in df.to_dicts() if str(row.get("dataset_id")) == dataset_id] |
|
if not filtered: |
|
return "", "todo" |
|
row = filtered[0] |
|
return row.get("assigned_to", ""), row.get("status", "todo") |
|
|
|
selected_dataset_id.change(prefill_fields, selected_dataset_id, [assigned_to_input, status_input]) |
|
|
|
|
|
def search_func(page, column, txt, ddl): |
|
query = ddl if column in DROPDOWN_COLUMNS else txt |
|
page_df, total_pages = get_page(df, page, column, query) |
|
return page_df, f"Total Pages: {total_pages}", 0, gr.update(choices=page_df['dataset_id'].tolist()) |
|
|
|
|
|
def next_page(page, column, txt, ddl): |
|
page += 1 |
|
query = ddl if column in DROPDOWN_COLUMNS else txt |
|
page_df, total_pages = get_page(df, page, column, query) |
|
if page >= total_pages: |
|
page = total_pages - 1 |
|
page_df, total_pages = get_page(df, page, column, query) |
|
return page_df, f"Total Pages: {total_pages}", page, gr.update(choices=page_df['dataset_id'].tolist()) |
|
|
|
def prev_page(page, column, txt, ddl): |
|
page = max(0, page - 1) |
|
query = ddl if column in DROPDOWN_COLUMNS else txt |
|
page_df, total_pages = get_page(df, page, column, query) |
|
return page_df, f"Total Pages: {total_pages}", page, gr.update(choices=page_df['dataset_id'].tolist()) |
|
|
|
def reset_func(): |
|
page_df, total_pages = get_page(df, 0) |
|
return page_df, f"Total Pages: {total_pages}", 0, gr.update(choices=page_df['dataset_id'].tolist()) |
|
|
|
|
|
def save_changes(dataset_id, assigned_to_val, status_val, page_val, col, txt, ddl): |
|
global df |
|
if not dataset_id: |
|
return gr.update(value="Please select a row first."), None, None, None |
|
df = df.with_columns([ |
|
pl.when(pl.col("dataset_id") == dataset_id).then(pl.lit(assigned_to_val)).otherwise(pl.col("assigned_to")).alias("assigned_to"), |
|
pl.when(pl.col("dataset_id") == dataset_id).then(pl.lit(status_val)).otherwise(pl.col("status")).alias("status") |
|
]) |
|
df.write_parquet(UPDATED_PARQUET_PATH) |
|
page_df, total_pages = get_page(df, page_val, col, txt if col not in DROPDOWN_COLUMNS else ddl) |
|
return ( |
|
gr.update(value=f"Saved changes for dataset_id: {dataset_id}"), |
|
page_df, |
|
gr.update(choices=page_df['dataset_id'].tolist()), |
|
f"Total Pages: {total_pages}" |
|
) |
|
|
|
|
|
def refresh_all(page, column, txt, ddl): |
|
global df, member_list, unique_values |
|
|
|
member_list = fetch_members() |
|
unique_values['assigned_to'] = sorted(member_list) |
|
|
|
try: |
|
df = pl.read_parquet(UPDATED_PARQUET_PATH) |
|
except FileNotFoundError: |
|
pass |
|
page_df, total_pages = get_page(df, page, column, txt if column not in DROPDOWN_COLUMNS else ddl) |
|
return page_df, f"Total Pages: {total_pages}", page, gr.update(choices=page_df['dataset_id'].tolist()), gr.update(choices=member_list) |
|
|
|
|
|
    # Shared wiring: every pager/search handler reads the same four inputs and
    # updates the same four components.
    inputs_search = [page_number, col_dropdown, search_text, search_dropdown]
    outputs_search = [data_table, total_pages_display, page_number, selected_dataset_id]

    search_btn.click(search_func, inputs_search, outputs_search)
    next_btn.click(next_page, inputs_search, outputs_search)
    prev_btn.click(prev_page, inputs_search, outputs_search)
    reset_btn.click(reset_func, [], outputs_search)
    save_btn.click(
        save_changes,
        [selected_dataset_id, assigned_to_input, status_input, page_number, col_dropdown, search_text, search_dropdown],
        [save_message, data_table, selected_dataset_id, total_pages_display]
    )
    # Refresh additionally repopulates the Assigned To dropdown with fresh members.
    refresh_btn.click(
        refresh_all,
        inputs=[page_number, col_dropdown, search_text, search_dropdown],
        outputs=[data_table, total_pages_display, page_number, selected_dataset_id, assigned_to_input]
    )

demo.launch()
|
|