"""Gradio app listing the Hub datasets that appear between two report dates.

The report dataset repo holds one ``<date>.json`` file per snapshot under
``data/``, each read here as a JSON collection of dataset IDs. The app diffs
two snapshots and shows the IDs present in the later one but not in the
earlier one, ordered by download count.
"""
# import datetime
import json
import os
from pathlib import Path

import gradio as gr
import huggingface_hub as hfh
# from apscheduler.schedulers.background import BackgroundScheduler


DATASET_ID = "albertvillanova/datasets-report"
DATASET_PATH = "dataset"
DATA_DIR = "data"
DATA_PATH = f"{DATASET_PATH}/{DATA_DIR}"


def pull_dataset_repo(repo_id=DATASET_ID, repo_path=DATASET_PATH):
    """Set up the local copy of the report dataset repo and pull it.

    Args:
        repo_id: Hub dataset repo passed as ``clone_from``.
        repo_path: Local directory used for the working copy.

    Returns:
        The ``huggingface_hub.Repository`` object after ``git_pull()``.
    """
    # The token is optional: ``None`` is forwarded when HUB_TOKEN is unset.
    token = os.environ.get("HUB_TOKEN")
    # NOTE(review): `Repository` / `use_auth_token` look deprecated in newer
    # huggingface_hub releases -- confirm against the pinned version before
    # upgrading.
    repo = hfh.Repository(
        local_dir=repo_path,
        clone_from=repo_id,
        repo_type="dataset",
        use_auth_token=token,
    )
    repo.git_pull()
    return repo


def load_dates():
    """Return the snapshot dates available in ``DATA_PATH``, sorted by file name.

    Only ``*.json`` entries are considered: the filter opens
    ``<date>.json``, so any other entry could never be loaded and must not be
    offered as a choice.
    """
    json_paths = (path for path in Path(DATA_PATH).iterdir() if path.suffix == ".json")
    return [data_path.stem for data_path in sorted(json_paths)]


repo = pull_dataset_repo()
dates = load_dates()
# Materialize the listing once: it is iterated on every button click, and a
# one-shot iterator (as returned by newer huggingface_hub releases) would be
# exhausted after the first click.
datasets = list(hfh.list_datasets())


def _load_ids(date):
    """Read the set of dataset IDs stored in the snapshot file for ``date``."""
    with open(f"{DATA_PATH}/{date}.json", encoding="utf-8") as f:
        return set(json.load(f))


def filter_datasets_by_date(date_from, date_to):
    """Return the datasets listed at ``date_to`` but not at ``date_from``.

    Args:
        date_from: Stem of the earlier snapshot file (as offered by ``dates``).
        date_to: Stem of the later snapshot file.

    Returns:
        The matching entries of the module-level ``datasets`` listing, sorted
        by ``downloads`` in descending order. Every returned entry is
        guaranteed to have a numeric ``downloads`` attribute. An empty list is
        returned when either date is not selected.

    Raises:
        FileNotFoundError: If a snapshot file for a given date does not exist.
    """
    # The dropdowns yield None until the user picks a value; there is nothing
    # to diff in that case.
    if not date_from or not date_to:
        return []
    ids_from = _load_ids(date_from)
    ids_to = _load_ids(date_to)
    ids = ids_to - ids_from
    dss = [ds for ds in datasets if ds.id in ids]
    for ds in dss:
        # Missing or None download counts would break the sort below.
        if getattr(ds, "downloads", None) is None:
            ds.downloads = 0
    dss.sort(key=lambda item: item.downloads, reverse=True)
    return dss


def filter_dataframe(date_from, date_to):
    """Return ``[dataset_id, downloads]`` rows for the table component."""
    dss = filter_datasets_by_date(date_from, date_to)
    return [[ds.id, ds.downloads] for ds in dss]


# Disabled: scheduled job that would store a new snapshot of dataset IDs
# (kept for reference).
# def update_datasets():
#     # Retrieve datasets
#     datasets = hfh.list_datasets()
#     # Save dataset IDs
#     repo = pull_dataset_repo()
#     os.makedirs(DATA_PATH, exist_ok=True)
#     today = datetime.datetime.now(datetime.timezone.utc).date().isoformat()
#     with repo.commit(f"Add {today} data file"):
#         with open(f"data/{today}.json", "w") as f:
#             json.dump([ds.id for ds in sorted(datasets, key=lambda item: item.id)], f)
#
#
# scheduler = BackgroundScheduler()
# scheduler.add_job(update_datasets, trigger="cron", hour=0, minute=1, timezone=datetime.timezone.utc)
# scheduler.start()


with gr.Blocks() as demo:
    with gr.Row():
        date_from = gr.Dropdown(choices=dates, label="Date from")
        date_to = gr.Dropdown(choices=dates, label="Date to")
    submit_btn = gr.Button("Submit")
    outputs = gr.Dataframe(
        headers=["Dataset", "Downloads"],
        datatype=["str", "number"],
        label="Created datasets",
    )
    submit_btn.click(fn=filter_dataframe, inputs=[date_from, date_to], outputs=outputs)
demo.launch()