import os
from pathlib import Path

import gradio as gr
from huggingface_hub import WebhookPayload, WebhooksServer

from utilities.my_logger import setup_logger
from utilities.visualize_logs import log_file_to_html_string

proj_dir = Path(__file__).parent

SUBREDDIT = os.environ["SUBREDDIT"]
USERNAME = os.environ["USERNAME"]
DATASET_NAME = f"{USERNAME}/dataset-creator-reddit-{SUBREDDIT}"
FREQUENCY = os.environ.get("FREQUENCY", '').lower()
if FREQUENCY not in ["daily", "hourly"]:
    raise gr.Error("FREQUENCY environment variable must be 'daily' or 'hourly'")

WEBHOOK_SECRET = os.getenv("HF_WEBHOOK_SECRET", 'secret')

logger = setup_logger(__name__)

intro_md = f"""
# Reddit Dataset Creator
This Space builds and updates [{DATASET_NAME}](https://huggingface.co/datasets/{DATASET_NAME}),
which pulls from [/r/{SUBREDDIT}](http://www.reddit.com/r/{SUBREDDIT}). Check the dataset for more details.

As shown in the diagram below, this Space pulls data from Reddit via [PRAW](https://praw.readthedocs.io/en/stable/),
processes it, and puts it in the corresponding dataset.
"""

how_to_md = f"""
# How to make your own Space and dataset
1. Create a [reddit application](https://www.reddit.com/prefs/apps) and use 'Script for personal use'
    - The redirect URI can be anything; I use 'http://www.example.com/unused/redirect/uri'
    - You need the `secret` and the `Client ID` from the reddit application
    - `REDDIT_USER_AGENT` can be any descriptive string (probably any undescriptive string too)
2. Get a [Hugging Face token](https://huggingface.co/settings/tokens) with write access
3. Duplicate this Space and fill in the information
"""

how_does_it_work_md = f"""
# Core Components
There are two core components: [main](main.py) and [app](app.py).

Main does a few things:
- Pulls from a data source
- Updates a dataset on the Hub
- Updates the README of the dataset
- Writes a local log file (inaccessible outside the Spaces container)

App:
- Visualizes the log file from Main

# Running it
This uses a Docker Space so that I can run supervisor. Supervisor lets me kick off both processes and
manage their log files. I use Gradio for `app` and map it to the open port of the Hugging Face Space.

The only communication between `app` and `main` is the log file.
"""

# Force the dark theme by rewriting the URL query parameter on page load
js_func = """
function refresh() {
    const url = new URL(window.location);
    if (url.searchParams.get('__theme') !== 'dark') {
        url.searchParams.set('__theme', 'dark');
        window.location.href = url.href;
    }
}
"""

with gr.Blocks(js=js_func) as ui:
    with gr.Tab("Application"):
        gr.Markdown(intro_md)
        gr.Image(str(proj_dir / 'media' / 'reddit_scraper.drawio.png'), type='filepath')
        gr.Markdown("# Logs")
        # Re-render the log file from main every second
        output = gr.HTML(log_file_to_html_string, every=1)
    with gr.Tab("How to Create?"):
        gr.Markdown(how_to_md)
    with gr.Tab("How does it work?"):
        gr.Markdown(how_does_it_work_md)

app = WebhooksServer(ui=ui.queue(), webhook_secret=WEBHOOK_SECRET)


@app.add_webhook("/dataset_repo")
async def community(payload: WebhookPayload):
    # Log repo-scoped webhook events from the dataset so they show up in the visualized logs
    if payload.event.scope.startswith("repo"):
        logger.info(f"Webhook received from {payload.repo.name} indicating a repo {payload.event.action}")


if __name__ == '__main__':
    app.launch(server_name="0.0.0.0", show_error=True, server_port=7860, share=False)
    # ui.queue().launch(server_name="0.0.0.0", show_error=True, server_port=7860, share=False)
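# ---------------------------------------------------------------------------
# For orientation only: a minimal, hypothetical sketch of the kind of loop the
# companion `main.py` runs, per the "How does it work?" tab (pull from Reddit
# with PRAW, push to the Hub, log, sleep according to FREQUENCY). The real
# implementation lives in main.py; the REDDIT_CLIENT_ID / REDDIT_CLIENT_SECRET
# variable names below are assumptions, not guaranteed to match it.
#
#     import time
#     import praw
#     from datasets import Dataset
#
#     reddit = praw.Reddit(
#         client_id=os.environ["REDDIT_CLIENT_ID"],          # assumed name
#         client_secret=os.environ["REDDIT_CLIENT_SECRET"],  # assumed name
#         user_agent=os.environ["REDDIT_USER_AGENT"],
#     )
#     while True:
#         # Collect the newest submissions from the configured subreddit
#         rows = [{"title": s.title, "score": s.score, "permalink": s.permalink}
#                 for s in reddit.subreddit(SUBREDDIT).new(limit=100)]
#         # Push the batch to the dataset repo on the Hub
#         Dataset.from_list(rows).push_to_hub(DATASET_NAME)
#         logger.info(f"Pushed {len(rows)} rows to {DATASET_NAME}")
#         time.sleep(3600 if FREQUENCY == "hourly" else 86400)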