alvanlii's picture
Update app.py
f466394 verified
raw
history blame
3.77 kB
import os
from pathlib import Path
import gradio as gr
from huggingface_hub import WebhookPayload, WebhooksServer
from utilities.my_logger import setup_logger
from utilities.visualize_logs import log_file_to_html_string
# Directory that contains this file; used to locate bundled assets (e.g. the
# diagram under media/). `__file__` is the module's path, whereas `__name__`
# is only its name ("__main__" when run as a script), which would always
# resolve to the current working directory instead.
proj_dir = Path(__file__).parent

# Required settings: a missing variable fails fast with KeyError at import.
SUBREDDIT = os.environ["SUBREDDIT"]
USERNAME = os.environ["USERNAME"]
DATASET_NAME = f"{USERNAME}/reddit-{SUBREDDIT}"

# Update cadence, compared case-insensitively; only the two values below are accepted.
FREQUENCY = os.environ.get("FREQUENCY", '').lower()
if FREQUENCY not in ("daily", "hourly"):
    raise gr.Error("FREQUENCY environment variable must be 'daily' or 'hourly'")

# NOTE(review): falls back to the literal 'secret' when HF_WEBHOOK_SECRET is
# unset, which is guessable -- confirm the variable is always set in deployment.
WEBHOOK_SECRET = os.getenv("HF_WEBHOOK_SECRET", 'secret')

logger = setup_logger(__name__)
# Markdown shown at the top of the "Application" tab. Interpolates DATASET_NAME
# and SUBREDDIT to link to the target dataset and the source subreddit.
intro_md = f"""
# Reddit Dataset Creator
This is a reddit dataset creator which builds and updates [{DATASET_NAME}](https://huggingface.co/datasets/{DATASET_NAME})
which pulls from [/r/{SUBREDDIT}](http://www.reddit.com/r/{SUBREDDIT}). Check the dataset for more details.
As shown in the below diagram this space pulls data from reddit via [PRAW](https://praw.readthedocs.io/en/stable/), processes it, and puts it in a corresponding dataset.
"""
# Markdown for the "How to Create?" tab. A plain string (not an f-string):
# the text has no placeholders, so there is nothing to interpolate.
how_to_md = """
# How to make your own space and dataset
1. Create a [reddit application](https://www.reddit.com/prefs/apps), use 'Script for personal use'
- Redirect URI can be anything, I use 'http://www.example.com/unused/redirect/uri'
- You need the `secret` and the `Client ID` from the reddit application.
- `REDDIT_USER_AGENT` can be any descriptive string, probably any undescriptive string too.
2. Get your writable [huggingface token](https://huggingface.co/settings/tokens)
3. <a class="duplicate-button" style="display:inline-block" target="_blank" href="https://huggingface.co/spaces/derek-thomas/dataset-creator-reddit-amitheasshole?duplicate=true"><img style="margin-top:0;margin-bottom:0" src="https://huggingface.co/datasets/huggingface/badges/raw/main/duplicate-this-space-sm-dark.svg" alt="Duplicate Space"></a>
and fill in the information
"""
# Markdown for the "How does it work?" tab. A plain string (not an f-string):
# the text has no placeholders, so there is nothing to interpolate.
how_does_it_work_md = """
# Core Components
There are 2 core components [main](main.py) and [app](app.py).
Main does a few things:
- Pulls from a datasource
- Updates a dataset on the hub
- Updates the README of the dataset
- Writes a local log file (inaccessible outside the spaces container)
App
- Visualizes the log file from Main
# Running it
This uses a docker space so that I can execute supervisor. Supervisor allows me to kick off 2 processes and manage the
log files. I use gradio for `app` and map that to the open port of huggingface spaces.
The only communication between `app` and `main` is the log file.
"""
# JavaScript handed to gr.Blocks via its `js` argument. If the page URL's
# `__theme` query parameter is not 'dark', it sets it to 'dark' and navigates
# to the updated URL.
js_func = """
function refresh() {
const url = new URL(window.location);
if (url.searchParams.get('__theme') !== 'dark') {
url.searchParams.set('__theme', 'dark');
window.location.href = url.href;
}
}
"""
# Build the three-tab Gradio interface; `js_func` is passed to Blocks as `js`.
with gr.Blocks(js=js_func) as ui:
    # Tab 1: introduction text, the architecture diagram, and the log view.
    with gr.Tab(label="Application"):
        gr.Markdown(value=intro_md)
        gr.Image(
            value=str(proj_dir.joinpath('media', 'reddit_scraper.drawio.png')),
            type='filepath',
        )
        gr.Markdown(value="# Logs")
        # The HTML value is a callable; `every=1` is forwarded to Gradio so
        # the rendered log is refreshed on that interval.
        output = gr.HTML(value=log_file_to_html_string, every=1)

    # Tab 2: instructions for duplicating the space.
    with gr.Tab(label="How to Create?"):
        gr.Markdown(value=how_to_md)

    # Tab 3: description of the main/app split.
    with gr.Tab(label="How does it work?"):
        gr.Markdown(value=how_does_it_work_md)
# Wrap the queued Gradio UI in a webhooks server that uses WEBHOOK_SECRET.
app = WebhooksServer(
    ui=ui.queue(),
    webhook_secret=WEBHOOK_SECRET,
)
@app.add_webhook("/dataset_repo")
async def community(payload: WebhookPayload):
    """Log webhook events whose scope starts with ``"repo"``.

    Registered on the ``/dataset_repo`` route. Events with any other scope
    are ignored; nothing is returned either way.
    """
    event_scope = payload.event.scope
    if not event_scope.startswith("repo"):
        return
    logger.info(f"Webhook received from {payload.repo.name} indicating a repo {payload.event.action}")
if __name__ == '__main__':
    # Launch through `app` (the WebhooksServer wrapping `ui`) rather than
    # calling ui.queue().launch(...) directly with these same options.
    launch_options = dict(
        server_name="0.0.0.0",
        show_error=True,
        server_port=7860,
        share=False,
    )
    app.launch(**launch_options)