import os
from pathlib import Path

import gradio as gr
from bs4 import BeautifulSoup
from huggingface_hub import WebhookPayload, WebhooksServer
from rich.console import Console
from rich.syntax import Syntax

from utilities.my_logger import setup_logger

proj_dir = Path(__file__).parent

SUBREDDIT = os.environ["SUBREDDIT"]
USERNAME = os.environ["USERNAME"]
DATASET_NAME = f"{USERNAME}/dataset-creator-reddit-{SUBREDDIT}"

FREQUENCY = os.environ.get("FREQUENCY", '').lower()
if FREQUENCY not in ["daily", "hourly"]:
    # Fail fast at startup; a plain ValueError is clearer here than gr.Error,
    # which is only surfaced in the UI when raised inside an event handler
    raise ValueError("FREQUENCY environment variable must be 'daily' or 'hourly'")

WEBHOOK_SECRET = os.getenv("HF_WEBHOOK_SECRET", 'secret')

logger = setup_logger(__name__)


def log_file_to_html_string():
    log_file = "mylog.log"
    num_lines_visualize = 50

    console = Console(record=True, width=150, style="#272822")
    with open(log_file, "rt") as f:
        # Keep only the last `num_lines_visualize` lines of the log
        lines = f.readlines()[-num_lines_visualize:]

        # Syntax-highlight those lines using the Python lexer and Monokai theme
        output = "".join(lines)
        syntax = Syntax(output, "python", theme="monokai", word_wrap=True)

    console.print(syntax)
    html_content = console.export_html(inline_styles=True)

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(html_content, 'lxml')

    # Modify the <pre> tag
    pre_tag = soup.pre
    pre_tag['class'] = 'scrollable'
    del pre_tag['style']

    # Add your custom styles and the .scrollable CSS to the <style> tag
    style_tag = soup.style
    style_content = """
    pre, code {
        background-color: #272822;
    }
    .scrollable {
        font-family: Menlo, 'DejaVu Sans Mono', Consolas, 'Courier New', monospace;
        height: 500px;
        overflow: auto;
    }
    """
    style_tag.append(style_content)

    return soup.prettify()


intro_md = f"""
# Reddit Dataset Creator
This is a Reddit dataset creator that builds and updates [{DATASET_NAME}](https://huggingface.co/datasets/{DATASET_NAME})
by pulling from [/r/{SUBREDDIT}](http://www.reddit.com/r/{SUBREDDIT}). Check the dataset for more details.

As the diagram below shows, this Space pulls data from Reddit via [PRAW](https://praw.readthedocs.io/en/stable/), processes it, and adds it to the corresponding dataset.
"""

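# Illustrative sketch (not called anywhere in this app): roughly how main.py
# might pull posts with PRAW. REDDIT_CLIENT_ID / REDDIT_CLIENT_SECRET /
# REDDIT_USER_AGENT are assumed secret names; the real logic lives in main.py.
def _praw_sketch():
    import praw  # main.py dependency; unused by this app

    reddit = praw.Reddit(
        client_id=os.environ["REDDIT_CLIENT_ID"],
        client_secret=os.environ["REDDIT_CLIENT_SECRET"],
        user_agent=os.environ["REDDIT_USER_AGENT"],
    )
    # Newest submissions from the configured subreddit
    return [post.title for post in reddit.subreddit(SUBREDDIT).new(limit=10)]
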
how_to_md = """
# How to make your own space and dataset
1. Create a [reddit application](https://www.reddit.com/prefs/apps); choose 'script for personal use'
    - The redirect URI can be anything; I use 'http://www.example.com/unused/redirect/uri'
    - You need the `secret` and the `Client ID` from the reddit application
    - `REDDIT_USER_AGENT` can be any descriptive string (probably any undescriptive string too)
2. Get a [huggingface token](https://huggingface.co/settings/tokens) with write access
3. <a class="duplicate-button" style="display:inline-block" target="_blank" href="https://huggingface.co/spaces/derek-thomas/dataset-creator-reddit-amitheasshole?duplicate=true"><img style="margin-top:0;margin-bottom:0" src="https://huggingface.co/datasets/huggingface/badges/raw/main/duplicate-this-space-sm-dark.svg" alt="Duplicate Space"></a>
   and fill in the information
"""

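# The duplicated Space needs roughly these secrets/variables (names inferred
# from this file plus PRAW's usual credentials -- treat the REDDIT_* and
# HF_TOKEN names as assumptions; check main.py for the exact ones):
#   SUBREDDIT             subreddit to pull from
#   USERNAME              your Hugging Face username
#   FREQUENCY             "daily" or "hourly"
#   HF_TOKEN              write-access Hugging Face token (assumed name)
#   REDDIT_CLIENT_ID      Client ID from the reddit application (assumed name)
#   REDDIT_CLIENT_SECRET  secret from the reddit application (assumed name)
#   REDDIT_USER_AGENT     any descriptive string (assumed name)
#   HF_WEBHOOK_SECRET     optional; protects the webhook endpoint below
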
how_does_it_work_md = """
# Core Components
There are 2 core components: [main](main.py) and [app](app.py).

Main does a few things:
- Pulls from a data source
- Updates a dataset on the hub
- Updates the README of the dataset
- Writes a local log file (inaccessible outside the spaces container)

App
- Visualizes the log file from Main

# Running it
This uses a docker space so that I can run supervisor, which lets me kick off both processes and manage their
log files. I use gradio for `app` and map it to the open port of huggingface spaces.

The only communication between `app` and `main` is the log file.
"""

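# A plausible supervisord.conf sketch (illustrative only; the real config ships
# with the repo and may differ):
#
#   [supervisord]
#   nodaemon=true
#
#   [program:main]
#   command=python main.py
#   stdout_logfile=mylog.log    ; the log file that app.py tails
#   redirect_stderr=true
#
#   [program:app]
#   command=python app.py
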
js_func = """
function refresh() {
    const url = new URL(window.location);

    if (url.searchParams.get('__theme') !== 'dark') {
        url.searchParams.set('__theme', 'dark');
        window.location.href = url.href;
    }
}
"""


with gr.Blocks(js=js_func) as ui:
    with gr.Tab("Application"):
        gr.Markdown(intro_md)
        gr.Image(str(proj_dir / 'media' / 'reddit_scraper.drawio.png'), type='filepath')
        gr.Markdown("# Logs")
        output = gr.HTML(log_file_to_html_string, every=1)
    with gr.Tab("How to Create?"):
        gr.Markdown(how_to_md)
    with gr.Tab("How does it work?"):
        gr.Markdown(how_does_it_work_md)

app = WebhooksServer(ui=ui.queue(), webhook_secret=WEBHOOK_SECRET)


@app.add_webhook("/dataset_repo")
async def community(payload: WebhookPayload):
    if payload.event.scope.startswith("repo"):
        logger.info(f"Webhook received from {payload.repo.name} indicating a repo {payload.event.action}")
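

# WebhooksServer serves the handler above at POST /webhooks/dataset_repo and
# checks the X-Webhook-Secret header against WEBHOOK_SECRET. A local smoke test
# might look like this (payload abbreviated and illustrative only):
#
#   curl -X POST http://localhost:7860/webhooks/dataset_repo \
#        -H "Content-Type: application/json" \
#        -H "X-Webhook-Secret: secret" \
#        -d '{"event": {"action": "update", "scope": "repo.content"}, ...}'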


if __name__ == '__main__':
    app.launch(server_name="0.0.0.0", show_error=True, server_port=7860)
    # ui.queue().launch(server_name="0.0.0.0", show_error=True, server_port=7860)