# Provenance (copied from the Hugging Face Space file view):
#   author: derek-thomas (HF staff)
#   commit: 9a66c2f "Fixing tabs"
#   size: 4.46 kB; virus scan: clean
import os
from collections import deque
from pathlib import Path

import gradio as gr
from bs4 import BeautifulSoup
from rich.console import Console
from rich.syntax import Syntax
# Directory that contains this script. `__file__` is the script's path, whereas
# `__name__` is only the module name ("__main__" when run directly), whose
# `.parent` is always "." and therefore depends on the current working directory.
proj_dir = Path(__file__).resolve().parent

# Required settings: a missing variable raises KeyError as soon as the module loads.
subreddit = os.environ["SUBREDDIT"]
username = os.environ["USERNAME"]
dataset_name = f"{username}/dataset-creator-reddit-{subreddit}"

# Update cadence; lower-cased so the check below is case-insensitive.
frequency = os.environ.get("FREQUENCY", "").lower()
if frequency not in ["daily", "hourly"]:
    raise gr.Error("FREQUENCY environment variable must be 'daily' or 'hourly'")
def log_file_to_html_string():
    """Render the tail of the log file as a scrollable, syntax-highlighted HTML page.

    The last ``num_lines_visualize`` lines of ``mylog.log`` are highlighted with
    rich (Python lexer, Monokai theme) and exported to HTML. The exported markup
    is then adjusted with BeautifulSoup: the ``<pre>`` element loses its inline
    style and gets the ``scrollable`` class, and matching CSS rules are appended
    to the document's ``<style>`` element.

    Returns:
        str: The prettified HTML document. When the log file does not exist,
        an empty log is rendered instead of raising ``FileNotFoundError``.
    """
    log_file = "mylog.log"
    num_lines_visualize = 50

    console = Console(record=True, width=150, style="#272822")

    try:
        with open(log_file, "rt") as f:
            # A bounded deque keeps only the newest lines while streaming the
            # file, so the full log is never held in memory at once.
            lines = deque(f, maxlen=num_lines_visualize)
    except FileNotFoundError:
        # The log is produced by a separate process and may not exist yet.
        lines = ()

    # Highlight the retained lines using the Python lexer and Monokai style.
    output = "".join(lines)
    syntax = Syntax(output, "python", theme="monokai", word_wrap=True)
    console.print(syntax)
    html_content = console.export_html(inline_styles=True)

    # Parse the exported HTML so the markup can be adjusted.
    soup = BeautifulSoup(html_content, 'lxml')

    # Swap the <pre> tag's inline style for the .scrollable class defined below.
    pre_tag = soup.pre
    pre_tag['class'] = 'scrollable'
    del pre_tag['style']

    # Append the background colour and the .scrollable rules to the <style> tag.
    style_tag = soup.style
    style_content = """
pre, code {
background-color: #272822;
}
.scrollable {
font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace;
height: 500px;
overflow: auto;
}
"""
    style_tag.append(style_content)

    return soup.prettify()
# Markdown rendered first in the "Application" tab. The f-string interpolates
# the module-level `dataset_name` and `subreddit` values into the link targets.
intro_md = f"""
# Reddit Dataset Creator
This is a reddit dataset creator which builds and updates [{dataset_name}](https://huggingface.co/datasets/{dataset_name})
which pulls from [/r/{subreddit}](http://www.reddit.com/r/{subreddit}). Check the dataset for more details.
As shown in the below diagram this space pulls data from reddit via [PRAW](https://praw.readthedocs.io/en/stable/), processes it, and puts it in a corresponding dataset.
"""
# Static Markdown for the "How to Create?" tab. It contains no placeholders,
# so it is a plain string literal rather than an f-string.
how_to_md = """
# How to make your own space and dataset
1. Create a [reddit application](https://www.reddit.com/prefs/apps), use 'Script for personal use'
- Redirect URI can be anything, I use 'http://www.example.com/unused/redirect/uri'
- You need the `secret` and the `Client ID` from the reddit application.
- `REDDIT_USER_AGENT` can be any descriptive string, probably any undescriptive string too.
2. Get your writable [huggingface token](https://huggingface.co/settings/tokens)
3. <a class="duplicate-button" style="display:inline-block" target="_blank" href="https://huggingface.co/spaces/derek-thomas/dataset-creator-reddit-amitheasshole?duplicate=true"><img style="margin-top:0;margin-bottom:0" src="https://huggingface.co/datasets/huggingface/badges/raw/main/duplicate-this-space-sm-dark.svg" alt="Duplicate Space"></a>
and fill in the information
"""
# Static Markdown for the "How does it work?" tab. It contains no placeholders,
# so it is a plain string literal rather than an f-string.
how_does_it_work_md = """
# Core Components
There are 2 core components [main](main.py) and [app](app.py).
Main does a few things:
- Pulls from a datasource
- Updates a dataset on the hub
- Updates the README of the dataset
- Writes a local log file (inaccessible outside the spaces container)
App
- Visualizes the log file from Main
# Running it
This uses a docker space so that I can execute supervisor. Supervisor allows me to kick off 2 processes and manage the
log files. I use gradio for `app` and map that to the open port of huggingface spaces.
The only communication between `app` and `main` is the log file.
"""
# Build the Gradio UI. Components are created in display order inside three tabs:
# the live application view, a setup guide, and an architecture overview.
with gr.Blocks() as demo:
    with gr.Tab("Application"):
        gr.Markdown(intro_md)
        # Diagram shipped alongside the app under media/.
        gr.Image(proj_dir / 'media' / 'reddit_scraper.drawio.png')
        gr.Markdown("# Logs")
        # The function itself (not its result) is passed as the value.
        # NOTE(review): `every=1` is presumably Gradio's periodic re-evaluation
        # interval in seconds — confirm against the installed Gradio version.
        output = gr.HTML(log_file_to_html_string, every=1)
        # On page load, run a client-side snippet only (no Python callback).
        # NOTE(review): `classList.toggle('dark')` flips the class rather than
        # forcing it on — confirm this is intended for users already in dark mode.
        demo.load(None,
                  _js="""
() => {
document.body.classList.toggle('dark');
document.querySelector('gradio-app').style.backgroundColor = 'var(--color-background-primary)'
}
""", )
    with gr.Tab("How to Create?"):
        gr.Markdown(how_to_md)
    with gr.Tab("How does it work?"):
        gr.Markdown(how_does_it_work_md)
# Script entry point: turn on the queue, then serve the app on every
# network interface at port 7860 with errors surfaced in the UI.
if __name__ == "__main__":
    queued_demo = demo.queue()
    queued_demo.launch(
        server_name="0.0.0.0",
        show_error=True,
        server_port=7860,
    )