File size: 3,420 Bytes
9ae1b66
 
 
 
 
 
 
ba7deb1
9ae1b66
ba7deb1
9ae1b66
 
 
 
 
 
 
ba7deb1
9ae1b66
 
 
 
 
 
 
 
 
 
 
 
aeb2044
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9ae1b66
 
 
aeb2044
 
 
9ae1b66
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ba7deb1
9ae1b66
 
 
ba7deb1
 
 
 
9ae1b66
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
import os
from pathlib import Path

import gradio as gr
from huggingface_hub import WebhookPayload, WebhooksServer

from src.my_logger import setup_logger
from src.utilities import load_datasets, merge_and_update_datasets
from src.visualize_logs import log_file_to_html_string
from src.build_nomic import build_nomic

# Project root resolved from this file's location.
# BUG FIX: Path(__name__) built a path from the *module name* (e.g. "__main__"),
# not from the file's path; __file__ is the correct source for the script location.
proj_dir = Path(__file__).parent

logger = setup_logger(__name__)

# Required configuration read from the environment (raises KeyError if missing).
SUBREDDIT = os.environ["SUBREDDIT"]
USERNAME = os.environ["USERNAME"]
OG_DATASET = f"{USERNAME}/dataset-creator-reddit-{SUBREDDIT}"
PROCESSED_DATASET = os.environ["PROCESSED_DATASET"]
HUGGINGFACE_AUTH_TOKEN = os.environ["HUGGINGFACE_AUTH_TOKEN"]
# Optional: falls back to a placeholder secret for local development.
WEBHOOK_SECRET = os.getenv("HF_WEBHOOK_SECRET", "secret")

# Markdown shown on the "Application" tab explaining what triggers this space.
# (Grammar fix: "get embeddings" -> "gets embeddings".)
intro_md = """
# Processing BORU
This space is triggered by a webhook for changes on 
[derek-thomas/dataset-creator-reddit-bestofredditorupdates](https://huggingface.co/datasets/derek-thomas/dataset-creator-reddit-bestofredditorupdates).
 It then takes the updates from that dataset, gets embeddings, and puts the results in 
[https://huggingface.co/datasets/derek-thomas/reddit-bestofredditorupdates-processed](https://huggingface.co/datasets/derek-thomas/reddit-bestofredditorupdates-processed)
"""

# Full-page HTML that embeds the Nomic Atlas map in a styled, centered iframe.
# BUG FIX: the <title> said "conll2003" — copy-paste residue from another space;
# renamed to match what this page actually shows. No other byte changed.
html_str = """
<html>

<head>
  <title>Nomic Atlas Map</title>
  <style>
    body {
      font-family: Arial, sans-serif;
      background-color: #f0f0f0;
      display: flex;
      justify-content: center;
      align-items: center;
      height: 100vh;
      margin: 0;
      padding: 0;
      color: #333;
    }
    .iframe-container {
      border: 1px solid #ccc;
      border-radius: 10px;
      overflow: hidden;
      width: 80%;
      height: 80%;
      box-shadow: 0 0 10px rgba(0, 0, 0, 0.1);
    }
    iframe {
      width: 100%;
      height: 100%;
      border: none;
    }
  </style>
</head>

<body>
  <div class="iframe-container">
    <iframe src="https://atlas.nomic.ai/data/derek2/boru-subreddit-neural-search/map/cdd8c890-2fac-4ea6-91f8-e6821203cfcb" allow="clipboard-read; clipboard-write"
      title="Nomic Atlas"></iframe>
  </div>
</body>

</html>"""

# Two-tab Gradio UI: the Nomic Atlas embed plus a live-refreshing log view.
with gr.Blocks() as ui:
    with gr.Tab("Application"):
        gr.Markdown(intro_md)
        gr.HTML(html_str)
    with gr.Tab("Logs"):
        gr.Markdown("# Logs")
        # Re-render the log file as HTML every second so logs stream live.
        output = gr.HTML(log_file_to_html_string, every=1)

# Wrap the queued UI in a WebhooksServer so webhook routes can be registered
# on the same app; the secret gates incoming webhook requests.
app = WebhooksServer(ui=ui.queue(), webhook_secret=WEBHOOK_SECRET)


@app.add_webhook("/dataset_repo")
async def community(payload: WebhookPayload):
    """Re-process the source dataset when its Hub repo changes.

    Triggered by a Hugging Face Hub webhook. Ignores any event whose scope
    is not repo-level; otherwise reloads the datasets, merges the updates,
    pushes the processed dataset to the Hub, and rebuilds the Nomic map.

    Args:
        payload: Parsed webhook payload delivered by the Hub.
    """
    # Guard clause: only react to repo-level events (e.g. a push to the repo).
    if not payload.event.scope.startswith("repo"):
        return
    # Lazy %-style args: logging formats only if the record is emitted.
    logger.info("Webhook received from %s indicating a repo %s", payload.repo.name, payload.event.action)

    logger.info("Loading new dataset...")
    dataset, original_dataset = load_datasets()
    logger.info("Loaded new dataset")

    logger.info("Merging and Updating row...")
    dataset = merge_and_update_datasets(dataset, original_dataset)

    # Push the augmented dataset to the Hugging Face hub
    logger.info("Pushing processed data to the Hugging Face Hub...")
    dataset.push_to_hub(PROCESSED_DATASET, token=HUGGINGFACE_AUTH_TOKEN)
    logger.info("Pushed processed data to the Hugging Face Hub")

    logger.info("Building Nomic...")
    build_nomic(dataset=dataset)
    logger.info("Built Nomic")

# Entry point: serve the webhook app (and its UI) on all interfaces, port 7860.
if __name__ == "__main__":
    app.launch(server_name="0.0.0.0", show_error=True, server_port=7860)