derek-thomas's picture
derek-thomas HF staff
Adding nomic embedded
aeb2044
raw
history blame
3.42 kB
import os
from pathlib import Path
import gradio as gr
from huggingface_hub import WebhookPayload, WebhooksServer
from src.my_logger import setup_logger
from src.utilities import load_datasets, merge_and_update_datasets
from src.visualize_logs import log_file_to_html_string
from src.build_nomic import build_nomic
# Directory that contains this script. It is derived from __file__ because
# __name__ holds the module's import name (e.g. "__main__"), not a filesystem
# path, so Path(__name__).parent would always collapse to ".".
proj_dir = Path(__file__).resolve().parent
logger = setup_logger(__name__)

# Required settings: a missing variable raises KeyError while the module loads.
SUBREDDIT = os.environ["SUBREDDIT"]
USERNAME = os.environ["USERNAME"]
# Name of the source dataset repo, built from the user and subreddit.
OG_DATASET = f"{USERNAME}/dataset-creator-reddit-{SUBREDDIT}"
PROCESSED_DATASET = os.environ["PROCESSED_DATASET"]
HUGGINGFACE_AUTH_TOKEN = os.environ["HUGGINGFACE_AUTH_TOKEN"]

# Optional setting: falls back to the literal 'secret' when HF_WEBHOOK_SECRET
# is not set.
# NOTE(review): that fallback is a guessable value - confirm every deployment
# sets HF_WEBHOOK_SECRET explicitly.
WEBHOOK_SECRET = os.getenv("HF_WEBHOOK_SECRET", 'secret')
# Markdown text rendered at the top of the "Application" tab; it links the
# source dataset that triggers the webhook and the processed dataset.
intro_md = """
# Processing BORU
This space is triggered by a webhook for changes on
[derek-thomas/dataset-creator-reddit-bestofredditorupdates](https://huggingface.co/datasets/derek-thomas/dataset-creator-reddit-bestofredditorupdates).
It then takes the updates from that dataset and get embeddings and puts the results in
[https://huggingface.co/datasets/derek-thomas/reddit-bestofredditorupdates-processed](https://huggingface.co/datasets/derek-thomas/reddit-bestofredditorupdates-processed)
"""
# HTML document rendered in the "Application" tab: a styled, centered container
# holding an iframe that points at a Nomic Atlas map URL.
# NOTE(review): the <title> reads "conll2003", which looks unrelated to this
# project - confirm whether it is a leftover from a template.
html_str = """
<html>
<head>
<title>conll2003</title>
<style>
body {
font-family: Arial, sans-serif;
background-color: #f0f0f0;
display: flex;
justify-content: center;
align-items: center;
height: 100vh;
margin: 0;
padding: 0;
color: #333;
}
.iframe-container {
border: 1px solid #ccc;
border-radius: 10px;
overflow: hidden;
width: 80%;
height: 80%;
box-shadow: 0 0 10px rgba(0, 0, 0, 0.1);
}
iframe {
width: 100%;
height: 100%;
border: none;
}
</style>
</head>
<body>
<div class="iframe-container">
<iframe src="https://atlas.nomic.ai/data/derek2/boru-subreddit-neural-search/map/cdd8c890-2fac-4ea6-91f8-e6821203cfcb" allow="clipboard-read; clipboard-write"
title="Nomic Atlas"></iframe>
</div>
</body>
</html>"""
# Two-tab Gradio interface: the project description with the embedded map,
# and a view of the application log.
with gr.Blocks() as ui:
    with gr.Tab("Application"):
        gr.Markdown(intro_md)
        gr.HTML(html_str)

    with gr.Tab("Logs"):
        gr.Markdown("# Logs")
        # The component value is a callable; every=1 asks Gradio to re-run it
        # on a one-second interval so the rendered log stays current.
        output = gr.HTML(log_file_to_html_string, every=1)

# Wrap the queued UI in a webhook server so the same process serves both the
# interface and the webhook endpoints registered on `app`.
queued_ui = ui.queue()
app = WebhooksServer(ui=queued_ui, webhook_secret=WEBHOOK_SECRET)
@app.add_webhook("/dataset_repo")
async def community(payload: WebhookPayload):
    """Rebuild the processed dataset when a repo-scoped webhook event arrives.

    Events whose scope does not start with "repo" are ignored. Otherwise the
    handler loads the datasets, merges and updates them, pushes the result to
    PROCESSED_DATASET on the Hugging Face Hub, and rebuilds the Nomic map.

    Args:
        payload: Webhook payload delivered by the Hugging Face Hub.

    Returns:
        None in every case.

    NOTE(review): none of the calls below are awaited, so each one runs to
    completion inside this coroutine - confirm that blocking the handler for
    the duration of the pipeline is acceptable.
    """
    event = payload.event
    if not event.scope.startswith("repo"):
        return

    logger.info(f"Webhook received from {payload.repo.name} indicating a repo {event.action}")

    logger.info("Loading new dataset...")
    dataset, original_dataset = load_datasets()
    logger.info("Loaded new dataset")

    logger.info("Merging and Updating row...")
    dataset = merge_and_update_datasets(dataset, original_dataset)

    # Publish the augmented dataset to the Hugging Face Hub.
    logger.info("Pushing processed data to the Hugging Face Hub...")
    dataset.push_to_hub(PROCESSED_DATASET, token=HUGGINGFACE_AUTH_TOKEN)
    logger.info("Pushed processed data to the Hugging Face Hub")

    logger.info("Building Nomic...")
    build_nomic(dataset=dataset)
    logger.info("Built Nomic")
if __name__ == "__main__":
    # Start the webhook server (which also serves the UI), listening on all
    # interfaces at port 7860.
    launch_options = {
        "server_name": "0.0.0.0",
        "show_error": True,
        "server_port": 7860,
    }
    app.launch(**launch_options)
    # Alternative kept for reference: launch the UI directly, bypassing `app`.
    # ui.queue().launch(server_name="0.0.0.0", show_error=True, server_port=7860)