derek-thomas's picture
derek-thomas HF staff
Adding check to only operate on main
d0819c0
raw
history blame
5.4 kB
import os
from pathlib import Path
from fastapi import BackgroundTasks, Response, status
import gradio as gr
from huggingface_hub import WebhookPayload, WebhooksServer
from src.my_logger import setup_logger
from src.utilities import load_datasets, merge_and_update_datasets
from src.visualize_logs import log_file_to_html_string
from src.build_nomic import build_nomic
from src.readme_update import update_dataset_readme
# Directory that contains this file. `__file__` is the module's path on disk;
# `Path(__name__)` would only wrap the module *name* (e.g. "__main__"), whose
# parent is always ".".
proj_dir = Path(__file__).parent

logger = setup_logger(__name__)
logger.info("Starting Application...")

# Required configuration: a missing variable raises KeyError at import time.
SUBREDDIT = os.environ["SUBREDDIT"]
USERNAME = os.environ["USERNAME"]
# Presumably the repo id of the raw (unprocessed) dataset; not read in this file.
OG_DATASET = f"{USERNAME}/dataset-creator-reddit-{SUBREDDIT}"
# Repo id the processed dataset is pushed to by the webhook task.
PROCESSED_DATASET = os.environ['PROCESSED_DATASET']
# HF_TOKEN = os.environ["HF_TOKEN"]

# NOTE(review): when HF_WEBHOOK_SECRET is unset this falls back to the literal
# 'secret', which is guessable. Set the variable in any real deployment.
WEBHOOK_SECRET = os.getenv("HF_WEBHOOK_SECRET", 'secret')
# Markdown rendered at the top of the "Application" tab, above the embedded map.
intro_md = """
# Processing BORU
This is a space to visually search the subreddit [/r/bestofredditorupdates](https://www.reddit.com/r/BestofRedditorUpdates/).
Have you ever been curious to search for stories that are similar to one of your favorites? This can help!
- Each dot represents a post (try clicking on one)
- Closer dots are similar in topic
- Use the filters on the left to help you narrow down what you are looking for
- The lasso can help you search in a smaller range that you drag with your mouse
- The filter can help you narrow by field,
- Filtering posts that are `CONCLUDED`
- Filtering popular posts
- Filtering by date
- The search can help you look by keyword
Check out the original on [Nomic](https://atlas.nomic.ai/data/derek2/boru-subreddit-neural-search/map)
"""
# Markdown rendered in the "Details" tab: how the data pipeline behind this space works.
# The two sub-bullets are indented so they nest under step 2 of the numbered list.
details_md = """
# Details
## Creation Details
1. This space is triggered by a webhook for changes on [reddit-tools-HF/dataset-creator-reddit-bestofredditorupdates](https://huggingface.co/datasets/reddit-tools-HF/dataset-creator-reddit-bestofredditorupdates).
2. It then takes the updates from that dataset and gets embeddings by leveraging [reddit-tools-HF/nomic-embeddings](https://huggingface.co/spaces/reddit-tools-HF/nomic-embeddings)
    - [reddit-tools-HF/nomic-embeddings](https://huggingface.co/spaces/reddit-tools-HF/nomic-embeddings) is using [zero-spaces](https://huggingface.co/zero-gpu-explorers), a free GPU service, to run the model [nomic-ai/nomic-embed-text-v1.5](https://huggingface.co/nomic-ai/nomic-embed-text-v1.5)
    - I'm calling this via [gradio_client](https://www.gradio.app/docs/client), which allows any space to be used as an API
3. The calculated embeddings are stored in this dataset [reddit-tools-HF/reddit-bestofredditorupdates-processed](https://huggingface.co/datasets/reddit-tools-HF/reddit-bestofredditorupdates-processed)
4. These get visualized by [nomic atlas](https://docs.nomic.ai/atlas/introduction/quick-start). You can see how I process it in [build_nomic.py](https://huggingface.co/spaces/reddit-tools-HF/processing-bestofredditorupdates/blob/main/src/build_nomic.py)
"""
# Public Nomic Atlas map that is embedded in the "Application" tab.
url = "https://atlas.nomic.ai/data/derek2/boru-subreddit-neural-search/map"
# The src attribute is quoted and the element is explicitly closed: <iframe>
# requires an end tag, and an unquoted attribute breaks as soon as the URL
# contains a space or quote character.
html_str = (
    f'<iframe src="{url}" style="border:none;height:1024px;width:100%" '
    'allow="clipboard-read; clipboard-write" title="Nomic Atlas"></iframe>'
)
# Three-tab layout: the embedded map, a view of the log file, and project notes.
# Components are created in display order inside each tab.
with gr.Blocks() as ui:
    with gr.Tab(label="Application"):
        gr.Markdown(value=intro_md)
        gr.HTML(value=html_str)

    with gr.Tab(label="Logs"):
        gr.Markdown(value="# Logs")
        # A callable value combined with every=1 lets Gradio refresh the
        # rendered log on a one-second interval.
        output = gr.HTML(value=log_file_to_html_string, every=1)

    with gr.Tab(label="Details"):
        gr.Markdown(value=details_md)

# Wrap the queued UI in a webhook server so the same process serves both the
# interface and the webhook endpoint registered below.
app = WebhooksServer(
    ui=ui.queue(),
    webhook_secret=WEBHOOK_SECRET,
)
@app.add_webhook("/dataset_repo")
async def community(payload: WebhookPayload, task_queue: BackgroundTasks):
    """Schedule dataset processing when the watched repo changes on ``main``.

    Args:
        payload: Webhook payload delivered to ``/dataset_repo``.
        task_queue: FastAPI background-task queue used to run the processing
            after the response has been sent.

    Returns:
        A 200 response with "No task scheduled" when the event is not
        repo-scoped or does not touch ``refs/heads/main``; otherwise a 202
        response with "Task scheduled." after queuing ``_process_webhook``.
    """
    if not payload.event.scope.startswith("repo"):
        return Response("No task scheduled", status_code=status.HTTP_200_OK)

    # Only run if change is on main branch. The list of updated refs may be
    # absent, None or empty; treat all of those as "nothing on main" instead of
    # relying on a bare `except`. Every ref is checked, not just the first one.
    updated_refs = getattr(payload, "updatedRefs", None) or ()
    touches_main = any(
        getattr(updated, "ref", None) == 'refs/heads/main' for updated in updated_refs
    )
    if not touches_main:
        return Response("No task scheduled", status_code=status.HTTP_200_OK)

    logger.info(f"Webhook received from {payload.repo.name} indicating a repo {payload.event.action}")
    task_queue.add_task(_process_webhook, payload=payload)
    return Response("Task scheduled.", status_code=status.HTTP_202_ACCEPTED)
def _process_webhook(payload: WebhookPayload):
    """Rebuild the processed dataset, its README and the Nomic map.

    Steps, in order: load the datasets, merge/update rows, push the result to
    ``PROCESSED_DATASET`` on the Hugging Face Hub, update that dataset's README
    with the number of updated rows, then rebuild the Nomic map.

    Args:
        payload: The webhook payload that triggered the run. It is not read
            here; the parameter exists because the task is queued with
            ``payload=payload``.

    Raises:
        Exception: Any error from a pipeline step is logged with its traceback
            and then re-raised unchanged.
    """
    try:
        logger.info("Loading new dataset...")
        dataset, original_dataset = load_datasets()
        logger.info("Loaded new dataset")

        logger.info("Merging and Updating rows...")
        dataset, updated_row_count = merge_and_update_datasets(dataset, original_dataset)
        logger.info("Merged and Updated rows")

        # Push the augmented dataset to the Hugging Face hub
        logger.info("Pushing processed data to the Hugging Face Hub...")
        dataset.push_to_hub(PROCESSED_DATASET)
        logger.info("Pushed processed data to the Hugging Face Hub")

        update_dataset_readme(dataset_name=PROCESSED_DATASET, subreddit=SUBREDDIT, new_rows=updated_row_count)
        logger.info("Updated README.")

        # Build Nomic
        logger.info("Building Nomic...")
        build_nomic(dataset=dataset)
        logger.info("Built Nomic")

        logger.info("Update from webhook completed!")
    except Exception:
        # This runs as a background task after the HTTP response was sent, so
        # the caller never sees a failure. Record it in the application log
        # before propagating.
        logger.exception("Update from webhook failed")
        raise
if __name__ == "__main__":
    # Listen on all interfaces on port 7860 when run as a script.
    launch_options = {
        "server_name": "0.0.0.0",
        "show_error": True,
        "server_port": 7860,
    }
    app.launch(**launch_options)
    # Previous UI-only launch, kept for reference:
    # ui.queue().launch(server_name="0.0.0.0", show_error=True, server_port=7860)