plaggy committed on
Commit
97fdba5
1 Parent(s): 40b241b
Files changed (9)
  1. Dockerfile +16 -0
  2. chunk_config.json +10 -0
  3. embed_config.json +8 -0
  4. home.html +18 -0
  5. requirements.txt +10 -0
  6. src/__init__.py +0 -0
  7. src/main.py +189 -0
  8. src/models.py +51 -0
  9. style.css +28 -0
Dockerfile ADDED
@@ -0,0 +1,16 @@
+ FROM python:3.9
+
+ RUN useradd -m -u 1000 user
+ USER user
+
+ ENV HOME=/home/user \
+     PATH=/home/user/.local/bin:$PATH
+
+ WORKDIR $HOME/app
+
+ COPY --chown=user requirements.txt requirements.txt
+ RUN pip install --no-cache-dir --upgrade -r requirements.txt && python -m spacy download en_core_web_sm
+
+ COPY --chown=user . .
+
+ CMD ["uvicorn", "src.main:app", "--host", "0.0.0.0", "--port", "7860"]
chunk_config.json ADDED
@@ -0,0 +1,10 @@
+ {
+     "input_dataset": "sergeipetrov/transformers-diffusers-docs-raw",
+     "input_splits": ["train"],
+     "input_text_col": "text",
+     "output_dataset": "sergeipetrov/transformers-diffusers-docs-chunked",
+     "strategy": "spacy",
+     "split_seq": "\n\n",
+     "chunk_len": 512,
+     "private": false
+ }
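Note: the three accepted "strategy" values map onto the Chunker class added in src/main.py below. A minimal illustrative sketch of what each one does (not part of the commit; the spaCy variant additionally needs the en_core_web_sm pipeline installed):

text = "First paragraph.\n\nSecond paragraph."

# "sequence": split on the configured split_seq
print(text.split("\n\n"))  # ['First paragraph.', 'Second paragraph.']

# "constant": fixed-size character windows of chunk_len
chunk_len = 16
print([text[i:i + chunk_len] for i in range(0, len(text), chunk_len)])

# "spacy": SpacyTextSplitter().split_text(text) returns sentence-aware chunks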
embed_config.json ADDED
@@ -0,0 +1,8 @@
+ {
+     "input_dataset": "sergeipetrov/transformers-diffusers-docs-chunked",
+     "input_splits": ["train"],
+     "input_text_col": "text",
+     "output_dataset": "sergeipetrov/transformers-diffusers-docs-embed",
+     "private": false,
+     "semaphore_bound": 5
+ }
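Note: semaphore_bound caps the number of in-flight requests against the TEI endpoint (see embed_sent/embed in src/main.py below). A hedged smoke-test sketch for the endpoint, using the same payload shape the app sends; TEI_URL and HF_TOKEN are the env vars the Space itself reads, not values defined in this file:

import os
import requests

# send one chunk through the TEI endpoint, same payload as embed_sent
resp = requests.post(
    os.environ["TEI_URL"],
    headers={"Authorization": f"Bearer {os.environ['HF_TOKEN']}"},
    json={"inputs": "hello world", "truncate": True},
)
resp.raise_for_status()
print(len(resp.json()[0]))  # embedding dimension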
home.html ADDED
@@ -0,0 +1,18 @@
+ <!DOCTYPE html>
+ <html>
+ <head>
+     <meta charset="utf-8" />
+     <meta name="viewport" content="width=device-width" />
+     <title>Auto Re-Train</title>
+     <link rel="stylesheet" href="style.css" />
+ </head>
+ <body>
+     <div class="card">
+         <h1>Auto Re-Train webhook</h1>
+
+         <p>This is a webhook space to auto-retrain a model when a dataset changes.</p>
+
+         <p>Check out the guide <a href="https://huggingface.co/docs/hub/webhooks-guide-auto-retrain" target="_blank">here</a>!</p>
+     </div>
+ </body>
+ </html>
requirements.txt ADDED
@@ -0,0 +1,10 @@
+ fastapi==0.74.*
+ requests==2.27.*
+ huggingface_hub==0.20.*
+ uvicorn[standard]==0.17.*
+ numpy==1.25.*
+ datasets==2.16.*
+ langchain==0.0.*
+ aiohttp==3.8.*
+ spacy==3.*
+ tqdm==4.*
src/__init__.py ADDED
File without changes
src/main.py ADDED
@@ -0,0 +1,189 @@
+ import asyncio
+ import logging
+ import numpy as np
+ import time
+ import json
+ import os
+ import tempfile
+ import requests
+
+ from fastapi import FastAPI, BackgroundTasks
+ from fastapi.responses import FileResponse
+
+ from aiohttp import ClientSession
+ from langchain.text_splitter import SpacyTextSplitter
+ from datasets import Dataset, load_dataset
+ from tqdm import tqdm
+ from tqdm.asyncio import tqdm_asyncio
+
+ from src.models import chunk_config, embed_config, WebhookPayload
+
+
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+ HF_TOKEN = os.getenv("HF_TOKEN")
+ TEI_URL = os.getenv("TEI_URL")
+
+ app = FastAPI()
+
+
+ @app.get("/")
+ async def home():
+     return FileResponse("home.html")
+
+
+ @app.post("/webhook")
+ async def post_webhook(
+     payload: WebhookPayload,
+     task_queue: BackgroundTasks
+ ):
+     if not (
+         payload.event.action == "update"
+         and payload.event.scope.startswith("repo.content")
+         and (
+             payload.repo.name == embed_config.input_dataset
+             # or payload.repo.name == chunk_config.input_dataset
+         )
+         and payload.repo.type == "dataset"
+     ):
+         # no-op
+         logger.info("Update detected, no action taken")
+         return {"processed": False}
+
+     if payload.repo.name == chunk_config.input_dataset:
+         task_queue.add_task(chunk_dataset)
+     elif payload.repo.name == embed_config.input_dataset:
+         task_queue.add_task(embed_dataset)
+
+     return {"processed": True}
+
+
+ """
+ CHUNKING
+ """
+
+ class Chunker:
+     def __init__(self, strategy, split_seq, chunk_len):
+         self.split_seq = split_seq
+         self.chunk_len = chunk_len
+         if strategy == "spacy":
+             self.split = SpacyTextSplitter().split_text
+         if strategy == "sequence":
+             self.split = self.seq_splitter
+         if strategy == "constant":
+             self.split = self.const_splitter
+
+     def seq_splitter(self, text):
+         return text.split(self.split_seq)
+
+     def const_splitter(self, text):
+         return [
+             text[i * self.chunk_len:(i + 1) * self.chunk_len]
+             for i in range(int(np.ceil(len(text) / self.chunk_len)))
+         ]
+
+
+ def chunk_generator(input_dataset, chunker):
+     for i in tqdm(range(len(input_dataset))):
+         chunks = chunker.split(input_dataset[i][chunk_config.input_text_col])
+         for chunk in chunks:
+             if chunk:
+                 yield {chunk_config.input_text_col: chunk}
+
+
+ def chunk_dataset():
+     logger.info("Update detected, chunking is scheduled")
+     # a list split returns a list of datasets; join into one split expression
+     input_ds = load_dataset(chunk_config.input_dataset, split="+".join(chunk_config.input_splits))
+     chunker = Chunker(
+         strategy=chunk_config.strategy,
+         split_seq=chunk_config.split_seq,
+         chunk_len=chunk_config.chunk_len
+     )
+
+     dataset = Dataset.from_generator(
+         chunk_generator,
+         gen_kwargs={
+             "input_dataset": input_ds,
+             "chunker": chunker
+         }
+     )
+
+     dataset.push_to_hub(
+         chunk_config.output_dataset,
+         private=chunk_config.private,
+         token=HF_TOKEN
+     )
+
+     logger.info("Done chunking")
+
+     return {"processed": True}
+
+
+ """
+ EMBEDDING
+ """
+
+ async def embed_sent(sentence, semaphore, tei_url, tmp_file):
+     async with semaphore:
+         payload = {
+             "inputs": sentence,
+             "truncate": True
+         }
+
+         async with ClientSession(
+             headers={
+                 "Content-Type": "application/json",
+                 "Authorization": f"Bearer {HF_TOKEN}"
+             }
+         ) as session:
+             async with session.post(tei_url, json=payload) as resp:
+                 if resp.status != 200:
+                     raise RuntimeError(await resp.text())
+                 result = await resp.json()
+
+                 tmp_file.write(
+                     json.dumps({"vector": result[0], embed_config.input_text_col: sentence}) + "\n"
+                 )
+
+
+ async def embed(input_ds, tei_url, temp_file):
+     semaphore = asyncio.BoundedSemaphore(embed_config.semaphore_bound)
+     jobs = [
+         asyncio.create_task(embed_sent(row[embed_config.input_text_col], semaphore, tei_url, temp_file))
+         for row in input_ds if row[embed_config.input_text_col].strip()
+     ]
+     logger.info(f"num chunks to embed: {len(jobs)}")
+
+     tic = time.time()
+     await tqdm_asyncio.gather(*jobs)
+     logger.info(f"embed time: {time.time() - tic}")
+
+
+ def wake_up_endpoint(url):
+     # poll until the (possibly scaled-to-zero) TEI endpoint responds
+     while requests.get(
+         url=url,
+         headers={"Authorization": f"Bearer {HF_TOKEN}"}
+     ).status_code != 200:
+         time.sleep(2)
+     logger.info("TEI endpoint is up")
+
+
+ def embed_dataset():
+     logger.info("Update detected, embedding is scheduled")
+     wake_up_endpoint(TEI_URL)
+     input_ds = load_dataset(embed_config.input_dataset, split="+".join(embed_config.input_splits))
+     with tempfile.NamedTemporaryFile(mode="a", suffix=".jsonl") as temp_file:
+         asyncio.run(embed(input_ds, TEI_URL, temp_file))
+         temp_file.flush()  # make sure buffered lines are on disk before reading
+
+         dataset = Dataset.from_json(temp_file.name)
+         dataset.push_to_hub(
+             embed_config.output_dataset,
+             private=embed_config.private,
+             token=HF_TOKEN
+         )
+
+     logger.info("Done embedding")
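Note: the /webhook route can be exercised locally with a hand-built request matching the WebhookPayload schema in src/models.py below; a minimal sketch (the repo name mirrors embed_config.json, the headSha is a dummy value):

import requests

payload = {
    "event": {"action": "update", "scope": "repo.content"},
    "repo": {
        "type": "dataset",
        "name": "sergeipetrov/transformers-diffusers-docs-chunked",
        "id": "sergeipetrov/transformers-diffusers-docs-chunked",
        "private": False,
        "headSha": "0000000000000000000000000000000000000000",  # dummy
    },
}

resp = requests.post("http://localhost:7860/webhook", json=payload)
print(resp.json())  # {"processed": True} when the filters match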
src/models.py ADDED
@@ -0,0 +1,51 @@
+ import json
+ import os
+ from pydantic import BaseModel
+ from typing import Literal, Union
+
+
+ class ChunkConfig(BaseModel):
+     input_dataset: str
+     input_splits: list[str]
+     input_text_col: str
+     output_dataset: str
+     strategy: Literal["spacy", "sequence", "constant"]
+     split_seq: Union[str, list[str]]
+     chunk_len: int
+     private: bool
+
+
+ class EmbedConfig(BaseModel):
+     input_dataset: str
+     input_splits: list[str]
+     input_text_col: str
+     output_dataset: str
+     private: bool
+     semaphore_bound: int
+
+
+ class WebhookPayloadEvent(BaseModel):
+     action: Literal["create", "update", "delete"]
+     scope: str
+
+
+ class WebhookPayloadRepo(BaseModel):
+     type: Literal["dataset", "model", "space"]
+     name: str
+     id: str
+     private: bool
+     headSha: str
+
+
+ class WebhookPayload(BaseModel):
+     event: WebhookPayloadEvent
+     repo: WebhookPayloadRepo
+
+
+ with open(os.path.join(os.getcwd(), "chunk_config.json")) as c:
+     data = json.load(c)
+     chunk_config = ChunkConfig(**data)
+
+ with open(os.path.join(os.getcwd(), "embed_config.json")) as c:
+     data = json.load(c)
+     embed_config = EmbedConfig(**data)
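Note: both configs are validated once at import time, so a malformed field fails fast when the app starts. An illustrative example of the failure mode (not part of the commit):

from pydantic import ValidationError
from src.models import ChunkConfig

try:
    ChunkConfig(
        input_dataset="d", input_splits=["train"], input_text_col="text",
        output_dataset="o", strategy="tokenwise",  # not in the Literal
        split_seq="\n\n", chunk_len=512, private=False,
    )
except ValidationError as e:
    print(e)  # strategy must be 'spacy', 'sequence' or 'constant'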
style.css ADDED
@@ -0,0 +1,28 @@
+ body {
+     padding: 2rem;
+     font-family: -apple-system, BlinkMacSystemFont, "Arial", sans-serif;
+ }
+
+ h1 {
+     font-size: 16px;
+     margin-top: 0;
+ }
+
+ p {
+     color: rgb(107, 114, 128);
+     font-size: 15px;
+     margin-bottom: 10px;
+     margin-top: 5px;
+ }
+
+ .card {
+     max-width: 620px;
+     margin: 0 auto;
+     padding: 16px;
+     border: 1px solid lightgray;
+     border-radius: 16px;
+ }
+
+ .card p:last-child {
+     margin-bottom: 0;
+ }