Spaces:

Mike007123
/

test2

Build error

App Files Files Community

mike committed on Nov 3, 2022

Commit

c4bd190

•

1 Parent(s): 6f99174

add

Browse files

Files changed (8) hide show

Makefile +2 -0
README.md +15 -8
app.py +80 -0
embeddings_encoder.py +47 -0
packages.txt +0 -0
requirements.txt +11 -0
templates/index.html +132 -0
umap_reducer.py +27 -0

Makefile ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ run:
2	+ PORT=3000 FLASK_ENV=development python app.py

README.md CHANGED Viewed

@@ -1,12 +1,19 @@
 ---
-title: Test2
-emoji: ⚡
-colorFrom: yellow
-colorTo: gray
-sdk: streamlit
-sdk_version: 1.10.0
 app_file: app.py
 pinned: false
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: Sentence Embeddings Visualization
+emoji: 📈
+colorFrom: green
+colorTo: indigo
+sdk: gradio
 app_file: app.py
 pinned: false
 ---
+# Hugging Face Spaces + Observable
+### Sentence Embeddings Visualization
+Recently I've been exploring [Hugging Face Spaces](https://huggingface.co/spaces) and [sentence-transformers](https://huggingface.co/sentence-transformers) to build an application that generates text embeddings and visualizes their clusters.
+Currently, the quickest way to build interactive ML apps with Python (backend/frontend), afaik, is to use [Streamlit](https://streamlit.io/) or [Gradio](https://www.gradio.app/). To embed an Observable notebook on Streamlit, you can use this custom component [streamlit-observable](https://github.com/asg017/streamlit-observable)
+This [Observable notebook](https://observablehq.com/@radames/hugging-face-spaces-observable-sentence-embeddings) is the frontend application for this [Hugging Face Spaces](https://huggingface.co/spaces/radames/sentence-embeddings-visualization) app.
+This notebook explores another way to integrate Observable inside Hugging Face Spaces. Currently, [HF Spaces supports](https://huggingface.co/docs/hub/spaces#streamlit-and-gradio) Streamlit, Gradio, or a simple static web page.
+The concept here is to use this entire notebook as the frontend and data visualization application for the [ML Flask/Python](https://huggingface.co/spaces/radames/sentence-embeddings-visualization/blob/main/app.py#L37-L75) backend.
+* The index route renders a [simple HTML template](https://huggingface.co/spaces/radames/sentence-embeddings-visualization/blob/main/templates/index.html) containing [Observable Runtime API code](https://observablehq.com/@observablehq/downloading-and-embedding-notebooks).
+* A single function, triggered by a POST request to `run-umap`, returns a low-dimensional representation of the original sentence-transformers embeddings computed with UMAP, together with a cluster analysis computed with HDBSCAN.
+* All the visualization and interactive magic happens in the JavaScript code inside the Observable notebook.

app.py ADDED Viewed

	@@ -0,0 +1,80 @@

+from umap_reducer import UMAPReducer
+from embeddings_encoder import EmbeddingsEncoder
+from flask import Flask, request, render_template, jsonify, make_response, session
+from flask_session import Session
+from flask_cors import CORS, cross_origin
+import os
+from dotenv import load_dotenv
+import feedparser
+import json
+from dateutil import parser
+import re
+import numpy as np
+import gzip
+import hashlib
+load_dotenv()
+app = Flask(__name__, static_url_path='/static')
+app.config["SECRET_KEY"] = os.environ.get("SECRET_KEY")
+app.config["SESSION_PERMANENT"] = True
+app.config["SESSION_TYPE"] = "filesystem"
+app.config["SESSION_COOKIE_SAMESITE"] = "None"
+app.config["SESSION_COOKIE_SECURE"] = True
+Session(app)
+CORS(app)
+reducer = UMAPReducer()
+encoder = EmbeddingsEncoder()
+@app.route('/')
+def index():
+    return render_template('index.html')
+@app.route('/run-umap', methods=['POST'])
+@cross_origin(supports_credentials=True)
+def run_umap():
+    input_data = request.get_json()
+    sentences = input_data['data']['sentences']
+    umap_options = input_data['data']['umap_options']
+    cluster_options = input_data['data']['cluster_options']
+    # create unique hash for input, avoid recalculating embeddings
+    sentences_input_hash = hashlib.sha256(
+        ''.join(sentences).encode("utf-8")).hexdigest()
+    print("input options:", sentences_input_hash,
+          umap_options, cluster_options, "\n\n")
+    try:
+        if not session.get(sentences_input_hash):
+            print("New input, calculating embeddings" "\n\n")
+            embeddings = encoder.encode(sentences)
+            session[sentences_input_hash] = embeddings.tolist()
+        else:
+            print("Input already calculated, using cached embeddings", "\n\n")
+            embeddings = session[sentences_input_hash]
+        # UMAP embeddings
+        reducer.setParams(umap_options, cluster_options)
+        umap_embeddings = reducer.embed(embeddings)
+        # HDBScan cluster analysis
+        clusters = reducer.clusterAnalysis(umap_embeddings)
+        content = gzip.compress(json.dumps(
+            {
+                "embeddings": umap_embeddings.tolist(),
+                "clusters": clusters.labels_.tolist()
+            }
+        ).encode('utf8'), 5)
+        response = make_response(content)
+        response.headers['Content-length'] = len(content)
+        response.headers['Content-Encoding'] = 'gzip'
+        return response
+    except Exception as e:
+        return jsonify({"error": str(e)}), 400
+if __name__ == '__main__':
+    app.run(host='0.0.0.0',  port=int(os.environ.get('PORT', 7860)))

embeddings_encoder.py ADDED Viewed

	@@ -0,0 +1,47 @@

+# from https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2
+from transformers import AutoTokenizer, AutoModel
+import torch
+import torch.nn.functional as F
+import os
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
+class EmbeddingsEncoder:
+    def __init__(self):
+        # Load model from HuggingFace Hub
+        self.tokenizer = AutoTokenizer.from_pretrained(
+            'sentence-transformers/all-MiniLM-L6-v2')
+        self.model = AutoModel.from_pretrained(
+            'sentence-transformers/all-MiniLM-L6-v2')
+    # Mean Pooling - Take average of all tokens
+    def mean_pooling(self, model_output, attention_mask):
+        # First element of model_output contains all token embeddings
+        token_embeddings = model_output.last_hidden_state
+        input_mask_expanded = attention_mask.unsqueeze(
+            -1).expand(token_embeddings.size()).float()
+        return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
+    # Encode text
+    def encode(self,  texts):
+        # Tokenize sentences
+        print("Tokenizing...")
+        encoded_input = self.tokenizer(
+            texts, padding=True, truncation=True, return_tensors='pt')
+        # Compute token embeddings
+        print("Computing embeddings...")
+        with torch.no_grad():
+            model_output = self.model(**encoded_input, return_dict=True)
+        # Perform pooling
+        print("Performing pooling...")
+        embeddings = self.mean_pooling(
+            model_output, encoded_input['attention_mask'])
+        # Normalize embeddings
+        print("Normalizing embeddings...")
+        embeddings = F.normalize(embeddings, p=2, dim=1)
+        return embeddings

packages.txt ADDED Viewed

File without changes

requirements.txt ADDED Viewed

	@@ -0,0 +1,11 @@

+feedparser==6.0.8
+Flask==2.0.3
+flask_cors==3.0.10
+flask_session==0.4.0
+hdbscan==0.8.28
+numpy==1.22.2
+python-dotenv==0.19.2
+python_dateutil==2.8.2
+torch==1.10.2
+transformers==4.16.2
+umap-learn==0.5.2

templates/index.html ADDED Viewed

	@@ -0,0 +1,132 @@

+<!DOCTYPE html>
+<html lang="en">
+  <head>
+    <base href="" />
+    <meta charset="utf-8" />
+    <meta name="description" content="" />
+    <link rel="icon" href="favicon.png" />
+    <meta name="viewport" content="width=device-width, initial-scale=1" />
+    <script src="https://cdnjs.cloudflare.com/ajax/libs/iframe-resizer/4.3.1/iframeResizer.contentWindow.min.js"></script>
+    <link
+      rel="stylesheet"
+      href="https://cdn.jsdelivr.net/npm/@observablehq/inspector@3/dist/inspector.css"
+    />
+    <style>
+      @import url("https://fonts.googleapis.com/css2?family=Roboto&family=Source+Serif+4:wght@700&display=swap");
+    </style>
+    <style>
+      .mw8 {
+        max-width: 64rem;
+      }
+      .mx-auto {
+        margin-left: auto;
+        margin-right: auto;
+      }
+      .ph3 {
+        padding-left: 1rem;
+        padding-right: 1rem;
+      }
+      .measure-wide {
+        max-width: 34em;
+      }
+      * {
+        font-family: "Roboto", sans-serif;
+      }
+      h1,
+      h2,
+      h3,
+      h4,
+      h5,
+      h6 {
+        font-family: "Source Serif 4", serif;
+      }
+    </style>
+  </head>
+  <body>
+    <article class="mw8 mx-auto ph3 sans-serif">
+      <div class="measure-wide" id="observablehq-intro-7dbb745c"></div>
+      <div id="observablehq-viewof-sentences-7dbb745c"></div>
+      <div id="observablehq-viewof-params-7dbb745c"></div>
+      <div id="observablehq-viewof-tryme-7dbb745c"></div>
+      <div id="observablehq-viewof-clear-7dbb745c"></div>
+      <div id="observablehq-datasets-7dbb745c"></div>
+      <div id="observablehq-dialog-7dbb745c"></div>
+      <div id="observablehq-viewof-scatter-7dbb745c"></div>
+      <div id="observablehq-umapoptions-7dbb745c"></div>
+      <div id="observablehq-viewof-umapOptions-7dbb745c"></div>
+      <div id="observablehq-umapbutton-7dbb745c"></div>
+      <div id="observablehq-hdbscanoptions-7dbb745c"></div>
+      <div id="observablehq-viewof-clusterOptions-7dbb745c"></div>
+      <div id="observablehq-hdbcscanbutton-7dbb745c"></div>
+    </article>
+    <script type="module">
+      import {
+        Runtime,
+        Inspector,
+      } from "https://cdn.jsdelivr.net/npm/@observablehq/runtime@4/dist/runtime.js";
+      import define from "https://api.observablehq.com/d/843a8bdf01fc2c8f.js?v=3";
+      new Runtime().module(define, (name) => {
+        if (name === "intro")
+          return new Inspector(
+            document.querySelector("#observablehq-intro-7dbb745c")
+          );
+        if (name === "viewof sentences")
+          return new Inspector(
+            document.querySelector("#observablehq-viewof-sentences-7dbb745c")
+          );
+        if (name === "viewof params")
+          return new Inspector(
+            document.querySelector("#observablehq-viewof-params-7dbb745c")
+          );
+        if (name === "viewof tryme")
+          return new Inspector(
+            document.querySelector("#observablehq-viewof-tryme-7dbb745c")
+          );
+        if (name === "viewof clear")
+          return new Inspector(
+            document.querySelector("#observablehq-viewof-clear-7dbb745c")
+          );
+        if (name === "datasets")
+          return new Inspector(
+            document.querySelector("#observablehq-datasets-7dbb745c")
+          );
+        if (name === "dialog")
+          return new Inspector(
+            document.querySelector("#observablehq-dialog-7dbb745c")
+          );
+        if (name === "viewof scatter")
+          return new Inspector(
+            document.querySelector("#observablehq-viewof-scatter-7dbb745c")
+          );
+        if (name === "umapoptions")
+          return new Inspector(
+            document.querySelector("#observablehq-umapoptions-7dbb745c")
+          );
+        if (name === "viewof umapOptions")
+          return new Inspector(
+            document.querySelector("#observablehq-viewof-umapOptions-7dbb745c")
+          );
+        if (name === "umapbutton")
+          return new Inspector(
+            document.querySelector("#observablehq-umapbutton-7dbb745c")
+          );
+        if (name === "hdbscanoptions")
+          return new Inspector(
+            document.querySelector("#observablehq-hdbscanoptions-7dbb745c")
+          );
+        if (name === "viewof clusterOptions")
+          return new Inspector(
+            document.querySelector(
+              "#observablehq-viewof-clusterOptions-7dbb745c"
+            )
+          );
+        if (name === "hdbcscanbutton")
+          return new Inspector(
+            document.querySelector("#observablehq-hdbcscanbutton-7dbb745c")
+          );
+        return ["update", "data", "colorScale"].includes(name);
+      });
+    </script>
+  </body>
+</html>

umap_reducer.py ADDED Viewed

	@@ -0,0 +1,27 @@

+import umap
+import hdbscan
+import copy
+class UMAPReducer:
+    def __init__(self, umap_options={}, cluster_options={}):
+        # set options with defaults
+        self.umap_options = {'n_components': 2, 'spread': 1, 'min_dist': 0.1, 'n_neighbors': 15,
+                             'metric': 'cosine', "verbose": True, **umap_options}
+        self.cluster_options = {'allow_single_cluster': True, 'min_cluster_size': 500, 'min_samples': 10, **cluster_options}
+    def setParams(self, umap_options={}, cluster_options={}):
+        # update params
+        self.umap_options = {**self.umap_options, **umap_options}
+        self.cluster_options = {**self.cluster_options, **cluster_options}
+    def clusterAnalysis(self, data):
+        print("Cluster params:", self.cluster_options)
+        clusters = hdbscan.HDBSCAN().fit(data) # **self.cluster_options
+        return clusters
+    def embed(self, data):
+        print("UMAP params:", self.umap_options)
+        result = umap.UMAP(**self.umap_options).fit_transform(data)
+        return result