mike committed on
Commit
c4bd190
•
1 Parent(s): 6f99174
Files changed (8) hide show
  1. Makefile +2 -0
  2. README.md +15 -8
  3. app.py +80 -0
  4. embeddings_encoder.py +47 -0
  5. packages.txt +0 -0
  6. requirements.txt +11 -0
  7. templates/index.html +132 -0
  8. umap_reducer.py +27 -0
Makefile ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ run:
2
+ PORT=3000 FLASK_ENV=development python app.py
README.md CHANGED
@@ -1,12 +1,19 @@
1
  ---
2
- title: Test2
3
- emoji: ⚡
4
- colorFrom: yellow
5
- colorTo: gray
6
- sdk: streamlit
7
- sdk_version: 1.10.0
8
  app_file: app.py
9
  pinned: false
10
  ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: Sentence Embeddings Visualization
3
+ emoji: 📈
4
+ colorFrom: green
5
+ colorTo: indigo
6
+ sdk: gradio
 
7
  app_file: app.py
8
  pinned: false
9
  ---
10
+ # Hugging Face Spaces + Observable
11
+ ### Sentence Embeddings Visualization
12
+ Recently I've been exploring [Hugging Face Spaces](https://huggingface.co/spaces) and [sentence-transformers](https://huggingface.co/sentence-transformers) to build an application that generates text embeddings and a clustering visualization.
13
+ Currently, the quickest way to build interactive ML apps with Python (backend/frontend), afaik, is to use [Streamlit](https://streamlit.io/) or [Gradio](https://www.gradio.app/). To embed an Observable notebook on Streamlit, you can use this custom component [streamlit-observable](https://github.com/asg017/streamlit-observable)
14
+ This [Observable notebook](https://observablehq.com/@radames/hugging-face-spaces-observable-sentence-embeddings) is the frontend application for this [Hugging Face Spaces](https://huggingface.co/spaces/radames/sentence-embeddings-visualization) app.
15
+ This notebook explores another way to integrate Observable inside Hugging Face Spaces. Currently, [HF Spaces supports](https://huggingface.co/docs/hub/spaces#streamlit-and-gradio) Streamlit and Gradio or a simple static web page.
16
+ The concept here is to use this entire notebook as the frontend and data visualization application for the [ML Flask/Python](https://huggingface.co/spaces/radames/sentence-embeddings-visualization/blob/main/app.py#L37-L75) backend.
17
+ * The index route renders a [simple HTML template](https://huggingface.co/spaces/radames/sentence-embeddings-visualization/blob/main/templates/index.html) containing [Observable Runtime API code](https://observablehq.com/@observablehq/downloading-and-embedding-notebooks).
18
+ * A single function, triggered by a POST request to `run-umap`, returns a low-dimensional representation of the original sentence-transformers embeddings computed with UMAP, together with an HDBSCAN cluster analysis.
19
+ * All the visualization and interactive magic happens in the JavaScript code inside the Observable notebook.
app.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from umap_reducer import UMAPReducer
2
+ from embeddings_encoder import EmbeddingsEncoder
3
+ from flask import Flask, request, render_template, jsonify, make_response, session
4
+ from flask_session import Session
5
+ from flask_cors import CORS, cross_origin
6
+ import os
7
+ from dotenv import load_dotenv
8
+ import feedparser
9
+ import json
10
+ from dateutil import parser
11
+ import re
12
+ import numpy as np
13
+ import gzip
14
+ import hashlib
15
+
16
# Read variables from a local .env file (if any) before the config below
# looks up SECRET_KEY in the environment.
load_dotenv()


app = Flask(__name__, static_url_path='/static')
# Server-side (filesystem) sessions hold the cached embeddings. The cookie is
# marked SameSite=None + Secure -- presumably so it is still sent when the
# frontend is embedded cross-site; confirm against the deployment.
app.config.update(
    SECRET_KEY=os.environ.get("SECRET_KEY"),
    SESSION_PERMANENT=True,
    SESSION_TYPE="filesystem",
    SESSION_COOKIE_SAMESITE="None",
    SESSION_COOKIE_SECURE=True,
)
Session(app)
CORS(app)

# Process-wide singletons shared by every request.
reducer = UMAPReducer()
encoder = EmbeddingsEncoder()
30
+
31
+
32
@app.route('/')
def index():
    """Serve the landing page rendered from the ``index.html`` template."""
    page = render_template('index.html')
    return page
35
+
36
+
37
@app.route('/run-umap', methods=['POST'])
@cross_origin(supports_credentials=True)
def run_umap():
    """Embed the posted sentences, project them with UMAP and cluster them.

    The request body must be JSON shaped like
    ``{"data": {"sentences": [...], "umap_options": {...},
    "cluster_options": {...}}}``.

    Returns:
        On success, a gzip-compressed JSON document with the keys
        ``embeddings`` (the UMAP projection) and ``clusters`` (the HDBSCAN
        labels). On any failure, a JSON ``{"error": <message>}`` body with
        HTTP status 400.
    """
    # silent=True turns an unparsable body into None, so that it is reported
    # through the same JSON 400 path as a missing key below.
    input_data = request.get_json(silent=True)
    try:
        payload = input_data['data']
        sentences = payload['sentences']
        umap_options = payload['umap_options']
        cluster_options = payload['cluster_options']
    except (KeyError, TypeError) as e:
        return jsonify({"error": f"malformed request body: {e!r}"}), 400

    # Create a unique hash for the input to avoid recalculating embeddings.
    # The list is serialised as JSON (instead of concatenated) so that
    # different splits of the same characters, e.g. ["ab", "c"] and
    # ["a", "bc"], do not share a cache entry.
    sentences_input_hash = hashlib.sha256(
        json.dumps(sentences).encode("utf-8")).hexdigest()

    print("input options:", sentences_input_hash,
          umap_options, cluster_options, "\n\n")
    try:
        embeddings = session.get(sentences_input_hash)
        if not embeddings:
            print("New input, calculating embeddings" "\n\n")
            # Store and use the plain-list form so that both the fresh and
            # the cached path hand the same type to the reducer.
            embeddings = encoder.encode(sentences).tolist()
            session[sentences_input_hash] = embeddings
        else:
            print("Input already calculated, using cached embeddings", "\n\n")

        # UMAP embeddings
        # NOTE(review): `reducer` is a module-level object, so options set
        # here persist into later requests that omit them.
        reducer.setParams(umap_options, cluster_options)
        umap_embeddings = reducer.embed(embeddings)
        # HDBScan cluster analysis
        clusters = reducer.clusterAnalysis(umap_embeddings)
        content = gzip.compress(json.dumps(
            {
                "embeddings": umap_embeddings.tolist(),
                "clusters": clusters.labels_.tolist()
            }
        ).encode('utf8'), 5)
        response = make_response(content)
        response.headers['Content-length'] = len(content)
        response.headers['Content-Encoding'] = 'gzip'
        # The payload is JSON; without this the default text/html is sent.
        response.headers['Content-Type'] = 'application/json'
        return response
    except Exception as e:
        # Top-level boundary: report any processing failure to the client.
        return jsonify({"error": str(e)}), 400
76
+
77
+
78
if __name__ == '__main__':
    # Listen on every interface; the PORT environment variable overrides
    # the default of 7860.
    listen_port = int(os.environ.get('PORT', 7860))
    app.run(host='0.0.0.0', port=listen_port)
80
+
embeddings_encoder.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # from https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2
2
+ from transformers import AutoTokenizer, AutoModel
3
+ import torch
4
+ import torch.nn.functional as F
5
+ import os
6
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
7
+
8
class EmbeddingsEncoder:
    """Sentence encoder backed by ``sentence-transformers/all-MiniLM-L6-v2``."""

    def __init__(self):
        # Tokenizer and model are both loaded from the same HuggingFace Hub
        # checkpoint.
        checkpoint = 'sentence-transformers/all-MiniLM-L6-v2'
        self.tokenizer = AutoTokenizer.from_pretrained(checkpoint)
        self.model = AutoModel.from_pretrained(checkpoint)

    def mean_pooling(self, model_output, attention_mask):
        """Average the token embeddings, counting only unmasked tokens.

        ``model_output.last_hidden_state`` supplies the token embeddings and
        ``attention_mask`` selects which of them contribute to the mean.
        """
        hidden_states = model_output.last_hidden_state
        mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
        masked_total = torch.sum(hidden_states * mask, 1)
        # Clamp the token count so a fully masked row never divides by zero.
        token_count = torch.clamp(mask.sum(1), min=1e-9)
        return masked_total / token_count

    def encode(self, texts):
        """Tokenize ``texts``, run the model and return L2-normalized embeddings."""
        print("Tokenizing...")
        batch = self.tokenizer(
            texts, padding=True, truncation=True, return_tensors='pt')

        # Inference only, so gradient tracking is switched off.
        print("Computing embeddings...")
        with torch.no_grad():
            output = self.model(**batch, return_dict=True)

        print("Performing pooling...")
        pooled = self.mean_pooling(output, batch['attention_mask'])

        print("Normalizing embeddings...")
        return F.normalize(pooled, p=2, dim=1)
packages.txt ADDED
File without changes
requirements.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ feedparser==6.0.8
2
+ Flask==2.0.3
3
+ flask_cors==3.0.10
4
+ flask_session==0.4.0
5
+ hdbscan==0.8.28
6
+ numpy==1.22.2
7
+ python-dotenv==0.19.2
8
+ python_dateutil==2.8.2
9
+ torch==1.10.2
10
+ transformers==4.16.2
11
+ umap-learn==0.5.2
templates/index.html ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <base href="" />
5
+ <meta charset="utf-8" />
6
+ <meta name="description" content="" />
7
+ <link rel="icon" href="favicon.png" />
8
+ <meta name="viewport" content="width=device-width, initial-scale=1" />
9
+ <script src="https://cdnjs.cloudflare.com/ajax/libs/iframe-resizer/4.3.1/iframeResizer.contentWindow.min.js"></script>
10
+ <link
11
+ rel="stylesheet"
12
+ href="https://cdn.jsdelivr.net/npm/@observablehq/inspector@3/dist/inspector.css"
13
+ />
14
+ <style>
15
+ @import url("https://fonts.googleapis.com/css2?family=Roboto&family=Source+Serif+4:wght@700&display=swap");
16
+ </style>
17
+ <style>
18
+ .mw8 {
19
+ max-width: 64rem;
20
+ }
21
+ .mx-auto {
22
+ margin-left: auto;
23
+ margin-right: auto;
24
+ }
25
+ .ph3 {
26
+ padding-left: 1rem;
27
+ padding-right: 1rem;
28
+ }
29
+ .measure-wide {
30
+ max-width: 34em;
31
+ }
32
+ * {
33
+ font-family: "Roboto", sans-serif;
34
+ }
35
+ h1,
36
+ h2,
37
+ h3,
38
+ h4,
39
+ h5,
40
+ h6 {
41
+ font-family: "Source Serif 4", serif;
42
+ }
43
+ </style>
44
+ </head>
45
+ <body>
46
+ <article class="mw8 mx-auto ph3 sans-serif">
47
+ <div class="measure-wide" id="observablehq-intro-7dbb745c"></div>
48
+ <div id="observablehq-viewof-sentences-7dbb745c"></div>
49
+ <div id="observablehq-viewof-params-7dbb745c"></div>
50
+ <div id="observablehq-viewof-tryme-7dbb745c"></div>
51
+ <div id="observablehq-viewof-clear-7dbb745c"></div>
52
+ <div id="observablehq-datasets-7dbb745c"></div>
53
+ <div id="observablehq-dialog-7dbb745c"></div>
54
+ <div id="observablehq-viewof-scatter-7dbb745c"></div>
55
+ <div id="observablehq-umapoptions-7dbb745c"></div>
56
+ <div id="observablehq-viewof-umapOptions-7dbb745c"></div>
57
+ <div id="observablehq-umapbutton-7dbb745c"></div>
58
+ <div id="observablehq-hdbscanoptions-7dbb745c"></div>
59
+ <div id="observablehq-viewof-clusterOptions-7dbb745c"></div>
60
+ <div id="observablehq-hdbcscanbutton-7dbb745c"></div>
61
+ </article>
62
+
63
+ <script type="module">
64
import {
  Runtime,
  Inspector,
} from "https://cdn.jsdelivr.net/npm/@observablehq/runtime@4/dist/runtime.js";
import define from "https://api.observablehq.com/d/843a8bdf01fc2c8f.js?v=3";

// Notebook cells that are rendered into the page. Each one has a container
// whose id is "observablehq-<cell name with the space replaced by '-'>-7dbb745c".
const renderedCells = new Set([
  "intro",
  "viewof sentences",
  "viewof params",
  "viewof tryme",
  "viewof clear",
  "datasets",
  "dialog",
  "viewof scatter",
  "umapoptions",
  "viewof umapOptions",
  "umapbutton",
  "hdbscanoptions",
  "viewof clusterOptions",
  "hdbcscanbutton",
]);

// Cells that get `true` instead of an Inspector: no DOM output is attached
// to them here.
const unrenderedCells = ["update", "data", "colorScale"];

new Runtime().module(define, (name) => {
  if (renderedCells.has(name)) {
    const selector = `#observablehq-${name.replace(" ", "-")}-7dbb745c`;
    return new Inspector(document.querySelector(selector));
  }
  return unrenderedCells.includes(name);
});
130
+ </script>
131
+ </body>
132
+ </html>
umap_reducer.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import umap
2
+ import hdbscan
3
+ import copy
4
+
5
+
6
class UMAPReducer:
    """Holds UMAP and HDBSCAN options and runs both steps on demand."""

    def __init__(self, umap_options=None, cluster_options=None):
        """Store the default options, overridden by any that are passed in.

        Args:
            umap_options: Optional dict of keyword arguments for ``umap.UMAP``;
                ``None`` means "no overrides".
            cluster_options: Optional dict of clustering options; ``None``
                means "no overrides".
        """
        # set options with defaults
        self.umap_options = {
            'n_components': 2, 'spread': 1, 'min_dist': 0.1, 'n_neighbors': 15,
            'metric': 'cosine', "verbose": True, **(umap_options or {})}
        self.cluster_options = {
            'allow_single_cluster': True, 'min_cluster_size': 500,
            'min_samples': 10, **(cluster_options or {})}

    def setParams(self, umap_options=None, cluster_options=None):
        """Merge new options over the stored ones.

        Keys that are not mentioned keep their current value; ``None`` (for
        example a JSON ``null``) leaves the corresponding options untouched.
        """
        # update params
        self.umap_options = {**self.umap_options, **(umap_options or {})}
        self.cluster_options = {
            **self.cluster_options, **(cluster_options or {})}

    def clusterAnalysis(self, data):
        """Fit HDBSCAN on ``data`` and return the fitted clusterer."""
        print("Cluster params:", self.cluster_options)
        # NOTE(review): the clusterer is built with library defaults;
        # self.cluster_options is logged above but not passed in (the original
        # author left `**self.cluster_options` commented out). Confirm whether
        # the stored options are meant to be applied before enabling them.
        clusters = hdbscan.HDBSCAN().fit(data)
        return clusters

    def embed(self, data):
        """Return the UMAP projection of ``data`` using the stored options."""
        print("UMAP params:", self.umap_options)
        result = umap.UMAP(**self.umap_options).fit_transform(data)
        return result
27
+ return result