Spaces:
Build error
Build error
mike
committed on
Commit
•
c4bd190
1
Parent(s):
6f99174
add
Browse files- Makefile +2 -0
- README.md +15 -8
- app.py +80 -0
- embeddings_encoder.py +47 -0
- packages.txt +0 -0
- requirements.txt +11 -0
- templates/index.html +132 -0
- umap_reducer.py +27 -0
Makefile
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
run:
|
2 |
+
PORT=3000 FLASK_ENV=development python app.py
|
README.md
CHANGED
@@ -1,12 +1,19 @@
|
|
1 |
---
|
2 |
-
title:
|
3 |
-
emoji:
|
4 |
-
colorFrom:
|
5 |
-
colorTo:
|
6 |
-
sdk:
|
7 |
-
sdk_version: 1.10.0
|
8 |
app_file: app.py
|
9 |
pinned: false
|
10 |
---
|
11 |
-
|
12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
---
|
2 |
+
title: Sentence Embeddings Visualization
|
3 |
+
emoji: π
|
4 |
+
colorFrom: green
|
5 |
+
colorTo: indigo
|
6 |
+
sdk: gradio
|
|
|
7 |
app_file: app.py
|
8 |
pinned: false
|
9 |
---
|
10 |
+
# Hugging Face Spaces + Observable
|
11 |
+
### Sentence Embeddings Visualization
|
12 |
+
Recently I've been exploring [Hugging Face Spaces](https://huggingface.co/spaces) and [sentence-transformers](https://huggingface.co/sentence-transformers) to build an application that generates text embeddings and visualizes their clusters.
|
13 |
+
Currently, the quickest way to build interactive ML apps with Python (backend/frontend), afaik, is to use [Streamlit](https://streamlit.io/) or [Gradio](https://www.gradio.app/). To embed an Observable notebook on Streamlit, you can use this custom component [streamlit-observable](https://github.com/asg017/streamlit-observable)
|
14 |
+
This [Observable notebook](https://observablehq.com/@radames/hugging-face-spaces-observable-sentence-embeddings) is the frontend application for this [Hugging Face Spaces](https://huggingface.co/spaces/radames/sentence-embeddings-visualization) app.
|
15 |
+
This notebook explores another way to integrate Observable inside Hugging Face Spaces. Currently, [HF Spaces supports](https://huggingface.co/docs/hub/spaces#streamlit-and-gradio) Streamlit and Gradio or a simple static web page.
|
16 |
+
The concept here is to use this entire notebook as the frontend and data visualization application for the [ML Flask/Python](https://huggingface.co/spaces/radames/sentence-embeddings-visualization/blob/main/app.py#L37-L75) backend.
|
17 |
+
* The index route renders a [simple HTML template](https://huggingface.co/spaces/radames/sentence-embeddings-visualization/blob/main/templates/index.html) containing [Observable Runtime API code](https://observablehq.com/@observablehq/downloading-and-embedding-notebooks).
|
18 |
+
* A single function, triggered by a POST request to `run-umap`, returns a low-dimensional representation of the original sentence-transformers embeddings using UMAP, together with a cluster analysis computed with HDBSCAN.
|
19 |
+
* All the visualization and interactive magic happens in the JavaScript code inside the Observable notebook.
|
app.py
ADDED
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from umap_reducer import UMAPReducer
|
2 |
+
from embeddings_encoder import EmbeddingsEncoder
|
3 |
+
from flask import Flask, request, render_template, jsonify, make_response, session
|
4 |
+
from flask_session import Session
|
5 |
+
from flask_cors import CORS, cross_origin
|
6 |
+
import os
|
7 |
+
from dotenv import load_dotenv
|
8 |
+
import feedparser
|
9 |
+
import json
|
10 |
+
from dateutil import parser
|
11 |
+
import re
|
12 |
+
import numpy as np
|
13 |
+
import gzip
|
14 |
+
import hashlib
|
15 |
+
|
16 |
+
load_dotenv()
|
17 |
+
|
18 |
+
|
19 |
+
# Flask application serving static assets under /static.
app = Flask(__name__, static_url_path='/static')

# Session settings: permanent, filesystem-backed sessions whose cookie is
# marked SameSite=None and Secure. The secret key is read from the environment.
app.config.update(
    SECRET_KEY=os.environ.get("SECRET_KEY"),
    SESSION_PERMANENT=True,
    SESSION_TYPE="filesystem",
    SESSION_COOKIE_SAMESITE="None",
    SESSION_COOKIE_SECURE=True,
)
Session(app)
CORS(app)

# Module-level workers shared by every request handled by this process.
reducer = UMAPReducer()
encoder = EmbeddingsEncoder()
|
30 |
+
|
31 |
+
|
32 |
+
@app.route('/')
def index():
    """Serve the landing page rendered from the ``index.html`` template."""
    page = render_template('index.html')
    return page
|
35 |
+
|
36 |
+
|
37 |
+
def _sentences_cache_key(sentences):
    """Return a SHA-256 hex digest identifying the exact list of sentences.

    The list is serialised as JSON before hashing so that sentence boundaries
    are part of the key: ``["ab", "c"]`` and ``["a", "bc"]`` hash differently,
    whereas hashing the plain concatenation would make them collide and serve
    the wrong cached embeddings.
    """
    serialised = json.dumps(sentences).encode("utf-8")
    return hashlib.sha256(serialised).hexdigest()


@app.route('/run-umap', methods=['POST'])
@cross_origin(supports_credentials=True)
def run_umap():
    """Embed sentences, reduce them with UMAP and cluster the projection.

    Expects a JSON body of the form
    ``{"data": {"sentences": [...], "umap_options": {...},
    "cluster_options": {...}}}``.

    Returns:
        A gzip-compressed JSON response with two keys: ``"embeddings"`` (the
        UMAP projection) and ``"clusters"`` (the cluster labels). On any
        failure a JSON body ``{"error": <message>}`` with status 400.
    """
    # Validate the payload shape up front so a missing key or a null body is
    # reported to the client as a 400 instead of surfacing as a server error.
    try:
        payload = request.get_json()['data']
        sentences = payload['sentences']
        umap_options = payload['umap_options']
        cluster_options = payload['cluster_options']
    except (KeyError, TypeError) as e:
        return jsonify(
            {"error": "invalid request payload: {}".format(e)}), 400

    try:
        # Unique key for this input; avoids recalculating embeddings.
        sentences_input_hash = _sentences_cache_key(sentences)

        print("input options:", sentences_input_hash,
              umap_options, cluster_options, "\n\n")

        if not session.get(sentences_input_hash):
            print("New input, calculating embeddings\n\n")
            embeddings = encoder.encode(sentences)
            # Stored as plain lists so the session backend can serialise it.
            session[sentences_input_hash] = embeddings.tolist()
        else:
            print("Input already calculated, using cached embeddings", "\n\n")
            embeddings = session[sentences_input_hash]

        # UMAP embeddings
        reducer.setParams(umap_options, cluster_options)
        umap_embeddings = reducer.embed(embeddings)
        # HDBScan cluster analysis
        clusters = reducer.clusterAnalysis(umap_embeddings)

        body = json.dumps({
            "embeddings": umap_embeddings.tolist(),
            "clusters": clusters.labels_.tolist(),
        }).encode('utf8')
        content = gzip.compress(body, 5)

        response = make_response(content)
        response.headers['Content-length'] = len(content)
        response.headers['Content-Encoding'] = 'gzip'
        # The decoded body is JSON; say so instead of the default HTML type.
        response.headers['Content-Type'] = 'application/json'
        return response
    except Exception as e:
        # Request boundary: report any processing failure to the client.
        return jsonify({"error": str(e)}), 400
|
76 |
+
|
77 |
+
|
78 |
+
if __name__ == '__main__':
    # Listen on all interfaces; the PORT environment variable overrides
    # the default port 7860.
    listen_port = int(os.environ.get('PORT', 7860))
    app.run(host='0.0.0.0', port=listen_port)
|
80 |
+
|
embeddings_encoder.py
ADDED
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# from https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2
|
2 |
+
from transformers import AutoTokenizer, AutoModel
|
3 |
+
import torch
|
4 |
+
import torch.nn.functional as F
|
5 |
+
import os
|
6 |
+
# Set TOKENIZERS_PARALLELISM to "false" at import time, before any tokenizer is created.
# NOTE(review): presumably this silences the tokenizers parallelism/fork warning — confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
7 |
+
|
8 |
+
class EmbeddingsEncoder:
    """Sentence encoder built on a Hugging Face transformer checkpoint.

    Sentences are tokenized, passed through the model, mean-pooled over their
    (non-padding) tokens and L2-normalised.
    """

    # Checkpoint used when no model name is given.
    DEFAULT_MODEL = 'sentence-transformers/all-MiniLM-L6-v2'

    def __init__(self, model_name=DEFAULT_MODEL):
        """Load the tokenizer and model for ``model_name`` from the HF Hub.

        Args:
            model_name: Checkpoint identifier passed to ``from_pretrained``;
                defaults to ``sentence-transformers/all-MiniLM-L6-v2``.
        """
        self.model_name = model_name
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)

    def mean_pooling(self, model_output, attention_mask):
        """Average the token embeddings of each sequence, ignoring masked tokens.

        Args:
            model_output: Model output exposing ``last_hidden_state``
                (the per-token embeddings).
            attention_mask: Mask with one entry per token; positions with 0
                do not contribute to the average.

        Returns:
            One pooled embedding per input sequence.
        """
        token_embeddings = model_output.last_hidden_state
        # Broadcast the mask over the embedding dimension.
        input_mask_expanded = attention_mask.unsqueeze(
            -1).expand(token_embeddings.size()).float()
        summed = torch.sum(token_embeddings * input_mask_expanded, 1)
        # Clamp the token count so a fully masked row cannot divide by zero.
        counts = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        return summed / counts

    def encode(self, texts):
        """Return L2-normalised sentence embeddings for ``texts``.

        Args:
            texts: Sentence or list of sentences accepted by the tokenizer.

        Returns:
            A tensor with one normalised embedding row per input text.
        """
        # Tokenize sentences
        print("Tokenizing...")
        encoded_input = self.tokenizer(
            texts, padding=True, truncation=True, return_tensors='pt')

        # Compute token embeddings (inference only, so no gradients)
        print("Computing embeddings...")
        with torch.no_grad():
            model_output = self.model(**encoded_input, return_dict=True)

        # Perform pooling
        print("Performing pooling...")
        embeddings = self.mean_pooling(
            model_output, encoded_input['attention_mask'])

        # Normalize embeddings
        print("Normalizing embeddings...")
        embeddings = F.normalize(embeddings, p=2, dim=1)

        return embeddings
|
packages.txt
ADDED
File without changes
|
requirements.txt
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
feedparser==6.0.8
|
2 |
+
Flask==2.0.3
|
3 |
+
flask_cors==3.0.10
|
4 |
+
flask_session==0.4.0
|
5 |
+
hdbscan==0.8.28
|
6 |
+
numpy==1.22.2
|
7 |
+
python-dotenv==0.19.2
|
8 |
+
python_dateutil==2.8.2
|
9 |
+
torch==1.10.2
|
10 |
+
transformers==4.16.2
|
11 |
+
umap-learn==0.5.2
|
templates/index.html
ADDED
@@ -0,0 +1,132 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<!DOCTYPE html>
|
2 |
+
<html lang="en">
|
3 |
+
<head>
|
4 |
+
<base href="" />
|
5 |
+
<meta charset="utf-8" />
|
6 |
+
<meta name="description" content="" />
|
7 |
+
<link rel="icon" href="favicon.png" />
|
8 |
+
<meta name="viewport" content="width=device-width, initial-scale=1" />
|
9 |
+
<script src="https://cdnjs.cloudflare.com/ajax/libs/iframe-resizer/4.3.1/iframeResizer.contentWindow.min.js"></script>
|
10 |
+
<link
|
11 |
+
rel="stylesheet"
|
12 |
+
href="https://cdn.jsdelivr.net/npm/@observablehq/inspector@3/dist/inspector.css"
|
13 |
+
/>
|
14 |
+
<style>
|
15 |
+
@import url("https://fonts.googleapis.com/css2?family=Roboto&family=Source+Serif+4:wght@700&display=swap");
|
16 |
+
</style>
|
17 |
+
<style>
|
18 |
+
.mw8 {
|
19 |
+
max-width: 64rem;
|
20 |
+
}
|
21 |
+
.mx-auto {
|
22 |
+
margin-left: auto;
|
23 |
+
margin-right: auto;
|
24 |
+
}
|
25 |
+
.ph3 {
|
26 |
+
padding-left: 1rem;
|
27 |
+
padding-right: 1rem;
|
28 |
+
}
|
29 |
+
.measure-wide {
|
30 |
+
max-width: 34em;
|
31 |
+
}
|
32 |
+
* {
|
33 |
+
font-family: "Roboto", sans-serif;
|
34 |
+
}
|
35 |
+
h1,
|
36 |
+
h2,
|
37 |
+
h3,
|
38 |
+
h4,
|
39 |
+
h5,
|
40 |
+
h6 {
|
41 |
+
font-family: "Source Serif 4", serif;
|
42 |
+
}
|
43 |
+
</style>
|
44 |
+
</head>
|
45 |
+
<body>
|
46 |
+
<article class="mw8 mx-auto ph3 sans-serif">
|
47 |
+
<div class="measure-wide" id="observablehq-intro-7dbb745c"></div>
|
48 |
+
<div id="observablehq-viewof-sentences-7dbb745c"></div>
|
49 |
+
<div id="observablehq-viewof-params-7dbb745c"></div>
|
50 |
+
<div id="observablehq-viewof-tryme-7dbb745c"></div>
|
51 |
+
<div id="observablehq-viewof-clear-7dbb745c"></div>
|
52 |
+
<div id="observablehq-datasets-7dbb745c"></div>
|
53 |
+
<div id="observablehq-dialog-7dbb745c"></div>
|
54 |
+
<div id="observablehq-viewof-scatter-7dbb745c"></div>
|
55 |
+
<div id="observablehq-umapoptions-7dbb745c"></div>
|
56 |
+
<div id="observablehq-viewof-umapOptions-7dbb745c"></div>
|
57 |
+
<div id="observablehq-umapbutton-7dbb745c"></div>
|
58 |
+
<div id="observablehq-hdbscanoptions-7dbb745c"></div>
|
59 |
+
<div id="observablehq-viewof-clusterOptions-7dbb745c"></div>
|
60 |
+
<div id="observablehq-hdbcscanbutton-7dbb745c"></div>
|
61 |
+
</article>
|
62 |
+
|
63 |
+
<script type="module">
  // Embeds selected cells of the Observable notebook into the placeholder
  // <div>s declared above, using the Observable Runtime.
  import {
    Runtime,
    Inspector,
  } from "https://cdn.jsdelivr.net/npm/@observablehq/runtime@4/dist/runtime.js";
  import define from "https://api.observablehq.com/d/843a8bdf01fc2c8f.js?v=3";

  // Notebook cells that get an Inspector attached to a container element.
  const renderedCells = [
    "intro",
    "viewof sentences",
    "viewof params",
    "viewof tryme",
    "viewof clear",
    "datasets",
    "dialog",
    "viewof scatter",
    "umapoptions",
    "viewof umapOptions",
    "umapbutton",
    "hdbscanoptions",
    "viewof clusterOptions",
    "hdbcscanbutton",
  ];

  // Cells with no container; the observer answers `true` for these and
  // `false` for every other cell name.
  const headlessCells = ["update", "data", "colorScale"];

  // Container ids follow "observablehq-<cell name, space replaced by '-'>-7dbb745c".
  const containerFor = (cellName) =>
    document.querySelector(
      "#observablehq-" + cellName.replace(" ", "-") + "-7dbb745c"
    );

  new Runtime().module(define, (name) =>
    renderedCells.includes(name)
      ? new Inspector(containerFor(name))
      : headlessCells.includes(name)
  );
</script>
|
131 |
+
</body>
|
132 |
+
</html>
|
umap_reducer.py
ADDED
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import umap
|
2 |
+
import hdbscan
|
3 |
+
import copy
|
4 |
+
|
5 |
+
|
6 |
+
class UMAPReducer:
    """Holds UMAP / HDBSCAN options and runs the reduction and clustering."""

    def __init__(self, umap_options=None, cluster_options=None):
        """Initialise the option sets, letting the arguments override defaults.

        Args:
            umap_options: Optional dict of keyword arguments for ``umap.UMAP``;
                ``None`` means "no overrides".
            cluster_options: Optional dict of clustering options;
                ``None`` means "no overrides".
        """
        # ``None`` (not ``{}``) is the default so that no dict object is
        # shared between calls, and so that a null value from a caller is
        # accepted instead of failing on ``**None``.
        self.umap_options = {
            'n_components': 2, 'spread': 1, 'min_dist': 0.1,
            'n_neighbors': 15, 'metric': 'cosine', "verbose": True,
            **(umap_options or {})}
        self.cluster_options = {
            'allow_single_cluster': True, 'min_cluster_size': 500,
            'min_samples': 10, **(cluster_options or {})}

    def setParams(self, umap_options=None, cluster_options=None):
        """Merge new options over the current ones.

        Keys that are not supplied keep their current value; ``None`` leaves
        the corresponding option set untouched.
        """
        self.umap_options = {**self.umap_options, **(umap_options or {})}
        self.cluster_options = {
            **self.cluster_options, **(cluster_options or {})}

    def clusterAnalysis(self, data):
        """Fit HDBSCAN on ``data`` and return the fitted clusterer.

        NOTE: ``self.cluster_options`` is printed but deliberately NOT passed
        to ``HDBSCAN`` (the original call had ``**self.cluster_options``
        commented out), so the library defaults are used.
        """
        print("Cluster params:", self.cluster_options)
        clusters = hdbscan.HDBSCAN().fit(data)
        return clusters

    def embed(self, data):
        """Return the UMAP projection of ``data`` using the current options."""
        print("UMAP params:", self.umap_options)
        result = umap.UMAP(**self.umap_options).fit_transform(data)
        return result
|