no breaking code
- aggregate_data.py +84 -0
- app.py +5 -57
- data.json +17 -0
- download_tasks.py +33 -0
- link.sh +13 -0
- models/all-MiniLM-L6-v2-q8/modules.json +0 -0
- models/all-MiniLM-L6-v2-q8/sentence_bert_config.json +4 -0
- ort.py +14 -0
- setup.sh +23 -0
- test_convert.py +49 -0
aggregate_data.py
ADDED
@@ -0,0 +1,84 @@
+import gradio as gr
+import pandas as pd
+import json
+import os
+
+
+TASKS_CLUSTERING = [
+    "ArxivClusteringP2P",
+    "ArxivClusteringS2S",
+    "BiorxivClusteringP2P",
+    "BiorxivClusteringS2S",
+    "MedrxivClusteringP2P",
+    "MedrxivClusteringS2S",
+    "RedditClustering",
+    "RedditClusteringP2P",
+    "StackExchangeClustering",
+    "StackExchangeClusteringP2P",
+    "TwentyNewsgroupsClustering",
+]
+
+TASKS_PAIR_CLASSIFICATION = [
+    "SprintDuplicateQuestions",
+    "TwitterSemEval2015",
+    "TwitterURLCorpus",
+]
+
+
+MODELS = [
+    "all-MiniLM-L6-v2"
+]
+
+
+def get_model_size(model_name):
+    return os.path.getsize(f"models/{model_name}/pytorch_model.bin") / (1024.0 * 1024.0)
+
+
+def compute_model_score(model_name):
+    results_dir = "results"
+    model_dir = os.path.join(results_dir, model_name)
+
+    scores = []
+
+    # Get scores for clustering tasks
+    for task in TASKS_CLUSTERING:
+        task_file = os.path.join(model_dir, f"{task}.json")
+        with open(task_file, 'r') as f:
+            data = json.load(f)
+        v_measure = data['test']['v_measure']
+        scores.append(v_measure)
+
+    # Get scores for pair classification tasks
+    for task in TASKS_PAIR_CLASSIFICATION:
+        task_file = os.path.join(model_dir, f"{task}.json")
+        with open(task_file, 'r') as f:
+            data = json.load(f)
+        max_ap = data['test']['max']['ap']
+        scores.append(max_ap)
+
+    # Compute average score
+    average_score = sum(scores) / len(scores)
+    return average_score
+
+
+DATA = {
+    "Model": MODELS,
+    "Model Size (MB)": [
+        get_model_size(model) for model in MODELS
+    ],
+    "Score": [
+        5  # compute_model_score(model) for model in MODELS
+    ],
+    "q8 Model Size (MB)": [
+        get_model_size(model + "-q8") for model in MODELS
+    ],
+    "q8 Score": [
+        compute_model_score(model + "-q8") for model in MODELS
+    ],
+}
+
+with open('data.json', 'w') as json_file:
+    json.dump(DATA, json_file)
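
compute_model_score() above assumes MTEB has already written one JSON result file per task under results/<model_name>/ (the gradio import is unused in this file). A minimal sketch of the result shapes the function parses; the field names come from the parsing code above, while the numeric values are invented for illustration:

    # results/<model>/<ClusteringTask>.json must contain at least:
    clustering_result = {"test": {"v_measure": 0.41}}  # illustrative value

    # results/<model>/<PairClassificationTask>.json must contain at least:
    pair_classification_result = {"test": {"max": {"ap": 0.84}}}  # illustrative value

Note that "Score" is hard-coded to 5 with the real computation left in a comment; that placeholder is what later appears in data.json.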
app.py
CHANGED
@@ -4,37 +4,15 @@ import json
 import os
 
 # Given list of tasks for clustering and pair classification
-TASKS_CLUSTERING = [
-    "ArxivClusteringP2P",
-    "ArxivClusteringS2S",
-    "BiorxivClusteringP2P",
-    "BiorxivClusteringS2S",
-    "MedrxivClusteringP2P",
-    "MedrxivClusteringS2S",
-    "RedditClustering",
-    "RedditClusteringP2P",
-    "StackExchangeClustering",
-    "StackExchangeClusteringP2P",
-    "TwentyNewsgroupsClustering",
-]
-
-TASKS_PAIR_CLASSIFICATION = [
-    "SprintDuplicateQuestions",
-    "TwitterSemEval2015",
-    "TwitterURLCorpus",
-]
 
 def display_table():
-    data = {
-        "Model": ["ModelA", "ModelB", "ModelC"],
-        "Model Size (MB)": [293, 793, 1000],
-        "Score": [0.92, 0.85, 0.89],
-        "Quantized Score": [0.91, 0.84, 0.88]
-    }
+    with open('data.json', 'r') as json_file:
+        data = json.load(json_file)
     df = pd.DataFrame(data)
 
-    df
+    df = df.reset_index()
+    df.columns = ['Rank', 'Model', 'Score', 'Quantized Score']
+
     html_table = df.to_html()
 
     html_content = f"""
@@ -49,36 +27,6 @@ def display_table():
     return html_content
 
 
-def compute_model_score(model_name):
-    results_dir = "results"
-    model_dir = os.path.join(results_dir, model_name)
-
-    scores = []
-
-    # Get scores for clustering tasks
-    for task in TASKS_CLUSTERING:
-        task_file = os.path.join(model_dir, f"{task}.json")
-        with open(task_file, 'r') as f:
-            data = json.load(f)
-        v_measure = data['test']['v_measure']
-        scores.append(v_measure)
-
-    # Get scores for pair classification tasks
-    for task in TASKS_PAIR_CLASSIFICATION:
-        task_file = os.path.join(model_dir, f"{task}.json")
-        with open(task_file, 'r') as f:
-            data = json.load(f)
-        max_ap = data['test']['max']['ap']
-        scores.append(max_ap)
-
-    # Compute average score
-    average_score = sum(scores) / len(scores)
-    return average_score
-
-
-# score = compute_model_score("ModelA")
-
-# Create Gradio interface
 iface = gr.Interface(fn=display_table, live=True, inputs=[], outputs="html")
 
 iface.launch()
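
Note that display_table() now renames columns after reset_index(), but data.json carries five keys ("Model", "Model Size (MB)", "Score", "q8 Model Size (MB)", "q8 Score"), so the frame has six columns once the index is reset; assigning four names would raise a pandas length-mismatch error. A sketch of one consistent assignment (the "Rank" label is the committed choice; the remaining names simply mirror the data.json keys):

    df = df.reset_index()
    df.columns = ['Rank', 'Model', 'Model Size (MB)', 'Score',
                  'q8 Model Size (MB)', 'q8 Score']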
data.json
ADDED
@@ -0,0 +1,17 @@
+{
+    "Model": [
+        "all-MiniLM-L6-v2"
+    ],
+    "Model Size (MB)": [
+        86.67845249176025
+    ],
+    "Score": [
+        5
+    ],
+    "q8 Model Size (MB)": [
+        55.91230869293213
+    ],
+    "q8 Score": [
+        0.26228089622461903
+    ]
+}
download_tasks.py
ADDED
@@ -0,0 +1,33 @@
+"""Downloads MTEB tasks"""
+import os
+
+TASK_LIST = [
+    "ArxivClusteringP2P",
+    "ArxivClusteringS2S",
+    "BiorxivClusteringP2P",
+    "BiorxivClusteringS2S",
+    "MedrxivClusteringP2P",
+    "MedrxivClusteringS2S",
+    "RedditClustering",
+    "RedditClusteringP2P",
+    "StackExchangeClustering",
+    "StackExchangeClusteringP2P",
+    "TwentyNewsgroupsClustering",
+
+    "SprintDuplicateQuestions",
+    "TwitterSemEval2015",
+    "TwitterURLCorpus",
+]
+
+
+os.environ["TRANSFORMERS_CACHE"] = "./transformers_cache/"
+os.environ["HF_DATASETS_CACHE"] = "./hf_datasets_cache/"
+os.environ["HF_MODULES_CACHE"] = "./hf_modules_cache/"
+os.environ["HF_METRICS_CACHE"] = "./hf_metrics_cache/"
+
+from mteb import MTEB
+evaluation = MTEB(tasks=TASK_LIST, task_langs=["en"])
+
+for task in evaluation.tasks:
+    print(f"Loading {task}")
+    task.load_data()
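
The cache variables are set before "from mteb import MTEB" on purpose: the Hugging Face libraries read them when they are first imported. This script only downloads the task data; below is a sketch of the follow-up run that would populate the results/ directory read by aggregate_data.py, assuming the classic mteb API (the exact output layout depends on the installed version):

    from sentence_transformers import SentenceTransformer
    from mteb import MTEB

    model_name = "all-MiniLM-L6-v2-q8"
    model = SentenceTransformer(f"models/{model_name}")
    evaluation = MTEB(tasks=TASK_LIST, task_langs=["en"])
    # Writes one <TaskName>.json per task into the output folder.
    evaluation.run(model, output_folder=f"results/{model_name}")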
link.sh
ADDED
@@ -0,0 +1,13 @@
+#!/bin/zsh
+
+OS_NAME="macos"
+COMPILER="clang"
+CONDA_PREFIX=$(conda info --base)
+
+if [[ "$OS_NAME" == "macos" && "$COMPILER" == "clang" ]]; then
+    for LIBOMP_ALIAS in libgomp.dylib libiomp5.dylib libomp.dylib
+    do
+        sudo ln -sf "$(brew --cellar libomp)"/*/lib/libomp.dylib $CONDA_PREFIX/lib/$LIBOMP_ALIAS
+    done
+fi
+
models/all-MiniLM-L6-v2-q8/modules.json
ADDED
File without changes
models/all-MiniLM-L6-v2-q8/sentence_bert_config.json
ADDED
@@ -0,0 +1,4 @@
+{
+    "max_seq_length": 256,
+    "do_lower_case": false
+}
ort.py
ADDED
@@ -0,0 +1,14 @@
+from transformers import AutoTokenizer
+from optimum.onnxruntime import ORTModelForFeatureExtraction
+import torch
+
+tokenizer = AutoTokenizer.from_pretrained("./models/optimum/all-MiniLM-L6-v2")
+model = ORTModelForFeatureExtraction.from_pretrained("./models/optimum/all-MiniLM-L6-v2")
+
+inputs = tokenizer("My name is Philipp and I live in Germany.", return_tensors="np")
+
+outputs = model(**inputs)
+last_hidden_state = outputs.last_hidden_state
+
+# no mean pooling
+print(list(last_hidden_state.shape))
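
The script stops at the raw token-level output, as the "no mean pooling" comment says. A minimal sketch of the usual sentence-transformers pooling step, assuming the NumPy tensors returned above (a mask-weighted mean over the token axis):

    import numpy as np

    def mean_pooling(last_hidden_state, attention_mask):
        # Expand the mask to (batch, seq, 1) so padding tokens drop out of the mean.
        mask = np.expand_dims(attention_mask, -1).astype(last_hidden_state.dtype)
        summed = (last_hidden_state * mask).sum(axis=1)
        counts = np.clip(mask.sum(axis=1), 1e-9, None)
        return summed / counts

    sentence_embedding = mean_pooling(last_hidden_state, inputs["attention_mask"])
    print(sentence_embedding.shape)  # (1, 384) for all-MiniLM-L6-v2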
setup.sh
ADDED
@@ -0,0 +1,23 @@
+#!/bin/bash
+
+conda env remove --name qMTEB -y
+conda create --name qMTEB python=3.9 -y
+
+source activate qMTEB
+
+
+conda install -c intel openmp
+conda install nomkl
+
+conda install pytorch torchvision -c pytorch
+conda install -c conda-forge sentence-transformers
+conda install -c huggingface transformers
+
+pip install mteb
+
+rm -rf results/
+
+source link.sh
+
+echo "Setup completed!"
+
test_convert.py
ADDED
@@ -0,0 +1,49 @@
+import os
+import torch
+from transformers import AutoModel, AutoTokenizer
+from scipy.spatial import distance
+os.environ["TRANSFORMERS_OFFLINE"] = "1"  # 1 for offline
+
+
+def load_model_and_tokenizer(filepath):
+    model = AutoModel.from_pretrained(filepath)
+    tokenizer = AutoTokenizer.from_pretrained(filepath)
+    return model, tokenizer
+
+
+def get_sentence_embeddings(sentences, model, tokenizer):
+    tokens = tokenizer(sentences, return_tensors='pt', padding=True, truncation=True)
+    with torch.no_grad():
+        embeddings = model(**tokens).last_hidden_state.mean(dim=1).numpy()
+    return embeddings
+
+
+def compare_embeddings(path1, path2):
+    model, tokenizer1 = load_model_and_tokenizer(path1)
+    quantized_model, tokenizer2 = load_model_and_tokenizer(path2)
+
+    sentence_embeddings1 = get_sentence_embeddings(sentences, model, tokenizer1)
+    sentence_embeddings2 = get_sentence_embeddings(sentences, quantized_model, tokenizer2)
+
+    for sentence, emb1, emb2 in zip(sentences, sentence_embeddings1, sentence_embeddings2):
+        cosine_similarity = 1 - distance.cosine(emb1, emb2)  # scipy's cosine returns dissimilarity
+        euclidean_distance = distance.euclidean(emb1, emb2)
+
+        print("Sentence:", sentence)
+        print("Embedding1 shape:", emb1.shape)
+        print("Embedding2 shape:", emb2.shape)
+        print("Cosine Similarity:", cosine_similarity)
+        print("Euclidean Distance:", euclidean_distance)
+        print("--------")
+
+
+# Testing the comparison function
+model_filepath = "./models/all-MiniLM-L6-v2"
+quantized_model_filepath = "./models/all-MiniLM-L6-v2-q8"
+sentences = [
+    'This framework generates embeddings for each input sentence',
+    'Sentences are passed as a list of string.',
+    'The quick brown fox jumps over the lazy dog.'
+]
+
+compare_embeddings(model_filepath, quantized_model_filepath)
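
The commit ships the quantized weights but not the script that produced them. One plausible route, shown purely as a sketch (PyTorch dynamic quantization; the actual method behind all-MiniLM-L6-v2-q8 is not part of this commit):

    import torch
    from transformers import AutoModel, AutoTokenizer

    src = "./models/all-MiniLM-L6-v2"
    dst = "./models/all-MiniLM-L6-v2-q8"  # hypothetical reproduction, not the committed artifact

    model = AutoModel.from_pretrained(src)
    # Swap nn.Linear float weights for packed int8 weights.
    q8_model = torch.quantization.quantize_dynamic(model, {torch.nn.Linear}, dtype=torch.qint8)
    q8_model.save_pretrained(dst)
    AutoTokenizer.from_pretrained(src).save_pretrained(dst)

A caveat worth noting: reloading such a checkpoint through a plain AutoModel.from_pretrained(), as test_convert.py does, generally does not restore the quantized modules, which may explain the low q8 cosine similarities and the q8 Score of 0.26 in data.json.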