varun4 committed on
Commit
f70e4f4
·
1 Parent(s): 38f9e2f

no breaking code

Browse files
aggregate_data.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import pandas as pd
3
+ import json
4
+ import os
5
+
6
+
7
+ TASKS_CLUSTERING = [
8
+ "ArxivClusteringP2P",
9
+ "ArxivClusteringS2S",
10
+ "BiorxivClusteringP2P",
11
+ "BiorxivClusteringS2S",
12
+ "MedrxivClusteringP2P",
13
+ "MedrxivClusteringS2S",
14
+ "RedditClustering",
15
+ "RedditClusteringP2P",
16
+ "StackExchangeClustering",
17
+ "StackExchangeClusteringP2P",
18
+ "TwentyNewsgroupsClustering",
19
+ ]
20
+
21
+ TASKS_PAIR_CLASSIFICATION = [
22
+ "SprintDuplicateQuestions",
23
+ "TwitterSemEval2015",
24
+ "TwitterURLCorpus",
25
+ ]
26
+
27
+
28
+ MODELS = [
29
+ "all-MiniLM-L6-v2"
30
+ ]
31
+
32
+
33
def get_model_size(model_name):
    """Return the size of a model's weight file in megabytes (1024 * 1024 bytes).

    The file measured is ``models/<model_name>/pytorch_model.bin``, resolved
    relative to the current working directory.
    """
    bytes_per_mb = 1024.0 * 1024.0
    weights_path = f"models/{model_name}/pytorch_model.bin"
    size_in_bytes = os.path.getsize(weights_path)
    return size_in_bytes / bytes_per_mb
35
+
36
+
37
def compute_model_score(model_name):
    """Return the average benchmark score recorded for one model.

    For every task in ``TASKS_CLUSTERING`` and ``TASKS_PAIR_CLASSIFICATION``
    the file ``results/<model_name>/<task>.json`` is read. Clustering tasks
    contribute ``test.v_measure``; pair-classification tasks contribute
    ``test.max.ap``. The result is the unweighted mean over all tasks.
    """
    model_dir = os.path.join("results", model_name)

    def read_test_split(task):
        # Each task has its own result file named after the task.
        with open(os.path.join(model_dir, f"{task}.json"), 'r') as handle:
            return json.load(handle)['test']

    collected = [read_test_split(task)['v_measure'] for task in TASKS_CLUSTERING]
    collected.extend(
        read_test_split(task)['max']['ap'] for task in TASKS_PAIR_CLASSIFICATION
    )

    return sum(collected) / len(collected)
62
+
63
+
64
# Column-oriented leaderboard table written to data.json. Every column must
# hold exactly one entry per model in MODELS; ragged columns cannot be turned
# into a DataFrame by the consumer of data.json.
DATA = {
    "Model": MODELS,
    "Model Size (MB)": [
        get_model_size(model) for model in MODELS
    ],
    # TODO: placeholder value; switch to compute_model_score(model) once the
    # baseline results are available. Emitted once per model so the column
    # length stays in step with MODELS.
    "Score": [
        5 for _ in MODELS
    ],
    "q8 Model Size (MB)": [
        get_model_size(model + "-q8") for model in MODELS
    ],
    "q8 Score": [
        compute_model_score(model + "-q8") for model in MODELS
    ],
}

with open('data.json', 'w') as json_file:
    json.dump(DATA, json_file)
82
+
83
+
84
+
app.py CHANGED
@@ -4,37 +4,15 @@ import json
4
  import os
5
 
6
  # Given list of tasks for clustering and pair classification
7
- TASKS_CLUSTERING = [
8
- "ArxivClusteringP2P",
9
- "ArxivClusteringS2S",
10
- "BiorxivClusteringP2P",
11
- "BiorxivClusteringS2S",
12
- "MedrxivClusteringP2P",
13
- "MedrxivClusteringS2S",
14
- "RedditClustering",
15
- "RedditClusteringP2P",
16
- "StackExchangeClustering",
17
- "StackExchangeClusteringP2P",
18
- "TwentyNewsgroupsClustering",
19
- ]
20
-
21
- TASKS_PAIR_CLASSIFICATION = [
22
- "SprintDuplicateQuestions",
23
- "TwitterSemEval2015",
24
- "TwitterURLCorpus",
25
- ]
26
 
27
  def display_table():
28
- # Create a sample dataframe
29
- data = {
30
- "Model": ["ModelA", "ModelB", "ModelC"],
31
- "Model Size (MB)": [293, 793, 1000],
32
- "Score": [0.92, 0.85, 0.89],
33
- "Quantized Score": [0.91, 0.84, 0.88]
34
- }
35
  df = pd.DataFrame(data)
36
 
37
- df.index.name = "Rank"
 
 
38
  html_table = df.to_html()
39
 
40
  html_content = f"""
@@ -49,36 +27,6 @@ def display_table():
49
  return html_content
50
 
51
 
52
- def compute_model_score(model_name):
53
- results_dir = "results"
54
- model_dir = os.path.join(results_dir, model_name)
55
-
56
- scores = []
57
-
58
- # Get scores for clustering tasks
59
- for task in TASKS_CLUSTERING:
60
- task_file = os.path.join(model_dir, f"{task}.json")
61
- with open(task_file, 'r') as f:
62
- data = json.load(f)
63
- v_measure = data['test']['v_measure']
64
- scores.append(v_measure)
65
-
66
- # Get scores for pair classification tasks
67
- for task in TASKS_PAIR_CLASSIFICATION:
68
- task_file = os.path.join(model_dir, f"{task}.json")
69
- with open(task_file, 'r') as f:
70
- data = json.load(f)
71
- max_ap = data['test']['max']['ap']
72
- scores.append(max_ap)
73
-
74
- # Compute average score
75
- average_score = sum(scores) / len(scores)
76
- return average_score
77
-
78
-
79
- # score = compute_model_score("ModelA")
80
-
81
- # Create Gradio interface
82
  iface = gr.Interface(fn=display_table, live=True, inputs=[], outputs="html")
83
 
84
  iface.launch()
 
4
  import os
5
 
6
  # Given list of tasks for clustering and pair classification
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
 
8
  def display_table():
9
+ with open('data.json', 'r') as json_file:
10
+ data = json.load(json_file)
 
 
 
 
 
11
  df = pd.DataFrame(data)
12
 
13
+ df = df.reset_index()
14
+ df.columns = ['Rank', 'Model', 'Score', 'Quantized Score']
15
+
16
  html_table = df.to_html()
17
 
18
  html_content = f"""
 
27
  return html_content
28
 
29
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
  iface = gr.Interface(fn=display_table, live=True, inputs=[], outputs="html")
31
 
32
  iface.launch()
data.json ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "Model": [
3
+ "all-MiniLM-L6-v2"
4
+ ],
5
+ "Model Size (MB)": [
6
+ 86.67845249176025
7
+ ],
8
+ "Score": [
9
+ 5
10
+ ],
11
+ "q8 Model Size (MB)": [
12
+ 55.91230869293213
13
+ ],
14
+ "q8 Score": [
15
+ 0.26228089622461903
16
+ ]
17
+ }
download_tasks.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Downloads MTEB tasks"""
2
+ import os
3
+
4
TASK_LIST = [
    # Clustering tasks
    "ArxivClusteringP2P",
    "ArxivClusteringS2S",
    "BiorxivClusteringP2P",
    "BiorxivClusteringS2S",
    "MedrxivClusteringP2P",
    "MedrxivClusteringS2S",
    "RedditClustering",
    "RedditClusteringP2P",
    "StackExchangeClustering",
    "StackExchangeClusteringP2P",
    "TwentyNewsgroupsClustering",

    # Pair classification tasks
    "SprintDuplicateQuestions",
    "TwitterSemEval2015",
    "TwitterURLCorpus",
]


# Cache directories are set before mteb is imported below.
# NOTE(review): presumably the Hugging Face libraries read these at import
# time, which is why the import comes after -- confirm.
os.environ.update({
    "TRANSFORMERS_CACHE": "./transformers_cache/",
    "HF_DATASETS_CACHE": "./hf_datasets_cache/",
    "HF_MODULES_CACHE": "./hf_modules_cache/",
    "HF_METRICS_CACHE": "./hf_metrics_cache/",
})

from mteb import MTEB
evaluation = MTEB(tasks=TASK_LIST, task_langs=["en"])

# Load the data of every selected task, announcing each one first.
for mteb_task in evaluation.tasks:
    print(f"Loading {mteb_task}")
    mteb_task.load_data()
link.sh ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/zsh
# Symlinks Homebrew's libomp.dylib into the conda base lib directory under
# the three OpenMP library names listed in the loop below.

OS_NAME="macos"
COMPILER="clang"
# Kept in its own variable: setup.sh sources this file after activating an
# environment, and assigning CONDA_PREFIX here would overwrite the value in
# the calling shell.
CONDA_BASE=$(conda info --base)

if [[ "$OS_NAME" == "macos" && "$COMPILER" == "clang" ]]; then
    for LIBOMP_ALIAS in libgomp.dylib libiomp5.dylib libomp.dylib
    do
        # Target is quoted so a conda install path containing spaces still works.
        sudo ln -sf "$(brew --cellar libomp)"/*/lib/libomp.dylib "$CONDA_BASE/lib/$LIBOMP_ALIAS"
    done
fi
13
+
models/all-MiniLM-L6-v2-q8/modules.json ADDED
File without changes
models/all-MiniLM-L6-v2-q8/sentence_bert_config.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "max_seq_length": 256,
3
+ "do_lower_case": false
4
+ }
ort.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import AutoTokenizer
2
+ from optimum.onnxruntime import ORTModelForFeatureExtraction
3
+ import torch
4
+
5
# Tokenizer and model are both loaded from the same local directory.
MODEL_DIR = "./models/optimum/all-MiniLM-L6-v2"

tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
model = ORTModelForFeatureExtraction.from_pretrained(MODEL_DIR)

# Run a single example sentence through the model.
inputs = tokenizer("My name is Philipp and I live in Germany.", return_tensors="np")
outputs = model(**inputs)
last_hidden_state = outputs.last_hidden_state

# The raw hidden state is reported as-is; no mean pooling is applied here.
print(list(last_hidden_state.shape))
setup.sh ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
# Rebuilds the qMTEB conda environment from scratch, installs the
# dependencies, clears old results and links the OpenMP libraries.

conda env remove --name qMTEB -y
conda create --name qMTEB python=3.9 -y

source activate qMTEB


# -y on every install, matching the env commands above, so the script runs
# unattended instead of stopping at a confirmation prompt.
conda install -y -c intel openmp
conda install -y nomkl

conda install -y pytorch torchvision -c pytorch
conda install -y -c conda-forge sentence-transformers
conda install -y -c huggingface transformers

pip install mteb

# Remove any results directory left by an earlier run.
rm -rf results/

source link.sh

echo "Setup completed!"
23
+
test_convert.py ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import torch
3
+ from transformers import AutoModel, AutoTokenizer
4
+ from scipy.spatial import distance
5
+ os.environ["TRANSFORMERS_OFFLINE"] = "1" # 1 for offline
6
+
7
+
8
def load_model_and_tokenizer(filepath):
    """Load a model and its tokenizer from the same location.

    Args:
        filepath: path passed unchanged to both ``from_pretrained`` calls.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    # The model is loaded first, then the tokenizer, as tuple items evaluate
    # left to right.
    return AutoModel.from_pretrained(filepath), AutoTokenizer.from_pretrained(filepath)
12
+
13
+
14
def get_sentence_embeddings(sentences, model, tokenizer):
    """Embed each sentence as the mean of its real (non-padding) token vectors.

    Args:
        sentences: the text(s) to embed, passed straight to ``tokenizer``.
        model: callable that accepts the tokenizer output as keyword
            arguments and returns an object with a ``last_hidden_state``
            tensor indexed as (sentence, token, feature).
        tokenizer: callable that returns a mapping of tensors including an
            ``attention_mask`` entry (1 for real tokens, 0 for padding).

    Returns:
        A NumPy array with one embedding row per sentence.
    """
    tokens = tokenizer(sentences, return_tensors='pt', padding=True, truncation=True)
    with torch.no_grad():
        hidden = model(**tokens).last_hidden_state
        # padding=True pads shorter sentences up to the longest one. A plain
        # mean over the token axis would average those padding vectors in,
        # so weight every position by the attention mask instead.
        mask = tokens["attention_mask"].unsqueeze(-1).to(hidden.dtype)
        summed = (hidden * mask).sum(dim=1)
        # Guard against division by zero for a row with no real tokens.
        counts = mask.sum(dim=1).clamp(min=1e-9)
        embeddings = (summed / counts).numpy()
    return embeddings
19
+
20
+
21
def compare_embeddings(path1, path2):
    """Print how closely two models agree on each sentence's embedding.

    Loads a model/tokenizer pair from each path, embeds the module-level
    ``sentences`` list with both, and prints for every sentence the two
    embedding shapes, their cosine similarity and their Euclidean distance.

    Args:
        path1: location of the first model.
        path2: location of the second model.
    """
    reference_model, reference_tokenizer = load_model_and_tokenizer(path1)
    candidate_model, candidate_tokenizer = load_model_and_tokenizer(path2)

    reference_vectors = get_sentence_embeddings(sentences, reference_model, reference_tokenizer)
    candidate_vectors = get_sentence_embeddings(sentences, candidate_model, candidate_tokenizer)

    for text, vec_a, vec_b in zip(sentences, reference_vectors, candidate_vectors):
        report = (
            ("Sentence:", text),
            ("Embedding1 shape:", vec_a.shape),
            ("Embedding2 shape:", vec_b.shape),
            # scipy returns the cosine *distance*; similarity is its complement.
            ("Cosine Similarity:", 1 - distance.cosine(vec_a, vec_b)),
            ("Euclidean Distance:", distance.euclidean(vec_a, vec_b)),
        )
        for label, value in report:
            print(label, value)
        print("--------")
38
+
39
+
40
+ # Testing the comparison function
41
+ model_filepath = "./models/all-MiniLM-L6-v2"
42
+ quantized_model_filepath = "./models/all-MiniLM-L6-v2-q8"
43
+ sentences = [
44
+ 'This framework generates embeddings for each input sentence',
45
+ 'Sentences are passed as a list of string.',
46
+ 'The quick brown fox jumps over the lazy dog.'
47
+ ]
48
+
49
+ compare_embeddings(model_filepath, quantized_model_filepath)