no breaking code
- aggregate_data.py +84 -0
- app.py +5 -57
- data.json +17 -0
- download_tasks.py +33 -0
- link.sh +13 -0
- models/all-MiniLM-L6-v2-q8/modules.json +0 -0
- models/all-MiniLM-L6-v2-q8/sentence_bert_config.json +4 -0
- ort.py +14 -0
- setup.sh +23 -0
- test_convert.py +49 -0
aggregate_data.py
ADDED
@@ -0,0 +1,84 @@
+import gradio as gr
+import pandas as pd
+import json
+import os
+
+
+TASKS_CLUSTERING = [
+    "ArxivClusteringP2P",
+    "ArxivClusteringS2S",
+    "BiorxivClusteringP2P",
+    "BiorxivClusteringS2S",
+    "MedrxivClusteringP2P",
+    "MedrxivClusteringS2S",
+    "RedditClustering",
+    "RedditClusteringP2P",
+    "StackExchangeClustering",
+    "StackExchangeClusteringP2P",
+    "TwentyNewsgroupsClustering",
+]
+
+TASKS_PAIR_CLASSIFICATION = [
+    "SprintDuplicateQuestions",
+    "TwitterSemEval2015",
+    "TwitterURLCorpus",
+]
+
+
+MODELS = [
+    "all-MiniLM-L6-v2"
+]
+
+
+def get_model_size(model_name):
+    return os.path.getsize(f"models/{model_name}/pytorch_model.bin") / (1024.0 * 1024.0)
+
+
+def compute_model_score(model_name):
+    results_dir = "results"
+    model_dir = os.path.join(results_dir, model_name)
+
+    scores = []
+
+    # Get scores for clustering tasks
+    for task in TASKS_CLUSTERING:
+        task_file = os.path.join(model_dir, f"{task}.json")
+        with open(task_file, 'r') as f:
+            data = json.load(f)
+        v_measure = data['test']['v_measure']
+        scores.append(v_measure)
+
+    # Get scores for pair classification tasks
+    for task in TASKS_PAIR_CLASSIFICATION:
+        task_file = os.path.join(model_dir, f"{task}.json")
+        with open(task_file, 'r') as f:
+            data = json.load(f)
+        max_ap = data['test']['max']['ap']
+        scores.append(max_ap)
+
+    # Compute average score
+    average_score = sum(scores) / len(scores)
+    return average_score
+
+
+DATA = {
+    "Model": MODELS,
+    "Model Size (MB)": [
+        get_model_size(model) for model in MODELS
+    ],
+    "Score": [
+        5  # compute_model_score(model) for model in MODELS
+    ],
+    "q8 Model Size (MB)": [
+        get_model_size(model + "-q8") for model in MODELS
+    ],
+    "q8 Score": [
+        compute_model_score(model + "-q8") for model in MODELS
+    ],
+}
+
+with open('data.json', 'w') as json_file:
+    json.dump(DATA, json_file)
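
compute_model_score() above assumes MTEB has already written one JSON result file per task under results/<model_name>/ (the gradio import is unused in this file). A minimal sketch of the result shapes the function parses; the field names come from the parsing code above, while the numeric values are invented for illustration:

    # results/<model>/<ClusteringTask>.json must contain at least:
    clustering_result = {"test": {"v_measure": 0.41}}  # illustrative value

    # results/<model>/<PairClassificationTask>.json must contain at least:
    pair_classification_result = {"test": {"max": {"ap": 0.84}}}  # illustrative value

Note that "Score" is hard-coded to 5 with the real computation left in a comment; that placeholder is what later appears in data.json.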
app.py
CHANGED
@@ -4,37 +4,15 @@ import json
 import os
 
 # Given list of tasks for clustering and pair classification
-TASKS_CLUSTERING = [
-    "ArxivClusteringP2P",
-    "ArxivClusteringS2S",
-    "BiorxivClusteringP2P",
-    "BiorxivClusteringS2S",
-    "MedrxivClusteringP2P",
-    "MedrxivClusteringS2S",
-    "RedditClustering",
-    "RedditClusteringP2P",
-    "StackExchangeClustering",
-    "StackExchangeClusteringP2P",
-    "TwentyNewsgroupsClustering",
-]
-
-TASKS_PAIR_CLASSIFICATION = [
-    "SprintDuplicateQuestions",
-    "TwitterSemEval2015",
-    "TwitterURLCorpus",
-]
 
 def display_table():
-    data = {
-        "Model": ["ModelA", "ModelB", "ModelC"],
-        "Model Size (MB)": [293, 793, 1000],
-        "Score": [0.92, 0.85, 0.89],
-        "Quantized Score": [0.91, 0.84, 0.88]
-    }
+    with open('data.json', 'r') as json_file:
+        data = json.load(json_file)
     df = pd.DataFrame(data)
 
-    df
+    df = df.reset_index()
+    df.columns = ['Rank', 'Model', 'Score', 'Quantized Score']
+
     html_table = df.to_html()
 
     html_content = f"""
@@ -49,36 +27,6 @@ def display_table():
     return html_content
 
 
-def compute_model_score(model_name):
-    results_dir = "results"
-    model_dir = os.path.join(results_dir, model_name)
-
-    scores = []
-
-    # Get scores for clustering tasks
-    for task in TASKS_CLUSTERING:
-        task_file = os.path.join(model_dir, f"{task}.json")
-        with open(task_file, 'r') as f:
-            data = json.load(f)
-        v_measure = data['test']['v_measure']
-        scores.append(v_measure)
-
-    # Get scores for pair classification tasks
-    for task in TASKS_PAIR_CLASSIFICATION:
-        task_file = os.path.join(model_dir, f"{task}.json")
-        with open(task_file, 'r') as f:
-            data = json.load(f)
-        max_ap = data['test']['max']['ap']
-        scores.append(max_ap)
-
-    # Compute average score
-    average_score = sum(scores) / len(scores)
-    return average_score
-
-
-# score = compute_model_score("ModelA")
-
-# Create Gradio interface
 iface = gr.Interface(fn=display_table, live=True, inputs=[], outputs="html")
 
 iface.launch()
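
Note that display_table() now renames columns after reset_index(), but data.json carries five keys ("Model", "Model Size (MB)", "Score", "q8 Model Size (MB)", "q8 Score"), so the frame has six columns once the index is reset; assigning four names would raise a pandas length-mismatch error. A sketch of one consistent assignment (the "Rank" label is the committed choice; the remaining names simply mirror the data.json keys):

    df = df.reset_index()
    df.columns = ['Rank', 'Model', 'Model Size (MB)', 'Score',
                  'q8 Model Size (MB)', 'q8 Score']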
data.json
ADDED
@@ -0,0 +1,17 @@
+{
+    "Model": [
+        "all-MiniLM-L6-v2"
+    ],
+    "Model Size (MB)": [
+        86.67845249176025
+    ],
+    "Score": [
+        5
+    ],
+    "q8 Model Size (MB)": [
+        55.91230869293213
+    ],
+    "q8 Score": [
+        0.26228089622461903
+    ]
+}
download_tasks.py
ADDED
@@ -0,0 +1,33 @@
+"""Downloads MTEB tasks"""
+import os
+
+TASK_LIST = [
+    "ArxivClusteringP2P",
+    "ArxivClusteringS2S",
+    "BiorxivClusteringP2P",
+    "BiorxivClusteringS2S",
+    "MedrxivClusteringP2P",
+    "MedrxivClusteringS2S",
+    "RedditClustering",
+    "RedditClusteringP2P",
+    "StackExchangeClustering",
+    "StackExchangeClusteringP2P",
+    "TwentyNewsgroupsClustering",
+
+    "SprintDuplicateQuestions",
+    "TwitterSemEval2015",
+    "TwitterURLCorpus",
+]
+
+
+os.environ["TRANSFORMERS_CACHE"] = "./transformers_cache/"
+os.environ["HF_DATASETS_CACHE"] = "./hf_datasets_cache/"
+os.environ["HF_MODULES_CACHE"] = "./hf_modules_cache/"
+os.environ["HF_METRICS_CACHE"] = "./hf_metrics_cache/"
+
+from mteb import MTEB
+evaluation = MTEB(tasks=TASK_LIST, task_langs=["en"])
+
+for task in evaluation.tasks:
+    print(f"Loading {task}")
+    task.load_data()
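
The cache variables are set before "from mteb import MTEB" on purpose: the Hugging Face libraries read them when they are first imported. This script only downloads the task data; below is a sketch of the follow-up run that would populate the results/ directory read by aggregate_data.py, assuming the classic mteb API (the exact output layout depends on the installed version):

    from sentence_transformers import SentenceTransformer
    from mteb import MTEB

    model_name = "all-MiniLM-L6-v2-q8"
    model = SentenceTransformer(f"models/{model_name}")
    evaluation = MTEB(tasks=TASK_LIST, task_langs=["en"])
    # Writes one <TaskName>.json per task into the output folder.
    evaluation.run(model, output_folder=f"results/{model_name}")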
link.sh
ADDED
@@ -0,0 +1,13 @@
+#!/bin/zsh
+
+OS_NAME="macos"
+COMPILER="clang"
+CONDA_PREFIX=$(conda info --base)
+
+if [[ "$OS_NAME" == "macos" && "$COMPILER" == "clang" ]]; then
+    for LIBOMP_ALIAS in libgomp.dylib libiomp5.dylib libomp.dylib
+    do
+        sudo ln -sf "$(brew --cellar libomp)"/*/lib/libomp.dylib $CONDA_PREFIX/lib/$LIBOMP_ALIAS
+    done
+fi
+
models/all-MiniLM-L6-v2-q8/modules.json
ADDED
File without changes
models/all-MiniLM-L6-v2-q8/sentence_bert_config.json
ADDED
@@ -0,0 +1,4 @@
+{
+    "max_seq_length": 256,
+    "do_lower_case": false
+}
ort.py
ADDED
@@ -0,0 +1,14 @@
+from transformers import AutoTokenizer
+from optimum.onnxruntime import ORTModelForFeatureExtraction
+import torch
+
+tokenizer = AutoTokenizer.from_pretrained("./models/optimum/all-MiniLM-L6-v2")
+model = ORTModelForFeatureExtraction.from_pretrained("./models/optimum/all-MiniLM-L6-v2")
+
+inputs = tokenizer("My name is Philipp and I live in Germany.", return_tensors="np")
+
+outputs = model(**inputs)
+last_hidden_state = outputs.last_hidden_state
+
+# no mean pooling
+print(list(last_hidden_state.shape))
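
The script stops at the raw token-level output, as the "no mean pooling" comment says. A minimal sketch of the usual sentence-transformers pooling step, assuming the NumPy tensors returned above (a mask-weighted mean over the token axis):

    import numpy as np

    def mean_pooling(last_hidden_state, attention_mask):
        # Expand the mask to (batch, seq, 1) so padding tokens drop out of the mean.
        mask = np.expand_dims(attention_mask, -1).astype(last_hidden_state.dtype)
        summed = (last_hidden_state * mask).sum(axis=1)
        counts = np.clip(mask.sum(axis=1), 1e-9, None)
        return summed / counts

    sentence_embedding = mean_pooling(last_hidden_state, inputs["attention_mask"])
    print(sentence_embedding.shape)  # (1, 384) for all-MiniLM-L6-v2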
setup.sh
ADDED
@@ -0,0 +1,23 @@
+#!/bin/bash
+
+conda env remove --name qMTEB -y
+conda create --name qMTEB python=3.9 -y
+
+source activate qMTEB
+
+
+conda install -c intel openmp
+conda install nomkl
+
+conda install pytorch torchvision -c pytorch
+conda install -c conda-forge sentence-transformers
+conda install -c huggingface transformers
+
+pip install mteb
+
+rm -rf results/
+
+source link.sh
+
+echo "Setup completed!"
+
test_convert.py
ADDED
@@ -0,0 +1,49 @@
+import os
+import torch
+from transformers import AutoModel, AutoTokenizer
+from scipy.spatial import distance
+os.environ["TRANSFORMERS_OFFLINE"] = "1"  # 1 for offline
+
+
+def load_model_and_tokenizer(filepath):
+    model = AutoModel.from_pretrained(filepath)
+    tokenizer = AutoTokenizer.from_pretrained(filepath)
+    return model, tokenizer
+
+
+def get_sentence_embeddings(sentences, model, tokenizer):
+    tokens = tokenizer(sentences, return_tensors='pt', padding=True, truncation=True)
+    with torch.no_grad():
+        embeddings = model(**tokens).last_hidden_state.mean(dim=1).numpy()
+    return embeddings
+
+
+def compare_embeddings(path1, path2):
+    model, tokenizer1 = load_model_and_tokenizer(path1)
+    quantized_model, tokenizer2 = load_model_and_tokenizer(path2)
+
+    sentence_embeddings1 = get_sentence_embeddings(sentences, model, tokenizer1)
+    sentence_embeddings2 = get_sentence_embeddings(sentences, quantized_model, tokenizer2)
+
+    for sentence, emb1, emb2 in zip(sentences, sentence_embeddings1, sentence_embeddings2):
+        cosine_similarity = 1 - distance.cosine(emb1, emb2)  # scipy's cosine returns dissimilarity
+        euclidean_distance = distance.euclidean(emb1, emb2)
+
+        print("Sentence:", sentence)
+        print("Embedding1 shape:", emb1.shape)
+        print("Embedding2 shape:", emb2.shape)
+        print("Cosine Similarity:", cosine_similarity)
+        print("Euclidean Distance:", euclidean_distance)
+        print("--------")
+
+
+# Testing the comparison function
+model_filepath = "./models/all-MiniLM-L6-v2"
+quantized_model_filepath = "./models/all-MiniLM-L6-v2-q8"
+sentences = [
+    'This framework generates embeddings for each input sentence',
+    'Sentences are passed as a list of string.',
+    'The quick brown fox jumps over the lazy dog.'
+]
+
+compare_embeddings(model_filepath, quantized_model_filepath)
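
The commit ships the quantized weights but not the script that produced them. One plausible route, shown purely as a sketch (PyTorch dynamic quantization; the actual method behind all-MiniLM-L6-v2-q8 is not part of this commit):

    import torch
    from transformers import AutoModel, AutoTokenizer

    src = "./models/all-MiniLM-L6-v2"
    dst = "./models/all-MiniLM-L6-v2-q8"  # hypothetical reproduction, not the committed artifact

    model = AutoModel.from_pretrained(src)
    # Swap nn.Linear float weights for packed int8 weights.
    q8_model = torch.quantization.quantize_dynamic(model, {torch.nn.Linear}, dtype=torch.qint8)
    q8_model.save_pretrained(dst)
    AutoTokenizer.from_pretrained(src).save_pretrained(dst)

A caveat worth noting: reloading such a checkpoint through a plain AutoModel.from_pretrained(), as test_convert.py does, generally does not restore the quantized modules, which may explain the low q8 cosine similarities and the q8 Score of 0.26 in data.json.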