taskswithcode committed
Commit: d872b74
1 Parent(s): 515e4d1
app.py CHANGED
@@ -6,6 +6,7 @@ from io import StringIO
 import pdb
 import json
 from twc_embeddings import HFModel,SimCSEModel,SGPTModel,CausalLMModel,SGPTQnAModel
+from twc_openai_embeddings import OpenAIModel
 from twc_clustering import TWCClustering
 import torch
 import requests
@@ -60,7 +61,7 @@ def get_views(action):
 
 def construct_model_info_for_display(model_names):
     options_arr = []
-    markdown_str = f"<div style=\"font-size:16px; color: #2f2f2f; text-align: left\"><br/><b>Models evaluated ({len(model_names)})</b><br/><i>These are either state-of-the-art or the most downloaded models on Hugging Face</i></div>"
+    markdown_str = f"<div style=\"font-size:16px; color: #2f2f2f; text-align: left\"><br/><b>Models evaluated ({len(model_names)})</b><br/><i>The selected models satisfy one or more of the following (1) state-of-the-art (2) the most downloaded models on Hugging Face (3) Large Language Models (e.g. GPT-3)</i></div>"
     markdown_str += f"<div style=\"font-size:2px; color: #2f2f2f; text-align: left\"><br/></div>"
     for node in model_names:
         options_arr.append(node["name"])
@@ -96,22 +97,22 @@ def load_model(model_name,model_class,load_model_name):
         ret_model.init_model(load_model_name)
         assert(ret_model is not None)
     except Exception as e:
-        st.error("Unable to load model:" + model_name + " " + load_model_name + " " + str(e))
+        st.error(f"Unable to load model class:{model_class} model_name: {model_name} load_model_name: {load_model_name} {str(e)}")
         pass
     return ret_model
 
 
 
 @st.experimental_memo
-def cached_compute_similarity(sentences,_model,model_name,threshold,_cluster,clustering_type):
-    texts,embeddings = _model.compute_embeddings(sentences,is_file=False)
+def cached_compute_similarity(input_file_name,sentences,_model,model_name,threshold,_cluster,clustering_type):
+    texts,embeddings = _model.compute_embeddings(input_file_name,sentences,is_file=False)
     results = _cluster.cluster(None,texts,embeddings,threshold,clustering_type)
     return results
 
 
-def uncached_compute_similarity(sentences,_model,model_name,threshold,cluster,clustering_type):
+def uncached_compute_similarity(input_file_name,sentences,_model,model_name,threshold,cluster,clustering_type):
     with st.spinner('Computing vectors for sentences'):
-        texts,embeddings = _model.compute_embeddings(sentences,is_file=False)
+        texts,embeddings = _model.compute_embeddings(input_file_name,sentences,is_file=False)
         results = cluster.cluster(None,texts,embeddings,threshold,clustering_type)
         #st.success("Similarity computation complete")
         return results
@@ -124,7 +125,7 @@ def get_model_info(model_names,model_name):
     return get_model_info(model_names,DEFAULT_HF_MODEL)
 
 
-def run_test(model_names,model_name,sentences,display_area,threshold,user_uploaded,custom_model,clustering_type):
+def run_test(model_names,model_name,input_file_name,sentences,display_area,threshold,user_uploaded,custom_model,clustering_type):
     display_area.text("Loading model:" + model_name)
     #Note. model_name may get mapped to new name in the call below for custom models
     orig_model_name = model_name
@@ -136,14 +137,18 @@ def run_test(model_names,model_name,sentences,display_area,threshold,user_upload
     if ("Note" in model_info):
         fail_link = f"{model_info['Note']} [link]({model_info['alt_url']})"
         display_area.write(fail_link)
+    if (user_uploaded and "custom_load" in model_info and model_info["custom_load"] == "False"):
+        fail_link = f"{model_info['Note']} [link]({model_info['alt_url']})"
+        display_area.write(fail_link)
+        return {"error":fail_link}
     model = load_model(model_name,model_info["class"],load_model_name)
     display_area.text("Model " + model_name + " load complete")
     try:
         if (user_uploaded):
-            results = uncached_compute_similarity(sentences,model,model_name,threshold,st.session_state["cluster"],clustering_type)
+            results = uncached_compute_similarity(input_file_name,sentences,model,model_name,threshold,st.session_state["cluster"],clustering_type)
         else:
             display_area.text("Computing vectors for sentences")
-            results = cached_compute_similarity(sentences,model,model_name,threshold,st.session_state["cluster"],clustering_type)
+            results = cached_compute_similarity(input_file_name,sentences,model,model_name,threshold,st.session_state["cluster"],clustering_type)
         display_area.text("Similarity computation complete")
         return results
 
@@ -263,15 +268,18 @@ def app_main(app_mode,example_files,model_name_files,clus_types):
        st.session_state["model_name"] = selected_model
        st.session_state["threshold"] = threshold
        st.session_state["overlapped"] = cluster_types[clustering_type]["type"]
-       results = run_test(model_names,run_model,sentences,display_area,threshold,(uploaded_file is not None),(len(custom_model_selection) != 0),cluster_types[clustering_type]["type"])
+       results = run_test(model_names,run_model,st.session_state["file_name"],sentences,display_area,threshold,(uploaded_file is not None),(len(custom_model_selection) != 0),cluster_types[clustering_type]["type"])
        display_area.empty()
        with display_area.container():
-           device = 'GPU' if torch.cuda.is_available() else 'CPU'
-           response_info = f"Computation time on {device}: {time.time() - start:.2f} secs for {len(sentences)} sentences"
-           if (len(custom_model_selection) != 0):
-               st.info("Custom model overrides model selection in step 2 above. So please clear the custom model text box to choose models from step 2")
-           display_results(sentences,results,response_info,app_mode,run_model)
-           #st.json(results)
+           if ("error" in results):
+               st.error(results["error"])
+           else:
+               device = 'GPU' if torch.cuda.is_available() else 'CPU'
+               response_info = f"Computation time on {device}: {time.time() - start:.2f} secs for {len(sentences)} sentences"
+               if (len(custom_model_selection) != 0):
+                   st.info("Custom model overrides model selection in step 2 above. So please clear the custom model text box to choose models from step 2")
+               display_results(sentences,results,response_info,app_mode,run_model)
+               #st.json(results)
           st.download_button(
               label="Download results as json",
               data= st.session_state["download_ready"] if st.session_state["download_ready"] != None else "",
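
The caching split above leans on Streamlit's st.experimental_memo: arguments are hashed to form the cache key, except parameters whose names start with an underscore, which Streamlit skips. A minimal sketch of that pattern, with illustrative names (not part of the commit):

import streamlit as st

@st.experimental_memo
def cached_compute(input_file_name, sentences, _model, threshold):
    # _model is excluded from hashing (leading underscore), so the
    # unhashable model object does not break memoization; the new
    # input_file_name string participates in the cache key instead.
    texts, embeddings = _model.compute_embeddings(input_file_name, sentences, is_file=False)
    return texts, embeddings
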
clus_app_models.json CHANGED
@@ -84,7 +84,67 @@
     },
     "paper_url":"https://arxiv.org/abs/2104.08821v4",
     "mark":"True",
-    "class":"SimCSEModel","sota_link":"https://paperswithcode.com/sota/semantic-textual-similarity-on-sick"}
+    "class":"SimCSEModel","sota_link":"https://paperswithcode.com/sota/semantic-textual-similarity-on-sick"},
+    { "name":"GPT-3-175B (text-similarity-davinci-001)" ,
+      "model":"text-similarity-davinci-001",
+      "fork_url":"https://openai.com/api/",
+      "orig_author_url":"https://openai.com/api/",
+      "orig_author":"OpenAI",
+      "sota_info": {
+          "task":"GPT-3 achieves strong zero-shot and few-shot performance on many NLP datasets etc.",
+          "sota_link":"https://paperswithcode.com/method/gpt-3"
+      },
+      "paper_url":"https://arxiv.org/abs/2005.14165v4",
+      "mark":"True",
+      "custom_load":"False",
+      "Note":"Custom file upload requires OpenAI API access to create embeddings. For API access, use this link ",
+      "alt_url":"https://openai.com/api/",
+      "class":"OpenAIModel","sota_link":"https://arxiv.org/abs/2005.14165v4"},
+    { "name":"GPT-3-6.7B (text-similarity-curie-001)" ,
+      "model":"text-similarity-curie-001",
+      "fork_url":"https://openai.com/api/",
+      "orig_author_url":"https://openai.com/api/",
+      "orig_author":"OpenAI",
+      "sota_info": {
+          "task":"GPT-3 achieves strong zero-shot and few-shot performance on many NLP datasets etc.",
+          "sota_link":"https://paperswithcode.com/method/gpt-3"
+      },
+      "paper_url":"https://arxiv.org/abs/2005.14165v4",
+      "mark":"True",
+      "custom_load":"False",
+      "Note":"Custom file upload requires OpenAI API access to create embeddings. For API access, use this link ",
+      "alt_url":"https://openai.com/api/",
+      "class":"OpenAIModel","sota_link":"https://arxiv.org/abs/2005.14165v4"},
+    { "name":"GPT-3-1.3B (text-similarity-babbage-001)" ,
+      "model":"text-similarity-babbage-001",
+      "fork_url":"https://openai.com/api/",
+      "orig_author_url":"https://openai.com/api/",
+      "orig_author":"OpenAI",
+      "sota_info": {
+          "task":"GPT-3 achieves strong zero-shot and few-shot performance on many NLP datasets etc.",
+          "sota_link":"https://paperswithcode.com/method/gpt-3"
+      },
+      "paper_url":"https://arxiv.org/abs/2005.14165v4",
+      "mark":"True",
+      "custom_load":"False",
+      "Note":"Custom file upload requires OpenAI API access to create embeddings. For API access, use this link ",
+      "alt_url":"https://openai.com/api/",
+      "class":"OpenAIModel","sota_link":"https://arxiv.org/abs/2005.14165v4"},
+    { "name":"GPT-3-350M (text-similarity-ada-001)" ,
+      "model":"text-similarity-ada-001",
+      "fork_url":"https://openai.com/api/",
+      "orig_author_url":"https://openai.com/api/",
+      "orig_author":"OpenAI",
+      "sota_info": {
+          "task":"GPT-3 achieves strong zero-shot and few-shot performance on many NLP datasets etc.",
+          "sota_link":"https://paperswithcode.com/method/gpt-3"
+      },
+      "paper_url":"https://arxiv.org/abs/2005.14165v4",
+      "mark":"True",
+      "custom_load":"False",
+      "Note":"Custom file upload requires OpenAI API access to create embeddings. For API access, use this link ",
+      "alt_url":"https://openai.com/api/",
+      "class":"OpenAIModel","sota_link":"https://arxiv.org/abs/2005.14165v4"}
 
 
 ]
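
The four new entries mark themselves with "custom_load":"False", which the run_test guard in app.py above uses to reject user-uploaded files for OpenAI models. A hypothetical sketch of consuming that flag; the top-level "models" key and the helper name are assumptions, not shown in this diff:

import json

def models_allowing_upload(config_path="clus_app_models.json"):
    # Assumes the file wraps its entries in a top-level "models" array.
    with open(config_path) as fp:
        entries = json.load(fp)["models"]
    # Note that "custom_load" is stored as the string "False", not a boolean.
    return [m["name"] for m in entries if m.get("custom_load") != "False"]
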
text-similarity-ada-001imdb_sent_embed.json ADDED
The diff for this file is too large to render. See raw diff
 
text-similarity-ada-001larger_test_embed.json ADDED
The diff for this file is too large to render. See raw diff
 
text-similarity-ada-001small_test_embed.json ADDED
The diff for this file is too large to render. See raw diff
 
text-similarity-babbage-001imdb_sent_embed.json ADDED
The diff for this file is too large to render. See raw diff
 
text-similarity-babbage-001larger_test_embed.json ADDED
The diff for this file is too large to render. See raw diff
 
text-similarity-babbage-001small_test_embed.json ADDED
The diff for this file is too large to render. See raw diff
 
text-similarity-curie-001imdb_sent_embed.json ADDED
The diff for this file is too large to render. See raw diff
 
text-similarity-curie-001larger_test_embed.json ADDED
The diff for this file is too large to render. See raw diff
 
text-similarity-curie-001small_test_embed.json ADDED
The diff for this file is too large to render. See raw diff
 
text-similarity-davinci-001small_test_embed.json ADDED
The diff for this file is too large to render. See raw diff
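
These ten files are precomputed embedding caches for the example inputs, named by the scheme in twc_openai_embeddings.py below: the model name concatenated with the input file's stem plus "_embed.json". A quick check of the derivation:

model_name = "text-similarity-ada-001"
input_file_name = "imdb_sent.txt"   # illustrative input file name
# Same expression as in OpenAIModel.compute_embeddings
cache_file = model_name + '.'.join(input_file_name.split('.')[:-1]) + "_embed.json"
assert cache_file == "text-similarity-ada-001imdb_sent_embed.json"
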
 
twc_embeddings.py CHANGED
@@ -32,7 +32,7 @@ class CausalLMModel:
         self.model.eval()
         self.prompt = 'Documents are searched to find matches with the same content.\nThe document "{}" is a good search result for "'
 
-    def compute_embeddings(self,input_data,is_file):
+    def compute_embeddings(self,input_file_name,input_data,is_file):
         if (self.debug):
             print("Computing embeddings for:", input_data[:20])
         model = self.model
@@ -160,7 +160,7 @@ class SGPTQnAModel:
 
         return embeddings
 
-    def compute_embeddings(self,input_data,is_file):
+    def compute_embeddings(self,input_file_name,input_data,is_file):
         if (self.debug):
             print("Computing embeddings for:", input_data[:20])
         model = self.model
@@ -215,7 +215,7 @@ class SimCSEModel:
         self.tokenizer = AutoTokenizer.from_pretrained(model_name)
         self.model = AutoModel.from_pretrained(model_name)
 
-    def compute_embeddings(self,input_data,is_file):
+    def compute_embeddings(self,input_file_name,input_data,is_file):
         texts = read_text(input_data) if is_file == True else input_data
         inputs = self.tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
         with torch.no_grad():
@@ -266,7 +266,7 @@ class SGPTModel:
         # Deactivate Dropout (There is no dropout in the above models so it makes no difference here but other SGPT models may have dropout)
         self.model.eval()
 
-    def compute_embeddings(self,input_data,is_file):
+    def compute_embeddings(self,input_file_name,input_data,is_file):
         if (self.debug):
             print("Computing embeddings for:", input_data[:20])
         model = self.model
@@ -353,7 +353,7 @@ class HFModel:
         input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
         return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
 
-    def compute_embeddings(self,input_data,is_file):
+    def compute_embeddings(self,input_file_name,input_data,is_file):
         #print("Computing embeddings for:", input_data[:20])
         model = self.model
         tokenizer = self.tokenizer
@@ -403,5 +403,5 @@ if __name__ == '__main__':
     results = parser.parse_args()
     obj = HFModel()
     obj.init_model(results.model)
-    texts, embeddings = obj.compute_embeddings(results.input,is_file = True)
+    texts, embeddings = obj.compute_embeddings(results.input,results.input,is_file = True)
     results = obj.output_results(results.output,texts,embeddings)
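
Every embedding class gains the same leading input_file_name parameter, so the classes stay interchangeable from app.py. A sketch of the implied common interface; the Protocol itself is not in the commit:

from typing import Protocol

class EmbeddingModel(Protocol):
    def init_model(self, model_name: str) -> None: ...
    def compute_embeddings(self, input_file_name: str, input_data, is_file: bool) -> tuple: ...
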
twc_openai_embeddings.py ADDED
@@ -0,0 +1,102 @@
+from scipy.spatial.distance import cosine
+import argparse
+import json
+import os
+import openai
+import pdb
+
+def read_text(input_file):
+    arr = open(input_file).read().split("\n")
+    return arr[:-1]
+
+
+class OpenAIModel:
+    def __init__(self):
+        self.debug = False
+        self.model_name = None
+        self.skip_key = True
+        print("In OpenAI API constructor")
+
+
+    def init_model(self,model_name = None):
+        #print("OpenAI: Init model",model_name)
+        openai.api_key = os.getenv("OPENAI_API_KEY")
+        if (openai.api_key == None):
+            openai.api_key = ""
+            print("API key not set")
+
+        if (len(openai.api_key) == 0 and not self.skip_key):
+            print("OpenAI API key not set")
+
+        if (model_name is None):
+            self.model_name = "text-similarity-ada-001"
+        else:
+            self.model_name = model_name
+        print("OpenAI: Init model complete",model_name)
+
+
+    def compute_embeddings(self,input_file_name,input_data,is_file):
+        if (len(openai.api_key) == 0 and not self.skip_key):
+            print("OpenAI API key not set")
+            return [],[]
+        #print("In compute embeddings after key check")
+        in_file = self.model_name + '.'.join(input_file_name.split('.')[:-1]) + "_embed.json"
+        cached = False
+        try:
+            fp = open(in_file)
+            cached = True
+            embeddings = json.load(fp)
+            print("Using cached embeddings")
+        except:
+            pass
+
+        texts = read_text(input_data) if is_file == True else input_data
+        if (not cached):
+            print(f"Computing embeddings for {input_file_name} and model {self.model_name}")
+            response = openai.Embedding.create(
+                input=texts,
+                model=self.model_name
+            )
+            embeddings = []
+            for i in range(len(response['data'])):
+                embeddings.append(response['data'][i]['embedding'])
+        if (not cached):
+            with open(in_file,"w") as fp:
+                json.dump(embeddings,fp)
+        return texts,embeddings
+
+    def output_results(self,output_file,texts,embeddings,main_index = 0):
+        if (len(openai.api_key) == 0 and not self.skip_key):
+            print("OpenAI API key not set")
+            return {}
+        #print("In output results after key check")
+        # Calculate cosine similarities
+        # Cosine similarities are in [-1, 1]. Higher means more similar
+        cosine_dict = {}
+        #print("Total sentences",len(texts))
+        for i in range(len(texts)):
+            cosine_dict[texts[i]] = 1 - cosine(embeddings[main_index], embeddings[i])
+
+        #print("Input sentence:",texts[main_index])
+        sorted_dict = dict(sorted(cosine_dict.items(), key=lambda item: item[1],reverse = True))
+        if (self.debug):
+            for key in sorted_dict:
+                print("Cosine similarity with \"%s\" is: %.3f" % (key, sorted_dict[key]))
+        if (output_file is not None):
+            with open(output_file,"w") as fp:
+                fp.write(json.dumps(sorted_dict,indent=0))
+        return sorted_dict
+
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='OpenAI model for sentence embeddings ',formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    parser.add_argument('-input', action="store", dest="input",required=True,help="Input file with sentences")
+    parser.add_argument('-output', action="store", dest="output",default="output.txt",help="Output file with results")
+    parser.add_argument('-model', action="store", dest="model",default="text-similarity-ada-001",help="model name")
+
+    results = parser.parse_args()
+    obj = OpenAIModel()
+    obj.init_model(results.model)
+    texts, embeddings = obj.compute_embeddings(results.input,results.input,is_file = True)
+    results = obj.output_results(results.output,texts,embeddings)
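
A usage sketch for the new module; the file name "sentences.txt" is illustrative, and OPENAI_API_KEY must already be exported in the environment for uncached inputs, since init_model reads it from there:

from twc_openai_embeddings import OpenAIModel

model = OpenAIModel()
model.init_model("text-similarity-ada-001")
# input_file_name doubles as the cache key and, with is_file=True,
# as the file the sentences are read from (one sentence per line).
texts, embeddings = model.compute_embeddings("sentences.txt", "sentences.txt", is_file=True)
# Cosine similarities against the first sentence, sorted descending;
# pass a path instead of None to also write the results to disk.
results = model.output_results(None, texts, embeddings)
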