taskswithcode committed on
Commit
ce6a2ba
1 Parent(s): fb189b3

Initial addition

app.py ADDED
@@ -0,0 +1,250 @@
1
+ import time
2
+ import sys
3
+ import streamlit as st
4
+ import string
5
+ from io import StringIO
6
+ import pdb
7
+ import json
8
+ from twc_embeddings import HFModel,SimCSEModel,SGPTModel,CausalLMModel,SGPTQnAModel
9
+ import torch
10
+
11
+
12
+ MAX_INPUT = 100
13
+
14
+ SEM_SIMILARITY="1"
15
+ DOC_RETRIEVAL="2"
16
+ CLUSTERING="3"
17
+
18
+
19
+ use_case = {"1":"Finding similar phrases/sentences","2":"Retrieving semantically matching information to a query. It may not be a factual match","3":"Clustering"}
20
+
21
+
22
+
23
+
24
+ from transformers import BertTokenizer, BertForMaskedLM
25
+
26
+
27
+
28
+ view_count_file = "view_count.txt"
29
+
30
+ def get_views():
31
+ ret_val = 0
32
+ if ("view_count" not in st.session_state):
33
+ try:
34
+ data = int(open(view_count_file).read().strip("\n"))
35
+ except Exception:
36
+ data = 0
37
+ data += 1
38
+ ret_val = data
39
+ st.session_state["view_count"] = data
40
+ with open(view_count_file,"w") as fp:
41
+ fp.write(str(data))
42
+ else:
43
+ ret_val = st.session_state["view_count"]
44
+ return "{:,}".format(ret_val)
45
+
46
+
47
+
48
+ def construct_model_info_for_display(model_names):
49
+ options_arr = []
50
+ markdown_str = f"<div style=\"font-size:16px; color: #2f2f2f; text-align: left\"><br/><b>Models evaluated ({len(model_names)})</b></div>"
51
+ for node in model_names:
52
+ options_arr.append(node["name"])
53
+ if (node["mark"] == "True"):
54
+ markdown_str += f"<div style=\"font-size:16px; color: #5f5f5f; text-align: left\">&nbsp;•&nbsp;Model:&nbsp;<a href=\'{node['paper_url']}\' target='_blank'>{node['name']}</a><br/>&nbsp;&nbsp;&nbsp;&nbsp;Code released by:&nbsp;<a href=\'{node['orig_author_url']}\' target='_blank'>{node['orig_author']}</a><br/>&nbsp;&nbsp;&nbsp;&nbsp;Model info:&nbsp;<a href=\'{node['sota_info']['sota_link']}\' target='_blank'>{node['sota_info']['task']}</a></div>"
55
+ if ("Note" in node):
56
+ markdown_str += f"<div style=\"font-size:16px; color: #a91212; text-align: left\">&nbsp;&nbsp;&nbsp;&nbsp;{node['Note']}<a href=\'{node['alt_url']}\' target='_blank'>link</a></div>"
57
+ markdown_str += "<div style=\"font-size:16px; color: #5f5f5f; text-align: left\"><br/></div>"
58
+
59
+ markdown_str += "<div style=\"font-size:12px; color: #9f9f9f; text-align: left\"><b>Note:</b><br/>•&nbsp;Uploaded files are loaded into non-persistent memory for the duration of the computation. They are not cached</div>"
60
+ limit = "{:,}".format(MAX_INPUT)
61
+ markdown_str += f"<div style=\"font-size:12px; color: #9f9f9f; text-align: left\">•&nbsp;User uploaded file has a maximum limit of {limit} sentences.</div>"
62
+ return options_arr,markdown_str
63
+
64
+
65
+ st.set_page_config(page_title='TWC - Compare popular/state-of-the-art models for tasks using sentence embeddings', page_icon="logo.jpg", layout='centered', initial_sidebar_state='auto',
66
+ menu_items={
67
+ 'About': 'This app was created by taskswithcode. http://taskswithcode.com'
68
+
69
+ })
70
+ col,pad = st.columns([85,15])
71
+
72
+ with col:
73
+ st.image("long_form_logo_with_icon.png")
74
+
75
+
76
+ @st.experimental_memo
77
+ def load_model(model_name,model_names):
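+ # Look up the selected display name in the model config list, instantiate the matching wrapper class and initialize it; st.experimental_memo caches the loaded model across reruns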
78
+ try:
79
+ ret_model = None
80
+ for node in model_names:
81
+ if (model_name.startswith(node["name"])):
82
+ obj_class = globals()[node["class"]]
83
+ ret_model = obj_class()
84
+ ret_model.init_model(node["model"])
85
+ assert(ret_model is not None)
86
+ except Exception as e:
87
+ st.error("Unable to load model:" + model_name + " " + str(e))
88
+ pass
89
+ return ret_model
90
+
91
+
92
+ @st.experimental_memo
93
+ def cached_compute_similarity(sentences,_model,model_name,main_index):
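+ # The leading underscore in _model excludes the unhashable model object from st.experimental_memo's cache key; results are memoized on the sentences, model name and main index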
94
+ texts,embeddings = _model.compute_embeddings(sentences,is_file=False)
95
+ results = _model.output_results(None,texts,embeddings,main_index)
96
+ return results
97
+
98
+
99
+ def uncached_compute_similarity(sentences,_model,model_name,main_index):
100
+ with st.spinner('Computing vectors for sentences'):
101
+ texts,embeddings = _model.compute_embeddings(sentences,is_file=False)
102
+ results = _model.output_results(None,texts,embeddings,main_index)
103
+ #st.success("Similarity computation complete")
104
+ return results
105
+
106
+ def get_model_info(model_names,model_name):
107
+ for node in model_names:
108
+ if (model_name == node["name"]):
109
+ return node
110
+
111
+ def run_test(model_names,model_name,sentences,display_area,main_index,user_uploaded):
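+ # Run one model over the input sentences; results are cached for the bundled example files but always recomputed for user uploads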
112
+ display_area.text("Loading model:" + model_name)
113
+ model_info = get_model_info(model_names,model_name)
114
+ if ("Note" in model_info):
115
+ fail_link = f"{model_info['Note']} [link]({model_info['alt_url']})"
116
+ display_area.write(fail_link)
117
+ model = load_model(model_name,model_names)
118
+ display_area.text("Model " + model_name + " load complete")
119
+ try:
120
+ if (user_uploaded):
121
+ results = uncached_compute_similarity(sentences,model,model_name,main_index)
122
+ else:
123
+ display_area.text("Computing vectors for sentences")
124
+ results = cached_compute_similarity(sentences,model,model_name,main_index)
125
+ display_area.text("Similarity computation complete")
126
+ return results
127
+
128
+ except Exception as e:
129
+ st.error("Some error occurred during prediction" + str(e))
130
+ st.stop()
131
+ return {}
132
+
133
+
134
+
135
+
136
+
137
+ def display_results(orig_sentences,main_index,results,response_info,app_mode):
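+ # Render the ranked results as HTML and stash a JSON copy in session state for the download button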
138
+ main_sent = f"<div style=\"font-size:14px; color: #2f2f2f; text-align: left\">{response_info}<br/><br/></div>"
139
+ score_text = "cosine_distance" if app_mode == "similarity" else "cosine_distance/score"
140
+ pivot_name = "main sentence" if app_mode == "similarity" else "query"
141
+ main_sent += f"<div style=\"font-size:14px; color: #6f6f6f; text-align: left\">Results sorted by {score_text}. Closest to furthest away from {pivot_name}</div>"
142
+ pivot_name = pivot_name[0].upper() + pivot_name[1:]
143
+ main_sent += f"<div style=\"font-size:16px; color: #2f2f2f; text-align: left\"><b>{pivot_name}:</b>&nbsp;&nbsp;{orig_sentences[main_index]}</div>"
144
+ body_sent = []
145
+ download_data = {}
146
+ first = True
147
+ for key in results:
148
+ if (app_mode == DOC_RETRIEVAL and first):
149
+ first = False
150
+ continue
151
+ index = orig_sentences.index(key) + 1
152
+ body_sent.append(f"<div style=\"font-size:16px; color: #2f2f2f; text-align: left\">{index}]&nbsp;{key}&nbsp;&nbsp;&nbsp;<b>{results[key]:.2f}</b></div>")
153
+ download_data[key] = f"{results[key]:.2f}"
154
+ main_sent = main_sent + "\n" + '\n'.join(body_sent)
155
+ st.markdown(main_sent,unsafe_allow_html=True)
156
+ st.session_state["download_ready"] = json.dumps(download_data,indent=4)
157
+
158
+
159
+ def init_session():
160
+ st.session_state["download_ready"] = None
161
+ st.session_state["model_name"] = "ss_test"
162
+ st.session_state["main_index"] = 1
163
+ st.session_state["file_name"] = "default"
164
+
165
+ def app_main(app_mode,example_files,model_name_files):
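+ # app_mode selects the use case ("1" = semantic similarity, "2" = document retrieval); example_files and model_name_files are the JSON configs for that mode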
166
+ init_session()
167
+ with open(example_files) as fp:
168
+ example_file_names = json.load(fp)
169
+ with open(model_name_files) as fp:
170
+ model_names = json.load(fp)
171
+ curr_use_case = use_case[app_mode].split(".")[0]
172
+ st.markdown("<h5 style='text-align: center;'>Compare popular/state-of-the-art models for tasks using sentence embeddings</h5>", unsafe_allow_html=True)
173
+ st.markdown(f"<div style='color: #4f4f4f; text-align: left'>Use cases for sentence embeddings<br/>&nbsp;&nbsp;&nbsp;•&nbsp;&nbsp;{use_case['1']}<br/>&nbsp;&nbsp;&nbsp;•&nbsp;&nbsp;{use_case['2']}<br/>&nbsp;&nbsp;&nbsp;•&nbsp;&nbsp;{use_case['3']}<br/><i>This app illustrates <b>'{curr_use_case}'</b> use case</i></div>", unsafe_allow_html=True)
174
+ st.markdown(f"<div style='color: #9f9f9f; text-align: right'>views:&nbsp;{get_views()}</div>", unsafe_allow_html=True)
175
+
176
+
177
+ try:
178
+
179
+
180
+ with st.form('twc_form'):
181
+
182
+ uploaded_file = st.file_uploader("Step 1. Upload a text file (one sentence per line) or choose an example file below", type=".txt")
183
+
184
+ selected_file_index = st.selectbox(label=f'Example files ({len(example_file_names)})',
185
+ options = list(dict.keys(example_file_names)), index=0, key = "twc_file")
186
+ st.write("")
187
+ options_arr,markdown_str = construct_model_info_for_display(model_names)
188
+ selection_label = 'Step 2. Select Model'
189
+ selected_model = st.selectbox(label=selection_label,
190
+ options = options_arr, index=0, key = "twc_model")
191
+ st.write("")
192
+ if (app_mode == "similarity"):
193
+ main_index = st.number_input('Step 3. Enter the index of the sentence to use as the main sentence',value=1,min_value = 1)
194
+ else:
195
+ main_index = 1
196
+ st.write("")
197
+ submit_button = st.form_submit_button('Run')
198
+
199
+
200
+ input_status_area = st.empty()
201
+ display_area = st.empty()
202
+ if submit_button:
203
+ start = time.time()
204
+ if uploaded_file is not None:
205
+ st.session_state["file_name"] = uploaded_file.name
206
+ sentences = StringIO(uploaded_file.getvalue().decode("utf-8")).read()
207
+ else:
208
+ st.session_state["file_name"] = example_file_names[selected_file_index]["name"]
209
+ sentences = open(example_file_names[selected_file_index]["name"]).read()
210
+ sentences = sentences.split("\n")[:-1]
211
+ if (len(sentences) < main_index):
212
+ main_index = len(sentences)
213
+ st.info("Selected sentence index is larger than number of sentences in file. Truncating to " + str(main_index))
214
+ if (len(sentences) > MAX_INPUT):
215
+ st.info(f"Input sentence count exceeds maximum sentence limit. First {MAX_INPUT} out of {len(sentences)} sentences chosen")
216
+ sentences = sentences[:MAX_INPUT]
217
+ st.session_state["model_name"] = selected_model
218
+ st.session_state["main_index"] = main_index
219
+ results = run_test(model_names,selected_model,sentences,display_area,main_index - 1,(uploaded_file is not None))
220
+ display_area.empty()
221
+ with display_area.container():
222
+ device = 'GPU' if torch.cuda.is_available() else 'CPU'
223
+ response_info = f"Computation time on {device}: {time.time() - start:.2f} secs for {len(sentences)} sentences"
224
+ display_results(sentences,main_index - 1,results,response_info,app_mode)
225
+ #st.json(results)
226
+ st.download_button(
227
+ label="Download results as json",
228
+ data = st.session_state["download_ready"] if st.session_state["download_ready"] is not None else "",
229
+ disabled = st.session_state["download_ready"] is None,
230
+ file_name= (st.session_state["model_name"] + "_" + str(st.session_state["main_index"]) + "_" + '_'.join(st.session_state["file_name"].split(".")[:-1]) + ".json").replace("/","_"),
231
+ mime='text/json',
232
+ key ="download"
233
+ )
234
+
235
+
236
+
237
+ except Exception as e:
238
+ st.error("Some error occurred during loading" + str(e))
239
+ st.stop()
240
+
241
+ st.markdown(markdown_str, unsafe_allow_html=True)
242
+
243
+
244
+
245
+ if __name__ == "__main__":
246
+ #print("comand line input:",len(sys.argv),str(sys.argv))
247
+ #app_main(sys.argv[1],sys.argv[2],sys.argv[3])
248
+ #app_main("1","sim_app_examples.json","sim_app_models.json")
249
+ app_main("2","doc_app_examples.json","doc_app_models.json")
250
+
doc_app_examples.json ADDED
@@ -0,0 +1,5 @@
1
+ {
2
+ "Querying about a planet": {"name":"planets_qna.txt"},
3
+ "Querying about a disease": {"name":"qna.txt"},
4
+ "Querying about a protein": {"name":"qna2.txt"}
5
+ }
doc_app_models.json ADDED
@@ -0,0 +1,114 @@
1
+ [
2
+ { "name":"SGPT-125M-Search",
3
+ "model":"Muennighoff/SGPT-125M-weightedmean-msmarco-specb-bitfit",
4
+ "fork_url":"https://github.com/taskswithcode/sgpt",
5
+ "orig_author_url":"https://github.com/Muennighoff",
6
+ "orig_author":"Niklas Muennighoff",
7
+ "sota_info": {
8
+ "task":"#1 in multiple information retrieval & search tasks(smaller variant)",
9
+ "sota_link":"https://paperswithcode.com/paper/sgpt-gpt-sentence-embeddings-for-semantic"
10
+ },
11
+ "paper_url":"https://arxiv.org/abs/2202.08904v5",
12
+ "mark":"True",
13
+ "class":"SGPTQnAModel"},
14
+ { "name":"GPT-Neo-125M",
15
+ "model":"EleutherAI/gpt-neo-125M",
16
+ "fork_url":"https://github.com/taskswithcode/sgpt",
17
+ "orig_author_url":"https://www.eleuther.ai/",
18
+ "orig_author":"EleuthorAI",
19
+ "sota_info": {
20
+ "task":"Top 20 in multiple NLP tasks (smaller variant)",
21
+ "sota_link":"https://paperswithcode.com/paper/gpt-neox-20b-an-open-source-autoregressive-1"
22
+ },
23
+ "paper_url":"https://zenodo.org/record/5551208#.YyV0k-zMLX0",
24
+ "mark":"True",
25
+ "class":"CausalLMModel"},
26
+
27
+ { "name":"sentence-transformers/all-MiniLM-L6-v2",
28
+ "model":"sentence-transformers/all-MiniLM-L6-v2",
29
+ "fork_url":"https://github.com/taskswithcode/sentence_similarity_hf_model",
30
+ "orig_author_url":"https://github.com/UKPLab",
31
+ "orig_author":"Ubiquitous Knowledge Processing Lab",
32
+ "sota_info": {
33
+ "task":"Over 3.8 million downloads from huggingface",
34
+ "sota_link":"https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2"
35
+ },
36
+ "paper_url":"https://arxiv.org/abs/1908.10084",
37
+ "mark":"True",
38
+ "class":"HFModel"},
39
+ { "name":"sentence-transformers/paraphrase-MiniLM-L6-v2",
40
+ "model":"sentence-transformers/paraphrase-MiniLM-L6-v2",
41
+ "fork_url":"https://github.com/taskswithcode/sentence_similarity_hf_model",
42
+ "orig_author_url":"https://github.com/UKPLab",
43
+ "orig_author":"Ubiquitous Knowledge Processing Lab",
44
+ "sota_info": {
45
+ "task":"Over 2 million downloads from huggingface",
46
+ "sota_link":"https://huggingface.co/sentence-transformers/paraphrase-MiniLM-L6-v2"
47
+ },
48
+ "paper_url":"https://arxiv.org/abs/1908.10084",
49
+ "mark":"True",
50
+ "class":"HFModel"},
51
+ { "name":"sentence-transformers/bert-base-nli-mean-tokens",
52
+ "model":"sentence-transformers/bert-base-nli-mean-tokens",
53
+ "fork_url":"https://github.com/taskswithcode/sentence_similarity_hf_model",
54
+ "orig_author_url":"https://github.com/UKPLab",
55
+ "orig_author":"Ubiquitous Knowledge Processing Lab",
56
+ "sota_info": {
57
+ "task":"Over 700,000 downloads from huggingface",
58
+ "sota_link":"https://huggingface.co/sentence-transformers/bert-base-nli-mean-tokens"
59
+ },
60
+ "paper_url":"https://arxiv.org/abs/1908.10084",
61
+ "mark":"True",
62
+ "class":"HFModel"},
63
+ { "name":"sentence-transformers/all-mpnet-base-v2",
64
+ "model":"sentence-transformers/all-mpnet-base-v2",
65
+ "fork_url":"https://github.com/taskswithcode/sentence_similarity_hf_model",
66
+ "orig_author_url":"https://github.com/UKPLab",
67
+ "orig_author":"Ubiquitous Knowledge Processing Lab",
68
+ "sota_info": {
69
+ "task":"Over 500,000 downloads from huggingface",
70
+ "sota_link":"https://huggingface.co/sentence-transformers/all-mpnet-base-v2"
71
+ },
72
+ "paper_url":"https://arxiv.org/abs/1908.10084",
73
+ "mark":"True",
74
+ "class":"HFModel"},
75
+ { "name":"sentence-transformers/all-MiniLM-L12-v2",
76
+ "model":"sentence-transformers/all-MiniLM-L12-v2",
77
+ "fork_url":"https://github.com/taskswithcode/sentence_similarity_hf_model",
78
+ "orig_author_url":"https://github.com/UKPLab",
79
+ "orig_author":"Ubiquitous Knowledge Processing Lab",
80
+ "sota_info": {
81
+ "task":"Over 500,000 downloads from huggingface",
82
+ "sota_link":"https://huggingface.co/sentence-transformers/all-MiniLM-L12-v2"
83
+ },
84
+ "paper_url":"https://arxiv.org/abs/1908.10084",
85
+ "mark":"True",
86
+ "class":"HFModel"},
87
+
88
+ { "name":"SGPT-125M",
89
+ "model":"Muennighoff/SGPT-125M-weightedmean-nli-bitfit",
90
+ "fork_url":"https://github.com/taskswithcode/sgpt",
91
+ "orig_author_url":"https://github.com/Muennighoff",
92
+ "orig_author":"Niklas Muennighoff",
93
+ "sota_info": {
94
+ "task":"#1 in multiple information retrieval & search tasks(smaller variant)",
95
+ "sota_link":"https://paperswithcode.com/paper/sgpt-gpt-sentence-embeddings-for-semantic"
96
+ },
97
+ "paper_url":"https://arxiv.org/abs/2202.08904v5",
98
+ "mark":"True",
99
+ "class":"SGPTModel"},
100
+ { "name":"SIMCSE-base" ,
101
+ "model":"princeton-nlp/sup-simcse-roberta-base",
102
+ "fork_url":"https://github.com/taskswithcode/SimCSE",
103
+ "orig_author_url":"https://github.com/princeton-nlp",
104
+ "orig_author":"Princeton Natural Language Processing",
105
+ "sota_info": {
106
+ "task":"Within top 10 in multiple semantic textual similarity tasks(smaller variant)",
107
+ "sota_link":"https://paperswithcode.com/paper/simcse-simple-contrastive-learning-of"
108
+ },
109
+ "paper_url":"https://arxiv.org/abs/2104.08821v4",
110
+ "mark":"True",
111
+ "class":"SimCSEModel","sota_link":"https://paperswithcode.com/sota/semantic-textual-similarity-on-sick"}
112
+
113
+
114
+ ]
imdb_sent.txt ADDED
@@ -0,0 +1,62 @@
1
+ "A rating of ""1"" does not begin to express how dull, depressing and relentlessly bad this movie is."
2
+ Hated it with all my being. Worst movie ever. Mentally- scarred. Help me. It was that bad.TRUST ME!!!
3
+ "Long, boring, blasphemous. Never have I been so glad to see ending credits roll."
4
+ This film made John Glover a star. Alan Raimy is one of the most compelling character that I have ever seen on film. And I mean that sport.
5
+ "Were I not with friends, and so cheap, I would have walked out. It failed miserably as satire and didn't even have the redemption of camp."
6
+ For pure gothic vampire cheese nothing can compare to the Subspecies films. I highly recommend each and every one of them.
7
+ "A great film in its genre, the direction, acting, most especially the casting of the film makes it even more powerful. A must see."
8
+ "This is a terrible movie, don't waste your money on it. Don't even watch it for free. That's all I have to say."
9
+ I wouldn't rent this one even on dollar rental night.
10
+ "More suspenseful, more subtle, much, much more disturbing...."
11
+ This is a good film. This is very funny. Yet after this film there were no good Ernest films!
12
+ A touching movie. It is full of emotions and wonderful acting. I could have sat through it a second time.
13
+ "Great movie - especially the music - Etta James - ""At Last"". This speaks volumes when you have finally found that special someone."
14
+ If you've ever had a mad week-end out with your mates then you'll appreciate this film. Excellent fun and a laugh a minute.
15
+ "I think it's one of the greatest movies which are ever made, and I've seen many... The book is better, but it's still a very good movie!"
16
+ Brilliant and moving performances by Tom Courtenay and Peter Finch.
17
+ The characters are unlikeable and the script is awful. It's a waste of the talents of Deneuve and Auteuil.
18
+ You've got to be kidding. This movie sucked for the sci-fi fans. I would only recommend watching this only if you think Armageddon was good.
19
+ Ten minutes of people spewing gallons of pink vomit. Recurring scenes of enormous piles of dog excrement - need one say more???
20
+ "As usual, Sean Connery does a great job. Lawrence Fishburn is good, but I have a hard time not seeing him as Ike Turner."
21
+ This movie is terrible but it has some good effects.
22
+ You'd better choose Paul Verhoeven's even if you have watched it.
23
+ "Brilliant. Ranks along with Citizen Kane, The Matrix and Godfathers. Must see, at least for basset in her early days. Watch it."
24
+ "I don't know why I like this movie so well, but I never get tired of watching it."
25
+ The one-liners fly so fast in this movie that you can watch it over and over and still catch new ones. By far one of the best of this genre.
26
+ "Don't waste your time and money on it. It's not quite as bad as ""Adrenalin"", by the same director but that's not saying much."
27
+ "Read the book, forget the movie!"
28
+ This is a great movie. Too bad it is not available on home video.
29
+ "Very intelligent language usage of Ali, which you musn't miss! In one word: (eeh sentence...) Wicked, so keep it real and pass it on!"
30
+ Primary plot!Primary direction!Poor interpretation.
31
+ "If you like Pauly Shore, you'll love Son in Law. If you hate Pauly Shore, then, well...I liked it!"
32
+ Just love the interplay between two great characters of stage & screen - Veidt & Barrymore
33
+ "This movie will always be a Broadway and Movie classic, as long as there are still people who sing, dance, and act."
34
+ This is the greatest movie ever. If you have written it off with out ever seeing it. You must give it a second try.
35
+ "What a script, what a story, what a mess!"
36
+ "I caught this film late at night on HBO. Talk about wooden acting, unbelievable plot, et al. Very little going in its favor. Skip it."
37
+ This is without a doubt the worst movie I have ever seen. It is not funny. It is not interesting and should not have been made.
38
+ Ming The Merciless does a little Bardwork and a movie most foul!
39
+ This is quite possibly the worst sequel ever made. The script is unfunny and the acting stinks. The exact opposite of the original.
40
+ "This is the definitive movie version of Hamlet. Branagh cuts nothing, but there are no wasted moments."
41
+ My favorite movie. What a great story this really was. I'd just like to be able to buy a copy of it but this does not seem possible.
42
+ "Comment this movie is impossible. Is terrible, very improbable, bad interpretation e direction. Not look!!!!!"
43
+ "Brilliant movie. The drawings were just amazing. Too bad it ended before it begun. I´ve waited 21 years for a sequel, but nooooo!!!"
44
+ a mesmerizing film that certainly keeps your attention... Ben Daniels is fascinating (and courageous) to watch.
45
+ "This is a very cool movie. The ending of the movie is a bit more defined than the play's ending, but either way it is still a good movie."
46
+ "Without a doubt, one of Tobe Hoppor's best! Epic storytellng, great special effects, and The Spacegirl (vamp me baby!)."
47
+ I hope this group of film-makers never re-unites.
48
+ Unwatchable. You can't even make it past the first three minutes. And this is coming from a huge Adam Sandler fan!!1
49
+ "One of the funniest movies made in recent years. Good characterization, plot and exceptional chemistry make this one a classic"
50
+ "Add this little gem to your list of holiday regulars. It is<br /><br />sweet, funny, and endearing"
51
+ "no comment - stupid movie, acting average or worse... screenplay - no sense at all... SKIP IT!"
52
+ "If you haven't seen this, it's terrible. It is pure trash. I saw this about 17 years ago, and I'm still screwed up from it."
53
+ Absolutely fantastic! Whatever I say wouldn't do this underrated movie the justice it deserves. Watch it now! FANTASTIC!
54
+ "As a big fan of Tiny Toon Adventures, I loved this movie!!! It was so funny!!! It really captured how cartoons spent their summers."
55
+ Widow hires a psychopath as a handyman. Sloppy film noir thriller which doesn't make much of its tension promising set-up. (3/10)
56
+ The Fiendish Plot of Dr. Fu Manchu (1980). This is hands down the worst film I've ever seen. What a sad way for a great comedian to go out.
57
+ "Obviously written for the stage. Lightweight but worthwhile. How can you go wrong with Ralph Richardson, Olivier and Merle Oberon."
58
+ This movie turned out to be better than I had expected it to be. Some parts were pretty funny. It was nice to have a movie with a new plot.
59
+ This movie is terrible. It's about some no brain surfin dude that inherits some company. Does Carrot Top have no shame?<br /><br />
60
+ Adrian Pasdar is excellent is this film. He makes a fascinating woman.
61
+ "An unfunny, unworthy picture which is an undeserving end to Peter Sellers' career. It is a pity this movie was ever made."
62
+ "The plot was really weak and confused. This is a true Oprah flick. (In Oprah's world, all men are evil and all women are victims.)"
planets_qna.txt ADDED
@@ -0,0 +1,20 @@
1
+ I'm searching for a planet not too far from Earth.
2
+ Mercury is closest to the sun
3
+ Venus and Mars are closest to earth
4
+ Pluto is not too far from neptune
5
+ Neptune is the eighth and farthest-known Solar planet from the Sun. In the Solar System, it is the fourth-largest planet by diameter, the third-most-massive planet, and the densest giant planet. It is 17 times the mass of Earth, slightly more massive than its near-twin Uranus.
6
+ TRAPPIST-1d, also designated as 2MASS J23062928-0502285 d, is a small exoplanet (about 30% the mass of the earth), which orbits on the inner edge of the habitable zone of the ultracool dwarf star TRAPPIST-1 approximately 40 light-years (12.1 parsecs, or nearly 3.7336×1014 km) away from Earth in the constellation of Aquarius.
7
+ A harsh desert world orbiting twin suns in the galaxy’s Outer Rim, Tatooine is a lawless place ruled by Hutt gangsters. Many settlers scratch out a living on moisture farms, while spaceport cities such as Mos Eisley and Mos Espa serve as home base for smugglers, criminals, and other rogues.
8
+ For some individual rights are very close to the heart and the reduction of it, regardless of the valid reasons, was unacceptable
9
+ A quasar is an extremely luminous active galactic nucleus, powered by a supermassive black hole, with mass ranging from millions to tens of billions of solar masses, surrounded by a gaseous accretion disc
10
+ Politics is the last resort of the scoundrel
11
+ Planets with water have been found in many places now
12
+ Some drugs have serious side effects
13
+ The dog ran all around the park like a planet orbiting a star
14
+ Quantum computing is yet to take off
15
+ His world revolved around his girl friend like a planet revolving around a star
16
+ The news cycle obsessively revolved around the newly weds with starry eyed fascination for the royalty
17
+ Starry nights with planets is a rare sight due to city lights
18
+ The dog ran all around the park in circles around the cat
19
+ Milky way Galaxy
20
+ Twin planet stars
qna.txt ADDED
@@ -0,0 +1,16 @@
1
+ Is Hirschsprung disease a mendelian or a multifactorial disorder?
2
+ Coding sequence mutations in RET, GDNF, EDNRB, EDN3, and SOX10 are involved in the development of Hirschsprung disease. The majority of these genes was shown to be related to Mendelian syndromic forms of Hirschsprung's disease, whereas the non-Mendelian inheritance of sporadic non-syndromic Hirschsprung disease proved to be complex; involvement of multiple loci was demonstrated in a multiplicative model.
3
+ Hirschsprung disease (HSCR) is a multifactorial, non-mendelian disorder in which rare high-penetrance coding sequence mutations in the receptor tyrosine kinase RET contribute to risk in combination with mutations at other genes
4
+ In this study, we review the identification of genes and loci involved in the non-syndromic common form and syndromic Mendelian forms of Hirschsprung's disease. The majority of the identified genes are related to Mendelian syndromic forms of Hirschsprung's disease. The non-Mendelian inheritance of sporadic non-syndromic Hirschsprung's disease proved to be complex; involvement of multiple loci was demonstrated in a multiplicative model
5
+ Hirschsprung disease (HSCR) is a multifactorial, non-mendelian disorder in which rare high-penetrance coding sequence mutations in the receptor tyrosine kinase RET contribute to risk in combination with mutations at other genes
6
+ For almost all of the identified HSCR genes incomplete penetrance of the HSCR phenotype has been reported, probably due to modifier loci. Therefore, HSCR has become a model for a complex oligo-/polygenic disorder in which the relationship between different genes creating a non-mendelian inheritance pattern still remains to be elucidated
7
+ Hirschsprung disease (HSCR) is a multifactorial, non-mendelian disorder in which rare high-penetrance coding sequence mutations in the receptor tyrosine kinase RET contribute to risk in combination with mutations at other genes.
8
+ The inheritance of Hirschsprung disease is generally consistent with sex-modified multifactorial inheritance with a lower threshold of expression in males.
9
+ Hirschsprung disease (HSCR) is a multifactorial, non-mendelian disorder in which rare high-penetrance coding sequence mutations in the receptor tyrosine kinase RET contribute to risk in combination with mutations at other genes.
10
+ Differential contributions of rare and common, coding and noncoding Ret mutations to multifactorial Hirschsprung disease liability.
11
+ In the etiology of Hirschsprung disease various genes play a role; these are: RET, EDNRB, GDNF, EDN3 and SOX10, NTN3, ECE1, Mutations in these genes may result in dominant, recessive or multifactorial patterns of inheritance.
12
+ The majority of the identified genes are related to Mendelian syndromic forms of Hirschsprung's disease
13
+ In the etiology of Hirschsprung disease various genes play a role; these are: RET, EDNRB, GDNF, EDN3 and SOX10, NTN3, ECE1, Mutations in these genes may result in dominant, recessive or multifactorial patterns of inheritance
14
+ On the basis of a skewed sex-ratio (M/F = 4/1) and a risk to relatives much higher than the incidence in the general population, HSCR has long been regarded as a sex-modified multifactorial disorder
15
+ The inheritance of Hirschsprung disease is generally consistent with sex-modified multifactorial inheritance with a lower threshold of expression in males
16
+ The non-Mendelian inheritance of sporadic non-syndromic Hirschsprung's disease proved to be complex; involvement of multiple loci was demonstrated in a multiplicative model
qna2.txt ADDED
@@ -0,0 +1,13 @@
1
+ Is the protein Papilin secreted?
2
+ Yes, papilin is a secreted protein
3
+ Using expression analysis, we identify three genes that are transcriptionally regulated by HLH-2: the protocadherin cdh-3, and two genes encoding secreted extracellular matrix proteins, mig-6/papilin and him-4/hemicentin.
4
+ We found that mig-6 encodes long (MIG-6L) and short (MIG-6S) isoforms of the extracellular matrix protein papilin, each required for distinct aspects of DTC migration. Both MIG-6 isoforms have a predicted N-terminal papilin cassette
5
+ For almost all of the identified HSCR genes incomplete penetrance of the HSCR phenotype has been reported, probably due to modifier loci. Therefore, HSCR has become a model for a complex oligo-/polygenic disorder in which the relationship between different genes creating a non-mendelian inheritance pattern still remains to be elucidated
6
+ Papilins are homologous, secreted extracellular matrix proteins which share a common order of protein domains.
7
+ The TSR superfamily is a diverse family of extracellular matrix and transmembrane proteins, many of which have functions related to regulating matrix organization, cell-cell interactions and cell guidance. This review samples some of the contemporary literature regarding TSR superfamily members (e.g. F-spondin, UNC-5, ADAMTS, papilin, and TRAP) where specific functions are assigned to the TSR domains.
8
+ Papilin is an extracellular matrix glycoprotein
9
+ Collagen IV, laminin, glutactin, papilin, and other extracellular matrix proteins were made primarily by hemocytes and were secreted into the medium.
10
+ A sulfated glycoprotein was isolated from the culture media of Drosophila Kc cells and named papilin.
11
+ A sulfated glycoprotein was isolated from the culture media of Drosophila Kc cells and named papilin.
12
+ The majority of the identified genes are related to Mendelian syndromic forms of Hirschsprung's disease
13
+ In the etiology of Hirschsprung disease various genes play a role; these are: RET, EDNRB, GDNF, EDN3 and SOX10, NTN3, ECE1, Mutations in these genes may result in dominant, recessive or multifactorial patterns of inheritance
requirements.txt ADDED
@@ -0,0 +1,3 @@
1
+ transformers
2
+ scipy
3
+ torch
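+ # app.py also imports streamlit; it is assumed to be provided by the hosting runtime (e.g. a Hugging Face Space), otherwise it needs to be listed here as well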
sim_app_examples.json ADDED
@@ -0,0 +1,5 @@
1
+ {
2
+ "Machine learning terms (phrases test)": {"name":"small_test.txt"},
3
+ "Customer feedback mixed with noise":{"name":"larger_test.txt"},
4
+ "Movie reviews": {"name":"imdb_sent.txt"}
5
+ }
sim_app_models.json ADDED
@@ -0,0 +1,134 @@
1
+ [
2
+
3
+ { "name":"sentence-transformers/all-MiniLM-L6-v2",
4
+ "model":"sentence-transformers/all-MiniLM-L6-v2",
5
+ "fork_url":"https://github.com/taskswithcode/sentence_similarity_hf_model",
6
+ "orig_author_url":"https://github.com/UKPLab",
7
+ "orig_author":"Ubiquitous Knowledge Processing Lab",
8
+ "sota_info": {
9
+ "task":"Over 3.8 million downloads from huggingface",
10
+ "sota_link":"https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2"
11
+ },
12
+ "paper_url":"https://arxiv.org/abs/1908.10084",
13
+ "mark":"True",
14
+ "class":"HFModel"},
15
+ { "name":"sentence-transformers/paraphrase-MiniLM-L6-v2",
16
+ "model":"sentence-transformers/paraphrase-MiniLM-L6-v2",
17
+ "fork_url":"https://github.com/taskswithcode/sentence_similarity_hf_model",
18
+ "orig_author_url":"https://github.com/UKPLab",
19
+ "orig_author":"Ubiquitous Knowledge Processing Lab",
20
+ "sota_info": {
21
+ "task":"Over 2 million downloads from huggingface",
22
+ "sota_link":"https://huggingface.co/sentence-transformers/paraphrase-MiniLM-L6-v2"
23
+ },
24
+ "paper_url":"https://arxiv.org/abs/1908.10084",
25
+ "mark":"True",
26
+ "class":"HFModel"},
27
+ { "name":"sentence-transformers/bert-base-nli-mean-tokens",
28
+ "model":"sentence-transformers/bert-base-nli-mean-tokens",
29
+ "fork_url":"https://github.com/taskswithcode/sentence_similarity_hf_model",
30
+ "orig_author_url":"https://github.com/UKPLab",
31
+ "orig_author":"Ubiquitous Knowledge Processing Lab",
32
+ "sota_info": {
33
+ "task":"Over 700,000 downloads from huggingface",
34
+ "sota_link":"https://huggingface.co/sentence-transformers/bert-base-nli-mean-tokens"
35
+ },
36
+ "paper_url":"https://arxiv.org/abs/1908.10084",
37
+ "mark":"True",
38
+ "class":"HFModel"},
39
+ { "name":"sentence-transformers/all-mpnet-base-v2",
40
+ "model":"sentence-transformers/all-mpnet-base-v2",
41
+ "fork_url":"https://github.com/taskswithcode/sentence_similarity_hf_model",
42
+ "orig_author_url":"https://github.com/UKPLab",
43
+ "orig_author":"Ubiquitous Knowledge Processing Lab",
44
+ "sota_info": {
45
+ "task":"Over 500,000 downloads from huggingface",
46
+ "sota_link":"https://huggingface.co/sentence-transformers/all-mpnet-base-v2"
47
+ },
48
+ "paper_url":"https://arxiv.org/abs/1908.10084",
49
+ "mark":"True",
50
+ "class":"HFModel"},
51
+ { "name":"sentence-transformers/all-MiniLM-L12-v2",
52
+ "model":"sentence-transformers/all-MiniLM-L12-v2",
53
+ "fork_url":"https://github.com/taskswithcode/sentence_similarity_hf_model",
54
+ "orig_author_url":"https://github.com/UKPLab",
55
+ "orig_author":"Ubiquitous Knowledge Processing Lab",
56
+ "sota_info": {
57
+ "task":"Over 500,000 downloads from huggingface",
58
+ "sota_link":"https://huggingface.co/sentence-transformers/all-MiniLM-L12-v2"
59
+ },
60
+ "paper_url":"https://arxiv.org/abs/1908.10084",
61
+ "mark":"True",
62
+ "class":"HFModel"},
63
+
64
+ { "name":"SGPT-125M",
65
+ "model":"Muennighoff/SGPT-125M-weightedmean-nli-bitfit",
66
+ "fork_url":"https://github.com/taskswithcode/sgpt",
67
+ "orig_author_url":"https://github.com/Muennighoff",
68
+ "orig_author":"Niklas Muennighoff",
69
+ "sota_info": {
70
+ "task":"#1 in multiple information retrieval & search tasks(smaller variant)",
71
+ "sota_link":"https://paperswithcode.com/paper/sgpt-gpt-sentence-embeddings-for-semantic"
72
+ },
73
+ "paper_url":"https://arxiv.org/abs/2202.08904v5",
74
+ "mark":"True",
75
+ "class":"SGPTModel"},
76
+ { "name":"SGPT-1.3B",
77
+ "model": "Muennighoff/SGPT-1.3B-weightedmean-msmarco-specb-bitfit",
78
+ "fork_url":"https://github.com/taskswithcode/sgpt",
79
+ "orig_author_url":"https://github.com/Muennighoff",
80
+ "orig_author":"Niklas Muennighoff",
81
+ "sota_info": {
82
+ "task":"#1 in multiple information retrieval & search tasks(smaller variant)",
83
+ "sota_link":"https://paperswithcode.com/paper/sgpt-gpt-sentence-embeddings-for-semantic"
84
+ },
85
+ "paper_url":"https://arxiv.org/abs/2202.08904v5",
86
+ "Note":"If this large model takes too long or fails to load , try this ",
87
+ "alt_url":"http://www.taskswithcode.com/sentence_similarity/",
88
+ "mark":"True",
89
+ "class":"SGPTModel"},
90
+ { "name":"SGPT-5.8B",
91
+ "model": "Muennighoff/SGPT-5.8B-weightedmean-msmarco-specb-bitfit" ,
92
+ "fork_url":"https://github.com/taskswithcode/sgpt",
93
+ "orig_author_url":"https://github.com/Muennighoff",
94
+ "orig_author":"Niklas Muennighoff",
95
+ "Note":"If this large model takes too long or fails to load , try this ",
96
+ "alt_url":"http://www.taskswithcode.com/sentence_similarity/",
97
+ "sota_info": {
98
+ "task":"#1 in multiple information retrieval & search tasks",
99
+ "sota_link":"https://paperswithcode.com/paper/sgpt-gpt-sentence-embeddings-for-semantic"
100
+ },
101
+ "paper_url":"https://arxiv.org/abs/2202.08904v5",
102
+ "mark":"True",
103
+ "class":"SGPTModel"},
104
+
105
+ { "name":"SIMCSE-large" ,
106
+ "model":"princeton-nlp/sup-simcse-roberta-large",
107
+ "fork_url":"https://github.com/taskswithcode/SimCSE",
108
+ "orig_author_url":"https://github.com/princeton-nlp",
109
+ "orig_author":"Princeton Natural Language Processing",
110
+ "Note":"If this large model takes too long or fails to load , try this ",
111
+ "alt_url":"http://www.taskswithcode.com/sentence_similarity/",
112
+ "sota_info": {
113
+ "task":"Within top 10 in multiple semantic textual similarity tasks",
114
+ "sota_link":"https://paperswithcode.com/paper/simcse-simple-contrastive-learning-of"
115
+ },
116
+ "paper_url":"https://arxiv.org/abs/2104.08821v4",
117
+ "mark":"True",
118
+ "class":"SimCSEModel","sota_link":"https://paperswithcode.com/sota/semantic-textual-similarity-on-sick"},
119
+
120
+ { "name":"SIMCSE-base" ,
121
+ "model":"princeton-nlp/sup-simcse-roberta-base",
122
+ "fork_url":"https://github.com/taskswithcode/SimCSE",
123
+ "orig_author_url":"https://github.com/princeton-nlp",
124
+ "orig_author":"Princeton Natural Language Processing",
125
+ "sota_info": {
126
+ "task":"Within top 10 in multiple semantic textual similarity tasks(smaller variant)",
127
+ "sota_link":"https://paperswithcode.com/paper/simcse-simple-contrastive-learning-of"
128
+ },
129
+ "paper_url":"https://arxiv.org/abs/2104.08821v4",
130
+ "mark":"True",
131
+ "class":"SimCSEModel","sota_link":"https://paperswithcode.com/sota/semantic-textual-similarity-on-sick"}
132
+
133
+
134
+ ]
twc_embeddings.py ADDED
@@ -0,0 +1,407 @@
1
+ from transformers import AutoModel, AutoTokenizer
2
+ from transformers import AutoModelForCausalLM
3
+ from scipy.spatial.distance import cosine
4
+ import argparse
5
+ import json
6
+ import pdb
7
+ import torch
8
+ import torch.nn.functional as F
9
+
10
+ def read_text(input_file):
11
+ arr = open(input_file).read().split("\n")
12
+ return arr[:-1]
13
+
14
+
15
+ class CausalLMModel:
16
+ def __init__(self):
17
+ self.model = None
18
+ self.tokenizer = None
19
+ self.debug = False
20
+ print("In CausalLMModel Constructor")
21
+
22
+ def init_model(self,model_name = None):
23
+ # Get our models - The package will take care of downloading the models automatically
24
+ # For best performance: Muennighoff/SGPT-5.8B-weightedmean-nli-bitfit
25
+ if (self.debug):
26
+ print("Init model",model_name)
27
+ # For best performance: EleutherAI/gpt-j-6B
28
+ if (model_name is None):
29
+ model_name = "EleutherAI/gpt-neo-125M"
30
+ self.tokenizer = AutoTokenizer.from_pretrained(model_name)
31
+ self.model = AutoModelForCausalLM.from_pretrained(model_name)
32
+ self.model.eval()
33
+ self.prompt = 'Documents are searched to find matches with the same content.\nThe document "{}" is a good search result for "'
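+ # compute_embeddings below scores each document by the total log-probability the causal LM assigns to the query as a continuation of this prompt, so this class returns relevance scores rather than embeddings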
34
+
35
+ def compute_embeddings(self,input_data,is_file):
36
+ if (self.debug):
37
+ print("Computing embeddings for:", input_data[:20])
38
+ model = self.model
39
+ tokenizer = self.tokenizer
40
+
41
+ texts = read_text(input_data) if is_file else input_data
42
+ query = texts[0]
43
+ docs = texts[1:]
44
+
45
+ # Tokenize input texts
46
+
47
+ #print(f"Query: {query}")
48
+ scores = []
49
+ for doc in docs:
50
+ context = self.prompt.format(doc)
51
+
52
+ context_enc = tokenizer.encode(context, add_special_tokens=False)
53
+ continuation_enc = tokenizer.encode(query, add_special_tokens=False)
54
+ # Slice off the last token, as we take its probability from the one before
55
+ model_input = torch.tensor(context_enc+continuation_enc[:-1])
56
+ continuation_len = len(continuation_enc)
57
+ input_len, = model_input.shape
58
+
59
+ # [seq_len] -> [seq_len, vocab]
60
+ logprobs = torch.nn.functional.log_softmax(model(model_input)[0], dim=-1).cpu()
61
+ # [seq_len, vocab] -> [continuation_len, vocab]
62
+ logprobs = logprobs[input_len-continuation_len:]
63
+ # Gather the log probabilities of the continuation tokens -> [continuation_len]
64
+ logprobs = torch.gather(logprobs, 1, torch.tensor(continuation_enc).unsqueeze(-1)).squeeze(-1)
65
+ score = torch.sum(logprobs)
66
+ scores.append(score.tolist())
67
+ return texts,scores
68
+
69
+ def output_results(self,output_file,texts,scores,main_index = 0):
70
+ cosine_dict = {}
71
+ docs = texts[1:]
72
+ if (self.debug):
73
+ print("Total sentences",len(texts))
74
+ assert(len(scores) == len(docs))
75
+ for i in range(len(docs)):
76
+ cosine_dict[docs[i]] = scores[i]
77
+
78
+ if (self.debug):
79
+ print("Input sentence:",texts[main_index])
80
+ sorted_dict = dict(sorted(cosine_dict.items(), key=lambda item: item[1],reverse = True))
81
+ if (self.debug):
82
+ for key in sorted_dict:
83
+ print("Document score for \"%s\" is: %.3f" % (key[:100], sorted_dict[key]))
84
+ if (output_file is not None):
85
+ with open(output_file,"w") as fp:
86
+ fp.write(json.dumps(sorted_dict,indent=0))
87
+ return sorted_dict
88
+
89
+
90
+ class SGPTQnAModel:
91
+ def __init__(self):
92
+ self.model = None
93
+ self.tokenizer = None
94
+ self.debug = False
95
+ print("In SGPT Q&A Constructor")
96
+
97
+
98
+ def init_model(self,model_name = None):
99
+ # Get our models - The package will take care of downloading the models automatically
100
+ # For best performance: Muennighoff/SGPT-5.8B-weightedmean-nli-bitfit
101
+ if (self.debug):
102
+ print("Init model",model_name)
103
+ if (model_name is None):
104
+ model_name = "Muennighoff/SGPT-125M-weightedmean-msmarco-specb-bitfit"
105
+ self.tokenizer = AutoTokenizer.from_pretrained(model_name)
106
+ self.model = AutoModel.from_pretrained(model_name)
107
+ self.model.eval()
108
+ self.SPECB_QUE_BOS = self.tokenizer.encode("[", add_special_tokens=False)[0]
109
+ self.SPECB_QUE_EOS = self.tokenizer.encode("]", add_special_tokens=False)[0]
110
+
111
+ self.SPECB_DOC_BOS = self.tokenizer.encode("{", add_special_tokens=False)[0]
112
+ self.SPECB_DOC_EOS = self.tokenizer.encode("}", add_special_tokens=False)[0]
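+ # SGPT search ("specb") models wrap queries in [ ] and documents in { } special-bracket tokens so the encoder can distinguish the two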
113
+
114
+
115
+ def tokenize_with_specb(self,texts, is_query):
116
+ # Tokenize without padding
117
+ batch_tokens = self.tokenizer(texts, padding=False, truncation=True)
118
+ # Add special brackets & pay attention to them
119
+ for seq, att in zip(batch_tokens["input_ids"], batch_tokens["attention_mask"]):
120
+ if is_query:
121
+ seq.insert(0, self.SPECB_QUE_BOS)
122
+ seq.append(self.SPECB_QUE_EOS)
123
+ else:
124
+ seq.insert(0, self.SPECB_DOC_BOS)
125
+ seq.append(self.SPECB_DOC_EOS)
126
+ att.insert(0, 1)
127
+ att.append(1)
128
+ # Add padding
129
+ batch_tokens = self.tokenizer.pad(batch_tokens, padding=True, return_tensors="pt")
130
+ return batch_tokens
131
+
132
+ def get_weightedmean_embedding(self,batch_tokens, model):
133
+ # Get the embeddings
134
+ with torch.no_grad():
135
+ # Get hidden state of shape [bs, seq_len, hid_dim]
136
+ last_hidden_state = self.model(**batch_tokens, output_hidden_states=True, return_dict=True).last_hidden_state
137
+
138
+ # Get weights of shape [bs, seq_len, hid_dim]
139
+ weights = (
140
+ torch.arange(start=1, end=last_hidden_state.shape[1] + 1)
141
+ .unsqueeze(0)
142
+ .unsqueeze(-1)
143
+ .expand(last_hidden_state.size())
144
+ .float().to(last_hidden_state.device)
145
+ )
146
+
147
+ # Get attn mask of shape [bs, seq_len, hid_dim]
148
+ input_mask_expanded = (
149
+ batch_tokens["attention_mask"]
150
+ .unsqueeze(-1)
151
+ .expand(last_hidden_state.size())
152
+ .float()
153
+ )
154
+
155
+ # Perform weighted mean pooling across seq_len: bs, seq_len, hidden_dim -> bs, hidden_dim
156
+ sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded * weights, dim=1)
157
+ sum_mask = torch.sum(input_mask_expanded * weights, dim=1)
158
+
159
+ embeddings = sum_embeddings / sum_mask
160
+
161
+ return embeddings
162
+
163
+ def compute_embeddings(self,input_data,is_file):
164
+ if (self.debug):
165
+ print("Computing embeddings for:", input_data[:20])
166
+ model = self.model
167
+ tokenizer = self.tokenizer
168
+
169
+ texts = read_text(input_data) if is_file else input_data
170
+
171
+ queries = [texts[0]]
172
+ docs = texts[1:]
173
+ query_embeddings = self.get_weightedmean_embedding(self.tokenize_with_specb(queries, is_query=True), self.model)
174
+ doc_embeddings = self.get_weightedmean_embedding(self.tokenize_with_specb(docs, is_query=False), self.model)
175
+ return texts,(query_embeddings,doc_embeddings)
176
+
177
+
178
+
179
+ def output_results(self,output_file,texts,embeddings,main_index = 0):
180
+ # Calculate cosine similarities
181
+ # Cosine similarities are in [-1, 1]. Higher means more similar
182
+ query_embeddings = embeddings[0]
183
+ doc_embeddings = embeddings[1]
184
+ cosine_dict = {}
185
+ queries = [texts[0]]
186
+ docs = texts[1:]
187
+ if (self.debug):
188
+ print("Total sentences",len(texts))
189
+ for i in range(len(docs)):
190
+ cosine_dict[docs[i]] = 1 - cosine(query_embeddings[0], doc_embeddings[i])
191
+
192
+ if (self.debug):
193
+ print("Input sentence:",texts[main_index])
194
+ sorted_dict = dict(sorted(cosine_dict.items(), key=lambda item: item[1],reverse = True))
195
+ if (self.debug):
196
+ for key in sorted_dict:
197
+ print("Cosine similarity with \"%s\" is: %.3f" % (key, sorted_dict[key]))
198
+ if (output_file is not None):
199
+ with open(output_file,"w") as fp:
200
+ fp.write(json.dumps(sorted_dict,indent=0))
201
+ return sorted_dict
202
+
203
+
204
+ class SimCSEModel:
205
+ def __init__(self):
206
+ self.model = None
207
+ self.tokenizer = None
208
+ self.debug = False
209
+ print("In SimCSE constructor")
210
+
211
+ def init_model(self,model_name = None):
212
+ if (model_name is None):
213
+ model_name = "princeton-nlp/sup-simcse-roberta-large"
214
+ #self.model = SimCSE(model_name)
215
+ self.tokenizer = AutoTokenizer.from_pretrained(model_name)
216
+ self.model = AutoModel.from_pretrained(model_name)
217
+
218
+ def compute_embeddings(self,input_data,is_file):
219
+ texts = read_text(input_data) if is_file else input_data
220
+ inputs = self.tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
221
+ with torch.no_grad():
222
+ embeddings = self.model(**inputs, output_hidden_states=True, return_dict=True).pooler_output
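+ # The pooled [CLS] output serves as the SimCSE sentence embedding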
223
+ return texts,embeddings
224
+
225
+ def output_results(self,output_file,texts,embeddings,main_index = 0):
226
+ # Calculate cosine similarities
227
+ # Cosine similarities are in [-1, 1]. Higher means more similar
228
+ cosine_dict = {}
229
+ #print("Total sentences",len(texts))
230
+ for i in range(len(texts)):
231
+ cosine_dict[texts[i]] = 1 - cosine(embeddings[main_index], embeddings[i])
232
+
233
+ #print("Input sentence:",texts[main_index])
234
+ sorted_dict = dict(sorted(cosine_dict.items(), key=lambda item: item[1],reverse = True))
235
+ if (self.debug):
236
+ for key in sorted_dict:
237
+ print("Cosine similarity with \"%s\" is: %.3f" % (key, sorted_dict[key]))
238
+ if (output_file is not None):
239
+ with open(output_file,"w") as fp:
240
+ fp.write(json.dumps(sorted_dict,indent=0))
241
+ return sorted_dict
242
+
243
+
244
+
245
+ class SGPTModel:
246
+ def __init__(self):
247
+ self.model = None
248
+ self.tokenizer = None
249
+ self.debug = False
250
+ print("In SGPT Constructor")
251
+
252
+
253
+ def init_model(self,model_name = None):
254
+ # Get our models - The package will take care of downloading the models automatically
255
+ # For best performance: Muennighoff/SGPT-5.8B-weightedmean-nli-bitfit
256
+ if (self.debug):
257
+ print("Init model",model_name)
258
+ if (model_name is None):
259
+ model_name = "Muennighoff/SGPT-125M-weightedmean-nli-bitfit"
260
+ self.tokenizer = AutoTokenizer.from_pretrained(model_name)
261
+ self.model = AutoModel.from_pretrained(model_name)
262
+ #self.tokenizer = AutoTokenizer.from_pretrained("Muennighoff/SGPT-1.3B-weightedmean-msmarco-specb-bitfit")
263
+ #self.model = AutoModel.from_pretrained("Muennighoff/SGPT-1.3B-weightedmean-msmarco-specb-bitfit")
264
+ #self.tokenizer = AutoTokenizer.from_pretrained("Muennighoff/SGPT-5.8B-weightedmean-msmarco-specb-bitfit")
265
+ #self.model = AutoModel.from_pretrained("Muennighoff/SGPT-5.8B-weightedmean-msmarco-specb-bitfit")
266
+ # Deactivate Dropout (There is no dropout in the above models so it makes no difference here but other SGPT models may have dropout)
267
+ self.model.eval()
268
+
269
+ def compute_embeddings(self,input_data,is_file):
270
+ if (self.debug):
271
+ print("Computing embeddings for:", input_data[:20])
272
+ model = self.model
273
+ tokenizer = self.tokenizer
274
+
275
+ texts = read_text(input_data) if is_file else input_data
276
+
277
+ # Tokenize input texts
278
+ batch_tokens = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
279
+
280
+ # Get the embeddings
281
+ with torch.no_grad():
282
+ # Get hidden state of shape [bs, seq_len, hid_dim]
283
+ last_hidden_state = model(**batch_tokens, output_hidden_states=True, return_dict=True).last_hidden_state
284
+
285
+ # Get weights of shape [bs, seq_len, hid_dim]
286
+ weights = (
287
+ torch.arange(start=1, end=last_hidden_state.shape[1] + 1)
288
+ .unsqueeze(0)
289
+ .unsqueeze(-1)
290
+ .expand(last_hidden_state.size())
291
+ .float().to(last_hidden_state.device)
292
+ )
293
+
294
+ # Get attn mask of shape [bs, seq_len, hid_dim]
295
+ input_mask_expanded = (
296
+ batch_tokens["attention_mask"]
297
+ .unsqueeze(-1)
298
+ .expand(last_hidden_state.size())
299
+ .float()
300
+ )
301
+
302
+ # Perform weighted mean pooling across seq_len: bs, seq_len, hidden_dim -> bs, hidden_dim
303
+ sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded * weights, dim=1)
304
+ sum_mask = torch.sum(input_mask_expanded * weights, dim=1)
305
+
306
+ embeddings = sum_embeddings / sum_mask
307
+ return texts,embeddings
308
+
309
+ def output_results(self,output_file,texts,embeddings,main_index = 0):
310
+ # Calculate cosine similarities
311
+ # Cosine similarities are in [-1, 1]. Higher means more similar
312
+ cosine_dict = {}
313
+ if (self.debug):
314
+ print("Total sentences",len(texts))
315
+ for i in range(len(texts)):
316
+ cosine_dict[texts[i]] = 1 - cosine(embeddings[main_index], embeddings[i])
317
+
318
+ if (self.debug):
319
+ print("Input sentence:",texts[main_index])
320
+ sorted_dict = dict(sorted(cosine_dict.items(), key=lambda item: item[1],reverse = True))
321
+ if (self.debug):
322
+ for key in sorted_dict:
323
+ print("Cosine similarity with \"%s\" is: %.3f" % (key, sorted_dict[key]))
324
+ if (output_file is not None):
325
+ with open(output_file,"w") as fp:
326
+ fp.write(json.dumps(sorted_dict,indent=0))
327
+ return sorted_dict
328
+
329
+
330
+
331
+
332
+
333
+ class HFModel:
334
+ def __init__(self):
335
+ self.model = None
336
+ self.tokenizer = None
337
+ self.debug = False
338
+ print("In HF Constructor")
339
+
340
+
341
+ def init_model(self,model_name = None):
342
+ # Get our models - The package will take care of downloading the models automatically
343
+ # For best performance: Muennighoff/SGPT-5.8B-weightedmean-nli-bitfit
344
+ #print("Init model",model_name)
345
+ if (model_name is None):
346
+ model_name = "sentence-transformers/all-MiniLM-L6-v2"
347
+ self.tokenizer = AutoTokenizer.from_pretrained(model_name)
348
+ self.model = AutoModel.from_pretrained(model_name)
349
+ self.model.eval()
350
+
351
+ def mean_pooling(self,model_output, attention_mask):
352
+ token_embeddings = model_output[0] #First element of model_output contains all token embeddings
353
+ input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
354
+ return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
355
+
356
+ def compute_embeddings(self,input_data,is_file):
357
+ #print("Computing embeddings for:", input_data[:20])
358
+ model = self.model
359
+ tokenizer = self.tokenizer
360
+
361
+ texts = read_text(input_data) if is_file else input_data
362
+
363
+ encoded_input = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')
364
+
365
+ # Compute token embeddings
366
+ with torch.no_grad():
367
+ model_output = model(**encoded_input)
368
+
369
+ # Perform pooling
370
+ sentence_embeddings = self.mean_pooling(model_output, encoded_input['attention_mask'])
371
+
372
+ # Normalize embeddings
373
+ sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
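+ # After L2 normalization, the cosine similarity computed downstream is equivalent to a dot product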
374
+
375
+ return texts,sentence_embeddings
376
+
377
+ def output_results(self,output_file,texts,embeddings,main_index = 0):
378
+ # Calculate cosine similarities
379
+ # Cosine similarities are in [-1, 1]. Higher means more similar
380
+ cosine_dict = {}
381
+ #print("Total sentences",len(texts))
382
+ for i in range(len(texts)):
383
+ cosine_dict[texts[i]] = 1 - cosine(embeddings[main_index], embeddings[i])
384
+
385
+ #print("Input sentence:",texts[main_index])
386
+ sorted_dict = dict(sorted(cosine_dict.items(), key=lambda item: item[1],reverse = True))
387
+ if (self.debug):
388
+ for key in sorted_dict:
389
+ print("Cosine similarity with \"%s\" is: %.3f" % (key, sorted_dict[key]))
390
+ if (output_file is not None):
391
+ with open(output_file,"w") as fp:
392
+ fp.write(json.dumps(sorted_dict,indent=0))
393
+ return sorted_dict
394
+
395
+
396
+
397
+ if __name__ == '__main__':
398
+ parser = argparse.ArgumentParser(description='SGPT model for sentence embeddings ',formatter_class=argparse.ArgumentDefaultsHelpFormatter)
399
+ parser.add_argument('-input', action="store", dest="input",required=True,help="Input file with sentences")
400
+ parser.add_argument('-output', action="store", dest="output",default="output.txt",help="Output file with results")
401
+ parser.add_argument('-model', action="store", dest="model",default="sentence-transformers/all-MiniLM-L6-v2",help="model name")
402
+
403
+ results = parser.parse_args()
404
+ obj = HFModel()
405
+ obj.init_model(results.model)
406
+ texts, embeddings = obj.compute_embeddings(results.input,is_file = True)
407
+ results = obj.output_results(results.output,texts,embeddings)
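+ # Example invocation (assumed; input file and model are illustrative): python twc_embeddings.py -input imdb_sent.txt -model sentence-transformers/all-MiniLM-L6-v2 -output output.txt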
view_count.txt ADDED
@@ -0,0 +1 @@
1
+ 23