Spaces:

espejelomar
/

Detecting_vaccine_misinformation_with_Sentence_Embeddings

Build error

App Files Files Community

espejelomar commited on Aug 4, 2021

Commit

5df619b

•

1 Parent(s): bbee703

Add first version

Browse files

Files changed (20) hide show

README.md +4 -4
__init__.py +0 -0
app.py +160 -0
backend/__init__.py +0 -0
backend/__pycache__/__init__.cpython-36.pyc +0 -0
backend/__pycache__/__init__.cpython-38.pyc +0 -0
backend/__pycache__/config.cpython-36.pyc +0 -0
backend/__pycache__/config.cpython-38.pyc +0 -0
backend/__pycache__/inference.cpython-36.pyc +0 -0
backend/__pycache__/inference.cpython-38.pyc +0 -0
backend/__pycache__/utils.cpython-36.pyc +0 -0
backend/__pycache__/utils.cpython-38.pyc +0 -0
backend/config.py +14 -0
backend/inference.py +199 -0
backend/utils.py +46 -0
data/.DS_Store +0 -0
data/__init__.py +0 -0
data/stackoverflow-titles-distilbert-emb.csv +3 -0
data/stackoverflow-titles.jsonl.gz +3 -0
requirements.txt +10 -0

README.md CHANGED Viewed

@@ -1,8 +1,8 @@
 ---
-title: Detecting_vaccine_misinformation_with_Sentence_Embeddings
-emoji: 😻
-colorFrom: gray
-colorTo: gray
 sdk: streamlit
 app_file: app.py
 pinned: false

 ---
+title: Sentence Embeddings
+emoji: 🔥
+colorFrom: yellow
+colorTo: purple
 sdk: streamlit
 app_file: app.py
 pinned: false

__init__.py ADDED Viewed

File without changes

app.py ADDED Viewed

	@@ -0,0 +1,160 @@

+import streamlit as st
+import pandas as pd
+from backend import inference
+from backend.config import MODELS_ID, QA_MODELS_ID, SEARCH_MODELS_ID
+# Page title and sidebar navigation. `menu` holds the task picked in the sidebar
+# radio; the if/elif chain below renders the matching page.
+st.title('Demo using Flax-Sentence-Tranformers')
+st.sidebar.title('Tasks')
+menu = st.sidebar.radio("", options=["Sentence Similarity", "Asymmetric QA", "Search / Cluster", 'Identifying misleading vaccine texts'], index=0)
+# Introductory blurb rendered above whichever task page is selected.
+st.markdown('''
+Hi! This is the demo for the [flax sentence embeddings](https://huggingface.co/flax-sentence-embeddings) created for the **Flax/JAX community week 🤗**.
+We trained three general-purpose flax-sentence-embeddings models: a **distilroberta base**, a **mpnet base** and a **minilm-l6**.
+All were trained on all the dataset of the 1B+ train corpus with the v3 setup.
+In addition, we trained 20 models focused on general-purpose, QuestionAnswering and Codesearch.
+View our models here : https://huggingface.co/flax-sentence-embeddings
+''')
+if menu == "Sentence Similarity":
+    st.header('Sentence Similarity')
+    st.markdown('''
+**Instructions**: You can compare the similarity of a main text with other texts of your choice. In the background, we'll create an embedding for each text, and then we'll use the cosine similarity function to calculate a similarity metric between our main sentence and the others.
+For more cool information on sentence embeddings, see the [sBert project](https://www.sbert.net/examples/applications/computing-embeddings/README.html).
+''')
+    select_models = st.multiselect("Choose models", options=list(MODELS_ID), default=list(MODELS_ID)[0])
+    anchor = st.text_input(
+        'Please enter here the main text you want to compare:'
+    )
+    n_texts = st.number_input(
+        f'''How many texts you want to compare with: '{anchor}'?''',
+        value=2,
+        min_value=2)
+    inputs = []
+    for i in range(int(n_texts)):
+        input = st.text_input(f'Text {i + 1}:')
+        inputs.append(input)
+    if st.button('Tell me the similarity.'):
+        results = {model: inference.text_similarity(anchor, inputs, model, MODELS_ID) for model in select_models}
+        df_results = {model: results[model] for model in results}
+        index = [f"{idx + 1}:{input[:min(15, len(input))]}..." for idx, input in enumerate(inputs)]
+        df_total = pd.DataFrame(index=index)
+        for key, value in df_results.items():
+            df_total[key] = list(value['score'].values)
+        st.write('Here are the results for selected models:')
+        st.write(df_total)
+        st.write('Visualize the results of each model:')
+        st.line_chart(df_total)
+elif menu == "Asymmetric QA":
+    st.header('Asymmetric QA')
+    st.markdown('''
+**Instructions**: You can compare the Answer likeliness of a given Query with answer candidates of your choice. In the background, we'll create an embedding for each answers, and then we'll use the cosine similarity function to calculate a similarity metric between our query sentence and the others.
+`mpnet_asymmetric_qa` model works best for hard negative answers or distinguishing similar queries due to separate models applied for encoding questions and answers.
+For more cool information on sentence embeddings, see the [sBert project](https://www.sbert.net/examples/applications/computing-embeddings/README.html).
+''')
+    select_models = st.multiselect("Choose models", options=list(QA_MODELS_ID), default=list(QA_MODELS_ID)[0])
+    anchor = st.text_input(
+        'Please enter here the query you want to compare with given answers:',
+        value="What is the weather in Paris?"
+    )
+    n_texts = st.number_input(
+        f'''How many answers you want to compare with: '{anchor}'?''',
+        value=10,
+        min_value=2)
+    inputs = []
+    defaults = ["It is raining in Paris right now with 70 F temperature.", "What is the weather in Berlin?", "I have 3 brothers."]
+    for i in range(int(n_texts)):
+        input = st.text_input(f'Answer {i + 1}:', value=defaults[i] if i < len(defaults) else "")
+        inputs.append(input)
+    if st.button('Tell me Answer likeliness.'):
+        results = {model: inference.text_similarity(anchor, inputs, model, QA_MODELS_ID) for model in select_models}
+        df_results = {model: results[model] for model in results}
+        index = [f"{idx + 1}:{input[:min(15, len(input))]}..." for idx, input in enumerate(inputs)]
+        df_total = pd.DataFrame(index=index)
+        for key, value in df_results.items():
+            df_total[key] = list(value['score'].values)
+        st.write('Here are the results for selected models:')
+        st.write(df_total)
+        st.write('Visualize the results of each model:')
+        st.line_chart(df_total)
+elif menu == "Search / Cluster":
+    st.header('Search / Cluster')
+    st.markdown('''
+**Instructions**: Make a query for anything related to "Python" and the model you choose will return you similar queries.
+For more cool information on sentence embeddings, see the [sBert project](https://www.sbert.net/examples/applications/computing-embeddings/README.html).
+''')
+    select_models = st.multiselect("Choose models", options=list(SEARCH_MODELS_ID), default=list(SEARCH_MODELS_ID)[0])
+    anchor = st.text_input(
+        'Please enter here your query about "Python", we will look for similar ones:',
+        value="How do I sort a dataframe by column"
+    )
+    n_texts = st.number_input(
+        f'''How many similar queries you want?''',
+        value=3,
+        min_value=2)
+    if st.button('Give me my search.'):
+        results = {model: inference.text_search(anchor, n_texts, model, QA_MODELS_ID) for model in select_models}
+        st.table(pd.DataFrame(results[select_models[0]]).T)
+    if st.button('3D Clustering of search result using T-SNE on generated embeddings'):
+        st.write("Currently only works at local due to Spaces / plotly integration.")
+        st.write("Demonstration : https://gyazo.com/1ff0aa438ae533de3b3c63382af7fe80")
+        # fig = inference.text_cluster(anchor, 1000, select_models[0], QA_MODELS_ID)
+        # fig.show()
+elif menu == "Identifying misleading vaccine texts":
+    st.header('Identifying misleading vaccine texts')
+    st.markdown('''
+**Instructions**: You can compare the similarity  of a given text and key words that identify 'misleading' texts regarding vaccination. In the background, we'll create an embedding for each text, and then we'll use the cosine similarity function to calculate a similarity metric between our main sentence and the keywords.
+We use keywords identified by **Muric, Goran and Wu, Yusong and Ferrara, Emilio (2021), 'COVID-19 Vaccine Hesitancy on Social Media: Building a Public Twitter Dataset of Anti-vaccine Content, Vaccine Misinformation and Conspiracies'**
+For more cool information on sentence embeddings, see the [sBert project](https://www.sbert.net/examples/applications/computing-embeddings/README.html).
+''')
+    select_models = st.multiselect("Choose models", options=list(MODELS_ID), default=list(MODELS_ID)[0])
+    anchor = st.text_input(
+        'Please enter here the text/tweet you want to evaluate:'
+    )
+    if st.button('Tell me the similarity.'):
+        results = {model: inference.tweets_vaccine(anchor, model, MODELS_ID) for model in select_models}
+        df_results = {model: results[model] for model in results}
+        #index = [f"{idx + 1}:{input[:min(15, len(input))]}..." for idx, input in enumerate(inputs)]
+        df_total = pd.DataFrame(index=[0])
+        for key, value in df_results.items():
+            df_total[key] = list(value['score'].values)
+        st.write('Here are the results for selected models:')
+        st.write(df_total)

backend/__init__.py ADDED Viewed

File without changes

backend/__pycache__/__init__.cpython-36.pyc ADDED Viewed

Binary file (159 Bytes). View file

backend/__pycache__/__init__.cpython-38.pyc ADDED Viewed

Binary file (152 Bytes). View file

backend/__pycache__/config.cpython-36.pyc ADDED Viewed

Binary file (737 Bytes). View file

backend/__pycache__/config.cpython-38.pyc ADDED Viewed

Binary file (738 Bytes). View file

backend/__pycache__/inference.cpython-36.pyc ADDED Viewed

Binary file (2.2 kB). View file

backend/__pycache__/inference.cpython-38.pyc ADDED Viewed

Binary file (5.38 kB). View file

backend/__pycache__/utils.cpython-36.pyc ADDED Viewed

Binary file (1.54 kB). View file

backend/__pycache__/utils.cpython-38.pyc ADDED Viewed

Binary file (1.56 kB). View file

backend/config.py ADDED Viewed

	@@ -0,0 +1,14 @@

+# Registries mapping a short display name to the model id(s) that
+# backend.utils.load_model passes to SentenceTransformer.
+# A value is either one id (a single encoder) or a list of ids (one encoder per id).
+#
+# Models offered on the "Sentence Similarity" and vaccine-text pages.
+# NOTE(review): the `distilroberta` key points at a codesearch checkpoint, while the
+# app intro describes a general-purpose distilroberta model -- confirm this id is intended.
+MODELS_ID = dict(distilroberta = 'flax-sentence-embeddings/st-codesearch-distilroberta-base',
+                 mpnet = 'flax-sentence-embeddings/all_datasets_v3_mpnet-base',
+                 minilm_l6 = 'flax-sentence-embeddings/all_datasets_v3_MiniLM-L6')
+# Models offered on the "Asymmetric QA" page.
+# `mpnet_asymmetric_qa` is a pair; text_similarity encodes the query with the first
+# entry and the candidate answers with the second.
+QA_MODELS_ID = dict(
+    mpnet_asymmetric_qa = ['flax-sentence-embeddings/multi-QA_v1-mpnet-asymmetric-Q',
+                           'flax-sentence-embeddings/multi-QA_v1-mpnet-asymmetric-A'],
+    mpnet_qa='flax-sentence-embeddings/mpnet_stackexchange_v1',
+    distilbert_qa = 'flax-sentence-embeddings/multi-qa_v1-distilbert-cls_dot'
+)
+# Models offered on the "Search / Cluster" page; text_search and text_cluster
+# only accept the `distilbert_qa` name.
+SEARCH_MODELS_ID = dict(
+    distilbert_qa = 'flax-sentence-embeddings/multi-qa_v1-distilbert-cls_dot'
+)

backend/inference.py ADDED Viewed

	@@ -0,0 +1,199 @@

+import gzip
+import json
+from collections import Counter
+import pandas as pd
+import numpy as np
+import jax.numpy as jnp
+import tqdm
+from sentence_transformers import util
+from typing import List, Union
+import torch
+from backend.utils import load_model, filter_questions, load_embeddings
+from sklearn.manifold import TSNE
+def cos_sim(a, b):
+    return jnp.matmul(a, jnp.transpose(b)) / (jnp.linalg.norm(a) * jnp.linalg.norm(b))
+# We get similarity between embeddings.
+def text_similarity(anchor: str, inputs: List[str], model_name: str, model_dict: dict):
+    print(model_name)
+    model = load_model(model_name, model_dict)
+    # Creating embeddings
+    if hasattr(model, 'encode'):
+        anchor_emb = model.encode(anchor)[None, :]
+        inputs_emb = model.encode(inputs)
+    else:
+        assert len(model) == 2
+        anchor_emb = model[0].encode(anchor)[None, :]
+        inputs_emb = model[1].encode(inputs)
+    # Obtaining similarity
+    similarity = list(jnp.squeeze(cos_sim(anchor_emb, inputs_emb)))
+    # Returning a Pandas' dataframe
+    d = {'inputs': inputs,
+         'score': [round(similarity[i], 3) for i in range(len(similarity))]}
+    df = pd.DataFrame(d, columns=['inputs', 'score'])
+    return df
+# Search
+def text_search(anchor: str, n_answers: int, model_name: str, model_dict: dict):
+    # Proceeding with model
+    print(model_name)
+    assert model_name == "distilbert_qa"
+    model = load_model(model_name, model_dict)
+    # Creating embeddings
+    query_emb = model.encode(anchor, convert_to_tensor=True)[None, :]
+    print("loading embeddings")
+    corpus_emb = load_embeddings()
+    # Getting hits
+    hits = util.semantic_search(query_emb, corpus_emb, score_function=util.dot_score, top_k=n_answers)[0]
+    filtered_posts = filter_questions("python")
+    print(f"{len(filtered_posts)} posts found with tag: python")
+    hits_titles = []
+    hits_scores = []
+    urls = []
+    for hit in hits:
+        post = filtered_posts[hit['corpus_id']]
+        hits_titles.append(post['title'])
+        hits_scores.append("{:.3f}".format(hit['score']))
+        urls.append(f"https://stackoverflow.com/q/{post['id']}")
+    return hits_titles, hits_scores, urls
+def text_cluster(anchor: str, n_answers: int, model_name: str, model_dict: dict):
+    # Proceeding with model
+    print(model_name)
+    assert model_name == "distilbert_qa"
+    model = load_model(model_name, model_dict)
+    # Creating embeddings
+    query_emb = model.encode(anchor, convert_to_tensor=True)[None, :]
+    print("loading embeddings")
+    corpus_emb = load_embeddings()
+    # Getting hits
+    hits = util.semantic_search(query_emb, corpus_emb, score_function=util.dot_score, top_k=n_answers)[0]
+    filtered_posts = filter_questions("python")
+    hits_dict = [filtered_posts[hit['corpus_id']] for hit in hits]
+    hits_dict.append(dict(id = '1', title = anchor, tags = ['']))
+    hits_emb = torch.stack([corpus_emb[hit['corpus_id']] for hit in hits])
+    hits_emb = torch.cat((hits_emb, query_emb))
+    # Dimensionality reduction with t-SNE
+    tsne = TSNE(n_components=3, verbose=1, perplexity=15, n_iter=1000)
+    tsne_results = tsne.fit_transform(hits_emb.cpu())
+    df = pd.DataFrame(hits_dict)
+    tags = list(df['tags'])
+    counter = Counter(tags[0])
+    for i in tags[1:]:
+        counter.update(i)
+    df_tags = pd.DataFrame(counter.most_common(), columns=['Tag', 'Mentions'])
+    most_common_tags = list(df_tags['Tag'])[1:5]
+    labels = []
+    for tags_list in list(df['tags']):
+        for common_tag in most_common_tags:
+            if common_tag in tags_list:
+                labels.append(common_tag)
+                break
+            elif common_tag != most_common_tags[-1]:
+                continue
+            else:
+                labels.append('others')
+    df['title'] = [post['title'] for post in hits_dict]
+    df['labels'] = labels
+    df['tsne_x'] = tsne_results[:, 0]
+    df['tsne_y'] = tsne_results[:, 1]
+    df['tsne_z'] = tsne_results[:, 2]
+    df['size'] = [2 for i in range(len(df))]
+    # Making the query bigger than the rest of the observations
+    df['size'][len(df) - 1] = 10
+    df['labels'][len(df) - 1] = 'QUERY'
+    import plotly.express as px
+    fig = px.scatter_3d(df, x='tsne_x', y='tsne_y', z='tsne_z', color='labels', size='size',
+                        color_discrete_sequence=px.colors.qualitative.D3, hover_data=[df.title])
+    return fig
+# We get similarity between embeddings.
+def tweets_vaccine(anchor: str, model_name: str, model_dict: dict):
+    print(model_name)
+    model = load_model(model_name, model_dict)
+    # Keywords common in disinformation tweets
+    keywords = '''abolish big pharma,
+    no forced flu shots,
+    antivaccine,
+    No Forced Vaccines,
+    Arrest Bill Gates,
+    not mandatory vaccines,
+    No Vaccine,
+    big pharma mafia,
+    No Vaccine For Me,
+    big pharma kills,
+    no vaccine mandates,
+    parents over pharma,
+    say no to vaccines,
+    stop mandatory vaccination,
+    vaccines are poison,
+    learn the risk,
+    vaccines cause,
+    medical freedom,
+    vaccines kill,
+    medical freedom of choice,
+    vaxxed,
+    my body my choice,
+    vaccines have very dangerous consequences,
+    Vaccines harm your organism'''
+    # Creating embeddings
+    if hasattr(model, 'encode'):
+        anchor_emb = model.encode(anchor)[None, :]
+        inputs_emb = model.encode(keywords)
+    else:
+        assert len(model) == 2
+        anchor_emb = model[0].encode(anchor)[None, :]
+        inputs_emb = model[1].encode(keywords)
+    # Obtaining similarity
+    similarity = jnp.squeeze(jnp.matmul(anchor_emb, jnp.transpose(inputs_emb)) / (jnp.linalg.norm(anchor_emb) * jnp.linalg.norm(inputs_emb))).tolist()
+    # Returning a Pandas' dataframe
+    d = dict(tweet = anchor,
+             score = [round(similarity, 3)])
+    df = pd.DataFrame(d, columns=['tweet', 'score'])
+    return df

backend/utils.py ADDED Viewed

	@@ -0,0 +1,46 @@

+import gzip
+import json
+import numpy as np
+import streamlit as st
+import torch
+import tqdm
+from sentence_transformers import SentenceTransformer
+@st.cache(allow_output_mutation=True)
+def load_model(model_name, model_dict):
+    assert model_name in model_dict.keys()
+    # Lazy downloading
+    model_ids = model_dict[model_name]
+    if type(model_ids) == str:
+        output = SentenceTransformer(model_ids)
+    elif hasattr(model_ids, '__iter__'):
+        output = [SentenceTransformer(name) for name in model_ids]
+    return output
+@st.cache(allow_output_mutation=True)
+def load_embeddings():
+    # embedding pre-generated
+    corpus_emb = torch.from_numpy(np.loadtxt('./data/stackoverflow-titles-distilbert-emb.csv', max_rows=10000))
+    return corpus_emb.float()
+@st.cache(allow_output_mutation=True)
+def filter_questions(tag, max_questions=10000):
+    posts = []
+    max_posts = 6e6
+    with gzip.open("./data/stackoverflow-titles.jsonl.gz", "rt") as fIn:
+        for line in tqdm.auto.tqdm(fIn, total=max_posts, desc="Load data"):
+            posts.append(json.loads(line))
+            if len(posts) >= max_posts:
+                break
+    filtered_posts = []
+    for post in posts:
+        if tag in post["tags"]:
+            filtered_posts.append(post)
+            if len(filtered_posts) >= max_questions:
+                break
+    return filtered_posts

data/.DS_Store ADDED Viewed

Binary file (6.15 kB). View file

data/__init__.py ADDED Viewed

File without changes

data/stackoverflow-titles-distilbert-emb.csv ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8f54b58e7835fac510ef46b8ba38c58c9942d769cace977e42a3bb274344ee9f
+size 3916646328

data/stackoverflow-titles.jsonl.gz ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:de83e615c8200395a6c32c9e0fc34cc0ca72d0207a65150d27a4a168ec52dbab
+size 426579165

requirements.txt ADDED Viewed

	@@ -0,0 +1,10 @@

+sentence_transformers
+pandas
+jax
+jaxlib
+streamlit
+numpy
+torch
+scikit-learn
+plotly
+matplotlib