|
import gzip |
|
import json |
|
from collections import Counter |
|
|
|
import pandas as pd |
|
import numpy as np |
|
import jax.numpy as jnp |
|
import tqdm |
|
|
|
from sentence_transformers import util |
|
from typing import List, Union |
|
import torch |
|
|
|
from backend.utils import load_model, filter_questions, load_embeddings |
|
from sklearn.manifold import TSNE |
|
|
|
def cos_sim(a, b):
    """Pairwise cosine similarity between two batches of embeddings.

    Args:
        a: Array of shape (m, d) — or (d,) — of query embeddings.
        b: Array of shape (n, d) — or (d,) — of candidate embeddings.

    Returns:
        (m, n) array where entry (i, j) is the cosine similarity between
        row i of ``a`` and row j of ``b``.

    Fix: the previous version divided by the *global* norms of ``a`` and
    ``b``, which is only correct when each argument holds a single vector;
    with a batch of candidates every score was scaled by the combined norm
    of all rows instead of each row's own norm.
    """
    a_unit = a / jnp.linalg.norm(a, axis=-1, keepdims=True)
    b_unit = b / jnp.linalg.norm(b, axis=-1, keepdims=True)
    return jnp.matmul(a_unit, jnp.transpose(b_unit))
|
|
|
|
|
|
|
def text_similarity(anchor: str, inputs: List[str], model_name: str, model_dict: dict):
    """Score each candidate sentence in ``inputs`` against ``anchor``.

    Args:
        anchor: Query sentence to compare against.
        inputs: Candidate sentences to score.
        model_name: Key used to look the encoder up via ``load_model``.
        model_dict: Model registry passed through to ``load_model``.

    Returns:
        pd.DataFrame with columns ``inputs`` (the candidates) and ``score``
        (cosine similarity rounded to 3 decimals).
    """
    print(model_name)
    model = load_model(model_name, model_dict)

    if hasattr(model, 'encode'):
        # Single bi-encoder: the same model embeds both query and candidates.
        anchor_emb = model.encode(anchor)[None, :]
        inputs_emb = model.encode(inputs)
    else:
        # Asymmetric pair: separate query / candidate encoders.
        assert len(model) == 2
        anchor_emb = model[0].encode(anchor)[None, :]
        inputs_emb = model[1].encode(inputs)

    # Take row 0 of the (1, n) similarity matrix instead of squeezing:
    # jnp.squeeze on a single candidate yields a 0-d array, which the old
    # list(...) call could not iterate over.
    similarity = cos_sim(anchor_emb, inputs_emb)[0]

    d = {'inputs': inputs,
         # float(...) converts jnp scalars so the DataFrame holds plain floats.
         'score': [round(float(s), 3) for s in similarity]}
    df = pd.DataFrame(d, columns=['inputs', 'score'])

    return df
|
|
|
|
|
|
|
def text_search(anchor: str, n_answers: int, model_name: str, model_dict: dict):
    """Semantic search of ``anchor`` over the pre-computed corpus embeddings.

    Args:
        anchor: Query sentence.
        n_answers: Number of top hits to return.
        model_name: Must be "distilbert_qa" (asserted below).
        model_dict: Model registry passed through to ``load_model``.

    Returns:
        Three parallel lists for the top hits: titles, scores formatted to
        3 decimals, and Stack Overflow question URLs.
    """
    print(model_name)
    assert model_name == "distilbert_qa"
    model = load_model(model_name, model_dict)

    query_emb = model.encode(anchor, convert_to_tensor=True)[None, :]

    print("loading embeddings")
    corpus_emb = load_embeddings()

    # Dot-product search over the whole corpus; [0] selects the single query's hits.
    hits = util.semantic_search(query_emb, corpus_emb, score_function=util.dot_score, top_k=n_answers)[0]

    filtered_posts = filter_questions("python")
    print(f"{len(filtered_posts)} posts found with tag: python")

    titles = []
    scores = []
    links = []
    for match in hits:
        question = filtered_posts[match['corpus_id']]
        titles.append(question['title'])
        scores.append(f"{match['score']:.3f}")
        links.append(f"https://stackoverflow.com/q/{question['id']}")

    return titles, scores, links
|
|
|
|
|
def text_cluster(anchor: str, n_answers: int, model_name: str, model_dict: dict):
    """Retrieve the top ``n_answers`` hits for ``anchor`` and plot them in 3-D.

    The query embedding plus the hit embeddings are projected with t-SNE,
    each point is labelled with the first of the corpus' most common tags it
    carries ('others' otherwise, 'QUERY' for the anchor), and a plotly 3-D
    scatter figure is returned.

    Args:
        anchor: Query sentence.
        n_answers: Number of top hits to cluster.
        model_name: Must be "distilbert_qa" (asserted below).
        model_dict: Model registry passed through to ``load_model``.

    Returns:
        plotly.graph_objects.Figure with the 3-D scatter plot.
    """
    print(model_name)
    assert model_name == "distilbert_qa"
    model = load_model(model_name, model_dict)

    query_emb = model.encode(anchor, convert_to_tensor=True)[None, :]

    print("loading embeddings")
    corpus_emb = load_embeddings()

    hits = util.semantic_search(query_emb, corpus_emb, score_function=util.dot_score, top_k=n_answers)[0]

    filtered_posts = filter_questions("python")

    # Posts for each hit, plus a pseudo-post so the query itself appears in the plot.
    hits_dict = [filtered_posts[hit['corpus_id']] for hit in hits]
    hits_dict.append(dict(id='1', title=anchor, tags=['']))

    hits_emb = torch.stack([corpus_emb[hit['corpus_id']] for hit in hits])
    hits_emb = torch.cat((hits_emb, query_emb))

    # NOTE(review): `n_iter` was renamed `max_iter` in scikit-learn 1.5 —
    # confirm against the pinned sklearn version.
    tsne = TSNE(n_components=3, verbose=1, perplexity=15, n_iter=1000)
    tsne_results = tsne.fit_transform(hits_emb.cpu())
    df = pd.DataFrame(hits_dict)

    # Tag frequencies across all points (query row included).
    counter = Counter()
    for post_tags in df['tags']:
        counter.update(post_tags)

    df_tags = pd.DataFrame(counter.most_common(), columns=['Tag', 'Mentions'])
    # Skip the single most common tag (index 0) — presumably 'python', shared
    # by every filtered post — and keep the next four as cluster labels.
    most_common_tags = list(df_tags['Tag'])[1:5]

    # First matching common tag wins; 'others' when none match. (The old
    # for/elif/continue/else loop also appended nothing at all when
    # most_common_tags was empty, breaking the column assignment below.)
    labels = [next((tag for tag in most_common_tags if tag in tags_list), 'others')
              for tags_list in df['tags']]

    df['title'] = [post['title'] for post in hits_dict]
    df['labels'] = labels
    df['tsne_x'] = tsne_results[:, 0]
    df['tsne_y'] = tsne_results[:, 1]
    df['tsne_z'] = tsne_results[:, 2]

    df['size'] = 2

    # The appended query row gets a bigger marker and its own label.
    # Use .loc instead of chained indexing (df['size'][i] = ...), which
    # raises SettingWithCopyWarning and silently fails under pandas
    # copy-on-write semantics.
    df.loc[len(df) - 1, 'size'] = 10
    df.loc[len(df) - 1, 'labels'] = 'QUERY'

    import plotly.express as px

    fig = px.scatter_3d(df, x='tsne_x', y='tsne_y', z='tsne_z', color='labels', size='size',
                        color_discrete_sequence=px.colors.qualitative.D3, hover_data=[df.title])
    return fig
|
|
|
|
|
|
|
|
|
def tweets_vaccine(anchor: str, model_name: str, model_dict: dict):
    """Score a tweet's similarity to a fixed block of anti-vaccine keywords.

    Note: ``keywords`` is a single multiline string, so ``model.encode``
    produces ONE embedding for the whole keyword block and a single cosine
    similarity score is computed — the keywords are NOT scored individually.

    Args:
        anchor: The tweet text to score.
        model_name: Key used to look the encoder up via ``load_model``.
        model_dict: Model registry passed through to ``load_model``.

    Returns:
        pd.DataFrame with columns ``tweet`` and ``score`` (one row; the
        cosine similarity rounded to 3 decimals).
    """
    print(model_name)
    model = load_model(model_name, model_dict)

    # Fixed anti-vaccine keyword/hashtag list, kept as one literal string.
    keywords = '''abolish big pharma,
no forced flu shots,
antivaccine,
No Forced Vaccines,
Arrest Bill Gates,
not mandatory vaccines,
No Vaccine,
big pharma mafia,
No Vaccine For Me,
big pharma kills,
no vaccine mandates,
parents over pharma,
say no to vaccines,
stop mandatory vaccination,
vaccines are poison,
learn the risk,
vaccines cause,
medical freedom,
vaccines kill,
medical freedom of choice,
vaxxed,
my body my choice,
vaccines have very dangerous consequences,
Vaccines harm your organism'''

    if hasattr(model, 'encode'):
        # Single bi-encoder: the same model embeds both tweet and keywords.
        anchor_emb = model.encode(anchor)[None, :]
        inputs_emb = model.encode(keywords)
    else:
        # Asymmetric pair: separate encoders for query and passage.
        assert len(model) == 2
        anchor_emb = model[0].encode(anchor)[None, :]
        inputs_emb = model[1].encode(keywords)

    # Inline cosine similarity; squeeze + tolist yields a plain Python float
    # because inputs_emb holds a single embedding.
    similarity = jnp.squeeze(jnp.matmul(anchor_emb, jnp.transpose(inputs_emb)) / (jnp.linalg.norm(anchor_emb) * jnp.linalg.norm(inputs_emb))).tolist()

    d = dict(tweet = anchor,
             score = [round(similarity, 3)])
    df = pd.DataFrame(d, columns=['tweet', 'score'])

    return df
|
|
|
|
|
|