import numpy as np
from sentence_transformers import SentenceTransformer, util
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
import re
# Download the NLTK data needed for tokenization and stopword filtering.
nltk.download('stopwords')
nltk.download('punkt')
STOP_WORDS = list(stopwords.words('english'))
BERTOPIC_REPRESENTATIONS = [
    "KeyBERTInspired",
    "MaximalMarginalRelevance",
]
TRANSFORMERS = ["all-mpnet-base-v2", "multi-qa-mpnet-base-dot-v1"]
TRANSFORMERS_INFO = ["all-mpnet-base-v2: All-round model tuned for many use cases. "
                     "Trained on a large and diverse dataset of over 1 billion training pairs.",
                     "multi-qa-mpnet-base-dot-v1: This model was tuned for semantic search: given a query/question, "
                     "it can find relevant passages. "
                     "It was trained on a large and diverse set of (question, answer) pairs."
                     ]
def get_bertopic_representation(representation: str):
    """
    get the BERTopic representation model matching the given name
    :param representation: one of BERTOPIC_REPRESENTATIONS
    :return: a representation model instance, or None for an unknown name
    """
    if representation == BERTOPIC_REPRESENTATIONS[0]:
        return KeyBERTInspired()
    elif representation == BERTOPIC_REPRESENTATIONS[1]:
        return MaximalMarginalRelevance()
    else:
        return None
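# Example (illustrative sketch, not part of this module's API): the returned
# model can be passed to BERTopic as its representation model, e.g.
#
#   representation = get_bertopic_representation("KeyBERTInspired")
#   topic_model = BERTopic(representation_model=representation)
#
# assuming `from bertopic import BERTopic` wherever this is used.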
def tokenize_explode(df, col):
    """
    tokenize the given column and explode the dataframe to one token per row
    :param df: source dataframe
    :param col: name of the text column to tokenize
    :return: exploded dataframe with a stripped, lowercased 'tokenized' column
    """
    df['tokenized'] = df[col].apply(word_tokenize)
    df = df.explode('tokenized')
    df['tokenized'] = df['tokenized'].str.strip().str.lower()
    return df
def cleanup_tokens(df, col):
    """
    drop low-value tokens: short tokens, plain numbers, punctuation and stopwords
    :param df: dataframe with one token per row
    :param col: name of the token column
    :return: filtered dataframe
    """
    df = df[df[col].apply(lambda x: len(x) > 2)]
    # Non-capturing pattern, so pandas does not warn about match groups.
    df = df[~df[col].str.contains(r'^\d+\.?\d*$', regex=True)]
    df = df[~df[col].isin(list(string.punctuation))]
    df = df[~df[col].isin(STOP_WORDS)]
    return df
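# Example (illustrative sketch; the DataFrame and column name are assumptions,
# with pandas imported as pd by the caller):
#
#   df = pd.DataFrame({'text': ["Topic modelling with 42 documents!"]})
#   tokens = cleanup_tokens(tokenize_explode(df, 'text'), 'tokenized')
#
# which keeps only the meaningful tokens "topic", "modelling" and "documents".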
def get_embedding_model(transformer=TRANSFORMERS[0]) -> SentenceTransformer:
    """
    load the given sentence-transformer model
    :param transformer: model name; defaults to the first entry of TRANSFORMERS
    :return: the loaded SentenceTransformer instance
    """
    sentence_model = SentenceTransformer(transformer)
    return sentence_model
def str_to_vector_list(text_list, sentence_model, replace_dict=None):
    """
    embed the given text list using the provided embedding model
    :param text_list: iterable of strings to embed
    :param sentence_model: a SentenceTransformer instance
    :param replace_dict: mapping of substrings to replace before embedding
    :return: list of embedding vectors, one per input string
    """
    # str.replace() does not take regex patterns, so strip punctuation with re.sub().
    text_list = [re.sub(r'[^\w\s]', '', str(x)) for x in text_list]
    if replace_dict:
        for old, new in replace_dict.items():
            text_list = [str(x).replace(old, new) for x in text_list]
    embeddings = sentence_model.encode(text_list, show_progress_bar=True, batch_size=1000)
    return embeddings.tolist()
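# Example (illustrative sketch; the model name and texts are placeholders):
#
#   model = get_embedding_model("all-mpnet-base-v2")
#   vectors = str_to_vector_list(["first text", "second text"], model)
#
# `vectors` is a plain list of float lists, one embedding per input string.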
def remove_unnecessary_tokens_from_df(df, columns, extra_stopwords=None) -> None:
    """
    removes unnecessary tokens from the given columns of the dataframe, in place
    :param df: dataframe to clean
    :param columns: columns to clean
    :param extra_stopwords: extra substrings to replace with spaces
    :return: None; the dataframe is modified in place
    """
    # Pass regex=True explicitly; newer pandas versions no longer default to it.
    df[columns] = df[columns].apply(lambda x: x.str.replace(r'[^\w\s]', '', regex=True))
    if extra_stopwords:
        for stp in extra_stopwords:
            df[columns] = df[columns].apply(lambda x: x.str.replace(stp, ' ', regex=False))
def cosine_sim_matrix(embeddings_a, embeddings_b) -> np.ndarray:
    """
    computes the cosine similarity matrix for the given embeddings
    :param embeddings_a: first set of embeddings
    :param embeddings_b: second set of embeddings
    :return: numpy array holding the pairwise cosine similarities
    """
    return np.array(
        util.pytorch_cos_sim(embeddings_a, embeddings_b)
    )
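if __name__ == "__main__":
    # Minimal smoke test (a sketch under assumed inputs, not a fixed API):
    # embed two short texts and print their pairwise cosine similarities.
    model = get_embedding_model(TRANSFORMERS[0])
    embeddings = str_to_vector_list(["topic modelling", "text clustering"], model)
    print(cosine_sim_matrix(embeddings, embeddings))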