# course_recommender / utils.py
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch
from sentence_transformers import SentenceTransformer, util
import gensim
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import preprocessing
import numpy as np
import pandas as pd

device = "cuda:0" if torch.cuda.is_available() else "cpu"
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-mnli")
nli_model = AutoModelForSequenceClassification.from_pretrained(
    "facebook/bart-large-mnli"
).to(device)


def get_prob(sequence, label):
    """Zero-shot probability that `label` applies to `sequence`, via NLI entailment."""
    premise = sequence
    hypothesis = f"This example is {label}."
    # Run through the model pre-trained on MNLI; truncate the premise if the
    # pair exceeds the model's maximum length.
    x = tokenizer.encode(
        premise, hypothesis, return_tensors="pt", truncation="only_first"
    )
    with torch.no_grad():
        logits = nli_model(x.to(device))[0]
    # We throw away "neutral" (dim 1) and take the probability of
    # "entailment" (dim 2) as the probability of the label being true.
    entail_contradiction_logits = logits[:, [0, 2]]
    probs = entail_contradiction_logits.softmax(dim=1)
    prob_label_is_true = probs[:, 1]
    return prob_label_is_true[0].item()


def get_prob_lists(sequence, labels):
    """Score `sequence` against every candidate label in `labels`."""
    return [get_prob(sequence, label) for label in labels]
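

# Usage sketch (hypothetical inputs, not from the original module): scoring a
# course description against candidate topic labels with zero-shot NLI.
#
#     get_prob_lists(
#         "An introduction to neural networks and backpropagation.",
#         ["machine learning", "cooking", "finance"],
#     )
#
# Each entry of the returned list is the entailment probability for one label.
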
compare_model = SentenceTransformer("sentence-transformers/multi-qa-MiniLM-L6-cos-v1")


def compare_sentence(query, docs):
    """Mean dot-product similarity between a query and a list of documents."""
    query_emb = compare_model.encode(query)
    doc_emb = compare_model.encode(docs)
    # util.dot_score already returns a CPU tensor; no device transfer is needed.
    scores = util.dot_score(query_emb, doc_emb)[0].tolist()
    return np.mean(scores)
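

# Usage sketch (hypothetical inputs): a higher return value means the documents
# are, on average, semantically closer to the query.
#
#     compare_sentence(
#         "data analysis with Python",
#         ["Learn pandas and NumPy.", "Introductory statistics."],
#     )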


def query_jds(DB, keyword):
    """Return the job descriptions in `DB` whose titles best match `keyword`."""
    keywords = " ".join(gensim.utils.simple_preprocess(keyword, deacc=True))
    temp_tf_matrix = tfidf_matrix(DB, tokenized="tokenized", name="Title")
    target = query(DB, keywords, temp_tf_matrix)
    return target


def query(df, keywords, tf_matrix):
    """Return up to five rows of `df` whose TF-IDF match score exceeds 0.3."""
    keywords = " ".join(gensim.utils.simple_preprocess(keywords, deacc=True))
    df["Query_score"] = tfidf_score(tf_matrix, keywords)
    q = df.loc[df["Query_score"] > 0.3].sort_values(by="Query_score", ascending=False)
    result = q[:5].reset_index(drop=True)
    return result.drop("Query_score", axis=1)


def tfidf_score(tf_matrix, keyword):
    """Sum the per-document TF-IDF rows of every query term found in the vocabulary."""
    vector = np.zeros(tf_matrix.shape[1])
    for term in keyword.split():
        if term in tf_matrix.index:
            vector = vector + tf_matrix.loc[term].values
    return vector


def tfidf_matrix(data, tokenized="tokenized", name="Course_Name"):
    """Build a (vocabulary x document) TF-IDF DataFrame from pre-tokenized text."""
    corpus = [" ".join(tokens) for tokens in data[tokenized]]
    tfidf_vectorizer = TfidfVectorizer().fit(corpus)
    avg_score = tfidf_vectorizer.transform(corpus).toarray().T
    # get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out().
    vocab = tfidf_vectorizer.get_feature_names_out()
    courses = data[name].values
    # Min-max scale each term's scores across documents to [0, 1].
    avg_score = preprocessing.minmax_scale(avg_score.T).T
    scores = pd.DataFrame(avg_score, index=vocab, columns=courses)
    return scores
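

if __name__ == "__main__":
    # Minimal end-to-end sketch on a made-up DataFrame; the real caller is
    # expected to supply a DB with a "tokenized" column (lists of tokens) and
    # a "Title" column, as query_jds assumes above.
    demo_db = pd.DataFrame(
        {
            "Title": ["Data Scientist", "Web Developer"],
            "tokenized": [
                ["python", "statistics", "machine", "learning"],
                ["javascript", "html", "css"],
            ],
        }
    )
    print(query_jds(demo_db, "machine learning"))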