from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
import torch
from sentence_transformers import SentenceTransformer, util
import gensim
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import preprocessing
import numpy as np
import pandas as pd

device = "cuda:0" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-mnli")
nli_model = AutoModelForSequenceClassification.from_pretrained(
    "facebook/bart-large-mnli"
).to(device)
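
# Zero-shot classification via NLI: each candidate label is phrased as a
# hypothesis ("This example is <label>.") and the MNLI model's entailment
# probability is taken as the probability that the label applies (see
# get_prob below).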


def get_prob(sequence, label):
    premise = sequence
    hypothesis = f"This example is {label}."

    # run through model pre-trained on MNLI
    x = tokenizer.encode(
        premise, hypothesis, return_tensors="pt", truncation="only_first"
    )
    logits = nli_model(x.to(device))[0]

    # we throw away "neutral" (dim 1) and take the probability of
    # "entailment" (2) as the probability of the label being true
    entail_contradiction_logits = logits[:, [0, 2]]
    probs = entail_contradiction_logits.softmax(dim=1)
    prob_label_is_true = probs[:, 1]
    return prob_label_is_true[0].item()


def get_prob_lists(sequence, labels):
    """Score every candidate label for a single sequence."""
    return [get_prob(sequence, label) for label in labels]


compare_model = SentenceTransformer("sentence-transformers/multi-qa-MiniLM-L6-cos-v1")
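# Semantic comparison model: multi-qa-MiniLM sentence embeddings, scored with
# a dot product; compare_sentence below averages the query's similarity to
# each document.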


def compare_sentence(query, docs):
    """Mean dot-product similarity between the query and each document."""
    query_emb = compare_model.encode(query)
    doc_emb = compare_model.encode(docs)
    scores = util.dot_score(query_emb, doc_emb)[0].cpu().tolist()
    return np.mean(scores)


def query_jds(DB, keyword):
    """Return the top rows of DB whose "tokenized" text best matches `keyword` (TF-IDF)."""
    keywords = " ".join(gensim.utils.simple_preprocess(keyword, deacc=True))
    temp_tf_matrix = tfidf_matrix(DB, tokenized="tokenized", name="Title")
    target = query(DB, keywords, temp_tf_matrix)
    return target


def query(df, keywords, tf_matrix):
    keywords = " ".join(gensim.utils.simple_preprocess(keywords, deacc=True))
    df["Query_score"] = tfidf_score(tf_matrix, keywords)
    q = df.loc[df["Query_score"] > 0.3].sort_values(by="Query_score", ascending=False)

    # keep at most the top 5 matches above the 0.3 score threshold
    result = q[:5].reset_index(drop=True)
    return result.drop("Query_score", axis=1)


def tfidf_score(tf_matrix, keyword):
    # sum the per-document TF-IDF scores of every keyword found in the vocabulary
    vector = np.zeros(tf_matrix.shape[1])
    for word in keyword.split():
        if word in tf_matrix.index:
            vector = vector + tf_matrix.loc[word].values
    return vector


def tfidf_matrix(data, tokenized="tokenized", name="Course_Name"):
    # build a (vocabulary x documents) matrix of min-max scaled TF-IDF scores
    corpus = [" ".join(i) for i in data[tokenized]]
    tfidf_vectorizer = TfidfVectorizer().fit(corpus)

    score_matrix = tfidf_vectorizer.transform(corpus).toarray().T
    vocab = tfidf_vectorizer.get_feature_names_out()
    courses = data[name].values
    score_matrix = preprocessing.minmax_scale(score_matrix.T).T
    scores = pd.DataFrame(score_matrix, index=vocab, columns=courses)
    return scores
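

if __name__ == "__main__":
    # Minimal usage sketch with made-up example data (an assumption, not part
    # of the original pipeline). The DataFrame columns "Title" and "tokenized"
    # are the ones query_jds/tfidf_matrix expect.
    text = "We are hiring a Python developer to build machine learning models."
    print(get_prob(text, "about software engineering"))
    print(get_prob_lists(text, ["about software engineering", "about cooking"]))

    print(
        compare_sentence(
            "python developer",
            ["Backend engineer working in Python", "Pastry chef for a small bakery"],
        )
    )

    demo_db = pd.DataFrame(
        {
            "Title": ["Data Scientist", "Head Chef"],
            "tokenized": [
                ["data", "scientist", "python", "machine", "learning"],
                ["chef", "kitchen", "menu", "cooking"],
            ],
        }
    )
    print(query_jds(demo_db, "python machine learning"))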