File size: 6,620 Bytes
4506cfb
 
 
2f47f53
4506cfb
 
2f47f53
 
4506cfb
 
 
 
3d0672e
d0b4009
 
4506cfb
 
 
 
 
2f47f53
 
 
 
4506cfb
 
 
2f47f53
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6cf0473
372587b
 
 
 
 
2f47f53
 
4336198
 
2f47f53
 
 
 
 
 
 
d0b4009
4506cfb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134

import streamlit as st

import pandas as pd
from sentence_transformers import SentenceTransformer, util

from transformers import AutoTokenizer, pipeline
import numpy as np

# Models already constructed, keyed by model name, so that repeated
# sentence_sim() calls reuse one instance instead of building a new
# SentenceTransformer on every call.
_SENTENCE_MODELS = {}


def _get_sentence_model(model_name):
    """Return the SentenceTransformer for ``model_name``, constructing it on first use."""
    if model_name not in _SENTENCE_MODELS:
        _SENTENCE_MODELS[model_name] = SentenceTransformer(model_name)
    return _SENTENCE_MODELS[model_name]


def sentence_sim(sentence1, sentence2, model_name='sentence-transformers/all-mpnet-base-v2'):
    """Return the cosine similarity between the embeddings of two sentences.

    Args:
        sentence1: First sentence to encode.
        sentence2: Second sentence to encode.
        model_name: Name passed to ``SentenceTransformer``. Defaults to the
            model this function previously hard-coded. Alternatives that were
            tried here before:
            'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2',
            'sentence-transformers/all-MiniLM-L12-v2',
            'sentence-transformers/all-roberta-large-v1',
            'bert-base-uncased'.

    Returns:
        The single score at position [0][0] of the cosine-similarity matrix
        computed by ``util.pytorch_cos_sim`` for the two embeddings.
    """
    model = _get_sentence_model(model_name)
    embedding1 = model.encode(sentence1)
    embedding2 = model.encode(sentence2)
    cos_scores = util.pytorch_cos_sim(embedding1, embedding2).cpu().numpy()
    return cos_scores[0][0]


def dot_product(v1, v2):
    """Return ``np.dot(v1, v2)`` rounded to three decimal places."""
    product = np.dot(v1, v2)
    return round(product, 3)
    
st.title('Similarity Computations')


if st.button('Context Sim Bert'):
    # Compare the BERT feature vector of one word across several sentences.
    model = 'bert-base-uncased'
    framework = 'tf'
    tokenizer = AutoTokenizer.from_pretrained(model)
    feature_extractor = pipeline(
            model=model,
            framework=framework,
            tokenizer=tokenizer,
            task="feature-extraction",
        )

    sentencetriplets = ["record the play", "play the record", "play the game"]
    test_word = 'play'
    test_word_vector = {}
    for index, sentence in enumerate(sentencetriplets):
        tokens = tokenizer.tokenize(sentence)
        vectors = feature_extractor(sentence, return_tensors=True).numpy()
        # Position of the first occurrence of the test word among the tokens.
        test_word_location = tokens.index(test_word)
        test_word_vector[index] = vectors[0, test_word_location + 1, :]  # 0 is '[CLS]'
        # Scale to unit length so the dot products below are cosine similarities.
        magnitude = np.linalg.norm(test_word_vector[index])
        test_word_vector[index] = test_word_vector[index] / magnitude

    # Previously the three dot products were computed and thrown away, so the
    # button produced no visible output; collect and display them instead.
    comparisons = [
        [
            sentencetriplets[first],
            sentencetriplets[second],
            dot_product(test_word_vector[first], test_word_vector[second]),
        ]
        for first, second in ((0, 1), (1, 2), (0, 2))
    ]
    st.write(pd.DataFrame(comparisons, columns=['sentence1', 'sentence2', 'similarity']))

if st.button('Instructor'):
    # Imported here so these packages are only loaded when this button is used.
    from InstructorEmbedding import INSTRUCTOR
    from sklearn.metrics.pairwise import cosine_similarity

    instructor = INSTRUCTOR('hkunlp/instructor-xl')

    # Every input is an [instruction, text] pair; all pairs share one instruction.
    instruction = 'Represent the sentence: '
    sentences_a = [[instruction, text] for text in ('play the record', 'play the game')]
    sentences_b = [[instruction, text] for text in ('record the play', 'play the game')]

    # Pairwise cosine similarity between each embedding in A and each in B.
    similarities = cosine_similarity(
        instructor.encode(sentences_a),
        instructor.encode(sentences_b),
    )

    for shown in (sentences_a, sentences_b, similarities):
        st.write(shown)

if st.button('Cos Sim SBERT'):
    # Five groups of three sentences; every pair inside a group is scored.
    sentencetriplets = [
        ["record the play", "play the record", "play the game"],
        ["germany sells arms to saudi arabia", "arms bend at the elbow", "wave your arms around"],
        ["the problem has no solution", "boil the solution with salt", "heat the solution to 75 degrees"],
        ["all income is subject to tax", "economics an arts subject", "i have one subject for credit"],
        ["the key issue is quality not quantity", "the key broke in the lock", "i lost my key"],
    ]

    # Index pairs within a triplet, in the order the rows appear in the table.
    pair_indices = ((0, 1), (0, 2), (1, 2))

    # One row per pair: both sentences plus the score returned by sentence_sim.
    distances = [
        [triplet[left], triplet[right], sentence_sim(triplet[left], triplet[right])]
        for triplet in sentencetriplets
        for left, right in pair_indices
    ]

    df = pd.DataFrame(distances, columns=['sentence1', 'sentence2', 'distance'])
    st.write(df)
    #print(df)


# output
# The output is: [[0.79056942]], indicating a relatively high cosine similarity between the title and summary.

# example:
# calculate the cosine similarity of the title "17 Money Secrets To Make Your First Million" and 
# the summary "The author shares their money secrets for making millions, including knowing when 
# to say no to opportunities, spending money wisely, building wealth slowly, prioritizing peace of mind, 
# protecting your reputation, assessing opportunities based on your personality, and embracing constraints. 
# They also emphasize the importance of increasing the velocity of your income over time and being mindful 
# of how easy it is to access your investments."
# To calculate the cosine similarity, we first need to convert the text into vectors. We can use the bag-of-words representation, which represents each text as a vector of word frequencies.

# Title: [1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1]
# Summary: [2, 2, 2, 2, 1, 4, 4, 2, 2, 1, 1, 1, 1, 2, 2, 1, 2, 1, 1, 1]

# To calculate the cosine similarity, we take the dot product of the two vectors and 
# divide it by the product of their magnitudes.

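# Dot product: 1*2 + 0*2 + 1*2 + 0*2 + 1*1 + 1*4 + 1*4 + 1*2 + 0*2 + 0*1 + 1*1 = 20
# NOTE(review): the terms above actually sum to 16, and the two vectors listed
# have different lengths (11 vs 20 entries), so treat the figures in this worked
# example as illustrative only.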
# Magnitude of title vector: sqrt(1^2 + 0^2 + 1^2 + 0^2 + 1^2 + 1^2 + 1^2 + 1^2 + 0^2 + 0^2 + 1^2) = sqrt(7) ≈ 2.65
# Magnitude of summary vector: sqrt(2^2 + 2^2 + 2^2 + 2^2 + 1^2 + 4^2 + 4^2 + 2^2 + 2^2 + 1^2 + 1^2 + 1^2 + 1^2 + 2^2 + 2^2 + 1^2 + 2^2 + 1^2 + 1^2 + 1^2) = sqrt(44) ≈ 6.63

# Cosine similarity: 20 / (2.65 * 6.63) ≈ 0.463

# Therefore, the cosine similarity of the title and summary is approximately 0.463.