Spaces:
Sleeping
Sleeping
File size: 6,620 Bytes
4506cfb 2f47f53 4506cfb 2f47f53 4506cfb 3d0672e d0b4009 4506cfb 2f47f53 4506cfb 2f47f53 6cf0473 372587b 2f47f53 4336198 2f47f53 d0b4009 4506cfb |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 |
import streamlit as st
import pandas as pd
from sentence_transformers import SentenceTransformer, util
from transformers import AutoTokenizer, pipeline
import numpy as np
# Checkpoint used for sentence embeddings. Alternatives that were tried earlier:
# 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2',
# 'sentence-transformers/all-MiniLM-L12-v2',
# 'sentence-transformers/all-roberta-large-v1', 'bert-base-uncased'.
_SENTENCE_MODEL_NAME = 'sentence-transformers/all-mpnet-base-v2'

# Loaded models keyed by checkpoint name, so repeated calls within one script
# run reuse the same instance instead of re-instantiating it every time.
_SENTENCE_MODELS = {}


def _get_sentence_model(name=_SENTENCE_MODEL_NAME):
    """Return the SentenceTransformer for ``name``, constructing it on first use only."""
    if name not in _SENTENCE_MODELS:
        _SENTENCE_MODELS[name] = SentenceTransformer(name)
    return _SENTENCE_MODELS[name]


def sentence_sim(sentence1, sentence2):
    """Return the cosine similarity between the embeddings of two sentences.

    Each sentence is encoded separately with the model named by
    ``_SENTENCE_MODEL_NAME`` and the two embeddings are compared with
    ``util.pytorch_cos_sim``.

    Args:
        sentence1: First sentence to embed.
        sentence2: Second sentence to embed.

    Returns:
        The single entry of the 1x1 similarity matrix (a NumPy scalar).
    """
    model = _get_sentence_model()
    embedding1 = model.encode(sentence1)
    embedding2 = model.encode(sentence2)
    cos_scores = util.pytorch_cos_sim(embedding1, embedding2).cpu().numpy()
    # pytorch_cos_sim yields a matrix; with one sentence per side it is 1x1.
    return cos_scores[0][0]
def dot_product(v1, v2):
    """Return the dot product of ``v1`` and ``v2`` rounded to three decimal places."""
    raw_product = np.dot(v1, v2)
    return round(raw_product, 3)
st.title('Similarity Computations')

# Contextual-embedding demo: extract the BERT vector of the word 'play' from
# three sentences and compare those vectors pairwise.
if st.button('Context Sim Bert'):
    model = 'bert-base-uncased'
    framework = 'tf'
    tokenizer = AutoTokenizer.from_pretrained(model)
    feature_extractor = pipeline(
        model=model,
        framework=framework,
        tokenizer=tokenizer,
        task="feature-extraction",
    )
    sentencetriplets = ["record the play", "play the record", "play the game"]
    test_word = 'play'
    test_word_vector = {}
    for index, sentence in enumerate(sentencetriplets):
        tokens = tokenizer.tokenize(sentence)
        vectors = feature_extractor(sentence, return_tensors=True).numpy()
        # First occurrence of the test word; raises ValueError if it is absent.
        test_word_location = tokens.index(test_word)
        # +1 because position 0 of the extracted features is '[CLS]'.
        word_vector = vectors[0, test_word_location + 1, :]
        # Normalise to unit length so the dot product below is a cosine similarity.
        test_word_vector[index] = word_vector / np.linalg.norm(word_vector)

    # Bare function-call statements are not rendered by Streamlit, so the
    # results have to be written out explicitly.
    compared_pairs = ((0, 1), (1, 2), (0, 2))
    rows = [
        [
            sentencetriplets[first],
            sentencetriplets[second],
            dot_product(test_word_vector[first], test_word_vector[second]),
        ]
        for first, second in compared_pairs
    ]
    st.write(pd.DataFrame(rows, columns=['sentence1', 'sentence2', 'cosine_similarity']))
# Instruction-tuned embedding demo: embed two small sentence lists with the
# INSTRUCTOR model and show their pairwise cosine similarities.
if st.button('Instructor'):
    from InstructorEmbedding import INSTRUCTOR
    from sklearn.metrics.pairwise import cosine_similarity

    model = INSTRUCTOR('hkunlp/instructor-xl')

    # Every input is an [instruction, text] pair sharing the same instruction.
    instruction = 'Represent the sentence: '
    sentences_a = [[instruction, text] for text in ('play the record', 'play the game')]
    sentences_b = [[instruction, text] for text in ('record the play', 'play the game')]

    embeddings_a = model.encode(sentences_a)
    embeddings_b = model.encode(sentences_b)
    similarities = cosine_similarity(embeddings_a, embeddings_b)

    for shown in (sentences_a, sentences_b, similarities):
        st.write(shown)
# Sentence-embedding demo: for each triplet of sentences sharing an ambiguous
# word, show the cosine similarity of every pair of sentences in the triplet.
if st.button('Cos Sim SBERT'):
    sentencetriplets = [
        ["record the play", "play the record", "play the game"],
        ["germany sells arms to saudi arabia", "arms bend at the elbow", "wave your arms around"],
        ["the problem has no solution", "boil the solution with salt", "heat the solution to 75 degrees"],
        ["all income is subject to tax", "economics an arts subject", "i have one subject for credit"],
        ["the key issue is quality not quantity", "the key broke in the lock", "i lost my key"],
    ]
    # Index pairs compared within each triplet, in display order.
    pair_indices = ((0, 1), (0, 2), (1, 2))
    rows = [
        [triplet[first], triplet[second], sentence_sim(triplet[first], triplet[second])]
        for triplet in sentencetriplets
        for first, second in pair_indices
    ]
    # The value is a cosine similarity (higher means more alike), not a
    # distance, so the column is labelled accordingly.
    df = pd.DataFrame(rows, columns=['sentence1', 'sentence2', 'cosine_similarity'])
    st.write(df)
# output
# The output is: [[0.79056942]], indicating a relatively high cosine similarity between the title and summary.
# example:
# calculate the cosine similarity of the title "17 Money Secrets To Make Your First Million" and
# the summary "The author shares their money secrets for making millions, including knowing when
# to say no to opportunities, spending money wisely, building wealth slowly, prioritizing peace of mind,
# protecting your reputation, assessing opportunities based on your personality, and embracing constraints.
# They also emphasize the importance of increasing the velocity of your income over time and being mindful
# of how easy it is to access your investments."
# To calculate the cosine similarity, we first need to convert the text into vectors. We can use the bag-of-words representation, which represents each text as a vector of word frequencies.
# Both vectors must be indexed by the same vocabulary, so the title vector is padded with
# zeros for the summary terms it does not contain (20 terms in total).
# Title: [1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]
# Summary: [2, 2, 2, 2, 1, 4, 4, 2, 2, 1, 1, 1, 1, 2, 2, 1, 2, 1, 1, 1]
# To calculate the cosine similarity, we take the dot product of the two vectors and
# divide it by the product of their magnitudes.
# Dot product: 1*2 + 0*2 + 1*2 + 0*2 + 1*1 + 1*4 + 1*4 + 1*2 + 0*2 + 0*1 + 1*1 = 16 (the zero-padded terms contribute nothing)
# Magnitude of title vector: sqrt(1^2 + 0^2 + 1^2 + 0^2 + 1^2 + 1^2 + 1^2 + 1^2 + 0^2 + 0^2 + 1^2) = sqrt(7) ≈ 2.65
# Magnitude of summary vector: sqrt(2^2 + 2^2 + 2^2 + 2^2 + 1^2 + 4^2 + 4^2 + 2^2 + 2^2 + 1^2 + 1^2 + 1^2 + 1^2 + 2^2 + 2^2 + 1^2 + 2^2 + 1^2 + 1^2 + 1^2) = sqrt(77) ≈ 8.77
# Cosine similarity: 16 / (2.65 * 8.77) ≈ 0.69
# Therefore, the cosine similarity of the title and summary is approximately 0.69.