Spaces:

wldmr
/

similarity-st1

Sleeping

App Files Files Community

similarity-st1 / app.py

wldmr

mpnet

d0b4009 almost 2 years ago

raw

history blame contribute delete

6.62 kB


	import streamlit as st

	import pandas as pd
	from sentence_transformers import SentenceTransformer, util

	from transformers import AutoTokenizer, pipeline
	import numpy as np

	# Models already loaded during this script run, keyed by checkpoint name.
	_SENTENCE_MODELS = {}


	def _get_sentence_model(model_name):
	    """Return the SentenceTransformer for *model_name*, loading it at most once per run."""
	    if model_name not in _SENTENCE_MODELS:
	        _SENTENCE_MODELS[model_name] = SentenceTransformer(model_name)
	    return _SENTENCE_MODELS[model_name]


	def sentence_sim(sentence1, sentence2, model_name='sentence-transformers/all-mpnet-base-v2'):
	    """Return the cosine similarity between the embeddings of two sentences.

	    Both sentences are encoded with the same SentenceTransformer model and
	    compared with ``util.pytorch_cos_sim``.

	    Args:
	        sentence1: First text to embed.
	        sentence2: Second text to embed.
	        model_name: Checkpoint passed to ``SentenceTransformer``. Checkpoints
	            previously tried in this app:
	            'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2',
	            'sentence-transformers/all-MiniLM-L12-v2',
	            'sentence-transformers/all-roberta-large-v1',
	            'bert-base-uncased'.

	    Returns:
	        The ``[0][0]`` entry of the cosine-score matrix, converted to NumPy.
	    """
	    # Constructing the model is the expensive step; reuse it across calls
	    # instead of rebuilding it for every sentence pair.
	    model = _get_sentence_model(model_name)
	    embedding1 = model.encode(sentence1)
	    embedding2 = model.encode(sentence2)
	    cos_scores = util.pytorch_cos_sim(embedding1, embedding2).cpu().numpy()
	    return cos_scores[0][0]


	def dot_product(v1, v2):
	    """Return the dot product of *v1* and *v2*, rounded to three decimal places."""
	    product = np.dot(v1, v2)
	    return round(product, 3)

	# Page heading for the Streamlit app.
	st.title('Similarity Computations')


	if st.button('Context Sim Bert'):
	    # Compare the contextual BERT vector of one word across several sentences:
	    # extract the token vector for `test_word` from each sentence, normalise it
	    # to unit length, and show the pairwise dot products.
	    model_name = 'bert-base-uncased'
	    tokenizer = AutoTokenizer.from_pretrained(model_name)
	    feature_extractor = pipeline(
	        model=model_name,
	        framework='tf',
	        tokenizer=tokenizer,
	        task="feature-extraction",
	    )

	    sentencetriplets = ["record the play", "play the record", "play the game"]
	    test_word = 'play'
	    test_word_vector = []
	    for sentence in sentencetriplets:
	        tokens = tokenizer.tokenize(sentence)
	        vectors = feature_extractor(sentence, return_tensors=True).numpy()
	        # First occurrence of the word among the sentence's tokens.
	        test_word_location = tokens.index(test_word)
	        # +1 because position 0 of the extracted features is '[CLS]'.
	        word_vector = vectors[0, test_word_location + 1, :]
	        # Unit-normalise so the dot product below is a cosine similarity.
	        test_word_vector.append(word_vector / np.linalg.norm(word_vector))

	    # The original code computed these dot products as bare expressions and
	    # discarded them; collect and display them instead.
	    rows = []
	    for first, second in ((0, 1), (1, 2), (0, 2)):
	        similarity = dot_product(test_word_vector[first], test_word_vector[second])
	        rows.append([sentencetriplets[first], sentencetriplets[second], float(similarity)])
	    st.write(pd.DataFrame(rows, columns=['sentence1', 'sentence2', 'dot_product']))

	if st.button('Instructor'):
	    # Instruction-conditioned embeddings: each input is an
	    # [instruction, sentence] pair, and the two batches are compared with a
	    # pairwise cosine-similarity matrix.
	    from InstructorEmbedding import INSTRUCTOR
	    encoder = INSTRUCTOR('hkunlp/instructor-xl')

	    from sklearn.metrics.pairwise import cosine_similarity
	    instruction = 'Represent the sentence: '
	    sentences_a = [[instruction, text] for text in ('play the record', 'play the game')]
	    sentences_b = [[instruction, text] for text in ('record the play', 'play the game')]
	    embeddings_a = encoder.encode(sentences_a)
	    embeddings_b = encoder.encode(sentences_b)
	    similarities = cosine_similarity(embeddings_a, embeddings_b)

	    # Show both input batches, then the similarity matrix.
	    for shown in (sentences_a, sentences_b, similarities):
	        st.write(shown)

	if st.button('Cos Sim SBERT'):
	    # Sentence-level similarity: each triplet contains three sentences sharing
	    # one word (play, arms, solution, subject, key). Every pair inside a
	    # triplet is scored with sentence_sim and the results are shown as a table.
	    sentencetriplets = [
	        ["record the play", "play the record", "play the game"],
	        ["germany sells arms to saudi arabia", "arms bend at the elbow", "wave your arms around"],
	        ["the problem has no solution", "boil the solution with salt", "heat the solution to 75 degrees"],
	        ["all income is subject to tax", "economics an arts subject", "i have one subject for credit"],
	        ["the key issue is quality not quantity", "the key broke in the lock", "i lost my key"],
	    ]

	    # Pair order within each triplet: (first, second), (first, third), (second, third).
	    pair_indices = ((0, 1), (0, 2), (1, 2))
	    rows = []
	    for triplet in sentencetriplets:
	        for left, right in pair_indices:
	            cos_sim = sentence_sim(triplet[left], triplet[right])
	            rows.append([triplet[left], triplet[right], cos_sim])

	    # The score is a cosine similarity (higher means more alike), so the
	    # column is labelled as such and not as a distance.
	    df = pd.DataFrame(rows, columns=['sentence1', 'sentence2', 'cosine_similarity'])
	    st.write(df)


	# output
	# The output is: [[0.79056942]], indicating a relatively high cosine similarity between the title and summary.

	# example:
	# calculate the cosine similarity of the title "17 Money Secrets To Make Your First Million" and
	# the summary "The author shares their money secrets for making millions, including knowing when
	# to say no to opportunities, spending money wisely, building wealth slowly, prioritizing peace of mind,
	# protecting your reputation, assessing opportunities based on your personality, and embracing constraints.
	# They also emphasize the importance of increasing the velocity of your income over time and being mindful
	# of how easy it is to access your investments."
	# To calculate the cosine similarity, we first need to convert the text into vectors. We can use the bag-of-words representation, which represents each text as a vector of word frequencies.

	# Title: [1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1]
	# Summary: [2, 2, 2, 2, 1, 4, 4, 2, 2, 1, 1, 1, 1, 2, 2, 1, 2, 1, 1, 1]

	# To calculate the cosine similarity, we take the dot product of the two vectors and
	# divide it by the product of their magnitudes.

	# Dot product (the title entries are paired with the first 11 summary entries;
	# the remaining summary entries pair with implicit zeros in the title vector):
	# 1*2 + 0*2 + 1*2 + 0*2 + 1*1 + 1*4 + 1*4 + 1*2 + 0*2 + 0*1 + 1*1 = 16
	# Magnitude of title vector: sqrt(1^2 + 0^2 + 1^2 + 0^2 + 1^2 + 1^2 + 1^2 + 1^2 + 0^2 + 0^2 + 1^2) = sqrt(7) ≈ 2.65
	# Magnitude of summary vector: sqrt(2^2 + 2^2 + 2^2 + 2^2 + 1^2 + 4^2 + 4^2 + 2^2 + 2^2 + 1^2 + 1^2 + 1^2 + 1^2 + 2^2 + 2^2 + 1^2 + 2^2 + 1^2 + 1^2 + 1^2) = sqrt(77) ≈ 8.77

	# Cosine similarity: 16 / (2.65 * 8.77) ≈ 0.69

	# Therefore, the cosine similarity of the title and summary is approximately 0.69.