Spaces:

transZ
/

sbert_cosine

Sleeping

App Files Files Community

sbert_cosine / sbert_cosine.py

transZ

Fix bug

e29d4b0 over 1 year ago

raw

history blame contribute delete

No virus

5.71 kB

	# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	"""SBERT consime similarity metric."""

	import evaluate
	import datasets
	import torch
	import torch.nn as nn
	from transformers import AutoTokenizer, BertModel

	_CITATION = """\
	@article{Reimers2019,
	archivePrefix = {arXiv},
	arxivId = {1908.10084},
	author = {Reimers, Nils and Gurevych, Iryna},
	doi = {10.18653/v1/d19-1410},
	eprint = {1908.10084},
	isbn = {9781950737901},
	journal = {EMNLP-IJCNLP 2019 - 2019 Conference on Empirical Methods in Natural Language Processing and 9th International Joint Conference on Natural Language Processing, Proceedings of the Conference},
	pages = {3982--3992},
	title = {{Sentence-BERT: Sentence embeddings using siamese BERT-networks}},
	year = {2019}
	}
	"""

	_DESCRIPTION = """\
	Use SBERT to produce embedding and score the similarity by cosine similarity
	"""


	_KWARGS_DESCRIPTION = """
	Calculates how semantic similarity are predictions given some references, using certain scores
	Args:
	predictions: list of predictions to score. Each predictions
	should be a string with tokens separated by spaces.
	references: list of reference for each prediction. Each
	reference should be a string with tokens separated by spaces.
	Returns:
	score: description of the first score,
	Examples:
	Examples should be written in doctest format, and should illustrate how
	to use the function.

	>>> sbert_cosine = evaluate.load("transZ/sbert_cosine")
	>>> results = my_new_module.compute(references=["Nice to meet you"], predictions=["It is my pleasure to meet you"])
	>>> print(results)
	{'score': 0.85}
	"""

	@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
	class sbert_cosine(evaluate.Metric):
	    """Semantic similarity metric built on Sentence-BERT style embeddings.

	    Each prediction and its reference(s) are embedded with a pretrained
	    BERT encoder, the token embeddings are mean-pooled under the attention
	    mask, and the cosine similarity of the pooled vectors is reported.
	    """

	    def _info(self):
	        """Return the MetricInfo describing this metric and its input schemas."""
	        return evaluate.MetricInfo(
	            module_type="metric",
	            # Texts shown on the module page.
	            description=_DESCRIPTION,
	            citation=_CITATION,
	            inputs_description=_KWARGS_DESCRIPTION,
	            # Two accepted input layouts: several reference strings per
	            # prediction (first schema) or a single reference string per
	            # prediction (second schema).
	            features=[
	                datasets.Features(
	                    {
	                        "predictions": datasets.Value("string", id="sequence"),
	                        "references": datasets.Sequence(datasets.Value("string", id="sequence"), id="references"),
	                    }
	                ),
	                datasets.Features(
	                    {
	                        "predictions": datasets.Value("string", id="sequence"),
	                        "references": datasets.Value("string", id="sequence"),
	                    }
	                ),
	            ],
	            # Homepage of the module for documentation
	            homepage="http://sbert.net",
	            # Additional links to the codebase or references
	            codebase_urls=["https://github.com/UKPLab/sentence-transformers"],
	            reference_urls=["https://github.com/UKPLab/sentence-transformers"],
	        )

	    def _download_and_prepare(self, dl_manager):
	        """No-op: the tokenizer and model are loaded inside ``_compute``."""

	    @staticmethod
	    def _mean_pooling(model_output, attention_mask):
	        """Average the token embeddings of each sequence, skipping padding."""
	        # First element of model_output contains all token embeddings.
	        token_embeddings = model_output[0]
	        mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
	        summed = torch.sum(token_embeddings * mask, 1)
	        # Clamp the token count so a fully masked row cannot divide by zero.
	        counts = torch.clamp(mask.sum(1), min=1e-9)
	        return summed / counts

	    @staticmethod
	    def _batch_to_device(batch, target_device):
	        """Move every tensor value of ``batch`` to ``target_device`` in place."""
	        for key in batch:
	            if isinstance(batch[key], torch.Tensor):
	                batch[key] = batch[key].to(target_device)
	        return batch

	    def _compute(self, predictions, references, model_type='sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'):
	        """Score each prediction against its reference(s).

	        Args:
	            predictions: iterable of prediction strings.
	            references: iterable aligned with ``predictions``; each item is
	                either one reference string or a sequence of reference
	                strings (the two schemas declared in ``_info``).
	            model_type: name or path handed to ``from_pretrained`` for both
	                the tokenizer and the BERT encoder.

	        Returns:
	            dict with key ``"score"``: a list holding one cosine similarity
	            per prediction. When several references are given for a
	            prediction, the highest similarity among them is kept.

	        Raises:
	            ValueError: if a prediction comes with an empty reference list.
	        """
	        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

	        tokenizer = AutoTokenizer.from_pretrained(model_type)
	        model = BertModel.from_pretrained(model_type).to(device)
	        cosine = nn.CosineSimilarity(dim=0)

	        def calculate(pred, refs):
	            # Normalise the single-reference schema to a one-element list so
	            # both input layouts share the same code path.
	            ref_list = [refs] if isinstance(refs, str) else list(refs)
	            if not ref_list:
	                raise ValueError("Each prediction needs at least one reference.")

	            # Row 0 is the prediction, rows 1.. are its references.
	            encoded = tokenizer([pred, *ref_list], padding=True, truncation=True, return_tensors='pt')
	            encoded = self._batch_to_device(encoded, device)
	            pooled = self._mean_pooling(model(**encoded), encoded['attention_mask'])
	            return max(cosine(pooled[0, :], ref_vec).item() for ref_vec in pooled[1:])

	        # Inference only: no gradients are needed.
	        with torch.no_grad():
	            scores = [calculate(pred, ref) for pred, ref in zip(predictions, references)]

	        return {
	            "score": scores,
	        }