jvamvas committed on
Commit fda57dd
1 Parent(s): 2f6a112

Basic implementation

app.py CHANGED
@@ -1,9 +1,84 @@
+from pathlib import Path
+
 import gradio as gr
-
-
-def greet(name):
-    return "Hello " + name + "!!"
-
-
-iface = gr.Interface(fn=greet, inputs="text", outputs="text")
-iface.launch()
+from jinja2 import Environment
+from tokenizers.pre_tokenizers import Whitespace
+from transformers import pipeline
+
+from recognizers import DiffAlign, DiffDel
+
+
+def load_pipeline(model_name_or_path: str = "ZurichNLP/unsup-simcse-xlm-roberta-base"):
+    return pipeline("feature-extraction", model=model_name_or_path)
+
+
+def generate_diff(text_a: str, text_b: str, method: str):
+    global my_pipeline
+    if my_pipeline is None:
+        my_pipeline = load_pipeline()
+
+    if method == "DiffAlign":
+        diff = DiffAlign(pipeline=my_pipeline)
+        min_value = 0.3758048415184021 - 0.37
+        max_value = 1.045647144317627 - 0.1
+    elif method == "DiffDel":
+        diff = DiffDel(pipeline=my_pipeline)
+        min_value = 0.4864141941070556
+        max_value = 0.5012983083724976 + 0.025
+    else:
+        raise ValueError(f"Unknown method: {method}")
+
+    encoding_a = tokenizer.pre_tokenize_str(text_a)
+    encoding_b = tokenizer.pre_tokenize_str(text_b)
+
+    result = diff.predict(
+        a=" ".join([token[0] for token in encoding_a]),
+        b=" ".join([token[0] for token in encoding_b]),
+    )
+
+    result.add_whitespace(encoding_a, encoding_b)
+
+    # Normalize labels based on empirical min/max values
+    result.labels_a = tuple([(label - min_value) / (max_value - min_value) for label in result.labels_a])
+    result.labels_b = tuple([(label - min_value) / (max_value - min_value) for label in result.labels_b])
+
+    # Round labels to the range 0, 1, ..., 10
+    result.labels_a = tuple([round(min(10, label * 10)) for label in result.labels_a])
+    result.labels_b = tuple([round(min(10, label * 10)) for label in result.labels_b])
+
+    template_path = Path(__file__).parent / "result_template.html"
+    template = Environment().from_string(template_path.read_text())
+    html_dir = Path(__file__).parent / "html_out"
+    html_dir.mkdir(exist_ok=True)
+
+    html_a = template.render(token_labels=result.token_labels_a)
+    html_b = template.render(token_labels=result.token_labels_b)
+    return str(html_a), str(html_b)
+
+
+my_pipeline = None
+tokenizer = Whitespace()
+
+with gr.Blocks() as demo:
+    with gr.Row():
+        text_a = gr.Textbox(label="Text A", value="Chinese shares close higher Friday.", lines=2)
+        text_b = gr.Textbox(label="Text B", value="Les actions chinoises clôturent en baisse mercredi.", lines=2)
+    with gr.Row():
+        method = gr.Dropdown(choices=["DiffAlign", "DiffDel"], label="Comparison Method", value="DiffAlign")
+    with gr.Row():
+        with gr.Column(variant="panel"):
+            output_a = gr.HTML(label="Result for text A", show_label=True)
+        with gr.Column(variant="panel"):
+            output_b = gr.HTML(label="Result for text B", show_label=True)
+    with gr.Row():
+        submit_btn = gr.Button(label="Generate Diff")
+    submit_btn.click(
+        fn=generate_diff,
+        inputs=[text_a, text_b, method],
+        outputs=[output_a, output_b],
+    )
+
+
+if my_pipeline is None:
+    my_pipeline = load_pipeline()
+demo.launch()
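
For reference, the normalization and bucketing in generate_diff maps a raw difference label onto the highlight-{n} CSS classes of result_template.html (a bucket of 0 has no defined style, i.e. no highlight). A minimal sketch with made-up raw labels; the min/max constants are the empirical ones hard-coded above for DiffAlign:

# Illustrative only: the raw_labels values are invented.
raw_labels = (0.05, 0.40, 0.90)
min_value = 0.3758048415184021 - 0.37   # empirical minimum (similar pair)
max_value = 1.045647144317627 - 0.1     # empirical maximum (dissimilar pair)

normalized = tuple((label - min_value) / (max_value - min_value) for label in raw_labels)
buckets = tuple(round(min(10, label * 10)) for label in normalized)
print(buckets)  # (0, 4, 10) -> rendered as highlight-0, highlight-4, highlight-10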
get_max_min_values.py ADDED
@@ -0,0 +1,21 @@
+"""
+Get similarities of similar and dissimilar pairs.
+The values are used for normalizing the colors in the visualization.
+"""
+from app import load_pipeline
+from recognizers import DiffAlign, DiffDel
+
+similar_pair = ("Hello!", "Hi!")
+dissimilar_pair = ("Hello!", "asdf")
+
+pipeline = load_pipeline()
+diff_align = DiffAlign(pipeline=pipeline)
+diff_del = DiffDel(pipeline=pipeline)
+
+print("Similar pair:")
+print(diff_align.predict(*similar_pair).min)
+print(diff_del.predict(*similar_pair).min)
+
+print("Dissimilar pair:")
+print(diff_align.predict(*dissimilar_pair).max)
+print(diff_del.predict(*dissimilar_pair).max)
recognizers/__init__.py ADDED
@@ -0,0 +1,2 @@
+from recognizers.diff_align import DiffAlign
+from recognizers.diff_del import DiffDel
recognizers/base.py ADDED
@@ -0,0 +1,36 @@
+"""
+Source: https://github.com/ZurichNLP/recognizing-semantic-differences
+MIT License
+Copyright (c) 2023 University of Zurich
+"""
+
+from typing import List
+
+from tqdm import tqdm
+
+from recognizers.utils import DifferenceSample
+
+
+class DifferenceRecognizer:
+
+    def __str__(self):
+        raise NotImplementedError
+
+    def predict(self,
+                a: str,
+                b: str,
+                **kwargs,
+                ) -> DifferenceSample:
+        raise NotImplementedError
+
+    def predict_all(self,
+                    a: List[str],
+                    b: List[str],
+                    **kwargs,
+                    ) -> List[DifferenceSample]:
+        assert len(a) == len(b)
+        predictions = []
+        for i in tqdm(list(range(len(a)))):
+            prediction = self.predict(a[i], b[i], **kwargs)
+            predictions.append(prediction)
+        return predictions
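
To make the interface concrete, here is a minimal sketch of a custom recognizer. OverlapRecognizer and its labelling rule are invented for illustration and are not part of this commit:

from recognizers.base import DifferenceRecognizer
from recognizers.utils import DifferenceSample


class OverlapRecognizer(DifferenceRecognizer):
    # Hypothetical baseline: a token is labelled 1.0 if it does not occur in the other sentence.

    def __str__(self):
        return "OverlapRecognizer()"

    def predict(self, a: str, b: str, **kwargs) -> DifferenceSample:
        tokens_a = tuple(a.split())
        tokens_b = tuple(b.split())
        return DifferenceSample(
            tokens_a=tokens_a,
            tokens_b=tokens_b,
            labels_a=tuple(float(token not in tokens_b) for token in tokens_a),
            labels_b=tuple(float(token not in tokens_a) for token in tokens_b),
        )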
recognizers/diff_align.py ADDED
@@ -0,0 +1,48 @@
+"""
+Source: https://github.com/ZurichNLP/recognizing-semantic-differences
+MIT License
+Copyright (c) 2023 University of Zurich
+"""
+
+from typing import List
+
+import torch
+
+from recognizers.feature_based import FeatureExtractionRecognizer
+from recognizers.utils import DifferenceSample, cos_sim
+
+
+class DiffAlign(FeatureExtractionRecognizer):
+
+    def __str__(self):
+        return f"DiffAlign(model={self.pipeline.model.name_or_path}, layer={self.layer})"
+
+    @torch.no_grad()
+    def _predict_all(self,
+                     a: List[str],
+                     b: List[str],
+                     **kwargs,
+                     ) -> List[DifferenceSample]:
+        outputs_a = self.encode_batch(a, **kwargs)
+        outputs_b = self.encode_batch(b, **kwargs)
+        subwords_by_words_a = [self._get_subwords_by_word(sentence) for sentence in a]
+        subwords_by_words_b = [self._get_subwords_by_word(sentence) for sentence in b]
+        subword_labels_a = []
+        subword_labels_b = []
+        for i in range(len(a)):
+            cosine_similarities = cos_sim(outputs_a[i], outputs_b[i])
+            max_similarities_a = torch.max(cosine_similarities, dim=1).values
+            max_similarities_b = torch.max(cosine_similarities, dim=0).values
+            subword_labels_a.append((1 - max_similarities_a))
+            subword_labels_b.append((1 - max_similarities_b))
+        samples = []
+        for i in range(len(a)):
+            labels_a = self._subword_labels_to_word_labels(subword_labels_a[i], subwords_by_words_a[i])
+            labels_b = self._subword_labels_to_word_labels(subword_labels_b[i], subwords_by_words_b[i])
+            samples.append(DifferenceSample(
+                tokens_a=tuple(a[i].split()),
+                tokens_b=tuple(b[i].split()),
+                labels_a=tuple(labels_a),
+                labels_b=tuple(labels_b),
+            ))
+        return samples
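
The scoring step inside _predict_all can be looked at in isolation: cosine similarities between all subword pairs, then one minus the best match per subword. A sketch with random tensors standing in for the encoder outputs:

import torch
import torch.nn.functional as F

# Random stand-ins for the subword embeddings of sentence A (5 subwords) and B (7 subwords).
emb_a = torch.randn(5, 768)
emb_b = torch.randn(7, 768)

# Cosine similarity between every subword of A and every subword of B.
sim = F.normalize(emb_a, dim=-1) @ F.normalize(emb_b, dim=-1).T  # 5 x 7

# A subword counts as different to the extent that even its best-aligned
# counterpart in the other sentence is dissimilar.
labels_a = 1 - sim.max(dim=1).values  # one difference score per subword of A
labels_b = 1 - sim.max(dim=0).values  # one difference score per subword of B
print(labels_a.shape, labels_b.shape)  # torch.Size([5]) torch.Size([7])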
recognizers/diff_del.py ADDED
@@ -0,0 +1,217 @@
+"""
+Source: https://github.com/ZurichNLP/recognizing-semantic-differences
+MIT License
+Copyright (c) 2023 University of Zurich
+"""
+
+import itertools
+from copy import deepcopy
+from typing import Union, List
+
+import torch
+from transformers import Pipeline, FeatureExtractionPipeline
+
+from recognizers.feature_based import FeatureExtractionRecognizer, Ngram
+from recognizers.utils import DifferenceSample, pairwise_cos_sim, cos_sim
+
+
+class DiffDel(FeatureExtractionRecognizer):
+
+    def __init__(self,
+                 model_name_or_path: str = None,
+                 pipeline: Union[FeatureExtractionPipeline, Pipeline] = None,
+                 layer: int = -1,
+                 batch_size: int = 16,
+                 min_n: int = 1,
+                 max_n: int = 1,  # Inclusive
+                 ):
+        super().__init__(model_name_or_path, pipeline, layer, batch_size)
+        assert min_n <= max_n
+        self.min_n = min_n
+        self.max_n = max_n
+
+    def __str__(self):
+        return f"DiffDel(model={self.pipeline.model.name_or_path}, layer={self.layer}, " \
+               f"min_n={self.min_n}, max_n={self.max_n})"
+
+    @torch.no_grad()
+    def _predict_all(self,
+                     a: List[str],
+                     b: List[str],
+                     **kwargs,
+                     ) -> List[DifferenceSample]:
+        outputs_a = self.encode_batch(a, **kwargs)
+        outputs_b = self.encode_batch(b, **kwargs)
+        subwords_by_words_a = [self._get_subwords_by_word(sentence) for sentence in a]
+        subwords_by_words_b = [self._get_subwords_by_word(sentence) for sentence in b]
+        ngrams_a = [self._get_ngrams(subwords_by_word) for subwords_by_word in subwords_by_words_a]
+        ngrams_b = [self._get_ngrams(subwords_by_word) for subwords_by_word in subwords_by_words_b]
+        sentence_embeddings_a = self._get_full_sentence_embeddings(outputs_a, [list(itertools.chain.from_iterable(subwords)) for subwords in subwords_by_words_a])
+        sentence_embeddings_b = self._get_full_sentence_embeddings(outputs_b, [list(itertools.chain.from_iterable(subwords)) for subwords in subwords_by_words_b])
+        full_similarities = pairwise_cos_sim(sentence_embeddings_a, sentence_embeddings_b)
+
+        all_labels_a = []
+        all_labels_b = []
+        for i in range(len(a)):
+            partial_embeddings_a = self._get_partial_sentence_embeddings_for_sample(outputs_a[i], ngrams_a[i])
+            partial_embeddings_b = self._get_partial_sentence_embeddings_for_sample(outputs_b[i], ngrams_b[i])
+            partial_similarities_a = cos_sim(partial_embeddings_a, sentence_embeddings_b[i].unsqueeze(0)).squeeze(1)
+            partial_similarities_b = cos_sim(partial_embeddings_b, sentence_embeddings_a[i].unsqueeze(0)).squeeze(1)
+            ngram_labels_a = (partial_similarities_a - full_similarities[i] + 1) / 2
+            ngram_labels_b = (partial_similarities_b - full_similarities[i] + 1) / 2
+            subword_labels_a = self._distribute_ngram_labels_to_subwords(ngram_labels_a, ngrams_a[i])
+            subword_labels_b = self._distribute_ngram_labels_to_subwords(ngram_labels_b, ngrams_b[i])
+            labels_a = self._subword_labels_to_word_labels(subword_labels_a, subwords_by_words_a[i])
+            labels_b = self._subword_labels_to_word_labels(subword_labels_b, subwords_by_words_b[i])
+            all_labels_a.append(labels_a)
+            all_labels_b.append(labels_b)
+
+        samples = []
+        for i in range(len(a)):
+            samples.append(DifferenceSample(
+                tokens_a=tuple(a[i].split()),
+                tokens_b=tuple(b[i].split()),
+                labels_a=tuple(all_labels_a[i]),
+                labels_b=tuple(all_labels_b[i]),
+            ))
+        return samples
+
+    def _get_full_sentence_embeddings(self, token_embeddings: torch.Tensor, include_subwords: List[List[int]]) -> torch.Tensor:
+        """
+        :param token_embeddings: batch x seq_len x dim
+        :param include_subwords: batch x num_subwords
+        :return: A tensor of shape batch x dim
+        """
+        pool_mask = torch.zeros(token_embeddings.shape[0], token_embeddings.shape[1], device=token_embeddings.device)
+        for i, subword_indices in enumerate(include_subwords):
+            pool_mask[i, subword_indices] = 1
+        sentence_embeddings = self._pool(token_embeddings, pool_mask)
+        return sentence_embeddings
+
+    def _get_partial_sentence_embeddings_for_sample(self, token_embeddings: torch.Tensor, ngrams: List[Ngram]) -> torch.Tensor:
+        """
+        :param token_embeddings: seq_len x dim
+        :param ngrams: num_ngrams x n
+        :return: A tensor of shape num_ngrams x dim
+        """
+        pool_mask = torch.zeros(len(ngrams), token_embeddings.shape[0], device=token_embeddings.device)
+        pool_mask[:, list(itertools.chain.from_iterable(ngrams))] = 1
+        for i, subword_indices in enumerate(ngrams):
+            pool_mask[i, subword_indices] = 0
+        partial_embeddings = self._pool(token_embeddings.unsqueeze(0).repeat(len(ngrams), 1, 1), pool_mask)
+        return partial_embeddings
+
+    def _distribute_ngram_labels_to_subwords(self, ngram_labels: torch.Tensor, ngrams: List[Ngram]) -> torch.Tensor:
+        """
+        :param ngram_labels: num_ngrams
+        :param ngrams: num_ngrams x n
+        :return: num_subwords
+        """
+        max_subword_idx = max(itertools.chain.from_iterable(ngrams))
+        subword_contributions = torch.zeros(max_subword_idx + 1, device=ngram_labels.device)
+        contribution_count = torch.zeros(max_subword_idx + 1, device=ngram_labels.device)
+        for i, ngram in enumerate(ngrams):
+            subword_contributions[ngram] += ngram_labels[i] / len(ngram)
+            contribution_count[ngram] += 1 / len(ngram)
+        subword_contributions /= contribution_count
+        return subword_contributions
+
+
+class DiffDelWithReencode(FeatureExtractionRecognizer):
+    """
+    Version of DiffDel that encodes the partial sentences from scratch (instead of encoding the full sentence once and
+    then excluding hidden states from the mean)
+    """
+
+    def __init__(self,
+                 model_name_or_path: str = None,
+                 pipeline: Union[FeatureExtractionPipeline, Pipeline] = None,
+                 layer: int = -1,
+                 batch_size: int = 16,
+                 ):
+        super().__init__(model_name_or_path, pipeline, layer, batch_size)
+
+    def __str__(self):
+        return f"DiffDelWithReencode(model={self.pipeline.model.name_or_path}, layer={self.layer})"
+
+    @torch.no_grad()
+    def _predict_all(self,
+                     a: List[str],
+                     b: List[str],
+                     **kwargs,
+                     ) -> List[DifferenceSample]:
+        a_words = [sentence.split() for sentence in a]
+        b_words = [sentence.split() for sentence in b]
+        a_words_partial = []
+        b_words_partial = []
+        for words in a_words:
+            for i, word in enumerate(words):
+                partial = deepcopy(words)
+                del partial[i]
+                a_words_partial.append(partial)
+        for words in b_words:
+            for i, word in enumerate(words):
+                partial = deepcopy(words)
+                del partial[i]
+                b_words_partial.append(partial)
+        a_partial = [" ".join([word for word in words if word]) for words in a_words_partial]
+        b_partial = [" ".join([word for word in words if word]) for words in b_words_partial]
+        a_num_partial = [len(words) for words in a_words]
+        b_num_partial = [len(words) for words in b_words]
+        a_embedding_full = self._encode_and_pool(a, **kwargs)
+        b_embedding_full = self._encode_and_pool(b, **kwargs)
+        a_embeddings_partial = []
+        b_embeddings_partial = []
+        for i in range(0, len(a_partial), self.batch_size):
+            a_embeddings_partial_batch = self._encode_and_pool(a_partial[i:i + self.batch_size], **kwargs)
+            a_embeddings_partial.append(a_embeddings_partial_batch)
+        for i in range(0, len(b_partial), self.batch_size):
+            b_embeddings_partial_batch = self._encode_and_pool(b_partial[i:i + self.batch_size], **kwargs)
+            b_embeddings_partial.append(b_embeddings_partial_batch)
+        a_embeddings_partial = torch.cat(a_embeddings_partial, dim=0)
+        b_embeddings_partial = torch.cat(b_embeddings_partial, dim=0)
+
+        labels_a = []
+        labels_b = []
+        similarity_full = pairwise_cos_sim(a_embedding_full, b_embedding_full)
+        for i in range(len(a)):
+            a_embeddings_partial_i = a_embeddings_partial[sum(a_num_partial[:i]):sum(a_num_partial[:i + 1])]
+            similarities_partial = pairwise_cos_sim(a_embeddings_partial_i, b_embedding_full[i].unsqueeze(0)).squeeze(0)
+            labels = (similarities_partial - similarity_full[i] + 1) / 2
+            labels = labels.detach().cpu().tolist()
+            if isinstance(labels, float):
+                labels = [labels]
+            assert len(labels) == len(a_words[i])
+            labels_a.append(labels)
+        for i in range(len(b)):
+            b_embeddings_partial_i = b_embeddings_partial[sum(b_num_partial[:i]):sum(b_num_partial[:i + 1])]
+            similarities_partial = pairwise_cos_sim(b_embeddings_partial_i, a_embedding_full[i].unsqueeze(0)).squeeze(0)
+            labels = (similarities_partial - similarity_full[i] + 1) / 2
+            labels = labels.detach().cpu().tolist()
+            if isinstance(labels, float):
+                labels = [labels]
+            assert len(labels) == len(b_words[i])
+            labels_b.append(labels)
+
+        samples = []
+        for i in range(len(a)):
+            samples.append(DifferenceSample(
+                tokens_a=tuple(a_words[i]),
+                tokens_b=tuple(b_words[i]),
+                labels_a=tuple(labels_a[i]),
+                labels_b=tuple(labels_b[i]),
+            ))
+        return samples
+
+    def _encode_and_pool(self, sentences: List[str], **kwargs) -> torch.Tensor:
+        model_inputs = self.pipeline.tokenizer(sentences, return_tensors="pt", padding=True, truncation=True)
+        model_inputs = model_inputs.to(self.pipeline.device)
+        outputs = self.pipeline.model(**model_inputs, output_hidden_states=True, **kwargs)
+        if self.layer == "mean":
+            token_embeddings = torch.stack(outputs.hidden_states, dim=0).mean(dim=0)
+        else:
+            assert isinstance(self.layer, int)
+            token_embeddings = outputs.hidden_states[self.layer]
+        mask = model_inputs["attention_mask"]
+        sentence_embeddings = torch.sum(token_embeddings * mask.unsqueeze(-1), dim=1)
+        return sentence_embeddings
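
The deletion trick behind DiffDel, stripped of the batching and n-gram bookkeeping: encode the sentence once, then drop one token's hidden states from the sum-pooled embedding and see how the sentence-level similarity changes. A sketch with random tensors in place of encoder outputs:

import torch
import torch.nn.functional as F

hidden_a = torch.randn(6, 768)  # hidden states of sentence A (6 subwords)
hidden_b = torch.randn(8, 768)  # hidden states of sentence B (8 subwords)

full_a = hidden_a.sum(dim=0)    # sum-pooling; length does not matter for cosine similarity
full_b = hidden_b.sum(dim=0)
full_sim = F.cosine_similarity(full_a, full_b, dim=0)

labels_a = []
for i in range(hidden_a.shape[0]):
    mask = torch.ones(hidden_a.shape[0])
    mask[i] = 0                                           # exclude subword i from the pool
    partial_a = (hidden_a * mask.unsqueeze(-1)).sum(dim=0)
    partial_sim = F.cosine_similarity(partial_a, full_b, dim=0)
    # If deleting the subword increases the similarity, it likely marks a difference.
    labels_a.append(((partial_sim - full_sim + 1) / 2).item())

print(labels_a)  # six scores in [0, 1]; higher = more likely a difference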
recognizers/feature_based.py ADDED
@@ -0,0 +1,136 @@
+"""
+Source: https://github.com/ZurichNLP/recognizing-semantic-differences
+MIT License
+Copyright (c) 2023 University of Zurich
+"""
+
+import itertools
+from typing import List, Union
+
+import torch
+import transformers
+from transformers import FeatureExtractionPipeline, Pipeline
+
+from recognizers.base import DifferenceRecognizer
+from recognizers.utils import DifferenceSample
+
+Ngram = List[int]  # A span of subword indices
+
+
+class FeatureExtractionRecognizer(DifferenceRecognizer):
+
+    def __init__(self,
+                 model_name_or_path: str = None,
+                 pipeline: Union[FeatureExtractionPipeline, Pipeline] = None,
+                 layer: int = -1,
+                 batch_size: int = 16,
+                 ):
+        assert model_name_or_path is not None or pipeline is not None
+        if pipeline is None:
+            pipeline = transformers.pipeline(
+                model=model_name_or_path,
+                task="feature-extraction",
+            )
+        self.pipeline = pipeline
+        self.layer = layer
+        self.batch_size = batch_size
+
+    def encode_batch(self, sentences: List[str], **kwargs) -> torch.Tensor:
+        model_inputs = self.pipeline.tokenizer(sentences, return_tensors="pt", padding=True, truncation=True)
+        model_inputs = model_inputs.to(self.pipeline.device)
+        outputs = self.pipeline.model(**model_inputs, output_hidden_states=True, **kwargs)
+        return outputs.hidden_states[self.layer]
+
+    def predict(self,
+                a: str,
+                b: str,
+                **kwargs,
+                ) -> DifferenceSample:
+        return self.predict_all([a], [b], **kwargs)[0]
+
+    def predict_all(self,
+                    a: List[str],
+                    b: List[str],
+                    **kwargs,
+                    ) -> List[DifferenceSample]:
+        samples = []
+        for i in range(0, len(a), self.batch_size):
+            samples.extend(self._predict_all(
+                a[i:i + self.batch_size],
+                b[i:i + self.batch_size],
+                **kwargs,
+            ))
+        return samples
+
+    @torch.no_grad()
+    def _predict_all(self,
+                     a: List[str],
+                     b: List[str],
+                     **kwargs,
+                     ) -> List[DifferenceSample]:
+        raise NotImplementedError
+
+    def _pool(self, token_embeddings: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
+        """
+        :param token_embeddings: batch x seq_len x dim
+        :param mask: batch x seq_len; 1 if token should be included in the pooling
+        :return: batch x dim
+        Do only sum and do not divide by the number of tokens because cosine similarity is length-invariant.
+        """
+        return torch.sum(token_embeddings * mask.unsqueeze(-1), dim=1)
+
+    def _get_subwords_by_word(self, sentence: str) -> List[Ngram]:
+        """
+        :return: For each word in the sentence, the positions of the subwords that make up the word.
+        """
+        batch_encoding = self.pipeline.tokenizer(
+            sentence,
+            padding=True,
+            truncation=True,
+        )
+        subword_ids: List[List[int]] = []
+
+        for subword_idx in range(len(batch_encoding.encodings[0].word_ids)):
+            if batch_encoding.encodings[0].word_ids[subword_idx] is None:  # Special token
+                continue
+            char_idx = batch_encoding.encodings[0].offsets[subword_idx][0]
+            if isinstance(self.pipeline.tokenizer, transformers.XLMRobertaTokenizerFast) or \
+                    isinstance(self.pipeline.tokenizer, transformers.XLMRobertaTokenizer):
+                token = batch_encoding.encodings[0].tokens[subword_idx]
+                is_tail = not token.startswith("▁") and token not in self.pipeline.tokenizer.all_special_tokens
+            elif isinstance(self.pipeline.tokenizer, transformers.RobertaTokenizerFast) or \
+                    isinstance(self.pipeline.tokenizer, transformers.RobertaTokenizer):
+                token = batch_encoding.encodings[0].tokens[subword_idx]
+                is_tail = not token.startswith("Ġ") and token not in self.pipeline.tokenizer.all_special_tokens
+            else:
+                is_tail = char_idx > 0 and char_idx == batch_encoding.encodings[0].offsets[subword_idx - 1][1]
+            if is_tail and len(subword_ids) > 0:
+                subword_ids[-1].append(subword_idx)
+            else:
+                subword_ids.append([subword_idx])
+        return subword_ids
+
+    def _get_ngrams(self, subwords_by_word: List[Ngram]) -> List[Ngram]:
+        """
+        :return: For each subword ngram in the sentence, the positions of the subwords that make up the ngram.
+        """
+        subwords = list(itertools.chain.from_iterable(subwords_by_word))
+        # Always return at least one ngram (reduce n if necessary)
+        min_n = min(self.min_n, len(subwords))
+        ngrams = []
+        for n in range(min_n, self.max_n + 1):
+            for i in range(len(subwords) - n + 1):
+                ngrams.append(subwords[i:i + n])
+        return ngrams
+
+    def _subword_labels_to_word_labels(self, subword_labels: torch.Tensor, subwords_by_words: List[Ngram]) -> List[float]:
+        """
+        :param subword_labels: num_subwords
+        :param subwords_by_words: num_words x num_subwords
+        :return: num_words
+        """
+        labels = []
+        for subword_indices in subwords_by_words:
+            label = subword_labels[subword_indices].mean().item()
+            labels.append(label)
+        return labels
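
A tiny worked example of the subword bookkeeping above. The word-to-subword grouping is made up; _get_ngrams is shown for min_n = max_n = 1:

import itertools

import torch

# A hypothetical 4-word sentence whose words split into these subword positions,
# as _get_subwords_by_word would return them.
subwords_by_word = [[0], [1, 2], [3], [4, 5, 6]]

# Unigrams over subwords, as _get_ngrams yields with min_n = max_n = 1.
subwords = list(itertools.chain.from_iterable(subwords_by_word))
ngrams = [subwords[i:i + 1] for i in range(len(subwords))]
print(ngrams)  # [[0], [1], [2], [3], [4], [5], [6]]

# Averaging subword labels back to word level, as _subword_labels_to_word_labels does.
subword_labels = torch.tensor([0.1, 0.8, 0.6, 0.2, 0.9, 0.7, 0.5])
word_labels = [subword_labels[indices].mean().item() for indices in subwords_by_word]
print([round(label, 2) for label in word_labels])  # [0.1, 0.7, 0.2, 0.7]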
recognizers/utils.py ADDED
@@ -0,0 +1,129 @@
+"""
+Source: https://github.com/ZurichNLP/recognizing-semantic-differences
+MIT License
+Copyright (c) 2023 University of Zurich
+"""
+
+from dataclasses import dataclass
+from typing import Tuple, Optional
+
+import torch
+from tokenizers.pre_tokenizers import Whitespace
+from torch import Tensor
+
+
+@dataclass
+class DifferenceSample:
+    tokens_a: Tuple[str, ...]
+    tokens_b: Tuple[str, ...]
+    labels_a: Tuple[float, ...]
+    labels_b: Optional[Tuple[float, ...]]
+
+    def add_whitespace(self, encoding_a, encoding_b):
+        self.tokens_a = self._add_whitespace(self.tokens_a, encoding_a)
+        self.tokens_b = self._add_whitespace(self.tokens_b, encoding_b)
+
+    def _add_whitespace(self, tokens, encoding) -> Tuple[str, ...]:
+        assert len(tokens) == len(encoding)
+        new_tokens = []
+        for i in range(len(encoding)):
+            token = tokens[i]
+            if i < len(encoding) - 1:
+                cur_end = encoding[i][1][1]
+                next_start = encoding[i + 1][1][0]
+                token += " " * (next_start - cur_end)
+            new_tokens.append(token)
+        return tuple(new_tokens)
+
+    # For rendering with Jinja2
+    @property
+    def token_labels_a(self) -> Tuple[Tuple[str, float], ...]:
+        return tuple(zip(self.tokens_a, self.labels_a))
+
+    @property
+    def token_labels_b(self) -> Tuple[Tuple[str, float], ...]:
+        return tuple(zip(self.tokens_b, self.labels_b))
+
+    @property
+    def min(self) -> float:
+        return min(self.labels_a + self.labels_b)
+
+    @property
+    def max(self) -> float:
+        return max(self.labels_a + self.labels_b)
+
+
+def tokenize(text: str) -> Tuple[str]:
+    """
+    Apply Moses-like tokenization to a string.
+    """
+    whitespace_tokenizer = Whitespace()
+    output = whitespace_tokenizer.pre_tokenize_str(text)
+    # [('This', (0, 4)), ('is', (5, 7)), ('a', (8, 9)), ('test', (10, 14)), ('.', (14, 15))]
+    tokens = [str(token[0]) for token in output]
+    return tuple(tokens)
+
+
+def cos_sim(a: Tensor, b: Tensor):
+    """
+    Copied from https://github.com/UKPLab/sentence-transformers/blob/d928410803bb90f555926d145ee7ad3bd1373a83/sentence_transformers/util.py#L31
+
+    Computes the cosine similarity cos_sim(a[i], b[j]) for all i and j.
+    :return: Matrix with res[i][j] = cos_sim(a[i], b[j])
+    """
+    if not isinstance(a, torch.Tensor):
+        a = torch.tensor(a)
+
+    if not isinstance(b, torch.Tensor):
+        b = torch.tensor(b)
+
+    if len(a.shape) == 1:
+        a = a.unsqueeze(0)
+
+    if len(b.shape) == 1:
+        b = b.unsqueeze(0)
+
+    a_norm = torch.nn.functional.normalize(a, p=2, dim=1)
+    b_norm = torch.nn.functional.normalize(b, p=2, dim=1)
+    return torch.mm(a_norm, b_norm.transpose(0, 1))
+
+
+def pairwise_dot_score(a: Tensor, b: Tensor):
+    """
+    Copied from https://github.com/UKPLab/sentence-transformers/blob/d928410803bb90f555926d145ee7ad3bd1373a83/sentence_transformers/util.py#L73
+
+    Computes the pairwise dot-product dot_prod(a[i], b[i])
+    :return: Vector with res[i] = dot_prod(a[i], b[i])
+    """
+    if not isinstance(a, torch.Tensor):
+        a = torch.tensor(a)
+
+    if not isinstance(b, torch.Tensor):
+        b = torch.tensor(b)
+
+    return (a * b).sum(dim=-1)
+
+
+def normalize_embeddings(embeddings: Tensor):
+    """
+    Copied from https://github.com/UKPLab/sentence-transformers/blob/d928410803bb90f555926d145ee7ad3bd1373a83/sentence_transformers/util.py#L101
+
+    Normalizes the embeddings matrix, so that each sentence embedding has unit length
+    """
+    return torch.nn.functional.normalize(embeddings, p=2, dim=1)
+
+
+def pairwise_cos_sim(a: Tensor, b: Tensor):
+    """
+    Copied from https://github.com/UKPLab/sentence-transformers/blob/d928410803bb90f555926d145ee7ad3bd1373a83/sentence_transformers/util.py#L87
+
+    Computes the pairwise cossim cos_sim(a[i], b[i])
+    :return: Vector with res[i] = cos_sim(a[i], b[i])
+    """
+    if not isinstance(a, torch.Tensor):
+        a = torch.tensor(a)
+
+    if not isinstance(b, torch.Tensor):
+        b = torch.tensor(b)
+
+    return pairwise_dot_score(normalize_embeddings(a), normalize_embeddings(b))
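
The two similarity helpers differ in shape: cos_sim compares every row of a with every row of b, while pairwise_cos_sim only compares rows at the same index. A quick check (assuming the recognizers package is importable as in this repo):

import torch

from recognizers.utils import cos_sim, pairwise_cos_sim

a = torch.randn(3, 8)  # three embeddings of dimension 8
b = torch.randn(5, 8)  # five embeddings of dimension 8

print(cos_sim(a, b).shape)               # torch.Size([3, 5]) -- all pairs
print(pairwise_cos_sim(a, b[:3]).shape)  # torch.Size([3]) -- matched rows only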
result_template.html ADDED
@@ -0,0 +1,47 @@
+<p>
+{% for token, label in token_labels %}<span class="highlight-{{ label }}">{{ token }}</span>{% endfor %}
+</p>
+
+
+<style>
+.highlight-1 {
+    background: linear-gradient(90deg, transparent, rgba(255, 245, 235, 0.05), transparent);
+}
+
+.highlight-2 {
+    background: linear-gradient(90deg, transparent, rgba(254, 230, 206, 0.10), transparent);
+}
+
+.highlight-3 {
+    background: linear-gradient(90deg, transparent, rgba(253, 208, 162, 0.15), transparent);
+}
+
+.highlight-4 {
+    background: linear-gradient(90deg, transparent, rgba(253, 141, 60, 0.20), transparent);
+}
+
+.highlight-5 {
+    background: linear-gradient(90deg, transparent, rgba(241, 105, 19, 0.25), transparent);
+}
+
+.highlight-6 {
+    background: linear-gradient(90deg, transparent, rgba(217, 72, 1, 0.30), transparent);
+}
+
+.highlight-7 {
+    background: linear-gradient(90deg, transparent, rgba(127, 39, 4, 0.35), transparent);
+}
+
+.highlight-8 {
+    background: linear-gradient(90deg, transparent, rgba(127, 39, 4, 0.40), transparent);
+}
+
+.highlight-9 {
+    background: linear-gradient(90deg, transparent, rgba(127, 39, 4, 0.45), transparent);
+}
+
+.highlight-10 {
+    background: linear-gradient(90deg, transparent, rgba(127, 39, 4, 0.50), transparent);
+}
+
+</style>
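
The template can be exercised on its own; a sketch run from the repository root, with a hand-made (token, label) sequence standing in for the output of generate_diff:

from pathlib import Path

from jinja2 import Environment

template = Environment().from_string(Path("result_template.html").read_text())
# Labels are the 0-10 highlight buckets that app.py computes.
html = template.render(token_labels=[("Chinese ", 2), ("shares ", 0), ("higher ", 9)])
print(html)
# <p>
# <span class="highlight-2">Chinese </span><span class="highlight-0">shares </span><span class="highlight-9">higher </span>
# </p>
# ... followed by the <style> block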
tests.py ADDED
@@ -0,0 +1,26 @@
+from unittest import TestCase
+
+from tokenizers.pre_tokenizers import Whitespace
+
+from recognizers.utils import DifferenceSample
+
+
+class DifferenceSampleTestCase(TestCase):
+
+    def setUp(self):
+        self.text_a = "Chinese shares close higher Friday."
+        self.text_b = "Les actions chinoises clôturent en baisse mercredi."
+        self.tokenizer = Whitespace()
+        self.encoding_a = self.tokenizer.pre_tokenize_str(self.text_a)
+        self.encoding_b = self.tokenizer.pre_tokenize_str(self.text_b)
+        self.result = DifferenceSample(
+            tokens_a=tuple([token[0] for token in self.encoding_a]),
+            tokens_b=tuple([token[0] for token in self.encoding_b]),
+            labels_a=tuple([0.1 for _ in range(len(self.encoding_a))]),
+            labels_b=tuple([0.1 for _ in range(len(self.encoding_b))]),
+        )
+
+    def test_add_whitespace(self):
+        self.result.add_whitespace(self.encoding_a, self.encoding_b)
+        self.assertEqual("".join(self.result.tokens_a), self.text_a)
+        self.assertEqual("".join(self.result.tokens_b), self.text_b)