File size: 4,896 Bytes
82ce39a b8597d7 82ce39a 4f8d97f d970f1e 4f8d97f bb8ff39 4f8d97f 4be7f26 4f8d97f 502f610 4f8d97f 40c7ddf 4f8d97f 40c7ddf 4f8d97f d970f1e 4f8d97f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 |
---
tags: salesken
license: apache-2.0
inference: false
---
We have trained a model to evaluate whether a paraphrase is a semantic variation of the input query or just a surface-level variation. Data augmentation by adding surface-level variations does not add much value to NLP model training. If the approach to paraphrase generation is "over-generate and rank", it is important to have a robust model for scoring/ranking paraphrases. NLG metrics such as BLEU, BLEURT, GLEU and METEOR have not proved very effective at scoring paraphrases.
```python
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import pandas as pd
import numpy as np
# Example: score a list of candidate paraphrases against one input query and
# rank them by how much of a semantic (rather than surface-level) variation
# each one is.

# Load the ranker checkpoint and its matching tokenizer.
tokenizer = AutoTokenizer.from_pretrained("salesken/paraphrase_diversity_ranker")
model = AutoModelForSequenceClassification.from_pretrained("salesken/paraphrase_diversity_ranker")

input_query = ["tough challenges make you stronger."]
paraphrases = [
    "tough problems make you stronger",
    "tough problems will make you stronger",
    "tough challenges make you stronger",
    "tough challenges will make you a stronger person",
    "tough challenges will make you stronger",
    "tough tasks make you stronger",
    "the tough task makes you stronger",
    "tough stuff makes you stronger",
    "if tough times make you stronger",
    "the tough part makes you stronger",
    "tough issues strengthens you",
    "tough shit makes you stronger",
    "tough tasks force you to be stronger",
    "tough challenge is making you stronger",
    "tough problems make you have more strength"]

# Build one (query, paraphrase) tuple per candidate; the whole list is
# tokenized as a single padded batch.
para_pairs = list(pd.MultiIndex.from_product([input_query, paraphrases]))
features = tokenizer(para_pairs, padding=True, truncation=True, return_tensors="pt")

# Inference only: switch to eval mode and skip gradient tracking.
model.eval()
with torch.no_grad():
    scores = model(**features).logits

# Logit column 0 -> 'surface_level_variation', column 1 -> 'semantic_variation'.
label_mapping = ['surface_level_variation', 'semantic_variation']
labels = [label_mapping[score_max] for score_max in scores.argmax(dim=1)]

# Order the pairs by their 'semantic_variation' logit, highest first.
ranking = scores[:, 1].sort(descending=True).indices
sorted_diverse_paraphrases = np.array(para_pairs)[ranking].tolist()
print(sorted_diverse_paraphrases)
# to identify the type of paraphrase (surface-level variation or semantic variation)
print("Paraphrase type detection=====", list(zip(para_pairs, labels)))
```
============================================================================
For more robust results, first filter out the paraphrases that are not semantically
similar to the input query, using a model trained on NLI/STS tasks, and then apply the ranker.
```python
from transformers import AutoTokenizer, AutoModelWithLMHead
from transformers import AutoModelForSequenceClassification
from sentence_transformers import SentenceTransformer, util
import torch
import pandas as pd
import numpy as np
# Example: first drop candidates that are not semantically close to the query
# (cosine similarity of sentence embeddings), then rank the survivors with the
# paraphrase-diversity ranker.

# Load the ranker checkpoint, its tokenizer, and the sentence embedder used
# for the similarity filter.
tokenizer = AutoTokenizer.from_pretrained("salesken/paraphrase_diversity_ranker")
model = AutoModelForSequenceClassification.from_pretrained("salesken/paraphrase_diversity_ranker")
embedder = SentenceTransformer('stsb-bert-large')

input_query = ["tough challenges make you stronger."]
paraphrases = [
    "tough problems make you stronger",
    "tough problems will make you stronger",
    "tough challenges make you stronger",
    "tough challenges will make you a stronger person",
    "tough challenges will make you stronger",
    "tough tasks make you stronger",
    "the tough task makes you stronger",
    "tough stuff makes you stronger",
    "tough people make you stronger",
    "if tough times make you stronger",
    "the tough part makes you stronger",
    "tough issues strengthens you",
    "tough shit makes you stronger",
    "tough tasks force you to be stronger",
    "tough challenge is making you stronger",
    "tough problems make you have more strength"]

# Minimum cosine similarity to the query for a paraphrase to be kept.
similarity_threshold = 0.7

# Embed the candidates and the query, then compare the query with each candidate.
corpus_embeddings = embedder.encode(paraphrases, convert_to_tensor=True)
query_embedding = embedder.encode(input_query, convert_to_tensor=True)
cos_scores = util.pytorch_cos_sim(query_embedding, corpus_embeddings)[0]

# Keep only candidates at or above the threshold, most similar first.
para_set = np.array(paraphrases)
a = cos_scores.sort(descending=True)
para = para_set[a.indices[a.values >= similarity_threshold].cpu()].tolist()
para_pairs = list(pd.MultiIndex.from_product([input_query, para]))

if para_pairs:
    # The (query, paraphrase) tuples are tokenized as one padded batch.
    features = tokenizer(para_pairs, padding=True, truncation=True, return_tensors="pt")

    # Inference only: switch to eval mode and skip gradient tracking.
    model.eval()
    with torch.no_grad():
        scores = model(**features).logits

    # Logit column 0 -> 'surface_level_variation', column 1 -> 'semantic_variation'.
    label_mapping = ['surface_level_variation', 'semantic_variation']
    labels = [label_mapping[score_max] for score_max in scores.argmax(dim=1)]

    # Order the kept paraphrases by their 'semantic_variation' logit, highest first.
    ranking = scores[:, 1].sort(descending=True).indices
    sorted_diverse_paraphrases = np.array(para)[ranking].tolist()
else:
    # No candidate passed the similarity filter, so there is nothing to
    # tokenize or rank; report empty results instead of running the model
    # on an empty batch.
    labels = []
    sorted_diverse_paraphrases = []

print("Paraphrases sorted by diversity:=======",sorted_diverse_paraphrases)
# to identify the type of paraphrase (surface-level variation or semantic variation)
print("Paraphrase type detection=====", list(zip(para_pairs, labels)))
```