Upload folder using huggingface_hub
- app.py +2 -4
- masking_methods.py +44 -1
app.py CHANGED
@@ -1,3 +1,5 @@
+import nltk
+nltk.download('stopwords')
 from transformers import AutoTokenizer
 from transformers import AutoModelForSeq2SeqLM
 import plotly.graph_objs as go
@@ -26,7 +28,6 @@ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForMaske
 import random
 from nltk.corpus import stopwords
 from termcolor import colored
-import nltk
 from nltk.translate.bleu_score import sentence_bleu
 from transformers import BertTokenizer, BertModel
 import gradio as gr
@@ -36,9 +37,6 @@ from lcs import find_common_subsequences
 from highlighter import highlight_common_words, highlight_common_words_dict
 from entailment import analyze_entailment
 
-nltk.download('stopwords')
-
-
 # Function for the Gradio interface
 def model(prompt):
     sentence = prompt
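The app.py change hoists the NLTK import and the stopwords download to the very top of the file, so the corpus is present before any later import reads it. A minimal sketch of the resulting order (illustrative only, not part of the commit):

import nltk
nltk.download('stopwords')            # fetch the corpus first, even on a fresh environment
from nltk.corpus import stopwords     # now safe to read
print(stopwords.words('english')[:5])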
masking_methods.py CHANGED
@@ -2,6 +2,7 @@ from transformers import AutoTokenizer, AutoModelForMaskedLM
 from transformers import pipeline
 import random
 from nltk.corpus import stopwords
+import math
 
 # Masking Model
 def mask_non_stopword(sentence):
@@ -14,6 +15,47 @@ def mask_non_stopword(sentence):
     masked_sentence = sentence.replace(word_to_mask, '[MASK]', 1)
     return masked_sentence
 
+def mask_non_stopword_pseudorandom(sentence):
+    stop_words = set(stopwords.words('english'))
+    words = sentence.split()
+    non_stop_words = [word for word in words if word.lower() not in stop_words]
+    if not non_stop_words:
+        return sentence
+    random.seed(10)
+    word_to_mask = random.choice(non_stop_words)
+    masked_sentence = sentence.replace(word_to_mask, '[MASK]', 1)
+    return masked_sentence
+
+def high_entropy_words(sentence, non_melting_points):
+    stop_words = set(stopwords.words('english'))
+    words = sentence.split()
+
+    non_melting_words = set()
+    for _, point in non_melting_points:
+        non_melting_words.update(point.lower().split())
+
+    candidate_words = [word for word in words if word.lower() not in stop_words and word.lower() not in non_melting_words]
+
+    if not candidate_words:
+        return sentence
+
+    max_entropy = -float('inf')
+    max_entropy_word = None
+
+    for word in candidate_words:
+        masked_sentence = sentence.replace(word, '[MASK]', 1)
+        predictions = fill_mask(masked_sentence)
+
+        # Calculate entropy based on top 5 predictions
+        entropy = -sum(pred['score'] * math.log(pred['score']) for pred in predictions[:5])
+
+        if entropy > max_entropy:
+            max_entropy = entropy
+            max_entropy_word = word
+
+    return sentence.replace(max_entropy_word, '[MASK]', 1)
+
+
 # Load tokenizer and model for masked language model
 tokenizer = AutoTokenizer.from_pretrained("bert-large-cased-whole-word-masking")
 model = AutoModelForMaskedLM.from_pretrained("bert-large-cased-whole-word-masking")
@@ -22,4 +64,5 @@ fill_mask = pipeline("fill-mask", model=model, tokenizer=tokenizer)
 def mask(sentence):
     predictions = fill_mask(sentence)
     masked_sentences = [predictions[i]['sequence'] for i in range(len(predictions))]
-    return masked_sentences
+    return masked_sentences
+
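A quick usage sketch for the two new masking strategies. The example sentence and non-melting points below are made-up inputs: the (index, phrase) tuple shape is only inferred from how high_entropy_words unpacks its argument, and the first call downloads bert-large-cased-whole-word-masking, which is large.

from masking_methods import mask_non_stopword_pseudorandom, high_entropy_words

sentence = "The quick brown fox jumps over the lazy dog"
non_melting_points = [(0, "quick brown")]   # hypothetical (position, phrase) pairs whose words stay unmasked

print(mask_non_stopword_pseudorandom(sentence))          # seeded RNG, so the same word is masked on every run
print(high_entropy_words(sentence, non_melting_points))  # masks the candidate whose fill-in is most uncertain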