jgyasu committed
Commit 4b89d6b · verified · 1 Parent(s): 4f150bd

Upload folder using huggingface_hub

Files changed (2)
  1. app.py +2 -4
  2. masking_methods.py +44 -1
app.py CHANGED
@@ -1,3 +1,5 @@
+import nltk
+nltk.download('stopwords')
 from transformers import AutoTokenizer
 from transformers import AutoModelForSeq2SeqLM
 import plotly.graph_objs as go
@@ -26,7 +28,6 @@ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForMaskedLM
 import random
 from nltk.corpus import stopwords
 from termcolor import colored
-import nltk
 from nltk.translate.bleu_score import sentence_bleu
 from transformers import BertTokenizer, BertModel
 import gradio as gr
@@ -36,9 +37,6 @@ from lcs import find_common_subsequences
 from highlighter import highlight_common_words, highlight_common_words_dict
 from entailment import analyze_entailment
 
-nltk.download('stopwords')
-
-
 # Function for the Gradio interface
 def model(prompt):
     sentence = prompt
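
The net effect in app.py: `import nltk` and the one-time `nltk.download('stopwords')` move from the middle of the import block to the top of the module, so the corpus is fetched before anything calls `stopwords.words('english')`. A minimal sketch of the same idea with a guard that skips re-downloading; the `nltk.data.find` check is an illustration added here, not part of this commit:

import nltk

# Make sure the stopwords corpus exists before any code reads it.
# nltk.data.find raises LookupError when the corpus is missing.
try:
    nltk.data.find("corpora/stopwords")
except LookupError:
    nltk.download("stopwords")

from nltk.corpus import stopwords  # safe: the corpus is now on disk
print(stopwords.words("english")[:3])  # ['i', 'me', 'my']
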
masking_methods.py CHANGED
@@ -2,6 +2,7 @@ from transformers import AutoTokenizer, AutoModelForMaskedLM
 from transformers import pipeline
 import random
 from nltk.corpus import stopwords
+import math
 
 # Masking Model
 def mask_non_stopword(sentence):
@@ -14,6 +15,47 @@ def mask_non_stopword(sentence):
     masked_sentence = sentence.replace(word_to_mask, '[MASK]', 1)
     return masked_sentence
 
+def mask_non_stopword_pseudorandom(sentence):
+    stop_words = set(stopwords.words('english'))
+    words = sentence.split()
+    non_stop_words = [word for word in words if word.lower() not in stop_words]
+    if not non_stop_words:
+        return sentence
+    random.seed(10)
+    word_to_mask = random.choice(non_stop_words)
+    masked_sentence = sentence.replace(word_to_mask, '[MASK]', 1)
+    return masked_sentence
+
+def high_entropy_words(sentence, non_melting_points):
+    stop_words = set(stopwords.words('english'))
+    words = sentence.split()
+
+    non_melting_words = set()
+    for _, point in non_melting_points:
+        non_melting_words.update(point.lower().split())
+
+    candidate_words = [word for word in words if word.lower() not in stop_words and word.lower() not in non_melting_words]
+
+    if not candidate_words:
+        return sentence
+
+    max_entropy = -float('inf')
+    max_entropy_word = None
+
+    for word in candidate_words:
+        masked_sentence = sentence.replace(word, '[MASK]', 1)
+        predictions = fill_mask(masked_sentence)
+
+        # Calculate entropy based on top 5 predictions
+        entropy = -sum(pred['score'] * math.log(pred['score']) for pred in predictions[:5])
+
+        if entropy > max_entropy:
+            max_entropy = entropy
+            max_entropy_word = word
+
+    return sentence.replace(max_entropy_word, '[MASK]', 1)
+
+
 # Load tokenizer and model for masked language model
 tokenizer = AutoTokenizer.from_pretrained("bert-large-cased-whole-word-masking")
 model = AutoModelForMaskedLM.from_pretrained("bert-large-cased-whole-word-masking")
@@ -22,4 +64,5 @@ fill_mask = pipeline("fill-mask", model=model, tokenizer=tokenizer)
 def mask(sentence):
     predictions = fill_mask(sentence)
     masked_sentences = [predictions[i]['sequence'] for i in range(len(predictions))]
-    return masked_sentences
+    return masked_sentences
+
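
For context on the new `high_entropy_words` helper: it masks each candidate word in turn, asks the fill-mask pipeline for predictions, and keeps the word whose masked slot the model is least certain about. A worked sketch of just the entropy formula, using made-up top-5 scores in place of a real pipeline call:

import math

# Hypothetical top-5 fill-mask scores (the pipeline returns only the top-k
# slice of a softmax, so these need not sum exactly to 1).
confident = [0.90, 0.04, 0.03, 0.02, 0.01]   # model is sure of the fill
uncertain = [0.22, 0.21, 0.20, 0.19, 0.18]   # model is torn between fills

def top_k_entropy(scores):
    # Same formula as in high_entropy_words: -sum(p * log p) over top-k scores.
    return -sum(p * math.log(p) for p in scores)

print(round(top_k_entropy(confident), 2))   # 0.45 -> low entropy, kept as-is
print(round(top_k_entropy(uncertain), 2))   # 1.61 -> high entropy, gets masked
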
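And a hedged usage sketch of the two new helpers. The `(index, phrase)` shape of `non_melting_points` is inferred from the `for _, point in non_melting_points` unpacking; the actual structure produced elsewhere in the repo (e.g. by `find_common_subsequences` from lcs.py) is not shown in this diff:

from masking_methods import mask_non_stopword_pseudorandom, high_entropy_words

sentence = "The quick brown fox jumps over the lazy dog"

# Seeded choice: random.seed(10) inside the helper makes the pick repeatable.
print(mask_non_stopword_pseudorandom(sentence))

# Phrases in non_melting_points are protected from masking. Note that importing
# masking_methods loads BERT at module level, and this call runs the fill-mask
# pipeline once per candidate word, so it is slow on CPU.
non_melting_points = [(0, "quick brown"), (1, "lazy dog")]
print(high_entropy_words(sentence, non_melting_points))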