ronaldahmed committed
Commit 73342a7
1 Parent(s): 65798f3

Update ccl_win.py

Files changed (1): ccl_win.py (+101 -66)

ccl_win.py CHANGED
@@ -64,34 +64,46 @@ Examples:
 BAD_WORDS_URL = "http://url/to/external/resource/bad_words.txt"
 WINDOW_SIZE = 3

- @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
- class ccl_win(evaluate.Measurement):
-     """TODO: Short description of my evaluation module."""
-
-     def _info(self):
-         # TODO: Specifies the evaluate.EvaluationModuleInfo object
-         return evaluate.MeasurementInfo(
-             # This is the description that will appear on the modules page.
-             module_type="measurement",
-             description=_DESCRIPTION,
-             citation=_CITATION,
-             inputs_description=_KWARGS_DESCRIPTION,
-             # This defines the format of each prediction and reference
-             features=datasets.Features({
-                 'predictions': datasets.Value('string'),
-             }),
-             # Homepage of the module for documentation
-             homepage="http://module.homepage",
-             # Additional links to the codebase or references
-             codebase_urls=["http://github.com/path/to/codebase/of/new_module"],
-             reference_urls=["http://path.to.reference.url/new_module"]
-         )
-
-     def _download_and_prepare(self, dl_manager):
-         """Optional: download external resources useful to compute the scores"""
-         # TODO: Download external resources if needed
-         pass
+ @contextmanager
+ def filter_logging_context():
+     def filter_log(record):
+         return False if "This IS expected if you are initializing" in record.msg else True
+
+     logger = datasets.utils.logging.get_logger("transformers.modeling_utils")
+     logger.addFilter(filter_log)
+     try:
+         yield
+     finally:
+         logger.removeFilter(filter_log)
+
+
+ class Scorer:
+
+     def __init__(
+         self,
+         model_type=None,
+         batch_size=64,
+         device=None,
+         use_fast_tokenizer=False):
+
+         if device is not None:
+             # assert device in ["gpu", "cpu", "cuda"], "device should be either gpu or cpu."
+             if device == "gpu":
+                 device = "cuda"
+         else:
+             device = "cuda" if torch.cuda.is_available() else "cpu"
+         self.device = device
+         self.model_type = model_type
+         self.batch_size = batch_size
+         self._tokenizer = AutoTokenizer.from_pretrained("roberta-large")
+         self._model = AutoModelForSequenceClassification.from_pretrained(model_type)
+         self._model.to(device)
+         self._model.eval()
+
+     @property
+     def hash(self):
+         return self.model_type

     def preprocess_adjacent_window(self,preds):
         pred_list = []
@@ -114,37 +126,7 @@ class ccl_win(evaluate.Measurement):
         return pred_list,lens


-     def _compute(self, predictions, dataset="arxiv", batch_size: int = 16, device=None, use_aggregator=True):
-         """Returns the scores"""
-         MODEL_CACHE_DIR = "/home/rcardena/.cache/huggingface/"
-         BASEDIR = "/bask/projects/j/jlxi8926-auto-sum/rcardenas/tools/ccl_win"
-         if getpass.getuser() == "s1987051":
-             MODEL_CACHE_DIR="/disk/ocean/rcardenas/tools/huggingface/"
-         elif getpass.getuser() == "rcardena":
-             MODEL_CACHE_DIR="/gfs/team/nlp/users/rcardena/tools/huggingface/"
-         elif getpass.getuser() == "gvhr8913":
-             MODEL_CACHE_DIR="/bask/projects/j/jlxi8926-auto-sum/rcardenas/cache"
-
-         if device is not None:
-             # assert device in ["gpu", "cpu", "cuda"], "device should be either gpu or cpu."
-             if device == "gpu":
-                 device = "cuda"
-         else:
-             device = "cuda" if torch.cuda.is_available() else "cpu"
-
-         results = []
-         sent_lens = [len(x.split("\n")) for x in predictions]
-         aggregator = None
-         if use_aggregator:
-             np.random.seed(42)
-             aggregator = scoring.BootstrapAggregator()
-
-         tokenizer = AutoTokenizer.from_pretrained("roberta-large")
-
-         model = AutoModelForSequenceClassification.from_pretrained(os.path.join(BASEDIR,dataset))
-         model.to(device)
-         model.eval()
+     def score(self,predictions):

         pred_list,len_by_sample = self.preprocess_adjacent_window(predictions)

@@ -153,27 +135,80 @@ class ccl_win(evaluate.Measurement):
         with torch.no_grad():
             for b in range(0,n_preds,batch_size):
                 strides = [x.lower() for x in pred_list[b:b+batch_size]]
-                 tinput = tokenizer(strides,padding=True,truncation=True,max_length=512,return_tensors="pt")
-                 tinput = {k:v.to(device) for k,v in tinput.items()}
-                 output = model(**tinput)
+                 tinput = self._tokenizer(strides,padding=True,truncation=True,max_length=512,return_tensors="pt")
+                 tinput = {k:v.to(self.device) for k,v in tinput.items()}
+                 output = self._model(**tinput)
                 probs = torch.softmax(output.logits,dim=-1).detach().cpu().numpy()
                 scores.extend(probs[:,0].tolist())
             #

+         results = []
         offset = 0
+
         for i,_len in enumerate(len_by_sample):
             score = float(np.mean(scores[offset:offset+_len])) if sent_lens[i]>1 else 0.
-             if use_aggregator:
-                 aggregator.add_scores({"loc_coh_ccl": score})
-             else:
-                 results.append(score)
+             results.append(score)
             offset += _len
         #
+         return results
+
+
+ @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
+ class ccl_win(evaluate.Measurement):
+     """TODO: Short description of my evaluation module."""
+
+     def _info(self):
+         # TODO: Specifies the evaluate.EvaluationModuleInfo object
+         return evaluate.MeasurementInfo(
+             # This is the description that will appear on the modules page.
+             module_type="measurement",
+             description=_DESCRIPTION,
+             citation=_CITATION,
+             inputs_description=_KWARGS_DESCRIPTION,
+             # This defines the format of each prediction and reference
+             features=datasets.Features({
+                 'predictions': datasets.Value('string'),
+             }),
+             # Homepage of the module for documentation
+             homepage="http://module.homepage",
+             # Additional links to the codebase or references
+             codebase_urls=["http://github.com/path/to/codebase/of/new_module"],
+             reference_urls=["http://path.to.reference.url/new_module"]
+         )
+
+     def _download_and_prepare(self, dl_manager):
+         """Optional: download external resources useful to compute the scores"""
+         # TODO: Download external resources if needed
+         pass
+
+     def _compute(self, predictions, dataset="arxiv", batch_size: int = 16, device=None, use_aggregator=True):
+         """Returns the scores"""
+         hashcode = dataset
+         with filter_logging_context():
+             if not hasattr(self, "cached_scorer") or self.cached_scorer.hash != hashcode:
+                 self.cached_scorer = Scorer(
+                     model_type=dataset,
+                     batch_size=batch_size,
+                     device=device,
+                 )
+         results = self.cached_scorer.score(predictions)
         outres = {}
+
+         aggregator = None
         if use_aggregator:
+             np.random.seed(42)
+             aggregator = scoring.BootstrapAggregator()
+             for score in results:
+                 aggregator.add_scores({"loc_coh_ccl": score})
+             #

             res = aggregator.aggregate()
             for k in res: outres[k] = res[k].mid
         else:
             outres = {"loc_coh_ccl": results}
-
-         return outres
+         return outres
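The main behavioral change in this commit is that model and tokenizer loading moves out of `_compute` into a `Scorer` that is cached on the module and rebuilt only when the requested checkpoint (`dataset`) changes. A minimal sketch of that caching idiom, with illustrative names (`DummyScorer`, `Module`) that are not part of the repo:

# Minimal sketch of the scorer-caching idiom used by the new _compute.
# DummyScorer stands in for ccl_win's Scorer; only the cache logic matters here.
class DummyScorer:
    def __init__(self, model_type):
        self.model_type = model_type  # the expensive model load would happen here

    @property
    def hash(self):
        # Cache key: a rebuild is needed only when the checkpoint changes.
        return self.model_type

class Module:
    def compute(self, model_type):
        # Rebuild the scorer only if none is cached or the key changed.
        if not hasattr(self, "cached_scorer") or self.cached_scorer.hash != model_type:
            self.cached_scorer = DummyScorer(model_type)
        return self.cached_scorer

m = Module()
assert m.compute("arxiv") is m.compute("arxiv")        # second call reuses the cache
assert m.compute("arxiv") is not m.compute("pubmed")   # key change forces a rebuild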
 
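For context, a hedged usage sketch of the measurement after this change. The repo id below and the "arxiv" checkpoint name are assumptions inferred from the defaults in the diff, not confirmed entry points:

import evaluate

# Hypothetical invocation; the repo id is assumed, and newline-separated
# sentences follow the preprocess_adjacent_window input convention in the diff.
ccl = evaluate.load("ronaldahmed/ccl_win", module_type="measurement")
docs = ["First sentence.\nSecond sentence.\nThird sentence."]
res = ccl.compute(predictions=docs, dataset="arxiv", batch_size=16)
print(res["loc_coh_ccl"])  # aggregated mid score when use_aggregator=True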