ronald committed on
Commit
68677f3
1 Parent(s): 026f3d2
Files changed (1) hide show
  1. ccl_win.py +67 -7
ccl_win.py CHANGED
@@ -15,7 +15,11 @@
15
 
16
  import evaluate
17
  import datasets
18
-
 
 
 
 
19
 
20
  # TODO: Add BibTeX citation
21
  _CITATION = """\
@@ -28,7 +32,7 @@ year={2020}
28
 
29
  # TODO: Add description of the module here
30
  _DESCRIPTION = """\
31
- This new module is designed to solve this great ML task and is crafted with a lot of care.
32
  """
33
 
34
 
@@ -55,11 +59,12 @@ Examples:
55
 
56
  # TODO: Define external resources urls if needed
57
  BAD_WORDS_URL = "http://url/to/external/resource/bad_words.txt"
58
-
59
 
60
  @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
61
  class ccl_win(evaluate.Measurement):
62
  """TODO: Short description of my evaluation module."""
 
63
 
64
  def _info(self):
65
  # TODO: Specifies the evaluate.EvaluationModuleInfo object
@@ -86,10 +91,65 @@ class ccl_win(evaluate.Measurement):
86
  # TODO: Download external resources if needed
87
  pass
88
 
89
- def _compute(self, predictions, references):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
90
  """Returns the scores"""
91
- # TODO: Compute the different scores of the module
92
- accuracy = sum(i == j for i, j in zip(predictions, references)) / len(predictions)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93
  return {
94
- "accuracy": accuracy,
95
  }
 
15
 
16
  import evaluate
17
  import datasets
18
+ import numpy as np
19
+ from transformers import AutoModelForSequenceClassification, AutoTokenizer
20
+ import getpass
21
+ import pdb
22
+ import os
23
 
24
  # TODO: Add BibTeX citation
25
  _CITATION = """\
 
32
 
33
  # TODO: Add description of the module here
34
  _DESCRIPTION = """\
35
+ local coherence with classifier trained on the shuffle task, window=3 sentences
36
  """
37
 
38
 
 
59
 
60
  # TODO: Define external resources urls if needed
61
  BAD_WORDS_URL = "http://url/to/external/resource/bad_words.txt"
62
+ WINDOW_SIZE = 3
63
 
64
  @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
65
  class ccl_win(evaluate.Measurement):
66
  """TODO: Short description of my evaluation module."""
67
+
68
 
69
  def _info(self):
70
  # TODO: Specifies the evaluate.EvaluationModuleInfo object
 
91
  # TODO: Download external resources if needed
92
  pass
93
 
94
def preprocess_adjacent_window(self, preds):
    """Expand each prediction into overlapping windows of consecutive sentences.

    Sentences are the newline-separated pieces of a prediction. A prediction
    with at most WINDOW_SIZE sentences is kept whole as its single window;
    otherwise one window is produced per start position, each joining
    WINDOW_SIZE consecutive sentences with a newline.

    Args:
        preds: iterable of prediction strings, one sentence per line.

    Returns:
        A pair ``(windows, counts)``: ``windows`` is the flat list of window
        strings for all predictions in input order, and ``counts[i]`` is the
        number of windows contributed by the i-th prediction.
    """
    windows = []
    counts = []
    for document in preds:
        sentences = document.split("\n")
        if len(sentences) <= WINDOW_SIZE:
            # Too short to slide over: the untouched prediction is its only window.
            windows.append(document)
            counts.append(1)
            continue
        spans = [
            "\n".join(sentences[start:start + WINDOW_SIZE])
            for start in range(len(sentences) - WINDOW_SIZE + 1)
        ]
        windows.extend(spans)
        counts.append(len(spans))
    return windows, counts
113
+
114
+
115
+
116
def _compute(self, predictions, dataset, device=None):
    """Returns the scores

    Splits every prediction into sentence windows (see
    ``preprocess_adjacent_window``), runs each window through a sequence
    classifier loaded from ``./{dataset}/`` and averages, per prediction, the
    softmax probability of class index 0 over its windows.

    Args:
        predictions: list of prediction strings, one sentence per line.
        dataset: name of the directory (relative to the working directory)
            holding the fine-tuned classifier. The fast tokenizer is disabled
            when the name contains "cnn_dailymail".
        device: "cpu", "cuda" or "gpu" (alias of "cuda"). When None, CUDA is
            used if available, otherwise the CPU.

    Returns:
        ``{"loc_coh_ccl": scores}`` where ``scores`` holds one float per
        prediction.

    Raises:
        ValueError: if ``device`` is not one of the supported names.
    """
    # torch is not among the file-level imports, so bring it into scope here.
    import torch

    # NOTE(review): cache locations are tied to specific user accounts;
    # any other user falls through to the first path.
    MODEL_CACHE_DIR = "/home/rcardena/.cache/huggingface/"
    current_user = getpass.getuser()
    if current_user == "s1987051":
        MODEL_CACHE_DIR = "/disk/ocean/rcardenas/tools/huggingface/"
    elif current_user == "rcardena":
        MODEL_CACHE_DIR = "/gfs/team/nlp/users/rcardena/tools/huggingface/"

    if device is not None:
        # Explicit raise instead of assert: asserts are stripped under -O.
        if device not in ("gpu", "cpu", "cuda"):
            raise ValueError("device should be either gpu or cpu.")
        if device == "gpu":
            device = "cuda"
    else:
        device = "cuda" if torch.cuda.is_available() else "cpu"

    tokenizer = AutoTokenizer.from_pretrained(
        "roberta-large",
        cache_dir=MODEL_CACHE_DIR,
        use_fast="cnn_dailymail" not in dataset,
    )
    # Use the class imported at file level; the bare name `transformers`
    # is never bound in this module.
    model = AutoModelForSequenceClassification.from_pretrained(
        f"./{dataset}/", num_labels=2, cache_dir=MODEL_CACHE_DIR
    )
    model.to(device)

    pred_list, len_by_sample = self.preprocess_adjacent_window(predictions)

    scores = []
    # Inference only: skip autograd bookkeeping for every forward pass.
    with torch.no_grad():
        for text in pred_list:
            sents = text.lower().split("\n")
            strides = [
                "\n".join(sents[i:i + WINDOW_SIZE])
                for i in range(0, len(sents), WINDOW_SIZE)
            ]
            tinput = tokenizer(
                strides,
                padding=True,
                truncation=True,
                max_length=512,
                return_tensors="pt",
            )
            tinput = {k: v.to(device) for k, v in tinput.items()}
            output = model(**tinput)
            probs = torch.softmax(output.logits, dim=-1).detach().cpu().numpy()
            scores.append(probs[:, 0].mean())

    # Fold the flat per-window scores back into one mean per prediction,
    # consuming len_by_sample[i] consecutive entries for the i-th prediction.
    results = []
    offset = 0
    for _len in len_by_sample:
        results.append(float(np.mean(scores[offset:offset + _len])))
        offset += _len

    return {
        "loc_coh_ccl": results,
    }