prajwal967 commited on
Commit
584d7ba
·
1 Parent(s): d27326e

delete unused folders

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. deid/__init__.py +0 -2
  2. deid/text_deid.py +0 -307
  3. deid/utils.py +0 -43
  4. ner_datasets/__init__.py +0 -4
  5. ner_datasets/__pycache__/__init__.cpython-37.pyc +0 -0
  6. ner_datasets/dataset_builder/__init__.py +0 -3
  7. ner_datasets/dataset_builder/dataset.py +0 -119
  8. ner_datasets/dataset_builder/labels/__init__.py +0 -4
  9. ner_datasets/dataset_builder/labels/mismatch_error.py +0 -7
  10. ner_datasets/dataset_builder/labels/ner_predict_token_labels.py +0 -30
  11. ner_datasets/dataset_builder/labels/ner_token_labels.py +0 -156
  12. ner_datasets/dataset_builder/sentence_dataset.py +0 -355
  13. ner_datasets/dataset_creator.py +0 -322
  14. ner_datasets/dataset_splitter.py +0 -294
  15. ner_datasets/distribution/__init__.py +0 -4
  16. ner_datasets/distribution/dataset_splits.py +0 -218
  17. ner_datasets/distribution/ner_distribution.py +0 -54
  18. ner_datasets/distribution/print_distribution.py +0 -49
  19. ner_datasets/preprocessing/__init__.py +0 -2
  20. ner_datasets/preprocessing/preprocessing_loader.py +0 -64
  21. ner_datasets/preprocessing/sentencizers/__init__.py +0 -3
  22. ner_datasets/preprocessing/sentencizers/mimic_stanza_sentencizer.py +0 -37
  23. ner_datasets/preprocessing/sentencizers/note_sentencizer.py +0 -33
  24. ner_datasets/preprocessing/sentencizers/spacy_sentencizer.py +0 -37
  25. ner_datasets/preprocessing/tokenizers/__init__.py +0 -4
  26. ner_datasets/preprocessing/tokenizers/abbreviations/check.txt +0 -20
  27. ner_datasets/preprocessing/tokenizers/abbreviations/medical_abbreviations_curated.txt +0 -87
  28. ner_datasets/preprocessing/tokenizers/abbreviations/medical_abbreviations_wiki.txt +0 -459
  29. ner_datasets/preprocessing/tokenizers/clinical_spacy_tokenizer.py +0 -73
  30. ner_datasets/preprocessing/tokenizers/core_nlp_tokenizer.py +0 -58
  31. ner_datasets/preprocessing/tokenizers/spacy_tokenizer.py +0 -49
  32. ner_datasets/preprocessing/tokenizers/utils/__init__.py +0 -4
  33. ner_datasets/preprocessing/tokenizers/utils/clean_regex.py +0 -64
  34. ner_datasets/preprocessing/tokenizers/utils/clinical_regex.py +0 -309
  35. ner_datasets/preprocessing/tokenizers/utils/date_regex.py +0 -104
  36. ner_datasets/span_fixer.py +0 -380
  37. ner_datasets/span_validation.py +0 -91
  38. sequence_tagging/.DS_Store +0 -0
  39. sequence_tagging/__init__.py +0 -2
  40. sequence_tagging/__pycache__/__init__.cpython-37.pyc +0 -0
  41. sequence_tagging/__pycache__/sequence_tagger.cpython-37.pyc +0 -0
  42. sequence_tagging/arguments/__init__.py +0 -8
  43. sequence_tagging/arguments/data_training_arguments.py +0 -115
  44. sequence_tagging/arguments/evaluation_arguments.py +0 -26
  45. sequence_tagging/arguments/model_arguments.py +0 -43
  46. sequence_tagging/dataset_builder/__init__.py +0 -5
  47. sequence_tagging/dataset_builder/dataset_tokenizer.py +0 -178
  48. sequence_tagging/dataset_builder/label_mapper.py +0 -87
  49. sequence_tagging/dataset_builder/ner_dataset.py +0 -102
  50. sequence_tagging/dataset_builder/ner_labels.py +0 -67
deid/__init__.py DELETED
@@ -1,2 +0,0 @@
1
- from .text_deid import TextDeid
2
- __all__ = ["TextDeid"]
 
 
 
deid/text_deid.py DELETED
@@ -1,307 +0,0 @@
1
- import json
2
- import re
3
- from argparse import ArgumentParser
4
- from typing import Sequence, List, Tuple, Mapping, Union, Any, Type
5
-
6
- import regex
7
- from seqeval.scheme import IOB1, IOB2, IOBES, BILOU, Entities
8
-
9
- from .utils import remove, replace_tag_type, replace_informative
10
-
11
-
12
- class TextDeid(object):
13
-
14
- def __init__(self, notation, span_constraint):
15
- self._span_constraint = span_constraint
16
- if self._span_constraint == 'strict':
17
- self._scheme = TextDeid.__get_scheme('IO')
18
- elif self._span_constraint == 'super_strict':
19
- self._scheme = TextDeid.__get_scheme('IO')
20
- else:
21
- self._scheme = TextDeid.__get_scheme(notation)
22
-
23
- def decode(self, tokens, predictions):
24
- if self._span_constraint == 'exact':
25
- return predictions
26
- elif self._span_constraint == 'strict':
27
- return TextDeid.__get_relaxed_predictions(predictions)
28
- elif self._span_constraint == 'super_strict':
29
- return TextDeid.__get_super_relaxed_predictions(tokens, predictions)
30
-
31
- def get_predicted_entities_positions(
32
- self,
33
- tokens: Sequence[Mapping[str, Union[str, int]]],
34
- predictions: List[str],
35
- suffix: bool
36
- ) -> List[List[Union[Tuple[Union[str, int], Union[str, int]], Any]]]:
37
- """
38
- Use the seqeval get_entities method, which goes through the predictions and returns
39
- where the span starts and ends. - [O, O, B-AGE, I-AGE, O, O] this will return
40
- spans starts at token 2 and ends at token 3 - with type AGE. We then extract the
41
- position of the token in the note (character position) - so we return that
42
- this span starts at 32 and ends at 37. The function then returns a nested list
43
- that contains a tuple of tag type and tag position (character positions).
44
- Example: [[(3, 9), LOC], [(34, 41), PATIENT], ...]]
45
- Args:
46
- tokens (Sequence[Mapping[str, Union[str, int]]]): The list of tokens in the note
47
- predictions (Sequence[str]): The list of predictions for the note
48
- suffix (str): Whether the B, I etc is in the prefix or the suffix
49
- Returns:
50
- positions_info (List[Tuple[Tuple[int, int], str]])): List containing tuples of tag positions and tag type
51
- """
52
- positions_info = list()
53
- entities = Entities(sequences=[predictions], scheme=self._scheme, suffix=suffix)
54
- for entity_list in entities.entities:
55
- for entity in entity_list:
56
- position = (tokens[entity.start]['start'], tokens[entity.end - 1]['end'])
57
- positions_info.append([position, entity.tag])
58
- return positions_info
59
-
60
- def run_deid(
61
- self,
62
- input_file,
63
- predictions_file,
64
- deid_strategy,
65
- keep_age: bool = False,
66
- metadata_key: str = 'meta',
67
- note_id_key: str = 'note_id',
68
- tokens_key: str = 'tokens',
69
- predictions_key: str = 'predictions',
70
- text_key: str = 'text'
71
- ):
72
- # Store note_id to note mapping
73
- note_map = dict()
74
- for line in open(input_file, 'r'):
75
- note = json.loads(line)
76
- note_id = note[metadata_key][note_id_key]
77
- note_map[note_id] = note
78
- # Go through note predictions and de identify the note accordingly
79
- for line in open(predictions_file, 'r'):
80
- note = json.loads(line)
81
- # Get the text using the note_id for this note from the note_map dict
82
- note_id = note[note_id_key]
83
- # Get the note from the note_map dict
84
- deid_note = note_map[note_id]
85
- # Get predictions
86
- predictions = self.decode(tokens=note[tokens_key], predictions=note[predictions_key])
87
- # Get entities and their positions
88
- entity_positions = self.get_predicted_entities_positions(
89
- tokens=note[tokens_key],
90
- predictions=predictions,
91
- suffix=False
92
- )
93
- yield TextDeid.__get_deid_text(
94
- deid_note=deid_note,
95
- entity_positions=entity_positions,
96
- deid_strategy=deid_strategy,
97
- keep_age=keep_age,
98
- text_key=text_key
99
- )
100
-
101
- @staticmethod
102
- def __get_deid_text(
103
- deid_note,
104
- entity_positions,
105
- deid_strategy,
106
- keep_age: bool = False,
107
- text_key: str = 'text'
108
- ):
109
- tag_mapping = TextDeid.__get_tag_mapping(deid_strategy=deid_strategy)
110
- age_pattern = '((?<!\d+)([1-7]\d?)(?!\d+))|((?<!\d+)(8[0-8]?)(?!\d+))'
111
- # Sort positions - store the last occurring tag first - i.e in descending order
112
- # of start positions.
113
- entity_positions.sort(key=lambda info: info[0][0], reverse=True)
114
- # Get text and de identify it
115
- note_text = deid_note[text_key]
116
- deid_text = deid_note[text_key]
117
- # Go through the entities and their positions and de identify the text
118
- # Since we have the positions in sorted order (descending by start positions)
119
- # we de identify the text from the end to the start - i.e back to front
120
- for positions, tag in entity_positions:
121
- start_pos, end_pos = positions
122
- deid_tag = tag_mapping[tag]
123
- age_unchanged = False
124
- if tag == 'AGE' and keep_age:
125
- span_text = note_text[start_pos:end_pos]
126
- if regex.search(age_pattern, span_text, flags=regex.IGNORECASE):
127
- deid_tag = span_text
128
- age_unchanged = True
129
- else:
130
- deid_tag = deid_tag
131
- if deid_strategy == 'replace_informative' and not age_unchanged:
132
- deid_text = deid_text[:start_pos] + deid_tag.format(note_text[start_pos:end_pos]) + deid_text[end_pos:]
133
- else:
134
- deid_text = deid_text[:start_pos] + deid_tag + deid_text[end_pos:]
135
- deid_note['deid_text'] = regex.sub('[\n]+', '\n', regex.sub('[ \t\r\f\v]+', ' ', deid_text)).strip()
136
- return deid_note
137
-
138
- @staticmethod
139
- def __get_tag_mapping(deid_strategy):
140
- if deid_strategy == 'remove':
141
- return remove()
142
- elif deid_strategy == 'replace_tag_type':
143
- return replace_tag_type()
144
- elif deid_strategy == 'replace_informative':
145
- return replace_informative()
146
-
147
- @staticmethod
148
- def __get_relaxed_predictions(predictions):
149
- return ['I-' + prediction[2:] if '-' in prediction else prediction for prediction in predictions]
150
-
151
- @staticmethod
152
- def __get_super_relaxed_predictions(tokens, predictions):
153
- # Super relaxed
154
- # 360 Longwood Ave, OBI, Boston
155
- # Tokens: ['360', 'Longwood', 'Ave', ',', 'OBI', ',', Boston[
156
- # Predictions: [B-LOC, I-LOC, L-LOC, O, U-LOC, O, U-LOC]
157
- # Relaxed: [I-LOC, I-LOC, I-LOC, O, I-LOC, O, I-LOC]
158
- # Super relaxed: [I-LOC, I-LOC, I-LOC, I-LOC, I-LOC, I-LOC, I-LOC]
159
- relaxed_predictions = TextDeid.__get_relaxed_predictions(predictions)
160
- prev_type = None
161
- replace_indexes = list()
162
- super_relaxed_predictions = list()
163
- for index, (token, prediction) in enumerate(zip(tokens, relaxed_predictions)):
164
- super_relaxed_predictions.append(prediction)
165
- # Check special characters that appear after a prediction
166
- # we can assign the prediction label to this sequence of special characters
167
- if prediction == 'O' and prev_type is not None:
168
- # [a-zA-Z0-9]
169
- if re.search('^(\W|_)+$', token['text'], flags=re.IGNORECASE | re.DOTALL):
170
- replace_indexes.append(index)
171
- else:
172
- prev_type = None
173
- replace_indexes = list()
174
- # Replace all the tokens identified above with the NER prediction type
175
- # This is done only ig the current prediction type matches the previous type
176
- elif prediction != 'O':
177
- if prediction[2:] == prev_type and replace_indexes != []:
178
- for replace_index in replace_indexes:
179
- super_relaxed_predictions[replace_index] = 'I-' + prev_type
180
- # Reset list and previous type
181
- replace_indexes = list()
182
- prev_type = prediction[2:]
183
- else:
184
- prev_type = None
185
- return super_relaxed_predictions
186
-
187
- @staticmethod
188
- def __get_scheme(notation: str) -> Union[Type[IOB2], Type[IOBES], Type[BILOU], Type[IOB1]]:
189
- """
190
- Get the seqeval scheme based on the notation
191
- Args:
192
- notation (str): The NER notation
193
- Returns:
194
- (Union[IOB2, IOBES, BILOU, IOB1]): The seqeval scheme
195
- """
196
- if notation == 'BIO':
197
- return IOB2
198
- elif notation == 'BIOES':
199
- return IOBES
200
- elif notation == 'BILOU':
201
- return BILOU
202
- elif notation == 'IO':
203
- return IOB1
204
- else:
205
- raise ValueError('Invalid Notation')
206
-
207
-
208
- def main():
209
- # The following code sets up the arguments to be passed via CLI or via a JSON file
210
- cli_parser = ArgumentParser(description='configuration arguments provided at run time from the CLI')
211
- cli_parser.add_argument(
212
- '--input_file',
213
- type=str,
214
- required=True,
215
- help='the the jsonl file that contains the notes'
216
- )
217
- cli_parser.add_argument(
218
- '--predictions_file',
219
- type=str,
220
- required=True,
221
- help='the location where the predictions are'
222
- )
223
- cli_parser.add_argument(
224
- '--span_constraint',
225
- type=str,
226
- required=True,
227
- choices=['exact', 'strict', 'super_strict'],
228
- help='whether we want to modify the predictions, make the process of removing phi more struct etc'
229
- )
230
- cli_parser.add_argument(
231
- '--notation',
232
- type=str,
233
-
234
- required=True,
235
- help='the NER notation in the predictions'
236
- )
237
- cli_parser.add_argument(
238
- '--deid_strategy',
239
- type=str,
240
- required=True,
241
- choices=['remove', 'replace_tag_type', 'replace_informative'],
242
- help='The strategy '
243
- )
244
- cli_parser.add_argument(
245
- '--keep_age',
246
- action='store_true',
247
- help='whether to keep ages below 89'
248
- )
249
- cli_parser.add_argument(
250
- '--text_key',
251
- type=str,
252
- default='text',
253
- help='the key where the note text is present in the json object'
254
- )
255
- cli_parser.add_argument(
256
- '--metadata_key',
257
- type=str,
258
- default='meta',
259
- help='the key where the note metadata is present in the json object'
260
- )
261
- cli_parser.add_argument(
262
- '--note_id_key',
263
- type=str,
264
- default='note_id',
265
- help='the key where the note id is present in the json object'
266
- )
267
- cli_parser.add_argument(
268
- '--tokens_key',
269
- type=str,
270
- default='tokens',
271
- help='the key where the tokens for the notes are present in the json object'
272
- )
273
- cli_parser.add_argument(
274
- '--predictions_key',
275
- type=str,
276
- default='predictions',
277
- help='the key where the note predictions is present in the json object'
278
- )
279
- cli_parser.add_argument(
280
- '--output_file',
281
- type=str,
282
- required=True,
283
- help='the location we would write the deid notes'
284
- )
285
- # Parse args
286
- args = cli_parser.parse_args()
287
- text_deid = TextDeid(notation=args.notation, span_constraint=args.span_constraint)
288
- deid_notes = text_deid.run_deid(
289
- input_file=args.input_file,
290
- predictions_file=args.predictions_file,
291
- deid_strategy=args.deid_strategy,
292
- keep_age=args.keep_age,
293
- metadata_key=args.metadata_key,
294
- note_id_key=args.note_id_key,
295
- tokens_key=args.tokens_key,
296
- predictions_key=args.predictions_key,
297
- text_key=args.text_key
298
- )
299
- # Write the dataset to the output file
300
- with open(args.output_file, 'w') as file:
301
- for deid_note in deid_notes:
302
- file.write(json.dumps(deid_note) + '\n')
303
-
304
-
305
- if __name__ == "__main__":
306
- # Get deid notes
307
- main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
deid/utils.py DELETED
@@ -1,43 +0,0 @@
1
- def remove():
2
- return {'PATIENT': '',
3
- 'STAFF': '',
4
- 'AGE': '',
5
- 'DATE': '',
6
- 'PHONE': '',
7
- 'MRN': '',
8
- 'ID': '',
9
- 'EMAIL': '',
10
- 'PATORG': '',
11
- 'LOC': '',
12
- 'HOSP': '',
13
- 'OTHERPHI': ''}
14
-
15
-
16
- def replace_tag_type():
17
- return {'PATIENT': 'PATIENT',
18
- 'STAFF': 'STAFF',
19
- 'AGE': 'AGE',
20
- 'DATE': 'DATE',
21
- 'PHONE': 'PHONE',
22
- 'MRN': 'MRN',
23
- 'ID': 'ID',
24
- 'EMAIL': 'EMAIL',
25
- 'PATORG': 'PATORG',
26
- 'LOC': 'LOCATION',
27
- 'HOSP': 'HOSPITAL',
28
- 'OTHERPHI': 'OTHERPHI'}
29
-
30
-
31
- def replace_informative():
32
- return {'PATIENT': '<<PATIENT:{}>>',
33
- 'STAFF': '<<STAFF:{}>>',
34
- 'AGE': '<<AGE:{}>>',
35
- 'DATE': '<<DATE:{}>>',
36
- 'PHONE': '<<PHONE:{}>>',
37
- 'MRN': '<<MRN:{}>>',
38
- 'ID': '<<ID:{}>>',
39
- 'EMAIL': '<<EMAIL:{}>>',
40
- 'PATORG': '<<PATORG:{}>>',
41
- 'LOC': '<<LOCATION:{}>>',
42
- 'HOSP': '<<HOSPITAL:{}>>',
43
- 'OTHERPHI': '<<OTHERPHI:{}>>'}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ner_datasets/__init__.py DELETED
@@ -1,4 +0,0 @@
1
- from .span_fixer import SpanFixer
2
- from .dataset_splitter import DatasetSplitter
3
- from .dataset_creator import DatasetCreator
4
- __all__ = ["SpanFixer", "DatasetSplitter", "DatasetCreator"]
 
 
 
 
 
ner_datasets/__pycache__/__init__.cpython-37.pyc DELETED
Binary file (487 Bytes)
 
ner_datasets/dataset_builder/__init__.py DELETED
@@ -1,3 +0,0 @@
1
- from .dataset import Dataset
2
- from .sentence_dataset import SentenceDataset
3
- __all__ = ["SentenceDataset", "Dataset"]
 
 
 
 
ner_datasets/dataset_builder/dataset.py DELETED
@@ -1,119 +0,0 @@
1
- import random
2
- import re
3
- from typing import Iterable, Dict, Sequence, Union, Mapping, Optional, List
4
-
5
- from .labels import NERTokenLabels, NERPredictTokenLabels, MismatchError
6
-
7
- random.seed(41)
8
-
9
-
10
- class Dataset(object):
11
- """
12
- Build a NER token classification dataset. Each token should have a corresponding label
13
- based on the annotated spans
14
- For training we will build the dataset using the annotated spans (e.g from prodigy)
15
- For predictions we will assign default labels. to keep the format of the dataset the same
16
- The dataset is on a sentence level, i.e each note is split into sentences and the
17
- task is run on a sentence level. Even the predictions are run on a sentence level
18
- The dataset would be something like:
19
- Tokens: [tok1, tok2, ... tok n]
20
- Labels: [lab1, lab2, ... lab n]
21
- For the prediction mode the labels would be: [default, default, default .... default]
22
- This script can also be used for predictions, the Labels will be filled with some
23
- default value. This is done so that we can use the same script for building a dataset to train a model
24
- and a dataset to obtain predictions using a model
25
- """
26
-
27
- def __init__(
28
- self,
29
- sentencizer,
30
- tokenizer
31
- ):
32
- """
33
- Build a NER token classification dataset
34
- For training we will build the dataset using the annotated spans (e.g from prodigy)
35
- For predictions we will assign default labels.
36
- The dataset is on a sentence level, i.e each note is split into sentences and the de-id
37
- task is run on a sentence level. Even the predictions are run on a sentence level
38
- The dataset would be something like:
39
- Tokens: [tok1, tok2, ... tok n]
40
- Labels: [lab1, lab2, ... lab n]
41
- This script can also be used for predictions, the Labels will be filled with some
42
- default value. This is done so that we can use the same script for building a dataset to train a model
43
- and a dataset to obtain predictions using a model
44
- Args:
45
- sentencizer (Union[SpacySentencizer, MimicStanzaSentencizer, NoteSentencizer]): The sentencizer to use for
46
- splitting notes into
47
- sentences
48
- tokenizer (Union[ClinicalSpacyTokenizer, SpacyTokenizer, CoreNLPTokenizer]): The tokenizer to use for
49
- splitting text into tokens
50
- """
51
- self._sentencizer = sentencizer
52
- self._tokenizer = tokenizer
53
-
54
- def get_tokens(
55
- self,
56
- text: str,
57
- spans: Optional[List[Mapping[str, Union[str, int]]]] = None,
58
- notation: str = 'BIO',
59
- token_text_key: str = 'text',
60
- label_key: str = 'label'
61
- ) -> Iterable[Sequence[Dict[str, Union[str, int]]]]:
62
- """
63
- Get a nested list of tokens where the the inner list represents the tokens in the
64
- sentence and the outer list will contain all the sentences in the note
65
- Args:
66
- text (str): The text present in the note
67
- spans (Optional[List[Mapping[str, Union[str, int]]]]): The NER spans in the note. This will be none if
68
- building the dataset for prediction
69
- notation (str): The notation we will be using for the label scheme (e.g BIO, BILOU etc)
70
- token_text_key (str): The key where the note text is present
71
- label_key (str): The key where the note label for each token is present
72
- Returns:
73
- Iterable[Sequence[Dict[str, Union[str, int]]]]: Iterable that iterates through all the sentences
74
- and yields the list of tokens in each sentence
75
- """
76
- # Initialize the object that will be used to align tokens and spans based on the notation
77
- # as mentioned earlier - this will be used only when mode is train - because we have
78
- # access to labelled spans for the notes
79
- if spans is None:
80
- label_spans = NERPredictTokenLabels('O')
81
- else:
82
- label_spans = NERTokenLabels(spans=spans, notation=notation)
83
- # Iterate through the sentences in the note
84
- for sentence in self._sentencizer.get_sentences(text=text):
85
- # This is used to determine the position of the tokens with respect to the entire note
86
- offset = sentence['start']
87
- # Keeps track of the tokens in the sentence
88
- tokens = list()
89
- for token in self._tokenizer.get_tokens(text=sentence['text']):
90
- # Get the token position (start, end) in the note
91
- token['start'] += offset
92
- token['end'] += offset
93
- if token[token_text_key].strip() in ['\n', '\t', ' ', ''] or token['start'] == token['end']:
94
- continue
95
- # Shorten consecutive sequences of special characters, this can prevent BERT from truncating
96
- # extremely long sentences - that could arise because of these characters
97
- elif re.search('(\W|_){9,}', token[token_text_key]):
98
- print('WARNING - Shortening a long sequence of special characters from {} to 8'.format(
99
- len(token[token_text_key])))
100
- token[token_text_key] = re.sub('(?P<specchar>(\W|_)){8,}', '\g<specchar>' * 8,
101
- token[token_text_key])
102
- elif len(token[token_text_key].split(' ')) != 1:
103
- print('WARNING - Token contains a space character - will be replaced with hyphen')
104
- token[token_text_key] = token[token_text_key].replace(' ', '-')
105
- # Get the labels for each token based on the notation (BIO)
106
- # In predict mode - the default label (e.g O) will be assigned
107
- try:
108
- # Get the label for the token - based on the notation
109
- label = label_spans.get_labels(token=token)
110
- if label[2:] == 'OTHERISSUE':
111
- raise ValueError('Fix OTHERISSUE spans')
112
- # Check if there is a token and span mismatch, i.e the token and span does not align
113
- except MismatchError:
114
- print(token)
115
- raise ValueError('Token-Span mismatch')
116
- token[label_key] = label
117
- tokens.append(token)
118
- if tokens:
119
- yield tokens
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ner_datasets/dataset_builder/labels/__init__.py DELETED
@@ -1,4 +0,0 @@
1
- from .mismatch_error import MismatchError
2
- from .ner_token_labels import NERTokenLabels
3
- from .ner_predict_token_labels import NERPredictTokenLabels
4
- __all__=["NERTokenLabels", "NERPredictTokenLabels", "MismatchError"]
 
 
 
 
 
ner_datasets/dataset_builder/labels/mismatch_error.py DELETED
@@ -1,7 +0,0 @@
1
- # Exception thrown when there is a mismatch between a token and span
2
- # The token and spans don't line up due to a tokenization issue
3
- # E.g - 79M - span is AGE - 79, but token is 79M
4
- # There is a mismatch and an error will be thrown - that is the token does
5
- # not line up with the span
6
- class MismatchError(Exception):
7
- pass
 
 
 
 
 
 
 
 
ner_datasets/dataset_builder/labels/ner_predict_token_labels.py DELETED
@@ -1,30 +0,0 @@
1
- from typing import Mapping, Union, NoReturn
2
-
3
-
4
- class NERPredictTokenLabels(object):
5
- """
6
- Assign a default label while creating the dataset for prediction.
7
- This is done since the sequence tagging code expects the input
8
- file to contain a labels field, hence we assign a default label
9
- to meet this requirement
10
- """
11
-
12
- def __init__(self, default_label: str) -> NoReturn:
13
- """
14
- Initialize the default label
15
- Args:
16
- default_label (str): Default label that will be used
17
- """
18
- # Keeps track of all the spans (list) in the text (note)
19
- self._default_label = default_label
20
-
21
- def get_labels(self, token: Mapping[str, Union[str, int]]) -> str:
22
- """
23
- Given a token, return the default label.
24
- Args:
25
- token (Mapping[str, Union[str, int]]): Contains the token text, start and end position of the token
26
- in the text
27
- Returns:
28
- default_label (str): default label
29
- """
30
- return self._default_label
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ner_datasets/dataset_builder/labels/ner_token_labels.py DELETED
@@ -1,156 +0,0 @@
1
- from typing import Mapping, Union, Sequence, List
2
- from .mismatch_error import MismatchError
3
-
4
-
5
- class NERTokenLabels(object):
6
- """
7
- This class is used to align tokens with the spans
8
- Each token is assigned one of the following labels
9
- 'B-LABEL', 'I-LABEL', 'O'. For example the text
10
- 360 Longwood Avenue is 2 tokens - [360, Longwood, Avenue]
11
- and each token would be assigned the following labels
12
- [B-LOC, I-LOC, I-LOC] (this would also depend on what
13
- notation we are using). Generally the data after prodigy
14
- annotation has all the tokens and all the spans.
15
- We would have tokens:[tok1, tok2, ... tokn]
16
- and spans:[span1:[tok1, tok2, tok3], span2:[tok7], ... span k]
17
- This would be used to convert into the format we are using
18
- which is assign the label to each token based on which span it
19
- belongs to.
20
- """
21
-
22
- def __init__(
23
- self,
24
- spans: List[Mapping[str, Union[str, int]]],
25
- notation: str
26
- ):
27
- """
28
- Initialize variables that will be used to align tokens
29
- and span labels. The spans variable will contain all the spans
30
- in the note. Notation is whether we would like to use BIO, IO, BILOU,
31
- when assigning the label to each token based on which span it belongs to.
32
- Keep track of the total number of spans etc.
33
- Args:
34
- spans (Sequence[Mapping[str, Union[str, int]]]): List of all the spans in the text
35
- notation (str): NER label notation
36
- """
37
- # Keeps track of all the spans (list) in the text (note)
38
- self._spans = spans
39
- for span in self._spans:
40
- if type(span['start']) != int or type(span['end']) != int:
41
- raise ValueError('The start and end keys of the span must be of type int')
42
- self._spans.sort(key=lambda _span: (_span['start'], _span['end']))
43
- # The current span is the first element of the list
44
- self._current_span = 0
45
- # Boolean variable that indicates whether the token is inside
46
- # the span (I-LABEL)
47
- self._inside = False
48
- # Total number of spans
49
- self._span_count = len(self._spans)
50
- # Depending on the notation passed, we will return the label for
51
- # the token accordingly
52
- if notation == 'BIO':
53
- self._prefix_single = 'B-'
54
- self._prefix_begin = 'B-'
55
- self._prefix_inside = 'I-'
56
- self._prefix_end = 'I-'
57
- self._prefix_outside = 'O'
58
- elif notation == 'BIOES':
59
- self._prefix_single = 'S-'
60
- self._prefix_begin = 'B-'
61
- self._prefix_inside = 'I-'
62
- self._prefix_end = 'E-'
63
- self._prefix_outside = 'O'
64
- elif notation == 'BILOU':
65
- self._prefix_single = 'U-'
66
- self._prefix_begin = 'B-'
67
- self._prefix_inside = 'I-'
68
- self._prefix_end = 'L-'
69
- self._prefix_outside = 'O'
70
- elif notation == 'IO':
71
- self._prefix_single = 'I-'
72
- self._prefix_begin = 'I-'
73
- self._prefix_inside = 'I-'
74
- self._prefix_end = 'I-'
75
- self._prefix_outside = 'O'
76
-
77
- def __check_begin(self, token: Mapping[str, Union[str, int]]) -> str:
78
- """
79
- Given a token, return the label (B-LABEL) and check whether the token
80
- covers the entire span or is a sub set of the span
81
- Args:
82
- token (Mapping[str, Union[str, int]]): Contains the token text, start and end position of the token
83
- in the text
84
- Returns:
85
- (str): The label - 'B-LABEL'
86
- """
87
- # Set the inside flag to true to indicate that the next token that is checked
88
- # will be checked to see if it belongs 'inside' the span
89
- self._inside = True
90
- if token['end'] > int(self._spans[self._current_span]['end']):
91
- raise MismatchError('Span and Token mismatch - Begin Token extends longer than the span')
92
- # If this token does not cover the entire span then we expect another token
93
- # to be in the span and that token should be assigned the I-LABEL
94
- elif token['end'] < int(self._spans[self._current_span]['end']):
95
- return self._prefix_begin + self._spans[self._current_span]['label']
96
- # If this token does cover the entire span then we set inside = False
97
- # to indicate this span is complete and increment the current span
98
- # to move onto the next span in the text
99
- elif token['end'] == int(self._spans[self._current_span]['end']):
100
- self._current_span += 1
101
- self._inside = False
102
- return self._prefix_single + self._spans[self._current_span - 1]['label']
103
-
104
- def __check_inside(self, token: Mapping[str, Union[str, int]]) -> str:
105
- """
106
- Given a token, return the label (I-LABEL) and check whether the token
107
- covers the entire span or is still inside the span.
108
- Args:
109
- token (Mapping[str, Union[str, int]]): Contains the token text, start and end position of the token
110
- in the text
111
- Returns:
112
- (str): The label - 'I-LABEL'
113
- """
114
-
115
- if (token['start'] >= int(self._spans[self._current_span]['end'])
116
- or token['end'] > int(self._spans[self._current_span]['end'])):
117
- raise MismatchError('Span and Token mismatch - Inside Token starts after the span ends')
118
- # If this token does not cover the entire span then we expect another token
119
- # to be in the span and that token should be assigned the I-LABEL
120
- elif token['end'] < int(self._spans[self._current_span]['end']):
121
- return self._prefix_inside + self._spans[self._current_span]['label']
122
- # If this token does cover the entire span then we set inside = False
123
- # to indicate this span is complete and increment the current span
124
- # to move onto the next span in the text
125
- elif token['end'] == int(self._spans[self._current_span]['end']):
126
- self._current_span += 1
127
- self._inside = False
128
- return self._prefix_end + self._spans[self._current_span - 1]['label']
129
-
130
- def get_labels(self, token: Mapping[str, Union[str, int]]) -> str:
131
- """
132
- Given a token, return the label (B-LABEL, I-LABEL, O) based on
133
- the spans present in the text & the desired notation.
134
- Args:
135
- token (Mapping[str, Union[str, int]]): Contains the token text, start and end position of the token
136
- in the text
137
- Returns:
138
- (str): One of the labels according to the notation - 'B-LABEL', 'I-LABEL', 'O'
139
- """
140
- # If we have iterated through all the spans in the text (note), all the tokens that
141
- # come after the last span will be marked as 'O' - since they don't belong to any span
142
- if self._current_span >= self._span_count:
143
- return self._prefix_outside
144
- # Check if the span can be assigned the B-LABEL
145
- if token['start'] == int(self._spans[self._current_span]['start']):
146
- return self.__check_begin(token)
147
- # Check if the span can be assigned the I-LABEL
148
- elif token['start'] > int(self._spans[self._current_span]['start']) and self._inside is True:
149
- return self.__check_inside(token)
150
- # Check if the token is outside a span
151
- elif self._inside is False and (token['end'] <= int(self._spans[self._current_span]['start'])):
152
- return self._prefix_outside
153
- else:
154
- raise MismatchError(
155
- 'Span and Token mismatch - the span and tokens don\'t line up. There might be a tokenization issue '
156
- 'that needs to be fixed')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ner_datasets/dataset_builder/sentence_dataset.py DELETED
@@ -1,355 +0,0 @@
1
- from collections import deque
2
- from typing import Deque, List, Sequence, Iterable, Optional, NoReturn, Dict, Mapping, Union, Tuple
3
-
4
-
5
- class SentenceDataset(object):
6
- """
7
- When we mention previous sentence and next sentence, we don't mean exactly one sentence
8
- but rather a previous chunk and a next chunk. This can include one or more sentences and
9
- it does not mean that the sentence has to be complete (it can be cutoff in between) - hence a chunk
10
- This class is used to build a dataset at the sentence
11
- level. It takes as input all the tokenized sentences in the note. So the input is
12
- a list of lists where the outer list represents the sentences in the note and the inner list
13
- is a list of tokens in the sentence. It then returns a dataset where each sentence is
14
- concatenated with the previous and a next chunk. This is done so that when we build a model
15
- we can use the previous and next chunks to add context to the sentence/model. The weights and loss etc
16
- will be computed and updated based on the current sentence. The previous and next chunks will
17
- only be used to add context. We could have different sizes of previous and next chunks
18
- depending on the position of the sentence etc. Essentially we build a sentence level dataset
19
- where we can also provide context to the sentence by including the previous and next chunks
20
- """
21
-
22
- def __init__(
23
- self,
24
- max_tokens: int,
25
- max_prev_sentence_token: int,
26
- max_next_sentence_token: int,
27
- default_chunk_size: int,
28
- ignore_label: str
29
- ) -> NoReturn:
30
- """
31
- Set the maximum token length a given training example (sentence level) can have.
32
- That is the total length of the current sentence + previous chunk + next chunk
33
- We also set the maximum length of the previous and next chunks. That is how many
34
- tokens can be in these chunks. However if the total length exceeds, tokens in the
35
- previous and next chunks will be dropped to ensure that the total length is <= max_tokens
36
- The default chunk size ensures that the length of the chunks will be a minimum number of
37
- tokens based on the value passed. For example, if default_chunk_size=10, the length
38
- of the previous chunks and next chunks will be at least 10 tokens.
39
- Args:
40
- max_tokens (int): maximum token length a given training example (sentence level) can have
41
- max_prev_sentence_token (int): The max chunk size for the previous chunks for a given sentence
42
- (training/prediction example) in the note can have
43
- max_next_sentence_token (int): The max chunk size for the next chunks for a given sentence
44
- (training/prediction example) in the note can have
45
- default_chunk_size (int): the training example will always include a chunk of this length
46
- as part of the previous and next chunks
47
- ignore_label (str): The label assigned to the previous and next chunks to distinguish
48
- from the current sentence
49
- """
50
- self._id_num = None
51
- self._max_tokens = max_tokens
52
- self._max_prev_sentence_token = max_prev_sentence_token
53
- self._max_next_sentence_token = max_next_sentence_token
54
- self._default_chunk_size = default_chunk_size
55
- self._ignore_label = ignore_label
56
-
57
- @staticmethod
58
- def chunker(
59
- seq: Sequence[Mapping[str, Union[str, int]]],
60
- size: int
61
- ) -> Iterable[Sequence[Mapping[str, Union[str, int]]]]:
62
- """
63
- Return chunks of the sequence. The size of each chunk will be based
64
- on the value passed to the size argument.
65
- Args:
66
- seq (Sequence): The sequence of tokens that will be split into chunks
67
- size (int): The max chunk size for the chunks
68
- Return:
69
- (Iterable[Sequence[Mapping[str, Union[str, int]]]]): Iterable that iterates through fixed size chunks of
70
- the input sequence
71
-
72
- """
73
- return (seq[pos:pos + size] for pos in range(0, len(seq), size))
74
-
75
- def get_previous_sentences(self, sent_tokens: Sequence[Sequence[Mapping[str, Union[str, int]]]]) -> List[Deque]:
76
- """
77
- Go through all the sentences in the medical note and create a list of
78
- previous sentences. The output of this function will be a list of chunks
79
- where each index of the list contains the sentences (chunks) - (tokens) present before
80
- the sentence at that index in the medical note. For example prev_sent[0] will
81
- be empty since there is no sentence before the first sentence in the note
82
- prev_sent[1] will be equal to sent[0], that is the previous sentence of the
83
- second sentence will be the first sentence. We make use of deque, where we
84
- start to deque elements when it start to exceed max_prev_sentence_token. This
85
- list of previous sentences will be used to define the previous chunks
86
- Args:
87
- sent_tokens (Sequence[str]): Sentences in the note and
88
- each element of the list contains a
89
- list of tokens in that sentence
90
- Returns:
91
- previous_sentences (List[deque]): A list of deque objects where each index contains a
92
- list (queue) of previous tokens (chunk) with respect
93
- to the sentence represented by that index in the note
94
- """
95
- previous_sentences = list()
96
- # Create a queue and specify the capacity of the queue
97
- # Tokens will be popped from the queue when the capacity is exceeded
98
- prev_sentence = deque(maxlen=self._max_prev_sentence_token)
99
- # The first previous chunk is empty since the first sentence in the note does not have
100
- # anything before it
101
- previous_sentences.append(prev_sentence.copy())
102
- # As we iterate through the list of sentences in the note, we add the tokens from the previous chunks
103
- # to the queue. Since we have a queue, as soon as the capacity is exceeded we pop tokens from
104
- # the queue
105
- for sent_token in sent_tokens[:-1]:
106
- for token in sent_token:
107
- prev_sentence.append(token)
108
- # As soon as each sentence in the list is processed
109
- # We add a copy of the current queue to a list - this list keeps track of the
110
- # previous chunks for a sentence
111
- previous_sentences.append(prev_sentence.copy())
112
-
113
- return previous_sentences
114
-
115
- def get_next_sentences(self, sent_tokens: Sequence[Sequence[Mapping[str, Union[str, int]]]]) -> List[Deque]:
116
- """
117
- Go through all the sentences in the medical note and create a list of
118
- next sentences. The output of this function will be a list of lists
119
- where each index of the list contains the list of sentences present after
120
- the sentence at that index in the medical note. For example next_sent[-1] will
121
- be empty since there is no sentence after the last sentence in the note
122
- next_sent[0] will be equal to sent[1:], that is the next sentence of the
123
- first sentence will be the subsequent sentences. We make use of deque, where we
124
- start to deque elements when it start to exceed max_next_sentence_token. This
125
- list of next sentences will be used to define the next chunks
126
- Args:
127
- sent_tokens (Sequence[str]): Sentences in the note and each
128
- element of the list contains a
129
- list of tokens in that sentence
130
- Returns:
131
- next_sentences (List[deque]): A list of deque objects where each index contains a list (queue)
132
- of next tokens (chunk) with respect to the sentence represented
133
- by that index in the note
134
- """
135
- # A list of next sentences is first created and reversed
136
- next_sentences = list()
137
- # Create a queue and specify the capacity of the queue
138
- # Tokens will be popped from the queue when the capacity is exceeded
139
- next_sentence = deque(maxlen=self._max_next_sentence_token)
140
- # The first (which becomes the last chunk when we reverse this list) next chunk is empty since
141
- # the last sentence in the note does not have
142
- # anything after it
143
- next_sentences.append(next_sentence.copy())
144
- for sent_token in reversed(sent_tokens[1:]):
145
- for token in reversed(sent_token):
146
- next_sentence.appendleft(token)
147
- next_sentences.append(next_sentence.copy())
148
- # The list is reversed - since we went through the sentences in the reverse order in
149
- # the earlier steps
150
- return [next_sent for next_sent in reversed(next_sentences)]
151
-
152
- def get_sentences(
153
- self,
154
- sent_tokens: Sequence[Sequence[Mapping[str, Union[str, int]]]],
155
- token_text_key: str = 'text',
156
- label_key: str = 'label',
157
- start_chunk: Optional[Sequence[Mapping[str, Union[str, int]]]] = None,
158
- end_chunk: Optional[Sequence[Mapping[str, Union[str, int]]]] = None,
159
- sub: bool = False
160
- ) -> Iterable[Tuple[int, Dict[str, Union[List[Dict[str, Union[str, int]]], List[str]]]]]:
161
- """
162
- When we mention previous sentence and next sentence, we don't mean exactly one sentence
163
- but rather a previous chunk and a next chunk. This can include one or more sentences and
164
- it does not mean that the sentence has to be complete (it can be cutoff in between) - hence a chunk
165
- We iterate through all the tokenized sentences in the note. So the input is
166
- a list of lists where the outer list represents the sentences in the note and the inner list
167
- is a list of tokens in the sentence. It then returns a dataset where each sentence is
168
- concatenated with the previous and the next sentence. This is done so that when we build a model
169
- we can use the previous and next sentence to add context to the model. The weights and loss etc
170
- will be computed and updated based on the current sentence. The previous and next sentence will
171
- only be used to add context. We could have different sizes of previous and next chunks
172
- depending on the position of the sentence etc. Since we split a note in several sentences which are
173
- then used as training data.
174
- ignore_label is used to differentiate between the current sentence and the previous and next
175
- chunks. The chunks will have the label NA and the current sentence
176
- will have the label (DATE, AGE etc) so that they can be distinguished.
177
- If however we are building a dataset for predictions
178
- the current sentence will have the default label O, but the next and previous chunks will still
179
- have the label NA. However if the total length exceeds, tokens in the
180
- previous and next chunks will be dropped to ensure that the total length is <= max_tokens
181
- The default chunk size ensures that the length of the chunks will be a minimum number of
182
- tokens based on the value passed. For example, if default_chunk_size=10, the length
183
- of the previous chunks and next chunks will be at least 10 tokens. If the total length > max tokens
184
- even after decreasing the sizes of the previous and next chunks, then we split this long
185
- sentence into sub sentences and repeat the process described above.
186
- Args:
187
- sent_tokens (Sequence[Sequence[Mapping[str, Union[str, int]]]]): Sentences in the note and each sentence
188
- contains the tokens (dict) in that sentence
189
- the token dict object contains the
190
- token text, start, end etc
191
- token_text_key (str): Each sentence contains a list of tokens where each token is a dict. We use the text
192
- key to extract the text of the token from the dictionary
193
- label_key (str): Each sentence contains a list of tokens where each token is a dict. We use the label_key
194
- key to extract the label of the token from the dictionary. (if it does not have a label
195
- the default label will be assigned)
196
- start_chunk (Optional[Sequence[Mapping[str, Union[str, int]]]]): Prefix the first sentence with some
197
- pre-defined chunk
198
- end_chunk (Optional[Sequence[Mapping[str, Union[str, int]]]]): Suffix the last sentence with some
199
- pre-defined chunk
200
- sub (bool): Whether the function is called to process sub-sentences (used when we are splitting
201
- long sentences into smaller sub sentences to keep sentence length < max_tokens)
202
- Returns:
203
- (Iterable[Tuple[int, Dict[str, Union[List[Dict[str, Union[str, int]]], List[str]]]]]): Iterate through the
204
- returned sentences,
205
- where each sentence
206
- has the previous
207
- chunks and next
208
- chunks attached
209
- to it.
210
- """
211
- # Id num keeps track of the id of the sentence - that is the position the sentence occurs in
212
- # the note. We keep the id of sub sentences the same as the sentence, so that the user
213
- # knows that these sub sentences are chunked from a longer sentence.
214
- # <SENT 0> <SENT 1>. Say length of sent 0 with the previous and next chunks is less than max_tokens
215
- # we return sent 0 with id 0. For sent 1, say the length is longer, we split it into sub
216
- # sentences - <SUB 1><SUB 2> - we return SUB 1, and SUB 2 with id 1 - so we know that it belongs
217
- # to <SENT 1> in the note.
218
- if not sub:
219
- self._id_num = -1
220
- # Initialize the object that will take all the sentences in the note and return
221
- # a dataset where each row represents a sentence in the note. The sentence in each
222
- # row will also contain a previous chunk and next chunk (tokens) that will act as context
223
- # when training the model
224
- # [ps1, ps 2, ps 3...ps-i], [cs1, cs2, ... cs-j], [ns, ns, ... ns-k] - as you can see the current sentence
225
- # which is the sentence we train on (or predict on) will be in the middle - the surrounding tokens will
226
- # provide context to the current sentence
227
- # Get the previous sentences (chunks) for each sentence in the note
228
- previous_sentences = self.get_previous_sentences(sent_tokens)
229
- # Get the next sentences (chunks) for each sentence in the note
230
- next_sentences = self.get_next_sentences(sent_tokens)
231
- # For the note we are going to iterate through all the sentences in the note and
232
- # concatenate each sentence with the previous and next chunks. (This forms the data that
233
- # will be used for training/predictions) Each sentence with the concatenated chunks will be
234
- # a training sample. We would do the same thing for getting predictions on a sentence as well
235
- # The only difference would be the labels that are used. We would use the default label O for
236
- # prediction and the annotated labels for training
237
- if len(sent_tokens) != len(previous_sentences) or len(sent_tokens) != len(next_sentences):
238
- raise ValueError('Sentence length mismatch')
239
- for index, (previous_sent, current_sent, next_sent) in enumerate(
240
- zip(previous_sentences, sent_tokens, next_sentences)):
241
- sent_tokens_text = list()
242
- sent_labels = list()
243
- sent_toks = list()
244
- # Get the tokens and labels for the current sentence
245
- for token in current_sent:
246
- # We store this, if we need to process sub sentences when a sentence exceeds max_tokens
247
- sent_toks.append(token)
248
- sent_tokens_text.append(token[token_text_key])
249
- sent_labels.append(token[label_key])
250
- # We check if the number of tokens in the current sentence + previous chunk
251
- # + next chunk exceeds max tokens. If it does we start popping tokens from the previous and next chunks
252
- # until the number of tokens is equal to max tokens
253
- previous_sent_length = len(previous_sent)
254
- current_sent_length = len(sent_tokens_text)
255
- next_sent_length = len(next_sent)
256
- total_length = previous_sent_length + current_sent_length + next_sent_length
257
- # If the length of the current sentence plus the length of the previous and next
258
- # chunks exceeds the max_tokens, start popping tokens from the previous and next
259
- # chunks until either total length < max_tokens or the number of tokens in the previous and
260
- # next chunks goes below the default chunk size
261
- while total_length > self._max_tokens and \
262
- (next_sent_length > self._default_chunk_size or previous_sent_length > self._default_chunk_size):
263
- if next_sent_length >= previous_sent_length:
264
- next_sent.pop()
265
- next_sent_length -= 1
266
- total_length -= 1
267
- elif previous_sent_length > next_sent_length:
268
- previous_sent.popleft()
269
- previous_sent_length -= 1
270
- total_length -= 1
271
- # If this is not a sub sentence, increment the ID to
272
- # indicate the processing of the next sentence of the note
273
- # If it is a sub sentence, keep the ID the same, to indicate
274
- # it belongs to a larger sentence
275
- if not sub:
276
- self._id_num += 1
277
- # If total length < max_tokens - process the sentence with the current sentence
278
- # and add on the previous and next chunks and return
279
- if total_length <= self._max_tokens:
280
- # Check if we want to add a pre-defined chunk for the first sentence in the note
281
- if index == 0 and start_chunk is not None:
282
- previous_sent_tokens = [chunk[token_text_key] for chunk in start_chunk] + \
283
- [prev_token[token_text_key] for prev_token in list(previous_sent)]
284
- else:
285
- previous_sent_tokens = [prev_token[token_text_key] for prev_token in list(previous_sent)]
286
- # Check if we want to add a pre-defined chunk for the last sentence in the note
287
- if index == len(sent_tokens) - 1 and end_chunk is not None:
288
- next_sent_tokens = [next_token[token_text_key] for next_token in list(next_sent)] + \
289
- [chunk[token_text_key] for chunk in end_chunk]
290
- else:
291
- next_sent_tokens = [next_token[token_text_key] for next_token in list(next_sent)]
292
- previous_sent_length = len(previous_sent_tokens)
293
- next_sent_length = len(next_sent_tokens)
294
- # Store information about the current sentence - start and end pos etc
295
- # this can be used to distinguish from the next and previous chunks
296
- # current_sent_info = {'token_info':current_sent}
297
- # Assign a different label (the ignore label) to the chunks - since they are used only for context
298
- previous_sent_labels = list()
299
- next_sent_labels = list()
300
- if self._ignore_label == 'NA':
301
- previous_sent_labels = [self._ignore_label] * previous_sent_length
302
- next_sent_labels = [self._ignore_label] * next_sent_length
303
- elif self._ignore_label == 'label':
304
- if index == 0 and start_chunk is not None:
305
- previous_sent_labels = [chunk[label_key] for chunk in start_chunk] + \
306
- [prev_token[label_key] for prev_token in list(previous_sent)]
307
- else:
308
- previous_sent_labels = [prev_token[label_key] for prev_token in list(previous_sent)]
309
- if index == len(sent_tokens) - 1 and end_chunk is not None:
310
- next_sent_labels = [next_token[label_key] for next_token in list(next_sent)] + \
311
- [chunk[label_key] for chunk in end_chunk]
312
- else:
313
- next_sent_labels = [next_token[label_key] for next_token in list(next_sent)]
314
- # Concatenate the chunks and the sentence
315
- # sent_tokens_text.append(token[token_text_key])
316
- tokens_data = previous_sent_tokens + sent_tokens_text + next_sent_tokens
317
- labels_data = previous_sent_labels + sent_labels + next_sent_labels
318
- # Return processed sentences
319
- yield self._id_num, {'tokens': tokens_data, 'labels': labels_data, 'current_sent_info': current_sent}
320
- # Process the sub sentences - we take a long sentence
321
- # and split it into smaller chunks - and we recursively call the function on this list
322
- # of smaller chunks - as mentioned before the smaller chunks (sub sentences) will have the
323
- # same ID as the original sentence
324
- else:
325
- # Store the smaller chunks - say <SENT1> is too long
326
- # <PREV CHUNK><SENT1><NEXT CHUNK>
327
- # We get chunk sent 1 - to <SUB1><SUB2><SUB3> and we pass this [<SUB1><SUB2><SUB3>] to the function
328
- # as a recursive call. This list is now processed as a smaller note that essentially belongs
329
- # to a sentence. But as you can see we did not pass <PREV CHUNK> & <NEXT CHUNK>, because
330
- # these are chunks that are not part of the current sentence, but they still need to be
331
- # included in the final output - and the work around is mentioned below
332
- # So that we have a previous chunk for <SUB1> and next chunk for <SUB3>
333
- # we include the previous_sent_tokens and next_sent_tokens as the start chunk
334
- # and the next chunk in the function call below
335
- # <PREV CHUNK><SUB1><NEXT SUB1>, id = x
336
- # <PREV SUB2><SUB2><NEXT SUB2>, id = x
337
- # <PREV SUB3><SUB3><NEXT CHUNK>, id = x
338
- sub_sentences = list()
339
- # Prefix the first sentence in these smaller chunks
340
- previous_sent_tokens = list(previous_sent)
341
- # Suffix the last sentence in these smaller chunks
342
- next_sent_tokens = list(next_sent)
343
- # Get chunks
344
- for chunk in SentenceDataset.chunker(sent_toks, self._max_tokens - (2 * self._default_chunk_size)):
345
- sub_sentences.append(chunk)
346
- # Process list of smaller chunks
347
- for sub_sent in self.get_sentences(
348
- sub_sentences,
349
- token_text_key,
350
- label_key,
351
- start_chunk=previous_sent_tokens,
352
- end_chunk=next_sent_tokens,
353
- sub=True
354
- ):
355
- yield sub_sent
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ner_datasets/dataset_creator.py DELETED
@@ -1,322 +0,0 @@
1
- import json
2
- import random
3
- from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
4
- from typing import Iterable, Dict, List, Union, Optional, Sequence, NoReturn
5
-
6
- from .dataset_builder import Dataset, SentenceDataset
7
- from .preprocessing import PreprocessingLoader
8
-
9
- random.seed(41)
10
-
11
-
12
- class DatasetCreator(object):
13
- """
14
- Build a NER token classification dataset
15
- For training we will build the dataset using the annotated spans (e.g from prodigy)
16
- For predictions we will assign default labels.
17
- The dataset is on a sentence level, i.e each note is split into sentences and the de-id
18
- task is run on a sentence level. Even the predictions are run on a sentence level
19
- The dataset would be something like:
20
- Tokens: [[tok1, tok2, ... tok-n], [tok ...], ..., [tok ...]]
21
- Labels: [[lab1, lab2, ... lab-n], [lab ...], ..., [lab ...]]
22
- Where the inner list represents the sentences - the tokens in the sentence and the respective
23
- labels for each token. The labels depend on the notation
24
- This script can also be used for predictions, the Labels will be filled with some
25
- default value. This is done so that we can use the same script for building a dataset to train a model
26
- and a dataset to obtain predictions using a model
27
- Example:
28
- Note: Bruce Wayne Jr is a 60yo man. He lives in Gotham
29
- Sentences: [Bruce Wayne Jr is a 60yo man., He lives in Gotham]
30
- Tokens: [[Bruce, Wayne, Jr, is, a, 60, yo, man, .], [He, lives, in, Gotham]]
31
- Labels (BIO notation): [[B-Name, I-Name, I-Name, O, O, O, O, O, O], [O, O, O, B-LOC]]
32
- Labels (BILOU notation): [[B-Name, I-Name, L-Name, O, O, O, O, O, O], [O, O, O, U-LOC]]
33
- We also can create sentences that uses previous/next chunks as context - in this case the dataset would
34
- look something like this. (Assume we limit the size of the chunks to 3 tokens)
35
- Sentences: [Bruce Wayne Jr is a 60yo man., He lives in Gotham]
36
- Tokens: [[Bruce, Wayne, Jr, is, a, 60, yo, man, ., He, lives, in], [yo, man, ., He, lives, in, Gotham]]
37
- Labels (BIO notation): [[B-Name, I-Name, I-Name, O, O, O, O, O, O, NA, NA, NA], [NA, NA, NA, O, O, O, B-LOC]]
38
- Labels (BILOU notation): [[B-Name, I-Name, L-Name, O, O, O, O, O, O, NA, NA, NA], [NA, NA, NA, O, O, O, U-LOC]]
39
- NA represents the token is used for context
40
- """
41
-
42
- def __init__(
43
- self,
44
- sentencizer: str,
45
- tokenizer: str,
46
- abbreviations: Optional[Sequence[str]] = None,
47
- max_tokens: int = 128,
48
- max_prev_sentence_token: int = 32,
49
- max_next_sentence_token: int = 32,
50
- default_chunk_size: int = 32,
51
- ignore_label: str = 'NA'
52
- ) -> NoReturn:
53
- """
54
- Initialize the sentencizer and tokenizer
55
- Args:
56
- sentencizer (str): Specify which sentencizer you want to use
57
- tokenizer (str): Specify which tokenizer you want to use
58
- abbreviations (Optional[Sequence[str]]): A list of abbreviations for which tokens will not be split
59
- - works only with the custom clinical tokenizer.
60
- max_tokens (int): The maximum number of tokens allowed in a sentence/training example,
61
- truncate if it exceeds.
62
- max_prev_sentence_token (int): The maximum number of previous chunk tokens allowed in a
63
- sentence/training example
64
- max_next_sentence_token (int): The maximum number of next chunk tokens allowed in a
65
- sentence/training example.
66
- ignore_label (str): The label assigned to the previous and next chunks to distinguish
67
- from the current sentence
68
- """
69
- self._sentencizer = PreprocessingLoader.get_sentencizer(sentencizer=sentencizer)
70
- self._tokenizer = PreprocessingLoader.get_tokenizer(tokenizer=tokenizer, abbreviations=abbreviations)
71
- # Initialize the object that will be used to get the tokens and the sentences
72
- self._dataset = Dataset(sentencizer=self._sentencizer, tokenizer=self._tokenizer)
73
- # Initialize the object that will take all the sentences in the note and return
74
- # a dataset where each row represents a sentence in the note. The sentence in each
75
- # row will also contain a previous chunk and next chunk (tokens) that will act as context
76
- # when training the model
77
- # [ps1, ps 2, ps 3...ps-i], [cs1, cs2, ... cs-j], [ns, ns, ... ns-k] - as you can see the current sentence
78
- # which is the sentence we train on (or predict on) will be in the middle - the surrounding tokens will
79
- # provide context to the current sentence
80
- self._sentence_dataset = SentenceDataset(
81
- max_tokens=max_tokens,
82
- max_prev_sentence_token=max_prev_sentence_token,
83
- max_next_sentence_token=max_next_sentence_token,
84
- default_chunk_size=default_chunk_size,
85
- ignore_label=ignore_label
86
- )
87
-
88
- def create(
89
- self,
90
- input_file: str,
91
- mode: str = 'predict',
92
- notation: str = 'BIO',
93
- token_text_key: str = 'text',
94
- metadata_key: str = 'meta',
95
- note_id_key: str = 'note_id',
96
- label_key: str = 'labels',
97
- span_text_key: str = 'spans'
98
- ) -> Iterable[Dict[str, Union[List[Dict[str, Union[str, int]]], List[str]]]]:
99
- """
100
- This function is used to get the sentences that will be part of the NER dataset.
101
- We check whether the note belongs to the desired dataset split. If it does,
102
- we fix any spans that can cause token-span alignment errors. Then we extract
103
- all the sentences in the notes, the tokens in each sentence. Finally we
104
- add some context tokens to the sentence if required. This function returns
105
- an iterable that iterates through each of the processed sentences
106
- Args:
107
- input_file (str): Input jsonl file. Make sure the spans are in ascending order (based on start position)
108
- mode (str): Dataset being built for train or predict.
109
- notation (str): The NER labelling notation
110
- token_text_key (str): The key where the note text and token text is present in the json object
111
- metadata_key (str): The key where the note metadata is present in the json object
112
- note_id_key (str): The key where the note id is present in the json object
113
- label_key (str): The key where the token label will be stored in the json object
114
- span_text_key (str): The key where the note spans is present in the json object
115
- Returns:
116
- (Iterable[Dict[str, Union[List[Dict[str, Union[str, int]]], List[str]]]]): Iterate through the processed
117
- sentences/training examples
118
- """
119
- # Go through the notes
120
- for line in open(input_file, 'r'):
121
- note = json.loads(line)
122
- note_text = note[token_text_key]
123
- note_id = note[metadata_key][note_id_key]
124
- if mode == 'train':
125
- note_spans = note[span_text_key]
126
- # No spans in predict mode
127
- elif mode == 'predict':
128
- note_spans = None
129
- else:
130
- raise ValueError("Invalid mode - can only be train/predict")
131
- # Store the list of tokens in the sentence
132
- # Eventually this list will contain all the tokens in the note (split on the sentence level)
133
- # Store the start and end positions of the sentence in the note. This can
134
- # be used later to reconstruct the note from the sentences
135
- # we also store the note_id for each sentence so that we can map it back
136
- # to the note and therefore have all the sentences mapped back to the notes they belong to.
137
- sent_tokens = [sent_tok for sent_tok in self._dataset.get_tokens(
138
- text=note_text,
139
- spans=note_spans,
140
- notation=notation
141
- )]
142
- # The following loop goes through each sentence in the note and returns
143
- # the current sentence and previous and next chunks that will be used for context
144
- # The chunks will have a default label (e.g NA) to distinguish from the current sentence
145
- # and so that we can ignore these chunks when calculating loss and updating weights
146
- # during training
147
- for ner_sent_index, ner_sentence in self._sentence_dataset.get_sentences(
148
- sent_tokens=sent_tokens,
149
- token_text_key=token_text_key,
150
- label_key=label_key
151
- ):
152
- # Return the processed sentence. This sentence will then be used
153
- # by the model
154
- current_sent_info = ner_sentence['current_sent_info']
155
- note_sent_info_store = {'start': current_sent_info[0]['start'],
156
- 'end': current_sent_info[-1]['end'], 'note_id': note_id}
157
- ner_sentence['note_sent_info'] = note_sent_info_store
158
- yield ner_sentence
159
-
160
-
161
- def main():
162
- cli_parser = ArgumentParser(
163
- description='configuration arguments provided at run time from the CLI',
164
- formatter_class=ArgumentDefaultsHelpFormatter
165
- )
166
- cli_parser.add_argument(
167
- '--input_file',
168
- type=str,
169
- required=True,
170
- help='the the jsonl file that contains the notes. spans need to be sorted in ascending order (based on start '
171
- 'position) '
172
- )
173
- cli_parser.add_argument(
174
- '--notation',
175
- type=str,
176
- default='BIO',
177
- help='the notation we will be using for the label scheme'
178
- )
179
- cli_parser.add_argument(
180
- '--max_tokens',
181
- type=int,
182
- default=128,
183
- help='The max tokens that a given sentence (training/prediction example) in the note can have'
184
- )
185
- cli_parser.add_argument(
186
- '--default_chunk_size',
187
- type=int,
188
- default=32,
189
- help='the default chunk size for the previous and next chunks for a given sentence (training/prediction '
190
- 'example) in the note can have '
191
- )
192
- cli_parser.add_argument(
193
- '--max_prev_sentence_token',
194
- type=int,
195
- default=32,
196
- help='the max chunk size for the previous chunks for a given sentence (training/prediction example) in the '
197
- 'note can have '
198
- )
199
- cli_parser.add_argument(
200
- '--max_next_sentence_token',
201
- type=int,
202
- default=32,
203
- help='the max chunk size for the next chunks for a given sentence (training/prediction example) in the note '
204
- 'can have '
205
- )
206
- cli_parser.add_argument(
207
- '--mode',
208
- type=str,
209
- choices=['train', 'predict'],
210
- required=True,
211
- help='whether we are building the dataset for training or prediction'
212
- )
213
- cli_parser.add_argument(
214
- '--sentencizer',
215
- type=str,
216
- required=True,
217
- help='the sentencizer to use for splitting notes into sentences'
218
- )
219
- cli_parser.add_argument(
220
- '--tokenizer',
221
- type=str,
222
- required=True,
223
- help='the tokenizer to use for splitting text into tokens'
224
- )
225
- cli_parser.add_argument(
226
- '--abbreviations',
227
- type=str,
228
- default=None,
229
- help='file that will be used by clinical tokenizer to handle abbreviations'
230
- )
231
- cli_parser.add_argument(
232
- '--ignore_label',
233
- type=str,
234
- default='NA',
235
- help='whether to use the ignore label or not'
236
- )
237
- cli_parser.add_argument(
238
- '--token_text_key',
239
- type=str,
240
- default='text',
241
- help='the key where the note text is present in the json object'
242
- )
243
- cli_parser.add_argument(
244
- '--metadata_key',
245
- type=str,
246
- default='meta',
247
- help='the key where the note metadata is present in the json object'
248
- )
249
- cli_parser.add_argument(
250
- '--note_id_key',
251
- type=str,
252
- default='note_id',
253
- help='the key where the note metadata is present in the json object'
254
- )
255
- cli_parser.add_argument(
256
- '--label_key',
257
- type=str,
258
- default='label',
259
- help='the key where the note label for each token is present in the json object'
260
- )
261
- cli_parser.add_argument(
262
- '--span_text_key',
263
- type=str,
264
- default='spans',
265
- help='the key where the note annotates spans are present in the json object'
266
- )
267
- cli_parser.add_argument(
268
- '--format',
269
- type=str,
270
- default='jsonl',
271
- help='format to store the dataset in: jsonl or conll'
272
- )
273
- cli_parser.add_argument(
274
- '--output_file',
275
- type=str,
276
- help='The file where the NER dataset will be stored'
277
- )
278
- args = cli_parser.parse_args()
279
- dataset_creator = DatasetCreator(
280
- sentencizer=args.sentencizer,
281
- tokenizer=args.tokenizer,
282
- abbreviations=args.abbreviations,
283
- max_tokens=args.max_tokens,
284
- max_prev_sentence_token=args.max_prev_sentence_token,
285
- max_next_sentence_token=args.max_next_sentence_token,
286
- default_chunk_size=args.default_chunk_size,
287
- ignore_label=args.ignore_label)
288
- ner_notes = dataset_creator.create(
289
- input_file=args.input_file,
290
- mode=args.mode,
291
- notation=args.notation,
292
- token_text_key=args.token_text_key,
293
- metadata_key=args.metadata_key,
294
- note_id_key=args.note_id_key,
295
- label_key=args.label_key,
296
- span_text_key=args.span_text_key
297
- )
298
- # Store the NER dataset in the desired format
299
- if args.format == 'jsonl':
300
- # Write the dataset to the output file
301
- with open(args.output_file, 'w') as file:
302
- for ner_sentence in ner_notes:
303
- file.write(json.dumps(ner_sentence) + '\n')
304
- elif args.format == 'conll':
305
- with open(args.output_file, 'w') as file:
306
- for ner_sentence in ner_notes:
307
- tokens = ner_sentence['tokens']
308
- labels = ner_sentence['labels']
309
- current_sent_info = ner_sentence['current_sent_info']
310
- note_id = ner_sentence['note_sent_info']['note_id']
311
- if len(tokens) != len(labels) or len(labels) != len(current_sent_info):
312
- raise ValueError('Length mismatch')
313
- for token, label, sent_info in zip(tokens, labels, current_sent_info):
314
- sent_info['note_id'] = note_id
315
- data = token + ' ' + label + ' ' + json.dumps(sent_info) + '\n'
316
- file.write(data)
317
- file.write('\n')
318
-
319
-
320
- if __name__ == '__main__':
321
-
322
- main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ner_datasets/dataset_splitter.py DELETED
@@ -1,294 +0,0 @@
1
- import json
2
- import random
3
- from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
4
- from collections import Counter
5
- from typing import NoReturn, List
6
-
7
- from .distribution import NERDistribution, DatasetSplits, PrintDistribution
8
-
9
- random.seed(41)
10
-
11
-
12
- class DatasetSplitter(object):
13
- """
14
- Prepare dataset splits - training, validation & testing splits
15
- Compute ner distributions in our dataset. Compute ner distributions
16
- based on which we create and store a dictionary which will contain
17
- information about which notes (in a dataset) belong to which split.
18
- Based on this distribution and whether we want to keep certain notes
19
- grouped (e.g by patient) we assign notes to a split, such that the
20
- final ner type distribution in each split is similar.
21
- """
22
-
23
- def __init__(
24
- self,
25
- train_proportion: int = 70,
26
- validation_proportion: int = 15,
27
- test_proportion: int = 15
28
- ) -> NoReturn:
29
- """
30
- Initialize the proportions of the splits.
31
- Args:
32
- train_proportion (int): Ratio of train dataset
33
- validation_proportion (int): Ratio of validation dataset
34
- test_proportion (int): Ratio of test dataset
35
- """
36
- self._train_proportion = train_proportion
37
- self._validation_proportion = validation_proportion
38
- self._test_proportion = test_proportion
39
- self._split = None
40
- self._lookup_split = dict()
41
-
42
- def get_split(self, split: str) -> List[str]:
43
- return [key for key in self._lookup_split[split].keys()]
44
-
45
- def set_split(self, split: str) -> NoReturn:
46
- """
47
- Set the split that you are currently checking/processing.
48
- Based on the split you can perform certain checks and
49
- computation. Once the split is set, read the information
50
- present in the split_info_path. Extract only the information
51
- belonging to the split. Create a hash map where we have
52
- the keys as the note_ids/patient ids that belong to the split. This hashmap
53
- can then be used to check if a particular note belongs to this
54
- split.
55
- Args:
56
- split (str): The split - train, test etc (depends on how you named it)
57
- """
58
- if split not in ['train', 'validation', 'test']:
59
- raise ValueError('Invalid split')
60
- self._split = split
61
-
62
- def __update_split(self, key: str) -> NoReturn:
63
- """
64
- Update the hash map where we have
65
- the keys (e.g note_id) that belong to the split. This hashmap
66
- can then be used to check if a particular note belongs to this
67
- split.
68
- Args:
69
- key (str): The key that identifies the note belonging to the split
70
- """
71
- self._lookup_split[self._split][key] = 1
72
-
73
- def check_note(self, key: str) -> bool:
74
- """
75
- Use the hash map populated by assign_splits (via __update_split)
76
- to check if the note (note_info) belongs to this split (train,
77
- val, test etc). If it does, return true, else false
78
- Args:
79
- key (str): The key that identifies the note belonging to the split
80
- Returns:
81
- (bool): True if the note belongs to the split, false otherwise
82
- """
83
- if self._split is None:
84
- raise ValueError('Split not set')
85
- if self._lookup_split[self._split].get(key, False):
86
- return True
87
- else:
88
- return False
89
-
90
- def assign_splits(
91
- self,
92
- input_file: str,
93
- spans_key: str = 'spans',
94
- metadata_key: str = 'meta',
95
- group_key: str = 'note_id',
96
- margin: float = 0.3
97
- ) -> NoReturn:
98
- """
99
- Get the dataset splits - training, validation & testing splits
100
- Based on the NER distribution and whether we want to keep certain
101
- notes grouped (e.g by patient). Nothing is returned; the split assigned
102
- to each group key is stored on this object. This can be used
103
- to filter notes based on the splits (see check_note and get_split).
104
- Args:
105
- input_file (str): The input file
106
- spans_key (str): The key where the note spans are present
107
- metadata_key (str): The key where the note metadata is present
108
- group_key (str): The key where the note group (e.g note_id or patient id etc) is present.
109
- This field is what the notes will be grouped by, and all notes belonging
110
- to this grouping will be in the same split
111
- margin (float): Margin of error when maintaining proportions in the splits
112
- """
113
- # Compute the distribution of NER types in the grouped notes.
114
- # For example the distribution of NER types in all notes belonging to a
115
- # particular patient
116
- self._lookup_split = {
117
- 'train': dict(),
118
- 'validation': dict(),
119
- 'test': dict()
120
- }
121
- ner_distribution = NERDistribution()
122
- for line in open(input_file, 'r'):
123
- note = json.loads(line)
124
- key = note[metadata_key][group_key]
125
- ner_distribution.update_distribution(spans=note[spans_key], key=key)
126
- # Initialize the dataset splits object
127
- dataset_splits = DatasetSplits(
128
- ner_distribution=ner_distribution,
129
- train_proportion=self._train_proportion,
130
- validation_proportion=self._validation_proportion,
131
- test_proportion=self._test_proportion,
132
- margin=margin
133
- )
134
- # Check the note and assign it to a split
135
- for line in open(input_file, 'r'):
136
- note = json.loads(line)
137
- key = note[metadata_key][group_key]
138
- split = dataset_splits.get_split(key=key)
139
- self.set_split(split)
140
- self.__update_split(key)
141
- return None
142
-
143
-
144
- def main() -> NoReturn:
145
- """
146
- Prepare dataset splits - training, validation & testing splits
147
- Compute ner distributions in our dataset. Based on this distribution
148
- and whether we want to keep certain notes grouped (e.g by patient)
149
- we assign notes to a split, such that the final ner type distribution
150
- in each split is similar.
151
- """
152
- # Compute the distribution of NER types in the grouped notes.
153
- # For example the distribution of NER types in all notes belonging to a
154
- # particular patient
155
- # The following code sets up the arguments to be passed via CLI or via a JSON file
156
- cli_parser = ArgumentParser(
157
- description='configuration arguments provided at run time from the CLI',
158
- formatter_class=ArgumentDefaultsHelpFormatter
159
- )
160
- cli_parser.add_argument(
161
- '--input_file',
162
- type=str,
163
- required=True,
164
- help='the the jsonl file that contains the notes'
165
- )
166
- cli_parser.add_argument(
167
- '--spans_key',
168
- type=str,
169
- default='spans',
170
- help='the key where the note spans is present in the json object'
171
- )
172
- cli_parser.add_argument(
173
- '--metadata_key',
174
- type=str,
175
- default='meta',
176
- help='the key where the note metadata is present in the json object'
177
- )
178
- cli_parser.add_argument(
179
- '--group_key',
180
- type=str,
181
- default='note_id',
182
- help='the key to group notes by in the json object'
183
- )
184
- cli_parser.add_argument(
185
- '--train_proportion',
186
- type=int,
187
- default=70,
188
- help='ratio of train dataset'
189
- )
190
- cli_parser.add_argument(
191
- '--train_file',
192
- type=str,
193
- default=None,
194
- help='The file to store the train data'
195
- )
196
- cli_parser.add_argument(
197
- '--validation_proportion',
198
- type=int,
199
- default=15,
200
- help='ratio of validation dataset'
201
- )
202
- cli_parser.add_argument(
203
- '--validation_file',
204
- type=str,
205
- default=None,
206
- help='The file to store the validation data'
207
- )
208
- cli_parser.add_argument(
209
- '--test_proportion',
210
- type=int,
211
- default=15,
212
- help='ratio of test dataset'
213
- )
214
- cli_parser.add_argument(
215
- '--test_file',
216
- type=str,
217
- default=None,
218
- help='The file to store the test data'
219
- )
220
- cli_parser.add_argument(
221
- '--margin',
222
- type=float,
223
- default=0.3,
224
- help='margin of error when maintaining proportions in the splits'
225
- )
226
- cli_parser.add_argument(
227
- '--print_dist',
228
- action='store_true',
229
- help='whether to print the label distribution in the splits'
230
- )
231
- args = cli_parser.parse_args()
232
- dataset_splitter = DatasetSplitter(
233
- train_proportion=args.train_proportion,
234
- validation_proportion=args.validation_proportion,
235
- test_proportion=args.test_proportion
236
- )
237
- dataset_splitter.assign_splits(
238
- input_file=args.input_file,
239
- spans_key=args.spans_key,
240
- metadata_key=args.metadata_key,
241
- group_key=args.group_key,
242
- margin=args.margin
243
- )
244
-
245
- if args.train_proportion > 0:
246
- with open(args.train_file, 'w') as file:
247
- for line in open(args.input_file, 'r'):
248
- note = json.loads(line)
249
- key = note[args.metadata_key][args.group_key]
250
- dataset_splitter.set_split('train')
251
- if dataset_splitter.check_note(key):
252
- file.write(json.dumps(note) + '\n')
253
-
254
- if args.validation_proportion > 0:
255
- with open(args.validation_file, 'w') as file:
256
- for line in open(args.input_file, 'r'):
257
- note = json.loads(line)
258
- key = note[args.metadata_key][args.group_key]
259
- dataset_splitter.set_split('validation')
260
- if dataset_splitter.check_note(key):
261
- file.write(json.dumps(note) + '\n')
262
-
263
- if args.test_proportion > 0:
264
- with open(args.test_file, 'w') as file:
265
- for line in open(args.input_file, 'r'):
266
- note = json.loads(line)
267
- key = note[args.metadata_key][args.group_key]
268
- dataset_splitter.set_split('test')
269
- if dataset_splitter.check_note(key):
270
- file.write(json.dumps(note) + '\n')
271
-
272
- if args.print_dist:
273
- # Read the dataset splits file and compute the NER type distribution
274
- key_counts = Counter()
275
- ner_distribution = NERDistribution()
276
- for line in open(args.input_file, 'r'):
277
- note = json.loads(line)
278
- key = note[args.metadata_key][args.group_key]
279
- key_counts[key] += 1
280
- ner_distribution.update_distribution(spans=note[args.spans_key], key=key)
281
- print_distribution = PrintDistribution(ner_distribution=ner_distribution, key_counts=key_counts)
282
- train_splits = dataset_splitter.get_split('train')
283
- validation_splits = dataset_splitter.get_split('validation')
284
- test_splits = dataset_splitter.get_split('test')
285
- all_splits = train_splits + validation_splits + test_splits
286
- # Print distribution for each split
287
- print_distribution.split_distribution(split='total', split_info=all_splits)
288
- print_distribution.split_distribution(split='train', split_info=train_splits)
289
- print_distribution.split_distribution(split='validation', split_info=validation_splits)
290
- print_distribution.split_distribution(split='test', split_info=test_splits)
291
-
292
-
293
- if __name__ == "__main__":
294
- main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ner_datasets/distribution/__init__.py DELETED
@@ -1,4 +0,0 @@
1
- from .dataset_splits import DatasetSplits
2
- from .ner_distribution import NERDistribution
3
- from .print_distribution import PrintDistribution
4
- __all__=["DatasetSplits", "NERDistribution", "PrintDistribution"]
 
 
 
 
 
ner_datasets/distribution/dataset_splits.py DELETED
@@ -1,218 +0,0 @@
1
- import random
2
- from collections import Counter
3
- from typing import NoReturn
4
-
5
- from .ner_distribution import NERDistribution
6
-
7
- random.seed(41)
8
-
9
-
10
- class DatasetSplits(object):
11
- """
12
- Prepare dataset splits - training, validation & testing splits
13
- Compute ner distributions in the dataset. Based on this we assign
14
- notes to different splits and at the same time, we keep the distribution of
15
- NER types in each split similar.
16
- Keep track of the split information - which notes are present in which split.
17
- The label distribution in each split, the number of notes in each split.
18
- """
19
-
20
- def __init__(
21
- self,
22
- ner_distribution: NERDistribution,
23
- train_proportion: int,
24
- validation_proportion: int,
25
- test_proportion: int,
26
- margin: float
27
- ) -> NoReturn:
28
- """
29
- Maintain split information. Assign notes based on the proportion of
30
- the splits, while keeping the label distribution in each split similar.
31
- Keep track of the split information - which notes are present in which split.
32
- The label distribution in each split, the number of notes in each split.
33
- Keep track of the dataset splits and the counts in each split etc.
34
- These will be used to assign the different notes to different
35
- splits while keeping the proportion of ner similar in each split.
36
- Get the maximum number of ner that can be present in the train,
37
- validation and test split. The total count will be used to
38
- calculate the current proportion of ner in the split. This can be used
39
- to keep the proportion of ner types consistent among different splits
40
- Args:
41
- ner_distribution (NERDistribution): The NER distribution in the dataset
42
- train_proportion (int): Ratio of train dataset
43
- validation_proportion (int): Ratio of validation dataset
44
- test_proportion (int): Ratio of test dataset
45
- margin (float): Margin by which the label distribution can be exceeded in the split
46
- """
47
- self._ner_distribution = ner_distribution
48
- # Compute the counts of NER types in the entire dataset
49
- total_distribution = Counter()
50
- for key, counts in ner_distribution.get_ner_distribution().items():
51
- for label, count in counts.items():
52
- total_distribution[label] += count
53
- # Compute the percentages of NER types in the entire dataset
54
- self._total_ner = sum(total_distribution.values())
55
- self._label_dist_percentages = {
56
- ner_type: float(count) / self._total_ner * 100 if self._total_ner else 0
57
- for ner_type, count in total_distribution.items()
58
- }
59
- self._margin = margin
60
- # The three splits
61
- self._splits = ['train', 'validation', 'test']
62
- self._split_weights = None
63
- self._splits_info = None
64
- # Keep track of the patient_ids that have been processed.
65
- # Since a patient can have multiple notes and we already know the
66
- # ner distribution for this patient across all the notes (i.e the ner types
67
- # and count that appear in all the notes associated with this patient)
68
- # We also keep all the notes associated with a patient in the same split
69
- # So we check if adding all the notes associated with this patient will
70
- # disturb the ner distribution (proportions) as mentioned before.
71
- self._processed_keys = dict()
72
- # Based on these proportions we compute train_ner_count, validation_ner_count, test_ner_count
73
- # Say the proportion are 85, 10, 5
74
- # The train split will have a maximum of 85% of the overall ner, validation will have 10% and test will have 5%
75
- # That is, if the total count of all ner is 100, on splitting the datasets
76
- # the train split will have a total of 85 ner, validation split will have a total of 10 ner and the
77
- # test split will have a total of 5 ner
78
- train_ner_count = int(train_proportion * self._total_ner / 100)
79
- validation_ner_count = int(validation_proportion * self._total_ner / 100)
80
- test_ner_count = int(test_proportion * self._total_ner / 100)
81
- # So based on this, we check if adding a note keeps the balance in proportion or not
82
- # If it does not, we check the splits given in the "remain" field of the dict (which is
83
- # the 2 other splits)
84
- self._split_weights = [train_proportion, validation_proportion, test_proportion]
85
- # Based on the split proportions, ner counts and ner distribution
86
- # we need to split our dataset into train, validation and test split
87
- # For each split we try and maintain the same distribution (proportions) between ner types
88
- # that we computed from the entire dataset (given by - ner_distribution)
89
- # If the entire dataset had AGE:50%, DATE:30%, LOC:20%, we want the same proportions
90
- # in each of the train, validation and test splits
91
- # So based on this, we check if adding a note keeps the balance in proportion or not
92
- # If it does not, we check the splits given in the "remain" field of the dict (which is
93
- # the 2 other splits)
94
- self._splits_info = {'train': {'remain': ['validation', 'test'],
95
- 'total': train_ner_count,
96
- 'remain_weights': [validation_proportion, test_proportion],
97
- 'groups': list(), 'number_of_notes': 0, 'label_dist': Counter()},
98
- 'validation': {'remain': ['train', 'test'],
99
- 'total': validation_ner_count,
100
- 'remain_weights': [train_proportion, test_proportion],
101
- 'groups': list(), 'number_of_notes': 0, 'label_dist': Counter()},
102
- 'test': {'remain': ['validation', 'train'],
103
- 'total': test_ner_count,
104
- 'remain_weights': [validation_proportion, train_proportion],
105
- 'groups': list(), 'number_of_notes': 0, 'label_dist': Counter()}}
106
-
107
- def __set_split(self, split: str) -> NoReturn:
108
- """
109
- Set the split that you are currently checking/processing.
110
- Based on the split you can perform certain checks and
111
- computation for that split.
112
- Args:
113
- split (str): The split - train, validation or test
114
- """
115
- self._split = split
116
-
117
- def __update_label_dist(self, distribution: Counter) -> NoReturn:
118
- """
119
- Once we have determined that a note can be added to the split we need to
120
- update the current count of the ner types in the split. So we pass the ner counts
121
- in the note that will be updated and update the counts of the ner types in the split.
122
- Args:
123
- distribution (Counter): Contains the ner type and its counts (distribution)
124
- """
125
- self._splits_info[self._split]['label_dist'].update(distribution)
126
-
127
- def __update_groups(self, note_group_key: str) -> NoReturn:
128
- """
129
- Once we have determined that a note can be added to the split, we append
130
- to a list some distinct element of the note (e.g note_id). This list will
131
- contain the note_ids of the notes that belong to this split.
132
- Args:
133
- note_group_key (str): Contains the note metadata - e.g note_id, institute etc
134
- """
135
- self._processed_keys[note_group_key] = self._split
136
- self._splits_info[self._split]['groups'].append(note_group_key)
137
-
138
- def __check_split(self, distribution: Counter) -> bool:
139
- """
140
- This function is used to check the resulting ner distribution in the split on adding this
141
- note to the split. We check how the proportion of ner changes if this note is added to
142
- the split. If the proportion exceeds the desired proportion then we return false
143
- to indicate that adding this note will upset the ner distribution across splits, so we should
144
- instead check adding this note to another split. If it does not upset the balance then we return
145
- True, which means we can add this note to this split. The desired proportion of ner is passed
146
- in the percentages argument - where we have the desired proportion for each ner type.
147
- Args:
148
- distribution (Counter): Contains the mapping between ner type and count
149
- Returns:
150
- (bool): True if the note can be added to the split, false otherwise
151
- """
152
- # Get the current ner types and counts in the split
153
- split_label_dist = self._splits_info[self._split]['label_dist']
154
- # Get the max ner count that can be present in the split
155
- # This will be used to compute the ner proportions in the split
156
- split_total = self._splits_info[self._split]['total']
157
- # Check if the proportion of the split picked is zero
158
- # and return False because we can't add any note to this split
159
- if split_total == 0:
160
- return False
161
- for ner_type, count in distribution.items():
162
- percentage = (split_label_dist.get(ner_type, 0) + count) / split_total * 100
163
- # Check if the proportion on adding this note exceeds the desired proportion
164
- # within the margin of error
165
- # If it does return false
166
- if percentage > self._label_dist_percentages[ner_type] + self._margin:
167
- return False
168
- return True
169
-
170
- def get_split(self, key: str) -> str:
171
- """
172
- Assign a split to the note - based on the distribution of ner types in the note
173
- and the distribution of ner types in the split. Essentially assign a note to a split
174
- such that the distribution of ner types in each split is similar, once all notes have
175
- been assigned to their respective splits.
176
- Args:
177
- key (str): The note id or patient id of the note (some grouping key)
178
- Returns:
179
- (str): The split
180
- """
181
- current_splits = self._splits
182
- current_weights = self._split_weights
183
- distribution = self._ner_distribution.get_group_distribution(key=key)
184
- if self._processed_keys.get(key, False):
185
- return self._processed_keys[key]
186
- while True:
187
- # Pick and set the split
188
- check_split = random.choices(current_splits, current_weights)[0]
189
- self.__set_split(check_split)
190
- # Get the ner distribution for this particular patient (across all the notes associated
191
- # with this patient) and check if the notes can be added to this split.
192
- # The margin of error for the ner proportions. As we said above we try and keep the proportions
193
- # across the splits the same, but we allow for some flexibility, so we can go +- the amount
194
- # given by margin.
195
- include = self.__check_split(distribution=distribution)
196
- if include:
197
- self.__update_groups(key)
198
- self.__update_label_dist(distribution=distribution)
199
- return check_split
200
- else:
201
- # Check the two other possible splits
202
- if len(current_splits) == 3:
203
- current_splits = self._splits_info[check_split]['remain']
204
- current_weights = self._splits_info[check_split]['remain_weights']
205
- # Check the one other possible split (when one of the above two other split checks returns false)
206
- elif len(current_splits) == 2 and current_weights[1 - current_splits.index(check_split)] != 0:
207
- index = current_splits.index(check_split)
208
- current_splits = [current_splits[1 - index]]
209
- current_weights = [100]
210
- # If it can't be added to any split - choose a split randomly
211
- else:
212
- current_splits = self._splits
213
- current_weights = self._split_weights
214
- check_split = random.choices(current_splits, current_weights)[0]
215
- self.__set_split(check_split)
216
- self.__update_groups(key)
217
- self.__update_label_dist(distribution=distribution)
218
- return check_split
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ner_datasets/distribution/ner_distribution.py DELETED
@@ -1,54 +0,0 @@
1
- from collections import Counter, defaultdict
2
- from typing import Sequence, Mapping, NoReturn
3
-
4
-
5
- class NERDistribution(object):
6
- """
7
- Store the distribution of ner types based on some key.
8
- That is we store the NER type distribution for some given key value and we update
9
- the distribution when spans related to that key are passed
10
- """
11
-
12
- def __init__(self) -> NoReturn:
13
- """
14
- Initialize the NER type - count mapping
15
- """
16
- # Counter that captures the ner types and counts per patient/note_id in the dataset
17
- # Depending on what we set the group_key as. Basically gather counts with respect
18
- # to some grouping of the notes
19
- # E.g - {PATIENT 1: {AGE: 99, DATE: 55, ...}, PATIENT 2: {AGE: 5, DATE: 9, ...}, ...}
20
- self._ner_distribution = defaultdict(Counter)
21
-
22
- def update_distribution(self, spans: Sequence[Mapping[str, str]], key: str) -> NoReturn:
23
- """
24
- Update the distribution of ner types for the given key
25
- Args:
26
- spans (Sequence[Mapping[str, str]]): The list of spans in the note
27
- key (str): The note id or patient id of the note (some grouping)
28
- """
29
- # Go through the spans in the note and compute the ner distribution
30
- # Compute both the overall ner distribution and ner distribution per
31
- # patient (i.e the ner types in all the notes associated with the patient)
32
- if not self._ner_distribution.get(key, False):
33
- self._ner_distribution[key] = Counter()
34
- for span in spans:
35
- self._ner_distribution[key][span['label']] += 1
36
-
37
- def get_ner_distribution(self) -> defaultdict:
38
- """
39
- Return overall ner distribution. The NER type distribution for every key.
40
- Returns:
41
- ner_distribution (defaultdict(Counter)): Overall NER type distribution for all keys
42
- """
43
- return self._ner_distribution
44
-
45
- def get_group_distribution(self, key: str) -> Counter:
46
- """
47
- Return the NER type distribution for the given key
48
- Returns:
49
- (Counter): ner distribution w.r.t some grouping (key)
50
- """
51
- if key in self._ner_distribution.keys():
52
- return self._ner_distribution[key]
53
- else:
54
- raise ValueError('Key not found')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ner_datasets/distribution/print_distribution.py DELETED
@@ -1,49 +0,0 @@
1
- from collections import Counter
2
- from typing import Sequence, NoReturn
3
-
4
- from .ner_distribution import NERDistribution
5
-
6
-
7
- class PrintDistribution(object):
8
- """
9
- This class is used to print the distribution of NER types
10
- """
11
-
12
- def __init__(self, ner_distribution: NERDistribution, key_counts: Counter) -> NoReturn:
13
- """
14
- Initialize
15
- Args:
16
- ner_distribution (NERDistribution): NERDistribution object that keeps track of the NER type distributions
17
- key_counts (Counter): Number of keys/groups (e.g note_ids, patient ids etc)
18
- """
19
- self._ner_distribution = ner_distribution
20
- self._key_counts = key_counts
21
-
22
- def split_distribution(self, split: str, split_info: Sequence[str]) -> NoReturn:
23
- """
24
- Print NER type distribution
25
- Args:
26
- split (str): The dataset split
27
- split_info (Sequence[str]): The keys belonging to that split
28
- """
29
- split_distribution = Counter()
30
- number_of_notes = 0
31
- for key in split_info:
32
- number_of_notes += self._key_counts[key]
33
- split_distribution.update(self._ner_distribution.get_group_distribution(key))
34
- total_ner = sum(split_distribution.values())
35
- percentages = {ner_type: float(count) / total_ner * 100 if total_ner else 0
36
- for ner_type, count in split_distribution.items()}
37
- print('{:^70}'.format('============ ' + split.upper() + ' NER Distribution ============='))
38
- print('{:<20}{:<10}'.format('Number of Notes: ', number_of_notes))
39
- print('{:<20}{:<10}\n'.format('Number of Groups: ', len(split_info)))
40
- for ner_type, count in split_distribution.most_common():
41
- print('{:<10}{:<10}{:<5}{:<10}{:<5}{:<10}'.format(
42
- 'NER Type: ', ner_type,
43
- 'Count: ', count,
44
- 'Percentage: ', '{:0.2f}'.format(percentages[ner_type]))
45
- )
46
- print('{:<10}{:<10}{:<5}{:<10}{:<5}{:<10}'.format(
47
- 'NER Type:', 'TOTALS', 'Count: ', total_ner, 'Percentage: ', '{:0.2f}'.format(100))
48
- )
49
- print('\n')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ner_datasets/preprocessing/__init__.py DELETED
@@ -1,2 +0,0 @@
1
- from .preprocessing_loader import PreprocessingLoader
2
- __all__ = ["PreprocessingLoader"]
 
 
 
ner_datasets/preprocessing/preprocessing_loader.py DELETED
@@ -1,64 +0,0 @@
1
- from typing import Union, Optional, Sequence
2
-
3
- from .sentencizers import SpacySentencizer, NoteSentencizer
4
- from .tokenizers import ClinicalSpacyTokenizer, SpacyTokenizer, CoreNLPTokenizer
5
-
6
-
7
- class PreprocessingLoader(object):
8
-
9
- @staticmethod
10
- def get_sentencizer(sentencizer: str) -> Union[SpacySentencizer, NoteSentencizer]:
11
- """
12
- Initialize the sentencizer based on the requested name
13
- We can either use a spacy/sci-spacy model (en_core_sci_lg, en_core_sci_sm or en_core_web_sm) or
14
- consider the entire note as a single sentence.
15
- Args:
16
- sentencizer (str): Specify which sentencizer you want to use
17
- Returns:
18
- Union[SpacySentencizer, NoteSentencizer]: An object of the requested
19
- sentencizer class
20
- """
21
- if sentencizer == 'en_core_sci_lg':
22
- return SpacySentencizer(spacy_model='en_core_sci_lg')
23
- elif sentencizer == 'en_core_sci_sm':
24
- return SpacySentencizer(spacy_model='en_core_sci_sm')
25
- elif sentencizer == 'en_core_web_sm':
26
- return SpacySentencizer(spacy_model='en_core_web_sm')
27
- elif sentencizer == 'note':
28
- return NoteSentencizer()
29
- else:
30
- raise ValueError('Invalid sentencizer - does not exist')
31
-
32
- @staticmethod
33
- def get_tokenizer(
34
- tokenizer: str,
35
- abbreviations: Optional[Sequence[str]] = None,
36
- ) -> Union[SpacyTokenizer, ClinicalSpacyTokenizer, CoreNLPTokenizer]:
37
- """
38
- Initialize the tokenizer based on the CLI arguments
39
- We can either use the default scispacy (en_core_sci_lg or en_core_web_sm)
40
- or the modified scispacy (with regex rule) tokenizer.
41
- It also supports the corenlp tokenizer
42
- Args:
43
- tokenizer (str): Specify which tokenizer you want to use
44
- abbreviations (Optional[Sequence[str]]): A list of abbreviations for which tokens will not be split - works only when
45
- used with the custom clinical tokenizer
46
- Returns:
47
- Union[SpacyTokenizer, ClinicalSpacyTokenizer, CoreNLPTokenizer]: An object of the requested tokenizer class
48
- """
49
- if tokenizer == 'en_core_sci_lg':
50
- return SpacyTokenizer(spacy_model='en_core_sci_lg')
51
- elif tokenizer == 'en_core_web_sm':
52
- return SpacyTokenizer(spacy_model='en_core_web_sm')
53
- elif tokenizer == 'en':
54
- return SpacyTokenizer(spacy_model='en')
55
- elif tokenizer == 'corenlp':
56
- return CoreNLPTokenizer()
57
- elif tokenizer == 'clinical':
58
- # Abbreviations - we won't split tokens that match these (e.g 18F-FDG)
59
- if abbreviations is None:
60
- return ClinicalSpacyTokenizer(spacy_model='en_core_sci_sm', abbreviations=abbreviations)
61
- else:
62
- return ClinicalSpacyTokenizer(spacy_model='en_core_sci_sm', abbreviations=abbreviations)
63
- else:
64
- raise ValueError('Invalid tokenizer - does not exist')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ner_datasets/preprocessing/sentencizers/__init__.py DELETED
@@ -1,3 +0,0 @@
1
- from .note_sentencizer import NoteSentencizer
2
- from .spacy_sentencizer import SpacySentencizer
3
- __all__=["NoteSentencizer", "SpacySentencizer"]
 
 
 
 
ner_datasets/preprocessing/sentencizers/mimic_stanza_sentencizer.py DELETED
@@ -1,37 +0,0 @@
1
- from typing import Iterable, Dict, Union
2
-
3
- import stanza
4
-
5
-
6
- class MimicStanzaSentencizer(object):
7
- """
8
- This class is used to read text and split it into
9
- sentences (and their start and end positions)
10
- using the mimic stanza package
11
- """
12
-
13
- def __init__(self, package: str):
14
- """
15
- Initialize a mimic stanza model to read text and split it into
16
- sentences.
17
- Args:
18
- package (str): Name of the mimic model
19
- """
20
- self._nlp = stanza.Pipeline('en', package=package, processors='tokenize', use_gpu=True)
21
-
22
- def get_sentences(self, text: str) -> Iterable[Dict[str, Union[str, int]]]:
23
- """
24
- Return an iterator that iterates through the sentences in the text
25
- Args:
26
- text (str): The text
27
- Returns:
28
- (Iterable[Dict[str, Union[str, int]]]): Yields a dictionary that contains the text of the sentence
29
- the start position of the sentence in the entire text
30
- and the end position of the sentence in the entire text
31
- """
32
- doc = self._nlp(text)
33
- for sentence in doc.sentences:
34
- yield {'text': sentence.text,
35
- 'start': sentence.tokens[0].start_char,
36
- 'end': sentence.tokens[-1].end_char,
37
- 'last_token': sentence.tokens[-1].text}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ner_datasets/preprocessing/sentencizers/note_sentencizer.py DELETED
@@ -1,33 +0,0 @@
1
- from typing import Iterable, Dict, Union
2
-
3
-
4
- class NoteSentencizer(object):
5
- """
6
- This class is used to read text and split it into
7
- sentences (and their start and end positions)
8
- This class considers an entire note or text as
9
- a single sentence
10
- """
11
-
12
- def __init__(self):
13
- """
14
- Nothing to be initialized
15
- """
16
-
17
- def get_sentences(self, text: str) -> Iterable[Dict[str, Union[str, int]]]:
18
- """
19
- Return an iterator that iterates through the sentences in the text.
20
- In this case it just returns the text itself.
21
- Args:
22
- text (str): The text
23
- Returns:
24
- (Iterable[Dict[str, Union[str, int]]]): Yields a dictionary that contains the text of the sentence
25
- the start position of the sentence in the entire text
26
- and the end position of the sentence in the entire text
27
- """
28
- yield {
29
- 'text': text,
30
- 'start': 0,
31
- 'end': len(text),
32
- 'last_token': None
33
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ner_datasets/preprocessing/sentencizers/spacy_sentencizer.py DELETED
@@ -1,37 +0,0 @@
1
- from typing import Iterable, Dict, Union
2
-
3
- import spacy
4
-
5
-
6
- class SpacySentencizer(object):
7
- """
8
- This class is used to read text and split it into
9
- sentences (and their start and end positions)
10
- using a spacy model
11
- """
12
-
13
- def __init__(self, spacy_model: str):
14
- """
15
- Initialize a spacy model to read text and split it into
16
- sentences.
17
- Args:
18
- spacy_model (str): Name of the spacy model
19
- """
20
- self._nlp = spacy.load(spacy_model)
21
-
22
- def get_sentences(self, text: str) -> Iterable[Dict[str, Union[str, int]]]:
23
- """
24
- Return an iterator that iterates through the sentences in the text
25
- Args:
26
- text (str): The text
27
- Returns:
28
- (Iterable[Dict[str, Union[str, int]]]): Yields a dictionary that contains the text of the sentence
29
- the start position of the sentence in the entire text
30
- and the end position of the sentence in the entire text
31
- """
32
- document = self._nlp(text)
33
- for sentence in document.sents:
34
- yield {'text': sentence.text,
35
- 'start': sentence.start_char,
36
- 'end': sentence.end_char,
37
- 'last_token': None}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ner_datasets/preprocessing/tokenizers/__init__.py DELETED
@@ -1,4 +0,0 @@
1
- from .spacy_tokenizer import SpacyTokenizer
2
- from .core_nlp_tokenizer import CoreNLPTokenizer
3
- from .clinical_spacy_tokenizer import ClinicalSpacyTokenizer
4
- __all__=["SpacyTokenizer", "CoreNLPTokenizer", "ClinicalSpacyTokenizer"]
 
 
 
 
 
ner_datasets/preprocessing/tokenizers/abbreviations/check.txt DELETED
@@ -1,20 +0,0 @@
1
- sec.
2
- secs.
3
- Sec.
4
- Secs.
5
- fig.
6
- figs.
7
- Fig.
8
- Figs.
9
- eq.
10
- eqs.
11
- Eq.
12
- Eqs.
13
- no.
14
- nos.
15
- No.
16
- Nos.
17
- al.
18
- gen.
19
- sp.
20
- nov.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ner_datasets/preprocessing/tokenizers/abbreviations/medical_abbreviations_curated.txt DELETED
@@ -1,87 +0,0 @@
1
- -ve
2
- +ve
3
- a.c.
4
- a/g
5
- b.i.d.
6
- C&S
7
- C/O
8
- D/C
9
- D&C
10
- D and C
11
- H&H
12
- H&P
13
- h.s.
14
- H/O
15
- h/o
16
- I&D
17
- M/H
18
- N/V
19
- O&P
20
- O.D.
21
- O.S.
22
- O.U.
23
-
24
- p.o.
25
- p.r.n.
26
- q.d.
27
- q.i.d.
28
- R/O
29
- s/p
30
- T&A
31
- t.i.d.
32
- u/a
33
- u**
34
- y.o.
35
- F/u
36
- Crohn's
37
- R.N.
38
- S/p
39
- S/P
40
- s/P
41
- N/A
42
- n/a
43
- N/a
44
- n/A
45
- w/
46
- Pt.
47
- pt.
48
- PT.
49
- cf.
50
- CF.
51
- Cf.
52
- dr.
53
- DR.
54
- Dr.
55
- ft.
56
- FT.
57
- Ft.
58
- lt.
59
- LT.
60
- Lt.
61
- mr.
62
- MR.
63
- Mr.
64
- ms.
65
- MS.
66
- Ms.
67
- mt.
68
- MT.
69
- Mt.
70
- mx.
71
- MX.
72
- Mx.
73
- ph.
74
- PH.
75
- Ph.
76
- rd.
77
- RD.
78
- Rd.
79
- st.
80
- ST.
81
- St.
82
- vs.
83
- VS.
84
- Vs.
85
- wm.
86
- WM.
87
- Wm.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ner_datasets/preprocessing/tokenizers/abbreviations/medical_abbreviations_wiki.txt DELETED
@@ -1,459 +0,0 @@
1
- +ve
2
- x/12
3
- x/40
4
- x/52
5
- x/7
6
- 18F-FDG
7
-
8
- 2/2
9
- 3TC
10
- 5-FU
11
- 5-HIAA
12
- 5-HT
13
- 6MP
14
- a.a.
15
- A1C
16
- Aa.
17
- AAOx3
18
- A/B
19
- a.c.
20
- AC&BC
21
- ad.
22
- part.
23
- A+E
24
- AF-AFP
25
- a.h.
26
- altern.
27
- d.
28
- Anti-
29
- A&O
30
- A/O
31
- A&Ox3
32
- A&Ox4
33
- a.p.
34
- A&P
35
- A/P
36
- applic.
37
- aq.
38
- bull.
39
- calid.
40
- dist.
41
- gel.
42
- ASC-H
43
- ASC-US
44
- A-T
45
- AT-III
46
- aur.
47
- dextro.
48
- aurist.
49
- A&W
50
- A/W
51
- b.i.d.
52
- b/l
53
- bl.cult
54
- B/O
55
- BRCA1
56
- BRCA2
57
- C1
58
- C2
59
- c/b
60
- CBC/DIFF
61
- C/C/E
62
- CCK-PZ
63
- CHEM-7
64
- CHEM-20
65
- C/O
66
- c/o
67
- CO2
68
- COX-1
69
- COX-2
70
- COX-3
71
- C/S
72
- C&S
73
- C-section
74
- C-spine
75
- C-SSRS
76
- c/a/p
77
- c/w
78
- D5
79
- D25
80
- D4T
81
- D5W
82
- D&C
83
- D/C
84
- D&E
85
- DHEA-S
86
- Di-Di
87
- DM2
88
- D/O
89
- D/T
90
- Ex-n
91
- F/C
92
- F/C/S
93
- FEF25–75
94
- FEV1
95
- fl.oz.
96
- FTA-ABS
97
- F/U
98
- G6PD
99
- G-CSF
100
- GM-CSF
101
- H/A
102
- HbA1c
103
- HCO3
104
- HDL-C
105
- H&E
106
- H/H
107
- H&H
108
- H&M
109
- HMG-CoA
110
- H-mole
111
- H/O
112
- H&P
113
- H/oPI
114
- h.s.
115
- I131
116
- ICD-10
117
- I&D
118
- IgG4-RD
119
- IgG4-RKD
120
- IgG4-ROD
121
- IgG4-TIN
122
- INF(-α/-β/-γ)
123
- I&O
124
- IV-DSA
125
- L&D
126
- LDL-C
127
- L-DOPA
128
- L/S
129
- MC&S
130
- M/E
131
- MgSO4
132
- MHA-TP
133
- M&M
134
- MMR-D
135
- Mono-Di
136
- Mono-Mono
137
- MS-AFP
138
- MSO4
139
- MVo2
140
- No.
141
- rep.
142
- n.s.
143
- n/t
144
- N&V
145
- n/v
146
- O2
147
- OB-GYN
148
- ob-gyne
149
- O/E
150
- O/N
151
- O&P
152
- P&A
153
- PAI-1
154
- PAPP-A
155
- p.c.
156
- PIG-A
157
- PM&R
158
- p.r.
159
- Pt.
160
- p.v.
161
- P-Y
162
- q2wk
163
- q6h
164
- q6°
165
- q.a.d.
166
- q.AM
167
- q.d.
168
- q.d.s.
169
- q.h.
170
- q.h.s.
171
- q.i.d.
172
- q.l.
173
- q.m.t.
174
- q.n.
175
- q.n.s.
176
- q.o.d.
177
- q.o.h.
178
- q.s.
179
- q.v.
180
- q.wk.
181
- r/g/m
182
- R&M
183
- R/O
184
- r/r/w
185
- R/t
186
- RT-PCR
187
- S1
188
- S2
189
- S3
190
- S4
191
- S&O
192
- S.D.
193
- op.
194
- SMA-6
195
- SMA-7
196
- s/p
197
- spp.
198
- Sp.
199
- fl.
200
- gr.
201
- S/S
202
- S/Sx
203
- Staph.
204
- Strep.
205
- Strepto.
206
- T&A
207
- T&C
208
- T&S
209
- TAH-BSO
210
- T2DM
211
- T/F
212
- T&H
213
- Tib-Fib
214
- TRF'd
215
- TSHR-Ab
216
- T.S.T.H.
217
- U/A
218
- U&E
219
- U/O
220
- V-fib
221
- V/Q
222
- WAIS-R
223
- W/C
224
- WISC-R
225
- W/O
226
- w/o
227
- w/u
228
- X-AFP
229
- y/o
230
- a.c.h.s.
231
- ac&hs
232
- a.d.
233
- ad.
234
- add.
235
- lib.
236
- admov.
237
- us.
238
- æq.
239
- agit.
240
- alt.
241
- d.
242
- dieb.
243
- h.
244
- hor.
245
- a.m.
246
- amp.
247
- com.
248
- dest.
249
- ferv.
250
- a.l.
251
- a.s.
252
- a.u.
253
- b.d.s.
254
- bib.
255
- b.i.d.
256
- b.d.
257
- ind.
258
- bol.
259
- Ph.Br.
260
- b.t.
261
- bucc.
262
- cap.
263
- caps.
264
- cap.
265
- c.m.
266
- c.m.s.
267
- c.
268
- cib.
269
- c.c.
270
- cf.
271
- c.n.
272
- cochl.
273
- ampl.
274
- infant.
275
- mag.
276
- mod.
277
- parv.
278
- colet.
279
- comp.
280
- contin.
281
- cpt.
282
- cr.
283
- cuj.
284
- c.v.
285
- cyath.
286
- vinos.
287
- D5LR
288
- D5NS
289
- D5W
290
- D10W
291
- D10W
292
- D/C
293
- decoct.
294
- det.
295
- dil.
296
- dim.
297
- p.
298
- æ.
299
- disp.
300
- div.
301
- d.t.d.
302
- elix.
303
- e.m.p.
304
- emuls.
305
- exhib.
306
- f.
307
- f.h.
308
- fl.
309
- fld.
310
- f.m.
311
- pil.
312
- f.s.a.
313
- ft.
314
- garg.
315
- gutt.
316
- habt.
317
- decub.
318
- intermed.
319
- tert.
320
- inj.
321
- i.m.
322
- inf.
323
- i.v.
324
- i.v.p.
325
- lat.
326
- dol.
327
- lb.
328
- l.c.d.
329
- liq.
330
- lot.
331
- M.
332
- m.
333
- max.
334
- m.d.u.
335
- mg/dL
336
- min.
337
- mist.
338
- mit.
339
- mitt.
340
- præscript.
341
- neb.
342
- noct.
343
- n.p.o.
344
- 1/2NS
345
- o 2
346
- o2
347
- o.d.
348
- o.m.
349
- omn.
350
- bih.
351
- o.n.
352
- o.s.
353
- o.u.
354
- p.c.h.s.
355
- pc&hs
356
- Ph.Br.
357
- Ph.Eur.
358
- Ph.Int.
359
- pig./pigm.
360
- p.m.
361
- p.o.
362
- ppt.
363
- p.r.
364
- p.r.n.
365
- pt.
366
- pulv.
367
- p.v.
368
- q.1
369
- q.1°
370
- q4PM
371
- q.a.m.
372
- q.d./q.1.d.
373
- q.d.a.m.
374
- q.d.p.m.
375
- q.p.m.
376
- q.q.
377
- q.q.h.
378
- a.d
379
- rep.
380
- rept.
381
- R/L
382
- s.
383
- s.a.
384
- sem.
385
- s.i.d.
386
- sig.
387
- sing.
388
- s.l.
389
- sol.
390
- s.o.s.
391
- s.s.
392
- st.
393
- sum.
394
- supp.
395
- susp.
396
- syr.
397
- tab.
398
- tal.
399
- t.
400
- t.d.s.
401
- t.i.d.
402
- t.d.
403
- tinct.
404
- t.i.w.
405
- top.
406
- tinc.
407
- trit.
408
- troch.
409
- u.d.
410
- ut.
411
- dict.
412
- ung.
413
- vag.
414
- w/a
415
- w/f
416
- y.o.
417
- ADD-RT
418
- A-T
419
- PDD-NOS
420
- Alzheimer's
421
- Age-related
422
- Aldosterone-producing
423
- Alcohol-related
424
- Ataxia-telangiectasia
425
- Binswanger's
426
- Becker's
427
- Bloom's
428
- Brown-Séquard
429
- Crimean-Congo
430
- Cerebro-oculo-facio-skeletal
431
- Carbapenem-resistant
432
- Drug-resistant
433
- End-stage
434
- Graft-versus-host
435
- Huntington's
436
- High-functioning
437
- Hypoxanthine-guanine
438
- Legionnaires'
439
- Low-functioning
440
- Multi-drug-resistant
441
- Multi-infarct
442
- Machado-Joseph
443
- Maturity-onset
444
- Multi-sensory
445
- Obsessive-compulsive
446
- Parkinson's
447
- kinase-associated
448
- Post-polio
449
- Port-wine
450
- Reye's
451
- Sensory-based
452
- Vitus's
453
- Septo-optic
454
- ST-elevation
455
- Short-lasting
456
- Urticaria-deafness-amyloidosis
457
- Wilson's
458
- drug-resistant
459
- X-linked
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ner_datasets/preprocessing/tokenizers/clinical_spacy_tokenizer.py DELETED
@@ -1,73 +0,0 @@
1
- import re
2
- import spacy
3
- from spacy.symbols import ORTH
4
- from .spacy_tokenizer import SpacyTokenizer
5
- from .utils import DateRegex, CleanRegex, ClinicalRegex
6
-
7
-
8
- class ClinicalSpacyTokenizer(SpacyTokenizer):
9
- """
10
- This class is used to read text and return the tokens
11
- present in the text (and their start and end positions)
12
- """
13
-
14
- def __init__(self, spacy_model, abbreviations,
15
- split_multiple=True, split_temperature=True,
16
- split_percentage=True):
17
- """
18
- Initialize a spacy model to read text and split it into
19
- tokens.
20
- Args:
21
- spacy_model (str): Name of the spacy model
22
- """
23
- super().__init__(spacy_model)
24
- self._nlp.tokenizer.prefix_search = self.__get_prefix_regex(split_multiple, split_temperature,
25
- split_percentage).search
26
- self._nlp.tokenizer.infix_finditer = self.__get_infix_regex().finditer
27
- self._nlp.tokenizer.suffix_search = self.__get_suffix_regex().search
28
- new_rules = {}
29
- for orth, exc in self._nlp.tokenizer.rules.items():
30
- if re.search('((Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Sept|Oct|Nov|Dec)[.]$)|(^(W|w)ed$)', orth):
31
- continue
32
- new_rules[orth] = exc
33
- self._nlp.tokenizer.rules = new_rules
34
- if (abbreviations != None):
35
- for abbreviation in abbreviations:
36
- special_case = [{ORTH: abbreviation}]
37
- self._nlp.tokenizer.add_special_case(abbreviation, special_case)
38
- # this matches any lower case tokens - abstract this part out - whether to lowercase abbreviations or not
39
- exclusions_uncased = {abbreviation.lower(): [{ORTH: abbreviation.lower()}] for abbreviation in
40
- abbreviations}
41
- for k, excl in exclusions_uncased.items():
42
- try:
43
- self._nlp.tokenizer.add_special_case(k, excl)
44
- except:
45
- print('failed to add exception: {}'.format(k))
46
-
47
- def __get_prefix_regex(self, split_multiple, split_temperature, split_percentage):
48
-
49
- date_prefix = DateRegex.get_infixes()
50
- clinical_prefix = ClinicalRegex.get_prefixes(split_multiple, split_temperature, split_percentage)
51
- clean_prefix = CleanRegex.get_prefixes()
52
- digit_infix = ClinicalRegex.get_infixes()
53
- prefixes = clean_prefix + self._nlp.Defaults.prefixes + date_prefix + clinical_prefix + digit_infix
54
- prefix_regex = spacy.util.compile_prefix_regex(prefixes)
55
- return prefix_regex
56
-
57
- def __get_suffix_regex(self):
58
- clean_suffix = CleanRegex.get_suffixes()
59
- suffixes = clean_suffix + self._nlp.Defaults.suffixes
60
- suffix_regex = spacy.util.compile_suffix_regex(suffixes)
61
- return suffix_regex
62
-
63
- def __get_infix_regex(self):
64
-
65
- date_infixes = DateRegex.get_infixes()
66
- clean_infixes = CleanRegex.get_infixes()
67
- digit_infix = ClinicalRegex.get_infixes()
68
- infixes = self._nlp.Defaults.infixes + date_infixes + clean_infixes
69
- infix_re = spacy.util.compile_infix_regex(infixes)
70
- return infix_re
71
-
72
- def get_nlp(self):
73
- return self._nlp
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ner_datasets/preprocessing/tokenizers/core_nlp_tokenizer.py DELETED
@@ -1,58 +0,0 @@
1
- import json
2
- from typing import Iterable, Mapping, Dict, Union
3
-
4
- from pycorenlp import StanfordCoreNLP
5
-
6
-
7
- class CoreNLPTokenizer(object):
8
- """
9
- This class is used to read text and return the tokens
10
- present in the text (and their start and end positions)
11
- using core nlp tokenization
12
- """
13
-
14
- def __init__(self, port: int = 9000):
15
- """
16
- Initialize a core nlp server to read text and split it into
17
- tokens using the core nlp annotators
18
- Args:
19
- port (int): The port to run the server on
20
- """
21
- self._core_nlp = StanfordCoreNLP('http://localhost:{0}'.format(port))
22
-
23
- def get_stanford_annotations(self, text: str, annotators: str = 'tokenize,ssplit,pos,lemma') -> Dict:
24
- """
25
- Use the core nlp server to annotate the text and return the
26
- results as a json object
27
- Args:
28
- text (str): The text to annotate
29
- annotators (str): The core nlp annotations to run on the text
30
- Returns:
31
- output (Dict): The core nlp results
32
- """
33
- output = self._core_nlp.annotate(text, properties={
34
- "timeout": "50000",
35
- "ssplit.newlineIsSentenceBreak": "two",
36
- 'annotators': annotators,
37
- 'outputFormat': 'json'
38
- })
39
- if type(output) is str:
40
- output = json.loads(output, strict=False)
41
- return output
42
-
43
- def get_tokens(self, text: str) -> Iterable[Dict[str, Union[str, int]]]:
44
- """
45
- Return an iterable that iterates through the tokens in the text
46
- Args:
47
- text (str): The text to annotate
48
- Returns:
49
- (Iterable[Mapping[str, Union[str, int]]]): Yields a dictionary that contains the text of the token
50
- the start position of the token in the entire text
51
- and the end position of the token in the entire text
52
- """
53
- stanford_output = self.get_stanford_annotations(text)
54
- for sentence in stanford_output['sentences']:
55
- for token in sentence['tokens']:
56
- yield {'text': token['originalText'],
57
- 'start': token['characterOffsetBegin'],
58
- 'end': token['characterOffsetEnd']}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ner_datasets/preprocessing/tokenizers/spacy_tokenizer.py DELETED
@@ -1,49 +0,0 @@
1
- import spacy
2
- from typing import Tuple, Iterable, Mapping, Dict, Union
3
-
4
-
5
- class SpacyTokenizer(object):
6
- """
7
- This class is used to read text and return the tokens
8
- present in the text (and their start and end positions)
9
- using spacy
10
- """
11
-
12
- def __init__(self, spacy_model: str):
13
- """
14
- Initialize a spacy model to read text and split it into
15
- tokens.
16
- Args:
17
- spacy_model (str): Name of the spacy model
18
- """
19
- self._nlp = spacy.load(spacy_model)
20
-
21
- @staticmethod
22
- def __get_start_and_end_offset(token: spacy.tokens.Token) -> Tuple[int, int]:
23
- """
24
- Return the start position of the token in the entire text
25
- and the end position of the token in the entire text
26
- Args:
27
- token (spacy.tokens.Token): The spacy token object
28
- Returns:
29
- start (int): the start position of the token in the entire text
30
- end (int): the end position of the token in the entire text
31
- """
32
- start = token.idx
33
- end = start + len(token)
34
- return start, end
35
-
36
- def get_tokens(self, text: str) -> Iterable[Dict[str, Union[str, int]]]:
37
- """
38
- Return an iterable that iterates through the tokens in the text
39
- Args:
40
- text (str): The text to annotate
41
- Returns:
42
- (Iterable[Mapping[str, Union[str, int]]]): Yields a dictionary that contains the text of the token
43
- the start position of the token in the entire text
44
- and the end position of the token in the entire text
45
- """
46
- document = self._nlp(text)
47
- for token in document:
48
- start, end = SpacyTokenizer.__get_start_and_end_offset(token)
49
- yield {'text': token.text, 'start': start, 'end': end}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ner_datasets/preprocessing/tokenizers/utils/__init__.py DELETED
@@ -1,4 +0,0 @@
1
- from .date_regex import DateRegex
2
- from .clean_regex import CleanRegex
3
- from .clinical_regex import ClinicalRegex
4
- __all__=["DateRegex", "CleanRegex", "ClinicalRegex"]
 
 
 
 
 
ner_datasets/preprocessing/tokenizers/utils/clean_regex.py DELETED
@@ -1,64 +0,0 @@
1
- from typing import List
2
- class CleanRegex(object):
3
- """
4
- This class is used to define the regexes that will be used by the
5
- spacy tokenizer rules. Mainly the regexes are used to clean up
6
- tokens that have unwanted characters (e.g extra hyphens).
7
- """
8
- #Staff - 3
9
- #Hosp - 4, 5
10
- #Loc - 2
11
- @staticmethod
12
- def get_prefixes() -> List[str]:
13
- """
14
- This function is used to build the regex that will clean up dirty characters
15
- present at the prefix position (start position) of a token. For example the token ---clean
16
- has three hyphens that need to be split from the word clean. This regex
17
- will be used by spacy to clean it up. This rule considers any characters that is
18
- not a letter or a digit as dirty characters
19
- Examples: ----------------9/36, :63, -ESH
20
- Returns:
21
- (list): List of regexes to clean the prefix of the token
22
- """
23
- #Handles case 5 of HOSP
24
- return ['((?P<prefix>([^a-zA-Z0-9.]))(?P=prefix)*)', '([.])(?!\d+(\W+|$))']
25
-
26
- @staticmethod
27
- def get_suffixes() -> List[str]:
28
- """
29
- This function is used to build the regex that will clean up dirty characters
30
- present at the suffix position (end position) of a token. For example the token clean---
31
- has three hyphens that need to be split from the word clean. This regex
32
- will be used by spacy to clean it up. This rule considers any characters that is
33
- not a letter or a digit as dirty characters
34
- Examples: FRANK^, regimen---------------, no)
35
- Returns:
36
- (list): List of regexes to clean the suffix of the token
37
- """
38
- return ['((?P<suffix>([^a-zA-Z0-9]))(?P=suffix)*)']
39
-
40
- @staticmethod
41
- def get_infixes() -> List[str]:
42
- """
43
- This function is used to build the regex that will clean up dirty characters
44
- present at the infix position (in-between position) of a token. For example the token
45
- clean---me has three hyphens that need to be split from the word clean and me. This regex
46
- will be used by spacy to clean it up. This rule considers any characters that is
47
- not a letter or a digit as dirty characters
48
- Examples: FRANK^08/30/76^UNDERWOOD, regimen---------------1/37
49
- Returns:
50
- (list): List of regexes to clean the infix of the token
51
- """
52
- #Handles case 3 of STAFF
53
- #Handles case 4 of HOSP
54
- #Handles case 2 of LOC
55
- connector_clean = '\^|;|&#|([\(\)\[\]:="])'
56
- #full_stop_clean = '(?<=[a-zA-Z])(\.)(?=([A-Z][A-Za-z]+)|[^a-zA-Z0-9_.]+)'
57
- bracket_comma_clean = '(((?<=\d)[,)(](?=[a-zA-Z]+))|((?<=[a-zA-Z])[,)(](?=\w+)))'
58
- #special_char_clean = '(?<=[a-zA-Z])(\W{3,}|[_]{3,})(?=[A-Za-z]+)'
59
- special_char_clean = '(?<=[a-zA-Z])([_\W_]{3,})(?=[A-Za-z]+)'
60
- #Sometimes when there is no space between a period and a comma - it becomes part of the same token
61
- #e.g John.,M.D - we need to split this up.
62
- comma_period_clean = '(?<=[a-zA-Z])(\.,)(?=[A-Za-z]+)'
63
-
64
- return [connector_clean, bracket_comma_clean, special_char_clean, comma_period_clean]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ner_datasets/preprocessing/tokenizers/utils/clinical_regex.py DELETED
@@ -1,309 +0,0 @@
1
- from typing import List
2
- class ClinicalRegex(object):
3
- """
4
- This class is used to define the regexes that will be used by the
5
- spacy tokenizer rules. Mainly the regexes are used to clean up
6
- tokens that have unwanted characters and typos (e.g missing spaces).
7
- In the descriptions when we mention symbol we refer to any character
8
- that is not a letter or a digit or underscore. The spacy tokenizer splits
9
- the text by whitespace and applies these rules (along with some default rules)
10
- to the individual tokens.
11
- """
12
- #Patient - 2, 3, 5
13
- #Staff - 1, 2
14
- #Hosp - 2, 3
15
- #Loc - 1, 3
16
- @staticmethod
17
- def get_word_typo_prefix():
18
- """
19
- If token contains a typo. What we mean by a typo is when two tokens
20
- that should be separate tokens are fused into one token because there
21
- is a missing space.
22
- Examples: JohnMarital Status - John is the name that is fused into the
23
- token Marital because of a missing space.
24
- The regex checks if we have a sequence of characters followed by another
25
- sequence of characters that starts with a capital letter, followed by two or
26
- more small letters, we assume this is a typo and split the tokens (two sequences) up.
27
- If there is a symbol separating the two sequences, we ease the condition saying
28
- the Capital letter can be followed by two or more capital/small letters.
29
- Returns:
30
- (str): regex to clean tokens that are fused because of a missing space
31
- """
32
- #Handles cases 2 of PATIENT
33
- #Handles cases 1 & 2 of STAFF
34
- #Handles cases 2 & 3 of HOSP
35
- #Handles cases 1 & 3 of LOC
36
- #'(([a-z]+)|([A-Z]+)|([A-Z][a-z]+))(?=(([-./]*[A-Z][a-z]{2,})|([-./]+[A-Z][a-zA-Z]{2,})))'
37
- return '(([a-z]+)|([A-Z]{2,})|([A-Z][a-z]+))(?=(([-./]*[A-Z][a-z]{2,})|([-./]+[A-Z][a-zA-Z]{2,})))'
38
-
39
- @staticmethod
40
- def get_word_symbol_digit_prefix() -> str:
41
- """
42
- If text is followed by one or more symbols and then followed by one or more digits
43
- we make the assumption that the text is a separate token. Spacy will use this regex
44
- to extract the text portion as one token and will then move on to
45
- process the rest (symbol and tokens) based on the defined rules.
46
- Examples: Yang(4986231) - "Yang" will become a separate token & "(4986231)" will
47
- be processed as new token
48
- Returns:
49
- (str): regex to clean text followed by symbols followed by digits
50
- """
51
- #Handles cases 3 & 5 of patient
52
- return '([a-zA-Z]+)(?=\W+\d+)'
53
-
54
- @staticmethod
55
- def get_multiple_prefix(split_multiple: bool) -> str:
56
- """
57
- If text is of the format take it x2 times, this function
58
- can be used to treat the entire thing as one token or
59
- split into two separate tokens
60
- Args:
61
- split_multiple (bool): whether to treat it as one token or split them up
62
- Returns:
63
- (str): regex to either keep as one token or split into two
64
- """
65
- if(split_multiple):
66
- return '([x])(?=(\d{1,2}$))'
67
- else:
68
- return '[x]\d{1,2}$'
69
-
70
- @staticmethod
71
- def get_pager_prefix():
72
- return '([pXxPb])(?=(\d{4,}|\d+[-]\d+))'
73
-
74
- @staticmethod
75
- def get_age_word_prefix():
76
- return '([MFmf])(?=\d{2,3}(\W+|$))'
77
-
78
- @staticmethod
79
- def get_id_prefix():
80
- return '(ID|id|Id)(?=\d{3,})'
81
-
82
- @staticmethod
83
- def get_word_period_prefix():
84
- return '((cf|CF|Cf|dr|DR|Dr|ft|FT|Ft|lt|LT|Lt|mr|MR|Mr|ms|MS|Ms|mt|MT|Mt|mx|MX|Mx|ph|PH|Ph|rd|RD|Rd|st|ST|St|vs|VS|Vs|wm|WM|Wm|[A-Za-z]{1})[.])(?=((\W+|$)))'
85
-
86
- @staticmethod
87
- def get_chemical_prefix():
88
- #Vitamin B12 T9 or maybe codes like I48.9- should probably do \d{1,2} - limit arbitrary numbers
89
- """
90
- There are certain chemicals, vitamins etc that should not be split. They
91
- should be kept as a single token - for example the token "B12" in
92
- "Vitamin B12". This regex checks if there is a single capital letter
93
- followed by some digits (there can be a hyphen in between those digits)
94
- then this most likely represents a token that should not be split
95
- Returns:
96
- (str): regex to keep vitamin/chemical names as a single token
97
- """
98
- #return '((\d)?[A-EG-LN-OQ-WYZ]{1}\d+([.]\d+)?(-\d{1,2})*)(?=(([\(\)\[\]:="])|\W*$))'
99
- return '((\d)?[A-EG-LN-OQ-WYZ]{1}\d+([.]\d+)?(-\d+)*)(?=(([\(\)\[\]:="])|\W*$))'
100
-
101
- @staticmethod
102
- def get_chemical_prefix_small():
103
- #Vitamin B12 T9 or maybe codes like I48.9- should probably do \d{1,2} - limit arbitrary numbers
104
- """
105
- There are certain chemicals, vitamins etc that should not be split. They
106
- should be kept as a single token - for example the token "B12" in
107
- "Vitamin B12". This regex checks if there is a single capital letter
108
- followed by some digits (there can be a hyphen in between those digits)
109
- then this most likely represents a token that should not be split
110
- Returns:
111
- (str): regex to keep vitamin/chemical names as a single token
112
- """
113
- #return '((\d)?[A-EG-LN-OQ-WYZ]{1}\d+([.]\d+)?(-\d{1,2})*)(?=(([\(\)\[\]:="])|\W*$))'
114
- return '((\d)?[a-eg-ln-oq-wyz]{1}\d+([.]\d+)?(-\d+)*)(?=(([\(\)\[\]:="])|\W*$))'
115
-
116
- @staticmethod
117
- def get_instrument_prefix():
118
- """
119
- There are cases when there are tokens like L1-L2-L3, we want to keep these as one
120
- single token. This regex checks if there is a capital letter
121
- Returns:
122
- (str): regex to keep vitamin/chemical names as a single token
123
- """
124
- return '([A-Z]{1,2}\d+(?P<instrument>[-:]+)[A-Z]{1,2}\d+((?P=instrument)[A-Z]{1,2}\d+)*)'
125
-
126
- @staticmethod
127
- def get_instrument_prefix_small():
128
- """
129
- There are cases when there are tokens like L1-L2-L3, we want to keep these as one
130
- single token. This regex checks if there is a capital letter
131
- Returns:
132
- (str): regex to keep vitamin/chemical names as a single token
133
- """
134
- return '([a-z]{1,2}\d+(?P<instrument_small>[-:]+)[a-z]{1,2}\d+((?P=instrument_small)[a-z]{1,2}\d+)*)'
135
-
136
- #Handles Case 3, 4, 5 of MRN
137
- #Handles Case 1, 2, 3 of PHONE
138
- #Handles Case 7, 10 of AGE
139
- #Handles Case 1 of IDNUM
140
- #Handles Case 3, 5 of PATIENT
141
- #Handles Case 7 of HOSP
142
- #Handles Case 1 of General
143
- @staticmethod
144
- def get_age_typo_prefix():
145
- """
146
- There are cases when there is no space between the text and the age
147
- Example: Plan88yo - we want Plan to be a separate token
148
- Returns:
149
- (str):
150
- """
151
- age_suffix = '(([yY][eE][aA][rR]|[yY][oO]' + \
152
- '|[yY][rR]|[yY]\.[oO]|[yY]/[oO]|[fF]|[mM]|[yY])' + \
153
- '(-)*([o|O][l|L][d|D]|[f|F]|[m|M]|[o|O])?)'
154
- return '([a-zA-Z]+)(?=((\d{1,3})' + age_suffix + '$))'
155
-
156
- @staticmethod
157
- def get_word_digit_split_prefix():
158
- #Word followed by more than 3 digits - might not be part of the same token
159
- #and could be a typo
160
- #This need not be true - maybe we have an id like BFPI980801 - this will be split
161
- #BFPI 980801 - but it might be okay to split - need to check
162
- #([A-Z][a-z]{2,})(?=\d+)
163
- return '([A-Z][a-z]{2,})(?=[A-Za-z]*\d+)'
164
-
165
- @staticmethod
166
- def get_word_digit_mix_prefix():
167
- #Mix of letters and characters - most likely a typo if the
168
- #following characters is a capital letter followed by more than
169
- #2 small letters
170
- #return '([A-Z]+\d+([A-Z]+(?!([a-z]{2,}))))(?=(\W+|([A-Z][a-z]{2,})|[a-z]{3,}))'
171
- return '([A-Z]+\d+)(?=(\W+|([A-Z][a-z]{2,})|[a-z]{3,}))'
172
-
173
- @staticmethod
174
- def get_word_digit_mix_prefix_small():
175
- #Mix of letters and characters - most likely a typo if the
176
- #following characters is a capital letter followed by more than
177
- #2 small letters
178
- return '([a-z]+\d+)(?=(\W+|[A-Z][a-z]{2,}|[A-Z]{3,}))'
179
-
180
- @staticmethod
181
- def get_word_id_split_prefix():
182
- return '([a-zA-Z]+)(?=(\d+[-./]+(\d+|$)))'
183
-
184
- @staticmethod
185
- def get_word_section_prefix():
186
- #Fix JOHNID/CC - missing space from previous section - JOHN
187
- return '([A-Za-z]+)(?=(((?P<slash>[/:]+)[A-Za-z]+)((?P=slash)[A-Za-z]+)*\W+\d+))'
188
-
189
- @staticmethod
190
- def get_colon_prefix():
191
- #Split tokens before and after the token
192
- #Does not split time - we make sure the token before the colon
193
- #starts with a letter.
194
- #Splits patterns like <CHAR 1>:<CHAR 2> where CHAR 1 starts with a
195
- #letter and is followed by one or more letters/digits
196
- #CHAR 2 is a combination of letters/digits of length greater than 2
197
- #This wont split time, but assumes that when the colon is present
198
- #the entities on either side of the token are different tokens
199
- #A:9 - not split - more likely this makes sense as a single token (could be a chemical)
200
- return '([A-Za-z][A-Za-z0-9]+)(?=([:][A-Za-z0-9]{2,}))'
201
-
202
- @staticmethod
203
- def get_temperature_prefix(split_temperature):
204
- if(split_temperature):
205
- return '((\d+)|(\d+[.]\d+))(?=(\u00B0([FCK]{1}|$)))'
206
- else:
207
- return '(((\d+)|(\d+[.]\d+))\u00B0([FCK]{1}|$))|(\u00A9[FCK]{1})'
208
-
209
- @staticmethod
210
- def get_percentage_prefix(split_percentage):
211
- """
212
- If text is of the format take it 20% times, this function
213
- can be used to treat the entire thing as one token or
214
- split into two separate tokens
215
- Args:
216
- split_percentage (bool): whether to treat it as one token or split them up
217
- Returns:
218
- (str): regex to either keep as one token or split into two
219
- """
220
- if(split_percentage):
221
- return '(((\d+)|(\d+[.]\d+)))(?=(%(\W+|$)))'
222
- else:
223
- return '(((\d+)|(\d+[.]\d+))%(\W+|$))'
224
-
225
- @staticmethod
226
- def get_value_range_prefixes():
227
- #The following regex might not work on .4-.5 - no number before decimal point
228
- #need to figure this out without breaking anything else
229
- value_range_1 = '(\d{1})(?=([-]((\d{1,2}|(\d+)[.](\d+)))([a-zA-Z]+|[\W]*$)))'
230
- value_range_2 = '(\d{2})(?=([-]((\d{2,3}|(\d+)[.](\d+)))([a-zA-Z]+|[\W]*$)))'
231
- value_range_3 = '(\d{3})(?=([-]((\d{3}|(\d+)[.](\d+)))([a-zA-Z]+|[\W]*$)))'
232
- return value_range_1, value_range_2, value_range_3
233
-
234
- @staticmethod
235
- def get_year_range_prefix():
236
- return '(\d{4})(?=([-](\d{4})([a-zA-Z]+|[\W]*$)))'
237
-
238
- @staticmethod
239
- def get_short_digit_id_prefix():
240
- #4A, 3C etc
241
- return '(\d{1,2}[A-EG-LN-WZ]{1}(?=(\W+|$)))'
242
-
243
- #Handles Case 1, 2 of MRN
244
- #Handles Case 1, 2, 3, 4, 5, 6, 7, 8, 9, 12, 13, 14, 15, 16, 17, 18, 19 of AGE
245
- #Handles Case 2, 3, 5 of IDNUM
246
- #Handles Case 1 of HOSP
247
- @staticmethod
248
- def get_digit_symbol_word_prefix():
249
- return '((\d+)|(\d+[.]\d+))(?=\W+[a-zA-Z]+)'
250
-
251
- @staticmethod
252
- def get_digit_age_split_prefix():
253
- age_suffix = '(([yY][eE][aA][rR]|[yY][oO]' + \
254
- '|[yY][rR]|[yY]\.[oO]|[yY]/[oO]|[fF]|[mM]|[yY])' + \
255
- '(-)*([o|O][l|L][d|D]|[f|F]|[m|M]|[o|O])?)'
256
- return '((\d{1,3}))(?=(' + age_suffix + '\W*$))'
257
-
258
- @staticmethod
259
- def get_digit_word_short_prefix():
260
- return '((\d+)|(\d+[.]\d+))([a-z]{1,2}|[A-Z]{1,2})(?=(\W*$))'
261
-
262
- @staticmethod
263
- def get_digit_word_typo_prefix():
264
- return '((\d+)|(\d+[.]\d+))(?=[a-zA-Z]{1}[a-zA-Z\W]+)'
265
-
266
- @staticmethod
267
- def get_prefixes(split_multiple, split_temperature, split_percentage):
268
- word_typo_prefix = ClinicalRegex.get_word_typo_prefix()
269
- word_symbol_digit_prefix = ClinicalRegex.get_word_symbol_digit_prefix()
270
- pager_prefix = ClinicalRegex.get_pager_prefix()
271
- age_word_prefix = ClinicalRegex.get_age_word_prefix()
272
- word_period_prefix = ClinicalRegex.get_word_period_prefix()
273
- id_prefix = ClinicalRegex.get_id_prefix()
274
- multiple_prefix = ClinicalRegex.get_multiple_prefix(split_multiple)
275
- chemical_prefix = ClinicalRegex.get_chemical_prefix()
276
- chemical_prefix_small = ClinicalRegex.get_chemical_prefix_small()
277
- instrument_prefix = ClinicalRegex.get_instrument_prefix()
278
- instrument_prefix_small = ClinicalRegex.get_instrument_prefix_small()
279
- age_typo_prefix = ClinicalRegex.get_age_typo_prefix()
280
- word_digit_split_prefix = ClinicalRegex.get_word_digit_split_prefix()
281
- word_digit_mix_prefix = ClinicalRegex.get_word_digit_mix_prefix()
282
- word_digit_mix_prefix_small = ClinicalRegex.get_word_digit_mix_prefix_small()
283
- word_id_split_prefix = ClinicalRegex.get_word_id_split_prefix()
284
- word_section_prefix = ClinicalRegex.get_word_section_prefix()
285
- colon_prefix = ClinicalRegex.get_colon_prefix()
286
- temperature_prefix = ClinicalRegex.get_temperature_prefix(split_temperature)
287
- percentage_prefix = ClinicalRegex.get_percentage_prefix(split_percentage)
288
- value_range_1, value_range_2, value_range_3 = ClinicalRegex.get_value_range_prefixes()
289
- year_range_prefix = ClinicalRegex.get_year_range_prefix()
290
- short_digit_id_prefix = ClinicalRegex.get_short_digit_id_prefix()
291
- digit_symbol_word_prefix = ClinicalRegex.get_digit_symbol_word_prefix()
292
- digit_age_split_prefix = ClinicalRegex.get_digit_age_split_prefix()
293
- digit_word_short_prefix = ClinicalRegex.get_digit_word_short_prefix()
294
- digit_word_typo_prefix = ClinicalRegex.get_digit_word_typo_prefix()
295
-
296
- return [word_typo_prefix, word_symbol_digit_prefix, pager_prefix, age_word_prefix,\
297
- word_period_prefix, id_prefix, multiple_prefix, chemical_prefix, chemical_prefix_small,\
298
- instrument_prefix, instrument_prefix_small, age_typo_prefix, word_digit_split_prefix,\
299
- word_id_split_prefix, word_digit_mix_prefix, word_digit_mix_prefix_small, \
300
- word_section_prefix, colon_prefix, temperature_prefix,\
301
- percentage_prefix, value_range_1, value_range_2, value_range_3, year_range_prefix,\
302
- short_digit_id_prefix, digit_symbol_word_prefix, digit_age_split_prefix,\
303
- digit_word_short_prefix, digit_word_typo_prefix]
304
-
305
- @staticmethod
306
- def get_infixes():
307
- digit_infix = '(\d+(?P<sep>[-:]+)\d+((?P=sep)\d+)*)'
308
- return [digit_infix, ]
309
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ner_datasets/preprocessing/tokenizers/utils/date_regex.py DELETED
@@ -1,104 +0,0 @@
1
- class DateRegex(object):
2
-
3
- @staticmethod
4
- def __get_day_attributes():
5
- # day of the month with optional suffix, such as 7th, 22nd
6
- dd = '(([0-2]?[0-9]|3[01])(\s*)([sS][tT]|[nN][dD]|[rR][dD]|[tT][hH])?)'
7
- # two-digit numeric day of the month
8
- DD = '(0[0-9]|[1-2][0-9]|3[01])'
9
-
10
- return dd, DD
11
-
12
- @staticmethod
13
- def __get_month_attributes():
14
-
15
- m = \
16
- '([jJ][aA][nN]([uU][aA][rR][yY])?|'+\
17
- '[fF][eE][bB]([rR][uU][aA][rR][yY])?|'+\
18
- '[mM][aA][rR]([cC][hH])?|'+\
19
- '[aA][pP][rR]([iI][lL])?|'+\
20
- '[mM][aA][yY]|'+\
21
- '[jJ][uU][nN]([eE])?|'+\
22
- '[jJ][uU][lL]([yY])?|'+\
23
- '[aA][uU][gG]([uU][sS][tT])?|'+\
24
- '[sS][eE][pP]([tT][eE][mM][bB][eE][rR])?|'+\
25
- '[oO][cC][tT]([oO][bB][eE][rR])?|'+\
26
- '[nN][oO][vV]([eE][mM][bB][eE][rR])?|'+\
27
- '[dD][eE][cC]([eE][mM][bB][eE][rR])?)'
28
- M = m
29
-
30
- # numeric month
31
- mm = '(0?[0-9]|1[0-2]|' + m + ')'
32
-
33
- # two digit month
34
- MM = '(0[0-9]|1[0-2]|' + m + ')'
35
-
36
- return m, M, mm, MM
37
-
38
- @staticmethod
39
- def __get_year_attributes():
40
-
41
- # two or four digit year
42
- y = '([0-9]{4}|[0-9]{2})'
43
-
44
- # two digit year
45
- yy = '([0-9]{2})'
46
-
47
- # four digit year
48
- YY = '([0-9]{4})'
49
-
50
- return y, yy, YY
51
-
52
- @staticmethod
53
- def __get_sep_attributes():
54
-
55
- date_sep = '[-./]'
56
- date_sep_optional = '[-./]*'
57
- date_sep_no_full = '[-/]'
58
-
59
- return date_sep, date_sep_optional, date_sep_no_full
60
-
61
- #def get_week_attributes():
62
- # w = \
63
- # '([mM][oO][nN]([dD][aA][yY])?|'+\
64
- # '[tT][uU][eE]([sS][dD][aA][yY])?|'+\
65
- # '[wW][eE][dD]([nN][eE][sS][dD][aA][yY])?|'+\
66
- # '[tT][hH][uU][gG]([uU][sS][tT])?|'+\
67
- # '[sS][eE][pP]([tT][eE][mM][bB][eE][rR])?|'+\
68
- # '[oO][cC][tT]([oO][bB][eE][rR])?|'+\
69
- # '[nN][oO][vV]([eE][mM][bB][eE][rR])?|'+\
70
- # '[dD][eE][cC]([eE][mM][bB][eE][rR])?)'
71
-
72
- @staticmethod
73
- def get_infixes():
74
-
75
- dd, DD = DateRegex.__get_day_attributes()
76
- m, M, mm, MM = DateRegex.__get_month_attributes()
77
- y, yy, YY = DateRegex.__get_year_attributes()
78
- date_sep, date_sep_optional, date_sep_no_full = DateRegex.__get_sep_attributes()
79
-
80
- date_1 = y + '/' + mm + '/' + dd + '(?!([/]+|\d+))'
81
- date_2 = y + '/' + dd + '/' + mm + '(?!([/]+|\d+))'
82
- date_3 = dd + '/' + mm + '/' + y + '(?!([/]+|\d+))'
83
- date_4 = mm + '/' + dd + '/' + y + '(?!([/]+|\d+))'
84
- #Do I make this optional (date_sep_optional) - need to check
85
- date_5 = y + date_sep + m + date_sep + dd + '(?!\d)'
86
- date_6 = y + date_sep + dd + date_sep + m
87
- date_7 = dd + date_sep + m + date_sep + y
88
- date_8 = m + date_sep + dd + date_sep + y
89
- date_9 = y + date_sep + m
90
- date_10 = m + date_sep + y
91
- date_11 = dd + date_sep + m
92
- date_12 = m + date_sep + dd
93
- date_13 = '(?<!([/]|\d))' + y + '/' + dd + '(?!([/]+|\d+))'
94
- date_14 = '(?<!([/]|\d))' + y + '/' + dd + '(?!([/]+|\d+))'
95
- date_15 = '(?<!([/]|\d))' + dd + '/' + y + '(?!([/]+|\d+))'
96
- date_16 = '(?<!([/]|\d))' + mm + '/' + y + '(?!([/]+|\d+))'
97
- date_17 = '(?<!([/]|\d))' + dd + '/' + mm + '(?!([/]+|\d+))'
98
- date_18 = '(?<!([/]|\d))' + mm + '/' + dd + '(?!([/]+|\d+))'
99
-
100
- date_infixes = [date_1, date_2, date_3, date_4, date_5, date_6,\
101
- date_7, date_8, date_9, date_10, date_11, date_12,\
102
- date_13, date_14, date_15, date_16, date_17, date_18]
103
-
104
- return date_infixes
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ner_datasets/span_fixer.py DELETED
@@ -1,380 +0,0 @@
1
- import re
2
- import json
3
- from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
4
- from typing import Iterable, Dict, List, Sequence, Union, Mapping, Tuple, NoReturn
5
-
6
- from .preprocessing import PreprocessingLoader
7
-
8
-
9
class SpanFixer(object):
    """
    Align annotated NER spans with token boundaries.
    The tokens and spans may not align depending on the tokenizer used.
    A mismatch is when a span_start does not coincide with some token_start or the span_end
    does not coincide with some token_end. This class expands the span so that the span_start
    coincides with some token_start and the span_end coincides with some token_end - and we
    don't get any position mismatch errors while building our dataset. Updating span positions
    can lead to duplicate or overlapping spans, which then need to be removed.
    E.g we have text: The patient is 75yo man
    AGE Span: 75
    Token: 75yo
    As you can see the span is smaller than the token, which will lead to an error when
    building the NER dataset.
    To ensure this does not happen, we correct the span. We change the span from
    75 to 75yo -> So now AGE Span is 75yo instead of 75. This script essentially changes
    the annotated spans to match the tokens. In an ideal case we wouldn't need this script
    but since medical notes have many typos, this script becomes necessary to deal with
    issues and changes that arise from different tokenizers.
    The spans are also sorted by their (start, end) positions.
    """

    # Whitespace at the edges of a span (compiled once, used for every span)
    _LEADING_SPACE = re.compile(r'^\s+')
    _TRAILING_SPACE = re.compile(r'\s+$')

    def __init__(
            self,
            sentencizer: str,
            tokenizer: str,
            ner_priorities: Mapping[str, int],
            verbose: bool = True
    ) -> None:
        """
        Initialize the sentencizer and tokenizer
        Args:
            sentencizer (str): The sentencizer to use for splitting text into sentences
            tokenizer (str): The tokenizer to use for splitting text into tokens
            ner_priorities (Mapping[str, int]): The priority when choosing which duplicates to remove.
                                                Mapping that represents a priority for each NER type
            verbose (bool): To print out warnings etc
        """
        self._sentencizer = PreprocessingLoader.get_sentencizer(sentencizer)
        self._tokenizer = PreprocessingLoader.get_tokenizer(tokenizer)
        self._ner_priorities = ner_priorities
        self._verbose = verbose

    @staticmethod
    def _validate_positions(start, end) -> None:
        """
        Make sure the start and end positions of a span are integers.
        Args:
            start: The start position of the span
            end: The end position of the span
        Raises:
            ValueError: If either position is not an int
        """
        if not isinstance(start, int) or not isinstance(end, int):
            raise ValueError('The start and end keys of the span must be of type int')

    def __get_token_positions(self, text: str) -> Tuple[set, set]:
        """
        Get the start and end positions of all the tokens in the note.
        Args:
            text (str): The text present in the note
        Returns:
            token_start_positions (set): The start positions of all the tokens in the note
            token_end_positions (set): The end positions of all the tokens in the note
        """
        token_start_positions = set()
        token_end_positions = set()
        for sentence in self._sentencizer.get_sentences(text):
            # Token positions are relative to the sentence, shift them to note positions
            offset = sentence['start']
            for token in self._tokenizer.get_tokens(sentence['text']):
                token_start_positions.add(token['start'] + offset)
                token_end_positions.add(token['end'] + offset)
        return token_start_positions, token_end_positions

    def get_duplicates(
            self,
            spans: List[Dict[str, Union[str, int]]],
    ) -> List[int]:
        """
        Return the indexes where there are duplicate/overlapping spans. A duplicate
        span is one where the same token can have two labels.
        E.g:
        Token: BWH^Bruce
        This is a single token where BWH is the hospital label and Bruce is the Patient label
        The fix_alignment function assigns this entire token the hospital label but it also
        assigns this entire token the patient label. Since we have two labels for the same
        token, we need to remove one of them.
        We assign this entire token one label - either hospital label or the patient label
        In this case we assign patient because of higher priority. So now we need to remove
        the hospital label from the dataset (since it is essentially a duplicate label).
        There are cases when two different labels match the same token partially
        E.g
        Text: JT/781-815-9090
        Spans: JT - hospital, 781-815-9090 - Phone
        Tokens: (Jt/781) & (- 815 - 9090)
        As you can see the token JT/781 will be assigned the label in the fix_alignment function
        but 781-815-9090 is also phone and the 781 portion is overlapped, and we need to resolve this.
        We resolve it by treating JT/781 as one span (hospital) and
        -815-9090 as another span (phone).
        The spans list is sorted in place and overlapping spans are trimmed in place, the
        returned indexes refer to the sorted list.
        Args:
            spans ([List[Dict[str, Union[str, int]]]): The NER spans in the note
        Returns:
            remove_spans (List[int]): A list of indexes of the spans to remove
        Raises:
            ValueError: If the start or end position of a span is not an int
        """
        # Validate before sorting, so that malformed positions are always reported
        # as a ValueError and never as a comparison error raised by the sort
        for span in spans:
            self._validate_positions(span['start'], span['end'])
        remove_spans = list()
        prev_start = -1
        prev_end = -1
        prev_label = None
        prev_index = None
        spans.sort(key=lambda _span: (_span['start'], _span['end']))
        for index, span in enumerate(spans):
            current_start = span['start']
            current_end = span['end']
            current_label = span['label']
            # Whether the current span becomes the span the following spans are compared to
            track_current = True
            if current_start == prev_start and current_end == prev_end:
                # The current span covers the same tokens as the previous span (but has a
                # different label). The span with the lower priority label is removed.
                if self._ner_priorities[current_label] > self._ner_priorities[prev_label]:
                    remove_spans.append(prev_index)
                    if self._verbose:
                        print('DUPLICATE: ', span)
                        print('REMOVED: ', spans[prev_index])
                else:
                    remove_spans.append(index)
                    track_current = False
                    if self._verbose:
                        print('DUPLICATE: ', spans[prev_index])
                        print('REMOVED: ', span)
            elif current_start < prev_end:
                # The current span overlaps the previous span
                if current_end <= prev_end:
                    # The current span lies inside the previous span, remove it since it is smaller
                    remove_spans.append(index)
                    track_current = False
                    if self._verbose:
                        print('DUPLICATE: ', spans[prev_index])
                        print('REMOVED: ', span)
                else:
                    # The current span extends past the previous span. Keep the previous span
                    # as it is and shrink the current span so that it starts after the overlap
                    overlap_length = prev_end - current_start
                    # Remove extra spaces that may arise during this span separation
                    new_text = self._LEADING_SPACE.sub('', span['text'][overlap_length:])
                    span['start'] = current_end - len(new_text)
                    span['text'] = new_text
                    if self._verbose:
                        print('OVERLAP: ', spans[prev_index])
                        print('UPDATED: ', span)
            if track_current:
                # The positions read before any trimming are the ones that are tracked
                prev_start = current_start
                prev_end = current_end
                prev_label = current_label
                prev_index = index
        return remove_spans

    def fix_alignment(
            self,
            text: str,
            spans: Sequence[Dict[str, Union[str, int]]]
    ) -> Iterable[Dict[str, Union[str, int]]]:
        """
        Align the span and tokens. When the tokens and spans don't align, we change the
        start and end positions of the spans so that they align with the tokens. This is
        needed when a different tokenizer is used and the spans which are defined against
        a different tokenizer don't line up with the new tokenizer. Also remove spaces present
        at the start or end of the span.
        E.g:
        Token: BWH^Bruce
        This is a single token where BWH is the hospital label and Bruce is the Patient label
        This function assigns this entire token the hospital label but it also
        assigns this entire token the patient label. This function basically expands the span
        so that it matches the start and end positions of some token. By doing this it may create
        overlapping and duplicate spans. As you can see it expands the patient label to match the
        start of the token and it expands the hospital label to match the end of the token.
        The span dictionaries are updated in place.
        Args:
            text (str): The text present in the note
            spans ([Sequence[Dict[str, Union[str, int]]]): The NER spans in the note
        Returns:
            (Iterable[Dict[str, Union[str, int]]]): Iterable through the modified spans
        Raises:
            ValueError: If the start or end position of a span is not an int, or if there is
                        no token boundary that the span can be expanded to
        """
        # Get token start and end positions so that we can check if a span
        # coincides with the start and end position of some token.
        token_start_positions, token_end_positions = self.__get_token_positions(text)
        text_length = len(text)
        for span in spans:
            start = span['start']
            end = span['end']
            self._validate_positions(start, end)
            old_start, old_end = start, end
            # Skip all the whitespace at the start of the span - if some of it is left behind,
            # the expansion below would grow the span backwards into the previous token
            leading_space = self._LEADING_SPACE.match(text[start:end])
            if leading_space:
                if self._verbose:
                    print('WARNING - space present in the start of the span')
                start += leading_space.end()
            # Drop the whitespace at the end of the span
            span_text = text[start:end]
            trimmed_text = self._TRAILING_SPACE.sub('', span_text)
            if len(trimmed_text) != len(span_text):
                end = start + len(trimmed_text)
            # When a span does not coincide with the start and end position of some token
            # it means there will be an error when building the ner dataset, we try and avoid
            # that error by updating the spans itself, that is we expand the start/end positions
            # of the spans so that it is aligned with the tokens. The search is bounded by the
            # text, so that a span without a token on one side cannot loop forever.
            while start not in token_start_positions:
                if start <= 0:
                    raise ValueError('No token starts at or before the start of the span: ' + str(span))
                start -= 1
            while end not in token_end_positions:
                if end >= text_length:
                    raise ValueError('No token ends at or after the end of the span: ' + str(span))
                end += 1
            # Print what the old span was and what the new expanded span will look like
            if self._verbose and (old_start != start or old_end != end):
                print('OLD SPAN: ', text[old_start:old_end])
                print('NEW SPAN: ', text[start:end])
            # Update the span with its new start and end positions
            span['start'] = start
            span['end'] = end
            span['text'] = text[start:end]
            yield span

    def fix_note(
            self,
            text: str,
            spans: Sequence[Dict[str, Union[str, int]]],
    ) -> Iterable[Dict[str, Union[str, int]]]:
        """
        This function changes the span_start and span_end
        so that the span_start will coincide with some token_start and the span_end
        will coincide with some token_end and also removes duplicate/overlapping spans
        that may arise when we change the span start and end positions. For more details
        and examples check the documentation of the fix_alignment and get_duplicates functions.
        Args:
            text (str): The text present in the note
            spans ([Sequence[Mapping[str, Union[str, int]]]): The NER spans in the note
        Returns:
            (Iterable[Mapping[str, Union[str, int]]]): Iterable through the fixed spans
        """
        # Fix span position alignment
        spans = list(self.fix_alignment(text=text, spans=spans))
        # Check for duplicate/overlapping spans. This sorts the list in place, so
        # the returned indexes line up with the enumeration below.
        remove_spans = set(self.get_duplicates(spans=spans))
        for index, span in enumerate(spans):
            # Remove the duplicate/overlapping spans
            if index not in remove_spans:
                yield span

    def fix(
            self,
            input_file: str,
            text_key: str = 'text',
            spans_key: str = 'spans'
    ) -> Iterable[Dict[str, Union[str, Dict[str, str], List[Dict[str, str]]]]]:
        """
        Fix the spans of every note in a jsonl file. The spans of each note are updated so
        that they line up with the start and end positions of tokens and duplicate/overlapping
        spans are removed - so that there is no error when we assign labels to tokens based
        on these spans. For more details and examples check the documentation of the
        fix_alignment and get_duplicates functions.
        Args:
            input_file (str): The file that contains the notes that we want to fix the token issues in
            text_key (str) the key where the note & token text is present in the json object
            spans_key (str): The key where the note spans are present in the json object
        Returns:
            (Iterable[Dict[str, Union[str, Dict[str, str], List[Dict[str, str]]]]]): Iterable through the fixed
                                                                                     notes
        """
        # The file is closed once all the notes have been read (or the iterable is closed)
        with open(input_file, 'r') as file:
            for line in file:
                # Ignore empty lines (e.g a trailing newline at the end of the file)
                if not line.strip():
                    continue
                note = json.loads(line)
                note[spans_key] = list(self.fix_note(text=note[text_key], spans=note[spans_key]))
                yield note
295
-
296
-
297
def main():
    """
    Command line entry point: read the notes from a jsonl file, fix the spans of every
    note with the SpanFixer and write the fixed notes to the output jsonl file.
    Raises:
        ValueError: If the number of NER types and the number of NER priorities differ
    """
    # The following code sets up the arguments to be passed via CLI or via a JSON file
    cli_parser = ArgumentParser(
        description='configuration arguments provided at run time from the CLI',
        formatter_class=ArgumentDefaultsHelpFormatter
    )
    cli_parser.add_argument(
        '--input_file',
        type=str,
        required=True,
        help='the the jsonl file that contains the notes'
    )
    cli_parser.add_argument(
        '--sentencizer',
        type=str,
        required=True,
        help='the sentencizer to use for splitting notes into sentences'
    )
    cli_parser.add_argument(
        '--tokenizer',
        type=str,
        required=True,
        help='the tokenizer to use for splitting text into tokens'
    )
    # NOTE(review): this argument is parsed but never passed on to the SpanFixer
    cli_parser.add_argument(
        '--abbreviations_file',
        type=str,
        default=None,
        help='file that will be used by clinical tokenizer to handle abbreviations'
    )
    # The keyword is "required" - argparse rejects the misspelt "require" with a TypeError
    cli_parser.add_argument(
        '--ner_types',
        nargs="+",
        required=True,
        help='the NER types'
    )
    # Priorities are parsed as integers so that they are compared numerically
    # (as strings '10' would be treated as lower than '9')
    cli_parser.add_argument(
        '--ner_priorities',
        nargs="+",
        type=int,
        required=True,
        help='the priorities for the NER types - the priority when choosing which duplicates to remove'
    )
    cli_parser.add_argument(
        '--text_key',
        type=str,
        default='text',
        help='the key where the note & token text is present in the json object'
    )
    cli_parser.add_argument(
        '--spans_key',
        type=str,
        default='spans',
        help='the key where the note spans is present in the json object'
    )
    cli_parser.add_argument(
        '--output_file',
        type=str,
        required=True,
        help='the output json file that will contain the new fixed spans'
    )
    args = cli_parser.parse_args()
    # Mapping that represents a priority for each PHI type
    # For example, the PATIENT type will have a higher priority as
    # compared to STAFF.
    if len(args.ner_types) != len(args.ner_priorities):
        raise ValueError('Length of ner_types and ner_priorities must be the same')
    ner_priorities = dict(zip(args.ner_types, args.ner_priorities))
    span_fixer = SpanFixer(
        tokenizer=args.tokenizer,
        sentencizer=args.sentencizer,
        ner_priorities=ner_priorities
    )
    # Write one fixed note per line
    with open(args.output_file, 'w') as file:
        for note in span_fixer.fix(
                input_file=args.input_file,
                text_key=args.text_key,
                spans_key=args.spans_key
        ):
            file.write(json.dumps(note) + '\n')


if __name__ == '__main__':
    main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ner_datasets/span_validation.py DELETED
@@ -1,91 +0,0 @@
1
- import json
2
- import random
3
- from argparse import ArgumentParser
4
- from typing import Union, NoReturn, Iterable, Dict, List
5
-
6
- random.seed(41)
7
-
8
-
9
class SpanValidation(object):
    """
    This class is used to build a mapping between the note id
    and the annotated spans in that note. This will be used during the
    evaluation of the models. This is required to perform span level
    evaluation.
    """
    @staticmethod
    def get_spans(
            input_file: str,
            metadata_key: str = 'meta',
            note_id_key: str = 'note_id',
            spans_key: str = 'spans'
    ) -> Iterable[Dict[str, Union[str, List[Dict[str, str]]]]]:
        """
        Get a mapping between the note id
        and the annotated spans in that note. This will mainly be used during the
        evaluation of the models.
        Args:
            input_file (str): The input file
            metadata_key (str): The key where the note metadata is present
            note_id_key (str): The key where the note id is present
            spans_key (str): The key that contains the annotated spans for a note dictionary
        Returns:
            (Iterable[Dict[str, Union[str, List[Dict[str, str]]]]]): An iterable that iterates through each note
                                                                     and contains the note id and annotated spans
                                                                     for that note
        """
        # Read the input file (data source). The file is closed once all the
        # notes have been read (or the iterable is closed)
        with open(input_file, 'r') as file:
            for line in file:
                # Ignore empty lines (e.g a trailing newline at the end of the file)
                if not line.strip():
                    continue
                note = json.loads(line)
                note_id = note[metadata_key][note_id_key]
                # Store the note_id and the annotated spans (ordered by their position)
                note[spans_key].sort(key=lambda _span: (_span['start'], _span['end']))
                yield {'note_id': note_id, 'note_spans': note[spans_key]}
44
-
45
-
46
def main() -> None:
    """
    Command line entry point: read the notes from a jsonl file and write the note id
    and the annotated spans of every note to the output file (one json object per line).
    """
    cli_parser = ArgumentParser(description='configuration arguments provided at run time from the CLI')
    cli_parser.add_argument(
        '--input_file',
        type=str,
        required=True,
        help='the the jsonl file that contains the notes'
    )
    cli_parser.add_argument(
        '--metadata_key',
        type=str,
        default='meta',
        help='the key where the note metadata is present in the json object'
    )
    cli_parser.add_argument(
        '--note_id_key',
        type=str,
        default='note_id',
        help='the key where the note id is present in the json object'
    )
    # The value is read as args.spans_key below. The option used to be declared only as
    # --span_text_key, which made that lookup fail - both spellings are accepted now.
    cli_parser.add_argument(
        '--spans_key',
        '--span_text_key',
        dest='spans_key',
        type=str,
        default='spans',
        help='the key where the annotated spans for the notes are present in the json object'
    )
    cli_parser.add_argument(
        '--output_file',
        type=str,
        required=True,
        help='the file where the note id and the corresponding spans for that note are to be saved'
    )
    args = cli_parser.parse_args()

    # Write the dataset to the output file
    with open(args.output_file, 'w') as file:
        for span_info in SpanValidation.get_spans(
                input_file=args.input_file,
                metadata_key=args.metadata_key,
                note_id_key=args.note_id_key,
                spans_key=args.spans_key):
            file.write(json.dumps(span_info) + '\n')


if __name__ == "__main__":
    main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
sequence_tagging/.DS_Store DELETED
Binary file (6.15 kB)
 
sequence_tagging/__init__.py DELETED
@@ -1,2 +0,0 @@
1
- from .sequence_tagger import SequenceTagger
2
- __all__ = ["SequenceTagger"]
 
 
 
sequence_tagging/__pycache__/__init__.cpython-37.pyc DELETED
Binary file (267 Bytes)
 
sequence_tagging/__pycache__/sequence_tagger.cpython-37.pyc DELETED
Binary file (13.6 kB)
 
sequence_tagging/arguments/__init__.py DELETED
@@ -1,8 +0,0 @@
1
- from .model_arguments import ModelArguments
2
- from .evaluation_arguments import EvaluationArguments
3
- from .data_training_arguments import DataTrainingArguments
4
- __all__ = [
5
- "ModelArguments",
6
- "DataTrainingArguments",
7
- "EvaluationArguments",
8
- ]
 
 
 
 
 
 
 
 
 
sequence_tagging/arguments/data_training_arguments.py DELETED
@@ -1,115 +0,0 @@
1
- from typing import Optional
2
- from dataclasses import dataclass, field
3
-
4
@dataclass
class DataTrainingArguments:
    """
    Options that describe the data fed to the model for training and evaluation.
    Every field stores its command line description under the "help" key of its metadata.
    """
    # Task and label scheme
    task_name: Optional[str] = field(
        metadata={'help': 'The name of the task (ner, pos...).'},
        default='ner',
    )
    notation: str = field(
        metadata={'help': 'NER notation e.g BIO'},
        default='BIO',
    )
    ner_types: Optional[str] = field(
        metadata={'help': 'Pass a list of NER types'},
        default=None,
    )
    # Input and output files
    train_file: Optional[str] = field(
        metadata={'help': 'The input training data file (a csv or JSON file).'},
        default=None,
    )
    validation_file: Optional[str] = field(
        metadata={'help': 'An optional input evaluation data file to evaluate on (a csv or JSON file).'},
        default=None,
    )
    test_file: Optional[str] = field(
        metadata={'help': 'An optional input test data file to predict on (a csv or JSON file).'},
        default=None,
    )
    output_predictions_file: Optional[str] = field(
        metadata={'help': 'A location where to write the output of the test data'},
        default=None,
    )
    # Columns of the input files
    text_column_name: Optional[str] = field(
        metadata={'help': 'The column name of text to input in the file (a csv or JSON file).'},
        default='tokens',
    )
    label_column_name: Optional[str] = field(
        metadata={'help': 'The column name of label to input in the file (a csv or JSON file).'},
        default='labels',
    )
    # Pre-processing
    overwrite_cache: bool = field(
        metadata={'help': 'Overwrite the cached training and evaluation sets'},
        default=False,
    )
    preprocessing_num_workers: Optional[int] = field(
        metadata={'help': 'The number of processes to use for the preprocessing.'},
        default=None,
    )
    pad_to_max_length: bool = field(
        metadata={
            'help': (
                'Whether to pad all samples to model maximum sentence length. '
                'If False, will pad the samples dynamically when batching to the maximum length in the batch. More '
                'efficient on GPU but very bad for TPU.'
            )
        },
        default=False,
    )
    truncation: bool = field(
        metadata={'help': 'Activates and controls truncation'},
        default=True,
    )
    max_length: int = field(
        metadata={'help': 'Controls the maximum length to use by one of the truncation/padding parameters.'},
        default=512,
    )
    do_lower_case: bool = field(
        metadata={'help': 'Whether to lowercase the text'},
        default=False,
    )
    # Limits on the number of examples
    max_train_samples: Optional[int] = field(
        metadata={
            'help': (
                'For debugging purposes or quicker training, truncate the number of training examples to this '
                'value if set.'
            )
        },
        default=None,
    )
    max_eval_samples: Optional[int] = field(
        metadata={
            'help': (
                'For debugging purposes or quicker training, truncate the number of evaluation examples to this '
                'value if set.'
            )
        },
        default=None,
    )
    max_predict_samples: Optional[int] = field(
        metadata={
            'help': (
                'For debugging purposes or quicker training, truncate the number of prediction examples to this '
                'value if set.'
            )
        },
        default=None,
    )
    # Labelling and metrics
    label_all_tokens: bool = field(
        metadata={
            'help': (
                'Whether to put the label for one word on all tokens of generated by that word or just on the '
                'one (in which case the other tokens will have a padding index).'
            )
        },
        default=False,
    )
    return_entity_level_metrics: bool = field(
        metadata={'help': 'Whether to return all the entity levels during evaluation or just the overall ones.'},
        default=True,
    )
    token_ignore_label: str = field(
        metadata={
            'help': (
                'The label that indicates where the tokens will be ignored in loss computation. Used for '
                'indicating context tokens to the model'
            )
        },
        default='NA',
    )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
sequence_tagging/arguments/evaluation_arguments.py DELETED
@@ -1,26 +0,0 @@
1
- from typing import Optional
2
- from dataclasses import dataclass, field
3
-
4
@dataclass
class EvaluationArguments:
    """
    Options that control how the model is evaluated.
    Every field is optional, defaults to None and stores its command line
    description under the "help" key of its metadata.
    """
    model_eval_script: Optional[str] = field(
        metadata={'help': 'The script that is used for evaluation'},
        default=None,
    )
    evaluation_mode: Optional[str] = field(
        metadata={'help': 'Strict or default mode for sequence evaluation'},
        default=None,
    )
    validation_spans_file: Optional[str] = field(
        metadata={
            'help': (
                'A span evaluation data file to evaluate on span level (json file). This will contain a '
                'mapping between the note_ids and note spans'
            )
        },
        default=None,
    )
    ner_type_maps: Optional[str] = field(
        metadata={
            'help': (
                'List that contains the mappings between the original NER types to another set of NER '
                'types. Used mainly for evaluation. to map ner token labels to another set of ner token'
            )
        },
        default=None,
    )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
sequence_tagging/arguments/model_arguments.py DELETED
@@ -1,43 +0,0 @@
1
- from typing import Optional
2
- from dataclasses import dataclass, field
3
-
4
-
5
@dataclass
class ModelArguments:
    """
    Options that select the model, config and tokenizer to fine-tune from and how the
    model output is post processed. Only model_name_or_path has no default value.
    Every field stores its command line description under the "help" key of its metadata.
    """
    # Model, config and tokenizer
    model_name_or_path: str = field(
        metadata={'help': 'Path to pretrained model or model identifier from huggingface.co/models'},
    )
    config_name: Optional[str] = field(
        metadata={'help': 'Pretrained config name or path if not the same as model_name'},
        default=None,
    )
    tokenizer_name: Optional[str] = field(
        metadata={'help': 'Pretrained tokenizer name or path if not the same as model_name'},
        default=None,
    )
    # Download options
    cache_dir: Optional[str] = field(
        metadata={'help': 'Where do you want to store the pretrained models downloaded from huggingface.co'},
        default=None,
    )
    model_revision: str = field(
        metadata={'help': 'The specific model version to use (can be a branch name, tag name or commit id).'},
        default='main',
    )
    use_auth_token: bool = field(
        metadata={
            'help': (
                'Will use the token generated when running `transformers-cli login` (necessary to use this script '
                'with private models).'
            )
        },
        default=False,
    )
    # Post processing of the model logits
    post_process: str = field(
        metadata={'help': 'What post processing to use on the model logits'},
        default='argmax',
    )
    threshold: Optional[float] = field(
        metadata={'help': 'Threshold cutoff for softmax'},
        default=None,
    )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
sequence_tagging/dataset_builder/__init__.py DELETED
@@ -1,5 +0,0 @@
1
- from .ner_labels import NERLabels
2
- from .ner_dataset import NERDataset
3
- from .label_mapper import LabelMapper
4
- from .dataset_tokenizer import DatasetTokenizer
5
- __all__=["NERLabels", "NERDataset", "LabelMapper", "DatasetTokenizer"]
 
 
 
 
 
 
sequence_tagging/dataset_builder/dataset_tokenizer.py DELETED
@@ -1,178 +0,0 @@
1
- from typing import Mapping, Sequence, List, Union, Optional, NoReturn
2
- from datasets import Dataset
3
- from transformers import PreTrainedTokenizerFast, PreTrainedTokenizer
4
-
5
-
6
- class DatasetTokenizer(object):
7
- """
8
- The main goal of this class is to solve the problem described below.
9
- Most of the comments have been copied from the huggingface webpage.
10
- What this class does is initialize a tokenizer with the desired parameters
11
- and then tokenize our dataset and align the tokens with the labels
12
- while keeping in mind the problem & solution described below. We can use this
13
- function to train and for predictions - we just assume the predictions dataset
14
- will have a label column filled with some values (so this code can be re-used).
15
- Now we arrive at a common obstacle with using pre-trained models for
16
- token-level classification: many of the tokens in the dataset may not
17
- be in the tokenizer vocabulary. Bert and many models like it use a method
18
- called WordPiece Tokenization, meaning that single words are split into multiple
19
- tokens such that each token is likely to be in the vocabulary. For example,
20
- the tokenizer would split the date (token) 2080 into the tokens ['208', '##0'].
21
- This is a problem for us because we have exactly one tag per token (2080 -> B-DATE).
22
- If the tokenizer splits a token into multiple sub-tokens, then we will end up with
23
- a mismatch between our tokens and our labels (208, 0) - two tokens but one label (B-DATE).
24
- One way to handle this is to only train on the tag labels for the first subtoken of a
25
- split token. We can do this in huggingface Transformers by setting the labels
26
- we wish to ignore to -100. In the example above, if the label for 2080 is B-DATE
27
- and say the id (from the label to id mapping) for B-DATE is 3, we would set the labels
28
- of ['208', '##0'] to [3, -100]. This tells the model to ignore the tokens labelled with
29
- -100 while updating the weights etc.
30
- """
31
-
32
- def __init__(
33
- self,
34
- tokenizer: Union[PreTrainedTokenizerFast, PreTrainedTokenizer],
35
- token_column: str,
36
- label_column: str,
37
- label_to_id: Mapping[str, int],
38
- b_to_i_label: Sequence[int],
39
- padding: Union[bool, str],
40
- truncation: Union[bool, str],
41
- is_split_into_words: bool,
42
- max_length: Optional[int],
43
- label_all_tokens: bool,
44
- token_ignore_label: Optional[str]
45
- ) -> NoReturn:
46
- """
47
- Set the tokenizer we are using to subword tokenize our dataset. The name of the
48
- column that contains the pre-split tokens, the name of the column that contains
49
- the labels for each token, label to id mapping.
50
- Set the padding strategy of the input. Set whether to truncate the input tokens.
51
- Indicate whether the input is pre-split into tokens. Set the max length of the
52
- input tokens (post subword tokenization). This will be used in conjunction with truncation.
53
- Set whether we want to label even the sub tokens
54
- In the description above we say for 2080 (B-DATE) - [208, ##0]
55
- We do [3, -100] - which says assume the label of token 2080 is the one
56
- predicted for 208 or we can just label both sub tokens
57
- in which case it would be [3, 3] - so we would label 208 as DATE
58
- and ##0 as DATE - then we would have to figure out how to merge these
59
- labels etc
60
- Args:
61
- tokenizer (Union[PreTrainedTokenizerFast, PreTrainedTokenizer]): Tokenizer from huggingface
62
- token_column (str): The column that contains the tokens in the dataset
63
- label_column (str): The column that contains the labels in the dataset
64
- label_to_id (Mapping[str, int]): The mapping between labels and ID
65
- b_to_i_label (Sequence[int]): Indexed by label id; gives the label id used for the non-first sub tokens of a word when label_all_tokens is set
66
- padding (Union[bool, str]): Padding strategy
67
- truncation (Union[bool, str]): Truncation strategy
68
- is_split_into_words (bool): Is the input pre-split(tokenized)
69
- max_length (Optional[int]): Max subword tokenized length for the model
70
- label_all_tokens (bool): Whether to label sub words
71
- token_ignore_label (str): The value of the token ignore label - we ignore these in the loss computation
72
- """
73
- self._tokenizer = tokenizer
74
- self._token_column = token_column
75
- self._label_column = label_column
76
- self._label_to_id = label_to_id
77
- self._b_to_i_label = b_to_i_label
78
- # We can tell the tokenizer that we’re dealing with ready-split tokens rather than full
79
- # sentence strings by passing is_split_into_words=True.
80
- # Set the following parameters using the kwargs
81
- self._padding = padding
82
- self._truncation = truncation
83
- self._is_split_into_words = is_split_into_words
84
- self._max_length = max_length
85
- self._label_all_tokens = label_all_tokens
86
- self._token_ignore_label = token_ignore_label
87
- self._ignore_label = -100
88
-
89
- def tokenize_and_align_labels(self, dataset: Dataset) -> Dataset:
90
- """
91
- This function is the one that is used to read the input dataset
92
- Run the subword tokenization on the pre-split tokens and then
93
- as mentioned above align the subtokens and labels and add the ignore
94
- label. This will read the input - say [60, year, old, in, 2080]
95
- and will return the subtokens - [60, year, old, in, 208, ##0]
96
- some other information like token_type_ids etc
97
- and the labels [0, 20, 20, 20, 3, -100] (0 - corresponds to B-AGE, 20 corresponds to O
98
- and 3 corresponds to B-DATE). This returned input serves as input for training the model
99
- or for gathering predictions from a trained model.
100
- Another important thing to note is that we have mentioned before that
101
- we add chunks of tokens that appear before and after the current chunk for context. We would
102
- also need to assign the label -100 (ignore_label) to these chunks, since we are using them
103
- only to provide context. Basically if a token has the label NA, we don't use it for
104
- training or evaluation. For example the input would be something
105
- like tokens: [James, Doe, 60, year, old, in, 2080, BWH, tomorrow, only],
106
- labels: [NA, NA, B-AGE, O, O, O, B-DATE, NA, NA, NA]. NA represents the tokens used for context
107
- This function would return some tokenizer info (e.g attention mask etc), along with
108
- the information that maps the tokens to the subtokens -
109
- [James, Doe, 60, year, old, in, 208, ##0, BW, ##h, tomorrow, only]
110
- and the labels - [-100, -100, 0, 20, 20, 20, 3, -100, -100, -100, -100, -100]
111
- (if label_all_tokens was true, we would return [-100, -100, 0, 20, 20, 20, 3, 3, -100, -100, -100, -100]).
112
- Args:
113
- dataset (Dataset): The pre-split (tokenized dataset) that contain labels
114
- Returns:
115
- tokenized_inputs (Dataset): Subword tokenized and label aligned dataset
116
- """
117
- # Run the tokenizer - subword tokenization
118
- tokenized_inputs = self._tokenizer(
119
- dataset[self._token_column],
120
- padding=self._padding,
121
- truncation=self._truncation,
122
- max_length=self._max_length,
123
- is_split_into_words=self._is_split_into_words,
124
- )
125
- # Align the subwords and tokens
126
- labels = [self.__get_labels(
127
- labels,
128
- tokenized_inputs.word_ids(batch_index=index)
129
- ) for index, labels in enumerate(dataset[self._label_column])]
130
- tokenized_inputs[self._label_column] = labels
131
-
132
- return tokenized_inputs
133
-
134
- def __get_labels(
135
- self,
136
- labels: Sequence[str],
137
- word_ids: Sequence[int]
138
- ) -> List[int]:
139
- """
140
- Go through the subword tokens - which are given as word_ids. 2 different tokens
141
- 2080 & John will have different word_ids, but the subword tokens 208 & ##0 will
142
- have the same word_id, we use this to align and assign the labels accordingly.
143
- if the subword tokens belong to [CLS], [SEP] append the ignore label (-100) to the
144
- list of labels. If the (2080) subword token (##0) belongs to a token - 2080
145
- then the labels would be [3, -100] if label_all_tokens is false. Also if the token
146
- is used only for context (with label NA) it would get the value -100 for its label
147
- Args:
148
- labels (Sequence[str]): The list of labels for the input (example)
149
- word_ids (Sequence[int]): The word_ids after subword tokenization of the input
150
- Returns:
151
- label_ids (List[int]): The list of label ids for the input with the ignore label (-100) added
152
- as required.
153
- """
154
- label_ids = list()
155
- previous_word_idx = None
156
- for word_idx in word_ids:
157
- # Special tokens have a word id that is None. We set the label to -100 so they are automatically
158
- # ignored in the loss function.
159
- if word_idx is None:
160
- label_ids.append(self._ignore_label)
161
- # We set the label for the first token of each word.
162
- elif word_idx != previous_word_idx:
163
- if labels[word_idx] == self._token_ignore_label:
164
- label_ids.append(self._ignore_label)
165
- else:
166
- label_ids.append(self._label_to_id[labels[word_idx]])
167
- # For the other tokens in a word, we set the label to either the current label or -100, depending on
168
- # the label_all_tokens flag.
169
- else:
170
- if labels[word_idx] == self._token_ignore_label:
171
- label_ids.append(self._ignore_label)
172
- else:
173
- label_ids.append(
174
- self._b_to_i_label[self._label_to_id[labels[word_idx]]]
175
- if self._label_all_tokens else self._ignore_label
176
- )
177
- previous_word_idx = word_idx
178
- return label_ids
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
sequence_tagging/dataset_builder/label_mapper.py DELETED
@@ -1,87 +0,0 @@
1
- from typing import List, Sequence, Mapping, Optional, NoReturn, Dict, Union
2
- from .ner_labels import NERLabels
3
-
4
-
5
- class LabelMapper(object):
6
- """
7
- This class is used to map one set of NER labels to another set of NER labels
8
- For example we might want to map all NER labels to Binary HIPAA labels.
9
- E.g:
10
- We change the token labels - [B-AGE, O, O, U-LOC, B-DATE, L-DATE, O, B-STAFF, I-STAFF, L-STAFF] to
11
- [B-HIPAA, O, O, U-HIPAA, B-HIPAA, L-HIPAA, O, O, O, O] or if we wanted binary I2B2 labels we map it to
12
- [B-I2B2, O, O, U-I2B2, B-I2B2, L-I2B2, O, B-I2B2, I-I2B2, L-I2B2]
13
- We do this mapping at the token and the span level. That is, if we have a span from, say, start=9, end=15
14
- labelled as LOC, we map this label to HIPAA or I2B2. This class maps an existing set of labels to
15
- another set of labels
16
- """
17
-
18
- def __init__(
19
- self,
20
- notation,
21
- ner_types: Sequence[str],
22
- ner_types_maps: Sequence[str],
23
- description: str
24
- ) -> NoReturn:
25
- """
26
- Initialize the variables that will be used to map the NER labels and spans
27
- The ner_map and spans_map should correspond to each other and contain the same NER types
28
- Args:
29
- """
30
- self._description = description
31
- self._types = list(set(ner_types_maps))
32
- self._types.sort()
33
- self._spans_map = {ner_type: ner_type_map for ner_type, ner_type_map in zip(ner_types, ner_types_maps)}
34
- ner_labels = NERLabels(notation=notation, ner_types=ner_types)
35
- self._ner_map = dict()
36
- for label in ner_labels.get_label_list():
37
- if label == 'O' or self._spans_map[label[2:]] == 'O':
38
- self._ner_map[label] = 'O'
39
- else:
40
- self._ner_map[label] = label[0:2] + self._spans_map[label[2:]]
41
-
42
- def map_sequence(self, tag_sequence: Sequence[str]) -> List[str]:
43
- """
44
- Mapping a sequence of NER labels to another set of NER labels.
45
- E.g: If we use a binary HIPAA mapping
46
- This sequence [B-AGE, O, O, U-LOC, B-DATE, L-DATE, O, B-STAFF, I-STAFF, L-STAFF] will be mapped to
47
- [B-HIPAA, O, O, U-HIPAA, B-HIPAA, L-HIPAA, O, O, O, O]
48
- Return the original sequence if no mapping is used (i.e the maps are == None)
49
- Args:
50
- tag_sequence (Sequence[str]): A sequence of NER labels
51
- Returns:
52
- (List[str]): A mapped sequence of NER labels
53
- """
54
- # Map each tag through the precomputed label map (a tag absent from the map raises KeyError)
55
- return [self._ner_map[tag] for tag in tag_sequence]
56
-
57
- def map_spans(self, spans: Sequence[Mapping[str, Union[str, int]]]) -> Sequence[Dict[str, Union[str, int]]]:
58
- """
59
- Mapping a sequence of NER spans to another set of NER spans.
60
- E.g: If we use a binary HIPAA mapping
61
- The spans: [{start:0, end:5, label: DATE}, {start:17, end:25, label: STAFF}, {start:43, end:54, label: PATIENT}]
62
- will be mapped to: [{start:0, end:5, label: HIPAA}, {start:17, end:25, label: O}, {start:43, end:54, label: HIPAA}]
63
- Return the original list of spans if no mapping is used (i.e the maps are == None)
64
- Args:
65
- spans (Sequence[Mapping[str, Union[str, int]]]): A sequence of NER spans
66
- Returns:
67
- (Sequence[Mapping[str, Union[str, int]]]): A mapped sequence of NER spans
68
- """
69
- return [{'start': span['start'], 'end': span['end'], 'label': self._spans_map[span['label']]} \
70
- for span in spans]
71
-
72
- def get_ner_description(self) -> str:
73
- """
74
- Get the description of the ner labels and span maps used
75
- Returns:
76
- (str): A description of the label/span maps used
77
- """
78
- return self._description
79
-
80
- def get_ner_types(self) -> List[str]:
81
- """
82
- Get the PHI types back from the list of NER labels
83
- [B-AGE, I-AGE, B-DATE, I-DATE ..] ---> [AGE, DATE, ...]
84
- Returns:
85
- ner_types (List[str]): The list of unique NER types
86
- """
87
- return self._types
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
sequence_tagging/dataset_builder/ner_dataset.py DELETED
@@ -1,102 +0,0 @@
1
- from typing import Sequence, Optional, NoReturn
2
-
3
- from datasets import load_dataset, Dataset
4
-
5
-
6
- class NERDataset(object):
7
- """
8
- This class is a wrapper around the huggingface datasets library
9
- It maintains the train, validation and test datasets based on the
10
- train, validation and test files passed by loading the dataset object
11
- from the file and provides a get function to access each of the datasets.
12
- """
13
-
14
- def __init__(
15
- self,
16
- train_file: Optional[Sequence[str]] = None,
17
- validation_file: Optional[Sequence[str]] = None,
18
- test_file: Optional[Sequence[str]] = None,
19
- extension: str = 'json',
20
- shuffle: bool = True,
21
- seed: int = 41
22
- ) -> NoReturn:
23
- """
24
- Load the train, validation and test datasets from the files passed. Read the files and convert
25
- it into a huggingface dataset.
26
- Args:
27
- train_file (Optional[Sequence[str]]): The list of files that contain train data
28
- validation_file (Optional[Sequence[str]]): The list of files that contain validation data
29
- test_file (Optional[Sequence[str]]): The list of files that contain test data
30
- shuffle (bool): Whether to shuffle the dataset
31
- seed (int): Shuffle seed
32
-
33
- """
34
- self._datasets = NERDataset.__prepare_data(
35
- train_file,
36
- validation_file,
37
- test_file,
38
- extension,
39
- shuffle,
40
- seed
41
- )
42
-
43
- @staticmethod
44
- def __prepare_data(
45
- train_file: Optional[Sequence[str]],
46
- validation_file: Optional[Sequence[str]],
47
- test_file: Optional[Sequence[str]],
48
- extension: str,
49
- shuffle: bool,
50
- seed: int
51
- ) -> Dataset:
52
- """
53
- Get the train, validation and test datasets from the files passed. Read the files and convert
54
- it into a huggingface dataset.
55
- Args:
56
- train_file (Optional[Sequence[str]]): The list of files that contain train data
57
- validation_file (Optional[Sequence[str]]): The list of files that contain validation data
58
- test_file (Optional[Sequence[str]]): The list of files that contain test data
59
- shuffle (bool): Whether to shuffle the dataset
60
- seed (int): Shuffle seed
61
- Returns:
62
- (Dataset): The huggingface dataset with train, validation, test splits (if included)
63
- """
64
- # Read the datasets (train, validation, test etc).
65
- data_files = {}
66
- if train_file is not None:
67
- data_files['train'] = train_file
68
- if validation_file is not None:
69
- data_files['validation'] = validation_file
70
- if test_file is not None:
71
- data_files['test'] = test_file
72
- # Shuffle the dataset
73
- if shuffle:
74
- datasets = load_dataset(extension, data_files=data_files).shuffle(seed=seed)
75
- else:
76
- # Don't shuffle the dataset
77
- datasets = load_dataset(extension, data_files=data_files)
78
- return datasets
79
-
80
- def get_train_dataset(self) -> Dataset:
81
- """
82
- Return the train dataset
83
- Returns:
84
- (Dataset): The huggingface dataset - train split
85
- """
86
- return self._datasets['train']
87
-
88
- def get_validation_dataset(self) -> Dataset:
89
- """
90
- Return the validation dataset
91
- Returns:
92
- (Dataset): The huggingface dataset - validation split
93
- """
94
- return self._datasets['validation']
95
-
96
- def get_test_dataset(self) -> Dataset:
97
- """
98
- Return the test dataset
99
- Returns:
100
- (Dataset): The huggingface dataset - test split
101
- """
102
- return self._datasets['test']
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
sequence_tagging/dataset_builder/ner_labels.py DELETED
@@ -1,67 +0,0 @@
1
- from typing import Sequence, List, NoReturn, Dict
2
-
3
-
4
- class NERLabels(object):
5
- """
6
- Prepare the labels that will be used by the model. Parse the NER types
7
- and prepare the NER labels. For example - NER Types: [AGE, DATE],
8
- it will create a list like this (for BIO notation) [B-AGE, I-AGE, B-DATE, I-DATE, O]
9
- These are the labels that will be assigned to the tokens based on the PHI type.
10
- Say we had the following NER types: NAME, AGE, HOSP
11
- The NER labels in the BIO notation would be B-AGE, B-HOSP, B-NAME, I-AGE, I-HOSP, I-NAME, O
12
- This script creates a list of the NER labels ([B-AGE, B-HOSP, B-NAME, I-AGE, I-HOSP, I-NAME, O])
13
- based on the NER types (NAME, AGE, HOSP) that have been defined. Labels have been sorted.
14
- The script also returns the number of labels, the label_to_id mapping, the id_to_label mapping
15
- Label_id_mapping: {B-AGE:0, B-HOSP:1, B-NAME:2, I-AGE:3, I-HOSP:4, I-NAME:5, O:6}
16
- This information will be used during training, evaluation and prediction.
17
- """
18
-
19
- def __init__(self, notation: str, ner_types: Sequence[str]) -> NoReturn:
20
- """
21
- Initialize the notation that we are using for the NER task
22
- Args:
23
- notation (str): The notation that will be used for the NER labels
24
- ner_types (Sequence[str]): The list of NER categories
25
- """
26
- self._notation = notation
27
- self._ner_types = ner_types
28
-
29
- def get_label_list(self) -> List[str]:
30
- """
31
- Given the NER types return the NER labels.
32
- NER Types: [AGE, DATE] -> return a list like this (for BIO notation) [B-AGE, I-AGE, B-DATE, I-DATE, O]
33
- Returns:
34
- ner_labels (List[str]): The list of NER labels based on the NER notation (e.g BIO)
35
- """
36
- # Add the 'O' (Outside - Non-phi) label to the list
37
- if 'O' not in self._ner_types:
38
- ner_labels = ['O']
39
- else:
40
- ner_labels = list()
41
- # Go through each label and prefix it based on the notation (e.g - B, I etc)
42
- for ner_type in self._ner_types:
43
- for ner_tag in list(self._notation):
44
- if ner_tag != 'O':
45
- ner_labels.append(ner_tag + '-' + ner_type)
46
- ner_labels.sort()
47
- return ner_labels
48
-
49
- def get_label_to_id(self) -> Dict[str, int]:
50
- """
51
- Return a label to id mapping
52
- Returns:
53
- label_to_id (Dict[str, int]): label to id mapping
54
- """
55
- labels = self.get_label_list()
56
- label_to_id = {label: index_id for index_id, label in enumerate(labels)}
57
- return label_to_id
58
-
59
- def get_id_to_label(self) -> Dict[int, str]:
60
- """
61
- Return an id to label mapping
62
- Returns:
63
- id_to_label (Dict[int, str]): id to label mapping
64
- """
65
- labels = self.get_label_list()
66
- id_to_label = {index_id: label for index_id, label in enumerate(labels)}
67
- return id_to_label