---
title: "Citations of ECFR Banking Regulation in a spaCy pipeline."
description: "Custom text classification project for spaCy v3 adapted from the spaCy v3"
author: Manjinder 
date: 2024-05-01
tags:
  - machine learning
  - natural language processing
  - huggingface
---

vars:
  config: "default"
  lang: "en"
  train: corpus/train.spacy
  dev: corpus/dev.spacy
  version: "0.1.0"
  gpu_id: -1
  vectors_model: "en_core_web_lg"
  name: ecfr_ner
  prodigy:
    ner_labels: ecfr_initial_ner
    ner_manual_labels: ecfr_manual_ner
    senter_labels: ecfr_labeled_sents
    ner_labeled_dataset: ecfr_labeled_ner

directories:
  - corpus/labels
  - data
  - my_trained_model/textcat_multilabel
  - my_trained_model/vocab
  - output/experiment1/model-best/textcat_multilabel
  - output/experiment1/model-best/vocab
  - output/experiment1/model-last/textcat_multilabel
  - output/experiment1/model-last/vocab
  - output/experiment3/model-best/textcat_multilabel
  - output/experiment3/model-best/vocab
  - output/experiment3/model-last/textcat_multilabel
  - output/experiment3/model-last/vocab
  - python_Code

assets:
  - dest: "corpus/labels/ner.json"
    description: "JSON file containing NER labels"
  - dest: "corpus/labels/parser.json"
    description: "JSON file containing parser labels"
  - dest: "corpus/labels/tagger.json"
    description: "JSON file containing tagger labels"
  - dest: "corpus/labels/textcat_multilabel.json"
    description: "JSON file containing multilabel text classification labels"
  - dest: "data/eval.jsonl"
    description: "JSONL file containing evaluation data"
  - dest: "data/firstStep_file.jsonl"
    description: "JSONL file containing formatted data from the first step"
  - dest: "data/five_examples_annotated5.jsonl"
    description: "JSONL file containing five annotated examples"
  - dest: "data/goldenEval.jsonl"
    description: "JSONL file containing golden evaluation data"
  - dest: "data/thirdStep_file.jsonl"
    description: "JSONL file containing classified data from the third step"
  - dest: "data/train.jsonl"
    description: "JSONL file containing training data"
  - dest: "data/train200.jsonl"
    description: "JSONL file containing initial training data"
  - dest: "data/train4465.jsonl"
    description: "JSONL file containing formatted and labeled training data"
  - dest: "my_trained_model/textcat_multilabel/cfg"
    description: "Configuration files for the text classification model"
  - dest: "my_trained_model/textcat_multilabel/model"
    description: "Trained model files for the text classification model"
  - dest: "my_trained_model/vocab/key2row"
    description: "Mapping from keys to row indices in the vocabulary"
  - dest: "my_trained_model/vocab/lookups.bin"
    description: "Binary lookups file for the vocabulary"
  - dest: "my_trained_model/vocab/strings.json"
    description: "JSON file containing string representations of the vocabulary"
  - dest: "my_trained_model/vocab/vectors"
    description: "Directory containing vector files for the vocabulary"
  - dest: "my_trained_model/vocab/vectors.cfg"
    description: "Configuration file for vectors in the vocabulary"
  - dest: "my_trained_model/config.cfg"
    description: "Configuration file for the trained model"
  - dest: "my_trained_model/meta.json"
    description: "JSON file containing metadata for the trained model"
  - dest: "my_trained_model/tokenizer"
    description: "Tokenizer files for the trained model"
  - dest: "output/experiment1/model-best/textcat_multilabel/cfg"
    description: "Configuration files for the best model in experiment 1"
  - dest: "output/experiment1/model-best/textcat_multilabel/model"
    description: "Trained model files for the best model in experiment 1"
  - dest: "output/experiment1/model-best/vocab/key2row"
    description: "Mapping from keys to row indices in the vocabulary for the best model in experiment 1"
  - dest: "output/experiment1/model-best/vocab/lookups.bin"
    description: "Binary lookups file for the vocabulary for the best model in experiment 1"
  - dest: "output/experiment1/model-best/vocab/strings.json"
    description: "JSON file containing string representations of the vocabulary for the best model in experiment 1"
  - dest: "output/experiment1/model-best/vocab/vectors"
    description: "Directory containing vector files for the vocabulary for the best model in experiment 1"
  - dest: "output/experiment1/model-best/vocab/vectors.cfg"
    description: "Configuration file for vectors in the vocabulary for the best model in experiment 1"
  - dest: "output/experiment1/model-best/config.cfg"
    description: "Configuration file for the best model in experiment 1"
  - dest: "output/experiment1/model-best/meta.json"
    description: "JSON file containing metadata for the best model in experiment 1"
  - dest: "output/experiment1/model-best/tokenizer"
    description: "Tokenizer files for the best model in experiment 1"
  - dest: "output/experiment1/model-last/textcat_multilabel/cfg"
    description: "Configuration files for the last model in experiment 1"
  - dest: "output/experiment1/model-last/textcat_multilabel/model"
    description: "Trained model files for the last model in experiment 1"
  - dest: "output/experiment1/model-last/vocab/key2row"
    description: "Mapping from keys to row indices in the vocabulary for the last model in experiment 1"
  - dest: "output/experiment1/model-last/vocab/lookups.bin"
    description: "Binary lookups file for the vocabulary for the last model in experiment 1"
  - dest: "output/experiment1/model-last/vocab/strings.json"
    description: "JSON file containing string representations of the vocabulary for the last model in experiment 1"
  - dest: "output/experiment1/model-last/vocab/vectors"
    description: "Directory containing vector files for the vocabulary for the last model in experiment 1"
  - dest: "output/experiment1/model-last/vocab/vectors.cfg"
    description: "Configuration file for vectors in the vocabulary for the last model in experiment 1"
  - dest: "output/experiment1/model-last/config.cfg"
    description: "Configuration file for the last model in experiment 1"
  - dest: "output/experiment1/model-last/meta.json"
    description: "JSON file containing metadata for the last model in experiment 1"
  - dest: "output/experiment1/model-last/tokenizer"
    description: "Tokenizer files for the last model in experiment 1"
  - dest: "output/experiment3/model-best/textcat_multilabel/cfg"
    description: "Configuration files for the best model in experiment 3"
  - dest: "output/experiment3/model-best/textcat_multilabel/model"
    description: "Trained model files for the best model in experiment 3"
  - dest: "output/experiment3/model-best/vocab/key2row"
    description: "Mapping from keys to row indices in the vocabulary for the best model in experiment 3"
  - dest: "output/experiment3/model-best/vocab/lookups.bin"
    description: "Binary lookups file for the vocabulary for the best model in experiment 3"
  - dest: "output/experiment3/model-best/vocab/strings.json"
    description: "JSON file containing string representations of the vocabulary for the best model in experiment 3"
  - dest: "output/experiment3/model-best/vocab/vectors"
    description: "Directory containing vector files for the vocabulary for the best model in experiment 3"
  - dest: "output/experiment3/model-best/vocab/vectors.cfg"
    description: "Configuration file for vectors in the vocabulary for the best model in experiment 3"
  - dest: "output/experiment3/model-best/config.cfg"
    description: "Configuration file for the best model in experiment 3"
  - dest: "output/experiment3/model-best/meta.json"
    description: "JSON file containing metadata for the best model in experiment 3"
  - dest: "output/experiment3/model-best/tokenizer"
    description: "Tokenizer files for the best model in experiment 3"
  - dest: "output/experiment3/model-last/textcat_multilabel/cfg"
    description: "Configuration files for the last model in experiment 3"
  - dest: "output/experiment3/model-last/textcat_multilabel/model"
    description: "Trained model files for the last model in experiment 3"
  - dest: "output/experiment3/model-last/vocab/key2row"
    description: "Mapping from keys to row indices in the vocabulary for the last model in experiment 3"
  - dest: "output/experiment3/model-last/vocab/lookups.bin"
    description: "Binary lookups file for the vocabulary for the last model in experiment 3"
  - dest: "output/experiment3/model-last/vocab/strings.json"
    description: "JSON file containing string representations of the vocabulary for the last model in experiment 3"
  - dest: "output/experiment3/model-last/vocab/vectors"
    description: "Directory containing vector files for the vocabulary for the last model in experiment 3"
  - dest: "output/experiment3/model-last/vocab/vectors.cfg"
    description: "Configuration file for vectors in the vocabulary for the last model in experiment 3"
  - dest: "output/experiment3/model-last/config.cfg"
    description: "Configuration file for the last model in experiment 3"
  - dest: "output/experiment3/model-last/meta.json"
    description: "JSON file containing metadata for the last model in experiment 3"
  - dest: "output/experiment3/model-last/tokenizer"
    description: "Tokenizer files for the last model in experiment 3"
  - dest: "python_Code/finalStep-formatLabel.py"
    description: "Python script for formatting labeled data in the final step"
  - dest: "python_Code/firstStep-format.py"
    description: "Python script for formatting data in the first step"
  - dest: "python_Code/five_examples_annotated.ipynb"
    description: "Jupyter notebook containing five annotated examples"
  - dest: "python_Code/secondStep-score.py"
    description: "Python script for scoring data in the second step"
  - dest: "python_Code/thirdStep-label.py"
    description: "Python script for labeling data in the third step"
  - dest: "python_Code/train_eval_split.ipynb"
    description: "Jupyter notebook for training and evaluation data splitting"
  - dest: "TerminalCode.txt"
    description: "Text file containing terminal code"
  - dest: "README.md"
    description: "Markdown file containing project documentation"
  - dest: "prodigy.json"
    description: "JSON file containing Prodigy configuration"

workflows:
  all:
    - format-script
    - train-text-classification-model
    - classify-unlabeled-data
    - format-labeled-data
    - setup-environment
    - review-evaluation-data
    - export-reviewed-evaluation-data
    - import-training-data
    - import-golden-evaluation-data
    - train-model-experiment1
    - download-model
    - convert-data-to-spacy-format
    - train-custom-model

commands:
  - name: "format-script"
    help: |
      Execute the Python script `firstStep-format.py`, which performs the initial formatting of a dataset file for the first step of the project. This script extracts text and labels from a dataset file in JSONL format and writes them to a new JSONL file in a specific format.
      
      Usage:
      ```
      spacy project run format-script
      ```

      Explanation:
      - The script `firstStep-format.py` reads data from the file specified in the `dataset_file` variable (`data/train200.jsonl` by default).
      - It extracts text and labels from each JSON object in the dataset file.
      - If both text and at least one label are available, it writes a new JSON object to the output file specified in the `output_file` variable (`data/firstStep_file.jsonl` by default) with the extracted text and labels.
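
      The script itself is not reproduced here, but a minimal Python sketch of this first-step formatting could look like the following (the `text` and `accept` field names are assumptions about the JSONL schema):
      ```
      import json

      dataset_file = "data/train200.jsonl"
      output_file = "data/firstStep_file.jsonl"

      with open(dataset_file, encoding="utf8") as fin, open(output_file, "w", encoding="utf8") as fout:
          for line in fin:
              record = json.loads(line)
              text = record.get("text")
              labels = record.get("accept", [])  # assumed field name for the accepted labels
              if text and labels:  # keep only records with text and at least one label
                  fout.write(json.dumps({"text": text, "labels": labels}) + "\n")
      ```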

  - name: "train-text-classification-model"
    help: |
      Train a text classification model using spaCy.
      
      Usage:
      ```
      spacy project run train-text-classification-model
      ```

      Explanation:
      - This command trains a text classification model using the spaCy library based on the configuration provided in the `textcat_multilabel.cfg` file.
      - The model is trained on the data specified in the `train` and `dev` variables (`corpus/train.spacy` and `corpus/dev.spacy` by default).
      - The trained model is saved to the directory specified in the `output_model_dir` variable (`my_trained_model/textcat_multilabel/model` by default).
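
      The same training run can also be driven from Python with spaCy's CLI helper; this is a sketch, and the `configs/textcat_multilabel.cfg` path is an assumption about where the config file lives:
      ```
      from spacy.cli.train import train

      # Paths mirror the project defaults described above.
      train(
          "configs/textcat_multilabel.cfg",  # assumed location of textcat_multilabel.cfg
          output_path="my_trained_model",
          overrides={
              "paths.train": "corpus/train.spacy",
              "paths.dev": "corpus/dev.spacy",
          },
      )
      ```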

  - name: "classify-unlabeled-data"
    help: |
      Classify unlabeled data using a trained text classification model.
      
      Usage:
      ```
      spacy project run classify-unlabeled-data
      ```

      Explanation:
      - This command loads the trained text classification model from the directory specified in the `model_dir` variable (`my_trained_model/textcat_multilabel/model` by default).
      - It classifies unlabeled data from the file specified in the `unlabeled_data_file` variable (`data/thirdStep_file.jsonl` by default).
      - The classified data is saved to the file specified in the `classified_data_file` variable (`data/classified_data.jsonl` by default).
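
      A minimal Python sketch of that classification loop, assuming the pipeline directory can be loaded with `spacy.load` and each input record has a `text` field:
      ```
      import json
      import spacy

      nlp = spacy.load("my_trained_model")  # directory containing config.cfg and meta.json

      with open("data/thirdStep_file.jsonl", encoding="utf8") as fin, \
           open("data/classified_data.jsonl", "w", encoding="utf8") as fout:
          for line in fin:
              record = json.loads(line)
              doc = nlp(record["text"])
              # doc.cats maps each label to the score assigned by the multilabel textcat.
              record["cats"] = {label: float(score) for label, score in doc.cats.items()}
              fout.write(json.dumps(record) + "\n")
      ```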

  - name: "format-labeled-data"
    help: |
      Execute the Python script `finalStep-formatLabel.py`, which performs the final formatting of labeled data for the last step of the project. This script converts labeled data from the JSONL format used by Prodigy to the JSONL format used by spaCy.
      
      Usage:
      ```
      spacy project run format-labeled-data
      ```

      Explanation:
      - The script `finalStep-formatLabel.py` reads labeled data from the file specified in the `labeled_data_file` variable (`data/thirdStep_file.jsonl` by default).
      - It converts the labeled data from Prodigy's JSONL format to spaCy's JSONL format.
      - The converted data is saved to the file specified in the `formatted_data_file` variable (`data/fourthStep_file.jsonl` by default).
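
      As a sketch, and assuming Prodigy-style records with an `accept` list of label IDs, the conversion could boil down to:
      ```
      import json

      # Placeholder label set; the real labels live in corpus/labels/textcat_multilabel.json.
      labels = ["LABEL_A", "LABEL_B"]

      with open("data/thirdStep_file.jsonl", encoding="utf8") as fin, \
           open("data/fourthStep_file.jsonl", "w", encoding="utf8") as fout:
          for line in fin:
              record = json.loads(line)
              accepted = set(record.get("accept", []))
              # spaCy's multilabel textcat expects a full cats dict with 0.0/1.0 per label.
              cats = {label: 1.0 if label in accepted else 0.0 for label in labels}
              fout.write(json.dumps({"text": record["text"], "cats": cats}) + "\n")
      ```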

  - name: "setup-environment"
    help: |
      Set up the Python environment for the project using pip and the provided requirements.txt file.
      
      Usage:
      ```
      spacy project run setup-environment
      ```

      Explanation:
      - This command installs the required Python packages listed in the `requirements.txt` file using pip.

  - name: "review-evaluation-data"
    help: |
      Review the evaluation data using Prodigy.
      
      Usage:
      ```
      spacy project run review-evaluation-data
      ```

      Explanation:
      - This command launches Prodigy to review the evaluation data.
      - Prodigy loads the evaluation data from the file specified in the `eval_data_file` variable (`data/eval.jsonl` by default).
      - You can review the data and annotate it as needed using Prodigy's user interface.
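
      Before or after the Prodigy session it can help to spot-check the file from Python; a small preview sketch, assuming `text` and `accept` fields:
      ```
      import json

      with open("data/eval.jsonl", encoding="utf8") as fin:
          for i, line in enumerate(fin):
              record = json.loads(line)
              print(record.get("text", "")[:80], "->", record.get("accept", []))
              if i >= 4:  # preview only the first five examples
                  break
      ```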

  - name: "export-reviewed-evaluation-data"
    help: |
      Export the reviewed evaluation data from Prodigy.
      
      Usage:
      ```
      spacy project run export-reviewed-evaluation-data
      ```

      Explanation:
      - This command exports the reviewed evaluation data from Prodigy to a JSONL file.
      - Prodigy exports the reviewed data to the file specified in the `exported_eval_data_file` variable (`data/goldenEval.jsonl` by default).
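
      This mirrors Prodigy's `db-out`; the same export can also be scripted through the database API (the dataset name below is an assumption based on the `vars` section):
      ```
      import json
      from prodigy.components.db import connect

      db = connect()  # uses the settings from prodigy.json
      examples = db.get_dataset("ecfr_labeled_ner")  # assumed dataset name

      with open("data/goldenEval.jsonl", "w", encoding="utf8") as fout:
          for eg in examples:
              fout.write(json.dumps(eg) + "\n")
      ```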

  - name: "import-training-data"
    help: |
      Import training data into Prodigy.
      
      Usage:
      ```
      spacy project run import-training-data
      ```

      Explanation:
      - This command imports training data into Prodigy from the file specified in the `training_data_file` variable (`data/fourthStep_file.jsonl` by default).
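
      The Python equivalent of Prodigy's `db-in`, again with the dataset name as an assumption:
      ```
      import json
      from prodigy.components.db import connect

      with open("data/fourthStep_file.jsonl", encoding="utf8") as fin:
          examples = [json.loads(line) for line in fin]

      db = connect()
      if "ecfr_labeled_ner" not in db.datasets:  # assumed dataset name
          db.add_dataset("ecfr_labeled_ner")
      db.add_examples(examples, datasets=["ecfr_labeled_ner"])
      ```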

  - name: "import-golden-evaluation-data"
    help: |
      Import golden evaluation data into Prodigy.
      
      Usage:
      ```
      spacy project run import-golden-evaluation-data
      ```

      Explanation:
      - This command imports golden evaluation data into Prodigy from the file specified in the `golden_evaluation_data_file` variable (`data/goldenEval.jsonl` by default).

  - name: "train-model-experiment1"
    help: |
      Train a text classification model with different configurations for experiment 1.
      
      Usage:
      ```
      spacy project run train-model-experiment1
      ```

      Explanation:
      - This command trains a text classification model using different configurations specified in the `experiment1_configs` list in the `config.cfg` file.
      - The model is trained on the data specified in the `train` and `dev` variables (`corpus/train.spacy` and `corpus/dev.spacy` by default).
      - The trained models are saved to the directories specified in the `output_model_dir` variable (`output/experiment1/model-last/textcat_multilabel/model` and `output/experiment1/model-best/textcat_multilabel/model` by default).
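
      A sketch of sweeping over several configurations from Python; the config file names are placeholders for whatever `experiment1_configs` contains:
      ```
      from pathlib import Path
      from spacy.cli.train import train

      # Placeholder config names; substitute the entries of experiment1_configs.
      configs = ["configs/experiment1_a.cfg", "configs/experiment1_b.cfg"]

      for cfg in configs:
          out_dir = Path("output/experiment1") / Path(cfg).stem  # one subfolder per config
          train(
              cfg,
              output_path=out_dir,
              overrides={"paths.train": "corpus/train.spacy", "paths.dev": "corpus/dev.spacy"},
          )
      ```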

  - name: "download-model"
    help: |
      Download a trained text classification model.
      
      Usage:
      ```
      spacy project run download-model
      ```

      Explanation:
      - This command downloads a trained text classification model from the URL specified in the `model_url` variable (`https://example.com/model.tar.gz` by default).
      - The downloaded model is saved to the directory specified in the `output_model_dir` variable (`models` by default).
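
      A standard-library sketch of the download step (the URL is the placeholder from above):
      ```
      import tarfile
      import urllib.request
      from pathlib import Path

      model_url = "https://example.com/model.tar.gz"  # placeholder URL
      output_dir = Path("models")
      output_dir.mkdir(parents=True, exist_ok=True)

      archive_path = output_dir / "model.tar.gz"
      urllib.request.urlretrieve(model_url, str(archive_path))

      with tarfile.open(archive_path, "r:gz") as tar:
          tar.extractall(output_dir)
      ```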

  - name: "convert-data-to-spacy-format"
    help: |
      Convert data to spaCy's JSONL format.
      
      Usage:
      ```
      spacy project run convert-data-to-spacy-format
      ```

      Explanation:
      - This command converts data from Prodigy's JSONL format to spaCy's JSONL format.
      - It reads data from the file specified in the `prodigy_data_file` variable (`data/ner_dataset.jsonl` by default) and writes the converted data to the file specified in the `spacy_data_file` variable (`data/ner_dataset_spacy.jsonl` by default).
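
      A sketch of the conversion, assuming Prodigy NER records with a `spans` list of `start`/`end`/`label` character offsets; the same loop also serializes a binary `.spacy` DocBin, which is the format the `train`/`dev` corpus files in this project use:
      ```
      import json
      import spacy
      from spacy.tokens import DocBin

      nlp = spacy.blank("en")
      doc_bin = DocBin()

      with open("data/ner_dataset.jsonl", encoding="utf8") as fin, \
           open("data/ner_dataset_spacy.jsonl", "w", encoding="utf8") as fout:
          for line in fin:
              record = json.loads(line)
              doc = nlp.make_doc(record["text"])
              ents = []
              for span in record.get("spans", []):
                  ent = doc.char_span(span["start"], span["end"], label=span["label"])
                  if ent is not None:  # skip spans that don't align to token boundaries
                      ents.append(ent)
              doc.ents = ents
              fout.write(json.dumps({"text": doc.text,
                                     "entities": [[e.start_char, e.end_char, e.label_] for e in ents]}) + "\n")
              doc_bin.add(doc)

      doc_bin.to_disk("corpus/ner_dataset.spacy")  # optional binary corpus alongside the JSONL
      ```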

  - name: "train-custom-model"
    help: |
      Train a custom NER model using spaCy.
      
      Usage:
      ```
      spacy project run train-custom-model
      ```

      Explanation:
      - This command trains a custom NER model using spaCy based on the configuration provided in the `config.cfg` file.
      - The model is trained on the data specified in the `train` and `dev` variables (`corpus/train.spacy` and `corpus/dev.spacy` by default).
      - The trained model is saved to the directory specified in the `output_model_dir` variable (`my_trained_model` by default).
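
      Once training has finished, the pipeline can be sanity-checked directly from Python; the example sentence below is made up:
      ```
      import spacy

      nlp = spacy.load("my_trained_model")
      doc = nlp("The bank must comply with 12 CFR 217.10 capital requirements.")  # made-up example

      for ent in doc.ents:
          print(ent.text, ent.label_)
      ```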