kapilkd13 committed
Commit 2c83581
Parent: 5fc4d25

Training in progress, step 200
.ipynb_checkpoints/eval-checkpoint.py ADDED
(Jupyter autosave copy; its 137 lines are identical to eval.py, added below.)
.ipynb_checkpoints/run-checkpoint.sh CHANGED
(Jupyter autosave copy; same change as run.sh below.)
.ipynb_checkpoints/run_wav2vec2_lm-checkpoint.py ADDED
(Jupyter autosave copy; its 68 lines are identical to run_wav2vec2_lm.py, added below.)
eval.py ADDED
@@ -0,0 +1,137 @@
+ #!/usr/bin/env python3
+ import argparse
+ import re
+ from typing import Dict
+
+ import torch
+ from datasets import Audio, Dataset, load_dataset, load_metric
+
+ from transformers import AutoFeatureExtractor, pipeline
+
+
+ def log_results(result: Dataset, args: Dict[str, str]):
+     """DO NOT CHANGE. This function computes and logs the result metrics."""
+
+     log_outputs = args.log_outputs
+     dataset_id = "_".join(args.dataset.split("/") + [args.config, args.split])
+
+     # load metrics
+     wer = load_metric("wer")
+     cer = load_metric("cer")
+
+     # compute metrics
+     wer_result = wer.compute(references=result["target"], predictions=result["prediction"])
+     cer_result = cer.compute(references=result["target"], predictions=result["prediction"])
+
+     # print & log results
+     result_str = f"WER: {wer_result}\n" f"CER: {cer_result}"
+     print(result_str)
+
+     with open(f"{dataset_id}_eval_results.txt", "w") as f:
+         f.write(result_str)
+
+     # log all results in a text file, possibly interesting for analysis
+     if log_outputs is not None:
+         pred_file = f"log_{dataset_id}_predictions.txt"
+         target_file = f"log_{dataset_id}_targets.txt"
+
+         with open(pred_file, "w") as p, open(target_file, "w") as t:
+
+             # mapping function to write output
+             def write_to_file(batch, i):
+                 p.write(f"{i}" + "\n")
+                 p.write(batch["prediction"] + "\n")
+                 t.write(f"{i}" + "\n")
+                 t.write(batch["target"] + "\n")
+
+             result.map(write_to_file, with_indices=True)
+
+
+ def normalize_text(text: str) -> str:
+     """DO ADAPT FOR YOUR USE CASE. This function normalizes the target text."""
+
+     chars_to_ignore_regex = '[,?.!\-\;\:"“%‘”�—’…–]'  # noqa: W605 IMPORTANT: this should correspond to the chars that were ignored during training
+
+     text = re.sub(chars_to_ignore_regex, "", text.lower())
+
+     # In addition, we can normalize the target text, e.g. by removing newline characters
+     # note that order is important here!
+     token_sequences_to_ignore = ["\n\n", "\n", "   ", "  "]
+
+     for t in token_sequences_to_ignore:
+         text = " ".join(text.split(t))
+
+     return text
+
+
+ def main(args):
+     # load dataset
+     dataset = load_dataset(args.dataset, args.config, split=args.split, use_auth_token=True)
+
+     # for testing: only process the first few examples
+     # dataset = dataset.select(range(10))
+
+     # load processor
+     feature_extractor = AutoFeatureExtractor.from_pretrained(args.model_id)
+     sampling_rate = feature_extractor.sampling_rate
+
+     # resample audio to the model's sampling rate
+     dataset = dataset.cast_column("audio", Audio(sampling_rate=sampling_rate))
+
+     # load eval pipeline
+     if args.device is None:
+         args.device = 0 if torch.cuda.is_available() else -1
+     asr = pipeline("automatic-speech-recognition", model=args.model_id, device=args.device)
+
+     # map function to decode audio
+     def map_to_pred(batch):
+         prediction = asr(
+             batch["audio"]["array"], chunk_length_s=args.chunk_length_s, stride_length_s=args.stride_length_s
+         )
+
+         # strip the <s> token that this model's decoder emits
+         batch["prediction"] = prediction["text"].replace("<s>", "")
+         batch["target"] = normalize_text(batch["sentence"])
+         return batch
+
+     # run inference on all examples
+     result = dataset.map(map_to_pred, remove_columns=dataset.column_names)
+
+     # compute and log results
+     # do not change the function below
+     log_results(result, args)
+
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser()
+
+     parser.add_argument(
+         "--model_id", type=str, required=True, help="Model identifier. Should be loadable with 🤗 Transformers"
+     )
+     parser.add_argument(
+         "--dataset",
+         type=str,
+         required=True,
+         help="Dataset name to evaluate the `model_id`. Should be loadable with 🤗 Datasets",
+     )
+     parser.add_argument(
+         "--config", type=str, required=True, help="Config of the dataset. *E.g.* `'en'` for Common Voice"
+     )
+     parser.add_argument("--split", type=str, required=True, help="Split of the dataset. *E.g.* `'test'`")
+     parser.add_argument(
+         "--chunk_length_s", type=float, default=None, help="Chunk length in seconds. Defaults to None; 5.0 works well for long audio files."
+     )
+     parser.add_argument(
+         "--stride_length_s", type=float, default=None, help="Stride of the audio chunks in seconds. Defaults to None; 1.0 works well for long audio files."
+     )
+     parser.add_argument(
+         "--log_outputs", action="store_true", help="If defined, write outputs to log files for analysis."
+     )
+     parser.add_argument(
+         "--device",
+         type=int,
+         default=None,
+         help="The device to run the pipeline on. -1 for CPU, 0 for the first GPU and so on. Defaults to GPU 0 if available, else CPU.",
+     )
+     args = parser.parse_args()
+
+     main(args)
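
For reference, a typical invocation of eval.py for this model might look like the following. This is a sketch, not part of the commit; the dataset id "mozilla-foundation/common_voice_8_0" is an assumption inferred from the "hi" dataset config in run.sh:

python eval.py \
    --model_id="./" \
    --dataset="mozilla-foundation/common_voice_8_0" \
    --config="hi" \
    --split="test" \
    --chunk_length_s="5.0" \
    --stride_length_s="1.0" \
    --log_outputs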
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:5bb4a4004eb1787be92384131690112c8cddd0e2583120210818b583b8f91543
+ oid sha256:602976fa45f06c0d1a97cb892978c576eaaeb5dd4f45a332752ddaecdc256eb2
  size 1262321393
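
Only the sha256 of the LFS pointer changes: the step-200 checkpoint overwrites the previous weights, and the byte size stays identical, as expected when re-saving a model of the same architecture.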
run.sh CHANGED
@@ -4,13 +4,13 @@ python run_speech_recognition_ctc.py \
  --dataset_config_name="hi" \
  --output_dir="./" \
  --overwrite_output_dir \
- --max_steps="5000" \
+ --max_steps="8000" \
  --per_device_train_batch_size="16" \
  --learning_rate="3e-4" \
- --warmup_steps="200" \
+ --warmup_steps="500" \
  --save_steps="200" \
  --eval_steps="400" \
- --save_total_limit="1" \
+ --save_total_limit="3" \
  --evaluation_strategy="steps" \
  --text_column_name="sentence" \
  --length_column_name="input_length" \
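
Net effect of this change: training runs for 8000 steps instead of 5000, the learning-rate warmup grows from 200 to 500 steps, and the last three checkpoints are kept instead of only one; the save interval (200 steps) and eval interval (400 steps) are unchanged.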
run_wav2vec2_lm.py ADDED
@@ -0,0 +1,68 @@
+ #!/usr/bin/env python3
+ import re
+ import sys
+
+ import torch
+ import torchaudio.functional as F
+ from datasets import load_dataset, load_metric
+ from transformers import Wav2Vec2Processor, AutoModelForCTC, Wav2Vec2ProcessorWithLM
+ # from transformers.models.wav2vec2.processing_wav2vec2_with_lm import Wav2Vec2ProcessorWithLM
+
+ # decide via the command line whether the LM should be used for decoding
+ do_lm = bool(int(sys.argv[1]))
+ eval_size = int(sys.argv[2])
+
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+
+ model_path = "./"
+
+ wer = load_metric("wer")
+ cer = load_metric("cer")
+
+ # load model and processor (with or without the n-gram LM)
+ processor = Wav2Vec2ProcessorWithLM.from_pretrained(model_path) if do_lm else Wav2Vec2Processor.from_pretrained(model_path)
+ model = AutoModelForCTC.from_pretrained(model_path).to(device)
+
+ ds = load_dataset("common_voice", "es", split="test", streaming=True)
+ ds_iter = iter(ds)
+
+ references = []
+ predictions = []
+
+ CHARS_TO_IGNORE = [",", "?", "¿", ".", "!", "¡", ";", "；", ":", '""', "%", '"', "�", "ʿ", "·", "჻", "~", "՞",
+                    "؟", "،", "।", "॥", "«", "»", "„", "“", "”", "「", "」", "‘", "’", "《", "》", "(", ")", "[", "]",
+                    "{", "}", "=", "`", "_", "+", "<", ">", "…", "–", "°", "´", "ʾ", "‹", "›", "©", "®", "—", "→", "。",
+                    "、", "﹂", "﹁", "‧", "～", "﹏", "，", "｛", "｝", "（", "）", "［", "］", "【", "】", "‥", "〽",
+                    "『", "』", "〝", "〟", "⟨", "⟩", "〜", "：", "！", "？", "♪", "؛", "/", "\\", "º", "−", "^", "ʻ", "ˆ"]
+ chars_to_ignore_regex = f"[{re.escape(''.join(CHARS_TO_IGNORE))}]"
+
+ for _ in range(eval_size):
+     sample = next(ds_iter)
+     # Common Voice ships 48 kHz audio; resample to the model's 16 kHz
+     resampled_audio = F.resample(torch.tensor(sample["audio"]["array"]), 48_000, 16_000).numpy()
+
+     input_values = processor(resampled_audio, return_tensors="pt", sampling_rate=16_000).input_values
+     with torch.no_grad():
+         logits = model(input_values.to(device)).logits.cpu()
+
+     if do_lm:
+         # LM decoding works on the full logits
+         output_str = processor.batch_decode(logits)[0].lower()
+     else:
+         # greedy CTC decoding: argmax over the vocabulary at each frame
+         pred_ids = torch.argmax(logits, dim=-1)
+         output_str = processor.batch_decode(pred_ids)[0].lower()
+
+     ref_str = re.sub(chars_to_ignore_regex, "", sample["sentence"]).lower()
+
+     # collapse runs of whitespace into a single space
+     ref_str = " ".join(ref_str.split())
+
+     print(f"Pred: {output_str} | Target: {ref_str}")
+     print(50 * "=")
+
+     references.append(ref_str)
+     predictions.append(output_str)
+
+ print(f"WER: {wer.compute(predictions=predictions, references=references) * 100}")
+ print(f"CER: {cer.compute(predictions=predictions, references=references) * 100}")
special_tokens_map.json CHANGED
@@ -1 +1 @@
- {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "[UNK]", "pad_token": "[PAD]", "additional_special_tokens": [{"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}]}
+ {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "[UNK]", "pad_token": "[PAD]", "additional_special_tokens": [{"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}]}
training_args.bin CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:14da007bc161c5e872cf751d1aff466e7935b1a747b2fd05d274af66cf01f6f9
+ oid sha256:016a31cdd0a756dd0bed3fa48205873370275d7ddb0e90527bd97c46b6284c3c
  size 2991