retrained with new data from opensubs. qwerty subs
Browse files
- config.json +1 -1
- dataset_dict.json +1 -0
- eval/data-00000-of-00001.arrow +3 -0
- eval/dataset_info.json +65 -0
- eval/state.json +13 -0
- model.safetensors +1 -1
- tokenizer_config.json +0 -4
- train/data-00000-of-00002.arrow +3 -0
- train/data-00001-of-00002.arrow +3 -0
- train/dataset_info.json +65 -0
- train/state.json +16 -0
config.json
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
{
|
2 |
-
"_name_or_path": "
|
3 |
"architectures": [
|
4 |
"T5ForConditionalGeneration"
|
5 |
],
|
|
|
1 |
{
|
2 |
+
"_name_or_path": "t5-small",
|
3 |
"architectures": [
|
4 |
"T5ForConditionalGeneration"
|
5 |
],
|
dataset_dict.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"splits": ["train", "eval"]}
|
eval/data-00000-of-00001.arrow
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8e3914c2653952d5ef669d4dec5cfa59bd7587d0999b3038929d02ef51c3f3f7
|
3 |
+
size 187010360
|
eval/dataset_info.json
ADDED
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"builder_name": "csv",
|
3 |
+
"citation": "",
|
4 |
+
"config_name": "default",
|
5 |
+
"dataset_name": "csv",
|
6 |
+
"dataset_size": 472235292,
|
7 |
+
"description": "",
|
8 |
+
"download_checksums": {
|
9 |
+
"/home/ubuntu/wwdrive2/14March/clean_train.csv": {
|
10 |
+
"num_bytes": 369392984,
|
11 |
+
"checksum": null
|
12 |
+
},
|
13 |
+
"/home/ubuntu/wwdrive2/14March/clean_eval.csv": {
|
14 |
+
"num_bytes": 71786407,
|
15 |
+
"checksum": null
|
16 |
+
}
|
17 |
+
},
|
18 |
+
"download_size": 441179391,
|
19 |
+
"features": {
|
20 |
+
"input_ids": {
|
21 |
+
"feature": {
|
22 |
+
"dtype": "int32",
|
23 |
+
"_type": "Value"
|
24 |
+
},
|
25 |
+
"_type": "Sequence"
|
26 |
+
},
|
27 |
+
"attention_mask": {
|
28 |
+
"feature": {
|
29 |
+
"dtype": "int8",
|
30 |
+
"_type": "Value"
|
31 |
+
},
|
32 |
+
"_type": "Sequence"
|
33 |
+
},
|
34 |
+
"labels": {
|
35 |
+
"feature": {
|
36 |
+
"dtype": "int64",
|
37 |
+
"_type": "Value"
|
38 |
+
},
|
39 |
+
"_type": "Sequence"
|
40 |
+
}
|
41 |
+
},
|
42 |
+
"homepage": "",
|
43 |
+
"license": "",
|
44 |
+
"size_in_bytes": 913414683,
|
45 |
+
"splits": {
|
46 |
+
"train": {
|
47 |
+
"name": "train",
|
48 |
+
"num_bytes": 394316099,
|
49 |
+
"num_examples": 4507525,
|
50 |
+
"dataset_name": "csv"
|
51 |
+
},
|
52 |
+
"eval": {
|
53 |
+
"name": "eval",
|
54 |
+
"num_bytes": 77919193,
|
55 |
+
"num_examples": 1127410,
|
56 |
+
"dataset_name": "csv"
|
57 |
+
}
|
58 |
+
},
|
59 |
+
"version": {
|
60 |
+
"version_str": "0.0.0",
|
61 |
+
"major": 0,
|
62 |
+
"minor": 0,
|
63 |
+
"patch": 0
|
64 |
+
}
|
65 |
+
}
|
eval/state.json
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_data_files": [
|
3 |
+
{
|
4 |
+
"filename": "data-00000-of-00001.arrow"
|
5 |
+
}
|
6 |
+
],
|
7 |
+
"_fingerprint": "150697196cb85c2e",
|
8 |
+
"_format_columns": null,
|
9 |
+
"_format_kwargs": {},
|
10 |
+
"_format_type": null,
|
11 |
+
"_output_all_columns": false,
|
12 |
+
"_split": "eval"
|
13 |
+
}
|
model.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 241984552
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:35a20c7f6786661d16cdff57b5432726582b676eb3c02e5f7a869c58db2dd3de
|
3 |
size 241984552
|
tokenizer_config.json
CHANGED
@@ -930,12 +930,8 @@
|
|
930 |
"clean_up_tokenization_spaces": true,
|
931 |
"eos_token": "</s>",
|
932 |
"extra_ids": 100,
|
933 |
-
"max_length": 512,
|
934 |
"model_max_length": 512,
|
935 |
"pad_token": "<pad>",
|
936 |
-
"stride": 0,
|
937 |
"tokenizer_class": "T5Tokenizer",
|
938 |
-
"truncation_side": "right",
|
939 |
-
"truncation_strategy": "longest_first",
|
940 |
"unk_token": "<unk>"
|
941 |
}
|
|
|
930 |
"clean_up_tokenization_spaces": true,
|
931 |
"eos_token": "</s>",
|
932 |
"extra_ids": 100,
|
|
|
933 |
"model_max_length": 512,
|
934 |
"pad_token": "<pad>",
|
|
|
935 |
"tokenizer_class": "T5Tokenizer",
|
|
|
|
|
936 |
"unk_token": "<unk>"
|
937 |
}
|
train/data-00000-of-00002.arrow
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ee121506abd0676a5266c0592070a202642789ef9a38fd5876aa403135c44a04
|
3 |
+
size 446399184
|
train/data-00001-of-00002.arrow
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3cc62e5c0f5a5137d80e124fb9156e0a60274f953ed6b95347cf7c32df324d25
|
3 |
+
size 425352424
|
train/dataset_info.json
ADDED
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"builder_name": "csv",
|
3 |
+
"citation": "",
|
4 |
+
"config_name": "default",
|
5 |
+
"dataset_name": "csv",
|
6 |
+
"dataset_size": 472235292,
|
7 |
+
"description": "",
|
8 |
+
"download_checksums": {
|
9 |
+
"/home/ubuntu/wwdrive2/14March/clean_train.csv": {
|
10 |
+
"num_bytes": 369392984,
|
11 |
+
"checksum": null
|
12 |
+
},
|
13 |
+
"/home/ubuntu/wwdrive2/14March/clean_eval.csv": {
|
14 |
+
"num_bytes": 71786407,
|
15 |
+
"checksum": null
|
16 |
+
}
|
17 |
+
},
|
18 |
+
"download_size": 441179391,
|
19 |
+
"features": {
|
20 |
+
"input_ids": {
|
21 |
+
"feature": {
|
22 |
+
"dtype": "int32",
|
23 |
+
"_type": "Value"
|
24 |
+
},
|
25 |
+
"_type": "Sequence"
|
26 |
+
},
|
27 |
+
"attention_mask": {
|
28 |
+
"feature": {
|
29 |
+
"dtype": "int8",
|
30 |
+
"_type": "Value"
|
31 |
+
},
|
32 |
+
"_type": "Sequence"
|
33 |
+
},
|
34 |
+
"labels": {
|
35 |
+
"feature": {
|
36 |
+
"dtype": "int64",
|
37 |
+
"_type": "Value"
|
38 |
+
},
|
39 |
+
"_type": "Sequence"
|
40 |
+
}
|
41 |
+
},
|
42 |
+
"homepage": "",
|
43 |
+
"license": "",
|
44 |
+
"size_in_bytes": 913414683,
|
45 |
+
"splits": {
|
46 |
+
"train": {
|
47 |
+
"name": "train",
|
48 |
+
"num_bytes": 394316099,
|
49 |
+
"num_examples": 4507525,
|
50 |
+
"dataset_name": "csv"
|
51 |
+
},
|
52 |
+
"eval": {
|
53 |
+
"name": "eval",
|
54 |
+
"num_bytes": 77919193,
|
55 |
+
"num_examples": 1127410,
|
56 |
+
"dataset_name": "csv"
|
57 |
+
}
|
58 |
+
},
|
59 |
+
"version": {
|
60 |
+
"version_str": "0.0.0",
|
61 |
+
"major": 0,
|
62 |
+
"minor": 0,
|
63 |
+
"patch": 0
|
64 |
+
}
|
65 |
+
}
|
train/state.json
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_data_files": [
|
3 |
+
{
|
4 |
+
"filename": "data-00000-of-00002.arrow"
|
5 |
+
},
|
6 |
+
{
|
7 |
+
"filename": "data-00001-of-00002.arrow"
|
8 |
+
}
|
9 |
+
],
|
10 |
+
"_fingerprint": "26c810267bb075b4",
|
11 |
+
"_format_columns": null,
|
12 |
+
"_format_kwargs": {},
|
13 |
+
"_format_type": null,
|
14 |
+
"_output_all_columns": false,
|
15 |
+
"_split": "train"
|
16 |
+
}
|