pretrain model
Browse files
scripts/prepare_pretrain_dataset.py
CHANGED
@@ -74,6 +74,16 @@ def batch_iterator(name=None):
|
|
74 |
del dataset
|
75 |
gc.collect()
|
76 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
77 |
# code
|
78 |
if name in (None, 'bigcode/the-stack-smol-xs'):
|
79 |
dataset = (
|
@@ -105,22 +115,42 @@ def batch_iterator(name=None):
|
|
105 |
del dataset
|
106 |
gc.collect()
|
107 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
108 |
# code
|
109 |
-
if name in (None, '
|
110 |
dataset = load_dataset(name, split='train')
|
111 |
-
|
112 |
for row in dataset:
|
113 |
-
yield row['
|
114 |
-
|
115 |
del dataset
|
116 |
gc.collect()
|
117 |
|
118 |
-
#
|
119 |
-
if name in (None, '
|
120 |
dataset = load_dataset(name, split='train')
|
121 |
|
122 |
for row in dataset:
|
123 |
-
yield row['
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
124 |
|
125 |
del dataset
|
126 |
gc.collect()
|
@@ -187,9 +217,12 @@ datasets_names = [
|
|
187 |
'xu-song/cc100-samples',
|
188 |
'ontocord/fineweb-permissive-multilingual-2m',
|
189 |
'nampdn-ai/tiny-textbooks',
|
190 |
-
'bigcode/the-stack-smol-xs',
|
191 |
'nampdn-ai/tiny-codes',
|
|
|
192 |
'm-a-p/CodeFeedback-Filtered-Instruction',
|
|
|
|
|
|
|
193 |
'gair-prox/open-web-math-pro',
|
194 |
'ajibawa-2023/Maths-College',
|
195 |
'microsoft/orca-math-word-problems-200k',
|
|
|
74 |
del dataset
|
75 |
gc.collect()
|
76 |
|
77 |
+
# code
|
78 |
+
if name in (None, 'nampdn-ai/tiny-codes'):
|
79 |
+
dataset = load_dataset(name, split='train')
|
80 |
+
|
81 |
+
for row in dataset:
|
82 |
+
yield row['prompt'] + '\n' + row['response']
|
83 |
+
|
84 |
+
del dataset
|
85 |
+
gc.collect()
|
86 |
+
|
87 |
# code
|
88 |
if name in (None, 'bigcode/the-stack-smol-xs'):
|
89 |
dataset = (
|
|
|
115 |
del dataset
|
116 |
gc.collect()
|
117 |
|
118 |
+
# text + code
|
119 |
+
if name in (None, 'm-a-p/CodeFeedback-Filtered-Instruction'):
|
120 |
+
dataset = load_dataset(name, split='train')
|
121 |
+
|
122 |
+
for row in dataset:
|
123 |
+
yield row['query'] + '\n' + row['answer']
|
124 |
+
|
125 |
+
del dataset
|
126 |
+
gc.collect()
|
127 |
+
|
128 |
# code
|
129 |
+
if name in (None, 'jtatman/python-code-dataset-500k'):
|
130 |
dataset = load_dataset(name, split='train')
|
131 |
+
|
132 |
for row in dataset:
|
133 |
+
yield row['instruction'] + '\n' + row['output']
|
134 |
+
|
135 |
del dataset
|
136 |
gc.collect()
|
137 |
|
138 |
+
# code
|
139 |
+
if name in (None, 'iamtarun/python_code_instructions_18k_alpaca'):
|
140 |
dataset = load_dataset(name, split='train')
|
141 |
|
142 |
for row in dataset:
|
143 |
+
yield row['instruction'] + '\n' + row['input'] + '\n' + row['output']
|
144 |
+
|
145 |
+
del dataset
|
146 |
+
gc.collect()
|
147 |
+
|
148 |
+
# code
|
149 |
+
if name in (None, 'HuggingFaceH4/CodeAlpaca_20K'):
|
150 |
+
dataset = load_dataset(name, split='train')
|
151 |
+
|
152 |
+
for row in dataset:
|
153 |
+
yield row['prompt'] + '\n' + row['completion']
|
154 |
|
155 |
del dataset
|
156 |
gc.collect()
|
|
|
217 |
'xu-song/cc100-samples',
|
218 |
'ontocord/fineweb-permissive-multilingual-2m',
|
219 |
'nampdn-ai/tiny-textbooks',
|
|
|
220 |
'nampdn-ai/tiny-codes',
|
221 |
+
'bigcode/the-stack-smol-xs',
|
222 |
'm-a-p/CodeFeedback-Filtered-Instruction',
|
223 |
+
'jtatman/python-code-dataset-500k',
|
224 |
+
'iamtarun/python_code_instructions_18k_alpaca',
|
225 |
+
'HuggingFaceH4/CodeAlpaca_20K',
|
226 |
'gair-prox/open-web-math-pro',
|
227 |
'ajibawa-2023/Maths-College',
|
228 |
'microsoft/orca-math-word-problems-200k',
|
scripts/{model.yaml → pretrain-model.yaml}
RENAMED
File without changes
|