mtasic85 committed on
Commit
abd5982
1 Parent(s): c62a845

pretrain model

Browse files
scripts/prepare_pretrain_dataset.py CHANGED
@@ -74,6 +74,16 @@ def batch_iterator(name=None):
74
  del dataset
75
  gc.collect()
76
 
 
 
 
 
 
 
 
 
 
 
77
  # code
78
  if name in (None, 'bigcode/the-stack-smol-xs'):
79
  dataset = (
@@ -105,22 +115,42 @@ def batch_iterator(name=None):
105
  del dataset
106
  gc.collect()
107
 
 
 
 
 
 
 
 
 
 
 
108
  # code
109
- if name in (None, 'nampdn-ai/tiny-codes'):
110
  dataset = load_dataset(name, split='train')
111
-
112
  for row in dataset:
113
- yield row['prompt'] + '\n' + row['response']
114
-
115
  del dataset
116
  gc.collect()
117
 
118
- # text + code
119
- if name in (None, 'm-a-p/CodeFeedback-Filtered-Instruction'):
120
  dataset = load_dataset(name, split='train')
121
 
122
  for row in dataset:
123
- yield row['query'] + '\n' + row['answer']
 
 
 
 
 
 
 
 
 
 
124
 
125
  del dataset
126
  gc.collect()
@@ -187,9 +217,12 @@ datasets_names = [
187
  'xu-song/cc100-samples',
188
  'ontocord/fineweb-permissive-multilingual-2m',
189
  'nampdn-ai/tiny-textbooks',
190
- 'bigcode/the-stack-smol-xs',
191
  'nampdn-ai/tiny-codes',
 
192
  'm-a-p/CodeFeedback-Filtered-Instruction',
 
 
 
193
  'gair-prox/open-web-math-pro',
194
  'ajibawa-2023/Maths-College',
195
  'microsoft/orca-math-word-problems-200k',
 
74
  del dataset
75
  gc.collect()
76
 
77
+ # code
78
+ if name in (None, 'nampdn-ai/tiny-codes'):
79
+ dataset = load_dataset(name, split='train')
80
+
81
+ for row in dataset:
82
+ yield row['prompt'] + '\n' + row['response']
83
+
84
+ del dataset
85
+ gc.collect()
86
+
87
  # code
88
  if name in (None, 'bigcode/the-stack-smol-xs'):
89
  dataset = (
 
115
  del dataset
116
  gc.collect()
117
 
118
+ # text + code
119
+ if name in (None, 'm-a-p/CodeFeedback-Filtered-Instruction'):
120
+ dataset = load_dataset(name, split='train')
121
+
122
+ for row in dataset:
123
+ yield row['query'] + '\n' + row['answer']
124
+
125
+ del dataset
126
+ gc.collect()
127
+
128
  # code
129
+ if name in (None, 'jtatman/python-code-dataset-500k'):
130
  dataset = load_dataset(name, split='train')
131
+
132
  for row in dataset:
133
+ yield row['instruction'] + '\n' + row['output']
134
+
135
  del dataset
136
  gc.collect()
137
 
138
+ # code
139
+ if name in (None, 'iamtarun/python_code_instructions_18k_alpaca'):
140
  dataset = load_dataset(name, split='train')
141
 
142
  for row in dataset:
143
+ yield row['instruction'] + '\n' + row['input'] + '\n' + row['output']
144
+
145
+ del dataset
146
+ gc.collect()
147
+
148
+ # code
149
+ if name in (None, 'HuggingFaceH4/CodeAlpaca_20K'):
150
+ dataset = load_dataset(name, split='train')
151
+
152
+ for row in dataset:
153
+ yield row['prompt'] + '\n' + row['completion']
154
 
155
  del dataset
156
  gc.collect()
 
217
  'xu-song/cc100-samples',
218
  'ontocord/fineweb-permissive-multilingual-2m',
219
  'nampdn-ai/tiny-textbooks',
 
220
  'nampdn-ai/tiny-codes',
221
+ 'bigcode/the-stack-smol-xs',
222
  'm-a-p/CodeFeedback-Filtered-Instruction',
223
+ 'jtatman/python-code-dataset-500k',
224
+ 'iamtarun/python_code_instructions_18k_alpaca',
225
+ '',
226
  'gair-prox/open-web-math-pro',
227
  'ajibawa-2023/Maths-College',
228
  'microsoft/orca-math-word-problems-200k',
scripts/{model.yaml → pretrain-model.yaml} RENAMED
File without changes