Feat(data): Allow loading local csv and text (#594)
* Feat(data): Allow loading local csv and text
* chore: update README for loading local data files
- README.md +4 -4
- src/axolotl/utils/data.py +4 -0
README.md
CHANGED
@@ -434,10 +434,10 @@ datasets:
|
|
434 |
- path: vicgalle/alpaca-gpt4
|
435 |
# The type of prompt to use for training. [alpaca, sharegpt, gpteacher, oasst, reflection]
|
436 |
type: alpaca # format | format:<prompt_style> (chat/instruct) | <prompt_strategies>.load_<load_fn>
|
437 |
-
ds_type: # Optional[str] (json|arrow|parquet) defines the datatype when path is a file
|
438 |
-
data_files: # path to source data files
|
439 |
-
shards: # number of shards to split data into
|
440 |
-
name: # name of dataset configuration to load
|
441 |
|
442 |
# custom user prompt
|
443 |
- path: repo
|
|
|
434 |
- path: vicgalle/alpaca-gpt4
|
435 |
# The type of prompt to use for training. [alpaca, sharegpt, gpteacher, oasst, reflection]
|
436 |
type: alpaca # format | format:<prompt_style> (chat/instruct) | <prompt_strategies>.load_<load_fn>
|
437 |
+
ds_type: # Optional[str] (json|arrow|parquet|text|csv) defines the datatype when path is a file
|
438 |
+
data_files: # Optional[str] path to source data files
|
439 |
+
shards: # Optional[int] number of shards to split data into
|
440 |
+
name: # Optional[str] name of dataset configuration to load
|
441 |
|
442 |
# custom user prompt
|
443 |
- path: repo
|
src/axolotl/utils/data.py
CHANGED
@@ -183,6 +183,10 @@ def load_tokenized_prepared_datasets(
|
|
183 |
ds_type = "parquet"
|
184 |
elif ".arrow" in d.path:
|
185 |
ds_type = "arrow"
|
|
|
|
|
|
|
|
|
186 |
ds = load_dataset(
|
187 |
ds_type,
|
188 |
name=d.name,
|
|
|
183 |
ds_type = "parquet"
|
184 |
elif ".arrow" in d.path:
|
185 |
ds_type = "arrow"
|
186 |
+
elif ".csv" in d.path:
|
187 |
+
ds_type = "csv"
|
188 |
+
elif ".txt" in d.path:
|
189 |
+
ds_type = "text"
|
190 |
ds = load_dataset(
|
191 |
ds_type,
|
192 |
name=d.name,
|