Upload loaders.py with huggingface_hub
loaders.py  +25 -1
loaders.py
CHANGED
@@ -1,3 +1,27 @@
+"""This section describes unitxt loaders.
+
+Loaders: Generators of Unitxt Multistreams from existing data sources
+======================================================================
+
+Unitxt is all about readily preparing any given data source for feeding into any given language model, and then
+postprocessing the model's output, preparing it for any given evaluator.
+
+Through that journey, the data advances in the form of a Unitxt Multistream, undergoing a sequential application
+of various off-the-shelf operators (i.e., picked from the Unitxt catalog), or operators easily implemented through inheritance.
+The journey starts with a Unitxt Loader bearing a Multistream from the given data source.
+A loader, therefore, is the first item in any Unitxt Recipe.
+
+The Unitxt catalog contains several loaders for the most popular data-source formats.
+All these loaders inherit from Loader, and hence implementing a loader to cover a new type of data source is
+straightforward.
+
+Loaders in the Unitxt catalog:
+LoadHF: loads from a Huggingface dataset.
+LoadCSV: loads from CSV (comma-separated values) files.
+LoadFromKaggle: loads datasets from the kaggle.com community site.
+LoadFromIBMCloud: loads a dataset from the IBM Cloud.
+------------------------
+"""
 import importlib
 import itertools
 import os
@@ -31,7 +55,7 @@ class Loader(SourceOperator):
     # The loader can use this value to limit the amount of data downloaded from the source
     # to reduce loading time. However, this may not always be possible, so the
     # loader may ignore this. In any case, the recipe will limit the number of instances in the returned
-    # stream
+    # stream, after load is complete.
     loader_limit: int = None
     pass
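Regarding the loader_limit comment adjusted in this hunk, a hedged illustration (editorial, not part of the commit): it assumes loader_limit can be passed as a constructor argument to any Loader subclass, LoadHF included; a loader that cannot truncate the download simply ignores the hint, and the recipe caps the number of instances in the returned stream after loading completes.

# Hedged sketch: request at most ~200 instances per split if the loader can
# honor it; otherwise the full split is downloaded and the recipe limits the
# returned stream afterwards.
from unitxt.loaders import LoadHF

limited_loader = LoadHF(path="squad", loader_limit=200)  # example dataset and limit

for split_name, stream in limited_loader().items():
    print(split_name, sum(1 for _ in stream))  # count what the stream actually yields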