add option for wikipedia
utils.py CHANGED
@@ -112,7 +112,32 @@ def load_hf_dataset(ds_name: str, ds_config: str = None, ds_split: str = "train"
 
     return ds
 
+def download_wikipedia(ds_name, ds_config):
+    ds = load_dataset(ds_name, ds_config, streaming=True, split="train")
+
+    def gen():
+        for example in ds:
+            yield {"text": example["text"]}
+
+    ds2 = Dataset.from_generator(gen)
+
+    chunk_size = 200_000
+
+    filenames = []
+
+    Path("wiki_chunks").mkdir(exist_ok=True)
+
+    for chunk_num, start_idx in enumerate(range(0, len(ds2), chunk_size)):
+        end_idx = min(start_idx + chunk_size, len(ds2))
+
+        temp = ds2.select(range(start_idx, end_idx))
+
+        temp.to_parquet(f"wiki_chunks/chunk_{chunk_num}")
+        filenames.append(f"wiki_chunks/chunk_{chunk_num}")
+
+    return load_dataset("parquet", data_files=filenames, split="train")
+
+
 def get_model_and_tokenizer(model_name: str, optimization_level: str, progress):
     """
     Load the model and tokenizer from the HuggingFace Hub.
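For reference, below is a self-contained sketch of what the added function does when run outside the diff: stream a large dataset (such as Wikipedia), materialize it, and write it back out in parquet chunks so the full corpus never has to sit in a single in-memory table. The imports are assumed to already exist at the top of utils.py; the ".parquet" file extension and the example ds_name/ds_config in the usage snippet are illustrative assumptions, not part of this commit.

from pathlib import Path

from datasets import Dataset, load_dataset


def download_wikipedia(ds_name: str, ds_config: str):
    # Stream so the download starts immediately and the raw dataset is
    # never cached as one giant local copy.
    ds = load_dataset(ds_name, ds_config, streaming=True, split="train")

    def gen():
        # Keep only the raw article text; drop titles, ids, etc.
        for example in ds:
            yield {"text": example["text"]}

    # Materialize the stream into an on-disk Arrow dataset.
    ds2 = Dataset.from_generator(gen)

    chunk_size = 200_000  # examples per parquet file
    filenames = []
    Path("wiki_chunks").mkdir(exist_ok=True)

    for chunk_num, start_idx in enumerate(range(0, len(ds2), chunk_size)):
        end_idx = min(start_idx + chunk_size, len(ds2))
        temp = ds2.select(range(start_idx, end_idx))
        # Assumption: adding a .parquet extension for clarity; the commit
        # itself writes extension-less files, which also works since the
        # parquet builder is named explicitly on reload.
        filename = f"wiki_chunks/chunk_{chunk_num}.parquet"
        temp.to_parquet(filename)
        filenames.append(filename)

    # Reload the chunks as one dataset backed by the parquet files.
    return load_dataset("parquet", data_files=filenames, split="train")


if __name__ == "__main__":
    # Hypothetical usage: "wikipedia"/"20220301.en" is one preprocessed
    # config on the Hub, but any dataset with a "text" column would work.
    wiki = download_wikipedia("wikipedia", "20220301.en")
    print(wiki)

Round-tripping through parquet like this trades one pass of disk I/O for a bounded memory footprint: each select() slice is written and released before the next chunk is built, and the returned dataset memory-maps the parquet files rather than loading them eagerly.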