Spaces:
Runtime error
Runtime error
import os | |
import pandas as pd | |
import tqdm | |
def read_parquet_dataset_from_local(path_to_dataset: str, start_from: int = 0, | |
num_parts_to_read: int = 1, columns=None, verbose=False) -> pd.DataFrame: | |
""" | |
Reads num_parts_to_read parquet partitions and returns the resulting pd.DataFrame | |
:param path_to_dataset: directory with parquet partitions | |
:param start_from: partition number to start with | |
:param num_parts_to_read: amount of partitions to read | |
:param columns: columns to read and include | |
:return: pd.DataFrame | |
""" | |
res = [] | |
dataset_paths = sorted([os.path.join(path_to_dataset, filename) for filename in os.listdir(path_to_dataset) | |
if filename.startswith('part')]) | |
start_from = max(0, start_from) | |
if num_parts_to_read < 0: | |
chunks = dataset_paths[start_from: ] | |
else: | |
chunks = dataset_paths[start_from: start_from + num_parts_to_read] | |
if verbose: | |
print('Reading chunks:\n') | |
for chunk in chunks: | |
print(chunk) | |
for chunk_path in tqdm.tqdm_notebook(chunks, desc="Reading dataset with pandas"): | |
chunk = pd.read_parquet(chunk_path, columns=columns) | |
for col_name, col_type in [('amnt', 'float32'), ('hour_diff', 'int32')]: | |
if col_name in chunk.columns: | |
chunk[col_name] = chunk[col_name].astype(col_type) | |
res.append(chunk) | |
return pd.concat(res).reset_index(drop=True) | |