from datasets import load_dataset


def get_data():
    """Print the first record of the ``codeparrot/github-code`` train split.

    The dataset is requested with ``streaming=True`` and exactly one item
    is pulled from an iterator over it. That item is written to stdout
    with ``print``; the function itself returns ``None``.
    """
    stream = load_dataset(
        "codeparrot/github-code",
        split="train",
        streaming=True,
    )
    # Take a single element rather than looping over the whole stream.
    first_record = next(iter(stream))
    print(first_record)