import json
import os

from tqdm import tqdm
from transformers import AutoTokenizer

# Number of pre-split wikipedia JSON shards produced by the upstream
# segmentation step (seq64 shards 0..297).
NUM_SHARDS = 298

INPUT_DIR = 'wikipedia_json_64_filtered'
OUTPUT_DIR = 'wikipedia'


def main() -> None:
    """Decode GPT-2 token-id shards back into plain-text files.

    Reads each JSON shard (a list of rows, each with a 'gpt2_token' list),
    batch-decodes the token ids with the GPT-2 tokenizer, and writes one
    stripped text line per row into ``wikipedia/{i}.txt``.
    """
    tokenizer = AutoTokenizer.from_pretrained('gpt2')

    # The original script assumed the output directory already existed and
    # crashed otherwise; create it up front.
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    for i in tqdm(range(NUM_SHARDS)):
        shard_path = f'{INPUT_DIR}/wikipedia.segmented.nltk.split.seq64.{i}.json'
        # Explicit utf-8: relying on the platform default encoding breaks
        # on non-ASCII text under e.g. Windows locales.
        with open(shard_path, 'r', encoding='utf-8') as f:
            rows = json.load(f)

        tokens = [row['gpt2_token'] for row in rows]
        texts = tokenizer.batch_decode(tokens)

        with open(f'{OUTPUT_DIR}/{i}.txt', 'w', encoding='utf-8') as f:
            # writelines batches the many small writes of the original loop;
            # output bytes are identical (one stripped line per decoded row).
            f.writelines(txt.strip() + '\n' for txt in texts)


if __name__ == '__main__':
    main()