# model_3ed0k4/src/data.py
# Run after the preprocessing step has saved processed_data.json.
import json
import os

from utils import build_vocab, load_data, save_vocab, tokenize
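
# NOTE: load_data is assumed to read the entries saved earlier as
# processed_data.json (the comment above implies that file already exists).
# A minimal, purely illustrative stand-in -- the path and behavior here are
# assumptions, and the real utils.load_data may differ:
def _example_load_data(path='data/processed/processed_data.json'):
    """Assumed behavior: load the preprocessed entries from JSON."""
    with open(path, encoding='utf-8') as f:
        return json.load(f)
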
def prepare_training_data(processed_data, vocab_path='vocab.json'):
    """Tokenize every string entry (and every string inside nested lists),
    then build a vocabulary over all tokens and save it to vocab_path."""
    tokenized_texts = []
    for entry in processed_data:
        if isinstance(entry, str):
            tokenized_texts.append(tokenize(entry))
        elif isinstance(entry, list):
            # Flatten one level: tokenize each string item in the sublist.
            for item in entry:
                if isinstance(item, str):
                    tokenized_texts.append(tokenize(item))
    vocab = build_vocab(tokenized_texts)
    save_vocab(vocab, vocab_path)
    return tokenized_texts, vocab
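
# --- Reference sketches (illustrative only) ---
# Minimal stand-ins for the helpers this script imports from utils, inferred
# from how they are used above. The _example_ prefix keeps them from shadowing
# the real imports; the actual utils implementations may well differ.

def _example_tokenize(text):
    """Assumed behavior: lowercase whitespace tokenization."""
    return text.lower().split()

def _example_build_vocab(tokenized_texts):
    """Assumed behavior: map each unique token to an integer id,
    reserving ids for padding and unknown tokens."""
    vocab = {'<pad>': 0, '<unk>': 1}
    for tokens in tokenized_texts:
        for token in tokens:
            vocab.setdefault(token, len(vocab))
    return vocab

def _example_save_vocab(vocab, path):
    """Assumed behavior: dump the token-to-id mapping to JSON."""
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(vocab, f, ensure_ascii=False, indent=4)
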
if __name__ == "__main__":
    data = load_data()
    tokenized_texts, vocab = prepare_training_data(data)

    # Save the tokenized data, creating the output directory if needed.
    os.makedirs('data/processed', exist_ok=True)
    with open('data/processed/tokenized_data.json', 'w', encoding='utf-8') as f:
        json.dump(tokenized_texts, f, ensure_ascii=False, indent=4)
    print("Data processing complete. Tokenized data saved to data/processed/tokenized_data.json")