# src/data_processing.py
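"""Load raw .txt, .pdf, .json, and .csv files, tokenize the extracted text,
build a vocabulary, and save the processed artifacts for training."""
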
import os
import json
from pdfminer.high_level import extract_text
import pandas as pd
from utils import tokenize, build_vocab, save_vocab


def read_txt(file_path):
    """Return the contents of a plain-text file."""
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read()


def read_pdf(file_path):
    """Extract text from a PDF via pdfminer."""
    return extract_text(file_path)


def read_json(file_path):
    """Parse a JSON file and return the deserialized object."""
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)


def read_csv(file_path):
    """Concatenate every cell of a CSV into a single whitespace-joined string."""
    df = pd.read_csv(file_path)
    text = ' '.join(df.astype(str).values.flatten())
    return text


def process_file(file_path):
    """Dispatch to the matching reader based on file extension; return None if unsupported."""
    _, ext = os.path.splitext(file_path)
    ext = ext.lower()
    if ext == '.txt':
        return read_txt(file_path)
    elif ext == '.pdf':
        return read_pdf(file_path)
    elif ext == '.json':
        return read_json(file_path)
    elif ext == '.csv':
        return read_csv(file_path)
    else:
        print(f"Unsupported file format: {ext}")
        return None


def load_data(raw_data_dir='data/raw'):
    """Walk raw_data_dir recursively and collect the content of every supported file."""
    all_data = []
    for root, dirs, files in os.walk(raw_data_dir):
        for file in files:
            file_path = os.path.join(root, file)
            data = process_file(file_path)
            if data:  # skip unsupported files and empty results
                all_data.append(data)
    return all_data


def prepare_training_data(processed_data, vocab_path='vocab.json'):
    """Tokenize every string entry, build a vocabulary, and save it to vocab_path.

    Note: only strings and lists of strings are tokenized; JSON entries that
    parse to dicts are ignored here.
    """
    tokenized_texts = []
    for entry in processed_data:
        if isinstance(entry, str):
            tokens = tokenize(entry)
            tokenized_texts.append(tokens)
        elif isinstance(entry, list):
            for item in entry:
                if isinstance(item, str):
                    tokens = tokenize(item)
                    tokenized_texts.append(tokens)
    vocab = build_vocab(tokenized_texts)
    save_vocab(vocab, vocab_path)
    return tokenized_texts, vocab


def save_tokenized_data(tokenized_texts, filepath='data/processed/tokenized_data.json'):
    os.makedirs(os.path.dirname(filepath) or '.', exist_ok=True)
    with open(filepath, 'w', encoding='utf-8') as f:
        json.dump(tokenized_texts, f, ensure_ascii=False, indent=4)


def save_processed_data(processed_data, filepath='data/processed/processed_data.json'):
    os.makedirs(os.path.dirname(filepath) or '.', exist_ok=True)
    with open(filepath, 'w', encoding='utf-8') as f:
        json.dump(processed_data, f, ensure_ascii=False, indent=4)
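

# Expected layout, based on the defaults above: raw inputs under data/raw/,
# tokenized output at data/processed/tokenized_data.json, processed text at
# data/processed/processed_data.json, and the vocabulary at vocab.json.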


if __name__ == "__main__":
    print("Loading raw data...")
    data = load_data()
    print(f"Loaded {len(data)} data entries.")
    print("Preparing training data...")
    tokenized_texts, vocab = prepare_training_data(data)
    save_tokenized_data(tokenized_texts)
    save_processed_data(data)
    print("Data processing complete.")
    print(f"Vocabulary size: {len(vocab)}")