# src/utils.py
import json
import re
from collections import Counter


def tokenize(text):
"""
Simple tokenizer that splits text into tokens based on whitespace and punctuation.
"""
tokens = re.findall(r'\b\w+\b', text.lower())
return tokens
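
# Usage sketch (illustrative, not part of the original file): the regex keeps only
# word characters, so punctuation is dropped and everything is lowercased, e.g.
#   tokenize("Hello, world! Hello.")  ->  ['hello', 'world', 'hello']

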
def build_vocab(tokenized_texts, min_freq=2):
"""
Builds a vocabulary dictionary from tokenized texts.
Tokens appearing fewer than `min_freq` times are excluded.
"""
counter = Counter()
for tokens in tokenized_texts:
counter.update(tokens)
vocab = {'<PAD>': 0, '<UNK>': 1}
for word, freq in counter.items():
if freq >= min_freq:
vocab[word] = len(vocab)
return vocab
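
# Usage sketch (illustrative): with min_freq=2, only tokens seen at least twice
# get their own id; everything else is later mapped to '<UNK>' by the caller, e.g.
#   texts = [tokenize("the cat sat"), tokenize("the dog sat")]
#   build_vocab(texts, min_freq=2)  ->  {'<PAD>': 0, '<UNK>': 1, 'the': 2, 'sat': 3}

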
def save_vocab(vocab, filepath='vocab.json'):
"""
Saves the vocabulary dictionary to a JSON file.
"""
with open(filepath, 'w', encoding='utf-8') as f:
        json.dump(vocab, f, ensure_ascii=False, indent=4)


def load_vocab(filepath='vocab.json'):
"""
Loads the vocabulary dictionary from a JSON file.
"""
with open(filepath, 'r', encoding='utf-8') as f:
return json.load(f)
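

if __name__ == "__main__":
    # Minimal end-to-end sketch (not part of the original module): build a vocab
    # from a couple of sentences, write it to disk, and read it back.
    texts = ["The cat sat on the mat.", "The dog sat on the log."]
    tokenized = [tokenize(t) for t in texts]
    vocab = build_vocab(tokenized, min_freq=2)
    save_vocab(vocab, 'vocab.json')
    assert load_vocab('vocab.json') == vocab
    print(vocab)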