# src/utils.py
import json
import re
from collections import Counter


def tokenize(text):
    """
    Simple tokenizer: lowercases the text and extracts word tokens
    (runs of letters, digits, or underscores), dropping punctuation
    and whitespace.
    """
    tokens = re.findall(r'\b\w+\b', text.lower())
    return tokens


def build_vocab(tokenized_texts, min_freq=2):
    """
    Builds a vocabulary dictionary from tokenized texts.
    Tokens appearing fewer than `min_freq` times are excluded.
    """
    counter = Counter()
    for tokens in tokenized_texts:
        counter.update(tokens)
    # Reserve index 0 for padding and index 1 for out-of-vocabulary tokens.
    vocab = {'<PAD>': 0, '<UNK>': 1}
    for word, freq in counter.items():
        if freq >= min_freq:
            # len(vocab) is always the next unused index, so ids stay contiguous.
            vocab[word] = len(vocab)
    return vocab


def save_vocab(vocab, filepath='vocab.json'):
    """
    Saves the vocabulary dictionary to a JSON file.
    """
    with open(filepath, 'w', encoding='utf-8') as f:
        json.dump(vocab, f, ensure_ascii=False, indent=4)


def load_vocab(filepath='vocab.json'):
    """
    Loads the vocabulary dictionary from a JSON file.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        return json.load(f)
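

if __name__ == '__main__':
    # Minimal usage sketch: tokenize a couple of sample sentences, build a
    # vocabulary, and round-trip it through JSON. The sample texts and the
    # 'vocab.json' path are illustrative assumptions, not fixtures shipped
    # with this module.
    texts = [
        "The cat sat on the mat.",
        "The dog sat on the log.",
    ]
    tokenized = [tokenize(t) for t in texts]

    # With min_freq=2, only 'the', 'sat', and 'on' survive; they receive
    # ids 2-4 after the reserved '<PAD>' (0) and '<UNK>' (1) entries.
    vocab = build_vocab(tokenized, min_freq=2)
    print(vocab)

    save_vocab(vocab, 'vocab.json')
    assert load_vocab('vocab.json') == vocab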