# src/utils.py
import json
import re
from collections import Counter


def tokenize(text):
"""
Simple tokenizer that splits text into tokens based on whitespace and punctuation.
"""
tokens = re.findall(r'\b\w+\b', text.lower())
return tokens
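
# Usage sketch (illustrative, not part of the original file): the regex keeps only
# word characters, so punctuation is dropped and everything is lowercased, e.g.
#   tokenize("Hello, world! Hello.")  ->  ['hello', 'world', 'hello']

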
def build_vocab(tokenized_texts, min_freq=2):
"""
Builds a vocabulary dictionary from tokenized texts.
Tokens appearing fewer than `min_freq` times are excluded.
"""
counter = Counter()
for tokens in tokenized_texts:
counter.update(tokens)
vocab = {'<PAD>': 0, '<UNK>': 1}
for word, freq in counter.items():
if freq >= min_freq:
vocab[word] = len(vocab)
return vocab
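
# Usage sketch (illustrative): with min_freq=2, only tokens seen at least twice
# get their own id; everything else is later mapped to '<UNK>' by the caller, e.g.
#   texts = [tokenize("the cat sat"), tokenize("the dog sat")]
#   build_vocab(texts, min_freq=2)  ->  {'<PAD>': 0, '<UNK>': 1, 'the': 2, 'sat': 3}

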
def save_vocab(vocab, filepath='vocab.json'):
"""
Saves the vocabulary dictionary to a JSON file.
"""
with open(filepath, 'w', encoding='utf-8') as f:
        json.dump(vocab, f, ensure_ascii=False, indent=4)


def load_vocab(filepath='vocab.json'):
"""
Loads the vocabulary dictionary from a JSON file.
"""
with open(filepath, 'r', encoding='utf-8') as f:
return json.load(f)
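

if __name__ == "__main__":
    # Minimal end-to-end sketch (not part of the original module): build a vocab
    # from a couple of sentences, write it to disk, and read it back.
    texts = ["The cat sat on the mat.", "The dog sat on the log."]
    tokenized = [tokenize(t) for t in texts]
    vocab = build_vocab(tokenized, min_freq=2)
    save_vocab(vocab, 'vocab.json')
    assert load_vocab('vocab.json') == vocab
    print(vocab)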