spam-indobert / src /data_utils.py
yondikavl's picture
Upload folder using huggingface_hub
932d265 verified
raw
history blame contribute delete
293 Bytes
import re
# Precompiled once at import time so the patterns are named and reused.
_URL_PATTERN = re.compile(r'http\S+')
_NUMBER_PATTERN = re.compile(r'\d+')


def preprocess_text(text):
    """Normalize a raw message before tokenization.

    The text is lowercased and stripped of leading/trailing whitespace,
    then every run of non-space characters starting at ``http`` is
    replaced with ``[URL]`` and every run of digits with ``[NUM]``.

    Args:
        text: The raw message. ``None`` is treated as an empty message.

    Returns:
        The normalized string.
    """
    if text is None:
        # A missing text field should not abort a whole preprocessing run.
        return ""

    text = text.lower().strip()
    # URLs are replaced first so digits inside them are not turned into
    # separate [NUM] placeholders.
    text = _URL_PATTERN.sub('[URL]', text)
    text = _NUMBER_PATTERN.sub('[NUM]', text)
    return text
def tokenize_function(tokenizer, batch, *, max_length=512):
    """Tokenize the ``"text"`` field of a batch with truncation enabled.

    Args:
        tokenizer: A callable invoked as
            ``tokenizer(texts, truncation=True, max_length=...)``.
        batch: A mapping whose ``"text"`` entry holds the input to tokenize.
        max_length: Value forwarded to the tokenizer as ``max_length``.
            Defaults to 512, the limit this function previously hard-coded.

    Returns:
        Whatever the tokenizer call returns, unmodified.
    """
    return tokenizer(batch["text"], truncation=True, max_length=max_length)