import re
def preprocess_text(text, *, url_token="[URL]", num_token="[NUM]"):
    """Normalize raw text for tokenization.

    Lowercases and strips surrounding whitespace, then replaces URLs and
    digit runs with placeholder tokens so the model sees a single symbol
    per URL/number instead of an open vocabulary.

    Args:
        text: Input string to normalize.
        url_token: Replacement for URL spans (default "[URL]").
        num_token: Replacement for digit runs (default "[NUM]").

    Returns:
        The normalized string.
    """
    text = text.lower().strip()
    # URLs first: a URL may contain digits, and we want one [URL] token,
    # not a URL with [NUM] fragments embedded in it.
    text = re.sub(r'http\S+', url_token, text)
    text = re.sub(r'\d+', num_token, text)
    return text
def tokenize_function(tokenizer, batch, max_length=512):
    """Tokenize a batch of examples with truncation.

    Args:
        tokenizer: A callable (e.g. a HuggingFace tokenizer) invoked as
            ``tokenizer(texts, truncation=True, max_length=...)``.
        batch: Mapping with a "text" key holding the strings to tokenize.
        max_length: Truncation length; defaults to 512 (the previous
            hard-coded value), now overridable per call.

    Returns:
        Whatever the tokenizer returns (typically a dict of encodings).
    """
    return tokenizer(batch["text"], truncation=True, max_length=max_length)