import re

def preprocess_text(text):
    # Normalize case and strip surrounding whitespace.
    text = text.lower().strip()
    # Replace URLs and digit runs with placeholder tokens to cut vocabulary noise.
    text = re.sub(r'http\S+', '[URL]', text)
    text = re.sub(r'\d+', '[NUM]', text)
    return text

def tokenize_function(tokenizer, batch):
    # Tokenize a batch of examples, truncating to a 512-token maximum.
    return tokenizer(batch["text"], truncation=True, max_length=512)
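
# Example usage (not part of the original file): a minimal sketch assuming the
# Hugging Face `transformers` and `datasets` libraries, with a hypothetical
# "bert-base-uncased" checkpoint and a tiny in-memory dataset for illustration.
if __name__ == "__main__":
    from datasets import Dataset
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

    # Clean each example's text before tokenizing.
    raw = Dataset.from_dict({"text": ["Visit https://example.com for 42 tips!"]})
    cleaned = raw.map(lambda ex: {"text": preprocess_text(ex["text"])})

    # Batched map applies the tokenizer across the whole dataset.
    tokenized = cleaned.map(
        lambda batch: tokenize_function(tokenizer, batch), batched=True
    )
    print(tokenized[0]["input_ids"][:10])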