DeepActionPotential committed on
Commit
20b3b12
·
verified ·
1 Parent(s): cec02aa

Update utils.py

Browse files
Files changed (1) hide show
  1. utils.py +60 -50
utils.py CHANGED
@@ -1,50 +1,60 @@
1
-
2
- import joblib
3
-
4
- import re
5
- import string
6
- from nltk.corpus import stopwords
7
-
8
-
9
-
10
- def load_model(model_path):
11
- """
12
- Load a joblib model
13
-
14
- Args:
15
- - model_path (str): path to the model
16
-
17
- Returns:
18
- - model: loaded model
19
- """
20
- model = joblib.load(model_path)
21
- return model
22
-
23
-
24
-
25
- # Set of English stopwords
26
- stop_words = set(stopwords.words('english'))
27
-
28
- def preprocess_text(text:str):
29
- # Step 1: Lowercase
30
- text = text.lower()
31
-
32
- # Step 2: Strip extra whitespace
33
- text = re.sub(r'\s+', ' ', text.strip())
34
-
35
- # Step 3: Remove punctuation
36
- text = text.translate(str.maketrans('', '', string.punctuation))
37
-
38
- # Step 4: Remove stopwords
39
- text = ' '.join(word for word in text.split() if word not in stop_words)
40
-
41
- # Step 5: Remove noise (URLs, emails, hashtags, mentions, numbers, non-printables)
42
- text = re.sub(r'http\S+|www\.\S+', '', text) # URLs
43
- text = re.sub(r'\S+@\S+\.\S+', '', text) # Emails
44
- text = re.sub(r'#[A-Za-z0-9_]+', '', text) # Hashtags
45
- text = re.sub(r'@[A-Za-z0-9_]+', '', text) # Mentions
46
- text = re.sub(r'\d+', '', text) # Numbers
47
- text = ''.join(ch for ch in text if ch.isprintable()) # Non-printables
48
-
49
- return text
50
-
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import joblib
3
+
4
+ import re
5
+ import string
6
+
7
+ import nltk
8
+
9
# Ensure the NLTK 'stopwords' corpus is present before importing it below;
# download it once (quietly) on first run instead of crashing with LookupError.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords', quiet=True)


from nltk.corpus import stopwords
16
+
17
+
18
+
19
+
20
def load_model(model_path):
    """
    Load a serialized model from disk.

    Args:
    - model_path (str): path to the joblib file

    Returns:
    - model: the deserialized model object
    """
    return joblib.load(model_path)
32
+
33
+
34
+
35
# Stopword filter used by preprocess_text, built once at import time;
# a set gives O(1) membership tests inside the per-word filter loop.
stop_words = set(stopwords.words('english'))
37
+
38
def preprocess_text(text: str) -> str:
    """
    Clean raw text for modeling.

    Order matters: URL/email/hashtag/mention removal must run BEFORE
    punctuation stripping, because those patterns are recognized by
    punctuation markers ('://', '@', '#', '.') that the punctuation
    step destroys. (The previous ordering made the email/hashtag/
    mention regexes dead code and left URL bodies behind as fused
    tokens.)

    Args:
    - text (str): raw input text

    Returns:
    - str: lowercased text with noise (URLs, emails, hashtags,
      mentions, numbers, non-printables), punctuation and English
      stopwords removed, single-space separated
    """
    # Step 1: Lowercase
    text = text.lower()

    # Step 2: Remove noise while its marker characters still exist
    text = re.sub(r'http\S+|www\.\S+', '', text)   # URLs
    text = re.sub(r'\S+@\S+\.\S+', '', text)       # Emails
    text = re.sub(r'#[A-Za-z0-9_]+', '', text)     # Hashtags
    text = re.sub(r'@[A-Za-z0-9_]+', '', text)     # Mentions
    text = re.sub(r'\d+', '', text)                # Numbers
    text = ''.join(ch for ch in text if ch.isprintable())  # Non-printables

    # Step 3: Remove remaining punctuation in one C-level pass
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Step 4: Remove stopwords; split()/join also collapses the extra
    # whitespace left behind by the deletions above
    text = ' '.join(word for word in text.split() if word not in stop_words)

    return text
60
+