fisherman611 committed
Commit 5069f64 · verified · 1 parent: b6fc713

Update utils/text_processor.py

Files changed (1):
1. utils/text_processor.py (+110 -109)
utils/text_processor.py CHANGED
@@ -1,109 +1,110 @@
-import re
-import pandas as pd
-from typing import List, Set
-from underthesea import word_tokenize
-from config import Config
-
-
-class VietnameseTextProcessor:
-    """Vietnamese text processing utilities for legal documents"""
-
-    def __init__(self):
-        self.stopwords = self._load_stopwords()
-
-    def _load_stopwords(self) -> Set[str]:
-        """Load Vietnamese stopwords from file"""
-        try:
-            # Try UTF-8 first
-            with open(Config.STOPWORDS_PATH, "r", encoding="utf-8") as f:
-                stopwords = set(line.strip() for line in f if line.strip())
-            return stopwords
-        except UnicodeDecodeError:
-            try:
-                # Try UTF-16 if UTF-8 fails
-                with open(Config.STOPWORDS_PATH, "r", encoding="utf-16") as f:
-                    stopwords = set(line.strip() for line in f if line.strip())
-                return stopwords
-            except UnicodeDecodeError:
-                try:
-                    # Try with BOM detection
-                    with open(Config.STOPWORDS_PATH, "r", encoding="utf-8-sig") as f:
-                        stopwords = set(line.strip() for line in f if line.strip())
-                    return stopwords
-                except UnicodeDecodeError:
-                    print(
-                        f"Warning: Unable to decode stopwords file at {Config.STOPWORDS_PATH}"
-                    )
-                    return set()
-        except FileNotFoundError:
-            print(f"Warning: Stopwords file not found at {Config.STOPWORDS_PATH}")
-            return set()
-        except Exception as e:
-            print(f"Warning: Error loading stopwords file: {e}")
-            return set()
-
-    def clean_text(self, text: str) -> str:
-        """Clean Vietnamese text for processing"""
-        if not text:
-            return ""
-
-        # Remove extra whitespace and normalize
-        text = re.sub(r"\s+", " ", text.strip())
-
-        # Remove special characters but keep Vietnamese characters
-        text = re.sub(
-            r"[^\w\s\-\.\,\;\:\!\?\(\)\[\]\"\'àáảãạăắằẳẵặâấầẩẫậèéẻẽẹêếềểễệìíỉĩịòóỏõọôốồổỗộơớờởỡợùúủũụưứừửữựỳýỷỹỵđĐ]",
-            " ",
-            text,
-        )
-
-        # Remove multiple spaces
-        text = re.sub(r"\s+", " ", text.strip())
-
-        return text
-
-    def tokenize(self, text: str) -> List[str]:
-        """Tokenize Vietnamese text using underthesea"""
-        try:
-            cleaned_text = self.clean_text(text)
-            tokens = word_tokenize(cleaned_text, format="text").split()
-            return tokens
-        except Exception as e:
-            print(f"Error tokenizing text: {e}")
-            return text.split()
-
-    def remove_stopwords(self, tokens: List[str]) -> List[str]:
-        """Remove stopwords from token list"""
-        return [token for token in tokens if token.lower() not in self.stopwords]
-
-    def preprocess_for_search(self, text: str) -> str:
-        """Preprocess text for search - tokenize and remove stopwords"""
-        tokens = self.tokenize(text)
-        filtered_tokens = self.remove_stopwords(tokens)
-        return " ".join(filtered_tokens)
-
-    def extract_keywords(self, text: str, min_length: int = 2) -> List[str]:
-        """Extract keywords from text"""
-        tokens = self.tokenize(text)
-        filtered_tokens = self.remove_stopwords(tokens)
-        keywords = [token for token in filtered_tokens if len(token) >= min_length]
-        return list(set(keywords)) # Remove duplicates
-
-    def chunk_text(
-        self, text: str, chunk_size: int = None, overlap: int = None
-    ) -> List[str]:
-        """Split text into chunks with overlap"""
-        if chunk_size is None:
-            chunk_size = Config.CHUNK_SIZE
-        if overlap is None:
-            overlap = Config.CHUNK_OVERLAP
-
-        tokens = self.tokenize(text)
-        chunks = []
-
-        for i in range(0, len(tokens), chunk_size - overlap):
-            chunk_tokens = tokens[i : i + chunk_size]
-            if chunk_tokens:
-                chunks.append(" ".join(chunk_tokens))
-
-        return chunks
 
 
+import re
+import pandas as pd
+from typing import List, Set
+from underthesea import word_tokenize
+from config import Config
+
+
+class VietnameseTextProcessor:
+    """Vietnamese text processing utilities for legal documents"""
+
+    def __init__(self):
+        self.stopwords = self._load_stopwords()
+
+    def _load_stopwords(self) -> Set[str]:
+        """Load Vietnamese stopwords from file"""
+        try:
+            # Try UTF-8 first
+            with open(Config.STOPWORDS_PATH, "r", encoding="utf-8") as f:
+                stopwords = set(line.strip() for line in f if line.strip())
+            stopwords = set(['_'.join(word.split()) for word in list(stopwords)])
+            return stopwords
+        except UnicodeDecodeError:
+            try:
+                # Try UTF-16 if UTF-8 fails
+                with open(Config.STOPWORDS_PATH, "r", encoding="utf-16") as f:
+                    stopwords = set(line.strip() for line in f if line.strip())
+                return stopwords
+            except UnicodeDecodeError:
+                try:
+                    # Try with BOM detection
+                    with open(Config.STOPWORDS_PATH, "r", encoding="utf-8-sig") as f:
+                        stopwords = set(line.strip() for line in f if line.strip())
+                    return stopwords
+                except UnicodeDecodeError:
+                    print(
+                        f"Warning: Unable to decode stopwords file at {Config.STOPWORDS_PATH}"
+                    )
+                    return set()
+        except FileNotFoundError:
+            print(f"Warning: Stopwords file not found at {Config.STOPWORDS_PATH}")
+            return set()
+        except Exception as e:
+            print(f"Warning: Error loading stopwords file: {e}")
+            return set()
+
+    def clean_text(self, text: str) -> str:
+        """Clean Vietnamese text for processing"""
+        if not text:
+            return ""
+
+        # Remove extra whitespace and normalize
+        text = re.sub(r"\s+", " ", text.strip())
+
+        # Remove special characters but keep Vietnamese characters
+        text = re.sub(
+            r"[^\w\s\-\.\,\;\:\!\?\(\)\[\]\"\'àáảãạăắằẳẵặâấầẩẫậèéẻẽẹêếềểễệìíỉĩịòóỏõọôốồổỗộơớờởỡợùúủũụưứừửữựỳýỷỹỵđĐ]",
+            " ",
+            text,
+        )
+
+        # Remove multiple spaces
+        text = re.sub(r"\s+", " ", text.strip())
+
+        return text
+
+    def tokenize(self, text: str) -> List[str]:
+        """Tokenize Vietnamese text using underthesea"""
+        try:
+            cleaned_text = self.clean_text(text)
+            tokens = word_tokenize(cleaned_text, format="text").split()
+            return tokens
+        except Exception as e:
+            print(f"Error tokenizing text: {e}")
+            return text.split()
+
+    def remove_stopwords(self, tokens: List[str]) -> List[str]:
+        """Remove stopwords from token list"""
+        return [token for token in tokens if token.lower() not in self.stopwords]
+
+    def preprocess_for_search(self, text: str) -> str:
+        """Preprocess text for search - tokenize and remove stopwords"""
+        tokens = self.tokenize(text)
+        filtered_tokens = self.remove_stopwords(tokens)
+        return " ".join(filtered_tokens)
+
+    def extract_keywords(self, text: str, min_length: int = 2) -> List[str]:
+        """Extract keywords from text"""
+        tokens = self.tokenize(text)
+        filtered_tokens = self.remove_stopwords(tokens)
+        keywords = [token for token in filtered_tokens if len(token) >= min_length]
+        return list(set(keywords)) # Remove duplicates
+
+    def chunk_text(
+        self, text: str, chunk_size: int = None, overlap: int = None
+    ) -> List[str]:
+        """Split text into chunks with overlap"""
+        if chunk_size is None:
+            chunk_size = Config.CHUNK_SIZE
+        if overlap is None:
+            overlap = Config.CHUNK_OVERLAP
+
+        tokens = self.tokenize(text)
+        chunks = []
+
+        for i in range(0, len(tokens), chunk_size - overlap):
+            chunk_tokens = tokens[i : i + chunk_size]
+            if chunk_tokens:
+                chunks.append(" ".join(chunk_tokens))
+
+        return chunks
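The only visible change in this commit is the added normalization line in the UTF-8 branch of _load_stopwords: multi-word stopwords read from the file are rewritten with underscores joining their syllables. underthesea's word_tokenize(..., format="text") emits compound words as single underscore-joined tokens, so without this normalization remove_stopwords could never match a multi-word stopword. Note that the UTF-16 and UTF-8-SIG fallback branches still return the raw, space-separated stopwords. A minimal sketch of the effect, using hypothetical stopwords rather than the project's stopword file:

from underthesea import word_tokenize

# Hypothetical multi-word stopwords, as they might appear in the stopwords file.
raw_stopwords = {"tuy nhiên", "vì vậy", "của"}

# The normalization added by this commit: "tuy nhiên" -> "tuy_nhiên".
normalized = {"_".join(word.split()) for word in raw_stopwords}

# format="text" joins compound words with underscores,
# e.g. "Tuy nhiên" typically comes back as "Tuy_nhiên".
tokens = word_tokenize("Tuy nhiên, văn bản vẫn còn hiệu lực.", format="text").split()

before_fix = [t for t in tokens if t.lower() not in raw_stopwords]  # "Tuy_nhiên" survives
after_fix = [t for t in tokens if t.lower() not in normalized]      # "Tuy_nhiên" is removed
print(before_fix, after_fix)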
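For context, a hypothetical end-to-end use of the class, assuming the project root is on sys.path and that config.py defines Config.STOPWORDS_PATH, Config.CHUNK_SIZE, and Config.CHUNK_OVERLAP:

from utils.text_processor import VietnameseTextProcessor

processor = VietnameseTextProcessor()
text = "Người lao động có quyền đơn phương chấm dứt hợp đồng lao động."

print(processor.preprocess_for_search(text))           # underscore-joined tokens, stopwords removed
print(processor.extract_keywords(text, min_length=2))  # unique keywords of length >= 2
print(processor.chunk_text(text, chunk_size=16, overlap=4))  # overlapping 16-token windows

One caveat worth keeping in mind: chunk_text steps by chunk_size - overlap, so it expects chunk_size > overlap. With chunk_size equal to overlap the range() step is zero and raises ValueError, and with chunk_size smaller than overlap the loop silently produces no chunks.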