wiizm committed on
Commit
730c79f
ยท
verified ยท
1 Parent(s): 026d511

Upload app\utils\text_utils.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. app//utils//text_utils.py +197 -0
app//utils//text_utils.py ADDED
@@ -0,0 +1,197 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ํ…์ŠคํŠธ ์ฒ˜๋ฆฌ ์œ ํ‹ธ๋ฆฌํ‹ฐ ํ•จ์ˆ˜
3
+ """
4
+
5
+ import re
6
+ from typing import List, Optional
7
+
8
+ from app.core.logger import get_logger
9
+
10
+ logger = get_logger(__name__)
11
+
12
+
13
def clean_text(text: str) -> str:
    """
    Normalize whitespace in a piece of text.

    Collapses every run of whitespace (spaces, tabs, newlines) into a
    single space and trims leading/trailing whitespace.

    Args:
        text: Text to normalize; falsy input yields an empty string.

    Returns:
        The whitespace-normalized text.
    """
    if not text:
        return ''
    # Collapse whitespace runs to single spaces, then trim the ends.
    return re.sub(r'\s+', ' ', text).strip()
32
+
33
+
34
def split_text_into_chunks(
    text: str,
    min_chunk_size: int = 200,
    max_chunk_size: int = 1000,
    overlap: int = 150
) -> List[str]:
    """
    Semantics-aware text chunking: split on paragraph and sentence boundaries.

    The text is first split into paragraphs (blank-line separated), each
    paragraph into sentences (on `.!?` runs followed by whitespace or end),
    and the sentences are then packed greedily into chunks of at most
    `max_chunk_size` characters, carrying up to `overlap` characters of
    trailing sentences over into the next chunk. Undersized chunks are
    merged into their predecessor.

    Args:
        text: Text to split.
        min_chunk_size: Minimum chunk size in characters; smaller chunks
            are merged into the previous one when possible.
        max_chunk_size: Maximum chunk size in characters (soft limit —
            a single oversized sentence is not broken up).
        overlap: Approximate number of trailing characters repeated at the
            start of the next chunk for context continuity.

    Returns:
        List of chunk strings (sentences joined with newlines). Empty list
        for empty/whitespace-only input; `[text]` as a fallback when no
        chunk survives filtering.
    """
    if not text or len(text.strip()) == 0:
        return []

    # Step 1: split into paragraphs on blank lines.
    paragraphs = re.split(r'\n\s*\n', text.strip())
    paragraphs = [p.strip() for p in paragraphs if p.strip()]

    if not paragraphs:
        return []

    # Step 2: split each paragraph into sentences. The capturing group keeps
    # the punctuation runs as separate parts in re.split's output.
    sentence_pattern = r'([.!?]+)(?=\s+|$)'

    all_sentences: List[str] = []
    for para in paragraphs:
        parts = re.split(sentence_pattern, para)
        combined_sentences: List[str] = []
        current_sentence = ""

        for part in parts:
            if not part.strip():
                continue
            if re.match(r'^[.!?]+$', part):
                # Punctuation run: attach it to the current sentence and
                # finalize that sentence.
                current_sentence += part
                if current_sentence.strip():
                    combined_sentences.append(current_sentence.strip())
                current_sentence = ""
            else:
                # Plain text: accumulate into the current sentence.
                current_sentence += part

        # Flush any trailing sentence without closing punctuation.
        if current_sentence.strip():
            combined_sentences.append(current_sentence.strip())

        # Paragraph produced no sentences: keep it whole as one "sentence".
        if not combined_sentences and para.strip():
            combined_sentences.append(para.strip())

        all_sentences.extend(combined_sentences)

    if not all_sentences:
        return [text] if text.strip() else []

    # Step 3: greedily pack sentences into chunks, respecting max_chunk_size.
    chunks: List[str] = []
    current_chunk: List[str] = []
    current_size = 0  # running length incl. one joiner char per sentence

    for sentence in all_sentences:
        sentence_size = len(sentence)

        # Adding this sentence would exceed the max size: close the chunk.
        if current_size + sentence_size > max_chunk_size and current_chunk:
            # Save the current chunk (sentences newline-joined).
            chunk_text = '\n'.join(current_chunk)
            if len(chunk_text.strip()) >= min_chunk_size:
                chunks.append(chunk_text)
            else:
                # Below the minimum size: merge into the previous chunk.
                if chunks:
                    chunks[-1] = chunks[-1] + '\n' + chunk_text
                else:
                    chunks.append(chunk_text)

            # Keep trailing sentences (up to `overlap` chars) for continuity.
            overlap_sentences: List[str] = []
            overlap_size = 0
            for s in reversed(current_chunk):
                if overlap_size + len(s) <= overlap:
                    overlap_sentences.insert(0, s)
                    overlap_size += len(s) + 1  # +1 for the joiner
                else:
                    break

            current_chunk = overlap_sentences + [sentence]
            current_size = overlap_size + sentence_size
        else:
            # Room left: append the sentence to the current chunk.
            current_chunk.append(sentence)
            current_size += sentence_size + 1  # +1 for the joiner

    # Flush the final chunk, merging it backwards if undersized.
    if current_chunk:
        chunk_text = '\n'.join(current_chunk)
        if chunks and len(chunk_text.strip()) < min_chunk_size:
            chunks[-1] = chunks[-1] + '\n' + chunk_text
        else:
            chunks.append(chunk_text)

    # Final pass: drop empty chunks, merge undersized ones into their
    # predecessor (or keep them if they are the only chunk).
    final_chunks: List[str] = []
    for chunk in chunks:
        chunk = chunk.strip()
        if chunk and len(chunk) >= min_chunk_size:
            final_chunks.append(chunk)
        elif chunk:
            if final_chunks:
                final_chunks[-1] = final_chunks[-1] + '\n' + chunk
            else:
                final_chunks.append(chunk)

    return final_chunks if final_chunks else [text] if text.strip() else []
156
+
157
+
158
def extract_chapter_number(text: str) -> Optional[int]:
    """
    Extract a chapter number from text.

    Tries a sequence of Korean, English, and CJK chapter-heading patterns
    (e.g. "제1장", "Chapter 1", "ch 1", "1章") against the first 500
    characters of the text, returning the first number found.

    Args:
        text: Text to scan for a chapter heading.

    Returns:
        The chapter number as an int, or None if no pattern matches.
    """
    # Candidate patterns, tried in order. All searches use re.IGNORECASE,
    # so a separate all-caps "CHAPTER" variant (present in an earlier
    # revision) was dead code and has been removed.
    patterns = [
        r'제\s*(\d+)\s*장',      # 제1장, 제 1 장
        r'제\s*(\d+)\s*화',      # 제1화
        r'Chapter\s*(\d+)',      # Chapter 1 / CHAPTER 1
        r'Ch\.\s*(\d+)',         # Ch. 1
        r'(\d+)\s*장',           # 1장
        r'(\d+)\s*화',           # 1화
        r'chap\.\s*(\d+)',       # chap. 1
        r'ch\s*(\d+)',           # ch 1
        r'(\d+)\s*章',           # 1章
    ]

    # A heading appears near the top, so only inspect the first 500 chars.
    search_text = text[:500]

    for pattern in patterns:
        match = re.search(pattern, search_text, re.IGNORECASE)
        if match:
            try:
                return int(match.group(1))
            except ValueError:
                # Defensive only: \d+ captures always parse; try next pattern.
                continue

    return None
195
+
196
+
197
+