Improve text splitting and error handling in summarizer
6bd8ed8
import nltk
from typing import Dict, List
import json
from datetime import datetime
import heapq
class DocumentSummarizer:
    def __init__(self):
        # Candidate NLTK data directories, searched in order
        nltk_data_paths = [
            '/usr/local/share/nltk_data',
            '/usr/share/nltk_data',
            '/usr/local/nltk_data',
            '/usr/local/lib/nltk_data',
            '/usr/lib/nltk_data',
            '/root/nltk_data',
            '/home/user/nltk_data',
            '/app/nltk_data'
        ]
        # Prepend the candidate paths, de-duplicating while preserving order
        nltk.data.path = list(dict.fromkeys(nltk_data_paths + nltk.data.path))
        # Download the required NLTK resources if they are not already available
        try:
            nltk.download('punkt', quiet=True)
            nltk.download('stopwords', quiet=True)
            nltk.download('wordnet', quiet=True)
            nltk.download('averaged_perceptron_tagger', quiet=True)
        except Exception as e:
            print(f"Warning: NLTK data download failed: {str(e)}")
        # Maximum chunk size for text splitting (counted in whitespace tokens)
        self.chunk_size = 1000
        try:
            self.tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
        except Exception as e:
            print(f"Warning: Failed to load punkt tokenizer: {str(e)}")
            # Fall back to the default sent_tokenize function
            self.tokenizer = nltk.tokenize.sent_tokenize
    def summarize_text(self, text: str) -> Dict:
        """Summarize the given text."""
        try:
            # Split the text into chunks
            chunks = self._split_text(text)
            # Summarize each chunk individually
            summaries = []
            for chunk in chunks:
                summary = self._summarize_chunk(chunk)
                if summary:
                    summaries.append(summary)
            return {
                "timestamp": datetime.now().isoformat(),
                "full_summary": " ".join(summaries),
                "chunk_summaries": summaries
            }
        except Exception as e:
            raise Exception(f"Error while generating summary: {str(e)}") from e
    def _summarize_chunk(self, text: str) -> str:
        """Summarize an individual text chunk using frequency-based sentence scoring."""
        try:
            # Preprocess: tokenize into lowercase words and into sentences
            words = nltk.word_tokenize(text.lower())
            sentences = nltk.sent_tokenize(text)
            # Remove stopwords and non-alphanumeric tokens
            stop_words = set(nltk.corpus.stopwords.words('english'))
            words = [word for word in words if word.isalnum() and word not in stop_words]
            if not words or not sentences:
                return ""
            # Count word frequencies
            word_frequencies = {}
            for word in words:
                word_frequencies[word] = word_frequencies.get(word, 0) + 1
            # Normalize frequencies by the maximum frequency
            max_frequency = max(word_frequencies.values())
            for word in word_frequencies:
                word_frequencies[word] = word_frequencies[word] / max_frequency
            # Score each sentence by the normalized frequencies of the words it contains
            sentence_scores = {}
            for sentence in sentences:
                for word, freq in word_frequencies.items():
                    if word in sentence.lower():
                        sentence_scores[sentence] = sentence_scores.get(sentence, 0) + freq
            # Select the top 30% of sentences (at least one)
            summary_sentences = heapq.nlargest(
                max(1, int(len(sentences) * 0.3)),
                sentence_scores,
                key=sentence_scores.get
            )
            # Restore the original sentence order so the summary reads naturally
            summary_sentences.sort(key=sentences.index)
            return " ".join(summary_sentences)
        except Exception as e:
            print(f"Chunk summarization error: {str(e)}")
            return ""
    def _split_text(self, text: str) -> List[str]:
        """Split the text into chunks of at most self.chunk_size tokens."""
        try:
            # Use the configured tokenizer (either a punkt tokenizer instance or sent_tokenize)
            if hasattr(self, 'tokenizer') and hasattr(self.tokenizer, 'tokenize'):
                # PunktSentenceTokenizer instance loaded in __init__
                sentences = self.tokenizer.tokenize(text)
            elif hasattr(self, 'tokenizer') and callable(self.tokenizer):
                # Fallback sent_tokenize function
                sentences = self.tokenizer(text)
            else:
                # No usable tokenizer; download punkt and use the default sentence tokenizer
                nltk.download('punkt')
                sentences = nltk.tokenize.sent_tokenize(text)
            chunks = []
            current_chunk = ""
            for sentence in sentences:
                if len(current_chunk.split()) + len(sentence.split()) <= self.chunk_size:
                    current_chunk = f"{current_chunk} {sentence}".strip()
                else:
                    if current_chunk:  # Only add non-empty chunks
                        chunks.append(current_chunk)
                    current_chunk = sentence
            # Add the last chunk if it is not empty
            if current_chunk:
                chunks.append(current_chunk.strip())
            return chunks if chunks else [text]  # Always return at least one chunk
        except LookupError as e:
            # If punkt data is missing, try to download it and retry
            print(f"NLTK data missing, attempting to download: {e}")
            nltk.download('punkt')
            return self._split_text(text)
        except Exception as e:
            print(f"Error in _split_text: {str(e)}")
            # If all else fails, return the original text as a single chunk
            return [text]
# Create a module-level singleton instance
document_summarizer = DocumentSummarizer()
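# A minimal usage sketch (not part of the original module): it assumes the NLTK
# resources above were downloaded successfully and simply prints the dictionary
# returned by summarize_text for a small sample text.
if __name__ == "__main__":
    sample_text = (
        "Natural language processing enables computers to analyze human language. "
        "Extractive summarization scores sentences by word frequency and keeps the highest-scoring ones. "
        "This module splits long documents into chunks, summarizes each chunk, and joins the results."
    )
    result = document_summarizer.summarize_text(sample_text)
    print(result["full_summary"])
    print(f"Chunks summarized: {len(result['chunk_summaries'])}")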