Improve text splitting and error handling in summarizer
6bd8ed8
import nltk
from typing import Dict, List
import json
from datetime import datetime
import heapq
class DocumentSummarizer:
    def __init__(self):
        # Candidate NLTK data directories, searched in order
        nltk_data_paths = [
            '/usr/local/share/nltk_data',
            '/usr/share/nltk_data',
            '/usr/local/nltk_data',
            '/usr/local/lib/nltk_data',
            '/usr/lib/nltk_data',
            '/root/nltk_data',
            '/home/user/nltk_data',
            '/app/nltk_data'
        ]
        # Prepend the candidate paths, de-duplicating while preserving order
        nltk.data.path = list(dict.fromkeys(nltk_data_paths + nltk.data.path))
        # Download the required NLTK resources if they are not already available
        try:
            nltk.download('punkt', quiet=True)
            nltk.download('stopwords', quiet=True)
            nltk.download('wordnet', quiet=True)
            nltk.download('averaged_perceptron_tagger', quiet=True)
        except Exception as e:
            print(f"Warning: NLTK data download failed: {str(e)}")
        # Maximum chunk size for text splitting (counted in whitespace tokens)
        self.chunk_size = 1000
        try:
            self.tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
        except Exception as e:
            print(f"Warning: Failed to load punkt tokenizer: {str(e)}")
            # Fall back to the default sent_tokenize function
            self.tokenizer = nltk.tokenize.sent_tokenize
    def summarize_text(self, text: str) -> Dict:
        """Summarize the given text."""
        try:
            # Split the text into chunks
            chunks = self._split_text(text)
            # Summarize each chunk individually
            summaries = []
            for chunk in chunks:
                summary = self._summarize_chunk(chunk)
                if summary:
                    summaries.append(summary)
            return {
                "timestamp": datetime.now().isoformat(),
                "full_summary": " ".join(summaries),
                "chunk_summaries": summaries
            }
        except Exception as e:
            raise Exception(f"Error while generating summary: {str(e)}") from e
    def _summarize_chunk(self, text: str) -> str:
        """Summarize an individual text chunk using frequency-based sentence scoring."""
        try:
            # Preprocess: tokenize into lowercase words and into sentences
            words = nltk.word_tokenize(text.lower())
            sentences = nltk.sent_tokenize(text)
            # Remove stopwords and non-alphanumeric tokens
            stop_words = set(nltk.corpus.stopwords.words('english'))
            words = [word for word in words if word.isalnum() and word not in stop_words]
            if not words or not sentences:
                return ""
            # Count word frequencies
            word_frequencies = {}
            for word in words:
                word_frequencies[word] = word_frequencies.get(word, 0) + 1
            # Normalize frequencies by the maximum frequency
            max_frequency = max(word_frequencies.values())
            for word in word_frequencies:
                word_frequencies[word] = word_frequencies[word] / max_frequency
            # Score each sentence by the normalized frequencies of the words it contains
            sentence_scores = {}
            for sentence in sentences:
                for word, freq in word_frequencies.items():
                    if word in sentence.lower():
                        sentence_scores[sentence] = sentence_scores.get(sentence, 0) + freq
            # Select the top 30% of sentences (at least one)
            summary_sentences = heapq.nlargest(
                max(1, int(len(sentences) * 0.3)),
                sentence_scores,
                key=sentence_scores.get
            )
            # Restore the original sentence order so the summary reads naturally
            summary_sentences.sort(key=sentences.index)
            return " ".join(summary_sentences)
        except Exception as e:
            print(f"Chunk summarization error: {str(e)}")
            return ""
    def _split_text(self, text: str) -> List[str]:
        """Split the text into chunks of at most self.chunk_size tokens."""
        try:
            # Use the configured tokenizer (either a punkt tokenizer instance or sent_tokenize)
            if hasattr(self, 'tokenizer') and hasattr(self.tokenizer, 'tokenize'):
                # PunktSentenceTokenizer instance loaded in __init__
                sentences = self.tokenizer.tokenize(text)
            elif hasattr(self, 'tokenizer') and callable(self.tokenizer):
                # Fallback sent_tokenize function
                sentences = self.tokenizer(text)
            else:
                # No usable tokenizer; download punkt and use the default sentence tokenizer
                nltk.download('punkt')
                sentences = nltk.tokenize.sent_tokenize(text)
            chunks = []
            current_chunk = ""
            for sentence in sentences:
                if len(current_chunk.split()) + len(sentence.split()) <= self.chunk_size:
                    current_chunk = f"{current_chunk} {sentence}".strip()
                else:
                    if current_chunk:  # Only add non-empty chunks
                        chunks.append(current_chunk)
                    current_chunk = sentence
            # Add the last chunk if it is not empty
            if current_chunk:
                chunks.append(current_chunk.strip())
            return chunks if chunks else [text]  # Always return at least one chunk
        except LookupError as e:
            # If punkt data is missing, try to download it and retry
            print(f"NLTK data missing, attempting to download: {e}")
            nltk.download('punkt')
            return self._split_text(text)
        except Exception as e:
            print(f"Error in _split_text: {str(e)}")
            # If all else fails, return the original text as a single chunk
            return [text]
# Create a module-level singleton instance
document_summarizer = DocumentSummarizer()
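# A minimal usage sketch (not part of the original module): it assumes the NLTK
# resources above were downloaded successfully and simply prints the dictionary
# returned by summarize_text for a small sample text.
if __name__ == "__main__":
    sample_text = (
        "Natural language processing enables computers to analyze human language. "
        "Extractive summarization scores sentences by word frequency and keeps the highest-scoring ones. "
        "This module splits long documents into chunks, summarizes each chunk, and joins the results."
    )
    result = document_summarizer.summarize_text(sample_text)
    print(result["full_summary"])
    print(f"Chunks summarized: {len(result['chunk_summaries'])}")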