Spaces: Build error
```python
import heapq
import json
from datetime import datetime
from typing import Dict, List

import nltk


class DocumentSummarizer:
    def __init__(self):
        # Candidate NLTK data paths (covers common container layouts)
        nltk_data_paths = [
            '/usr/local/share/nltk_data',
            '/usr/share/nltk_data',
            '/usr/local/nltk_data',
            '/usr/local/lib/nltk_data',
            '/usr/lib/nltk_data',
            '/root/nltk_data',
            '/home/user/nltk_data',
            '/app/nltk_data'
        ]
        # Prepend all candidate paths, preserving order and dropping duplicates
        nltk.data.path = list(dict.fromkeys(nltk_data_paths + nltk.data.path))

        # Download NLTK data if not already available
        try:
            nltk.download('punkt')
            nltk.download('stopwords')
            nltk.download('wordnet')
            nltk.download('averaged_perceptron_tagger')
        except Exception as e:
            print(f"Warning: NLTK data download failed: {str(e)}")

        # Chunk size for text splitting (approximate, in whitespace-separated tokens)
        self.chunk_size = 1000

        try:
            self.tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
        except Exception as e:
            print(f"Warning: Failed to load punkt tokenizer: {str(e)}")
            # Fall back to the default sent_tokenize function
            self.tokenizer = nltk.tokenize.sent_tokenize

    def summarize_text(self, text: str) -> Dict:
        """Summarize a text."""
        try:
            # Split the text into chunks
            chunks = self._split_text(text)

            # Generate a summary for each chunk
            summaries = []
            for chunk in chunks:
                summary = self._summarize_chunk(chunk)
                if summary:
                    summaries.append(summary)

            return {
                "timestamp": datetime.now().isoformat(),
                "full_summary": " ".join(summaries),
                "chunk_summaries": summaries
            }
        except Exception as e:
            raise Exception(f"Error while generating summary: {str(e)}")

    def _summarize_chunk(self, text: str) -> str:
        """Summarize an individual text chunk."""
        try:
            # Preprocess the text
            words = nltk.word_tokenize(text.lower())
            sentences = nltk.sent_tokenize(text)

            # Remove stopwords and non-alphanumeric tokens
            stop_words = set(nltk.corpus.stopwords.words('english'))
            words = [word for word in words if word.isalnum() and word not in stop_words]

            # Compute word frequencies
            word_frequencies = {}
            for word in words:
                word_frequencies[word] = word_frequencies.get(word, 0) + 1

            # Guard against chunks where every token was filtered out
            if not word_frequencies:
                return ""

            # Normalize frequencies by the maximum frequency
            max_frequency = max(word_frequencies.values())
            for word in word_frequencies:
                word_frequencies[word] = word_frequencies[word] / max_frequency

            # Score each sentence by the normalized frequencies of the words it contains
            sentence_scores = {}
            for sentence in sentences:
                for word, freq in word_frequencies.items():
                    if word in sentence.lower():
                        sentence_scores[sentence] = sentence_scores.get(sentence, 0) + freq

            # Select the top 30% of sentences (at least one)
            summary_sentences = heapq.nlargest(
                max(1, int(len(sentences) * 0.3)),
                sentence_scores,
                key=sentence_scores.get
            )

            # Build the summary
            return " ".join(summary_sentences)
        except Exception as e:
            print(f"Chunk summarization error: {str(e)}")
            return ""

    def _split_text(self, text: str) -> List[str]:
        """Split the text into appropriately sized chunks."""
        try:
            # Use the configured tokenizer (either a punkt instance or sent_tokenize)
            if hasattr(self, 'tokenizer') and callable(self.tokenizer):
                if self.tokenizer == nltk.tokenize.sent_tokenize:
                    sentences = nltk.tokenize.sent_tokenize(text)
                else:
                    # The tokenizer is a PunktSentenceTokenizer instance
                    sentences = self.tokenizer.tokenize(text)
            else:
                # Fall back to the default sentence tokenizer
                nltk.download('punkt')
                sentences = nltk.tokenize.sent_tokenize(text)

            chunks = []
            current_chunk = ""
            for sentence in sentences:
                if len(current_chunk.split()) + len(sentence.split()) <= self.chunk_size:
                    current_chunk = f"{current_chunk} {sentence}".strip()
                else:
                    if current_chunk:  # Only add non-empty chunks
                        chunks.append(current_chunk)
                    current_chunk = sentence

            # Add the last chunk if it is not empty
            if current_chunk:
                chunks.append(current_chunk.strip())

            return chunks if chunks else [text]  # Return at least one chunk
        except LookupError as e:
            # If the punkt data is missing, try to download it and retry
            print(f"NLTK data missing, attempting to download: {e}")
            nltk.download('punkt')
            return self._split_text(text)
        except Exception as e:
            print(f"Error in _split_text: {str(e)}")
            # If all else fails, return the original text as a single chunk
            return [text]


# Create a singleton instance
document_summarizer = DocumentSummarizer()
```
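
For reference, a minimal usage sketch of the singleton above. The module name `summarizer.py` and the sample text are assumptions for illustration only; the returned keys (`timestamp`, `full_summary`, `chunk_summaries`) match the dictionary built in `summarize_text`.

```python
# Minimal usage sketch; assumes the class above is saved as summarizer.py
# (the module name and the sample text are hypothetical).
from summarizer import document_summarizer

sample_text = (
    "NLTK is a platform for building Python programs that work with human language data. "
    "It provides interfaces to corpora and lexical resources, along with libraries for "
    "tokenization, tagging, and parsing. "
    "This short passage simply demonstrates the summarizer's output structure."
)

result = document_summarizer.summarize_text(sample_text)
print(result["timestamp"])        # ISO-format timestamp of the summary
print(result["full_summary"])     # concatenation of all chunk summaries
print(result["chunk_summaries"])  # list with one summary per chunk
```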