Spaces:
Sleeping
Sleeping
| """ | |
| Document Processing for Case Analysis | |
| Supports PDF, TXT, DOCX uploads | |
| """ | |
| import os | |
| import tempfile | |
| from typing import Dict, List, Optional | |
| import PyPDF2 | |
| import docx | |
class DocumentProcessor:
    """Turn an uploaded case document into plain text plus tagged excerpts.

    PDF, TXT and DOCX files are read into a single string, which is then
    scanned sentence by sentence for homeopathic keywords (symptoms,
    modalities, emotional state, timing, generalities).
    """

    # Caps applied to the extraction output.
    _DISPLAY_CHAR_LIMIT = 5000
    _MAX_SENTENCES_PER_SECTION = 5
    _MAX_SENTENCE_CHARS = 200
    # Fragments of this length or shorter are treated as noise.
    _MIN_SENTENCE_CHARS = 10

    # Every section that appears in the extraction result, in output order.
    # "physical_symptoms" has no keywords, so it is always returned empty.
    _SECTION_NAMES = (
        "symptoms",
        "modalities",
        "emotional_state",
        "physical_symptoms",
        "timing",
        "generalities",
    )

    # Lower-case substrings that assign a sentence to a section.
    _KEYWORD_PATTERNS = {
        "symptoms": ["symptom", "complaint", "pain", "ache", "discomfort"],
        "modalities": ["worse", "better", "aggravated", "ameliorated", "relieved"],
        "emotional_state": ["anxious", "fearful", "irritable", "sad", "depressed", "angry"],
        "timing": ["morning", "evening", "night", "afternoon", "periodic"],
        "generalities": ["thirst", "hunger", "cold", "hot", "sweat"],
    }

    # The modality keywords split by direction. Together they must cover the
    # whole "modalities" keyword list, otherwise extracted sentences would be
    # silently left out of the summary and the analysis payload.
    _AGGRAVATION_TERMS = ("worse", "aggravated")
    _AMELIORATION_TERMS = ("better", "ameliorated", "relieved")

    def __init__(self):
        self.supported_extensions = ['.pdf', '.txt', '.docx', '.doc']

    def process_uploaded_file(self, file_path: str, file_type: Optional[str] = None) -> Dict:
        """Process an uploaded document and extract its text.

        Args:
            file_path: Path of the uploaded file on disk.
            file_type: Optional extension override. Case and the leading
                dot are optional ("pdf", ".PDF" and ".pdf" are equivalent).
                When omitted, the extension of ``file_path`` is used.

        Returns:
            On success::

                {
                    "success": True,
                    "filename": str,
                    "text": str,            # first 5000 characters
                    "full_text": str,
                    "word_count": int,
                    "extracted_sections": Dict,
                    "summary": str,
                }

            On failure ``{"success": False, "error": str}``. Errors raised
            while reading or analysing the file are reported this way
            rather than propagated.
        """
        # isfile (not exists) so a directory path is rejected up front.
        if not file_path or not os.path.isfile(file_path):
            return {"success": False, "error": "File not found"}

        try:
            file_type = self._normalize_file_type(file_path, file_type)

            # Extract text based on file type
            if file_type == '.pdf':
                text = self._extract_from_pdf(file_path)
            elif file_type == '.txt':
                # utf-8-sig drops a leading BOM; errors="replace" keeps one
                # bad byte from failing the whole upload.
                with open(file_path, 'r', encoding='utf-8-sig', errors='replace') as f:
                    text = f.read()
            elif file_type in ['.docx', '.doc']:
                # NOTE(review): python-docx targets .docx only; a legacy
                # binary .doc is expected to raise here and come back as an
                # error result -- confirm whether ".doc" should stay listed.
                text = self._extract_from_docx(file_path)
            else:
                return {"success": False, "error": f"Unsupported file type: {file_type}"}

            # Analyze text for homeopathic keywords
            extracted = self._extract_homeopathic_info(text)
            return {
                "success": True,
                "filename": os.path.basename(file_path),
                "text": text[:self._DISPLAY_CHAR_LIMIT],  # Limit for display
                "full_text": text,
                "word_count": len(text.split()),
                "extracted_sections": extracted,
                "summary": self._generate_summary(extracted),
            }
        except Exception as e:
            # Boundary handler: callers get an error dict, never an exception.
            return {"success": False, "error": str(e)}

    @staticmethod
    def _normalize_file_type(file_path: str, file_type: Optional[str]) -> str:
        """Return the file type as a lower-case extension with a leading dot."""
        if not file_type:
            _, file_type = os.path.splitext(file_path)
        file_type = file_type.strip().lower()
        if file_type and not file_type.startswith('.'):
            file_type = '.' + file_type
        return file_type

    @staticmethod
    def _filter_by_terms(sentences: List[str], terms) -> List[str]:
        """Return the sentences containing any of ``terms`` (case-insensitive)."""
        return [s for s in sentences if any(term in s.lower() for term in terms)]

    def _extract_from_pdf(self, file_path: str) -> str:
        """Extract text from PDF, one page per line block."""
        with open(file_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            # "or ''" guards against a page yielding None; the newline keeps
            # the last word of one page from fusing with the first word of
            # the next.
            return "\n".join(page.extract_text() or "" for page in pdf_reader.pages)

    def _extract_from_docx(self, file_path: str) -> str:
        """Extract text from DOCX, one paragraph per line."""
        doc = docx.Document(file_path)
        return "".join(f"{para.text}\n" for para in doc.paragraphs)

    def _extract_homeopathic_info(self, text: str) -> Dict:
        """Extract homeopathic information from text.

        The text is split on '.'; each fragment longer than 10 characters
        is added (truncated to 200 characters) to every section whose
        keywords it contains. Matching is by case-insensitive substring.
        Each section keeps at most its first 5 matches.

        Returns:
            Dict mapping each section name to a list of sentence excerpts.
        """
        sections: Dict[str, List[str]] = {name: [] for name in self._SECTION_NAMES}
        if not text:
            return sections

        for sentence in text.split('.'):
            clean_sentence = sentence.strip()
            if len(clean_sentence) <= self._MIN_SENTENCE_CHARS:
                continue
            sentence_lower = clean_sentence.lower()
            for category, keywords in self._KEYWORD_PATTERNS.items():
                bucket = sections[category]
                if len(bucket) >= self._MAX_SENTENCES_PER_SECTION:
                    continue  # section already full; later matches are dropped
                if any(keyword in sentence_lower for keyword in keywords):
                    bucket.append(clean_sentence[:self._MAX_SENTENCE_CHARS])
        return sections

    def _generate_summary(self, extracted: Dict) -> str:
        """Generate a one-line summary from extracted information.

        Args:
            extracted: Section dict as returned by
                ``_extract_homeopathic_info``.

        Returns:
            Semicolon-separated counts, or "No clear patterns identified"
            when symptoms, modalities and emotional state are all empty.
        """
        summary_parts = []
        if extracted["symptoms"]:
            summary_parts.append(f"Chief complaints: {len(extracted['symptoms'])} identified")
        if extracted["modalities"]:
            worse_count = len(self._filter_by_terms(extracted["modalities"], self._AGGRAVATION_TERMS))
            better_count = len(self._filter_by_terms(extracted["modalities"], self._AMELIORATION_TERMS))
            summary_parts.append(f"Modalities: {worse_count} aggravations, {better_count} ameliorations")
        if extracted["emotional_state"]:
            summary_parts.append(f"Emotional patterns: {len(extracted['emotional_state'])} noted")
        return "; ".join(summary_parts) if summary_parts else "No clear patterns identified"

    def extract_for_analysis(self, text: str) -> Dict:
        """Extract structured data for analysis.

        Args:
            text: Free-form case text.

        Returns:
            Dict of strings with the keys chief_complaint, location,
            sensation, aggravations, ameliorations, timing,
            emotional_state, generalities and source. Each populated field
            joins at most three excerpts; location and sensation are
            always empty.
        """
        extracted = self._extract_homeopathic_info(text)
        modalities = extracted["modalities"]
        # Convert to analysis format
        return {
            "chief_complaint": " ".join(extracted["symptoms"][:3]),
            "location": "",
            "sensation": "",
            "aggravations": "; ".join(self._filter_by_terms(modalities, self._AGGRAVATION_TERMS)[:3]),
            "ameliorations": "; ".join(self._filter_by_terms(modalities, self._AMELIORATION_TERMS)[:3]),
            "timing": "; ".join(extracted["timing"][:3]),
            "emotional_state": "; ".join(extracted["emotional_state"][:3]),
            "generalities": "; ".join(extracted["generalities"][:3]),
            "source": "document_upload",
        }


# Global instance: created once at import time for importers to share.
doc_processor = DocumentProcessor()