import spacy
from transformers import pipeline
import PyPDF2
import io
import re
import nltk
import hashlib
from datetime import datetime
from collections import defaultdict
from typing import Dict, List, Optional, Tuple
from fastapi import HTTPException

from .gemini_enrichment import enrich_legal_analysis
from .models import (
    LegalEntities,
    LegalCitation,
    LegalClause,
    LegalAnalysis
)
from .config import logger, MODEL_CONFIGS, PATTERNS

# Download required NLTK data
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')


class LegalDocumentProcessor:
    def __init__(self):
        self.nlp = spacy.load(MODEL_CONFIGS['spacy_model'])
        self.qa_model = pipeline("question-answering", model=MODEL_CONFIGS['qa_model'])
        self.summarizer = pipeline("summarization", model=MODEL_CONFIGS['summarizer'])

        # Legal-specific patterns
        self.citation_pattern = PATTERNS['citation']
        self.monetary_pattern = PATTERNS['monetary']
        self.date_pattern = PATTERNS['date']

        # Common legal terms and definitions
        self.legal_terms = self._load_legal_terms()

    @staticmethod
    def _load_legal_terms() -> Dict[str, str]:
        """Load common legal terms and their definitions"""
        return {
            "force majeure": "Unforeseeable circumstances that prevent someone from fulfilling a contract",
            "consideration": "Something of value given by both parties to a contract",
            "jurisdiction": "The official power to make legal decisions and judgments",
            "waiver": "Voluntary relinquishment of a known right",
            "indemnification": "Security or protection against a loss or other financial burden",
            "severability": "Contract provision that allows the contract to remain valid even if some parts are unenforceable",
            "precedent": "A previous court decision that guides future decisions on similar issues",
            "res judicata": "A matter that has been adjudicated by a competent court and may not be pursued further",
            "locus standi": "The right or capacity to bring an action or to appear in a court"
        }

    async def extract_text_from_pdf(self, content: bytes) -> str:
        """Extract text from PDF content"""
        try:
            pdf_reader = PyPDF2.PdfReader(io.BytesIO(content))
            text = ""
            for page in pdf_reader.pages:
                text += page.extract_text()
            return text.strip()
        except Exception as e:
            logger.error(f"PDF extraction error: {str(e)}")
            raise HTTPException(status_code=500, detail="Error processing PDF file")

    def _determine_document_type(self, text: str) -> str:
        """Determine the type of legal document"""
        text_lower = text.lower()
        document_types = {
            "contract": ["agreement", "contract", "terms and conditions"],
            "court_filing": ["motion", "petition", "complaint", "brief"],
            "legislation": ["act", "statute", "bill", "regulation"],
            "opinion": ["opinion", "decision", "order", "judgment"],
        }
        for doc_type, keywords in document_types.items():
            if any(keyword in text_lower for keyword in keywords):
                return doc_type
        return "other"

    def extract_legal_entities(self, doc) -> LegalEntities:
        """Extract legal entities from the document"""
        entities = defaultdict(set)
        for ent in doc.ents:
            if ent.label_ == "PERSON":
                context = doc[max(0, ent.start - 5):min(len(doc), ent.end + 5)].text.lower()
                if any(term in context for term in ["judge", "justice", "honor"]):
                    entities["judges"].add(ent.text)
                elif any(term in context for term in ["attorney", "counsel", "esq"]):
                    entities["lawyers"].add(ent.text)
                else:
                    entities["parties"].add(ent.text)
            elif ent.label_ == "ORG":
                if any(term in ent.text.lower() for term in ["court", "tribunal"]):
                    entities["courts"].add(ent.text)
                else:
entities["organizations"].add(ent.text) return LegalEntities( parties=list(entities["parties"]), judges=list(entities["judges"]), lawyers=list(entities["lawyers"]), courts=list(entities["courts"]), organizations=list(entities["organizations"]) ) def extract_citations(self, text: str) -> List[LegalCitation]: """Extract legal citations from text""" citations = [] matches = re.finditer(self.citation_pattern, text) for match in matches: citation_text = match.group() year_match = re.search(r'\d{4}', citation_text) year = int(year_match.group()) if year_match else None citations.append(LegalCitation( citation_text=citation_text, year=year, source=self._determine_citation_source(citation_text), page=None )) return citations def _extract_definitions(self, doc) -> Dict[str, str]: """Extract defined terms and their definitions""" definitions = {} # Pattern for common definition structures definition_patterns = [ r'(?i)"([^"]+)"\s+means\s+([^\.]+)', r'(?i)"([^"]+)"\s+shall\s+mean\s+([^\.]+)', r'(?i)term\s+"([^"]+)"\s+is\s+defined\s+as\s+([^\.]+)', r'(?i)"([^"]+)"\s+refers\s+to\s+([^\.]+)' ] text = doc.text for pattern in definition_patterns: matches = re.finditer(pattern, text) for match in matches: term, definition = match.groups() definitions[term.strip()] = definition.strip() return definitions def _determine_citation_source(self, citation: str) -> Optional[str]: """Determine the source of a legal citation""" if "AIR" in citation: return "All India Reporter" elif "SCC" in citation: return "Supreme Court Cases" elif "SC" in citation: return "Supreme Court" elif "HC" in citation: return "High Court" elif "ILR" in citation: return "Indian Law Reports" return None def extract_clauses(self, doc) -> List[LegalClause]: """Extract and classify legal clauses""" clauses = [] clause_patterns = { "indemnification": r"(?i)indemnif[iy]|hold\s+harmless", "termination": r"(?i)terminat(e|ion)|cancel(lation)?", "confidentiality": r"(?i)confidential|non-disclosure", "warranty": r"(?i)warrant(y|ies)|guarantee", "governing_law": r"(?i)govern(ing)?\s+law|jurisdiction", "force_majeure": r"(?i)force\s+majeure|acts?\s+of\s+god", "assignment": r"(?i)assign(ment)?|transfer\s+of\s+rights", "severability": r"(?i)sever(ability)?|invalid|unenforceable" } for sent in doc.sents: for clause_type, pattern in clause_patterns.items(): if re.search(pattern, sent.text): clauses.append(LegalClause( clause_type=clause_type, text=sent.text, importance=self._calculate_clause_importance(sent.text), section="Unknown" )) return clauses def _calculate_clause_importance(self, text: str) -> float: """Calculate importance score for a clause""" importance_terms = { "shall": 0.3, "must": 0.3, "will": 0.2, "agree": 0.2, "terminate": 0.4, "indemnify": 0.4, "warrant": 0.3, "material": 0.4, "breach": 0.4, "liable": 0.3 } score = sum(importance_terms.get(word.lower(), 0) for word in text.split()) return min(1.0, score) def extract_deadlines(self, text: str) -> List[Dict[str, str]]: """Extract deadlines and time-sensitive information""" deadlines = [] deadline_patterns = [ r"(?i)within\s+(\d+)\s+(day|month|year)s?", r"(?i)no\s+later\s+than\s+([^\.]+)", r"(?i)deadline\s+[^\.]+", r"(?i)due\s+(?:date|by)\s+([^\.]+)" ] for pattern in deadline_patterns: matches = re.finditer(pattern, text) for match in matches: deadline_text = match.group() date_match = re.search(self.date_pattern, deadline_text) deadlines.append({ "text": deadline_text, "date": date_match.group() if date_match else None, "context": text[max(0, match.start()-50):min(len(text), 
                                                                   match.end() + 50)]
                })
        return deadlines

    def extract_monetary_values(self, text: str) -> List[Dict[str, str]]:
        """Extract monetary values and related context"""
        monetary_values = []
        matches = re.finditer(self.monetary_pattern, text)
        for match in matches:
            monetary_values.append({
                "value": match.group(),
                "context": text[max(0, match.start() - 50):min(len(text), match.end() + 50)]
            })
        return monetary_values

    def extract_obligations(self, doc) -> List[Dict[str, str]]:
        """Extract legal obligations"""
        obligations = []
        obligation_patterns = [
            r"(?i)shall\s+[^\.]+",
            r"(?i)must\s+[^\.]+",
            r"(?i)agrees?\s+to\s+[^\.]+",
            r"(?i)required\s+to\s+[^\.]+",
            r"(?i)obligations?\s+[^\.]+",
            r"(?i)duties?\s+[^\.]+",
            r"(?i)responsible\s+for\s+[^\.]+",
        ]
        for sent in doc.sents:
            for pattern in obligation_patterns:
                if re.search(pattern, sent.text):
                    obligations.append({
                        "text": sent.text,
                        "type": "mandatory" if any(word in sent.text.lower() for word in ["shall", "must"]) else "contractual"
                    })
        return obligations

    def _extract_governing_law(self, text: str) -> Optional[str]:
        """Extract the governing law from the text"""
        governing_law_patterns = [
            r"(?i)governed\s+by\s+the\s+laws\s+of\s+([^\.]+)",
            r"(?i)subject\s+to\s+the\s+jurisdiction\s+of\s+([^\.]+)",
            r"(?i)Indian\s+law",
            r"(?i)laws\s+of\s+India"
        ]
        for pattern in governing_law_patterns:
            match = re.search(pattern, text)
            if match:
                # The last two patterns have no capture group, so fall back to the full match
                return match.group(1) if match.groups() else match.group(0)
        return None

    def _extract_jurisdiction(self, text: str) -> Optional[str]:
        """Extract the jurisdiction from the text"""
        jurisdiction_patterns = [
            r"(?i)courts\s+of\s+([^\.]+)",
            r"(?i)jurisdiction\s+of\s+([^\.]+)",
            r"(?i)subject\s+to\s+the\s+exclusive\s+jurisdiction\s+of\s+([^\.]+)"
        ]
        for pattern in jurisdiction_patterns:
            match = re.search(pattern, text)
            if match:
                return match.group(1)
        return None

    def _extract_risk_factors(self, doc) -> List[str]:
        """Extract potential risk factors from the document"""
        risk_factors = []
        for ent in doc.ents:
            if ent.label_ == "PERSON" and any(term in ent.text.lower() for term in ["judge", "justice", "honor"]):
                risk_factors.append(f"Potential bias from {ent.text}")
            elif ent.label_ == "ORG" and any(term in ent.text.lower() for term in ["court", "tribunal"]):
                risk_factors.append(f"Jurisdiction of {ent.text}")
            elif ent.label_ == "GPE" and ent.text.lower() in ["india", "delhi", "mumbai", "chennai", "kolkata"]:
                risk_factors.append(f"Compliance with {ent.text} laws and regulations")
            elif ent.label_ == "MONEY":
                risk_factors.append(f"Financial obligation of {ent.text}")
        return risk_factors

    async def analyze_document(self, content: bytes, file_type: str) -> LegalAnalysis:
        """Perform comprehensive legal document analysis"""
        start_time = datetime.now()

        # Extract text based on file type
        if file_type == 'pdf':
            text = await self.extract_text_from_pdf(content)
        else:
            text = content.decode().strip()

        # Generate document ID
        doc_id = hashlib.md5(text.encode()).hexdigest()

        # Process with spaCy
        doc = self.nlp(text)

        # Generate summary (the summarizer only sees the first 1024 characters)
        summary = self.summarizer(text[:1024], max_length=150, min_length=50, do_sample=False)[0]['summary_text']

        # Extract deadlines
        deadlines = self.extract_deadlines(text)

        # Convert deadline dates to the correct format; dates not already in
        # YYYY-MM-DD are dropped
        for deadline in deadlines:
            if deadline['date']:
                try:
                    deadline_dt = datetime.strptime(deadline['date'], '%Y-%m-%d')
                    deadline['date'] = deadline_dt.strftime('%Y-%m-%d')
                except (ValueError, TypeError):
                    deadline['date'] = None

        word_count = len(text.split())

        analysis = LegalAnalysis(
            doc_id=doc_id,
            document_type=self._determine_document_type(text),
            entities=self.extract_legal_entities(doc),
            key_clauses=self.extract_clauses(doc),
            citations=self.extract_citations(text),
            legal_definitions=self._extract_definitions(doc),
            obligations=self.extract_obligations(doc),
            deadlines=[
                {"text": d["text"], "date": d["date"], "context": d["context"]}
                for d in deadlines if d["date"] is not None
            ],
            jurisdiction=self._extract_jurisdiction(text),
            governing_law=self._extract_governing_law(text),
            risk_factors=self._extract_risk_factors(doc),
            monetary_values=self.extract_monetary_values(text),
            summary=summary,
            metadata={
                "file_type": file_type,
                "language": doc.lang_,
                "word_count": str(word_count),
                "created_at": datetime.now().isoformat()
            },
            word_count=word_count,
            created_at=datetime.now().isoformat(),
            processing_time=(datetime.now() - start_time).total_seconds()
        )

        # try:
        #     analysis = enrich_legal_analysis(analysis)
        # except Exception as e:
        #     logger.error(f"Gemini enrichment failed: {str(e)}")
        #     pass

        return analysis
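
# Usage sketch (illustrative only, not part of the service wiring): a minimal
# example of driving the processor outside FastAPI. The file name
# "sample_contract.pdf" and the asyncio entry point below are hypothetical.
#
# if __name__ == "__main__":
#     import asyncio
#
#     async def _demo() -> None:
#         processor = LegalDocumentProcessor()
#         with open("sample_contract.pdf", "rb") as f:
#             content = f.read()
#         analysis = await processor.analyze_document(content, file_type="pdf")
#         logger.info("Document type: %s", analysis.document_type)
#         logger.info("Summary: %s", analysis.summary)
#
#     asyncio.run(_demo())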