"""Gradio app that analyzes a contract PDF for penalties, obligations and delays.

The uploaded PDF is converted to text, keyword occurrences are counted,
monetary amounts are extracted, and a 0-100 risk score with a Low / Medium /
High label is rendered as HTML.
"""

import base64
import html
import re
from io import BytesIO
from typing import Dict, List, Tuple

import gradio as gr
import matplotlib.pyplot as plt
import numpy as np
import pdfplumber
from word2number import w2n

# NOTE(review): base64, BytesIO, plt and np are not referenced anywhere in this
# file; the imports are kept untouched in case other tooling relies on them.

# Custom CSS for styling
css = """
:root { --low-color: #28a745; --medium-color: #ffc107; --high-color: #dc3545; --inactive-color: #e9ecef; }
.risk-container { display: flex; flex-direction: column; gap: 12px; margin-bottom: 25px; }
.risk-row { display: flex; align-items: center; background: white; border-radius: 8px; padding: 15px; box-shadow: 0 2px 4px rgba(0,0,0,0.1); transition: all 0.3s ease; }
.risk-row.active { transform: scale(1.02); box-shadow: 0 4px 8px rgba(0,0,0,0.15); }
.risk-label { width: 100px; font-weight: 600; font-size: 16px; color: #495057; }
.risk-score { width: 80px; font-size: 20px; font-weight: 700; text-align: center; }
.risk-low { color: var(--low-color); }
.risk-medium { color: var(--medium-color); }
.risk-high { color: var(--high-color); }
.heatmap-container { flex-grow: 1; height: 30px; border-radius: 15px; overflow: hidden; position: relative; }
.heatmap-bar { height: 100%; border-radius: 15px; transition: width 0.5s ease; }
.risk-meter { position: absolute; right: 10px; top: 50%; transform: translateY(-50%); font-size: 12px; font-weight: 600; color: white; text-shadow: 0 1px 2px rgba(0,0,0,0.3); }
.result-section { background: white; border-radius: 8px; padding: 20px; margin-bottom: 20px; box-shadow: 0 2px 4px rgba(0,0,0,0.1); }
.result-title { font-size: 18px; font-weight: 600; margin-bottom: 15px; color: #343a40; display: flex; align-items: center; gap: 8px; }
.clause-item { margin-bottom: 8px; padding-left: 15px; position: relative; }
.clause-item:before { content: "•"; position: absolute; left: 0; color: #6c757d; }
.penalty-amount { font-family: monospace; background: #f8f9fa; padding: 2px 6px; border-radius: 4px; margin-left: 5px; }
.example-clause { background: #f8f9fa; padding: 12px; border-radius: 6px; margin-bottom: 10px; border-left: 3px solid #6c757d; }
.example-number { font-weight: 600; margin-right: 8px; color: #6c757d; }
"""

# NOTE(review): the HTML tags inside this file's string literals had been
# stripped, leaving only the visible text. The markup emitted by the helpers
# below was reconstructed from the class names declared in `css` above; confirm
# it against the intended design.

# Amount patterns whose match is parsed by stripping everything but digits/dots.
_NUMERIC_AMOUNT_PATTERNS = (
    r'\$\s*[\d,]+(?:\.\d+)?',
    r'(?:USD|usd)\s*[\d,]+(?:\.\d+)?',
    r'\d+\s*(?:percent|%)',
)
# Spelled-out amounts ("ten thousand dollars"); parsed with word2number.
_WORD_AMOUNT_PATTERN = r'(?:\b[a-z]+\s*)+dollars'

# Number of gr.HTML outputs wired to analyze_pdf in the UI below.
_OUTPUT_COUNT = 6


def _keyword_pattern(keyword: str) -> re.Pattern:
    """Compile a case-insensitive whole-word/phrase pattern for *keyword*."""
    return re.compile(r'\b' + re.escape(keyword) + r'\b', flags=re.IGNORECASE)


def extract_text_from_pdf(pdf_path: str) -> str:
    """Extract the text of every page of a PDF using pdfplumber.

    Args:
        pdf_path: Filesystem path of the PDF to read.

    Returns:
        The page texts joined by newlines. A page for which ``extract_text``
        returns nothing contributes an empty string.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # Join with a newline so the last word of one page cannot fuse with the
        # first word of the next, which would defeat the \b keyword matching.
        return "\n".join(page.extract_text() or "" for page in pdf.pages)


def count_keywords(text: str, keywords: List[str]) -> Dict[str, int]:
    """Count case-insensitive whole-word occurrences of each keyword.

    Args:
        text: Text to search.
        keywords: Words or phrases to count.

    Returns:
        Mapping of each keyword to its number of occurrences in *text*.
    """
    return {
        keyword: len(_keyword_pattern(keyword).findall(text))
        for keyword in keywords
    }


def find_penalty_values(text: str) -> List[float]:
    """Find penalty amounts in the text.

    Recognizes ``$1,000.50``, ``USD 1000``, ``10 percent`` / ``10%`` and
    spelled-out amounts ending in "dollars". Matches that cannot be converted
    to a number are skipped.

    Args:
        text: Text to scan.

    Returns:
        Parsed amounts, grouped by pattern in the order listed above.
    """
    # NOTE(review): percentages land in the same list as currency amounts, so
    # "5 percent" is later displayed as "$5.00" and feeds the average used for
    # scoring. Left unchanged here; confirm whether that is intended.
    penalties: List[float] = []

    for pattern in _NUMERIC_AMOUNT_PATTERNS:
        for match in re.finditer(pattern, text, flags=re.IGNORECASE):
            try:
                penalties.append(float(re.sub(r'[^\d.]', '', match.group())))
            except ValueError:
                # e.g. "$," or "1.2.3" - nothing usable to parse.
                continue

    for match in re.finditer(_WORD_AMOUNT_PATTERN, text, flags=re.IGNORECASE):
        # Keep only the words before the first "dollars", in any letter case.
        words = re.split(
            r'dollars', match.group(), maxsplit=1, flags=re.IGNORECASE
        )[0].strip()
        try:
            # Always use word2number here: gating on a short list of number
            # words silently dropped amounts such as "twenty dollars".
            penalties.append(w2n.word_to_num(words))
        except ValueError:
            # word_to_num raises ValueError when no number words are present.
            continue

    return penalties


def calculate_risk_score(
    penalty_count: int,
    penalty_values: List[float],
    obligation_count: int,
    delay_count: int,
) -> Tuple[float, str]:
    """Calculate a 0-100 risk score and its level label.

    Scoring:
        * 5 points per penalty keyword hit, capped at 30.
        * If any amounts were found, points by their average:
          > 1,000,000 -> 40, > 100,000 -> 25, > 10,000 -> 15, otherwise 5.
        * 2 points per obligation keyword hit, capped at 20.
        * 10 points per delay keyword hit, capped at 30.

    The total is capped at 100. Below 30 is "Low", below 70 is "Medium",
    anything else is "High".

    Returns:
        ``(score, level)``.
    """
    score = min(penalty_count * 5, 30)

    if penalty_values:
        avg_penalty = sum(penalty_values) / len(penalty_values)
        if avg_penalty > 1000000:
            score += 40
        elif avg_penalty > 100000:
            score += 25
        elif avg_penalty > 10000:
            score += 15
        else:
            score += 5

    score += min(obligation_count * 2, 20)
    score += min(delay_count * 10, 30)
    score = min(score, 100)

    if score < 30:
        return score, "Low"
    if score < 70:
        return score, "Medium"
    return score, "High"


def create_risk_display(risk_score: float, risk_level: str) -> str:
    """Create the HTML display showing all three risk levels.

    Only the row matching *risk_level* is highlighted and shows *risk_score*;
    the other two rows are dimmed and show 0.

    Args:
        risk_score: Score in percent for the active level.
        risk_level: One of "Low", "Medium" or "High".

    Returns:
        HTML fragment for the risk summary.
    """
    colors = {
        "Low": "var(--low-color)",
        "Medium": "var(--medium-color)",
        "High": "var(--high-color)",
    }

    html_parts = ["<div class='risk-container'>"]
    for level in ("Low", "Medium", "High"):
        active = level == risk_level
        score = risk_score if active else 0
        color = colors[level] if active else "var(--inactive-color)"
        opacity = "1" if active else "0.6"
        row_class = "risk-row active" if active else "risk-row"
        html_parts.append(f"""
<div class="{row_class}" style="opacity: {opacity};">
    <div class="risk-label">{level} Risk</div>
    <div class="risk-score risk-{level.lower()}">{score:.1f}%</div>
    <div class="heatmap-container" style="background: var(--inactive-color);">
        <div class="heatmap-bar" style="width: {score}%; background: {color};"></div>
        <div class="risk-meter">{score:.1f}%</div>
    </div>
</div>
""")
    html_parts.append("</div>")
    return "\n".join(html_parts)


def format_clauses(counts: Dict[str, int]) -> str:
    """Format keyword counts into HTML, one item per keyword."""
    return "\n".join(
        f"<div class='clause-item'>{html.escape(str(kw))}: {count}</div>"
        for kw, count in counts.items()
    )


def format_penalty_amounts(amounts: List[float]) -> str:
    """Format up to the first five penalty amounts into HTML."""
    if not amounts:
        return "<div class='clause-item'>No specific penalty amounts found</div>"
    return "\n".join(
        f"<div class='clause-item'>"
        f"<span class='penalty-amount'>${amt:,.2f}</span></div>"
        for amt in amounts[:5]
    )


def format_examples(sentences: List[str]) -> str:
    """Format up to the first three example sentences into HTML.

    Sentences come from the uploaded document, so they are HTML-escaped before
    being embedded in the page.
    """
    if not sentences:
        return "<div class='clause-item'>No penalty clauses found</div>"
    return "\n".join(
        f"""
<div class='example-clause'>
    <span class='example-number'>{number}.</span> {html.escape(sent)}
</div>
"""
        for number, sent in enumerate(sentences[:3], start=1)
    )


def _result_section(title: str, body_html: str) -> str:
    """Wrap already-safe *body_html* in a titled result card."""
    return f"""
<div class='result-section'>
    <div class='result-title'>{title}</div>
    {body_html}
</div>
"""


def _error_outputs(message: str) -> List[str]:
    """Build the error card, repeated once per UI output slot."""
    error_html = _result_section("❌ Error", f"<div>{html.escape(message)}</div>")
    return [error_html] * _OUTPUT_COUNT


def analyze_pdf(file_obj) -> List:
    """Main analysis function for the Gradio interface.

    Args:
        file_obj: Value delivered by the ``gr.File`` input: either an object
            exposing the temp-file path as ``.name`` or a plain path string.
            ``None`` when nothing was uploaded.

    Returns:
        Six HTML fragments, in order: risk display, penalty counts, penalty
        amounts, obligation counts, delay counts, example clauses. On failure
        all six slots carry the same error card.
    """
    if file_obj is None:
        return _error_outputs("Please upload a PDF file before running the analysis.")

    try:
        # Extract text from the uploaded file
        pdf_path = file_obj if isinstance(file_obj, str) else file_obj.name
        text = extract_text_from_pdf(pdf_path)

        # Define keywords to search for
        penalty_keywords = ["penalty", "fine", "forfeit", "liquidated damages", "breach"]
        obligation_keywords = ["shall", "must", "required to", "obligated to", "duty"]
        delay_keywords = ["delay", "late", "overdue", "extension", "time is of the essence"]

        # Count keyword occurrences
        penalty_counts = count_keywords(text, penalty_keywords)
        obligation_counts = count_keywords(text, obligation_keywords)
        delay_counts = count_keywords(text, delay_keywords)

        # Find penalty values
        penalty_values = find_penalty_values(text)

        # Calculate total counts
        total_penalties = sum(penalty_counts.values())
        total_obligations = sum(obligation_counts.values())
        total_delays = sum(delay_counts.values())

        # Calculate risk score
        risk_score, risk_level = calculate_risk_score(
            total_penalties, penalty_values, total_obligations, total_delays
        )

        # Generate risk display
        risk_display = create_risk_display(risk_score, risk_level)

        # Find example sentences with penalties. Use the same whole-word
        # matching as the counts so "fine" does not match "defined".
        penalty_patterns = [_keyword_pattern(kw) for kw in penalty_keywords]
        penalty_sentences = [
            sentence.strip()
            for sentence in re.split(r'(?<=[.!?])\s+', text)
            if any(pattern.search(sentence) for pattern in penalty_patterns)
        ]

        # Format all results
        return [
            risk_display,
            _result_section(
                f"📊 Penalty Clauses: {total_penalties} found",
                format_clauses(penalty_counts),
            ),
            _result_section(
                f"💰 Penalty Amounts: {len(penalty_values)} found",
                format_penalty_amounts(penalty_values),
            ),
            _result_section(
                f"⚖️ Obligation Clauses: {total_obligations} found",
                format_clauses(obligation_counts),
            ),
            _result_section(
                f"⏱️ Delay Clauses: {total_delays} found",
                format_clauses(delay_counts),
            ),
            _result_section(
                "🔍 Example Penalty Clauses",
                format_examples(penalty_sentences),
            ),
        ]
    except Exception as e:
        # UI boundary: show the failure in the page instead of raising.
        return _error_outputs(str(e))


# Create Gradio interface
with gr.Blocks(css=css, title="PDF Contract Risk Analyzer") as demo:
    gr.Markdown("""
# 📄 PDF Contract Risk Analyzer

Upload a contract PDF to analyze penalties, obligations, and delays
""")

    with gr.Row():
        with gr.Column(scale=1):
            file_input = gr.File(label="Upload PDF", file_types=[".pdf"])
            submit_btn = gr.Button("Analyze Contract", variant="primary")
        with gr.Column(scale=3):
            gr.Markdown("### 🔍 Risk Assessment Summary")
            risk_display = gr.HTML()

    with gr.Row():
        with gr.Column():
            penalty_count = gr.HTML()
            penalty_amounts = gr.HTML()
        with gr.Column():
            obligation_count = gr.HTML()
            delay_count = gr.HTML()

    penalty_examples = gr.HTML()

    submit_btn.click(
        fn=analyze_pdf,
        inputs=file_input,
        outputs=[
            risk_display,
            penalty_count,
            penalty_amounts,
            obligation_count,
            delay_count,
            penalty_examples,
        ],
    )

if __name__ == "__main__":
    demo.launch()