Spaces:

Really-amin
/

Hoghoghi

Paused

App Files Files Community

Really-amin commited on Aug 31

Commit

91a893c

verified ·

1 Parent(s): 70b6e55

Upload 2 files

Browse files

Files changed (2) hide show

app/enhanced_legal_scraper.py +366 -0
app/legal_scraper_interface.py +1190 -0

app/enhanced_legal_scraper.py ADDED Viewed

	@@ -0,0 +1,366 @@

+import gradio as gr
+import os
+import sys
+from pathlib import Path
+# اضافه کردن مسیر فعلی به sys.path
+sys.path.insert(0, str(Path(__file__).parent))
+# ایمپورت رابط اسکراپر
+from enhanced_legal_scraper import EnhancedLegalScraper, LegalDocument
+import pandas as pd
+import sqlite3
+import json
+from datetime import datetime
+from typing import List, Dict, Tuple
+import plotly.express as px
+class LegalScraperInterface:
+    """Gradio interface for enhanced legal scraper"""
+    def __init__(self):
+        """Create the scraper backend and clear the busy flag."""
+        # 1.5 is forwarded as the scraper's `delay` argument.
+        self.scraper = EnhancedLegalScraper(delay=1.5)
+        # True only while scrape_websites() is running; used to refuse a second run.
+        self.is_scraping = False
+    def scrape_websites(self, urls_text: str, max_docs: int) -> Tuple[str, str, str]:
+        """Scrape websites from provided URLs"""
+        if self.is_scraping:
+            return "❌ اسکراپینگ در حال انجام است", "", ""
+        urls = [url.strip() for url in urls_text.split('\n') if url.strip()]
+        if not urls:
+            return "❌ لطفاً URL وارد کنید", "", ""
+        try:
+            self.is_scraping = True
+            documents = self.scraper.scrape_real_sources(urls, max_docs)
+            status = f"✅ اسکراپینگ کامل شد - {len(documents)} سند جمع‌آوری شد"
+            summary_lines = [
+                f"📊 **خلاصه نتایج:**",
+                f"- تعداد کل اسناد: {len(documents)}",
+                f"- منابع پردازش شده: {len(urls)}",
+                f"- زمان اسکراپینگ: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
+                "",
+                "📋 **جزئیات:**"
+            ]
+            for i, doc in enumerate(documents[:5]):
+                summary_lines.append(f"{i+1}. {doc.title[:50]}...")
+            summary = "\n".join(summary_lines)
+            preview_lines = []
+            for doc in documents[:3]:
+                preview_lines.extend([
+                    f"**{doc.title}**",
+                    f"نوع: {doc.document_type}",
+                    f"منبع: {doc.source_url}",
+                    f"امتیاز اهمیت: {doc.importance_score:.2f}",
+                    f"خلاصه: {doc.summary[:100]}..." if doc.summary else "بدون خلاصه",
+                    "---"
+                ])
+            preview = "\n".join(preview_lines) if preview_lines else "هیچ سندی یافت نشد"
+            return status, summary, preview
+        except Exception as e:
+            error_msg = f"❌ خطا در اسکراپینگ: {str(e)}"
+            return error_msg, "", ""
+        finally:
+            self.is_scraping = False
+    def get_database_stats(self) -> Tuple[str, str]:
+        """Get database statistics and visualizations"""
+        try:
+            stats = self.scraper.get_enhanced_statistics()
+            stats_lines = [
+                "📊 **آمار پایگاه داده:**",
+                f"- کل اسناد: {stats.get('total_documents', 0)}",
+                "",
+                "📈 **بر اساس نوع:**"
+            ]
+            for doc_type, count in stats.get('by_type', {}).items():
+                type_name = {
+                    'law': 'قوانین',
+                    'news': 'اخبار',
+                    'ruling': 'آرا',
+                    'regulation': 'آیین‌نامه',
+                    'general': 'عمومی'
+                }.get(doc_type, doc_type)
+                stats_lines.append(f"- {type_name}: {count}")
+            stats_text = "\n".join(stats_lines)
+            viz_html = self._create_stats_visualization(stats)
+            return stats_text, viz_html
+        except Exception as e:
+            error_msg = f"خطا در دریافت آمار: {str(e)}"
+            return error_msg, ""
+    def _create_stats_visualization(self, stats: Dict) -> str:
+        """Create visualization for statistics"""
+        try:
+            by_type = stats.get('by_type', {})
+            if by_type and stats.get('total_documents', 0) > 0:
+                type_names = {
+                    'law': 'قوانین',
+                    'news': 'اخبار',
+                    'ruling': 'آرا',
+                    'regulation': 'آیین‌نامه',
+                    'general': 'عمومی'
+                }
+                labels = [type_names.get(k, k) for k in by_type.keys()]
+                values = list(by_type.values())
+                fig = px.pie(
+                    values=values,
+                    names=labels,
+                    title="توزیع اسناد بر اساس نوع"
+                )
+                fig.update_traces(textposition='inside', textinfo='percent+label')
+                return fig.to_html()
+            else:
+                return "<p>داده‌ای برای نمایش یافت نشد</p>"
+        except Exception as e:
+            return f"<p>خطا در ایجاد نمودار: {str(e)}</p>"
+    def search_documents(self, query: str, search_type: str) -> str:
+        """Search in collected documents"""
+        if not query.strip():
+            return "لطفاً کلیدواژه‌ای برای جستجو وارد کنید"
+        try:
+            if search_type == "هوشمند":
+                results = self.scraper.search_with_similarity(query, limit=10)
+            else:
+                results = self.scraper._text_search(query, limit=10)
+            if not results:
+                return f"هیچ سندی با کلیدواژه '{query}' یافت نشد"
+            result_lines = [f"🔍 **نتایج جستجو برای '{query}':** ({len(results)} مورد یافت شد)\n"]
+            for i, result in enumerate(results):
+                result_lines.extend([
+                    f"**{i+1}. {result['title']}**",
+                    f"   نوع: {result['document_type']}",
+                    f"   منبع: {result['source_url']}",
+                    f"   امتیاز شباهت: {result.get('similarity_score', 0):.3f}" if 'similarity_score' in result else "",
+                    f"   تاریخ: {result['date_published'] or 'نامشخص'}",
+                    f"   خلاصه: {result['summary'][:100]}..." if result.get('summary') else "",
+                    "---"
+                ])
+            return "\n".join(result_lines)
+        except Exception as e:
+            error_msg = f"خطا در جستجو: {str(e)}"
+            return error_msg
+def create_scraper_interface():
+    """Build and return the Gradio Blocks UI for the legal scraper.
+
+    The UI has four tabs: scraping, search, statistics and a static help
+    page. All button callbacks are bound to one LegalScraperInterface
+    instance created here. The returned object is not launched.
+    """
+    scraper_interface = LegalScraperInterface()
+    # Custom CSS: constrains the page width and styles the gradient header banner.
+    css = """
+    .gradio-container {
+        max-width: 1200px !important;
+        margin: auto;
+        font-family: 'Tahoma', sans-serif;
+    }
+    .header {
+        background: linear-gradient(135deg, #2c3e50, #3498db);
+        color: white;
+        padding: 20px;
+        border-radius: 10px;
+        text-align: center;
+        margin-bottom: 20px;
+    }
+    """
+    with gr.Blocks(css=css, title="اسکراپر پیشرفته اسناد حقوقی", theme=gr.themes.Soft()) as interface:
+        # Page header banner (uses the .header CSS class defined above).
+        gr.HTML("""
+        <div class="header">
+            <h1>🤖 اسکراپر پیشرفته اسناد حقوقی</h1>
+            <p>سیستم هوشمند جمع‌آوری و تحلیل اسناد حقوقی با قابلیت‌های NLP</p>
+        </div>
+        """)
+        # --- Tab 1: scraping. URL list + document limit in, status/summary/preview out.
+        with gr.Tab("🕷️ اسکراپینگ"):
+            gr.Markdown("## جمع‌آوری اسناد از منابع حقوقی")
+            with gr.Row():
+                with gr.Column(scale=2):
+                    # One URL per line; pre-filled with three default sources.
+                    urls_input = gr.Textbox(
+                        label="📝 URL های منابع حقوقی",
+                        placeholder="هر URL را در یک خط وارد کنید:\nhttps://rc.majlis.ir\nhttps://dolat.ir",
+                        lines=5,
+                        value="\n".join([
+                            "https://rc.majlis.ir",
+                            "https://dolat.ir",
+                            "https://iribnews.ir"
+                        ])
+                    )
+                    max_docs = gr.Slider(
+                        label="حداکثر اسناد",
+                        minimum=5,
+                        maximum=50,
+                        value=15,
+                        step=5
+                    )
+                    scrape_btn = gr.Button("🚀 شروع اسکراپینگ", variant="primary")
+                with gr.Column(scale=1):
+                    status_output = gr.Textbox(
+                        label="⚡ وضعیت",
+                        interactive=False,
+                        lines=2
+                    )
+            with gr.Row():
+                summary_output = gr.Textbox(
+                    label="📊 خلاصه نتایج",
+                    interactive=False,
+                    lines=6
+                )
+                preview_output = gr.Textbox(
+                    label="👁️ پیش‌نمایش اسناد",
+                    interactive=False,
+                    lines=6,
+                    show_copy_button=True
+                )
+            # scrape_websites returns (status, summary, preview), matching the three outputs.
+            scrape_btn.click(
+                fn=scraper_interface.scrape_websites,
+                inputs=[urls_input, max_docs],
+                outputs=[status_output, summary_output, preview_output]
+            )
+        # --- Tab 2: search over already-collected documents.
+        with gr.Tab("🔍 جستجوی هوشمند"):
+            gr.Markdown("## جستجوی پیشرفته در اسناد")
+            with gr.Row():
+                search_input = gr.Textbox(
+                    label="🔍 کلیدواژه جستجو",
+                    placeholder="موضوع یا کلیدواژه مورد نظر را وارد کنید..."
+                )
+                # The first choice is the value search_documents compares against
+                # to select similarity search; the other choice selects text search.
+                search_type = gr.Dropdown(
+                    label="نوع جستجو",
+                    choices=["هوشمند", "متنی"],
+                    value="هوشمند"
+                )
+            search_btn = gr.Button("🔍 جستجو", variant="primary")
+            search_results = gr.Textbox(
+                label="📋 نتایج جستجو",
+                interactive=False,
+                lines=15,
+                show_copy_button=True
+            )
+            search_btn.click(
+                fn=scraper_interface.search_documents,
+                inputs=[search_input, search_type],
+                outputs=[search_results]
+            )
+        # --- Tab 3: database statistics (text report + HTML chart).
+        with gr.Tab("📊 آمار و تحلیل"):
+            gr.Markdown("## آمار پیشرفته پایگاه داده")
+            stats_btn = gr.Button("📊 بروزرسانی آمار", variant="secondary")
+            with gr.Row():
+                stats_text = gr.Textbox(
+                    label="📈 آمار متنی",
+                    interactive=False,
+                    lines=10
+                )
+                stats_plot = gr.HTML(
+                    label="📊 نمودارها"
+                )
+            # No inputs: get_database_stats takes no arguments and returns (text, html).
+            stats_btn.click(
+                fn=scraper_interface.get_database_stats,
+                outputs=[stats_text, stats_plot]
+            )
+        # --- Tab 4: static help text (features, suggested sources, technical notes).
+        with gr.Tab("📚 راهنما"):
+            gr.Markdown("""
+            # 🤖 راهنمای اسکراپر پیشرفته
+            ## ویژگی‌های پیشرفته
+            ### 🧠 پردازش زبان طبیعی (NLP)
+            - استخراج خودکار کلمات کلیدی
+            - تولید خلاصه متن
+            - تحلیل احساسات
+            - شناسایی موجودیت‌های حقوقی
+            - جستجوی هوشمند بر اساس شباهت معنایی
+            ### 📊 تحلیل پیشرفته
+            - امتیازدهی اهمیت اسناد
+            - طبقه‌بندی خودکار
+            - آمار و نمودارهای تحلیلی
+            - گزارش‌های آماری
+            ## منابع پیشنهادی
+            - **مجلس شورای اسلامی**: https://rc.majlis.ir
+            - **دولت**: https://dolat.ir
+            - **خبرگزاری‌ها**: IRIB, IRNA, Tasnim, Mehr, Fars
+            ## نکات فنی
+            - سیستم از فایل robots.txt پیروی می‌کند
+            - محدودیت سرعت درخواست رعایت می‌شود
+            - داده‌ها در پایگاه داده SQLite ذخیره می‌شوند
+            - از مدل‌های هوش مصنوعی برای پردازش استفاده می‌شود
+            ⚠️ **تذکر**: این ابزار برای مقاصد آموزشی و پژوهشی ارائه شده است.
+            """)
+    return interface
+def main():
+    """Main entry point for Hugging Face Spaces"""
+    print("🚀 راه اندازی اسکراپر پیشرفته اسناد حقوقی...")
+    print("📁 ایجاد دایرکتوری‌های مورد نیاز...")
+    # Create required directories
+    os.makedirs("/app/data", exist_ok=True)
+    os.makedirs("/app/logs", exist_ok=True)
+    os.makedirs("/app/cache", exist_ok=True)
+    # Create interface
+    interface = create_scraper_interface()
+    # Launch with Hugging Face optimized settings
+    interface.launch(
+        server_name="0.0.0.0",
+        server_port=7860,
+        share=False,
+        show_error=True,
+        debug=False,
+        enable_queue=True
+    )
+if __name__ == "__main__":
+    main()

app/legal_scraper_interface.py ADDED Viewed

	@@ -0,0 +1,1190 @@

+import requests
+import time
+import json
+import csv
+import sqlite3
+import logging
+from datetime import datetime, timedelta
+from typing import Dict, List, Optional, Tuple, Union
+from dataclasses import dataclass, asdict
+from pathlib import Path
+import re
+import pandas as pd
+import numpy as np
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
+from bs4 import BeautifulSoup
+try:
+    import torch
+    from transformers import AutoTokenizer, AutoModel
+    TORCH_AVAILABLE = True
+except ImportError:
+    TORCH_AVAILABLE = False
+    print("⚠️ PyTorch not available, running without advanced NLP features")
+try:
+    import hazm
+    from hazm import Normalizer, word_tokenize, sent_tokenize
+    HAZM_AVAILABLE = True
+except ImportError:
+    HAZM_AVAILABLE = False
+    print("⚠️ Hazm not available, using basic text processing")
+# Configure logging: INFO and above, written to 'legal_scraper.log' (relative to
+# the working directory) and to a stream handler.
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(levelname)s - %(message)s',
+    handlers=[
+        logging.FileHandler('legal_scraper.log'),
+        logging.StreamHandler()
+    ]
+)
+logger = logging.getLogger(__name__)
+# Predefined Iranian legal and news sources
+IRANIAN_LEGAL_SOURCES = [
+    "https://www.irna.ir",  # Islamic Republic News Agency (IRNA)
+    "https://www.tasnimnews.com",  # Tasnim News Agency
+    "https://www.mehrnews.com",  # Mehr News Agency
+    "https://www.farsnews.ir",  # Fars News Agency
+    "https://iribnews.ir",  # IRIB (state broadcaster) News Agency
+    "https://www.dolat.ir",  # Government portal
+    "https://rc.majlis.ir",  # Majlis (Parliament) Research Center
+]
+@dataclass
+class LegalDocument:
+    """A scraped legal document together with its NLP-derived fields.
+
+    Only ``title``, ``content``, ``source_url`` and ``document_type`` are
+    required. List-valued fields other than ``embedding`` default to ``None``
+    and are replaced with a fresh empty list in ``__post_init__``.
+    """
+    title: str
+    content: str
+    source_url: str
+    document_type: str
+    date_published: Optional[str] = None
+    # Filled with the current time in ISO format when left as None.
+    date_scraped: Optional[str] = None
+    category: Optional[str] = None
+    tags: Optional[List[str]] = None
+    summary: Optional[str] = None
+    importance_score: float = 0.0
+    sentiment_score: float = 0.0
+    legal_entities: Optional[List[str]] = None
+    keywords: Optional[List[str]] = None
+    # Unlike the other list fields, this one is left as None when not supplied.
+    embedding: Optional[List[float]] = None
+    language: str = "fa"
+    def __post_init__(self):
+        """Fill in the scrape timestamp and replace None list fields with []."""
+        if self.date_scraped is None:
+            self.date_scraped = datetime.now().isoformat()
+        if self.tags is None:
+            self.tags = []
+        if self.legal_entities is None:
+            self.legal_entities = []
+        if self.keywords is None:
+            self.keywords = []
+class PersianNLPProcessor:
+    """Persian NLP processor using available models"""
+    def __init__(self):
+        if HAZM_AVAILABLE:
+            self.normalizer = Normalizer()
+        else:
+            self.normalizer = None
+        self.device = torch.device('cpu')
+        self.tokenizer = None
+        self.model = None
+        if TORCH_AVAILABLE:
+            try:
+                model_names = [
+                    "HooshvareLab/bert-fa-base-uncased",
+                    "HooshvareLab/bert-base-parsbert-uncased",
+                    "distilbert-base-multilingual-cased"
+                ]
+                for model_name in model_names:
+                    try:
+                        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
+                        self.model = AutoModel.from_pretrained(model_name)
+                        self.model.to(self.device)
+                        logger.info(f"✅ Loaded model: {model_name}")
+                        break
+                    except Exception as e:
+                        logger.warning(f"⚠️ Failed to load {model_name}: {e}")
+                        continue
+            except Exception as e:
+                logger.error(f"❌ Failed to load any Persian BERT model: {e}")
+        self.legal_categories = {
+            'قانون': ['قانون', 'ماده', 'بند', 'فصل', 'تبصره', 'اصلاحیه'],
+            'رای': ['رای', 'حکم', 'دادگاه', 'قاضی', 'محکوم', 'دادرسی'],
+            'آیین‌نامه': ['آیین‌نامه', 'دستورالعمل', 'بخشنامه', 'مقررات'],
+            'اخبار': ['خبر', 'گزارش', 'اعلام', 'اطلاعیه', 'بیانیه'],
+            'نظریه': ['نظریه', 'تفسیر', 'استعلام', 'پاسخ', 'رأی']
+        }
+        self.tfidf = None
+        self._init_tfidf()
+    def _init_tfidf(self):
+        """Initialize TF-IDF vectorizer"""
+        try:
+            self.tfidf = TfidfVectorizer(
+                max_features=1000,
+                stop_words=self._get_persian_stopwords(),
+                ngram_range=(1, 2),
+                min_df=1,
+                max_df=0.8
+            )
+        except Exception as e:
+            logger.error(f"TF-IDF initialization failed: {e}")
+    def _get_persian_stopwords(self) -> List[str]:
+        """Get Persian stopwords"""
+        return [
+            'در', 'به', 'از', 'که', 'این', 'آن', 'با', 'را', 'و', 'است',
+            'برای', 'تا', 'کرد', 'شد', 'می', 'خود', 'هم', 'نیز', 'یا', 'اما',
+            'اگر', 'چون', 'پس', 'بعد', 'قبل', 'روی', 'زیر', 'کنار', 'داخل',
+            'نیست', 'بود', 'باشد', 'کند', 'کنند', 'شود', 'گردد', 'دارد', 'دارند'
+        ]
+    def normalize_text(self, text: str) -> str:
+        """Normalize Persian text"""
+        if not text:
+            return ""
+        try:
+            text = re.sub(r'[^\w\s\u0600-\u06FF]', ' ', text)
+            text = re.sub(r'\s+', ' ', text)
+            if self.normalizer:
+                text = self.normalizer.normalize(text)
+            return text.strip()
+        except Exception as e:
+            logger.error(f"Text normalization failed: {e}")
+            return text.strip()
+    def extract_keywords(self, text: str, top_k: int = 10) -> List[str]:
+        """Extract keywords using TF-IDF"""
+        try:
+            if not self.tfidf or not text:
+                return []
+            normalized_text = self.normalize_text(text)
+            if HAZM_AVAILABLE:
+                tokens = word_tokenize(normalized_text)
+                processed_text = ' '.join(tokens)
+            else:
+                processed_text = normalized_text
+            tfidf_matrix = self.tfidf.fit_transform([processed_text])
+            feature_names = self.tfidf.get_feature_names_out()
+            scores = tfidf_matrix.toarray()[0]
+            keyword_scores = list(zip(feature_names, scores))
+            keyword_scores.sort(key=lambda x: x[1], reverse=True)
+            return [kw[0] for kw in keyword_scores[:top_k] if kw[1] > 0]
+        except Exception as e:
+            logger.error(f"Keyword extraction failed: {e}")
+            return []
+    def classify_document(self, text: str) -> Tuple[str, float]:
+        """Classify document type with confidence score"""
+        try:
+            normalized_text = self.normalize_text(text.lower())
+            scores = {}
+            for category, keywords in self.legal_categories.items():
+                score = 0
+                for keyword in keywords:
+                    count = normalized_text.count(keyword)
+                    score += count * (len(keyword) / 5)
+                if len(normalized_text) > 0:
+                    scores[category] = score / (len(normalized_text) / 1000)
+                else:
+                    scores[category] = 0
+            if not scores or max(scores.values()) == 0:
+                return "عمومی", 0.0
+            best_category = max(scores.items(), key=lambda x: x[1])
+            total_score = sum(scores.values())
+            confidence = min(best_category[1] / total_score, 1.0) if total_score > 0 else 0.0
+            return best_category[0], confidence
+        except Exception as e:
+            logger.error(f"Document classification failed: {e}")
+            return "عمومی", 0.0
+    def calculate_importance_score(self, doc: LegalDocument) -> float:
+        """Calculate document importance score"""
+        try:
+            score = 0.0
+            title_lower = doc.title.lower()
+            high_importance_words = ['قانون', 'اساسی', 'حکم', 'رای', 'مصوبه']
+            medium_importance_words = ['آیین‌نامه', 'بخشنامه', 'دستورالعمل']
+            for word in high_importance_words:
+                if word in title_lower:
+                    score += 0.3
+                    break
+            for word in medium_importance_words:
+                if word in title_lower:
+                    score += 0.2
+                    break
+            content_length = len(doc.content)
+            if content_length > 5000:
+                score += 0.25
+            elif content_length > 2000:
+                score += 0.15
+            elif content_length > 500:
+                score += 0.1
+            if doc.date_published:
+                try:
+                    date_formats = ['%Y-%m-%d', '%Y/%m/%d', '%d/%m/%Y']
+                    pub_date = None
+                    for fmt in date_formats:
+                        try:
+                            pub_date = datetime.strptime(doc.date_published, fmt)
+                            break
+                        except:
+                            continue
+                    if pub_date:
+                        days_old = (datetime.now() - pub_date).days
+                        if days_old < 30:
+                            score += 0.25
+                        elif days_old < 365:
+                            score += 0.15
+                        elif days_old < 1825:
+                            score += 0.05
+                except:
+                    pass
+            legal_keywords = ['قانون', 'ماده', 'بند', 'حکم', 'رای', 'دادگاه', 'محکمه']
+            content_lower = doc.content.lower()
+            keyword_count = sum(content_lower.count(kw) for kw in legal_keywords)
+            word_count = len(doc.content.split())
+            if word_count > 0:
+                keyword_density = keyword_count / word_count
+                score += min(keyword_density * 5, 0.2)
+            type_bonuses = {
+                'law': 0.2,
+                'ruling': 0.15,
+                'regulation': 0.1,
+                'news': 0.05
+            }
+            score += type_bonuses.get(doc.document_type, 0)
+            return min(score, 1.0)
+        except Exception as e:
+            logger.error(f"Importance score calculation failed: {e}")
+            return 0.0
+    def extract_legal_entities(self, text: str) -> List[str]:
+        """Extract legal entities from text"""
+        try:
+            entities = []
+            patterns = {
+                'قوانین': r'قانون\s+[\u0600-\u06FF\s]{3,30}',
+                'مواد': r'ماده\s+\d+[\u0600-\u06FF\s]*',
+                'دادگاه‌ها': r'دادگاه\s+[\u0600-\u06FF\s]{3,30}',
+                'مراجع': r'(وزارت|سازمان|اداره|شورای|کمیته)\s+[\u0600-\u06FF\s]{3,30}',
+                'احکام': r'(حکم|رای)\s+(شماره\s+)?\d+',
+            }
+            for entity_type, pattern in patterns.items():
+                matches = re.findall(pattern, text)
+                for match in matches:
+                    clean_match = re.sub(r'\s+', ' ', match.strip())
+                    if len(clean_match) > 5 and len(clean_match) < 100:
+                        entities.append(clean_match)
+            unique_entities = list(dict.fromkeys(entities))
+            return unique_entities[:15]
+        except Exception as e:
+            logger.error(f"Entity extraction failed: {e}")
+            return []
+    def get_text_embedding(self, text: str) -> Optional[List[float]]:
+        """Get text embedding using available model"""
+        if not self.model or not self.tokenizer or not TORCH_AVAILABLE:
+            return None
+        try:
+            normalized_text = self.normalize_text(text)
+            if len(normalized_text) > 512:
+                normalized_text = normalized_text[:512]
+            if not normalized_text:
+                return None
+            inputs = self.tokenizer(
+                normalized_text,
+                return_tensors="pt",
+                padding=True,
+                truncation=True,
+                max_length=512
+            ).to(self.device)
+            with torch.no_grad():
+                outputs = self.model(**inputs)
+                embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()[0]
+            return embedding.tolist()
+        except Exception as e:
+            logger.error(f"Embedding generation failed: {e}")
+            return None
+    def generate_summary(self, text: str, max_length: int = 200) -> str:
+        """Generate text summary"""
+        try:
+            if len(text) <= max_length:
+                return text
+            if HAZM_AVAILABLE:
+                sentences = sent_tokenize(text)
+            else:
+                sentences = re.split(r'[.!?]+', text)
+                sentences = [s.strip() for s in sentences if s.strip()]
+            if len(sentences) <= 2:
+                return text[:max_length] + "..." if len(text) > max_length else text
+            keywords = self.extract_keywords(text, top_k=15)
+            sentence_scores = []
+            for sentence in sentences:
+                if len(sentence) < 20:
+                    continue
+                score = 0
+                sentence_lower = sentence.lower()
+                for kw in keywords:
+                    if kw in sentence_lower:
+                        score += 1
+                legal_terms = ['قانون', 'ماده', 'حکم', 'رای', 'دادگاه']
+                for term in legal_terms:
+                    if term in sentence_lower:
+                        score += 0.5
+                if len(sentence) > 200:
+                    score *= 0.8
+                sentence_scores.append((sentence, score))
+            sentence_scores.sort(key=lambda x: x[1], reverse=True)
+            selected_sentences = []
+            current_length = 0
+            for sentence, score in sentence_scores:
+                if current_length + len(sentence) <= max_length:
+                    selected_sentences.append(sentence)
+                    current_length += len(sentence)
+                else:
+                    break
+            if not selected_sentences:
+                return text[:max_length] + "..."
+            summary = ' '.join(selected_sentences)
+            return summary if len(summary) <= max_length else summary[:max_length] + "..."
+        except Exception as e:
+            logger.error(f"Summary generation failed: {e}")
+            return text[:max_length] + "..." if len(text) > max_length else text
+    def process_document(self, doc: LegalDocument) -> LegalDocument:
+        """Process document with all available NLP features"""
+        try:
+            logger.info(f"Processing document: {doc.title[:50]}...")
+            doc.keywords = self.extract_keywords(doc.content)
+            doc_type, confidence = self.classify_document(doc.content)
+            if confidence > 0.3:
+                doc.category = doc_type
+            doc.importance_score = self.calculate_importance_score(doc)
+            doc.legal_entities = self.extract_legal_entities(doc.content)
+            doc.summary = self.generate_summary(doc.content)
+            doc.embedding = self.get_text_embedding(doc.content)
+            logger.info(f"✅ Processed: {doc.title[:30]}... (Score: {doc.importance_score:.2f})")
+            return doc
+        except Exception as e:
+            logger.error(f"Document processing failed: {e}")
+            return doc
+class EnhancedLegalScraper:
+    """Enhanced legal scraper with real web scraping and NLP"""
+    def __init__(self, delay: float = 1.0):
+        self.delay = delay
+        self.session = requests.Session()
+        try:
+            self.nlp_processor = PersianNLPProcessor()
+            logger.info("✅ NLP processor initialized")
+        except Exception as e:
+            logger.error(f"❌ NLP processor initialization failed: {e}")
+            self.nlp_processor = None
+        self.db_path = self._get_db_path()
+        self.session.headers.update({
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
+            'Accept-Language': 'fa,en-US;q=0.7,en;q=0.3',
+            'Accept-Encoding': 'gzip, deflate',
+            'Connection': 'keep-alive',
+            'Upgrade-Insecure-Requests': '1',
+        })
+        self._init_database()
+    def _get_db_path(self) -> str:
+        """Get appropriate database path for the environment"""
+        possible_paths = [
+            "/tmp/legal_scraper.db",
+            "./data/legal_scraper.db",
+            "legal_scraper.db"
+        ]
+        for path in possible_paths:
+            try:
+                Path(path).parent.mkdir(parents=True, exist_ok=True)
+                return path
+            except:
+                continue
+        return ":memory:"
+    def _init_database(self):
+        """Initialize enhanced database with NLP fields"""
+        try:
+            conn = sqlite3.connect(self.db_path)
+            cursor = conn.cursor()
+            cursor.execute('''
+            CREATE TABLE IF NOT EXISTS legal_documents (
+                id INTEGER PRIMARY KEY AUTOINCREMENT,
+                title TEXT NOT NULL,
+                content TEXT NOT NULL,
+                source_url TEXT UNIQUE NOT NULL,
+                document_type TEXT NOT NULL,
+                date_published TEXT,
+                date_scraped TEXT NOT NULL,
+                category TEXT,
+                tags TEXT,
+                summary TEXT,
+                importance_score REAL DEFAULT 0.0,
+                sentiment_score REAL DEFAULT 0.0,
+                legal_entities TEXT,
+                keywords TEXT,
+                embedding TEXT,
+                language TEXT DEFAULT 'fa',
+                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
+            )
+            ''')
+            indexes = [
+                'CREATE INDEX IF NOT EXISTS idx_source_url ON legal_documents(source_url)',
+                'CREATE INDEX IF NOT EXISTS idx_document_type ON legal_documents(document_type)',
+                'CREATE INDEX IF NOT EXISTS idx_importance_score ON legal_documents(importance_score DESC)',
+                'CREATE INDEX IF NOT EXISTS idx_category ON legal_documents(category)',
+                'CREATE INDEX IF NOT EXISTS idx_date_published ON legal_documents(date_published)',
+                'CREATE INDEX IF NOT EXISTS idx_date_scraped ON legal_documents(date_scraped DESC)'
+            ]
+            for index in indexes:
+                cursor.execute(index)
+            conn.commit()
+            conn.close()
+            logger.info(f"✅ Database initialized: {self.db_path}")
+        except Exception as e:
+            logger.error(f"❌ Database initialization failed: {e}")
+            raise
+    def save_document(self, doc: LegalDocument) -> bool:
+        """Insert (or replace) one document row in the ``legal_documents`` table.
+
+        The tags, legal_entities, keywords and embedding fields are stored as JSON
+        text; a falsy value for any of them is stored as NULL.
+
+        Args:
+            doc: Document whose fields are written to the table.
+
+        Returns:
+            True when the row was written and committed, False when anything
+            failed (the error is logged, not raised).
+        """
+        conn = None
+        try:
+            conn = sqlite3.connect(self.db_path)
+            cursor = conn.cursor()
+            # INSERT OR REPLACE overwrites a row that conflicts on a uniqueness
+            # constraint; which column is unique is defined in the table schema.
+            cursor.execute('''
+            INSERT OR REPLACE INTO legal_documents
+            (title, content, source_url, document_type, date_published,
+             date_scraped, category, tags, summary, importance_score,
+             sentiment_score, legal_entities, keywords, embedding, language)
+            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+            ''', (
+                doc.title,
+                doc.content,
+                doc.source_url,
+                doc.document_type,
+                doc.date_published,
+                doc.date_scraped,
+                doc.category,
+                json.dumps(doc.tags, ensure_ascii=False) if doc.tags else None,
+                doc.summary,
+                doc.importance_score,
+                doc.sentiment_score,
+                json.dumps(doc.legal_entities, ensure_ascii=False) if doc.legal_entities else None,
+                json.dumps(doc.keywords, ensure_ascii=False) if doc.keywords else None,
+                json.dumps(doc.embedding) if doc.embedding else None,
+                doc.language
+            ))
+            conn.commit()
+            return True
+        except Exception as e:
+            logger.error(f"Failed to save document {doc.source_url}: {e}")
+            return False
+        finally:
+            # Close on every path so a failed insert does not leak the connection.
+            if conn is not None:
+                conn.close()
+    def get_enhanced_statistics(self) -> Dict:
+        """Collect aggregate statistics about the stored legal documents.
+
+        Returns:
+            A dict with the keys ``total_documents``, ``by_type``, ``by_category``,
+            ``importance_distribution`` (high: score >= 0.7, medium: 0.3 <= score < 0.7,
+            low: score < 0.3), ``top_keywords`` (the 25 most frequent keywords),
+            ``recent_activity`` (documents per day over the last 7 days),
+            ``avg_importance_by_type``, ``documents_with_embeddings`` and
+            ``by_language``. If anything fails, the error is logged and the same
+            keys are returned with empty / zero values.
+        """
+        from collections import Counter
+        conn = None
+        try:
+            conn = sqlite3.connect(self.db_path)
+            cursor = conn.cursor()
+            stats = {}
+            cursor.execute('SELECT COUNT(*) FROM legal_documents')
+            stats['total_documents'] = cursor.fetchone()[0]
+            cursor.execute('SELECT document_type, COUNT(*) FROM legal_documents GROUP BY document_type')
+            stats['by_type'] = dict(cursor.fetchall())
+            cursor.execute('SELECT category, COUNT(*) FROM legal_documents WHERE category IS NOT NULL GROUP BY category')
+            stats['by_category'] = dict(cursor.fetchall())
+            cursor.execute('SELECT COUNT(*) FROM legal_documents WHERE importance_score >= 0.7')
+            high_importance = cursor.fetchone()[0]
+            cursor.execute('SELECT COUNT(*) FROM legal_documents WHERE importance_score >= 0.3 AND importance_score < 0.7')
+            medium_importance = cursor.fetchone()[0]
+            cursor.execute('SELECT COUNT(*) FROM legal_documents WHERE importance_score < 0.3')
+            low_importance = cursor.fetchone()[0]
+            stats['importance_distribution'] = {
+                'high': high_importance,
+                'medium': medium_importance,
+                'low': low_importance
+            }
+            cursor.execute('SELECT keywords FROM legal_documents WHERE keywords IS NOT NULL')
+            keyword_counts = Counter()
+            for (raw_keywords,) in cursor.fetchall():
+                try:
+                    keywords = json.loads(raw_keywords)
+                except (ValueError, TypeError):
+                    # Skip rows whose keywords column does not hold valid JSON.
+                    continue
+                if isinstance(keywords, list):
+                    # Non-string entries are ignored so one odd value cannot break counting.
+                    keyword_counts.update(kw for kw in keywords if isinstance(kw, str))
+            # Always present (possibly empty), matching the keys of the failure fallback below.
+            stats['top_keywords'] = dict(keyword_counts.most_common(25))
+            cursor.execute('''
+            SELECT DATE(date_scraped) as day, COUNT(*)
+            FROM legal_documents
+            WHERE date_scraped >= date('now', '-7 days')
+            GROUP BY DATE(date_scraped)
+            ORDER BY day DESC
+            ''')
+            stats['recent_activity'] = dict(cursor.fetchall())
+            cursor.execute('''
+            SELECT document_type, AVG(importance_score)
+            FROM legal_documents
+            GROUP BY document_type
+            ''')
+            stats['avg_importance_by_type'] = dict(cursor.fetchall())
+            cursor.execute('SELECT COUNT(*) FROM legal_documents WHERE embedding IS NOT NULL')
+            stats['documents_with_embeddings'] = cursor.fetchone()[0]
+            cursor.execute('SELECT language, COUNT(*) FROM legal_documents GROUP BY language')
+            stats['by_language'] = dict(cursor.fetchall())
+            return stats
+        except Exception as e:
+            logger.error(f"Statistics generation failed: {e}")
+            return {
+                'total_documents': 0,
+                'by_type': {},
+                'by_category': {},
+                'importance_distribution': {'high': 0, 'medium': 0, 'low': 0},
+                'top_keywords': {},
+                'recent_activity': {},
+                'avg_importance_by_type': {},
+                'documents_with_embeddings': 0,
+                'by_language': {}
+            }
+        finally:
+            # Close on every path so a failing query does not leak the connection.
+            if conn is not None:
+                conn.close()
+    def search_with_similarity(self, query: str, limit: int = 20) -> List[Dict]:
+        """Rank stored documents against a query using embedding similarity.
+
+        Each document that has a stored embedding is scored as
+        ``0.7 * cosine similarity + 0.3 * importance_score``. When no NLP model is
+        available, no query embedding can be produced, or the search fails, the
+        plain text search is used instead.
+
+        Args:
+            query: Free-text search query.
+            limit: Maximum number of results to return.
+
+        Returns:
+            Result dicts ordered by ``combined_score``, highest first; content is
+            truncated to 500 characters.
+        """
+        if not self.nlp_processor or not self.nlp_processor.model:
+            return self._text_search(query, limit)
+        conn = None
+        try:
+            query_embedding = self.nlp_processor.get_text_embedding(query)
+            if not query_embedding:
+                return self._text_search(query, limit)
+            conn = sqlite3.connect(self.db_path)
+            cursor = conn.cursor()
+            cursor.execute('''
+            SELECT id, title, content, source_url, document_type,
+            importance_score, summary, embedding
+            FROM legal_documents
+            WHERE embedding IS NOT NULL
+            ''')
+            results = []
+            query_vector = np.array(query_embedding)
+            for row in cursor.fetchall():
+                try:
+                    doc_embedding = json.loads(row[7])
+                    doc_vector = np.array(doc_embedding)
+                    similarity = cosine_similarity([query_vector], [doc_vector])[0][0]
+                    # The column is nullable; a NULL importance counts as 0 in the
+                    # ranking instead of discarding the document.
+                    importance = row[5] if row[5] is not None else 0.0
+                    combined_score = (similarity * 0.7) + (importance * 0.3)
+                    results.append({
+                        'id': row[0],
+                        'title': row[1],
+                        'content': row[2][:500] + "..." if len(row[2]) > 500 else row[2],
+                        'source_url': row[3],
+                        'document_type': row[4],
+                        'importance_score': row[5],
+                        'summary': row[6],
+                        'similarity_score': similarity,
+                        'combined_score': combined_score
+                    })
+                except Exception as e:
+                    # One unusable embedding must not abort the whole search.
+                    logger.error(f"Error processing document embedding: {e}")
+                    continue
+            results.sort(key=lambda x: x['combined_score'], reverse=True)
+            return results[:limit]
+        except Exception as e:
+            logger.error(f"Similarity search failed: {e}")
+            return self._text_search(query, limit)
+        finally:
+            # Close on every path so a failed search does not leak the connection.
+            if conn is not None:
+                conn.close()
+    def _text_search(self, query: str, limit: int = 20) -> List[Dict]:
+        """Fallback search: match any query word in title or content via LIKE.
+
+        The query is normalized by the NLP processor when one is configured, split
+        on whitespace, and a document matches if any word occurs in its title or
+        content. Matches are ordered by importance_score, highest first.
+
+        Args:
+            query: Free-text search query.
+            limit: Maximum number of rows to return.
+
+        Returns:
+            Result dicts with content truncated to 500 characters and
+            ``similarity_score`` fixed at 0.0; an empty list for an empty query or
+            when the search fails (the error is logged).
+        """
+        conn = None
+        try:
+            if self.nlp_processor:
+                normalized_query = self.nlp_processor.normalize_text(query)
+            else:
+                normalized_query = query
+            query_words = normalized_query.split()
+            if not query_words:
+                # No search terms would produce an empty, invalid WHERE clause.
+                return []
+            search_conditions = []
+            params = []
+            for word in query_words:
+                search_conditions.append("(title LIKE ? OR content LIKE ?)")
+                params.extend([f'%{word}%', f'%{word}%'])
+            # Only fixed fragments with placeholders are interpolated; the words
+            # themselves are passed as bound parameters.
+            where_clause = " OR ".join(search_conditions)
+            conn = sqlite3.connect(self.db_path)
+            cursor = conn.cursor()
+            cursor.execute(f'''
+            SELECT id, title, content, source_url, document_type,
+            importance_score, summary
+            FROM legal_documents
+            WHERE {where_clause}
+            ORDER BY importance_score DESC
+            LIMIT ?
+            ''', params + [limit])
+            results = []
+            for row in cursor.fetchall():
+                results.append({
+                    'id': row[0],
+                    'title': row[1],
+                    'content': row[2][:500] + "..." if len(row[2]) > 500 else row[2],
+                    'source_url': row[3],
+                    'document_type': row[4],
+                    'importance_score': row[5],
+                    'summary': row[6],
+                    'similarity_score': 0.0
+                })
+            return results
+        except Exception as e:
+            logger.error(f"Text search failed: {e}")
+            return []
+        finally:
+            # Close on every path so a failed query does not leak the connection.
+            if conn is not None:
+                conn.close()
+    def export_to_csv(self, filename: Optional[str] = None) -> str:
+        """Export all stored documents to a CSV file.
+
+        Rows are ordered by importance_score, then date_scraped, both descending.
+        The JSON-encoded keywords and legal_entities columns are flattened to
+        comma-separated text. The file is written as UTF-8 with a BOM
+        (``utf-8-sig``).
+
+        Args:
+            filename: Target path; when omitted, a timestamped
+                ``legal_documents_YYYYmmdd_HHMMSS.csv`` name is generated.
+
+        Returns:
+            The path written, or an empty string when the export failed (the
+            error is logged).
+        """
+        conn = None
+        try:
+            if not filename:
+                timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
+                filename = f"legal_documents_{timestamp}.csv"
+            conn = sqlite3.connect(self.db_path)
+            query = '''
+            SELECT title, content, source_url, document_type,
+            date_published, date_scraped, category, summary,
+            importance_score, keywords, legal_entities
+            FROM legal_documents
+            ORDER BY importance_score DESC, date_scraped DESC
+            '''
+            df = pd.read_sql_query(query, conn)
+            for col in ['keywords', 'legal_entities']:
+                if col in df.columns:
+                    df[col] = df[col].apply(lambda x: ', '.join(json.loads(x)) if x else '')
+            df.to_csv(filename, index=False, encoding='utf-8-sig')
+            logger.info(f"✅ Data exported to CSV: {filename}")
+            return filename
+        except Exception as e:
+            logger.error(f"CSV export failed: {e}")
+            return ""
+        finally:
+            # Close on every path so a failed query does not leak the connection.
+            if conn is not None:
+                conn.close()
+    def scrape_real_sources(self, urls: List[str] = IRANIAN_LEGAL_SOURCES, max_docs: int = 20) -> List[LegalDocument]:
+        """Fetch each URL in turn and turn the extracted items into documents.
+
+        Waits ``self.delay`` seconds before every request. A URL that fails is
+        logged and skipped; documents already collected are kept. Stops once
+        ``max_docs`` documents have been collected.
+
+        Args:
+            urls: Pages to fetch.
+            max_docs: Upper bound on the number of documents returned.
+
+        Returns:
+            The collected documents sorted by importance_score, highest first.
+        """
+        collected = []
+        for position, url in enumerate(urls, start=1):
+            if len(collected) >= max_docs:
+                break
+            try:
+                logger.info(f"🔄 Scraping {position}/{len(urls)}: {url}")
+                time.sleep(self.delay)
+                response = self.session.get(url, timeout=15)
+                response.raise_for_status()
+                # NOTE(review): this override changes how response.text would be
+                # decoded, while the parser below is handed response.content - confirm
+                # which of the two is intended.
+                if response.encoding == 'ISO-8859-1':
+                    response.encoding = response.apparent_encoding
+                page = BeautifulSoup(response.content, 'html.parser')
+                # Ask the extractor only for as many items as are still needed.
+                still_needed = max_docs - len(collected)
+                for entry in self._extract_source_specific_content(page, url, still_needed):
+                    if len(collected) >= max_docs:
+                        break
+                    document = LegalDocument(
+                        title=entry['title'],
+                        content=entry['content'],
+                        source_url=entry['url'],
+                        document_type=self._determine_document_type(entry['title'], entry['content']),
+                        date_published=entry['date']
+                    )
+                    if self.nlp_processor:
+                        document = self.nlp_processor.process_document(document)
+                    collected.append(document)
+                    logger.info(f"✅ Extracted: {document.title[:50]}...")
+            except Exception as exc:
+                logger.error(f"❌ Error scraping {url}: {exc}")
+                continue
+        return sorted(collected, key=lambda d: d.importance_score, reverse=True)
+    def _extract_source_specific_content(self, soup: BeautifulSoup, url: str, max_items: int) -> List[Dict]:
+        """Route a parsed page to the extractor that matches its URL.
+
+        The URL is checked for a known news-site domain as a substring; a URL
+        matching none of them is handled by the generic extractor.
+        """
+        # Checked in order; the first marker contained in the URL wins.
+        source_extractors = (
+            ('irna.ir', self._extract_irna_content),
+            ('tasnimnews.com', self._extract_tasnim_content),
+            ('mehrnews.com', self._extract_mehr_content),
+            ('farsnews.ir', self._extract_fars_content),
+        )
+        for marker, extractor in source_extractors:
+            if marker in url:
+                return extractor(soup, url, max_items)
+        return self._extract_generic_content(soup, url, max_items)
+    def _extract_irna_content(self, soup: BeautifulSoup, base_url: str, max_items: int) -> List[Dict]:
+        """Extract article entries from an IRNA page.
+
+        Listing blocks are tried first; when none of them yields an entry, the
+        page's main content area is returned as a single entry. Extraction errors
+        are logged and whatever was gathered so far is returned.
+
+        Args:
+            soup: Parsed page.
+            base_url: URL of the page; stored as the ``url`` of every entry.
+            max_items: Maximum number of listing blocks to inspect.
+
+        Returns:
+            A list of dicts with the keys ``title``, ``content``, ``url`` and ``date``.
+        """
+        items = []
+        try:
+            articles = soup.select('.news-item, .article, .story')[:max_items]
+            for article in articles:
+                # Look the heading up inside this article; searching the whole page
+                # would give every entry the same first heading.
+                title_elem = article.select_one('h1, h2, h3, .title, .headline, a')
+                if title_elem:
+                    title = title_elem.get_text(strip=True)
+                    content = article.get_text(strip=True)
+                    # Skip blocks too short to be a real article.
+                    if len(title) > 10 and len(content) > 100:
+                        items.append({
+                            'title': title,
+                            'content': content,
+                            'url': base_url,
+                            'date': self._extract_date(soup)
+                        })
+            if not items:
+                main_content = soup.select_one('main, .main-content, .content, article')
+                if main_content:
+                    title = soup.select_one('h1, title')
+                    title_text = title.get_text(strip=True) if title else "خبر ایرنا"
+                    content_text = main_content.get_text(strip=True)
+                    if len(content_text) > 200:
+                        items.append({
+                            'title': title_text,
+                            'content': content_text,
+                            'url': base_url,
+                            'date': self._extract_date(soup)
+                        })
+        except Exception as e:
+            logger.error(f"IRNA extraction error: {e}")
+        return items
+    def _extract_tasnim_content(self, soup: BeautifulSoup, base_url: str, max_items: int) -> List[Dict]:
+        """Collect article entries from a Tasnim page.
+
+        Listing blocks are tried first; if none of them yields an entry, the page's
+        main content area is used as one single entry. Errors are logged and the
+        entries gathered so far are returned. Each entry is a dict with the keys
+        ``title``, ``content``, ``url`` (always ``base_url``) and ``date``.
+        """
+        found = []
+        try:
+            for block in soup.select('.news-box, .item, .story-item')[:max_items]:
+                heading = block.select_one('h2, h3, .title, a')
+                if not heading:
+                    continue
+                heading_text = heading.get_text(strip=True)
+                block_text = block.get_text(strip=True)
+                # Skip blocks too short to be a real article.
+                if len(heading_text) <= 10 or len(block_text) <= 100:
+                    continue
+                found.append({
+                    'title': heading_text,
+                    'content': block_text,
+                    'url': base_url,
+                    'date': self._extract_date(soup)
+                })
+            if found:
+                return found
+            # Nothing usable in the listing: fall back to the main content area.
+            page_body = soup.select_one('.news-content, .story-body, main')
+            if not page_body:
+                return found
+            page_heading = soup.select_one('h1, .news-title')
+            if page_heading:
+                page_title = page_heading.get_text(strip=True)
+            else:
+                page_title = "خبر تسنیم"
+            page_text = page_body.get_text(strip=True)
+            if len(page_text) > 200:
+                found.append({
+                    'title': page_title,
+                    'content': page_text,
+                    'url': base_url,
+                    'date': self._extract_date(soup)
+                })
+        except Exception as exc:
+            logger.error(f"Tasnim extraction error: {exc}")
+        return found
+    def _extract_mehr_content(self, soup: BeautifulSoup, base_url: str, max_items: int) -> List[Dict]:
+        """Collect article entries from a Mehr News page.
+
+        Tries up to ``max_items`` listing blocks; when none qualifies, the page's
+        main content area becomes a single entry. Errors are logged and the entries
+        gathered so far are returned. Each entry is a dict with the keys ``title``,
+        ``content``, ``url`` (always ``base_url``) and ``date``.
+        """
+        entries = []
+        try:
+            candidates = soup.select('.news-item, .article-item, .story')
+            for candidate in candidates[:max_items]:
+                headline = candidate.select_one('h2, h3, .title, .headline')
+                if not headline:
+                    continue
+                headline_text = headline.get_text(strip=True)
+                full_text = candidate.get_text(strip=True)
+                long_enough = len(headline_text) > 10 and len(full_text) > 100
+                if long_enough:
+                    entries.append({
+                        'title': headline_text,
+                        'content': full_text,
+                        'url': base_url,
+                        'date': self._extract_date(soup)
+                    })
+            if not entries:
+                # Nothing usable in the listing: fall back to the main content area.
+                body = soup.select_one('.content, .news-body, article')
+                if body:
+                    page_heading = soup.select_one('h1, .page-title')
+                    fallback_title = page_heading.get_text(strip=True) if page_heading else "خبر مهر"
+                    body_text = body.get_text(strip=True)
+                    if len(body_text) > 200:
+                        entries.append({
+                            'title': fallback_title,
+                            'content': body_text,
+                            'url': base_url,
+                            'date': self._extract_date(soup)
+                        })
+        except Exception as exc:
+            logger.error(f"Mehr extraction error: {exc}")
+        return entries
+    def _extract_fars_content(self, soup: BeautifulSoup, base_url: str, max_items: int) -> List[Dict]:
+        """Collect article entries from a Fars News page.
+
+        Inspects up to ``max_items`` listing blocks and, if none yields an entry,
+        falls back to the page's main content area as a single entry. Errors are
+        logged and the entries gathered so far are returned. Each entry is a dict
+        with the keys ``title``, ``content``, ``url`` (always ``base_url``) and ``date``.
+        """
+        results = []
+        try:
+            for node in soup.select('.news, .item, .story-item')[:max_items]:
+                title_node = node.select_one('h2, h3, .title, a')
+                if not title_node:
+                    continue
+                node_title = title_node.get_text(strip=True)
+                node_text = node.get_text(strip=True)
+                # Keep only blocks with a real headline and some body text.
+                if not (len(node_title) > 10 and len(node_text) > 100):
+                    continue
+                results.append({
+                    'title': node_title,
+                    'content': node_text,
+                    'url': base_url,
+                    'date': self._extract_date(soup)
+                })
+            if not results:
+                # Nothing usable in the listing: fall back to the main content area.
+                main_node = soup.select_one('.news-content, .story, main')
+                if main_node:
+                    heading_node = soup.select_one('h1, .news-title')
+                    if heading_node:
+                        main_title = heading_node.get_text(strip=True)
+                    else:
+                        main_title = "خبر فارس"
+                    main_text = main_node.get_text(strip=True)
+                    if len(main_text) > 200:
+                        results.append({
+                            'title': main_title,
+                            'content': main_text,
+                            'url': base_url,
+                            'date': self._extract_date(soup)
+                        })
+        except Exception as exc:
+            logger.error(f"Fars extraction error: {exc}")
+        return results
+    def _extract_generic_content(self, soup: BeautifulSoup, base_url: str, max_items: int) -> List[Dict]:
+        """Extract article entries from a page of an unrecognized source.
+
+        Article-like blocks are tried first. If none qualifies, the page title and
+        main content container are used as one entry, after script, style, nav,
+        header and footer elements have been removed from that container. Errors
+        are logged and the entries gathered so far are returned.
+        """
+        entries = []
+        try:
+            for block in soup.select('article, .article, .post, .news-item, .story')[:max_items]:
+                heading = block.select_one('h1, h2, h3, .title, .headline')
+                if not heading:
+                    continue
+                heading_text = heading.get_text(strip=True)
+                block_text = block.get_text(strip=True)
+                if len(heading_text) > 10 and len(block_text) > 150:
+                    entries.append({
+                        'title': heading_text,
+                        'content': block_text,
+                        'url': base_url,
+                        'date': self._extract_date(soup)
+                    })
+            if entries:
+                return entries
+            page_heading = soup.select_one('h1, title')
+            container = soup.select_one('main, .main-content, .content, .entry-content, body')
+            if not (page_heading and container):
+                return entries
+            # Drop page chrome and non-text elements before reading the text; this
+            # modifies the parsed tree in place.
+            for unwanted in container(['script', 'style', 'nav', 'header', 'footer']):
+                unwanted.decompose()
+            page_title = page_heading.get_text(strip=True)
+            page_text = container.get_text(strip=True)
+            if len(page_title) > 5 and len(page_text) > 200:
+                entries.append({
+                    'title': page_title,
+                    'content': page_text,
+                    'url': base_url,
+                    'date': self._extract_date(soup)
+                })
+        except Exception as exc:
+            logger.error(f"Generic extraction error: {exc}")
+        return entries
+    def _extract_document_from_soup(self, soup: BeautifulSoup, url: str) -> Optional[LegalDocument]:
+        """Build a LegalDocument from the first item the source extractor finds.
+
+        Returns:
+            The document, or None when nothing was extracted or an error occurred
+            (the error is logged).
+        """
+        try:
+            extracted = self._extract_source_specific_content(soup, url, 1)
+            if extracted:
+                first = extracted[0]
+                return LegalDocument(
+                    title=first['title'],
+                    content=first['content'],
+                    source_url=first['url'],
+                    document_type=self._determine_document_type(first['title'], first['content']),
+                    date_published=first['date']
+                )
+            return None
+        except Exception as exc:
+            logger.error(f"Document extraction failed: {exc}")
+            return None
+    def _extract_additional_articles(self, soup: BeautifulSoup, base_url: str) -> List[LegalDocument]:
+        """Build up to three LegalDocument objects from the items found on a page.
+
+        If an error occurs part-way through, it is logged and the documents built
+        before the failure are still returned.
+        """
+        built = []
+        try:
+            for entry in self._extract_source_specific_content(soup, base_url, 3):
+                kind = self._determine_document_type(entry['title'], entry['content'])
+                built.append(LegalDocument(
+                    title=entry['title'],
+                    content=entry['content'],
+                    source_url=entry['url'],
+                    document_type=kind,
+                    date_published=entry['date']
+                ))
+        except Exception as exc:
+            logger.error(f"Additional articles extraction failed: {exc}")
+        # The extractor is asked for three items; the slice enforces that cap.
+        return built[:3]
+    def _determine_document_type(self, title: str, content: str) -> str:
+        """Classify a document by the Persian keywords found in its title and content.
+
+        Returns:
+            'law', 'ruling', 'regulation' or 'news' for the first category with a
+            keyword occurring as a substring of the text, otherwise 'general'.
+        """
+        haystack = " ".join((title, content)).lower()
+        # Ordered by priority: the first category with any matching keyword wins.
+        keyword_rules = (
+            ('law', ('قانون', 'ماده', 'فصل', 'بند', 'تبصره')),
+            ('ruling', ('رای', 'حکم', 'دادگاه', 'قاضی')),
+            ('regulation', ('آیین‌نامه', 'دستورالعمل', 'بخشنامه')),
+            ('news', ('خبر', 'اعلام', 'گزارش', 'اطلاعیه')),
+        )
+        for label, markers in keyword_rules:
+            for marker in markers:
+                if marker in haystack:
+                    return label
+        return 'general'
+    def _extract_date(self, soup: BeautifulSoup) -> Optional[str]:
+        """Find a publication date on the page.
+
+        Metadata tags and date-like elements are checked first; the first one that
+        carries a value is passed through ``_normalize_date``. Otherwise the page
+        text is searched for a numeric date, which is returned as found.
+
+        Returns:
+            The date string, or None when nothing was found or an error occurred.
+        """
+        try:
+            for css in (
+                'meta[name="article:published_time"]',
+                'meta[property="article:published_time"]',
+                'meta[name="date"]',
+                'meta[name="DC.date"]',
+                '.date',
+                '.publish-date',
+                '.article-date',
+                'time[datetime]',
+            ):
+                node = soup.select_one(css)
+                if not node:
+                    continue
+                # Prefer attribute values over the element's visible text.
+                raw_value = node.get('content') or node.get('datetime') or node.get_text()
+                if raw_value:
+                    return self._normalize_date(raw_value)
+            page_text = soup.get_text()
+            for pattern in (
+                r'(\d{4}/\d{1,2}/\d{1,2})',
+                r'(\d{1,2}/\d{1,2}/\d{4})',
+                r'(\d{4}-\d{1,2}-\d{1,2})',
+            ):
+                hit = re.search(pattern, page_text)
+                if hit:
+                    return hit.group(1)
+            return None
+        except Exception:
+            return None
+    def _normalize_date(self, date_str: str) -> Optional[str]:
+        """Normalize a raw date string to ``YYYY-MM-DD`` where possible.
+
+        Everything except digits and the separators ``/``, ``-`` and ``:`` is
+        replaced by spaces, then the result is parsed against a fixed list of
+        formats. If the whole string does not parse, the first date-shaped token in
+        it is tried on its own, so timestamps carrying a UTC offset or fractional
+        seconds still yield their date.
+
+        Args:
+            date_str: Raw value taken from a meta tag, attribute or element text.
+
+        Returns:
+            The date as ``YYYY-MM-DD``; the cleaned input when it contains digits
+            but matches no known format; None when nothing usable remains or the
+            input cannot be processed.
+        """
+        try:
+            # Keep only digits and date/time separators, then collapse whitespace runs.
+            cleaned = ' '.join(re.sub(r'[^\d/\-:]', ' ', date_str).split())
+            if not cleaned:
+                return None
+            formats = (
+                '%Y-%m-%d',
+                '%Y/%m/%d',
+                '%d/%m/%Y',
+                '%Y-%m-%d %H:%M:%S',
+                '%Y/%m/%d %H:%M:%S',
+            )
+            candidates = [cleaned]
+            # A value such as "2023-08-31T10:15:00+03:30" leaves offset digits behind
+            # after cleaning; fall back to the first date-shaped token in that case.
+            date_token = re.search(r'\d{4}[/\-]\d{1,2}[/\-]\d{1,2}|\d{1,2}/\d{1,2}/\d{4}', cleaned)
+            if date_token and date_token.group(0) != cleaned:
+                candidates.append(date_token.group(0))
+            for candidate in candidates:
+                for fmt in formats:
+                    try:
+                        return datetime.strptime(candidate, fmt).strftime('%Y-%m-%d')
+                    except ValueError:
+                        continue
+            return cleaned
+        except Exception:
+            return None