Really-amin committed
Commit 91a893c · verified · 1 Parent(s): 70b6e55

Upload 2 files

app/enhanced_legal_scraper.py ADDED
@@ -0,0 +1,366 @@
1
+ import gradio as gr
2
+ import os
3
+ import sys
4
+ from pathlib import Path
5
+
6
+ # Add the current directory to sys.path
7
+ sys.path.insert(0, str(Path(__file__).parent))
8
+
9
+ # Import the scraper class and document model
10
+ from legal_scraper_interface import EnhancedLegalScraper, LegalDocument
11
+ import pandas as pd
12
+ import sqlite3
13
+ import json
14
+ from datetime import datetime
15
+ from typing import List, Dict, Tuple
16
+ import plotly.express as px
17
+
18
+ class LegalScraperInterface:
19
+ """Gradio interface for enhanced legal scraper"""
20
+
21
+ def __init__(self):
22
+ self.scraper = EnhancedLegalScraper(delay=1.5)
23
+ self.is_scraping = False
24
+
25
+ def scrape_websites(self, urls_text: str, max_docs: int) -> Tuple[str, str, str]:
26
+ """Scrape websites from provided URLs"""
27
+ if self.is_scraping:
28
+ return "❌ اسکراپینگ در حال انجام است", "", ""
29
+
30
+ urls = [url.strip() for url in urls_text.split('\n') if url.strip()]
31
+ if not urls:
32
+ return "❌ لطفاً URL وارد کنید", "", ""
33
+
34
+ try:
35
+ self.is_scraping = True
36
+ documents = self.scraper.scrape_real_sources(urls, max_docs)
37
+
38
+ status = f"✅ اسکراپینگ کامل شد - {len(documents)} سند جمع‌آوری شد"
39
+
40
+ summary_lines = [
41
+ f"📊 **خلاصه نتایج:**",
42
+ f"- تعداد کل اسناد: {len(documents)}",
43
+ f"- منابع پردازش شده: {len(urls)}",
44
+ f"- زمان اسکراپینگ: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
45
+ "",
46
+ "📋 **جزئیات:**"
47
+ ]
48
+
49
+ for i, doc in enumerate(documents[:5]):
50
+ summary_lines.append(f"{i+1}. {doc.title[:50]}...")
51
+
52
+ summary = "\n".join(summary_lines)
53
+
54
+ preview_lines = []
55
+ for doc in documents[:3]:
56
+ preview_lines.extend([
57
+ f"**{doc.title}**",
58
+ f"نوع: {doc.document_type}",
59
+ f"منبع: {doc.source_url}",
60
+ f"امتیاز اهمیت: {doc.importance_score:.2f}",
61
+ f"خلاصه: {doc.summary[:100]}..." if doc.summary else "بدون خلاصه",
62
+ "---"
63
+ ])
64
+
65
+ preview = "\n".join(preview_lines) if preview_lines else "هیچ سندی یافت نشد"
66
+
67
+ return status, summary, preview
68
+
69
+ except Exception as e:
70
+ error_msg = f"❌ خطا در اسکراپینگ: {str(e)}"
71
+ return error_msg, "", ""
72
+
73
+ finally:
74
+ self.is_scraping = False
75
+
76
+ def get_database_stats(self) -> Tuple[str, str]:
77
+ """Get database statistics and visualizations"""
78
+ try:
79
+ stats = self.scraper.get_enhanced_statistics()
80
+
81
+ stats_lines = [
82
+ "📊 **آمار پایگاه داده:**",
83
+ f"- کل اسناد: {stats.get('total_documents', 0)}",
84
+ "",
85
+ "📈 **بر اساس نوع:**"
86
+ ]
87
+
88
+ for doc_type, count in stats.get('by_type', {}).items():
89
+ type_name = {
90
+ 'law': 'قوانین',
91
+ 'news': 'اخبار',
92
+ 'ruling': 'آرا',
93
+ 'regulation': 'آیین‌نامه',
94
+ 'general': 'عمومی'
95
+ }.get(doc_type, doc_type)
96
+ stats_lines.append(f"- {type_name}: {count}")
97
+
98
+ stats_text = "\n".join(stats_lines)
99
+
100
+ viz_html = self._create_stats_visualization(stats)
101
+
102
+ return stats_text, viz_html
103
+
104
+ except Exception as e:
105
+ error_msg = f"خطا در دریافت آمار: {str(e)}"
106
+ return error_msg, ""
107
+
108
+ def _create_stats_visualization(self, stats: Dict) -> str:
109
+ """Create visualization for statistics"""
110
+ try:
111
+ by_type = stats.get('by_type', {})
112
+ if by_type and stats.get('total_documents', 0) > 0:
113
+ type_names = {
114
+ 'law': 'قوانین',
115
+ 'news': 'اخبار',
116
+ 'ruling': 'آرا',
117
+ 'regulation': 'آیین‌نامه',
118
+ 'general': 'عمومی'
119
+ }
120
+
121
+ labels = [type_names.get(k, k) for k in by_type.keys()]
122
+ values = list(by_type.values())
123
+
124
+ fig = px.pie(
125
+ values=values,
126
+ names=labels,
127
+ title="توزیع اسناد بر اساس نوع"
128
+ )
129
+ fig.update_traces(textposition='inside', textinfo='percent+label')
130
+
131
+ return fig.to_html()
132
+ else:
133
+ return "<p>داده‌ای برای نمایش یافت نشد</p>"
134
+
135
+ except Exception as e:
136
+ return f"<p>خطا در ایجاد نمودار: {str(e)}</p>"
137
+
138
+ def search_documents(self, query: str, search_type: str) -> str:
139
+ """Search in collected documents"""
140
+ if not query.strip():
141
+ return "لطفاً کلیدواژه‌ای برای جستجو وارد کنید"
142
+
143
+ try:
144
+ if search_type == "هوشمند":
145
+ results = self.scraper.search_with_similarity(query, limit=10)
146
+ else:
147
+ results = self.scraper._text_search(query, limit=10)
148
+
149
+ if not results:
150
+ return f"هیچ سندی با کلیدواژه '{query}' یافت نشد"
151
+
152
+ result_lines = [f"🔍 **نتایج جستجو برای '{query}':** ({len(results)} مورد یافت شد)\n"]
153
+
154
+ for i, result in enumerate(results):
155
+ result_lines.extend([
156
+ f"**{i+1}. {result['title']}**",
157
+ f" نوع: {result['document_type']}",
158
+ f" منبع: {result['source_url']}",
159
+ f" امتیاز شباهت: {result.get('similarity_score', 0):.3f}" if 'similarity_score' in result else "",
160
+ f" تاریخ: {result['date_published'] or 'نامشخص'}",
161
+ f" خلاصه: {result['summary'][:100]}..." if result.get('summary') else "",
162
+ "---"
163
+ ])
164
+
165
+ return "\n".join(result_lines)
166
+
167
+ except Exception as e:
168
+ error_msg = f"خطا در جستجو: {str(e)}"
169
+ return error_msg
170
+
171
+ def create_scraper_interface():
172
+ """Create Gradio interface for legal scraper"""
173
+
174
+ scraper_interface = LegalScraperInterface()
175
+
176
+ css = """
177
+ .gradio-container {
178
+ max-width: 1200px !important;
179
+ margin: auto;
180
+ font-family: 'Tahoma', sans-serif;
181
+ }
182
+ .header {
183
+ background: linear-gradient(135deg, #2c3e50, #3498db);
184
+ color: white;
185
+ padding: 20px;
186
+ border-radius: 10px;
187
+ text-align: center;
188
+ margin-bottom: 20px;
189
+ }
190
+ """
191
+
192
+ with gr.Blocks(css=css, title="اسکراپر پیشرفته اسناد حقوقی", theme=gr.themes.Soft()) as interface:
193
+
194
+ gr.HTML("""
195
+ <div class="header">
196
+ <h1>🤖 اسکراپر پیشرفته اسناد حقوقی</h1>
197
+ <p>سیستم هوشمند جمع‌آوری و تحلیل اسناد حقوقی با قابلیت‌های NLP</p>
198
+ </div>
199
+ """)
200
+
201
+ with gr.Tab("🕷️ اسکراپینگ"):
202
+ gr.Markdown("## جمع‌آوری اسناد از منابع حقوقی")
203
+
204
+ with gr.Row():
205
+ with gr.Column(scale=2):
206
+ urls_input = gr.Textbox(
207
+ label="📝 URL های منابع حقوقی",
208
+ placeholder="هر URL را در یک خط وارد کنید:\nhttps://rc.majlis.ir\nhttps://dolat.ir",
209
+ lines=5,
210
+ value="\n".join([
211
+ "https://rc.majlis.ir",
212
+ "https://dolat.ir",
213
+ "https://iribnews.ir"
214
+ ])
215
+ )
216
+
217
+ max_docs = gr.Slider(
218
+ label="حداکثر اسناد",
219
+ minimum=5,
220
+ maximum=50,
221
+ value=15,
222
+ step=5
223
+ )
224
+
225
+ scrape_btn = gr.Button("🚀 شروع اسکراپینگ", variant="primary")
226
+
227
+ with gr.Column(scale=1):
228
+ status_output = gr.Textbox(
229
+ label="⚡ وضعیت",
230
+ interactive=False,
231
+ lines=2
232
+ )
233
+
234
+ with gr.Row():
235
+ summary_output = gr.Textbox(
236
+ label="📊 خلاصه نتایج",
237
+ interactive=False,
238
+ lines=6
239
+ )
240
+
241
+ preview_output = gr.Textbox(
242
+ label="👁️ پیش‌نمایش اسناد",
243
+ interactive=False,
244
+ lines=6,
245
+ show_copy_button=True
246
+ )
247
+
248
+ scrape_btn.click(
249
+ fn=scraper_interface.scrape_websites,
250
+ inputs=[urls_input, max_docs],
251
+ outputs=[status_output, summary_output, preview_output]
252
+ )
253
+
254
+ with gr.Tab("🔍 جستجوی هوشمند"):
255
+ gr.Markdown("## جستجوی پیشرفته در اسناد")
256
+
257
+ with gr.Row():
258
+ search_input = gr.Textbox(
259
+ label="🔍 کلیدواژه جستجو",
260
+ placeholder="موضوع یا کلیدواژه مورد نظر را وارد کنید..."
261
+ )
262
+
263
+ search_type = gr.Dropdown(
264
+ label="نوع جستجو",
265
+ choices=["هوشمند", "متنی"],
266
+ value="هوشمند"
267
+ )
268
+
269
+ search_btn = gr.Button("🔍 جستجو", variant="primary")
270
+
271
+ search_results = gr.Textbox(
272
+ label="📋 نتایج جستجو",
273
+ interactive=False,
274
+ lines=15,
275
+ show_copy_button=True
276
+ )
277
+
278
+ search_btn.click(
279
+ fn=scraper_interface.search_documents,
280
+ inputs=[search_input, search_type],
281
+ outputs=[search_results]
282
+ )
283
+
284
+ with gr.Tab("📊 آمار و تحلیل"):
285
+ gr.Markdown("## آمار پیشرفته پایگاه داده")
286
+
287
+ stats_btn = gr.Button("📊 بروزرسانی آمار", variant="secondary")
288
+
289
+ with gr.Row():
290
+ stats_text = gr.Textbox(
291
+ label="📈 آمار متنی",
292
+ interactive=False,
293
+ lines=10
294
+ )
295
+
296
+ stats_plot = gr.HTML(
297
+ label="📊 نمودارها"
298
+ )
299
+
300
+ stats_btn.click(
301
+ fn=scraper_interface.get_database_stats,
302
+ outputs=[stats_text, stats_plot]
303
+ )
304
+
305
+ with gr.Tab("📚 راهنما"):
306
+ gr.Markdown("""
307
+ # 🤖 راهنمای اسکراپر پیشرفته
308
+
309
+ ## ویژگی‌های پیشرفته
310
+
311
+ ### 🧠 پردازش زبان طبیعی (NLP)
312
+ - استخراج خودکار کلمات کلیدی
313
+ - تولید خلاصه متن
314
+ - تحلیل احساسات
315
+ - شناسایی موجودیت‌های حقوقی
316
+ - جستجوی هوشمند بر اساس شباهت معنایی
317
+
318
+ ### 📊 تحلیل پیشرفته
319
+ - امتیازدهی اهمیت اسناد
320
+ - طبقه‌بندی خودکار
321
+ - آمار و نمودارهای تحلیلی
322
+ - گزارش‌های آماری
323
+
324
+ ## منابع پیشنهادی
325
+
326
+ - **مجلس شورای اسلامی**: https://rc.majlis.ir
327
+ - **دولت**: https://dolat.ir
328
+ - **خبرگزاری‌ها**: IRIB, IRNA, Tasnim, Mehr, Fars
329
+
330
+ ## نکات فنی
331
+
332
+ - سیستم از فایل robots.txt پیروی می‌کند
333
+ - محدودیت سرعت درخواست رعایت می‌شود
334
+ - داده‌ها در پایگاه داده SQLite ذخیره می‌شوند
335
+ - از مدل‌های هوش مصنوعی برای پردازش استفاده می‌شود
336
+
337
+ ⚠️ **تذکر**: این ابزار برای مقاصد آموزشی و پژوهشی ارائه شده است.
338
+ """)
339
+
340
+ return interface
341
+
342
+ def main():
343
+ """Main entry point for Hugging Face Spaces"""
344
+ print("🚀 راه اندازی اسکراپر پیشرفته اسناد حقوقی...")
345
+ print("📁 ایجاد دایرکتوری‌های مورد نیاز...")
346
+
347
+ # Create required directories
348
+ os.makedirs("/app/data", exist_ok=True)
349
+ os.makedirs("/app/logs", exist_ok=True)
350
+ os.makedirs("/app/cache", exist_ok=True)
351
+
352
+ # Create interface
353
+ interface = create_scraper_interface()
354
+
355
+ # Launch with Hugging Face optimized settings
356
+ interface.launch(
357
+ server_name="0.0.0.0",
358
+ server_port=7860,
359
+ share=False,
360
+ show_error=True,
361
+ debug=False,
362
+ enable_queue=True
363
+ )
364
+
365
+ if __name__ == "__main__":
366
+ main()
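
For quick local testing outside Hugging Face Spaces, a minimal launcher sketch for the interface above (the host, port, and working directory here are assumptions, not part of this commit) might look like:

    # Hypothetical local run script; create_scraper_interface comes from app/enhanced_legal_scraper.py above.
    from enhanced_legal_scraper import create_scraper_interface

    demo = create_scraper_interface()
    demo.launch(server_name="127.0.0.1", server_port=7860, share=False)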
app/legal_scraper_interface.py ADDED
@@ -0,0 +1,1190 @@
1
+ import requests
2
+ import time
3
+ import json
4
+ import csv
5
+ import sqlite3
6
+ import logging
7
+ from datetime import datetime, timedelta
8
+ from typing import Dict, List, Optional, Tuple, Union
9
+ from dataclasses import dataclass, asdict
10
+ from pathlib import Path
11
+ import re
12
+ import pandas as pd
13
+ import numpy as np
14
+ from sklearn.feature_extraction.text import TfidfVectorizer
15
+ from sklearn.metrics.pairwise import cosine_similarity
16
+ from bs4 import BeautifulSoup
17
+
18
+ try:
19
+ import torch
20
+ from transformers import AutoTokenizer, AutoModel
21
+ TORCH_AVAILABLE = True
22
+ except ImportError:
23
+ TORCH_AVAILABLE = False
24
+ print("⚠️ PyTorch not available, running without advanced NLP features")
25
+
26
+ try:
27
+ import hazm
28
+ from hazm import Normalizer, word_tokenize, sent_tokenize
29
+ HAZM_AVAILABLE = True
30
+ except ImportError:
31
+ HAZM_AVAILABLE = False
32
+ print("⚠️ Hazm not available, using basic text processing")
33
+
34
+ # Configure logging
35
+ logging.basicConfig(
36
+ level=logging.INFO,
37
+ format='%(asctime)s - %(levelname)s - %(message)s',
38
+ handlers=[
39
+ logging.FileHandler('legal_scraper.log'),
40
+ logging.StreamHandler()
41
+ ]
42
+ )
43
+ logger = logging.getLogger(__name__)
44
+
45
+ # Predefined Iranian legal and news sources
46
+ IRANIAN_LEGAL_SOURCES = [
47
+ "https://www.irna.ir", # خبرگزاری جمهوری اسلامی
48
+ "https://www.tasnimnews.com", # خبرگزاری تسنیم
49
+ "https://www.mehrnews.com", # خبرگزاری مهر
50
+ "https://www.farsnews.ir", # خبرگزاری فارس
51
+ "https://iribnews.ir", # خبرگزاری صدا و سیما
52
+ "https://www.dolat.ir", # پورتال دولت
53
+ "https://rc.majlis.ir", # مرکز پژوهش‌های مجلس
54
+ ]
55
+
56
+ @dataclass
57
+ class LegalDocument:
58
+ """Enhanced legal document with NLP features"""
59
+ title: str
60
+ content: str
61
+ source_url: str
62
+ document_type: str
63
+ date_published: Optional[str] = None
64
+ date_scraped: str = None
65
+ category: Optional[str] = None
66
+ tags: List[str] = None
67
+ summary: Optional[str] = None
68
+ importance_score: float = 0.0
69
+ sentiment_score: float = 0.0
70
+ legal_entities: List[str] = None
71
+ keywords: List[str] = None
72
+ embedding: List[float] = None
73
+ language: str = "fa"
74
+
75
+ def __post_init__(self):
76
+ if self.date_scraped is None:
77
+ self.date_scraped = datetime.now().isoformat()
78
+ if self.tags is None:
79
+ self.tags = []
80
+ if self.legal_entities is None:
81
+ self.legal_entities = []
82
+ if self.keywords is None:
83
+ self.keywords = []
84
+
85
+ class PersianNLPProcessor:
86
+ """Persian NLP processor using available models"""
87
+
88
+ def __init__(self):
89
+ if HAZM_AVAILABLE:
90
+ self.normalizer = Normalizer()
91
+ else:
92
+ self.normalizer = None
93
+
94
+ self.device = torch.device('cpu') if TORCH_AVAILABLE else None
95
+
96
+ self.tokenizer = None
97
+ self.model = None
98
+
99
+ if TORCH_AVAILABLE:
100
+ try:
101
+ model_names = [
102
+ "HooshvareLab/bert-fa-base-uncased",
103
+ "HooshvareLab/bert-base-parsbert-uncased",
104
+ "distilbert-base-multilingual-cased"
105
+ ]
106
+
107
+ for model_name in model_names:
108
+ try:
109
+ self.tokenizer = AutoTokenizer.from_pretrained(model_name)
110
+ self.model = AutoModel.from_pretrained(model_name)
111
+ self.model.to(self.device)
112
+ logger.info(f"✅ Loaded model: {model_name}")
113
+ break
114
+ except Exception as e:
115
+ logger.warning(f"⚠️ Failed to load {model_name}: {e}")
116
+ continue
117
+ except Exception as e:
118
+ logger.error(f"❌ Failed to load any Persian BERT model: {e}")
119
+
120
+ self.legal_categories = {
121
+ 'قانون': ['قانون', 'ماده', 'بند', 'فصل', 'تبصره', 'اصلاحیه'],
122
+ 'رای': ['رای', 'حکم', 'دادگاه', 'قاضی', 'محکوم', 'دادرسی'],
123
+ 'آیین‌نامه': ['آیین‌نامه', 'دستورالعمل', 'بخشنامه', 'مقررات'],
124
+ 'اخبار': ['خبر', 'گزارش', 'اعلام', 'اطلاعیه', 'بیانیه'],
125
+ 'نظریه': ['نظریه', 'تفسیر', 'استعلام', 'پاسخ', 'رأی']
126
+ }
127
+
128
+ self.tfidf = None
129
+ self._init_tfidf()
130
+
131
+ def _init_tfidf(self):
132
+ """Initialize TF-IDF vectorizer"""
133
+ try:
134
+ self.tfidf = TfidfVectorizer(
135
+ max_features=1000,
136
+ stop_words=self._get_persian_stopwords(),
137
+ ngram_range=(1, 2),
138
+ min_df=1,
139
+ max_df=1.0  # a lower max_df would prune every term when fitting on a single document
140
+ )
141
+ except Exception as e:
142
+ logger.error(f"TF-IDF initialization failed: {e}")
143
+
144
+ def _get_persian_stopwords(self) -> List[str]:
145
+ """Get Persian stopwords"""
146
+ return [
147
+ 'در', 'به', 'از', 'که', 'این', 'آن', 'با', 'را', 'و', 'است',
148
+ 'برای', 'تا', 'کرد', 'شد', 'می', 'خود', 'هم', 'نیز', 'یا', 'اما',
149
+ 'اگر', 'چون', 'پس', 'بعد', 'قبل', 'روی', 'زیر', 'کنار', 'داخل',
150
+ 'نیست', 'بود', 'باشد', 'کند', 'کنند', 'شود', 'گردد', 'دارد', 'دارند'
151
+ ]
152
+
153
+ def normalize_text(self, text: str) -> str:
154
+ """Normalize Persian text"""
155
+ if not text:
156
+ return ""
157
+
158
+ try:
159
+ text = re.sub(r'[^\w\s\u0600-\u06FF]', ' ', text)
160
+ text = re.sub(r'\s+', ' ', text)
161
+
162
+ if self.normalizer:
163
+ text = self.normalizer.normalize(text)
164
+
165
+ return text.strip()
166
+ except Exception as e:
167
+ logger.error(f"Text normalization failed: {e}")
168
+ return text.strip()
169
+
170
+ def extract_keywords(self, text: str, top_k: int = 10) -> List[str]:
171
+ """Extract keywords using TF-IDF"""
172
+ try:
173
+ if not self.tfidf or not text:
174
+ return []
175
+
176
+ normalized_text = self.normalize_text(text)
177
+
178
+ if HAZM_AVAILABLE:
179
+ tokens = word_tokenize(normalized_text)
180
+ processed_text = ' '.join(tokens)
181
+ else:
182
+ processed_text = normalized_text
183
+
184
+ tfidf_matrix = self.tfidf.fit_transform([processed_text])
185
+ feature_names = self.tfidf.get_feature_names_out()
186
+ scores = tfidf_matrix.toarray()[0]
187
+
188
+ keyword_scores = list(zip(feature_names, scores))
189
+ keyword_scores.sort(key=lambda x: x[1], reverse=True)
190
+
191
+ return [kw[0] for kw in keyword_scores[:top_k] if kw[1] > 0]
192
+
193
+ except Exception as e:
194
+ logger.error(f"Keyword extraction failed: {e}")
195
+ return []
196
+
197
+ def classify_document(self, text: str) -> Tuple[str, float]:
198
+ """Classify document type with confidence score"""
199
+ try:
200
+ normalized_text = self.normalize_text(text.lower())
201
+
202
+ scores = {}
203
+ for category, keywords in self.legal_categories.items():
204
+ score = 0
205
+ for keyword in keywords:
206
+ count = normalized_text.count(keyword)
207
+ score += count * (len(keyword) / 5)
208
+
209
+ if len(normalized_text) > 0:
210
+ scores[category] = score / (len(normalized_text) / 1000)
211
+ else:
212
+ scores[category] = 0
213
+
214
+ if not scores or max(scores.values()) == 0:
215
+ return "عمومی", 0.0
216
+
217
+ best_category = max(scores.items(), key=lambda x: x[1])
218
+ total_score = sum(scores.values())
219
+ confidence = min(best_category[1] / total_score, 1.0) if total_score > 0 else 0.0
220
+
221
+ return best_category[0], confidence
222
+
223
+ except Exception as e:
224
+ logger.error(f"Document classification failed: {e}")
225
+ return "عمومی", 0.0
226
+
227
+ def calculate_importance_score(self, doc: LegalDocument) -> float:
228
+ """Calculate document importance score"""
229
+ try:
230
+ score = 0.0
231
+
232
+ title_lower = doc.title.lower()
233
+ high_importance_words = ['قانون', 'اساسی', 'حکم', 'رای', 'مصوبه']
234
+ medium_importance_words = ['آیین‌نامه', 'بخشنامه', 'دستورالعمل']
235
+
236
+ for word in high_importance_words:
237
+ if word in title_lower:
238
+ score += 0.3
239
+ break
240
+
241
+ for word in medium_importance_words:
242
+ if word in title_lower:
243
+ score += 0.2
244
+ break
245
+
246
+ content_length = len(doc.content)
247
+ if content_length > 5000:
248
+ score += 0.25
249
+ elif content_length > 2000:
250
+ score += 0.15
251
+ elif content_length > 500:
252
+ score += 0.1
253
+
254
+ if doc.date_published:
255
+ try:
256
+ date_formats = ['%Y-%m-%d', '%Y/%m/%d', '%d/%m/%Y']
257
+ pub_date = None
258
+
259
+ for fmt in date_formats:
260
+ try:
261
+ pub_date = datetime.strptime(doc.date_published, fmt)
262
+ break
263
+ except:
264
+ continue
265
+
266
+ if pub_date:
267
+ days_old = (datetime.now() - pub_date).days
268
+ if days_old < 30:
269
+ score += 0.25
270
+ elif days_old < 365:
271
+ score += 0.15
272
+ elif days_old < 1825:
273
+ score += 0.05
274
+ except:
275
+ pass
276
+
277
+ legal_keywords = ['قانون', 'ماده', 'بند', 'حکم', 'رای', 'دادگاه', 'محکمه']
278
+ content_lower = doc.content.lower()
279
+ keyword_count = sum(content_lower.count(kw) for kw in legal_keywords)
280
+ word_count = len(doc.content.split())
281
+
282
+ if word_count > 0:
283
+ keyword_density = keyword_count / word_count
284
+ score += min(keyword_density * 5, 0.2)
285
+
286
+ type_bonuses = {
287
+ 'law': 0.2,
288
+ 'ruling': 0.15,
289
+ 'regulation': 0.1,
290
+ 'news': 0.05
291
+ }
292
+ score += type_bonuses.get(doc.document_type, 0)
293
+
294
+ return min(score, 1.0)
295
+
296
+ except Exception as e:
297
+ logger.error(f"Importance score calculation failed: {e}")
298
+ return 0.0
299
+
300
+ def extract_legal_entities(self, text: str) -> List[str]:
301
+ """Extract legal entities from text"""
302
+ try:
303
+ entities = []
304
+
305
+ patterns = {
306
+ 'قوانین': r'قانون\s+[\u0600-\u06FF\s]{3,30}',
307
+ 'مواد': r'ماده\s+\d+[\u0600-\u06FF\s]*',
308
+ 'دادگاه‌ها': r'دادگاه\s+[\u0600-\u06FF\s]{3,30}',
309
+ 'مراجع': r'(?:وزارت|سازمان|اداره|شورای|کمیته)\s+[\u0600-\u06FF\s]{3,30}',
310
+ 'احکام': r'(?:حکم|رای)\s+(?:شماره\s+)?\d+',
311
+ }
312
+
313
+ for entity_type, pattern in patterns.items():
314
+ matches = re.findall(pattern, text)
315
+ for match in matches:
316
+ clean_match = re.sub(r'\s+', ' ', match.strip())
317
+ if len(clean_match) > 5 and len(clean_match) < 100:
318
+ entities.append(clean_match)
319
+
320
+ unique_entities = list(dict.fromkeys(entities))
321
+ return unique_entities[:15]
322
+
323
+ except Exception as e:
324
+ logger.error(f"Entity extraction failed: {e}")
325
+ return []
326
+
327
+ def get_text_embedding(self, text: str) -> Optional[List[float]]:
328
+ """Get text embedding using available model"""
329
+ if not self.model or not self.tokenizer or not TORCH_AVAILABLE:
330
+ return None
331
+
332
+ try:
333
+ normalized_text = self.normalize_text(text)
334
+ if len(normalized_text) > 512:
335
+ normalized_text = normalized_text[:512]
336
+
337
+ if not normalized_text:
338
+ return None
339
+
340
+ inputs = self.tokenizer(
341
+ normalized_text,
342
+ return_tensors="pt",
343
+ padding=True,
344
+ truncation=True,
345
+ max_length=512
346
+ ).to(self.device)
347
+
348
+ with torch.no_grad():
349
+ outputs = self.model(**inputs)
350
+ embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()[0]
351
+
352
+ return embedding.tolist()
353
+
354
+ except Exception as e:
355
+ logger.error(f"Embedding generation failed: {e}")
356
+ return None
357
+
358
+ def generate_summary(self, text: str, max_length: int = 200) -> str:
359
+ """Generate text summary"""
360
+ try:
361
+ if len(text) <= max_length:
362
+ return text
363
+
364
+ if HAZM_AVAILABLE:
365
+ sentences = sent_tokenize(text)
366
+ else:
367
+ sentences = re.split(r'[.!?]+', text)
368
+ sentences = [s.strip() for s in sentences if s.strip()]
369
+
370
+ if len(sentences) <= 2:
371
+ return text[:max_length] + "..." if len(text) > max_length else text
372
+
373
+ keywords = self.extract_keywords(text, top_k=15)
374
+
375
+ sentence_scores = []
376
+ for sentence in sentences:
377
+ if len(sentence) < 20:
378
+ continue
379
+
380
+ score = 0
381
+ sentence_lower = sentence.lower()
382
+
383
+ for kw in keywords:
384
+ if kw in sentence_lower:
385
+ score += 1
386
+
387
+ legal_terms = ['قانون', 'ماده', 'حکم', 'رای', 'دادگاه']
388
+ for term in legal_terms:
389
+ if term in sentence_lower:
390
+ score += 0.5
391
+
392
+ if len(sentence) > 200:
393
+ score *= 0.8
394
+
395
+ sentence_scores.append((sentence, score))
396
+
397
+ sentence_scores.sort(key=lambda x: x[1], reverse=True)
398
+
399
+ selected_sentences = []
400
+ current_length = 0
401
+
402
+ for sentence, score in sentence_scores:
403
+ if current_length + len(sentence) <= max_length:
404
+ selected_sentences.append(sentence)
405
+ current_length += len(sentence)
406
+ else:
407
+ break
408
+
409
+ if not selected_sentences:
410
+ return text[:max_length] + "..."
411
+
412
+ summary = ' '.join(selected_sentences)
413
+ return summary if len(summary) <= max_length else summary[:max_length] + "..."
414
+
415
+ except Exception as e:
416
+ logger.error(f"Summary generation failed: {e}")
417
+ return text[:max_length] + "..." if len(text) > max_length else text
418
+
419
+ def process_document(self, doc: LegalDocument) -> LegalDocument:
420
+ """Process document with all available NLP features"""
421
+ try:
422
+ logger.info(f"Processing document: {doc.title[:50]}...")
423
+
424
+ doc.keywords = self.extract_keywords(doc.content)
425
+
426
+ doc_type, confidence = self.classify_document(doc.content)
427
+ if confidence > 0.3:
428
+ doc.category = doc_type
429
+
430
+ doc.importance_score = self.calculate_importance_score(doc)
431
+
432
+ doc.legal_entities = self.extract_legal_entities(doc.content)
433
+
434
+ doc.summary = self.generate_summary(doc.content)
435
+
436
+ doc.embedding = self.get_text_embedding(doc.content)
437
+
438
+ logger.info(f"✅ Processed: {doc.title[:30]}... (Score: {doc.importance_score:.2f})")
439
+
440
+ return doc
441
+
442
+ except Exception as e:
443
+ logger.error(f"Document processing failed: {e}")
444
+ return doc
445
+
446
+ class EnhancedLegalScraper:
447
+ """Enhanced legal scraper with real web scraping and NLP"""
448
+
449
+ def __init__(self, delay: float = 1.0):
450
+ self.delay = delay
451
+ self.session = requests.Session()
452
+
453
+ try:
454
+ self.nlp_processor = PersianNLPProcessor()
455
+ logger.info("✅ NLP processor initialized")
456
+ except Exception as e:
457
+ logger.error(f"❌ NLP processor initialization failed: {e}")
458
+ self.nlp_processor = None
459
+
460
+ self.db_path = self._get_db_path()
461
+
462
+ self.session.headers.update({
463
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
464
+ 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
465
+ 'Accept-Language': 'fa,en-US;q=0.7,en;q=0.3',
466
+ 'Accept-Encoding': 'gzip, deflate',
467
+ 'Connection': 'keep-alive',
468
+ 'Upgrade-Insecure-Requests': '1',
469
+ })
470
+
471
+ self._init_database()
472
+
473
+ def _get_db_path(self) -> str:
474
+ """Get appropriate database path for the environment"""
475
+ possible_paths = [
476
+ "/tmp/legal_scraper.db",
477
+ "./data/legal_scraper.db",
478
+ "legal_scraper.db"
479
+ ]
480
+
481
+ for path in possible_paths:
482
+ try:
483
+ Path(path).parent.mkdir(parents=True, exist_ok=True)
484
+ return path
485
+ except:
486
+ continue
487
+
488
+ return ":memory:"
489
+
490
+ def _init_database(self):
491
+ """Initialize enhanced database with NLP fields"""
492
+ try:
493
+ conn = sqlite3.connect(self.db_path)
494
+ cursor = conn.cursor()
495
+
496
+ cursor.execute('''
497
+ CREATE TABLE IF NOT EXISTS legal_documents (
498
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
499
+ title TEXT NOT NULL,
500
+ content TEXT NOT NULL,
501
+ source_url TEXT UNIQUE NOT NULL,
502
+ document_type TEXT NOT NULL,
503
+ date_published TEXT,
504
+ date_scraped TEXT NOT NULL,
505
+ category TEXT,
506
+ tags TEXT,
507
+ summary TEXT,
508
+ importance_score REAL DEFAULT 0.0,
509
+ sentiment_score REAL DEFAULT 0.0,
510
+ legal_entities TEXT,
511
+ keywords TEXT,
512
+ embedding TEXT,
513
+ language TEXT DEFAULT 'fa',
514
+ created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
515
+ )
516
+ ''')
517
+
518
+ indexes = [
519
+ 'CREATE INDEX IF NOT EXISTS idx_source_url ON legal_documents(source_url)',
520
+ 'CREATE INDEX IF NOT EXISTS idx_document_type ON legal_documents(document_type)',
521
+ 'CREATE INDEX IF NOT EXISTS idx_importance_score ON legal_documents(importance_score DESC)',
522
+ 'CREATE INDEX IF NOT EXISTS idx_category ON legal_documents(category)',
523
+ 'CREATE INDEX IF NOT EXISTS idx_date_published ON legal_documents(date_published)',
524
+ 'CREATE INDEX IF NOT EXISTS idx_date_scraped ON legal_documents(date_scraped DESC)'
525
+ ]
526
+
527
+ for index in indexes:
528
+ cursor.execute(index)
529
+
530
+ conn.commit()
531
+ conn.close()
532
+ logger.info(f"✅ Database initialized: {self.db_path}")
533
+
534
+ except Exception as e:
535
+ logger.error(f"❌ Database initialization failed: {e}")
536
+ raise
537
+
538
+ def save_document(self, doc: LegalDocument) -> bool:
539
+ """Save enhanced document to database"""
540
+ try:
541
+ conn = sqlite3.connect(self.db_path)
542
+ cursor = conn.cursor()
543
+
544
+ cursor.execute('''
545
+ INSERT OR REPLACE INTO legal_documents
546
+ (title, content, source_url, document_type, date_published,
547
+ date_scraped, category, tags, summary, importance_score,
548
+ sentiment_score, legal_entities, keywords, embedding, language)
549
+ VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
550
+ ''', (
551
+ doc.title,
552
+ doc.content,
553
+ doc.source_url,
554
+ doc.document_type,
555
+ doc.date_published,
556
+ doc.date_scraped,
557
+ doc.category,
558
+ json.dumps(doc.tags, ensure_ascii=False) if doc.tags else None,
559
+ doc.summary,
560
+ doc.importance_score,
561
+ doc.sentiment_score,
562
+ json.dumps(doc.legal_entities, ensure_ascii=False) if doc.legal_entities else None,
563
+ json.dumps(doc.keywords, ensure_ascii=False) if doc.keywords else None,
564
+ json.dumps(doc.embedding) if doc.embedding else None,
565
+ doc.language
566
+ ))
567
+
568
+ conn.commit()
569
+ conn.close()
570
+ return True
571
+
572
+ except Exception as e:
573
+ logger.error(f"Failed to save document {doc.source_url}: {e}")
574
+ return False
575
+
576
+ def get_enhanced_statistics(self) -> Dict:
577
+ """Get comprehensive statistics with NLP insights"""
578
+ try:
579
+ conn = sqlite3.connect(self.db_path)
580
+ cursor = conn.cursor()
581
+
582
+ stats = {}
583
+
584
+ cursor.execute('SELECT COUNT(*) FROM legal_documents')
585
+ stats['total_documents'] = cursor.fetchone()[0]
586
+
587
+ cursor.execute('SELECT document_type, COUNT(*) FROM legal_documents GROUP BY document_type')
588
+ stats['by_type'] = dict(cursor.fetchall())
589
+
590
+ cursor.execute('SELECT category, COUNT(*) FROM legal_documents WHERE category IS NOT NULL GROUP BY category')
591
+ stats['by_category'] = dict(cursor.fetchall())
592
+
593
+ cursor.execute('SELECT COUNT(*) FROM legal_documents WHERE importance_score >= 0.7')
594
+ high_importance = cursor.fetchone()[0]
595
+ cursor.execute('SELECT COUNT(*) FROM legal_documents WHERE importance_score >= 0.3 AND importance_score < 0.7')
596
+ medium_importance = cursor.fetchone()[0]
597
+ cursor.execute('SELECT COUNT(*) FROM legal_documents WHERE importance_score < 0.3')
598
+ low_importance = cursor.fetchone()[0]
599
+
600
+ stats['importance_distribution'] = {
601
+ 'high': high_importance,
602
+ 'medium': medium_importance,
603
+ 'low': low_importance
604
+ }
605
+
606
+ cursor.execute('SELECT keywords FROM legal_documents WHERE keywords IS NOT NULL')
607
+ all_keywords = []
608
+ for row in cursor.fetchall():
609
+ try:
610
+ keywords = json.loads(row[0])
611
+ all_keywords.extend(keywords)
612
+ except:
613
+ continue
614
+
615
+ if all_keywords:
616
+ keyword_counts = {}
617
+ for kw in all_keywords:
618
+ keyword_counts[kw] = keyword_counts.get(kw, 0) + 1
619
+
620
+
621
+ top_keywords = sorted(keyword_counts.items(), key=lambda x: x[1], reverse=True)[:25]
622
+ stats['top_keywords'] = dict(top_keywords)
623
+
624
+ cursor.execute('''
625
+ SELECT DATE(date_scraped) as day, COUNT(*)
626
+ FROM legal_documents
627
+ WHERE date_scraped >= date('now', '-7 days')
628
+ GROUP BY DATE(date_scraped)
629
+ ORDER BY day DESC
630
+ ''')
631
+ stats['recent_activity'] = dict(cursor.fetchall())
632
+
633
+ cursor.execute('''
634
+ SELECT document_type, AVG(importance_score)
635
+ FROM legal_documents
636
+ GROUP BY document_type
637
+ ''')
638
+ stats['avg_importance_by_type'] = dict(cursor.fetchall())
639
+
640
+ cursor.execute('SELECT COUNT(*) FROM legal_documents WHERE embedding IS NOT NULL')
641
+ stats['documents_with_embeddings'] = cursor.fetchone()[0]
642
+
643
+ cursor.execute('SELECT language, COUNT(*) FROM legal_documents GROUP BY language')
644
+ stats['by_language'] = dict(cursor.fetchall())
645
+
646
+ conn.close()
647
+ return stats
648
+
649
+ except Exception as e:
650
+ logger.error(f"Statistics generation failed: {e}")
651
+ return {
652
+ 'total_documents': 0,
653
+ 'by_type': {},
654
+ 'by_category': {},
655
+ 'importance_distribution': {'high': 0, 'medium': 0, 'low': 0},
656
+ 'top_keywords': {},
657
+ 'recent_activity': {},
658
+ 'avg_importance_by_type': {},
659
+ 'documents_with_embeddings': 0,
660
+ 'by_language': {}
661
+ }
662
+
663
+ def search_with_similarity(self, query: str, limit: int = 20) -> List[Dict]:
664
+ """Advanced search using embeddings and similarity"""
665
+ if not self.nlp_processor or not self.nlp_processor.model:
666
+ return self._text_search(query, limit)
667
+
668
+ try:
669
+ query_embedding = self.nlp_processor.get_text_embedding(query)
670
+ if not query_embedding:
671
+ return self._text_search(query, limit)
672
+
673
+ conn = sqlite3.connect(self.db_path)
674
+ cursor = conn.cursor()
675
+
676
+ cursor.execute('''
677
+ SELECT id, title, content, source_url, document_type,
678
+ importance_score, summary, embedding
679
+ FROM legal_documents
680
+ WHERE embedding IS NOT NULL
681
+ ''')
682
+
683
+ results = []
684
+ query_vector = np.array(query_embedding)
685
+
686
+ for row in cursor.fetchall():
687
+ try:
688
+ doc_embedding = json.loads(row[7])
689
+ doc_vector = np.array(doc_embedding)
690
+
691
+ similarity = cosine_similarity([query_vector], [doc_vector])[0][0]
692
+
693
+ combined_score = (similarity * 0.7) + (row[5] * 0.3)
694
+
695
+ results.append({
696
+ 'id': row[0],
697
+ 'title': row[1],
698
+ 'content': row[2][:500] + "..." if len(row[2]) > 500 else row[2],
699
+ 'source_url': row[3],
700
+ 'document_type': row[4],
701
+ 'importance_score': row[5],
702
+ 'summary': row[6],
703
+ 'similarity_score': similarity,
704
+ 'combined_score': combined_score
705
+ })
706
+
707
+ except Exception as e:
708
+ logger.error(f"Error processing document embedding: {e}")
709
+ continue
710
+
711
+ results.sort(key=lambda x: x['combined_score'], reverse=True)
712
+ conn.close()
713
+
714
+ return results[:limit]
715
+
716
+ except Exception as e:
717
+ logger.error(f"Similarity search failed: {e}")
718
+ return self._text_search(query, limit)
719
+
720
+ def _text_search(self, query: str, limit: int = 20) -> List[Dict]:
721
+ """Fallback text search"""
722
+ try:
723
+ conn = sqlite3.connect(self.db_path)
724
+ cursor = conn.cursor()
725
+
726
+ if self.nlp_processor:
727
+ normalized_query = self.nlp_processor.normalize_text(query)
728
+ else:
729
+ normalized_query = query
730
+
731
+ query_words = normalized_query.split()
732
+
733
+ search_conditions = []
734
+ params = []
735
+
736
+ for word in query_words:
737
+ search_conditions.append("(title LIKE ? OR content LIKE ?)")
738
+ params.extend([f'%{word}%', f'%{word}%'])
739
+
740
+ where_clause = " OR ".join(search_conditions)
741
+
742
+ cursor.execute(f'''
743
+ SELECT id, title, content, source_url, document_type,
744
+ importance_score, summary
745
+ FROM legal_documents
746
+ WHERE {where_clause}
747
+ ORDER BY importance_score DESC
748
+ LIMIT ?
749
+ ''', params + [limit])
750
+
751
+ results = []
752
+ for row in cursor.fetchall():
753
+ results.append({
754
+ 'id': row[0],
755
+ 'title': row[1],
756
+ 'content': row[2][:500] + "..." if len(row[2]) > 500 else row[2],
757
+ 'source_url': row[3],
758
+ 'document_type': row[4],
759
+ 'importance_score': row[5],
760
+ 'summary': row[6],
761
+ 'similarity_score': 0.0
762
+ })
763
+
764
+ conn.close()
765
+ return results
766
+
767
+ except Exception as e:
768
+ logger.error(f"Text search failed: {e}")
769
+ return []
770
+
771
+ def export_to_csv(self, filename: str = None) -> str:
772
+ """Export data to CSV with full details"""
773
+ try:
774
+ if not filename:
775
+ timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
776
+ filename = f"legal_documents_{timestamp}.csv"
777
+
778
+ conn = sqlite3.connect(self.db_path)
779
+
780
+ query = '''
781
+ SELECT title, content, source_url, document_type,
782
+ date_published, date_scraped, category, summary,
783
+ importance_score, keywords, legal_entities
784
+ FROM legal_documents
785
+ ORDER BY importance_score DESC, date_scraped DESC
786
+ '''
787
+
788
+ df = pd.read_sql_query(query, conn)
789
+ conn.close()
790
+
791
+ for col in ['keywords', 'legal_entities']:
792
+ if col in df.columns:
793
+ df[col] = df[col].apply(lambda x: ', '.join(json.loads(x)) if x else '')
794
+
795
+ df.to_csv(filename, index=False, encoding='utf-8-sig')
796
+ logger.info(f"✅ Data exported to CSV: {filename}")
797
+
798
+ return filename
799
+
800
+ except Exception as e:
801
+ logger.error(f"CSV export failed: {e}")
802
+ return ""
803
+
804
+ def scrape_real_sources(self, urls: List[str] = IRANIAN_LEGAL_SOURCES, max_docs: int = 20) -> List[LegalDocument]:
805
+ """Real web scraping implementation with source-specific extraction"""
806
+ documents = []
807
+
808
+ for i, url in enumerate(urls):
809
+ if len(documents) >= max_docs:
810
+ break
811
+
812
+ try:
813
+ logger.info(f"🔄 Scraping {i+1}/{len(urls)}: {url}")
814
+ time.sleep(self.delay)
815
+
816
+ response = self.session.get(url, timeout=15)
817
+ response.raise_for_status()
818
+
819
+ if response.encoding == 'ISO-8859-1':
820
+ response.encoding = response.apparent_encoding
821
+
822
+ soup = BeautifulSoup(response.content, 'html.parser')
823
+
824
+ # Extract documents using source-specific logic
825
+ extracted_items = self._extract_source_specific_content(soup, url, max_docs - len(documents))
826
+
827
+ for item in extracted_items:
828
+ if len(documents) >= max_docs:
829
+ break
830
+
831
+ doc = LegalDocument(
832
+ title=item['title'],
833
+ content=item['content'],
834
+ source_url=item['url'],
835
+ document_type=self._determine_document_type(item['title'], item['content']),
836
+ date_published=item['date']
837
+ )
838
+
839
+ if self.nlp_processor:
840
+ doc = self.nlp_processor.process_document(doc)
841
+
842
+ documents.append(doc)
843
+ logger.info(f"✅ Extracted: {doc.title[:50]}...")
844
+
845
+ except Exception as e:
846
+ logger.error(f"❌ Error scraping {url}: {e}")
847
+ continue
848
+
849
+ documents.sort(key=lambda x: x.importance_score, reverse=True)
850
+ return documents
851
+
852
+ def _extract_source_specific_content(self, soup: BeautifulSoup, url: str, max_items: int) -> List[Dict]:
853
+ """Extract content based on source-specific selectors"""
854
+ if 'irna.ir' in url:
855
+ return self._extract_irna_content(soup, url, max_items)
856
+ elif 'tasnimnews.com' in url:
857
+ return self._extract_tasnim_content(soup, url, max_items)
858
+ elif 'mehrnews.com' in url:
859
+ return self._extract_mehr_content(soup, url, max_items)
860
+ elif 'farsnews.ir' in url:
861
+ return self._extract_fars_content(soup, url, max_items)
862
+ else:
863
+ return self._extract_generic_content(soup, url, max_items)
864
+
865
+ def _extract_irna_content(self, soup: BeautifulSoup, base_url: str, max_items: int) -> List[Dict]:
866
+ """Extract content from IRNA"""
867
+ items = []
868
+ try:
869
+ articles = soup.select('.news-item, .article, .story')[:max_items]
870
+
871
+ for article in articles:
872
+ title_elem = article.select_one('h1, h2, h3, .title, .headline, a')
873
+ if title_elem:
874
+ title = title_elem.get_text(strip=True)
875
+ content = article.get_text(strip=True)
876
+
877
+ if len(title) > 10 and len(content) > 100:
878
+ items.append({
879
+ 'title': title,
880
+ 'content': content,
881
+ 'url': base_url,
882
+ 'date': self._extract_date(soup)
883
+ })
884
+
885
+ if not items:
886
+ main_content = soup.select_one('main, .main-content, .content, article')
887
+ if main_content:
888
+ title = soup.select_one('h1, title')
889
+ title_text = title.get_text(strip=True) if title else "خبر ایرنا"
890
+ content_text = main_content.get_text(strip=True)
891
+
892
+ if len(content_text) > 200:
893
+ items.append({
894
+ 'title': title_text,
895
+ 'content': content_text,
896
+ 'url': base_url,
897
+ 'date': self._extract_date(soup)
898
+ })
899
+
900
+ except Exception as e:
901
+ logger.error(f"IRNA extraction error: {e}")
902
+
903
+ return items
904
+
905
+ def _extract_tasnim_content(self, soup: BeautifulSoup, base_url: str, max_items: int) -> List[Dict]:
906
+ """Extract content from Tasnim"""
907
+ items = []
908
+ try:
909
+ articles = soup.select('.news-box, .item, .story-item')[:max_items]
910
+
911
+ for article in articles:
912
+ title_elem = article.select_one('h2, h3, .title, a')
913
+ if title_elem:
914
+ title = title_elem.get_text(strip=True)
915
+ content = article.get_text(strip=True)
916
+
917
+ if len(title) > 10 and len(content) > 100:
918
+ items.append({
919
+ 'title': title,
920
+ 'content': content,
921
+ 'url': base_url,
922
+ 'date': self._extract_date(soup)
923
+ })
924
+
925
+ if not items:
926
+ main_content = soup.select_one('.news-content, .story-body, main')
927
+ if main_content:
928
+ title = soup.select_one('h1, .news-title')
929
+ title_text = title.get_text(strip=True) if title else "خبر تسنیم"
930
+ content_text = main_content.get_text(strip=True)
931
+
932
+ if len(content_text) > 200:
933
+ items.append({
934
+ 'title': title_text,
935
+ 'content': content_text,
936
+ 'url': base_url,
937
+ 'date': self._extract_date(soup)
938
+ })
939
+
940
+ except Exception as e:
941
+ logger.error(f"Tasnim extraction error: {e}")
942
+
943
+ return items
944
+
945
+ def _extract_mehr_content(self, soup: BeautifulSoup, base_url: str, max_items: int) -> List[Dict]:
946
+ """Extract content from Mehr News"""
947
+ items = []
948
+ try:
949
+ articles = soup.select('.news-item, .article-item, .story')[:max_items]
950
+
951
+ for article in articles:
952
+ title_elem = article.select_one('h2, h3, .title, .headline')
953
+ if title_elem:
954
+ title = title_elem.get_text(strip=True)
955
+ content = article.get_text(strip=True)
956
+
957
+ if len(title) > 10 and len(content) > 100:
958
+ items.append({
959
+ 'title': title,
960
+ 'content': content,
961
+ 'url': base_url,
962
+ 'date': self._extract_date(soup)
963
+ })
964
+
965
+ if not items:
966
+ main_content = soup.select_one('.content, .news-body, article')
967
+ if main_content:
968
+ title = soup.select_one('h1, .page-title')
969
+ title_text = title.get_text(strip=True) if title else "خبر مهر"
970
+ content_text = main_content.get_text(strip=True)
971
+
972
+ if len(content_text) > 200:
973
+ items.append({
974
+ 'title': title_text,
975
+ 'content': content_text,
976
+ 'url': base_url,
977
+ 'date': self._extract_date(soup)
978
+ })
979
+
980
+ except Exception as e:
981
+ logger.error(f"Mehr extraction error: {e}")
982
+
983
+ return items
984
+
985
+ def _extract_fars_content(self, soup: BeautifulSoup, base_url: str, max_items: int) -> List[Dict]:
986
+ """Extract content from Fars News"""
987
+ items = []
988
+ try:
989
+ articles = soup.select('.news, .item, .story-item')[:max_items]
990
+
991
+ for article in articles:
992
+ title_elem = article.select_one('h2, h3, .title, a')
993
+ if title_elem:
994
+ title = title_elem.get_text(strip=True)
995
+ content = article.get_text(strip=True)
996
+
997
+ if len(title) > 10 and len(content) > 100:
998
+ items.append({
999
+ 'title': title,
1000
+ 'content': content,
1001
+ 'url': base_url,
1002
+ 'date': self._extract_date(soup)
1003
+ })
1004
+
1005
+ if not items:
1006
+ main_content = soup.select_one('.news-content, .story, main')
1007
+ if main_content:
1008
+ title = soup.select_one('h1, .news-title')
1009
+ title_text = title.get_text(strip=True) if title else "خبر فارس"
1010
+ content_text = main_content.get_text(strip=True)
1011
+
1012
+ if len(content_text) > 200:
1013
+ items.append({
1014
+ 'title': title_text,
1015
+ 'content': content_text,
1016
+ 'url': base_url,
1017
+ 'date': self._extract_date(soup)
1018
+ })
1019
+
1020
+ except Exception as e:
1021
+ logger.error(f"Fars extraction error: {e}")
1022
+
1023
+ return items
1024
+
1025
+ def _extract_generic_content(self, soup: BeautifulSoup, base_url: str, max_items: int) -> List[Dict]:
1026
+ """Generic content extraction for unknown sources"""
1027
+ items = []
1028
+ try:
1029
+ articles = soup.select('article, .article, .post, .news-item, .story')[:max_items]
1030
+
1031
+ for article in articles:
1032
+ title_elem = article.select_one('h1, h2, h3, .title, .headline')
1033
+ if title_elem:
1034
+ title = title_elem.get_text(strip=True)
1035
+ content = article.get_text(strip=True)
1036
+
1037
+ if len(title) > 10 and len(content) > 150:
1038
+ items.append({
1039
+ 'title': title,
1040
+ 'content': content,
1041
+ 'url': base_url,
1042
+ 'date': self._extract_date(soup)
1043
+ })
1044
+
1045
+ if not items:
1046
+ title_elem = soup.select_one('h1, title')
1047
+ content_elem = soup.select_one('main, .main-content, .content, .entry-content, body')
1048
+
1049
+ if title_elem and content_elem:
1050
+ for unwanted in content_elem(['script', 'style', 'nav', 'header', 'footer']):
1051
+ unwanted.decompose()
1052
+
1053
+ title = title_elem.get_text(strip=True)
1054
+ content = content_elem.get_text(strip=True)
1055
+
1056
+ if len(title) > 5 and len(content) > 200:
1057
+ items.append({
1058
+ 'title': title,
1059
+ 'content': content,
1060
+ 'url': base_url,
1061
+ 'date': self._extract_date(soup)
1062
+ })
1063
+
1064
+ except Exception as e:
1065
+ logger.error(f"Generic extraction error: {e}")
1066
+
1067
+ return items
1068
+
1069
+ def _extract_document_from_soup(self, soup: BeautifulSoup, url: str) -> Optional[LegalDocument]:
1070
+ """Extract main document from BeautifulSoup object using source-specific logic"""
1071
+ try:
1072
+ items = self._extract_source_specific_content(soup, url, 1)
1073
+
1074
+ if not items:
1075
+ return None
1076
+
1077
+ item = items[0]
1078
+
1079
+ return LegalDocument(
1080
+ title=item['title'],
1081
+ content=item['content'],
1082
+ source_url=item['url'],
1083
+ document_type=self._determine_document_type(item['title'], item['content']),
1084
+ date_published=item['date']
1085
+ )
1086
+
1087
+ except Exception as e:
1088
+ logger.error(f"Document extraction failed: {e}")
1089
+ return None
1090
+
1091
+ def _extract_additional_articles(self, soup: BeautifulSoup, base_url: str) -> List[LegalDocument]:
1092
+ """Extract additional articles from the same page using source-specific logic"""
1093
+ documents = []
1094
+
1095
+ try:
1096
+ items = self._extract_source_specific_content(soup, base_url, 3)
1097
+
1098
+ for item in items:
1099
+ doc = LegalDocument(
1100
+ title=item['title'],
1101
+ content=item['content'],
1102
+ source_url=item['url'],
1103
+ document_type=self._determine_document_type(item['title'], item['content']),
1104
+ date_published=item['date']
1105
+ )
1106
+
1107
+ documents.append(doc)
1108
+
1109
+ except Exception as e:
1110
+ logger.error(f"Additional articles extraction failed: {e}")
1111
+
1112
+ return documents[:3]
1113
+
1114
+ def _determine_document_type(self, title: str, content: str) -> str:
1115
+ """Determine document type based on content"""
1116
+ text = (title + " " + content).lower()
1117
+
1118
+ if any(word in text for word in ['قانون', 'ماده', 'فصل', 'بند', 'تبصره']):
1119
+ return 'law'
1120
+ elif any(word in text for word in ['رای', 'حکم', 'دادگاه', 'قاضی']):
1121
+ return 'ruling'
1122
+ elif any(word in text for word in ['آیین‌نامه', 'دستورالعمل', 'بخشنامه']):
1123
+ return 'regulation'
1124
+ elif any(word in text for word in ['خبر', 'اعلام', 'گزارش', 'اطلاعیه']):
1125
+ return 'news'
1126
+ else:
1127
+ return 'general'
1128
+
1129
+ def _extract_date(self, soup: BeautifulSoup) -> Optional[str]:
1130
+ """Extract publication date"""
1131
+ try:
1132
+ date_selectors = [
1133
+ 'meta[name="article:published_time"]',
1134
+ 'meta[property="article:published_time"]',
1135
+ 'meta[name="date"]',
1136
+ 'meta[name="DC.date"]',
1137
+ '.date',
1138
+ '.publish-date',
1139
+ '.article-date',
1140
+ 'time[datetime]'
1141
+ ]
1142
+
1143
+ for selector in date_selectors:
1144
+ element = soup.select_one(selector)
1145
+ if element:
1146
+ date_str = element.get('content') or element.get('datetime') or element.get_text()
1147
+ if date_str:
1148
+ return self._normalize_date(date_str)
1149
+
1150
+ text = soup.get_text()
1151
+ persian_date_patterns = [
1152
+ r'(\d{4}/\d{1,2}/\d{1,2})',
1153
+ r'(\d{1,2}/\d{1,2}/\d{4})',
1154
+ r'(\d{4}-\d{1,2}-\d{1,2})'
1155
+ ]
1156
+
1157
+ for pattern in persian_date_patterns:
1158
+ match = re.search(pattern, text)
1159
+ if match:
1160
+ return match.group(1)
1161
+
1162
+ return None
1163
+
1164
+ except Exception:
1165
+ return None
1166
+
1167
+ def _normalize_date(self, date_str: str) -> Optional[str]:
1168
+ """Normalize date string to standard format"""
1169
+ try:
1170
+ date_str = re.sub(r'[^\d/\-:]', ' ', date_str).strip()
1171
+
1172
+ formats = [
1173
+ '%Y-%m-%d',
1174
+ '%Y/%m/%d',
1175
+ '%d/%m/%Y',
1176
+ '%Y-%m-%d %H:%M:%S',
1177
+ '%Y/%m/%d %H:%M:%S'
1178
+ ]
1179
+
1180
+ for fmt in formats:
1181
+ try:
1182
+ parsed_date = datetime.strptime(date_str, fmt)
1183
+ return parsed_date.strftime('%Y-%m-%d')
1184
+ except ValueError:
1185
+ continue
1186
+
1187
+ return date_str
1188
+
1189
+ except Exception:
1190
+ return None
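
The scraper above can also be driven without the Gradio front end. A minimal programmatic sketch against the committed API (the URLs, query, and limits are illustrative only):

    # Hypothetical driver script; all names come from app/legal_scraper_interface.py above.
    from legal_scraper_interface import EnhancedLegalScraper, IRANIAN_LEGAL_SOURCES

    scraper = EnhancedLegalScraper(delay=1.5)
    docs = scraper.scrape_real_sources(IRANIAN_LEGAL_SOURCES[:2], max_docs=5)
    for doc in docs:
        scraper.save_document(doc)  # scrape_real_sources() does not persist documents itself

    stats = scraper.get_enhanced_statistics()
    print(stats["total_documents"], stats["by_type"])

    results = scraper.search_with_similarity("قانون مالیات", limit=5)  # falls back to plain text search without embeddings
    csv_path = scraper.export_to_csv()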