Anupam251272 committed on
Commit
0b85946
·
verified ·
1 Parent(s): be44d09

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +578 -0
app.py ADDED
@@ -0,0 +1,578 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import requests
3
+ import pandas as pd
4
+ from transformers import MarianMTModel, MarianTokenizer
5
+ from sentence_transformers import SentenceTransformer
6
+ from bs4 import BeautifulSoup
7
+ from fake_useragent import UserAgent
8
+ from datetime import datetime
9
+ import warnings
10
+ import gc
11
+ import re
12
+ import time
13
+ import random
14
+ import torch
15
+ from requests.exceptions import RequestException
16
+ import concurrent.futures
17
+ import json
18
+
19
+ warnings.filterwarnings('ignore')
20
+
21
class LegalResearchGenerator:
    """Search Indian legal web sources and render a plain-text research report.

    A query is sent to three fetchers (India Code, LII of India and a list of
    alternative sites) in parallel. Every hit is scored against the query
    with a sentence-embedding model, the best hits are kept, and the result
    is formatted as a boxed text report. Report text can optionally be
    translated from English to Hindi; the translation model is loaded lazily
    on first use.
    """

    # The only translation model this class loads is English -> Hindi, so
    # Hindi is the only non-English language that can be produced correctly.
    TRANSLATION_MODEL_NAME = "Helsinki-NLP/opus-mt-en-hi"
    SUPPORTED_TRANSLATION_LANGUAGES = frozenset({"hindi"})

    # Used when fake_useragent cannot supply a User-Agent string.
    _FALLBACK_USER_AGENT = (
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/120.0 Safari/537.36"
    )

    _HEAVY_RULE = "═" * 80
    _LIGHT_RULE = "─" * 80

    # Hard-coded documents shown when no source returns anything.
    _SAMPLE_DOCUMENTS = (
        {
            'title': 'Right to Privacy Judgment',
            'court': 'Supreme Court',
            'summary': 'The right to privacy is protected as an intrinsic part of the right to life and personal liberty under Article 21 and as a part of the freedoms guaranteed by Part III of the Constitution.',
            'url': 'https://main.sci.gov.in/supremecourt/2012/35071/35071_2012_Judgement_24-Aug-2017.pdf',
            'type': 'legal',
            'source': 'Supreme Court of India',
            'relevance_score': 0.95
        },
        {
            'title': 'Information Technology Act, 2000',
            'court': 'India Code',
            'summary': 'An Act to provide legal recognition for transactions carried out by means of electronic data interchange and other means of electronic communication.',
            'url': 'https://www.indiacode.nic.in/handle/123456789/1999/simple-search',
            'type': 'legal',
            'source': 'India Code Portal',
            'relevance_score': 0.85
        },
    )

    def __init__(self):
        """Set up category tables, the HTTP session and the embedding model."""
        self.legal_categories = [
            "criminal", "civil", "constitutional", "corporate",
            "tax", "family", "property", "intellectual_property"
        ]

        # UI document-type key -> value sent as the India Code `type` parameter.
        self.doc_types = {
            "all": "",
            "central_acts": "central-acts",
            "state_acts": "state-acts",
            "regulations": "regulations",
            "ordinances": "ordinances",
            "constitutional_orders": "constitutional-orders"
        }

        # The translation model is loaded on demand (see
        # initialize_translation_model).
        self.translation_model = None
        self.translation_tokenizer = None

        self.session = requests.Session()
        self.session.headers.update(self.get_random_headers())

        self.max_retries = 3
        self.retry_delay = 1

        # Without the embedding model every relevance score is 0.0.
        try:
            self.sentence_model = SentenceTransformer('all-MiniLM-L6-v2')
        except Exception as e:
            print(f"Error initializing sentence transformer: {e}")
            self.sentence_model = None

    def initialize_translation_model(self):
        """Load the English->Hindi model and tokenizer if not loaded yet.

        Returns:
            True when both model and tokenizer are available, False when
            loading failed (the error is printed).
        """
        if self.translation_model is not None and self.translation_tokenizer is not None:
            return True

        try:
            self.translation_model_name = self.TRANSLATION_MODEL_NAME
            # Load into locals first so a failure on the second load cannot
            # leave a model without its tokenizer behind.
            model = MarianMTModel.from_pretrained(self.translation_model_name)
            tokenizer = MarianTokenizer.from_pretrained(self.translation_model_name)
        except Exception as e:
            print(f"Error initializing translation model: {e}")
            return False

        self.translation_model = model
        self.translation_tokenizer = tokenizer
        return True

    def get_random_headers(self):
        """Return browser-like request headers with a random User-Agent.

        Falls back to a fixed User-Agent string when fake_useragent cannot
        provide one, so building headers never raises.
        """
        browser = random.choice(['chrome', 'firefox', 'safari', 'edge'])
        try:
            # Reuse one UserAgent instance instead of rebuilding it per request.
            ua = getattr(self, '_user_agent', None)
            if ua is None:
                ua = self._user_agent = UserAgent()
            user_agent = ua[browser]
        except Exception as e:
            print(f"Error generating user agent: {e}")
            user_agent = self._FALLBACK_USER_AGENT

        return {
            'User-Agent': user_agent,
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            # 'br' is deliberately not advertised: requests can only decode
            # Brotli when an optional brotli package is installed.
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
            'DNT': '1'
        }

    def calculate_relevance_score(self, query, text):
        """Return the cosine similarity of query and text, clamped to [0, 1].

        Returns 0.0 when the embedding model is unavailable or scoring fails.
        """
        if not self.sentence_model:
            return 0.0

        try:
            query_embedding = self.sentence_model.encode([query])
            text_embedding = self.sentence_model.encode([text])

            similarity = float(torch.nn.functional.cosine_similarity(
                torch.tensor(query_embedding),
                torch.tensor(text_embedding)
            ))
            # Cosine similarity can be negative; the report shows a percentage.
            return max(0.0, min(1.0, similarity))

        except Exception as e:
            print(f"Error calculating relevance score: {e}")
            return 0.0

    def clean_text(self, text):
        """Collapse whitespace and drop characters outside a safe set.

        Word characters, whitespace and the punctuation ``.,;:?!-`` are kept.
        Falsy input yields an empty string.
        """
        if not text:
            return ""

        text = re.sub(r'\s+', ' ', text.strip())
        return re.sub(r'[^\w\s\.,;:?!-]', '', text)

    @staticmethod
    def _absolute_url(base_url, href):
        """Resolve href against base_url; an empty href stays empty."""
        from urllib.parse import urljoin

        return urljoin(base_url, href) if href else ""

    def _make_result(self, query, title, summary, url, court, source):
        """Build one result record, scoring title + summary against the query."""
        title = title or "Untitled"
        summary = summary or ""
        return {
            'title': title,
            'court': court,
            'summary': summary[:500],
            'url': url or "",
            'type': 'legal',
            'source': source,
            'relevance_score': self.calculate_relevance_score(query, f"{title} {summary}")
        }

    def format_legal_case(self, case_num, case_data, target_language='english'):
        """Render one result record as a report section.

        Args:
            case_num: Number shown in the section heading.
            case_data: Result dict; missing keys fall back to defaults
                rather than dropping the whole section.
            target_language: Language for the title and summary.

        Returns:
            The formatted section, or "" if formatting fails.
        """
        try:
            title = self.translate_text(
                self.clean_text(case_data.get('title') or 'Untitled'), target_language)
            summary = self.translate_text(
                self.clean_text(case_data.get('summary')), target_language)
            source = case_data.get('source', 'Unknown Source')
            relevance = round((case_data.get('relevance_score') or 0) * 100, 2)
            url = case_data.get('url', '')

            lines = [
                "",
                self._HEAVY_RULE,
                f"📑 LEGAL DOCUMENT {case_num}",
                self._HEAVY_RULE,
                "",
                "📌 TITLE:",
                f"{title}",
                "",
                f"📚 SOURCE: {source}",
                f"🎯 RELEVANCE: {relevance}%",
                "",
                "📖 SUMMARY:",
                f"{summary}",
                "",
                "🔗 DOCUMENT LINK:",
                f"{url}",
                "",
                self._LIGHT_RULE,
                "",
            ]
            return "\n".join(lines)
        except Exception as e:
            print(f"Error formatting legal case: {e}")
            return ""

    def translate_text(self, text, target_language):
        """Translate English text into target_language where supported.

        Text is returned unchanged for English, for empty text, for any
        language without a loaded model (only Hindi has one), and when the
        model cannot be loaded or translation fails. Input longer than 512
        tokens is truncated by the tokenizer.
        """
        language = (target_language or "english").strip().lower()
        if language == "english" or not text:
            return text

        if language not in self.SUPPORTED_TRANSLATION_LANGUAGES:
            # Running the en->hi model here would return Hindi, not the
            # requested language, so leave the text in English instead.
            print(f"Translation to '{target_language}' is not supported; keeping English text")
            return text

        if not self.initialize_translation_model():
            return text

        try:
            inputs = self.translation_tokenizer(
                text, return_tensors="pt", padding=True, truncation=True, max_length=512)
            translated = self.translation_model.generate(**inputs)
            return self.translation_tokenizer.decode(translated[0], skip_special_tokens=True)
        except Exception as e:
            print(f"Error during translation: {e}")
            return text

    def _parse_indiacode_item(self, item, query, base_url):
        """Turn one India Code search-result element into a result record."""
        title_elem = (
            item.select_one('h4.artifact-title a') or
            item.select_one('.act-title') or
            item.select_one('h3 a')
        )
        title = title_elem.get_text(strip=True) if title_elem else "Untitled"
        href = title_elem.get('href', '') if title_elem else ""

        summary_elem = (
            item.select_one('div.artifact-info') or
            item.select_one('.act-description') or
            item.select_one('.summary')
        )
        summary = summary_elem.get_text(strip=True) if summary_elem else ""
        if not summary:
            # No dedicated summary element: use the longer text fragments.
            summary = ' '.join(
                text for text in item.stripped_strings
                if text != title and len(text) > 30
            )

        return self._make_result(
            query, title, summary,
            self._absolute_url(base_url, href),
            court='India Code', source='India Code Portal',
        )

    def fetch_from_indiacode(self, query, doc_type="all", max_results=5):
        """Fetch results from the India Code portal, retrying on failure.

        Args:
            query: Search text.
            doc_type: Key of ``self.doc_types``; unknown keys search all types.
            max_results: Maximum number of records returned.

        Returns:
            Up to max_results records sorted by descending relevance, or []
            when every attempt fails or yields nothing.
        """
        base_url = "https://www.indiacode.nic.in/search"
        params = {
            'q': query,
            'type': self.doc_types.get(doc_type, ""),
            'page': 1,
            # Over-fetch so relevance ranking has candidates to choose from.
            'size': max_results * 2
        }

        for attempt in range(self.max_retries):
            is_last_attempt = attempt == self.max_retries - 1
            try:
                response = self.session.get(
                    base_url,
                    params=params,
                    headers=self.get_random_headers(),
                    timeout=15
                )

                if response.status_code == 429:
                    # Rate limited: back off a little longer each attempt.
                    if not is_last_attempt:
                        time.sleep(self.retry_delay * (attempt + 1))
                    continue

                if response.status_code != 200:
                    print(f"India Code returned HTTP {response.status_code}. "
                          f"Attempt {attempt + 1}/{self.max_retries}")
                    continue

                soup = BeautifulSoup(response.text, 'html.parser')
                items = (
                    soup.select('div.artifact-description') or
                    soup.select('.search-result-item') or
                    soup.select('.result-item')
                )
                if not items:
                    print(f"No results found with current selectors. Attempt {attempt + 1}/{self.max_retries}")
                    continue

                results = []
                for item in items:
                    try:
                        results.append(self._parse_indiacode_item(item, query, base_url))
                    except Exception as e:
                        print(f"Error processing result: {e}")

                if results:
                    results.sort(key=lambda x: x['relevance_score'], reverse=True)
                    return results[:max_results]

            except Exception as e:
                print(f"Error on attempt {attempt + 1}: {e}")
                if not is_last_attempt:
                    time.sleep(self.retry_delay)

        return []

    def fetch_from_liiofindia(self, query, doc_type="all", max_results=5):
        """Fetch results from LII of India, expecting a JSON response.

        Returns:
            Up to max_results records sorted by descending relevance, or []
            on a non-200 status, a non-JSON body or any other error.
        """
        try:
            base_url = "https://www.liiofindia.org/search/"
            params = {
                'q': query,
                'page': 1,
                'per_page': max_results * 2,
                'sort': 'relevance'
            }
            if doc_type != "all":
                params['type'] = doc_type

            response = self.session.get(
                base_url,
                params=params,
                headers={
                    **self.get_random_headers(),
                    'Accept': 'application/json'
                },
                timeout=15
            )
            if response.status_code != 200:
                return []

            try:
                data = response.json()
            except ValueError as e:
                print(f"Error parsing JSON from LII India: {e}")
                return []

            # Anything but a JSON object carries no 'results' list.
            items = data.get('results', []) if isinstance(data, dict) else []
            results = [
                self._make_result(
                    query,
                    item.get('title'),
                    item.get('snippet'),
                    item.get('url', ''),
                    court=item.get('court', 'LII India'),
                    source='LII India',
                )
                for item in items
                if isinstance(item, dict)
            ]
            results.sort(key=lambda x: x['relevance_score'], reverse=True)
            return results[:max_results]

        except Exception as e:
            print(f"Error fetching from LII India: {e}")
            return []

    def fetch_alternative_source(self, query, max_results=5):
        """Fetch results from every alternative source and merge them.

        Each source is queried independently, so a failure or empty page on
        one site does not stop the others. Relative links are resolved
        against the site they came from.

        Returns:
            Up to max_results records sorted by descending relevance.
        """
        sources = [
            ("Indian Kanoon", "https://indiankanoon.org/search/"),
            ("Supreme Court of India", "https://main.sci.gov.in/judgments"),
            ("Department of Justice", "https://doj.gov.in/acts-and-rules/"),
        ]
        # NOTE(review): the same query parameters and CSS selectors are used
        # for all three sites - confirm they match each site's markup.
        params = {
            'formInput': query,
            'pageSize': max_results
        }

        all_results = []
        for source_name, base_url in sources:
            try:
                response = self.session.get(
                    base_url,
                    params=params,
                    headers=self.get_random_headers(),
                    timeout=15
                )
                if response.status_code != 200:
                    continue

                soup = BeautifulSoup(response.text, 'html.parser')
                for result in soup.select('.result_item')[:max_results]:
                    try:
                        title_elem = result.select_one('.title a')
                        title = title_elem.get_text(strip=True) if title_elem else "Untitled"
                        href = title_elem.get('href', '') if title_elem else ""

                        snippet_elem = result.select_one('.snippet')
                        summary = snippet_elem.get_text(strip=True) if snippet_elem else ""

                        all_results.append(self._make_result(
                            query, title, summary,
                            self._absolute_url(base_url, href),
                            court='Alternative Source', source=source_name,
                        ))
                    except Exception as e:
                        print(f"Error processing alternative result: {e}")

            except Exception as e:
                print(f"Error in alternative source: {e}")

        all_results.sort(key=lambda x: x['relevance_score'], reverse=True)
        return all_results[:max_results]

    def fetch_from_multiple_sources(self, query, doc_type="all", max_results=5):
        """Run all three fetchers in parallel and merge their results.

        Returns:
            Up to max_results records across all sources, sorted by
            descending relevance. A fetcher that raises is skipped.
        """
        all_results = []

        with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
            future_to_source = {
                executor.submit(self.fetch_from_indiacode, query, doc_type, max_results): "India Code",
                executor.submit(self.fetch_from_liiofindia, query, doc_type, max_results): "LII India",
                executor.submit(self.fetch_alternative_source, query, max_results): "Alternative"
            }

            for future in concurrent.futures.as_completed(future_to_source):
                source = future_to_source[future]
                try:
                    results = future.result()
                    if results:
                        all_results.extend(results)
                except Exception as e:
                    print(f"Error fetching from {source}: {e}")

        all_results.sort(key=lambda x: x['relevance_score'], reverse=True)
        return all_results[:max_results]

    def process_research(self, input_query, research_type="legal", doc_type="all", target_language='english'):
        """Run a search and return the complete text report.

        Args:
            input_query: Research topic; empty or None yields an error string.
            research_type: Accepted for interface compatibility; not used.
            doc_type: Document-type key passed on to the fetchers.
            target_language: Language for the translatable report text.

        Returns:
            The report, or an error message string. Never raises.
        """
        try:
            if not input_query or not input_query.strip():
                return "Error: Please enter a valid research query."

            cases = self.fetch_from_multiple_sources(input_query, doc_type)

            using_samples = not cases
            if using_samples:
                print("No results from APIs, using sample data")
                cases = [dict(doc) for doc in self._SAMPLE_DOCUMENTS]

            def tr(label):
                # Translate one label at a time: sending the whole boxed
                # header through the model would re-translate the topic and
                # could not preserve the line layout.
                return self.translate_text(label, target_language)

            header_lines = [
                "",
                "╔" + "═" * 78 + "╗",
                f"║ {tr('LEGAL DOCUMENT ANALYSIS REPORT').center(76)} ║",
                "╠" + "═" * 78 + "╣",
                "║",
                f"║ 🎯 {tr('RESEARCH TOPIC')}: {tr(input_query)}",
                f"║ 📅 {tr('GENERATED')}: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
                f"║ 📚 {tr('DOCUMENTS FOUND')}: {len(cases)}",
                f"║ 🔍 {tr('SOURCES SEARCHED')}: India Code Portal, LII India, Indian Kanoon",
            ]
            if using_samples:
                # Make it explicit that these are not hits for the query.
                header_lines.append(
                    "║ " + tr("NOTE: no live results were retrieved; "
                            "the documents below are built-in samples")
                )
            header_lines += [
                "║",
                "╚" + "═" * 78 + "╝",
                "",
            ]

            sections = [
                self.format_legal_case(i, case, target_language)
                for i, case in enumerate(cases, 1)
            ]

            footer_lines = [
                "",
                self._HEAVY_RULE,
                f"📊 {tr('RESEARCH INSIGHTS')}",
                self._HEAVY_RULE,
                "",
                f"• {tr('Results are sorted by relevance to your query')}",
                f"• {tr('All information should be verified from original sources')}",
                f"• {tr('Use provided links to access complete documents')}",
                "",
                self._LIGHT_RULE,
                "",
            ]

            return "\n".join(header_lines) + "".join(sections) + "\n".join(footer_lines)

        except Exception as e:
            return f"An error occurred during research processing: {str(e)}"

    def clear_gpu_memory(self):
        """Run garbage collection and empty the CUDA cache when CUDA exists."""
        try:
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
        except Exception as e:
            print(f"Error clearing GPU memory: {e}")
489
+
490
def create_gradio_interface():
    """Build the Gradio UI around a single LegalResearchGenerator.

    Returns:
        The configured ``gr.Interface``; the caller is expected to launch it.
    """
    generator = LegalResearchGenerator()

    def process_input(input_text, research_type, doc_type, target_language, output_format):
        """Validate the form values and return the report or a message."""
        # input_text may be None as well as blank, so check before .strip().
        if not input_text or not input_text.strip():
            return "Please enter a research topic to analyze."

        if output_format != "Text":
            return "CSV output format is not implemented yet."

        try:
            return generator.process_research(
                input_text,
                research_type,
                doc_type,
                target_language
            )
        except Exception as e:
            return f"An error occurred: {str(e)}"
        finally:
            # Release cached memory whether the request succeeded or failed.
            generator.clear_gpu_memory()

    css = (
        "\n"
        ".gradio-container {\n"
        "font-family: 'Arial', sans-serif;\n"
        "}\n"
        ".output-text {\n"
        "font-family: 'Courier New', monospace;\n"
        "white-space: pre-wrap;\n"
        "}\n"
    )

    description = (
        "\n"
        "Advanced legal research tool for Indian legal document analysis.\n"
        "• Multi-source search across legal databases\n"
        "• Smart filtering and relevance ranking\n"
        "• Multi-language support\n"
        "• Comprehensive research reports\n"
    )

    iface = gr.Interface(
        fn=process_input,
        inputs=[
            gr.Textbox(
                label="Enter Research Topic",
                placeholder="e.g., 'privacy rights' or 'environmental protection'",
                lines=3
            ),
            gr.Radio(
                choices=["legal"],
                label="Research Type",
                value="legal"
            ),
            gr.Dropdown(
                choices=list(generator.doc_types.keys()),
                label="Document Type",
                value="all"
            ),
            gr.Dropdown(
                choices=["english", "hindi", "tamil", "bengali", "telugu"],
                label="Output Language",
                value="english"
            ),
            gr.Radio(
                choices=["Text", "CSV"],
                label="Output Format",
                value="Text"
            )
        ],
        outputs=gr.Textbox(
            label="Research Analysis Report",
            lines=30,
            elem_classes=["output-text"]
        ),
        title="🔬 Legal Research Analysis Tool",
        description=description,
        # Each example supplies one value per input, in input order.
        examples=[
            ["right to privacy", "legal", "central_acts", "english", "Text"],
            ["environmental protection", "legal", "regulations", "hindi", "Text"],
            ["digital rights", "legal", "constitutional_orders", "english", "Text"]
        ],
        css=css
    )

    return iface
575
+
576
if __name__ == "__main__":
    # Script entry point: build the interface and start serving it.
    create_gradio_interface().launch(share=True)