JARVIS-JI committed on
Commit
8e9e93e
·
verified ·
1 Parent(s): 4ca148c

Create app.py

Files changed (1)
  1. app.py +235 -0
app.py ADDED
@@ -0,0 +1,235 @@
+ import gradio as gr
+ from PyPDF2 import PdfReader
+ from transformers import AutoTokenizer, AutoModel
+ import torch
+ import faiss
+ import numpy as np
+ from groq import Groq
+ import os
+
+ # ------------- CONSTANTS ------------------------------------------------------
+ LEGAL_BERT_MODEL = "nlpaueb/legal-bert-base-uncased"
+
+ # Multiple legal documents - adjust PDFs here
+ DOCS = [
+     ("bns_full.pdf", "Bharatiya Nyaya Sanhita 2023"),
+     ("bns_ipc_mapping.pdf", "BNS-IPC Comparative Mapping"),
+ ]
+
+ MAX_CHUNK_SIZE = 1000
+ OVERLAP = 200
+ TOP_K = 5  # Number of chunks to retrieve for context
+ LLAMA_MODEL = "llama-3.3-70b-versatile"
+
+ # Groq API setup
+ GROQ_API_KEY = os.getenv("GROQ_API_KEY")
+ groq_client = Groq(api_key=GROQ_API_KEY)
+
+ # ------------- LEGAL-BERT EMBEDDER CLASS ------------------------------------
+ class LegalBERTEmbedder:
+     def __init__(self, model_name=LEGAL_BERT_MODEL):
+         self.tokenizer = AutoTokenizer.from_pretrained(model_name)
+         self.model = AutoModel.from_pretrained(model_name)
+         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+         self.model.to(self.device)
+         self.model.eval()
+
+     def embed(self, texts):
+         """Embed a list of texts using the [CLS] token representation."""
+         all_embeddings = []
+         with torch.no_grad():
+             for text in texts:
+                 inputs = self.tokenizer(text, return_tensors="pt",
+                                         truncation=True, max_length=512).to(self.device)
+                 outputs = self.model(**inputs)
+                 cls_embed = outputs.last_hidden_state[:, 0, :].cpu().numpy()
+                 all_embeddings.append(cls_embed.flatten())
+         return np.vstack(all_embeddings)
+
+ # ------------- PDF PROCESSING FUNCTIONS ------------------------------------
+ def extract_text_from_pdf(pdf_path):
+     """Extract text from a PDF file"""
+     reader = PdfReader(pdf_path)
+     raw_text = ""
+     for page in reader.pages:
+         text = page.extract_text()
+         if text:
+             raw_text += text + "\n"
+     return raw_text
+
+ def chunk_text(text, max_chunk_size=MAX_CHUNK_SIZE, overlap=OVERLAP):
+     """Split text into overlapping chunks"""
+     chunks = []
+     start = 0
+     length = len(text)
+     while start < length:
+         end = min(start + max_chunk_size, length)
+         chunk = text[start:end]
+         chunks.append(chunk)
+         start += max_chunk_size - overlap
+     return chunks
+
+ # ------------- FAISS INDEX FUNCTIONS ---------------------------------------
+ def build_faiss_index(embeddings):
+     """Build FAISS index for similarity search"""
+     dim = embeddings.shape[1]
+     index = faiss.IndexFlatIP(dim)  # Inner product over L2-normalized vectors = cosine similarity
+     faiss.normalize_L2(embeddings)
+     index.add(embeddings)
+     return index
+
+ def query_faiss(index, query_embed, k=TOP_K):
+     """Query FAISS index for top-k similar chunks"""
+     faiss.normalize_L2(query_embed)
+     distances, indices = index.search(query_embed, k)
+     return distances, indices
+
+ # ------------- LOAD AND PROCESS ALL DOCUMENTS ------------------------------
+ print("Loading and processing multiple legal documents...")
+
+ embedder = LegalBERTEmbedder()
+ all_chunks = []
+ metadata = []  # Store (act_label, original_chunk_text) for reference
+
+ print("Extracting and chunking text from all PDFs...")
+ for pdf_path, act_label in DOCS:
+     try:
+         raw_text = extract_text_from_pdf(pdf_path)
+         print(f"Extracted {len(raw_text)} characters from {act_label}")
+
+         chunks = chunk_text(raw_text)
+         print(f"Created {len(chunks)} chunks from {act_label}")
+
+         # Prefix each chunk with its act label for better context
+         labeled_chunks = [f"[{act_label}] {chunk}" for chunk in chunks]
+         all_chunks.extend(labeled_chunks)
+         metadata.extend([(act_label, chunk) for chunk in chunks])
+
+     except Exception as e:
+         print(f"Error processing {pdf_path}: {str(e)}")
+         continue
+
+ print(f"Total chunks created: {len(all_chunks)}")
+
+ print("Embedding all text chunks with Legal-BERT...")
+ chunk_embeddings = embedder.embed(all_chunks)
+ print("Embeddings created successfully")
+
+ print("Building FAISS index...")
+ faiss_index = build_faiss_index(chunk_embeddings)
+ print("FAISS index built successfully")
+
+ # ------------- PROMPT TEMPLATES -------------------------------------------
+ SYSTEM_PROMPT = """You are a senior Indian legal expert specializing in the Bharatiya Nyaya Sanhita 2023 (BNS) and its correspondence with the Indian Penal Code 1860 (IPC).
+ When answering any question, you MUST use this exact format:
+ CONTEXT/SITUATION:
+ [Provide a detailed explanation of the legal context and situation]
+ BNS SECTIONS:
+ [List the specific BNS sections and subsections that apply, with proper citations]
+ IPC SECTIONS (if applicable):
+ [List the corresponding IPC sections based on the mappings, with proper citations]
+ SUMMARY:
+ [Provide a clear one-sentence summary highlighting the applicable BNS and IPC sections in **bold**]
+ Always cite specific sections when available and ensure your response covers the relevant BNS provisions and mapped IPC equivalents."""
+
+ def build_user_prompt(context, question):
+     """Build the user prompt with retrieved context and the question"""
+     return f"""Based on the following relevant extracts from BNS and IPC legislation:
+ {context}
+ Question: {question}
+ Please provide a comprehensive legal answer following the exact format specified in the system instructions."""
+
+ # ------------- MAIN QUERY FUNCTION ----------------------------------------
+ def answer_query(user_query):
+     """Main function to answer user queries"""
+     try:
+         # Embed the user query
+         query_embed = embedder.embed([user_query])
+
+         # Retrieve top-k similar chunks from FAISS
+         _, indices = query_faiss(faiss_index, query_embed, k=TOP_K)
+         retrieved_chunks = [all_chunks[i] for i in indices[0]]
+
+         # Prepare context for the LLM
+         context = "\n\n".join(retrieved_chunks)
+
+         # Create a chat completion using the Groq API with Llama 3.3
+         chat_completion = groq_client.chat.completions.create(
+             messages=[
+                 {
+                     "role": "system",
+                     "content": SYSTEM_PROMPT
+                 },
+                 {
+                     "role": "user",
+                     "content": build_user_prompt(context, user_query)
+                 }
+             ],
+             model=LLAMA_MODEL,
+             temperature=0.1,
+             max_tokens=1024
+         )
+
+         return chat_completion.choices[0].message.content.strip()
+
+     except Exception as e:
+         return f"Error processing query: {str(e)}\n\nPlease check your Groq API key and internet connection."
+
+ # ------------- GRADIO INTERFACE -------------------------------------------
+ with gr.Blocks(title="IPC & BNS Legal Assistant") as demo:
+     gr.Markdown("""
+ # 🏛️ IPC & BNS Legal Assistant
+
+ **Comprehensive Legal Q&A System covering:**
+ - Bharatiya Nyaya Sanhita 2023 (BNS)
+ - Corresponding Indian Penal Code 1860 (IPC) sections
+
+ Ask any question about Indian criminal legislation and get structured legal answers with proper citations.
+ """)
+
+     with gr.Row():
+         with gr.Column():
+             query_input = gr.Textbox(
+                 label="💼 Enter your legal query",
+                 placeholder="e.g., What are the penalties for murder under BNS? What is the IPC equivalent for theft?",
+                 lines=4,
+                 max_lines=8
+             )
+
+     with gr.Row():
+         submit_btn = gr.Button("🔍 Get Legal Answer", variant="primary", scale=2)
+         clear_btn = gr.Button("🗑️ Clear", scale=1)
+
+     with gr.Row():
+         answer_output = gr.Markdown(
+             label="📋 Legal Analysis",
+             value="*Submit your question to get a structured legal analysis...*"
+         )
+
+     # Event handlers
+     submit_btn.click(answer_query, inputs=query_input, outputs=answer_output)
+     query_input.submit(answer_query, inputs=query_input, outputs=answer_output)
+     clear_btn.click(lambda: ("", "*Submit your question to get a structured legal analysis...*"),
+                     outputs=[query_input, answer_output])
+
+     # Add examples
+     gr.Examples(
+         examples=[
+             ["What are the penalties for murder under BNS?"],
+             ["What is the IPC equivalent for BNS Section 103?"],
+             ["What constitutes theft according to BNS legislation?"],
+             ["How are punishments defined for assault in BNS?"],
+             ["What are the legal provisions for robbery under IPC and BNS?"]
+         ],
+         inputs=query_input,
+         outputs=answer_output,
+         fn=answer_query,
+         cache_examples=False
+     )
+
+ # Launch the interface
+ if __name__ == "__main__":
+     demo.launch(
+         share=False,
+         debug=True,
+         show_error=True
+     )
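
To exercise the retrieval-and-answer pipeline outside the Gradio UI, a minimal smoke test might look like the sketch below. It is not part of this commit and assumes the two PDFs listed in DOCS sit next to app.py and that a valid GROQ_API_KEY is already exported; importing the module runs the PDF extraction, Legal-BERT embedding, and FAISS indexing steps before any query is answered.

    # Hypothetical smoke test (not part of this commit).
    # Requires the DOCS PDFs alongside app.py and GROQ_API_KEY in the environment.
    import app  # importing builds the index: extract PDFs, embed chunks, build FAISS

    answer = app.answer_query("What are the penalties for murder under BNS?")
    print(answer)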