Polarisailabs committed
Commit c2958f9 · verified · 1 Parent(s): 7b1af28

Upload app.py

Files changed (1)
  1. app.py +215 -403
app.py CHANGED
@@ -1,413 +1,225 @@
- import os
- import io
- import json
- import pathlib
- import shutil
- from typing import List, Tuple, Dict
- import gradio as gr
- import numpy as np
- import faiss
  from sentence_transformers import SentenceTransformer
  from pypdf import PdfReader
- import fitz # PyMuPDF
  from collections import defaultdict
  from openai import OpenAI
-
- # =========================
- # LLM Endpoint
- # =========================
- API_KEY = os.environ.get("API_KEY")
- if not API_KEY:
-     raise RuntimeError("Missing API_KEY (set it in Hugging Face: Settings → Variables and secrets).")
-
- client = OpenAI(base_url="https://openrouter.ai/api/v1", api_key=API_KEY)
-
- # Model configuration
- SINGLE_MODEL_NAME = "deepseek/deepseek-r1:free"
- GEN_TEMPERATURE = 0.2
- GEN_TOP_P = 0.95
- GEN_MAX_TOKENS = 1024
- EMB_MODEL_NAME = "intfloat/multilingual-e5-base"
-
- def choose_store_dir() -> Tuple[str, bool]:
-     data_root = "/data"
-     if os.path.isdir(data_root) and os.access(data_root, os.W_OK):
-         d = os.path.join(data_root, "rag_store")
-         try:
-             os.makedirs(d, exist_ok=True)
-             testf = os.path.join(d, ".write_test")
-             with open(testf, "w", encoding="utf-8") as f:
-                 f.write("ok")
-             os.remove(testf)
-             return d, True
-         except Exception:
-             pass
-     d = os.path.join(os.getcwd(), "store")
-     os.makedirs(d, exist_ok=True)
-     return d, False
-
- STORE_DIR, IS_PERSISTENT = choose_store_dir()
- META_PATH = os.path.join(STORE_DIR, "meta.json")
- INDEX_PATH = os.path.join(STORE_DIR, "faiss.index")
- LEGACY_STORE_DIR = os.path.join(os.getcwd(), "store")
-
  def migrate_legacy_if_any():
-     try:
-         if IS_PERSISTENT:
-             legacy_meta = os.path.join(LEGACY_STORE_DIR, "meta.json")
-             legacy_index = os.path.join(LEGACY_STORE_DIR, "faiss.index")
-             if (not os.path.exists(META_PATH) or not os.path.exists(INDEX_PATH)) \
-                     and os.path.isdir(LEGACY_STORE_DIR) \
-                     and os.path.exists(legacy_meta) and os.path.exists(legacy_index):
-                 shutil.copyfile(legacy_meta, META_PATH)
-                 shutil.copyfile(legacy_index, INDEX_PATH)
-     except Exception:
-         pass
-
  migrate_legacy_if_any()
-
- _emb_model = None
- _index: faiss.Index = None
- _meta: Dict[str, Dict] = {}
-
- DEFAULT_TOP_K = 6
- DEFAULT_POOL_K = 40
- DEFAULT_PER_SOURCE_CAP = 2
- DEFAULT_STRATEGY = "mmr"
- DEFAULT_MMR_LAMBDA = 0.5
-
  def get_emb_model():
-     global _emb_model
-     if _emb_model is None:
-         _emb_model = SentenceTransformer(EMB_MODEL_NAME)
-     return _emb_model
-
- def _ensure_index(dim: int):
-     global _index
-     if _index is None:
-         _index = faiss.IndexFlatIP(dim)
-
  def _persist():
-     faiss.write_index(_index, INDEX_PATH)
-     with open(META_PATH, "w", encoding="utf-8") as f:
-         json.dump(_meta, f, ensure_ascii=False)
-
  def _load_if_any():
-     global _index, _meta
-     if os.path.exists(INDEX_PATH) and os.path.exists(META_PATH):
-         _index = faiss.read_index(INDEX_PATH)
-         with open(META_PATH, "r", encoding="utf-8") as f:
-             _meta = json.load(f)
-
- def _chunk_text(text: str, chunk_size: int = 800, overlap: int = 120) -> List[str]:
-     text = text.replace("\u0000", "")
-     res, i, n = [], 0, len(text)
-     while i < n:
-         j = min(i + chunk_size, n)
-         seg = text[i:j].strip()
-         if seg:
-             res.append(seg)
-         i = max(0, j - overlap)
-         if j >= n:
-             break
-     return res
-
- def _read_bytes(file) -> bytes:
-     if isinstance(file, dict):
-         p = file.get("path") or file.get("name")
-         if p and os.path.exists(p):
-             with open(p, "rb") as f:
-                 return f.read()
-         if "data" in file and isinstance(file["data"], (bytes, bytearray)):
-             return bytes(file["data"])
-     if isinstance(file, (str, pathlib.Path)):
-         with open(file, "rb") as f:
-             return f.read()
-     if hasattr(file, "read"):
-         try:
-             if hasattr(file, "seek"):
-                 try:
-                     file.seek(0)
-                 except Exception:
-                     pass
-             return file.read()
-         finally:
-             try:
-                 file.close()
-             except Exception:
-                 pass
-     raise ValueError("Unsupported file type from gr.File")
-
- def _decode_best_effort(raw: bytes) -> str:
-     for enc in ["utf-8", "cp932", "shift_jis", "cp950", "big5", "gb18030", "latin-1"]:
-         try:
-             return raw.decode(enc)
-         except Exception:
-             continue
-     return raw.decode("utf-8", errors="ignore")
-
- def _read_pdf(file_bytes: bytes) -> str:
-     try:
-         with fitz.open(stream=file_bytes, filetype="pdf") as doc:
-             if doc.is_encrypted:
-                 try:
-                     doc.authenticate("")
-                 except Exception:
-                     pass
-             texts = [(page.get_text("text") or "") for page in doc]
-             txt = "\n".join(texts)
-             if txt.strip():
-                 return txt
-     except Exception:
-         pass
-     try:
-         reader = PdfReader(io.BytesIO(file_bytes))
-         pages = []
-         for p in reader.pages:
-             try:
-                 pages.append(p.extract_text() or "")
-             except Exception:
-                 pages.append("")
-         return "\n".join(pages)
-     except Exception:
-         return ""
-
- def _read_any(file) -> str:
-     if isinstance(file, dict):
-         name = (file.get("orig_name") or file.get("name") or file.get("path") or "upload").lower()
-     else:
-         name = getattr(file, "name", None) or (str(file) if isinstance(file, (str, pathlib.Path)) else "upload")
-     name = name.lower()
-     raw = _read_bytes(file)
-     if name.endswith(".pdf"):
-         return _read_pdf(raw).replace("\u0000", "")
-     return _decode_best_effort(raw).replace("\u0000", "")
-
- DOCS_DIR = os.path.join(os.getcwd(), "docs")
-
- def get_docs_files() -> List[str]:
-     if not os.path.isdir(DOCS_DIR):
-         return []
-     files = []
-     for fname in os.listdir(DOCS_DIR):
-         if fname.lower().endswith((".pdf", ".txt")):
-             files.append(os.path.join(DOCS_DIR, fname))
-     return files
-
  def build_corpus_from_docs():
-     global _index, _meta
-     files = get_docs_files()
-     if not files:
-         return "No files found in docs folder."
-     emb_model = get_emb_model()
-     chunks, sources, failed = [], [], []
-     _index = None
-     _meta = {}
-     for f in files:
-         fname = os.path.basename(f)
-         try:
-             text = _read_any(f) or ""
-             parts = _chunk_text(text)
-             if not parts:
-                 failed.append(fname)
-                 continue
-             chunks.extend(parts)
-             sources.extend([fname] * len(parts))
-         except Exception:
-             failed.append(fname)
-     if not chunks:
-         return "No text extracted from docs."
-     passages = [f"passage: {c}" for c in chunks]
-     vec = emb_model.encode(passages, batch_size=64, convert_to_numpy=True, normalize_embeddings=True)
-     _ensure_index(vec.shape[1])
-     _index.add(vec)
-     for i, (src, c) in enumerate(zip(sources, chunks)):
-         _meta[str(i)] = {"source": src, "text": c}
-     _persist()
-     msg = f"Indexed {len(chunks)} chunks from {len(files)} files."
-     if failed:
-         msg += f" Failed files: {', '.join(failed)}"
-     return msg
-
- def _encode_query_vec(query: str) -> np.ndarray:
-     return get_emb_model().encode([f"query: {query}"], convert_to_numpy=True, normalize_embeddings=True)
-
- def retrieve_candidates(qvec: np.ndarray, pool_k: int = 40) -> List[Tuple[str, float]]:
-     if _index is None or _index.ntotal == 0:
-         return []
-     pool_k = min(pool_k, _index.ntotal)
-     D, I = _index.search(qvec, pool_k)
-     return [(str(idx), float(score)) for idx, score in zip(I[0], D[0]) if idx != -1]
-
- def select_diverse_by_source(cands: List[Tuple[str, float]], top_k: int = 6, per_source_cap: int = 2) -> List[Tuple[str, float]]:
-     if not cands:
-         return []
-     by_src: Dict[str, List[Tuple[str, float]]] = defaultdict(list)
-     for cid, s in cands:
-         m = _meta.get(cid)
-         if not m:
-             continue
-         by_src[m["source"]].append((cid, s))
-     for src in by_src:
-         by_src[src] = by_src[src][:per_source_cap]
-     picked, src_items, ptrs = [], [(s, it) for s, it in by_src.items()], {s: 0 for s in by_src}
-     while len(picked) < top_k:
-         advanced = False
-         for src, items in src_items:
-             i = ptrs[src]
-             if i < len(items):
-                 picked.append(items[i])
-                 ptrs[src] = i + 1
-                 advanced = True
-             if len(picked) >= top_k:
-                 break
-         if not advanced:
-             break
-     if len(picked) < top_k:
-         seen = {cid for cid, _ in picked}
-         for cid, s in cands:
-             if cid not in seen:
-                 picked.append((cid, s))
-                 seen.add(cid)
-                 if len(picked) >= top_k:
-                     break
-     return picked[:top_k]
-
- def _encode_chunks_text(cids: List[str]) -> np.ndarray:
-     texts = [f"passage: {(_meta.get(cid) or {}).get('text','')}" for cid in cids]
-     return get_emb_model().encode(texts, convert_to_numpy=True, normalize_embeddings=True)
-
- def select_diverse_mmr(cands: List[Tuple[str, float]], qvec: np.ndarray, top_k: int = 6, mmr_lambda: float = 0.5) -> List[Tuple[str, float]]:
-     if not cands:
-         return []
-     cids = [cid for cid, _ in cands]
-     cvecs = _encode_chunks_text(cids)
-     sim_to_q = (cvecs @ qvec.T).reshape(-1)
-     selected, remaining = [], set(range(len(cids)))
-     while len(selected) < min(top_k, len(cids)):
-         if not selected:
-             i = int(np.argmax(sim_to_q))
-             selected.append(i)
-             remaining.remove(i)
-             continue
-         S = cvecs[selected]
-         sim_to_S = (cvecs[list(remaining)] @ S.T)
-         max_sim_to_S = sim_to_S.max(axis=1) if sim_to_S.size > 0 else np.zeros((len(remaining),), dtype=np.float32)
-         sim_q_rem = sim_to_q[list(remaining)]
-         mmr_scores = mmr_lambda * sim_q_rem - (1.0 - mmr_lambda) * max_sim_to_S
-         j_rel = int(np.argmax(mmr_scores))
-         j = list(remaining)[j_rel]
-         selected.append(j)
-         remaining.remove(j)
-     return [(cids[i], float(sim_to_q[i])) for i in selected][:top_k]
-
- def retrieve_diverse(query: str,
-                      top_k: int = 6,
-                      pool_k: int = 40,
-                      per_source_cap: int = 2,
-                      strategy: str = "mmr",
-                      mmr_lambda: float = 0.5) -> List[Tuple[str, float]]:
-     qvec = _encode_query_vec(query)
-     cands = retrieve_candidates(qvec, pool_k=pool_k)
-     if strategy == "mmr":
-         return select_diverse_mmr(cands, qvec, top_k=top_k, mmr_lambda=mmr_lambda)
-     return select_diverse_by_source(cands, top_k=top_k, per_source_cap=per_source_cap)
-
- def _format_ctx(hits: List[Tuple[str, float]]) -> str:
-     if not hits:
-         return ""
-     lines = []
-     for cid, _ in hits:
-         m = _meta.get(cid)
-         if not m:
-             continue
-         source_clean = m.get("source", "")
-         text_clean = (m.get("text", "") or "").replace("\n", " ")
-         lines.append(f"[{cid}] ({source_clean}) " + text_clean)
-     return "\n".join(lines[:10])
-
- def chat_fn(message, history):
-     model_name = SINGLE_MODEL_NAME
-
-     if _index is None or _index.ntotal == 0:
-         status = build_corpus_from_docs()
-         if not (_index and _index.ntotal > 0):
-             yield f"**Index Status:** {status}\n\nPlease ensure you have a 'docs' folder with PDF/TXT files and try again."
-             return
-
-     hits = retrieve_diverse(
-         message,
-         top_k=6,
-         pool_k=40,
-         per_source_cap=2,
-         strategy="mmr",
-         mmr_lambda=0.5,
-     )
-
-     ctx = _format_ctx(hits) if hits else "(Current index is empty or no matching chunks found)"
-     sys_blocks = [
-         "You are a research assistant who has an excellent factual understanding of the legal policies, regulations, and compliance of enterprises, governments, and global organizations. You are a research assistant who reads Legal papers and provides factual answers to queries. If you do not know the answer, you should convey that to the user instead of hallucinating. Answers must be based on retrieved content with evidence and source numbers cited. If retrieval is insufficient, please clearly explain the shortcomings. When answering, please cite the numbers, e.g., [3]"
-     ]
-     messages = [{"role": "system", "content": "\n\n".join(sys_blocks)}]
-     for u, a in history:
-         messages.append({"role": "user", "content": u})
-         messages.append({"role": "assistant", "content": a})
-     messages.append({"role": "user", "content": message})
-
-     try:
-         response = client.chat.completions.create(
-             model=model_name,
-             messages=messages,
-             temperature=GEN_TEMPERATURE,
-             top_p=GEN_TOP_P,
-             max_tokens=GEN_MAX_TOKENS,
-             stream=True,
-         )
-
-         partial_message = ""
-         for chunk in response:
-             if hasattr(chunk.choices[0], "delta") and chunk.choices[0].delta.content is not None:
-                 partial_message += chunk.choices[0].delta.content
-                 yield partial_message
-             elif hasattr(chunk.choices[0], "message") and chunk.choices[0].message.content is not None:
-                 partial_message += chunk.choices[0].message.content
-                 yield partial_message
-     except Exception as e:
-         yield f"[Exception] {repr(e)}"
-
- # Load the FAISS index and metadata from storage if they exist
- try:
-     _load_if_any()
- except Exception as e:
-     print(f"Notice: Could not load existing index. A new one will be created. Error: {e}")
-
- # Main execution block to configure and launch the Gradio app
- if __name__ == "__main__":
-
-     def chatbot_interface(user_message):
-         """
-         Adapter function to connect the stateless gr.Interface to the
-         streaming backend chat function.
-         """
-         history = []  # The new interface is stateless
-         gen = chat_fn(user_message, history)
-         final_response = ""
-         for chunk in gen:
-             final_response = chunk
-         return final_response
-
-     with gr.Blocks(theme=gr.themes.Default(primary_hue="sky")) as legalprodigy:
-         inputs = gr.Textbox(
-             lines=7,
-             label="LegalProdigy Query:",
-             placeholder="Try: Explain Arbitration Process"
-         )
-         outputs = gr.Textbox(lines=10,label="LegalProdigy Response:")
-
-         gr.Interface(
-             fn=chatbot_interface,
-             inputs=inputs,
-             outputs=outputs
-         )
-
-     legalprodigy.launch()
+ _L='faiss.index'
+ _K='meta.json'
+ _J='source'
+ _I='name'
+ _H='\x00'
+ _G=False
+ _F='\n'
+ _E='text'
+ _D='mmr'
+ _C='utf-8'
+ _B=None
+ _A=True
+ import os,io,json,pathlib,shutil
+ from typing import List,Tuple,Dict
+ import gradio as gr,numpy as np,faiss
  from sentence_transformers import SentenceTransformer
  from pypdf import PdfReader
+ import fitz
  from collections import defaultdict
  from openai import OpenAI
+ API_KEY=os.environ.get('API_KEY')
+ if not API_KEY:raise RuntimeError('Missing API_KEY (set it in Hugging Face: Settings → Variables and secrets).')
+ client=OpenAI(base_url='https://openrouter.ai/api/v1',api_key=API_KEY)
+ SINGLE_MODEL_NAME='deepseek/deepseek-r1:free'
+ GEN_TEMPERATURE=.2
+ GEN_TOP_P=.95
+ GEN_MAX_TOKENS=1024
+ EMB_MODEL_NAME='intfloat/multilingual-e5-base'
+ def choose_store_dir():
+  B='/data'
+  if os.path.isdir(B)and os.access(B,os.W_OK):
+   A=os.path.join(B,'rag_store')
+   try:
+    os.makedirs(A,exist_ok=_A);C=os.path.join(A,'.write_test')
+    with open(C,'w',encoding=_C)as D:D.write('ok')
+    os.remove(C);return A,_A
+   except Exception:pass
+  A=os.path.join(os.getcwd(),'store');os.makedirs(A,exist_ok=_A);return A,_G
+ STORE_DIR,IS_PERSISTENT=choose_store_dir()
+ META_PATH=os.path.join(STORE_DIR,_K)
+ INDEX_PATH=os.path.join(STORE_DIR,_L)
+ LEGACY_STORE_DIR=os.path.join(os.getcwd(),'store')
  def migrate_legacy_if_any():
+  try:
+   if IS_PERSISTENT:
+    A=os.path.join(LEGACY_STORE_DIR,_K);B=os.path.join(LEGACY_STORE_DIR,_L)
+    if(not os.path.exists(META_PATH)or not os.path.exists(INDEX_PATH))and os.path.isdir(LEGACY_STORE_DIR)and os.path.exists(A)and os.path.exists(B):shutil.copyfile(A,META_PATH);shutil.copyfile(B,INDEX_PATH)
+  except Exception:pass
  migrate_legacy_if_any()
+ _emb_model=_B
+ _index=_B
+ _meta={}
+ DEFAULT_TOP_K=6
+ DEFAULT_POOL_K=40
+ DEFAULT_PER_SOURCE_CAP=2
+ DEFAULT_STRATEGY=_D
+ DEFAULT_MMR_LAMBDA=.5
  def get_emb_model():
+  global _emb_model
+  if _emb_model is _B:_emb_model=SentenceTransformer(EMB_MODEL_NAME)
+  return _emb_model
+ def _ensure_index(dim):
+  global _index
+  if _index is _B:_index=faiss.IndexFlatIP(dim)
  def _persist():
+  faiss.write_index(_index,INDEX_PATH)
+  with open(META_PATH,'w',encoding=_C)as A:json.dump(_meta,A,ensure_ascii=_G)
  def _load_if_any():
+  global _index,_meta
+  if os.path.exists(INDEX_PATH)and os.path.exists(META_PATH):
+   _index=faiss.read_index(INDEX_PATH)
+   with open(META_PATH,'r',encoding=_C)as A:_meta=json.load(A)
+ def _chunk_text(text,chunk_size=800,overlap=120):
+  A=text;A=A.replace(_H,'');E,B,C=[],0,len(A)
+  while B<C:
+   D=min(B+chunk_size,C);F=A[B:D].strip()
+   if F:E.append(F)
+   B=max(0,D-overlap)
+   if D>=C:break
+  return E
+ def _read_bytes(file):
+  D='data';A=file
+  if isinstance(A,dict):
+   B=A.get('path')or A.get(_I)
+   if B and os.path.exists(B):
+    with open(B,'rb')as C:return C.read()
+   if D in A and isinstance(A[D],(bytes,bytearray)):return bytes(A[D])
+  if isinstance(A,(str,pathlib.Path)):
+   with open(A,'rb')as C:return C.read()
+  if hasattr(A,'read'):
+   try:
+    if hasattr(A,'seek'):
+     try:A.seek(0)
+     except Exception:pass
+    return A.read()
+   finally:
+    try:A.close()
+    except Exception:pass
+  raise ValueError('Unsupported file type from gr.File')
+ def _decode_best_effort(raw):
+  for A in[_C,'cp932','shift_jis','cp950','big5','gb18030','latin-1']:
+   try:return raw.decode(A)
+   except Exception:continue
+  return raw.decode(_C,errors='ignore')
+ def _read_pdf(file_bytes):
+  C=file_bytes
+  try:
+   with fitz.open(stream=C,filetype='pdf')as A:
+    if A.is_encrypted:
+     try:A.authenticate('')
+     except Exception:pass
+    E=[A.get_text(_E)or''for A in A];D=_F.join(E)
+    if D.strip():return D
+  except Exception:pass
+  try:
+   F=PdfReader(io.BytesIO(C));B=[]
+   for G in F.pages:
+    try:B.append(G.extract_text()or'')
+    except Exception:B.append('')
+   return _F.join(B)
+  except Exception:return''
+ def _read_any(file):
+  D='upload';A=file
+  if isinstance(A,dict):B=(A.get('orig_name')or A.get(_I)or A.get('path')or D).lower()
+  else:B=getattr(A,_I,_B)or(str(A)if isinstance(A,(str,pathlib.Path))else D)
+  B=B.lower();C=_read_bytes(A)
+  if B.endswith('.pdf'):return _read_pdf(C).replace(_H,'')
+  return _decode_best_effort(C).replace(_H,'')
+ DOCS_DIR=os.path.join(os.getcwd(),'docs')
+ def get_docs_files():
+  if not os.path.isdir(DOCS_DIR):return[]
+  A=[]
+  for B in os.listdir(DOCS_DIR):
+   if B.lower().endswith(('.pdf','.txt')):A.append(os.path.join(DOCS_DIR,B))
+  return A
  def build_corpus_from_docs():
+  global _index,_meta;C=get_docs_files()
+  if not C:return'No files found in docs folder.'
+  J=get_emb_model();A,F,B=[],[],[];_index=_B;_meta={}
+  for G in C:
+   D=os.path.basename(G)
+   try:
+    K=_read_any(G)or'';E=_chunk_text(K)
+    if not E:B.append(D);continue
+    A.extend(E);F.extend([D]*len(E))
+   except Exception:B.append(D)
+  if not A:return'No text extracted from docs.'
+  L=[f"passage: {A}"for A in A];H=J.encode(L,batch_size=64,convert_to_numpy=_A,normalize_embeddings=_A);_ensure_index(H.shape[1]);_index.add(H)
+  for(M,(N,O))in enumerate(zip(F,A)):_meta[str(M)]={_J:N,_E:O}
+  _persist();I=f"Indexed {len(A)} chunks from {len(C)} files."
+  if B:I+=f" Failed files: {', '.join(B)}"
+  return I
+ def _encode_query_vec(query):return get_emb_model().encode([f"query: {query}"],convert_to_numpy=_A,normalize_embeddings=_A)
+ def retrieve_candidates(qvec,pool_k=40):
+  A=pool_k
+  if _index is _B or _index.ntotal==0:return[]
+  A=min(A,_index.ntotal);B,C=_index.search(qvec,A);return[(str(A),float(B))for(A,B)in zip(C[0],B[0])if A!=-1]
+ def select_diverse_by_source(cands,top_k=6,per_source_cap=2):
+  F=cands;D=top_k
+  if not F:return[]
+  B=defaultdict(list)
+  for(C,G)in F:
+   I=_meta.get(C)
+   if not I:continue
+   B[I[_J]].append((C,G))
+  for E in B:B[E]=B[E][:per_source_cap]
+  A,N,J=[],[(A,B)for(A,B)in B.items()],{A:0 for A in B}
+  while len(A)<D:
+   K=_G
+   for(E,L)in N:
+    H=J[E]
+    if H<len(L):A.append(L[H]);J[E]=H+1;K=_A
+    if len(A)>=D:break
+   if not K:break
+  if len(A)<D:
+   M={A for(A,B)in A}
+   for(C,G)in F:
+    if C not in M:
+     A.append((C,G));M.add(C)
+     if len(A)>=D:break
+  return A[:D]
+ def _encode_chunks_text(cids):A=[f"passage: {(_meta.get(A)or{}).get(_E,'')}"for A in cids];return get_emb_model().encode(A,convert_to_numpy=_A,normalize_embeddings=_A)
+ def select_diverse_mmr(cands,qvec,top_k=6,mmr_lambda=.5):
+  H=mmr_lambda;G=top_k;F=cands
+  if not F:return[]
+  C=[A for(A,B)in F];D=_encode_chunks_text(C);E=(D@qvec.T).reshape(-1);A,B=[],set(range(len(C)))
+  while len(A)<min(G,len(C)):
+   if not A:I=int(np.argmax(E));A.append(I);B.remove(I);continue
+   L=D[A];J=D[list(B)]@L.T;M=J.max(axis=1)if J.size>0 else np.zeros((len(B),),dtype=np.float32);N=E[list(B)];O=H*N-(1.-H)*M;P=int(np.argmax(O));K=list(B)[P];A.append(K);B.remove(K)
+  return[(C[A],float(E[A]))for A in A][:G]
+ def retrieve_diverse(query,top_k=6,pool_k=40,per_source_cap=2,strategy=_D,mmr_lambda=.5):
+  A=top_k;B=_encode_query_vec(query);C=retrieve_candidates(B,pool_k=pool_k)
+  if strategy==_D:return select_diverse_mmr(C,B,top_k=A,mmr_lambda=mmr_lambda)
+  return select_diverse_by_source(C,top_k=A,per_source_cap=per_source_cap)
+ def _format_ctx(hits):
+  if not hits:return''
+  B=[]
+  for(C,F)in hits:
+   A=_meta.get(C)
+   if not A:continue
+   D=A.get(_J,'');E=(A.get(_E,'')or'').replace(_F,' ');B.append(f"[{C}] ({D}) "+E)
+  return _F.join(B[:10])
+ def chat_fn(message,history):
+  H='user';F=message;E='content';D='role';I=SINGLE_MODEL_NAME
+  if _index is _B or _index.ntotal==0:
+   J=build_corpus_from_docs()
+   if not(_index and _index.ntotal>0):yield f"**Index Status:** {J}\n\nPlease ensure you have a 'docs' folder with PDF/TXT files and try again.";return
+  G=retrieve_diverse(F,top_k=6,pool_k=40,per_source_cap=2,strategy=_D,mmr_lambda=.5);P=_format_ctx(G)if G else'(Current index is empty or no matching chunks found)';K=['You are a research assistant who has an excellent factual understanding of the legal policies, regulations, and compliance of enterprises, governments, and global organizations. You are a research assistant who reads Legal papers and provides factual answers to queries. If you do not know the answer, you should convey that to the user instead of hallucinating. Answers must be based on retrieved content with evidence and source numbers cited. If retrieval is insufficient, please clearly explain the shortcomings. When answering, please cite the numbers, e.g., [3]'];B=[{D:'system',E:'\n\n'.join(K)}]
+  for(L,M)in history:B.append({D:H,E:L});B.append({D:'assistant',E:M})
+  B.append({D:H,E:F})
+  try:
+   N=client.chat.completions.create(model=I,messages=B,temperature=GEN_TEMPERATURE,top_p=GEN_TOP_P,max_tokens=GEN_MAX_TOKENS,stream=_A);C=''
+   for A in N:
+    if hasattr(A.choices[0],'delta')and A.choices[0].delta.content is not _B:C+=A.choices[0].delta.content;yield C
+    elif hasattr(A.choices[0],'message')and A.choices[0].message.content is not _B:C+=A.choices[0].message.content;yield C
+  except Exception as O:yield f"[Exception] {repr(O)}"
+ try:_load_if_any()
+ except Exception as e:print(f"Notice: Could not load existing index. A new one will be created. Error: {e}")
+ if __name__=='__main__':
+  def chatbot_interface(user_message):
+   '\n Adapter function to connect the stateless gr.Interface to the \n streaming backend chat function.\n ';B=[];C=chat_fn(user_message,B);A=''
+   for D in C:A=D
+   return A
+  with gr.Blocks(theme=gr.themes.Default(primary_hue='sky'))as legalprodigy:inputs=gr.Textbox(lines=7,label='LegalProdigy Query:',placeholder='Try: Explain Arbitration Process');outputs=gr.Textbox(lines=10,label='LegalProdigy Response:');gr.Interface(fn=chatbot_interface,inputs=inputs,outputs=outputs)
+  legalprodigy.launch()