jayasrees commited on
Commit
9d21edd
·
1 Parent(s): d60dbcf

first commit

Browse files
.gitignore ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # OS / editor
2
+ .DS_Store
3
+ Thumbs.db
4
+ .idea/
5
+ .vscode/
6
+
7
+ # Python
8
+ __pycache__/
9
+ *.py[cod]
10
+ *$py.class
11
+ .pytest_cache/
12
+ .mypy_cache/
13
+ .ruff_cache/
14
+ .coverage
15
+ .coverage.*
16
+ htmlcov/
17
+ dist/
18
+ build/
19
+ *.egg-info/
20
+
21
+ # Virtual environments
22
+ venv/
23
+ .venv/
24
+ backend/.venv/
25
+ ENV/
26
+ env/
27
+
28
+ # Environment files
29
+ .env
30
+ .env.*
31
+ !.env.example
32
+
33
+ # Logs
34
+ *.log
35
+ logs/
36
+
37
+ # Databases / local state
38
+ *.db
39
+ *.sqlite
40
+ *.sqlite3
41
+
42
+ # Runtime artifacts
43
+ output/
44
+ tmp/
45
+ *.tmp
46
+
47
+ # Frontend
48
+ node_modules/
49
+ .next/
50
+ coverage/
51
+
52
+ # Local model/checkpoint artifacts (large)
53
+ merged_tinyllama_instruction/
54
+ *.bin
55
+ *.pt
56
+ *.ckpt
57
+ *.safetensors
58
+
analysis/common_analyzer.py ADDED
@@ -0,0 +1,288 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Strict Domain Analyzer for Legal Documents.
3
+ Implements specific checks for:
4
+ - Entity Roles (Vendor vs Vendee)
5
+ - Domain Categories (Financial, Possession, Ownership, etc.)
6
+ - Timeline Logic (Agreement vs Registration)
7
+ - Numeric Consistency within context
8
+ """
9
+
10
+ import re
11
+
12
+ # =========================
13
+ # 1. STRICT CLASSIFICATION
14
+ # =========================
15
+
16
def is_legal_boilerplate(text):
    """Return True for standard legal headers, footers, and witness blocks."""
    lowered = text.lower()

    # Signature / witness blocks are always treated as boilerplate.
    if "signed by" in lowered or "witness" in lowered:
        return True

    boilerplate_markers = (
        "in witness whereof", "signed and delivered", "witnesses:",
        "schedule", "jurisdiction", "arbitration", "notice",
        "all that piece and parcel", "north by", "south by",
    )
    # Very short fragments (< 5 words) mentioning a marker are headers/footers.
    is_short_fragment = len(lowered.split()) < 5
    return is_short_fragment and any(m in lowered for m in boilerplate_markers)
34
+
35
def get_clause_domain(text):
    """
    Classify a clause into a strict legal domain.

    Returns one of: 'RECITAL', 'DEFINITION', 'FINANCIAL', 'POSSESSION',
    'OWNERSHIP', 'ENCUMBRANCE', 'ADMINISTRATIVE', 'OPERATIVE', 'GENERAL'.
    Checks run in priority order; the first match wins.
    """
    lowered = text.lower()

    # 1. RECITAL (background) — "whereas" preamble.
    if lowered.startswith("whereas") or "and whereas" in lowered:
        return "RECITAL"

    # 2. DEFINITION — interpretation clauses.
    if "shall mean" in lowered or "expression vendor" in lowered or "expression vendee" in lowered:
        return "DEFINITION"

    # 3-7. Keyword-driven domains, checked in priority order.
    keyword_domains = (
        ("FINANCIAL", ("rs.", "rupees", "paid", "consideration", "sum of",
                       "amount", "price", "cheque", "bank")),
        ("POSSESSION", ("possession", "handed over", "delivered", "vacant")),
        ("OWNERSHIP", ("owner", "title", "interest", "rights", "absolute", "fee simple")),
        ("ENCUMBRANCE", ("encumbrance", "mortgage", "loan", "charge", "lien", "litigation")),
        ("ADMINISTRATIVE", ("witness", "signed", "schedule", "jurisdiction",
                            "arbitration", "notice")),
    )
    for domain, keywords in keyword_domains:
        if any(keyword in lowered for keyword in keywords):
            return domain

    # 8. OPERATIVE — the clause carries the action of the deed.
    if lowered.startswith("that") or "hereby" in lowered or "now this deed" in lowered:
        return "OPERATIVE"

    return "GENERAL"
75
+
76
def get_entities(text):
    """Return the set of party roles ('Vendor'/'Vendee') named in the clause."""
    lowered = text.lower()
    return {role for role in ("Vendor", "Vendee") if role.lower() in lowered}
85
+
86
+ # =========================
87
+ # 2. EXTRACTION HELPERS
88
+ # =========================
89
+
90
def extract_numbers(text):
    """
    Extract integer values from text for numeric comparison.

    Handles plain integers ("500", "12345") as well as comma-grouped
    amounts in both Western ("1,234,567") and Indian lakh/crore
    ("5,50,000") digit grouping; commas are stripped before conversion.

    FIX: the previous pattern \\b\\d{1,3}(?:,\\d{3})*\\b could not match
    Indian 2-digit comma groups (splitting "5,50,000" into 5/50/0) and
    silently skipped plain numbers longer than 3 digits.
    """
    # \d+ accepts plain numbers of any length; ,\d{2,3} accepts both
    # Indian (2-digit) and Western (3-digit) comma groups.
    return [int(n.replace(",", "")) for n in re.findall(r'\b\d+(?:,\d{2,3})*\b', text)]
94
+
95
def has_negation(text):
    """
    Return True when the clause contains a whole-word negation.

    FIX: the previous version used substring containment, so any word
    containing "no"/"not" ("notice", "notwithstanding", "now", "another")
    was wrongly flagged as a negation. Word boundaries fix that.
    "must not" / "shall not" are still covered by the standalone "not".
    """
    return re.search(r"\b(?:not|never|no|cannot)\b", text.lower()) is not None
98
+
99
def has_exception_language(text):
    """Return True when the clause uses legal exception/qualification wording."""
    lowered = text.lower()
    for qualifier in (
        "subject to", "notwithstanding", "except as provided",
        "unless otherwise", "provided however", "without prejudice",
    ):
        if qualifier in lowered:
            return True
    return False
106
+
107
def is_definition(text):
    """Return True when the clause reads as a definition clause."""
    lowered = text.lower()
    definition_markers = ("shall mean", "means", "defined as")
    return any(marker in lowered for marker in definition_markers)
113
+
114
def is_party_intro(text):
    """
    Return True when a clause is merely a party description (a "bio").

    A clause counts as a bio when at least two of three signal groups
    are present: an address fragment, a family relation, and an ID.
    """
    lowered = text.lower()

    signal_patterns = (
        # Address fragments: "Door No", "D.No", "residing at", ...
        r"(door\s*no|d\.no|residing\s*at|post\s*,\s*village)",
        # Family relations: "son of", "w/o", "s/o", ...
        r"\b(son|wife|daughter|husband|father|mother|s/o|w/o|d/o)\b",
        # Identity documents / contact details.
        r"(aadhaar|pan\s*no|id\s*card|mobile\s*no)",
    )
    hits = sum(1 for pattern in signal_patterns if re.search(pattern, lowered))
    return hits >= 2
140
+
141
+ # =========================
142
+ # 3. CORE LOGIC GATES
143
+ # =========================
144
+
145
def analyze_pair(text1, text2, similarity, threshold=0.75):
    """
    Strict rule-based analyzer returning (Label, Score, Reason).

    Runs a sequence of gates: skip filters (boilerplate, domain/entity
    mismatch, party bios, definitions, possession timelines) that
    short-circuit with (None, 0.0, reason), then positive checks
    (numeric mismatch, legal qualification, negation conflict) and
    finally a similarity threshold promoting the pair to an NLI
    candidate.

    Args:
        text1, text2: clause texts to compare.
        similarity: pre-computed similarity score in [0, 1].
        threshold: minimum similarity to consider as CANDIDATE (default 0.75).
    """
    # --- GATE 0: BOILERPLATE CHECK ---
    if is_legal_boilerplate(text1) or is_legal_boilerplate(text2):
        return None, 0.0, "Boilerplate (Skipped)"

    # --- GATE 1: DOMAIN MISMATCH ---
    d1 = get_clause_domain(text1)
    d2 = get_clause_domain(text2)
    t1_lower, t2_lower = text1.lower(), text2.lower()

    if d1 != "GENERAL" and d2 != "GENERAL" and d1 != d2:
        # RELAXATION: only bypass when similarity is VERY high (which
        # suggests misclassification); otherwise never compare apples
        # (Financial) to oranges (Possession), even in Deep Search mode.
        if similarity < 0.85:
            return None, 0.0, "Domain Mismatch"

    # --- HARDENED CHECK: GENERAL vs SPECIFIC ---
    # Common noise source: "Any other details" matching "The price is Rs 100".
    if (d1 == "GENERAL") != (d2 == "GENERAL"):
        if similarity < 0.80:
            return None, 0.0, "General vs Specific Domain (Skipped)"

    # --- SPECIFIC FILTER: MONEY vs TIMELINE ---
    # Prevents "Price is X" vs "Payment due on Date Y" false positives.
    is_financial = d1 == "FINANCIAL" or d2 == "FINANCIAL"
    date_pattern = r"\d{1,2}[./-]\d{1,2}[./-]\d{2,4}"
    has_date = re.search(date_pattern, text1) or re.search(date_pattern, text2)
    if is_financial and has_date:
        # Unless explicitly about a payment "schedule", a price clause and
        # a dated clause are likely about different things.
        if "schedule" not in t1_lower and "schedule" not in t2_lower:
            if similarity < 0.85:
                return None, 0.0, "Financial vs Timeline Mismatch"

    # --- SPECIFIC FILTER: ELIGIBILITY vs ASSISTANCE ---
    # Eligibility criteria vs assistance details are distinct sections.
    eligibility_words = ["eligible", "qualify", "criteria", "requirement"]
    assistance_words = ["provide", "grant", "subsidy", "support", "assistance"]
    is_eligibility = any(w in t1_lower for w in eligibility_words) or \
        any(w in t2_lower for w in eligibility_words)
    is_assistance = any(w in t1_lower for w in assistance_words) or \
        any(w in t2_lower for w in assistance_words)
    if is_eligibility and is_assistance:
        if similarity < 0.85:
            return None, 0.0, "Eligibility vs Assistance Mismatch"

    # --- GATE 1.5: PARTY DESCRIPTION CHECK ---
    # Two person-description clauses (addresses, relations) are not conflicts.
    if is_party_intro(text1) and is_party_intro(text2):
        return None, 0.0, "Party Description (Skipped)"

    # --- GATE 2: ENTITY MISMATCH ---
    e1 = get_entities(text1)
    e2 = get_entities(text2)
    # Vendor-only vs Vendee-only clauses talk about different parties.
    if e1 and e2 and e1 != e2 and not (e1 & e2):
        if similarity < 0.85:
            return None, 0.0, "Entity Role Mismatch"

    # --- GATE 2.5: DEFINITION GUARD ---
    # Only compare definitions with definitions (conflicting definitions).
    if is_definition(text1) or is_definition(text2):
        if not (is_definition(text1) and is_definition(text2)):
            return None, 0.0, "Definition vs Operative"

    # --- GATE 3: POSSESSION TIMELINE ---
    # "Possession at agreement" vs "possession at registration" is a
    # normal sequence of events, NOT a contradiction.
    if d1 == "POSSESSION" and d2 == "POSSESSION":
        keywords_a = ["agreement", "earnest"]
        keywords_b = ["registration", "sale deed", "final"]
        a_in_1 = any(k in t1_lower for k in keywords_a)
        b_in_1 = any(k in t1_lower for k in keywords_b)
        a_in_2 = any(k in t2_lower for k in keywords_a)
        b_in_2 = any(k in t2_lower for k in keywords_b)
        # FIX: the original only detected (agreement in text1, registration
        # in text2); both orderings of the sequence are now recognized.
        if (a_in_1 and b_in_2) or (a_in_2 and b_in_1):
            return None, 0.0, "Possession Timeline Sequence"

    # --- GATE 4: NUMERIC REASONING ---
    nums1 = extract_numbers(text1)
    nums2 = extract_numbers(text2)
    if nums1 and nums2 and nums1 != nums2:
        # MAGNITUDE CHECK: values >100x apart usually mean different
        # units (e.g. price vs area), not a contradiction.
        max1, max2 = max(nums1), max(nums2)
        if max1 > 0 and max2 > 0:
            ratio = max1 / max2 if max1 > max2 else max2 / max1
            if ratio > 100:
                return None, 0.0, "Numeric Magnitude Mismatch (Likely Unit Diff)"

        # Same non-general domain: a valid numeric comparison.
        if d1 == d2 and d1 != "GENERAL":
            return "NUMERIC_INCONSISTENCY", 0.9, f"Mismatch in {d1} values"

        # General domain: only flag when the contexts are nearly identical.
        if similarity > 0.9:
            return "NUMERIC_INCONSISTENCY", 0.85, "Numeric Mismatch in similar context"

    # --- GATE 4.5: EXCEPTION/HIERARCHY CHECK ---
    # Slightly lower threshold so qualifications are caught conservatively.
    exception_threshold = max(0.65, threshold - 0.05)
    if similarity > exception_threshold:
        has_ex1 = has_exception_language(text1)
        has_ex2 = has_exception_language(text2)
        if has_ex1 != has_ex2:
            return "QUALIFICATION", similarity, "Legal Exception/Qualification detected (Not a Conflict)"

    # --- GATE 5: LOGICAL NEGATION ---
    # Negation conflict requires high confidence the clauses are related.
    if has_negation(text1) != has_negation(text2):
        if similarity > 0.85:
            return "LEGAL_CONFLICT", 0.8, "Logical Negation detected"

    # --- FINAL GATE: CANDIDATE FOR NLI ---
    if similarity > threshold:
        return "CANDIDATE", similarity, "High Similarity - Pending NLI"

    return None, 0.0, "Low Similarity"
analysis/consistency_check.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
def check_inconsistency(text1, text2):
    """Return True when both clauses contain modal obligation keywords."""
    modals = ("shall", "must", "may")

    def _has_modal(text):
        lowered = text.lower()
        return any(m in lowered for m in modals)

    return _has_modal(text1) and _has_modal(text2)
analysis/contradiction_check.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import re

# Pre-compiled pattern for INR currency amounts like "INR 1,00,000".
_INR_AMOUNT_RE = re.compile(r'INR\s*([\d,]+)')


def extract_number(text):
    """Return the first INR amount in *text* as an int, or None."""
    found = _INR_AMOUNT_RE.search(text)
    return int(found.group(1).replace(",", "")) if found else None


def numeric_contradiction(text1, text2):
    """True when both texts carry INR amounts and the amounts differ."""
    first, second = extract_number(text1), extract_number(text2)
    return first is not None and second is not None and first != second


def ownership_contradiction(text1, text2):
    """True when one text forbids ownership while the other allows eligibility."""
    pair = (text1.lower(), text2.lower())
    for forbids, allows in (pair, pair[::-1]):
        if "must not own" in forbids and "may be eligible" in allows:
            return True
    return False


def check_contradiction(text1, text2):
    """Top-level contradiction check combining numeric and ownership rules."""
    return numeric_contradiction(text1, text2) or ownership_contradiction(text1, text2)
analysis/duplication_check.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
def check_duplication(similarity, threshold=0.90):
    """Return True when the similarity score reaches the duplication threshold."""
    return not similarity < threshold
analysis/llama_legal_verifier.py ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ from typing import Tuple
4
+
5
+ import torch
6
+ from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
7
+
8
+
9
class LlamaLegalVerifier:
    """
    Verifies whether two legal clauses are contradictory, entailing, or neutral
    using a local fine-tuned causal language model.
    """

    def __init__(self, model_path: str):
        """Load tokenizer/model from a local directory and build the pipeline.

        Raises FileNotFoundError when the checkpoint directory is missing.
        """
        if not os.path.isdir(model_path):
            raise FileNotFoundError(f"Model path not found: {model_path}")

        self.model_path = model_path
        # transformers pipeline device convention: 0 = first GPU, -1 = CPU.
        self.device = 0 if torch.cuda.is_available() else -1
        dtype = torch.float16 if torch.cuda.is_available() else torch.float32

        tokenizer = AutoTokenizer.from_pretrained(model_path, local_files_only=True)
        model = AutoModelForCausalLM.from_pretrained(
            model_path,
            local_files_only=True,
            torch_dtype=dtype,
        )
        # Causal LMs often ship without a pad token; reuse EOS for padding.
        if tokenizer.pad_token_id is None:
            tokenizer.pad_token_id = tokenizer.eos_token_id

        self.generator = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
            device=self.device,
        )

    @staticmethod
    def _parse_label(text: str) -> str:
        """Map free-form model output to one of the three NLI labels.

        Defaults to "Neutral" when the model answers off-format.
        """
        lowered = text.lower()
        if "contradiction" in lowered:
            return "Contradiction"
        if "entailment" in lowered or "duplicate" in lowered or "same meaning" in lowered:
            return "Entailment"
        if "neutral" in lowered:
            return "Neutral"
        return "Neutral"

    @staticmethod
    def _parse_confidence(text: str) -> float:
        """Extract the first 0..1 float from the output; fall back to 0.60.

        The lookaround guards prevent matching digits inside larger numbers.
        """
        matches = re.findall(r"(?<!\d)(0(?:\.\d+)?|1(?:\.0+)?)(?!\d)", text)
        if matches:
            try:
                value = float(matches[0])
                # Clamp defensively to the [0, 1] range.
                return max(0.0, min(1.0, value))
            except ValueError:
                return 0.60
        return 0.60

    @staticmethod
    def _parse_reason(text: str) -> str:
        """Pull the 'Reason: ...' tail (capped at 300 chars), else the raw text."""
        m = re.search(r"reason\s*:\s*(.+)", text, flags=re.IGNORECASE | re.DOTALL)
        if m:
            return m.group(1).strip()[:300]
        return text.strip()[:300]

    def predict(self, text1: str, text2: str) -> Tuple[bool, float, str, str]:
        """Classify a clause pair.

        Returns (is_contradiction, confidence, label, reason); a pair is a
        contradiction only when the parsed label is "Contradiction" and the
        parsed confidence reaches 0.50.
        """
        prompt = f"""You are a legal NLI verifier.
Classify relationship between Clause A and Clause B.
Allowed labels: Contradiction, Entailment, Neutral.
Return exactly in this format:
Label: <Contradiction|Entailment|Neutral>
Confidence: <0.00-1.00>
Reason: <one short legal reason>

Clause A: {text1}
Clause B: {text2}
"""

        # Greedy decoding (do_sample=False) keeps the verdict deterministic.
        output = self.generator(
            prompt,
            max_new_tokens=96,
            do_sample=False,
            return_full_text=False,
            pad_token_id=self.generator.tokenizer.eos_token_id,
        )[0]["generated_text"]

        label = self._parse_label(output)
        confidence = self._parse_confidence(output)
        reason = self._parse_reason(output)
        is_contradiction = label == "Contradiction" and confidence >= 0.50
        return is_contradiction, confidence, label, reason
analysis/nli_validator.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from transformers import pipeline

# Load the MNLI classifier once at import time (slow only the first time,
# when weights are downloaded/cached).
nli_pipeline = pipeline(
    "text-classification",
    model="roberta-large-mnli",
    device=-1  # CPU
)

def nli_contradiction(text1, text2, threshold=0.8):
    """
    Returns True if NLI model strongly predicts contradiction
    """
    # Join the clauses with the RoBERTa sentence-pair separator so the
    # classifier sees them as premise/hypothesis.
    input_text = f"{text1} </s></s> {text2}"
    result = nli_pipeline(input_text)[0]

    # result is a {"label": ..., "score": ...} dict; require both the
    # CONTRADICTION label and a score at or above the threshold.
    return (
        result["label"] == "CONTRADICTION" and
        result["score"] >= threshold
    )
analysis/nli_verifier.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import numpy as np
3
+ from sentence_transformers import CrossEncoder
4
+ from huggingface_hub import login
5
+
6
class NLIVerifier:
    """Sentence-pair NLI verifier backed by a sentence-transformers CrossEncoder."""

    def __init__(self, model_name="cross-encoder/nli-distilroberta-base", hf_token=None):
        """
        Initialize the NLI model using CrossEncoder.

        Model-load failures are swallowed (self.model stays None) so that
        predict() can degrade gracefully instead of raising at startup.
        """
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Loading NLI Model ({self.device})...")

        if hf_token:
            try:
                login(token=hf_token)
                print("Logged in to Hugging Face.")
            except Exception as e:
                # Login is best-effort; public models load without it.
                print(f"HF Login Warning: {e}")

        try:
            self.model = CrossEncoder(model_name, device=self.device)
            print("NLI Model Loaded Successfully.")
        except Exception as e:
            # Keep the object usable; predict() will report "Model Error".
            print(f"Error loading model: {e}")
            self.model = None

        # Label mapping for cross-encoder/nli-distilroberta-base
        # 0: Contradiction
        # 1: Entailment
        # 2: Neutral
        self.labels = ["Contradiction", "Entailment", "Neutral"]

    def predict(self, text1, text2):
        """
        Verify if text1 and text2 contradict each other.
        Returns: (IsContradiction: bool, Confidence: float, Label: str)
        """
        if not self.model:
            return False, 0.0, "Model Error"

        # CrossEncoder returns raw logits for the three classes.
        scores = self.model.predict([(text1, text2)])[0]

        # Softmax over the logits to obtain class probabilities.
        exp_scores = np.exp(scores)
        probs = exp_scores / np.sum(exp_scores)

        pred_label_idx = probs.argmax()
        confidence = probs[pred_label_idx]
        label = self.labels[pred_label_idx]

        # Contradiction (index 0) must win with > 0.5 probability.
        is_contradiction = (pred_label_idx == 0 and confidence > 0.5)

        return is_contradiction, float(confidence), label
analysis/similarity_search.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
def get_similar(index, vector, k=5):
    """Query *index* (FAISS-style) for the *k* nearest neighbours of *vector*.

    Returns (indices, distances) for the single query row.
    """
    query = vector.reshape(1, -1)
    distances, indices = index.search(query, k)
    return indices[0], distances[0]
auth/user_store.py ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import hashlib
2
+ import os
3
+ import secrets
4
+ import sqlite3
5
+ from pathlib import Path
6
+ from typing import Tuple
7
+
8
+
9
+ PROJECT_ROOT = Path(__file__).resolve().parents[1]
10
+ DATA_DIR = PROJECT_ROOT / "data"
11
+ DB_PATH = DATA_DIR / "users.db"
12
+
13
+
14
def _ensure_db() -> None:
    """Create the data directory and the users table if they do not exist.

    Idempotent: safe to call before every read/write operation.
    """
    DATA_DIR.mkdir(parents=True, exist_ok=True)
    conn = sqlite3.connect(DB_PATH)
    try:
        conn.execute(
            """
            CREATE TABLE IF NOT EXISTS users (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                username TEXT UNIQUE NOT NULL,
                password_hash TEXT NOT NULL,
                salt TEXT NOT NULL,
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            )
            """
        )
        conn.commit()
    finally:
        # Always release the connection handle, even if DDL fails.
        conn.close()
32
+
33
+
34
+ def _hash_password(password: str, salt_hex: str) -> str:
35
+ salt = bytes.fromhex(salt_hex)
36
+ digest = hashlib.pbkdf2_hmac("sha256", password.encode("utf-8"), salt, 120_000)
37
+ return digest.hex()
38
+
39
+
40
+ def _normalize_username(username: str) -> str:
41
+ return username.strip().lower()
42
+
43
+
44
def create_user(username: str, password: str) -> Tuple[bool, str]:
    """
    Register a new user with a salted PBKDF2 password hash.

    Returns (success, message); fails on short username/password or a
    duplicate username.
    """
    _ensure_db()
    normalized = _normalize_username(username)

    # Basic input validation before touching the database.
    if len(normalized) < 3:
        return False, "Username must be at least 3 characters."
    if len(password) < 8:
        return False, "Password must be at least 8 characters."

    # Fresh random salt per account.
    salt_hex = secrets.token_hex(16)
    hashed = _hash_password(password, salt_hex)

    conn = sqlite3.connect(DB_PATH)
    try:
        conn.execute(
            "INSERT INTO users (username, password_hash, salt) VALUES (?, ?, ?)",
            (normalized, hashed, salt_hex),
        )
        conn.commit()
    except sqlite3.IntegrityError:
        # UNIQUE constraint on username.
        return False, "Username already exists."
    finally:
        conn.close()
    return True, "Account created successfully."
68
+
69
+
70
def authenticate_user(username: str, password: str) -> Tuple[bool, str]:
    """
    Verify a username/password pair against the stored PBKDF2 hash.

    Returns (success, message).
    """
    _ensure_db()
    normalized = _normalize_username(username)

    conn = sqlite3.connect(DB_PATH)
    try:
        row = conn.execute(
            "SELECT password_hash, salt FROM users WHERE username = ?",
            (normalized,),
        ).fetchone()
    finally:
        conn.close()

    if not row:
        return False, "User not found."

    stored_hash, salt_hex = row
    candidate_hash = _hash_password(password, salt_hex)
    # FIX: use a constant-time comparison so the check does not leak
    # matching-prefix information through timing differences.
    if not secrets.compare_digest(candidate_hash, stored_hash):
        return False, "Incorrect password."
    return True, "Login successful."
backend/README.md ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Backend (Flask + SQLite)
2
+
3
+ ## Setup
4
+
5
+ ```bash
6
+ cd backend
7
+ python3 -m venv .venv
8
+ source .venv/bin/activate
9
+ pip install -r requirements.txt
10
+ python app.py
11
+ ```
12
+
13
+ Server runs on `http://127.0.0.1:5000`.
14
+
15
+ ## APIs
16
+
17
+ - `GET /api/health`
18
+ - `POST /api/register`
19
+ - `POST /api/login`
20
+ - `POST /api/analyze` (multipart form: `file`, `scanMode`)
21
+
22
+ SQLite database file is created at `backend/app.db`.
backend/app.py ADDED
@@ -0,0 +1,717 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import io
4
+ import os
5
+ import sqlite3
6
+ import sys
7
+ from difflib import SequenceMatcher
8
+ from datetime import datetime, timezone
9
+ from pathlib import Path
10
+
11
+ from flask import Flask, jsonify, request
12
+ from flask_cors import CORS
13
+ from werkzeug.security import check_password_hash, generate_password_hash
14
+
15
+ BASE_DIR = Path(__file__).resolve().parent
16
+ PROJECT_ROOT = BASE_DIR.parent
17
+ DB_PATH = Path(os.getenv("DB_PATH", BASE_DIR / "app.db"))
18
+
19
+ app = Flask(__name__)
20
+ CORS(app)
21
+
22
+
23
def _bootstrap_site_packages() -> None:
    """
    Make backend resilient when dependencies are split across:
    - project venv site-packages
    - user local site-packages (~/.local)
    """
    version_tag = f"python{sys.version_info.major}.{sys.version_info.minor}"
    candidates = (
        PROJECT_ROOT / "venv" / "lib" / version_tag / "site-packages",
        Path.home() / ".local" / "lib" / version_tag / "site-packages",
    )
    for candidate in candidates:
        as_str = str(candidate)
        # Only append existing directories that are not already importable.
        if candidate.exists() and as_str not in sys.path:
            sys.path.append(as_str)
+
39
+
40
+ _bootstrap_site_packages()
41
+
42
+
43
def get_db_connection() -> sqlite3.Connection:
    """Open a SQLite connection to the app database with dict-like row access."""
    connection = sqlite3.connect(DB_PATH)
    connection.row_factory = sqlite3.Row
    return connection
47
+
48
+
49
def init_db() -> None:
    """Create the users table if it does not already exist.

    FIX: the previous version used ``with get_db_connection() as conn`` —
    for sqlite3 the ``with`` block only wraps a transaction and never
    closes the connection handle, leaking it. Close explicitly instead.
    """
    conn = get_db_connection()
    try:
        conn.execute(
            """
            CREATE TABLE IF NOT EXISTS users (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                full_name TEXT NOT NULL,
                email TEXT NOT NULL UNIQUE,
                password_hash TEXT NOT NULL,
                created_at TEXT NOT NULL
            )
            """
        )
        conn.commit()
    finally:
        conn.close()
63
+
64
+
65
+ def _extract_text_data(file_bytes: bytes, file_ext: str):
66
+ if file_ext == "txt":
67
+ return [{"text": file_bytes.decode("utf-8", errors="ignore"), "page": 1}]
68
+
69
+ if file_ext == "pdf":
70
+ import pdfplumber
71
+
72
+ extracted = []
73
+ with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
74
+ for i, page in enumerate(pdf.pages):
75
+ text = page.extract_text() or ""
76
+ if text.strip():
77
+ extracted.append({"text": text, "page": i + 1})
78
+ return extracted
79
+
80
+ if file_ext == "docx":
81
+ import docx
82
+
83
+ doc = docx.Document(io.BytesIO(file_bytes))
84
+ text = "\n".join(p.text for p in doc.paragraphs if p.text is not None)
85
+ return [{"text": text, "page": 1}] if text.strip() else []
86
+
87
+ raise ValueError("Unsupported file type. Use PDF, DOCX, or TXT.")
88
+
89
+
90
+ def _extract_clauses(text_data):
91
+ import re
92
+
93
+ clauses = []
94
+ clause_id = 0
95
+
96
+ for chunk in text_data:
97
+ raw_text = chunk.get("text", "")
98
+ page_num = chunk.get("page", 1)
99
+ pattern = re.compile(r".+?(?:[.!?](?:\s+|$)|$)", re.DOTALL)
100
+
101
+ for match in pattern.finditer(raw_text):
102
+ cleaned = " ".join(match.group(0).split())
103
+ if len(cleaned) < 30:
104
+ continue
105
+
106
+ start_idx = match.start()
107
+ line_no = raw_text[:start_idx].count("\n") + 1
108
+ clauses.append(
109
+ {
110
+ "id": clause_id,
111
+ "text": cleaned,
112
+ "page": page_num,
113
+ "line": line_no,
114
+ }
115
+ )
116
+ clause_id += 1
117
+
118
+ return clauses
119
+
120
+
121
+ def _normalize_person_name(raw: str) -> str:
122
+ import re
123
+
124
+ if not raw:
125
+ return ""
126
+
127
+ cleaned = " ".join(str(raw).split())
128
+ cleaned = re.sub(r"[^A-Za-z.\s]", " ", cleaned)
129
+ cleaned = re.sub(r"\s+", " ", cleaned).strip()
130
+ cleaned = re.sub(r"\b(mr|mrs|ms|miss|shri|smt)\.?\b", "", cleaned, flags=re.IGNORECASE)
131
+ cleaned = re.sub(r"\s+", " ", cleaned).strip()
132
+
133
+ stop_words = {
134
+ "the",
135
+ "vendor",
136
+ "vendee",
137
+ "party",
138
+ "agreement",
139
+ "hereinafter",
140
+ "called",
141
+ "referred",
142
+ "to",
143
+ "as",
144
+ "and",
145
+ "or",
146
+ "by",
147
+ "of",
148
+ }
149
+ parts = [p for p in cleaned.split(" ") if p and p.lower() not in stop_words]
150
+ if not parts:
151
+ return ""
152
+
153
+ parts = parts[:4]
154
+ name = " ".join(p.capitalize() for p in parts if len(p) > 1)
155
+ return name[:80].strip()
156
+
157
+
158
def _extract_party_name(text: str, role: str) -> str:
    """Heuristically pull a personal name for *role* (e.g. 'vendor'/'vendee').

    Tries several regex shapes in priority order; falls back to a
    role-mentioned placeholder, then "Not found".
    """
    import re

    if not text:
        return "Not found"

    # Collapse all whitespace so multi-line documents match single-line regexes.
    compact = " ".join(str(text).split())
    role_l = role.lower()

    patterns = [
        # Role -> Name (e.g., "vendor: suresh kumar"); the lookahead stops
        # the capture at punctuation or biographical phrases.
        rf"\b{role_l}\b\s*[:,-]?\s*(?:is\s+)?(?:mr\.?|mrs\.?|ms\.?|shri|smt\.?)?\s*([A-Za-z][A-Za-z.\s]{{1,80}}?)(?=,|\.|;|\bson of\b|\bwife of\b|\bresiding\b|\baged\b|$)",
        rf"\bthe\s+{role_l}\b\s*[:,-]?\s*(?:is\s+)?(?:mr\.?|mrs\.?|ms\.?|shri|smt\.?)?\s*([A-Za-z][A-Za-z.\s]{{1,80}}?)(?=,|\.|;|\bson of\b|\bwife of\b|\bresiding\b|\baged\b|$)",
        # Name -> role via legal wording ("X hereinafter called the vendor")
        rf"(?:mr\.?|mrs\.?|ms\.?|shri|smt\.?)?\s*([A-Za-z][A-Za-z.\s]{{1,80}}?)\s+(?:hereinafter\s+(?:called|referred\s+to\s+as)|called)\s+(?:the\s+)?{role_l}\b",
        # Name (role)
        rf"\b([A-Za-z][A-Za-z.\s]{{1,60}}?)\s*\(\s*{role_l}\s*\)",
    ]

    for pattern in patterns:
        match = re.search(pattern, compact, flags=re.IGNORECASE)
        if not match:
            continue
        # Normalize the captured candidate; reject when nothing name-like
        # survives (titles/stop words only) and try the next pattern.
        candidate = _normalize_person_name(match.group(1))
        if candidate:
            return candidate

    # The role word appears but no parseable name was found near it.
    if re.search(rf"\b{role_l}\b", compact, flags=re.IGNORECASE):
        return f"{role.title()} mentioned (name not parsed)"
    return "Not found"
188
+
189
+
190
def _extract_document_parties(text_data):
    """Extract the vendor and vendee names from the full document text."""
    combined = "\n".join(chunk.get("text", "") for chunk in (text_data or []))
    return {
        "vendor": _extract_party_name(combined, "vendor"),
        "vendee": _extract_party_name(combined, "vendee"),
    }
195
+
196
+
197
+ def _similarity(a: str, b: str) -> float:
198
+ return SequenceMatcher(None, a.lower(), b.lower()).ratio()
199
+
200
+
201
+ def _threshold_for_mode(scan_mode: str) -> float:
202
+ mode = (scan_mode or "").lower()
203
+ if "deep" in mode:
204
+ return 0.50
205
+ if "strict" in mode:
206
+ return 0.85
207
+ return 0.60
208
+
209
+
210
+ def _normalized_clause_text(text: str) -> str:
211
+ import re
212
+
213
+ return re.sub(r"\s+", " ", str(text or "").strip().lower())
214
+
215
+
216
def _token_set(text: str) -> set[str]:
    """Alphabetic tokens (length >= 3) from the normalized clause text."""
    import re

    normalized = _normalized_clause_text(text)
    return set(re.findall(r"[a-z]{3,}", normalized))
220
+
221
+
222
+ def _numeric_tokens(text: str) -> set[str]:
223
+ import re
224
+
225
+ return set(re.findall(r"\b\d+(?:[.,]\d+)?%?\b", str(text or "")))
226
+
227
+
228
def _rule_based_category(text_a: str, text_b: str, similarity: float):
    """Deterministic pre-check run before the ML analyzer for a clause pair.

    Returns ``(category, label, confidence, reason)``; *category* is None
    when no rule fires and the pair should fall through to ``analyze_pair``.
    Checks, in order: exact duplication, near-duplication (difflib similarity
    plus token Jaccard), numeric mismatch on overlapping clauses, and opposite
    obligation/negation polarity.

    Bug fix: the polarity keywords were matched with plain substring tests,
    so e.g. "not" fired inside "notice"/"notwithstanding" and inflated the
    contradiction count. Keywords are now matched on word boundaries.
    """
    import re

    def _contains_any(normalized: str, phrases) -> bool:
        # Whole-word / whole-phrase matching on already-normalized text.
        return any(
            re.search(rf"\b{re.escape(phrase)}\b", normalized) for phrase in phrases
        )

    a_norm = _normalized_clause_text(text_a)
    b_norm = _normalized_clause_text(text_b)
    tokens_a = _token_set(text_a)
    tokens_b = _token_set(text_b)
    common = len(tokens_a & tokens_b)
    denom = max(len(tokens_a | tokens_b), 1)  # guard against empty union
    jaccard = common / denom

    if a_norm and b_norm and a_norm == b_norm:
        return ("duplication", "DUPLICATION_EXACT", 0.99, "Exact repeated clause text.")

    if similarity >= 0.94 and jaccard >= 0.88:
        return ("duplication", "DUPLICATION_NEAR", 0.94, "Near-duplicate clause wording.")

    nums_a = _numeric_tokens(text_a)
    nums_b = _numeric_tokens(text_b)
    if jaccard >= 0.45 and nums_a and nums_b and nums_a != nums_b:
        return (
            "inconsistency",
            "NUMERIC_INCONSISTENCY",
            0.9,
            f"Numeric mismatch detected: {sorted(nums_a)} vs {sorted(nums_b)}.",
        )

    # Note: a clause containing "shall not" still matches the positive word
    # "shall" as well — same behavior as the original ordering intended.
    neg_words = ("shall not", "will not", "not", "never", "prohibited", "forbidden")
    pos_words = ("shall", "will", "must", "required", "permitted", "allowed")
    a_has_neg = _contains_any(a_norm, neg_words)
    b_has_neg = _contains_any(b_norm, neg_words)
    a_has_pos = _contains_any(a_norm, pos_words)
    b_has_pos = _contains_any(b_norm, pos_words)
    if jaccard >= 0.5 and ((a_has_neg and b_has_pos) or (b_has_neg and a_has_pos)):
        return ("contradiction", "LEGAL_CONFLICT", 0.9, "Opposite obligation/negation polarity.")

    return (None, None, 0.0, "")
263
+
264
+
265
def _analyze_clauses(clauses, threshold: float):
    """Compare every clause pair and classify issues.

    Each clause dict must carry "text", "page" and "line". Pairs are first
    run through the fast rule-based check; only undecided pairs hit the ML
    ``analyze_pair``. Returns ``(findings, line_issues, counts,
    compared_pairs)`` where findings are pair-level records sorted by
    confidence and line_issues are per-line records sorted by position.

    Raises RuntimeError when the analyzer module cannot be imported.
    """
    # Make the project root importable so `analysis.common_analyzer` resolves
    # regardless of how the server was started.
    if str(PROJECT_ROOT) not in sys.path:
        sys.path.append(str(PROJECT_ROOT))

    try:
        from analysis.common_analyzer import analyze_pair
    except Exception as exc:
        raise RuntimeError(f"Analyzer import failed: {exc}") from exc

    findings = []
    line_issues = []
    counts = {"duplication": 0, "inconsistency": 0, "contradiction": 0}
    compared_pairs = 0
    # Hard cap on O(n^2) pair comparisons so huge documents stay bounded.
    max_pairs = 15000
    seen_findings = set()
    seen_line_issues = set()

    def normalize_category(label: str, reason: str, similarity: float) -> str | None:
        # Map the analyzer's free-form labels onto the three UI categories;
        # None means "ignore this pair".
        lbl = (label or "").upper()
        rsn = (reason or "").lower()
        if lbl in {"NUMERIC_INCONSISTENCY"}:
            return "inconsistency"
        if lbl in {"LEGAL_CONFLICT", "CONTRADICTION"}:
            return "contradiction"
        if lbl in {"DUPLICATION", "ENTAILMENT"}:
            return "duplication"
        if lbl in {"CANDIDATE", "QUALIFICATION"} and similarity >= 0.92:
            return "duplication"
        if "negation" in rsn or "conflict" in rsn:
            return "contradiction"
        return None

    for i in range(len(clauses)):
        for j in range(i + 1, len(clauses)):
            compared_pairs += 1
            if compared_pairs > max_pairs:
                break

            clause_a = clauses[i]
            clause_b = clauses[j]
            similarity = _similarity(clause_a["text"], clause_b["text"])

            # Cheap deterministic rules first; the ML analyzer only sees
            # pairs the rules could not decide.
            category, label, confidence, reason = _rule_based_category(
                clause_a["text"], clause_b["text"], similarity
            )

            if category is None:
                # analyze_pair presumably returns (label, confidence, reason)
                # — see analysis.common_analyzer.
                label, confidence, reason = analyze_pair(
                    clause_a["text"],
                    clause_b["text"],
                    similarity,
                    threshold=threshold,
                )
                if not label or label == "NO_CONFLICT":
                    continue

                category = normalize_category(label, reason, similarity)
                if category is None:
                    continue

            # De-duplicate findings per (category, locations, label).
            finding_key = (
                category,
                clause_a["page"],
                clause_a["line"],
                clause_b["page"],
                clause_b["line"],
                label,
            )
            if finding_key in seen_findings:
                continue
            seen_findings.add(finding_key)

            findings.append(
                {
                    "issueType": label,
                    "category": category,
                    "confidence": round(float(confidence), 4),
                    "reason": reason,
                    "clause1": clause_a["text"],
                    "clause2": clause_b["text"],
                    "location1": f"Pg {clause_a['page']}, Ln {clause_a['line']}",
                    "location2": f"Pg {clause_b['page']}, Ln {clause_b['line']}",
                    "page1": clause_a["page"],
                    "line1": clause_a["line"],
                    "page2": clause_b["page"],
                    "line2": clause_b["line"],
                }
            )
            counts[category] += 1
            # Fan the pair finding out to one record per involved line,
            # de-duplicated independently of the pair records.
            for clause in (clause_a, clause_b):
                line_key = (category, clause["page"], clause["line"], label)
                if line_key in seen_line_issues:
                    continue
                seen_line_issues.add(line_key)
                line_issues.append(
                    {
                        "category": category,
                        "issueType": label,
                        "confidence": round(float(confidence), 4),
                        "page": clause["page"],
                        "line": clause["line"],
                        "location": f"Pg {clause['page']}, Ln {clause['line']}",
                        "reason": reason,
                    }
                )

        if compared_pairs > max_pairs:
            break

    findings.sort(key=lambda item: item["confidence"], reverse=True)
    line_issues.sort(key=lambda item: (item["page"], item["line"]))
    return findings, line_issues, counts, compared_pairs
377
+
378
+
379
+ def _build_page_summaries(clauses, line_issues, text_data):
380
+ pages = {}
381
+ page_text_map = {}
382
+
383
+ for chunk in text_data or []:
384
+ page = int(chunk.get("page", 1))
385
+ if page in page_text_map:
386
+ continue
387
+ raw = str(chunk.get("text", "") or "")
388
+ lines = [ln.strip() for ln in raw.splitlines() if ln.strip()]
389
+ page_text_map[page] = " ".join(lines[:2])[:260]
390
+
391
+ for clause in clauses:
392
+ page = int(clause.get("page", 1))
393
+ pages.setdefault(
394
+ page,
395
+ {
396
+ "page": page,
397
+ "clauseCount": 0,
398
+ "duplicationCount": 0,
399
+ "inconsistencyCount": 0,
400
+ "contradictionCount": 0,
401
+ "issueCount": 0,
402
+ "keyLines": [],
403
+ "pageSnippet": page_text_map.get(page, ""),
404
+ },
405
+ )
406
+ pages[page]["clauseCount"] += 1
407
+
408
+ for issue in line_issues:
409
+ page = int(issue.get("page", 1))
410
+ pages.setdefault(
411
+ page,
412
+ {
413
+ "page": page,
414
+ "clauseCount": 0,
415
+ "duplicationCount": 0,
416
+ "inconsistencyCount": 0,
417
+ "contradictionCount": 0,
418
+ "issueCount": 0,
419
+ "keyLines": [],
420
+ "pageSnippet": page_text_map.get(page, ""),
421
+ },
422
+ )
423
+ category = issue.get("category")
424
+ if category in {"duplication", "inconsistency", "contradiction"}:
425
+ pages[page][f"{category}Count"] += 1
426
+ pages[page]["issueCount"] += 1
427
+ if len(pages[page]["keyLines"]) < 6:
428
+ line_ref = f"Ln {issue.get('line', '-')}: {issue.get('issueType', '-')}"
429
+ if line_ref not in pages[page]["keyLines"]:
430
+ pages[page]["keyLines"].append(line_ref)
431
+
432
+ page_summaries = []
433
+ for page in sorted(pages.keys()):
434
+ item = pages[page]
435
+ item["summaryText"] = (
436
+ f"Page {page} contains {item['clauseCount']} clauses and {item['issueCount']} flagged lines "
437
+ f"(duplication: {item['duplicationCount']}, inconsistency: {item['inconsistencyCount']}, "
438
+ f"contradiction: {item['contradictionCount']})."
439
+ )
440
+ page_summaries.append(item)
441
+
442
+ return page_summaries
443
+
444
+
445
+ def _shorten_text(text: str, limit: int = 220) -> str:
446
+ s = " ".join(str(text or "").split())
447
+ if len(s) <= limit:
448
+ return s
449
+ return s[: limit - 3].rstrip() + "..."
450
+
451
+
452
+ def _clause_label(text: str, fallback_id: int) -> str:
453
+ import re
454
+
455
+ raw = str(text or "")
456
+ m = re.search(r"\bclause\s*(\d+)\s*(?:\(([^)]+)\))?", raw, flags=re.IGNORECASE)
457
+ if m:
458
+ num = m.group(1)
459
+ title = (m.group(2) or "").strip()
460
+ return f"Clause {num}" + (f" ({title})" if title else "")
461
+ return f"Clause {fallback_id}"
462
+
463
+
464
+ def _build_detailed_summary(clauses, page_summaries, findings):
465
+ from collections import defaultdict
466
+
467
+ clauses_by_page = defaultdict(list)
468
+ for clause in clauses:
469
+ clauses_by_page[int(clause.get("page", 1))].append(clause)
470
+
471
+ lines = ["Here is the detailed summary of the document content:", ""]
472
+
473
+ for page_item in page_summaries:
474
+ page = int(page_item.get("page", 1))
475
+ page_clauses = sorted(clauses_by_page.get(page, []), key=lambda c: (c.get("line", 0), c.get("id", 0)))
476
+ lines.append(f"Page {page} Summary:")
477
+ if not page_clauses:
478
+ lines.append(f"- No clauses extracted for Page {page}.")
479
+ lines.append("")
480
+ continue
481
+
482
+ for idx, clause in enumerate(page_clauses[:12], start=1):
483
+ label = _clause_label(clause.get("text", ""), idx)
484
+ summary = _shorten_text(clause.get("text", ""), 210)
485
+ lines.append(f"- {label}: {summary} (Page {page}, Line {clause.get('line', '-')})")
486
+
487
+ if len(page_clauses) > 12:
488
+ lines.append(f"- Additional clauses on this page: {len(page_clauses) - 12}")
489
+ lines.append("")
490
+
491
+ contradictions = [f for f in findings if f.get("category") == "contradiction"]
492
+ inconsistencies = [f for f in findings if f.get("category") == "inconsistency"]
493
+ duplicates = [f for f in findings if f.get("category") == "duplication"]
494
+
495
+ lines.append("Summary of Key Contradictions Noted:")
496
+ if contradictions:
497
+ for idx, item in enumerate(contradictions[:10], start=1):
498
+ lines.append(
499
+ f"- {idx}. {item.get('issueType', 'LEGAL_CONFLICT')}: "
500
+ f"{_shorten_text(item.get('reason', ''), 170)} "
501
+ f"({item.get('location1', '-') } vs {item.get('location2', '-')})"
502
+ )
503
+ else:
504
+ lines.append("- No strong contradiction pair detected.")
505
+ lines.append("")
506
+
507
+ lines.append("Summary of Key Inconsistencies Noted:")
508
+ if inconsistencies:
509
+ for idx, item in enumerate(inconsistencies[:10], start=1):
510
+ lines.append(
511
+ f"- {idx}. {item.get('issueType', 'INCONSISTENCY')}: "
512
+ f"{_shorten_text(item.get('reason', ''), 170)} "
513
+ f"({item.get('location1', '-') } vs {item.get('location2', '-')})"
514
+ )
515
+ else:
516
+ lines.append("- No strong inconsistency pair detected.")
517
+ lines.append("")
518
+
519
+ lines.append("Summary of Key Duplications Noted:")
520
+ if duplicates:
521
+ for idx, item in enumerate(duplicates[:10], start=1):
522
+ lines.append(
523
+ f"- {idx}. {item.get('issueType', 'DUPLICATION')}: "
524
+ f"{_shorten_text(item.get('reason', ''), 170)} "
525
+ f"({item.get('location1', '-') } vs {item.get('location2', '-')})"
526
+ )
527
+ else:
528
+ lines.append("- No major duplication pair detected.")
529
+
530
+ return "\n".join(lines)
531
+
532
+
533
# Ensure schema exists even when started via `flask run` (in which case the
# `if __name__ == "__main__"` block at the bottom never executes).
init_db()
535
+
536
+
537
@app.get("/api/health")
def health_check():
    """Liveness probe; always answers {"status": "ok"} with HTTP 200."""
    body = {"status": "ok"}
    return jsonify(body), 200
540
+
541
+
542
@app.get("/")
def root():
    """Landing route: confirms the backend is up and lists the endpoints,
    including the unprefixed aliases kept for older frontend builds."""
    endpoints = [
        "GET /api/health",
        "POST /api/register",
        "POST /api/login",
        "POST /api/analyze",
        "GET /health",
        "POST /register",
        "POST /login",
        "POST /analyze",
    ]
    payload = {
        "message": "Backend is running.",
        "endpoints": endpoints,
    }
    return jsonify(payload), 200
562
+
563
+
564
@app.get("/health")
def health_check_alias():
    """Unprefixed alias for /api/health; delegates to health_check()."""
    return health_check()
567
+
568
+
569
@app.post("/api/register")
def register():
    """Create a new user account.

    Expects JSON {"fullName", "email", "password"}. Returns 201 on success,
    400 on validation failure, 409 when the email is already registered.
    """
    data = request.get_json(silent=True) or {}

    full_name = str(data.get("fullName", "")).strip()
    email = str(data.get("email", "")).strip().lower()  # emails stored lowercase
    password = str(data.get("password", ""))

    if not full_name or not email or not password:
        return jsonify({"error": "fullName, email, and password are required."}), 400

    if len(password) < 6:
        return jsonify({"error": "Password must be at least 6 characters."}), 400

    # Only the salted hash is persisted, never the raw password.
    password_hash = generate_password_hash(password)
    created_at = datetime.now(timezone.utc).isoformat()

    try:
        with get_db_connection() as conn:
            conn.execute(
                "INSERT INTO users (full_name, email, password_hash, created_at) VALUES (?, ?, ?, ?)",
                (full_name, email, password_hash, created_at),
            )
            conn.commit()
    except sqlite3.IntegrityError:
        # Presumably a UNIQUE constraint on email — confirm against the
        # schema created in init_db().
        return jsonify({"error": "Email already registered."}), 409

    return jsonify({"message": "User created successfully."}), 201
597
+
598
+
599
@app.post("/register")
def register_alias():
    """Unprefixed alias for /api/register; delegates to register()."""
    return register()
602
+
603
+
604
@app.post("/api/login")
def login():
    """Verify credentials and return the user record.

    Expects JSON {"email", "password"}. Returns 200 with the user on
    success, 400 on missing fields, 401 on bad credentials.
    """
    data = request.get_json(silent=True) or {}

    email = str(data.get("email", "")).strip().lower()  # normalized like register()
    password = str(data.get("password", ""))

    if not email or not password:
        return jsonify({"error": "email and password are required."}), 400

    with get_db_connection() as conn:
        user = conn.execute(
            "SELECT id, full_name, email, password_hash FROM users WHERE email = ?",
            (email,),
        ).fetchone()

    # One generic error for both "unknown email" and "wrong password" so the
    # endpoint does not leak which accounts exist.
    if user is None or not check_password_hash(user["password_hash"], password):
        return jsonify({"error": "Invalid email or password."}), 401

    return (
        jsonify(
            {
                "message": "Login successful.",
                "user": {
                    "id": user["id"],
                    "fullName": user["full_name"],
                    "email": user["email"],
                },
            }
        ),
        200,
    )
636
+
637
+
638
@app.post("/api/analyze")
def analyze():
    """Run the full document-analysis pipeline on an uploaded file.

    Form fields: "file" (pdf/docx/txt) and optional "scanMode" (selects the
    similarity threshold via _threshold_for_mode). Responds with summary
    counts, per-page summaries, a detailed text report, the top findings,
    and flagged lines. Returns 400 on input problems, 500 on any pipeline
    failure.
    """
    uploaded = request.files.get("file")
    scan_mode = request.form.get("scanMode", "Standard Scan (Recommended)")
    threshold = _threshold_for_mode(scan_mode)

    if uploaded is None or uploaded.filename is None or uploaded.filename.strip() == "":
        return jsonify({"error": "Please upload a file."}), 400

    # Extension check only — content sniffing is left to the extractor.
    file_ext = uploaded.filename.rsplit(".", 1)[-1].lower() if "." in uploaded.filename else ""
    if file_ext not in {"pdf", "docx", "txt"}:
        return jsonify({"error": "Unsupported file type. Use PDF, DOCX, or TXT."}), 400

    try:
        file_bytes = uploaded.read()
        text_data = _extract_text_data(file_bytes=file_bytes, file_ext=file_ext)
        if not text_data:
            return jsonify({"error": "Could not extract text from file."}), 400

        clauses = _extract_clauses(text_data)
        if len(clauses) < 2:
            # Pairwise analysis needs at least two clauses.
            return jsonify({"error": "Not enough clauses found for analysis."}), 400

        parties = _extract_document_parties(text_data)
        findings, line_issues, counts, compared_pairs = _analyze_clauses(
            clauses=clauses, threshold=threshold
        )
        page_summaries = _build_page_summaries(
            clauses=clauses, line_issues=line_issues, text_data=text_data
        )
        detailed_summary = _build_detailed_summary(
            clauses=clauses,
            page_summaries=page_summaries,
            findings=findings,
        )
    except Exception as exc:
        # Catch-all boundary: any pipeline failure becomes a JSON 500.
        return jsonify({"error": f"Analysis failed: {exc}"}), 500

    return (
        jsonify(
            {
                "message": "Analysis completed.",
                "summary": {
                    "scanMode": scan_mode,
                    "threshold": threshold,
                    "vendor": parties["vendor"],
                    "vendee": parties["vendee"],
                    "clauses": len(clauses),
                    "pairsCompared": compared_pairs,
                    "issuesFound": len(findings),
                    "duplicationCount": counts["duplication"],
                    "inconsistencyCount": counts["inconsistency"],
                    "contradictionCount": counts["contradiction"],
                },
                "pageSummaries": page_summaries,
                "detailedSummary": detailed_summary,
                # Caps keep the JSON payload bounded for the frontend.
                "findings": findings[:50],
                "lineIssues": line_issues[:200],
            }
        ),
        200,
    )
700
+
701
+
702
@app.post("/login")
def login_alias():
    """Unprefixed alias for /api/login; delegates to login()."""
    return login()
705
+
706
+
707
@app.post("/analyze")
def analyze_alias():
    """Unprefixed alias for /api/analyze; delegates to analyze()."""
    return analyze()
710
+
711
+
712
if __name__ == "__main__":
    # Keep defaults production-safe and compatible with restricted environments.
    # Debug mode only when FLASK_DEBUG=1; the reloader is disabled explicitly
    # so the process does not fork a watcher.
    debug_mode = os.getenv("FLASK_DEBUG", "0") == "1"
    host = os.getenv("HOST", "127.0.0.1")
    port = int(os.getenv("PORT", "5000"))
    app.run(host=host, port=port, debug=debug_mode, use_reloader=False)
backend/requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ Flask==3.1.0
2
+ Flask-Cors==5.0.0
3
+ Werkzeug==3.1.3
4
+ pdfplumber==0.11.5
5
+ python-docx==1.1.2
domain_rules/belongings_check.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
def belongings_conflict(text1, text2):
    """True when one clause marks belongings "included" and the other "excluded".

    Case-insensitive substring check in both directions.
    """
    first = text1.lower()
    second = text2.lower()
    forward = "included" in first and "excluded" in second
    reverse = "excluded" in first and "included" in second
    return forward or reverse
domain_rules/belongings_keywords.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
# Vocabulary of belongings/fixtures terms for the domain rules.
# NOTE(review): usage is not visible in this module — presumably matched
# against clause text by the rule engine; confirm at call sites.
BELONGINGS_KEYWORDS = [
    "fixture", "fitting", "belonging", "movable",
    "immovable", "furniture", "appliance",
    "electrical", "plumbing", "included", "excluded"
]
domain_rules/legal_rules.py ADDED
File without changes
embeddings/sbert_encoder.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from sentence_transformers import SentenceTransformer
3
+
4
# Module-level cache: the SentenceTransformer is loaded once by get_model()
# and reused for every subsequent call.
_model = None
5
+
6
def get_model():
    """Return the shared SentenceTransformer, loading it on first use.

    Tries a normal (possibly online) load first and falls back to the local
    Hugging Face cache; raises RuntimeError when both attempts fail.
    """
    global _model
    if _model is not None:
        return _model

    model_name = "all-MiniLM-L6-v2"
    try:
        print(f"Loading {model_name}...")
        _model = SentenceTransformer(model_name)
    except Exception as e:
        print(f"Failed to load {model_name} online: {e}")
        print("Attempting to load from local cache...")
        try:
            _model = SentenceTransformer(model_name, local_files_only=True)
        except Exception as e2:
            raise RuntimeError(f"Could not load model {model_name} (Online or Offline). Check connection.") from e2
    return _model
21
+
22
def generate_embeddings(clauses):
    """Encode each clause's "text" field and return a numpy embedding matrix."""
    texts = [clause["text"] for clause in clauses]
    return get_model().encode(texts, convert_to_numpy=True)
frontend/README.md ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Frontend (Multi-Page Flow)
2
+
3
+ This frontend now uses a strict page flow:
4
+
5
+ 1. `index.html` -> Login/Signup
6
+ 2. `upload.html` -> Upload document and run analysis
7
+ 3. `issues.html` -> Line-level issue page (duplication, inconsistency, contradiction)
8
+ 4. `summary.html` -> Final full-document summary
9
+
10
+ ## Run
11
+
12
+ Serve this folder using any static server from `frontend/`:
13
+
14
+ ```bash
15
+ python -m http.server 8080
16
+ ```
17
+
18
+ Open:
19
+
20
+ - `http://127.0.0.1:8080/index.html`
21
+
22
+ ## Backend dependency
23
+
24
+ Frontend expects Flask backend endpoints:
25
+
26
+ - `POST /api/register`
27
+ - `POST /api/login`
28
+ - `POST /api/analyze`
29
+
30
+ Fallback aliases are also supported in client code (`/register`, `/login`, `/analyze`) across ports `5000` and `5001`.
31
+
32
+ ## Notes
33
+
34
+ - Login state and analysis payload are stored in `sessionStorage`.
35
+ - If user session is missing, `upload.html`, `issues.html`, and `summary.html` redirect to `index.html`.
36
+ - If analysis payload is missing, `issues.html` and `summary.html` redirect to `upload.html`.
frontend/app.js ADDED
@@ -0,0 +1,509 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
// Host the page was served from; falls back to loopback for file:// usage.
const currentHost = window.location.hostname || "127.0.0.1";

// Candidate API roots for auth calls — tried in order until one responds.
const API_BASES = [
  `http://${currentHost}:5000/api`,
  `http://${currentHost}:5001/api`,
  "http://127.0.0.1:5000/api",
  "http://localhost:5000/api",
  "http://127.0.0.1:5001/api",
  "http://localhost:5001/api",
];

// Candidate analyze endpoints: both /api-prefixed and bare paths, across
// ports 5000 and 5001, current host first.
const ANALYZE_URLS = [
  `http://${currentHost}:5000/api/analyze`,
  `http://${currentHost}:5000/analyze`,
  `http://${currentHost}:5001/api/analyze`,
  `http://${currentHost}:5001/analyze`,
  "http://127.0.0.1:5000/api/analyze",
  "http://127.0.0.1:5000/analyze",
  "http://localhost:5000/api/analyze",
  "http://localhost:5000/analyze",
  "http://127.0.0.1:5001/api/analyze",
  "http://127.0.0.1:5001/analyze",
  "http://localhost:5001/api/analyze",
  "http://localhost:5001/analyze",
];

// Current page filename — drives which init*Page() runs.
const page = (window.location.pathname.split("/").pop() || "index.html").toLowerCase();
+
29
function escapeHtml(value) {
  // Escape the five HTML-special characters so untrusted values are inert
  // when interpolated into innerHTML templates.
  const map = {
    "&": "&amp;",
    "<": "&lt;",
    ">": "&gt;",
    '"': "&quot;",
    "'": "&#039;",
  };
  return String(value).replace(/[&<>"']/g, (ch) => map[ch]);
}
37
+
38
function setText(el, text, type = null) {
  // Write a status message into an element, resetting any previous
  // success/error styling; no-op when the element is absent.
  if (!el) {
    return;
  }
  el.textContent = text;
  const { classList } = el;
  classList.remove("success", "error");
  if (type) {
    classList.add(type);
  }
}
44
+
45
function getUser() {
  // Session-scoped login record; absent or corrupt JSON yields null.
  const raw = sessionStorage.getItem("lsi_user");
  if (!raw) {
    return null;
  }
  try {
    return JSON.parse(raw);
  } catch {
    return null;
  }
}
54
+
55
// Persist the logged-in user for the lifetime of the browser tab.
function setUser(user) {
  sessionStorage.setItem("lsi_user", JSON.stringify(user));
}
58
+
59
// Logout path: drop both the login record and any cached analysis result.
function clearSession() {
  sessionStorage.removeItem("lsi_user");
  sessionStorage.removeItem("lsi_analysis_payload");
}
63
+
64
function getAnalysisPayload() {
  // Analysis result stashed by upload.html; null when absent or unparsable.
  const stored = sessionStorage.getItem("lsi_analysis_payload");
  if (!stored) {
    return null;
  }
  try {
    return JSON.parse(stored);
  } catch {
    return null;
  }
}
73
+
74
// Stash the analysis response so issues.html / summary.html can render it.
function setAnalysisPayload(payload) {
  sessionStorage.setItem("lsi_analysis_payload", JSON.stringify(payload));
}
77
+
78
// Guard for authenticated pages: redirects to the login page when no user is
// stored; otherwise fills the header badge and wires the logout button.
// Returns the user object, or null after scheduling the redirect.
function ensureAuth() {
  const user = getUser();
  if (!user) {
    window.location.href = "index.html#home";
    return null;
  }

  const badge = document.getElementById("userBadge");
  if (badge) {
    badge.textContent = `${user.fullName || user.email || "User"}`;
  }

  const logoutBtn = document.getElementById("logoutBtn");
  if (logoutBtn) {
    logoutBtn.addEventListener("click", () => {
      clearSession();
      window.location.href = "index.html#home";
    });
  }

  return user;
}
100
+
101
// POST a JSON payload to `${base}${endpoint}` for each candidate API base
// until one yields a usable response. Fix: like runDocumentAnalysis, a
// reachable base that answers 404 (wrong prefix/port for this server) now
// falls through to the next candidate instead of being reported as the
// final answer; the error is thrown only when no base produced any response.
// Returns { response, data } where data is the parsed JSON body or null.
async function postAuth(endpoint, payload) {
  let response = null;
  let data = null;

  for (const base of API_BASES) {
    try {
      response = await fetch(`${base}${endpoint}`, {
        method: "POST",
        headers: { "Content-Type": "application/json" },
        body: JSON.stringify(payload),
      });
      data = await response.json().catch(() => null);
      // 404 means "this base doesn't serve the endpoint" — keep probing.
      if (response.status !== 404) break;
    } catch (error) {
      // Network-level failure for this base; try the next one.
    }
  }

  if (!response) {
    throw new Error(`Cannot reach backend at ${API_BASES.join(", ")}.`);
  }

  return { response, data };
}
+ }
127
+
128
// Upload the analysis form data to the first analyze endpoint that answers
// with a non-404 status. Throws on total network failure or a non-OK
// response; returns the parsed JSON payload otherwise.
async function runDocumentAnalysis(formData) {
  let response = null;
  let data = null;
  let lastNetworkError = null;
  let status = null;

  for (const url of ANALYZE_URLS) {
    try {
      response = await fetch(url, { method: "POST", body: formData });
      data = await response.json().catch(() => null);
      status = response.status;
      lastNetworkError = null;
      // 404 means "wrong base/prefix for this server" — try the next URL.
      if (response.status !== 404) break;
    } catch (error) {
      lastNetworkError = error;
    }
  }

  if (lastNetworkError) {
    // NOTE(review): if an earlier URL returned 404 and a *later* one failed
    // at the network level, the 404 response is discarded and this throws —
    // confirm that is the intended precedence.
    throw new Error("Cannot connect to backend for analysis.");
  }

  if (!response.ok) {
    throw new Error(data?.error || `Analysis request failed with HTTP ${status || response.status}.`);
  }

  return data;
}
156
+
157
// Render the line-issue table for one category as an HTML string.
// Rows are capped at 80; every interpolated value goes through escapeHtml
// first. Returns a muted placeholder paragraph when the category is empty.
function buildIssueRows(lineIssues, category) {
  const rows = lineIssues
    .filter((item) => item.category === category)
    .slice(0, 80)
    .map(
      (item) => `
        <tr>
          <td>${escapeHtml(item.location || `Pg ${item.page}, Ln ${item.line}`)}</td>
          <td>${escapeHtml(item.issueType || "-")}</td>
          <td>${escapeHtml(item.confidence ?? "-")}</td>
        </tr>
      `
    )
    .join("");

  if (!rows) {
    return `<p class="result-muted">No ${category} lines detected.</p>`;
  }

  return `
    <div class="table-wrap">
      <table class="result-table">
        <thead>
          <tr>
            <th>Page/Line</th>
            <th>Issue Type</th>
            <th>Confidence</th>
          </tr>
        </thead>
        <tbody>${rows}</tbody>
      </table>
    </div>
  `;
}
+ }
191
+
192
// Login/signup page: toggles between the two modes, validates the form,
// calls the auth API, and redirects to upload.html once a user is stored.
function initIndexPage() {
  const loginTab = document.getElementById("loginTab");
  const signupTab = document.getElementById("signupTab");
  const authForm = document.getElementById("authForm");
  const nameField = document.getElementById("nameField");
  const fullNameInput = document.getElementById("fullName");
  const emailInput = document.getElementById("email");
  const passwordInput = document.getElementById("password");
  const submitBtn = document.getElementById("submitBtn");
  const formSubtitle = document.getElementById("formSubtitle");
  const message = document.getElementById("message");

  let mode = "login";

  // Switch the shared form between "login" and "signup" presentations.
  function setMode(nextMode) {
    mode = nextMode;
    const isSignup = mode === "signup";
    signupTab.classList.toggle("active", isSignup);
    loginTab.classList.toggle("active", !isSignup);
    nameField.classList.toggle("hidden", !isSignup);
    submitBtn.textContent = isSignup ? "Create Account" : "Login";
    formSubtitle.textContent = isSignup
      ? "Create your account to start securely."
      : "Enter your credentials to access your account.";
    fullNameInput.required = isSignup;
    setText(message, "", null);
  }

  async function handleAuthSubmit(event) {
    event.preventDefault();
    setText(message, "", null);

    const email = emailInput.value.trim();
    const password = passwordInput.value;
    const fullName = fullNameInput.value.trim();

    if (!email || !password || (mode === "signup" && !fullName)) {
      setText(message, "Please fill all required fields.", "error");
      return;
    }

    // Prevent double submits while the request is in flight.
    submitBtn.disabled = true;

    try {
      const endpoint = mode === "signup" ? "/register" : "/login";
      const payload = mode === "signup" ? { fullName, email, password } : { email, password };
      const { response, data } = await postAuth(endpoint, payload);

      if (!response.ok) {
        throw new Error(data?.error || `Request failed with HTTP ${response.status}.`);
      }

      if (mode === "signup") {
        // Account created — flip back to login rather than auto-logging in.
        setText(message, "Account created. Please login now.", "success");
        authForm.reset();
        setMode("login");
        return;
      }

      // Fall back to form values if the server omitted the user record.
      const user = data?.user || { fullName: fullName || email, email };
      setUser(user);
      window.location.href = "upload.html";
    } catch (error) {
      setText(message, error.message || "Something went wrong.", "error");
    } finally {
      submitBtn.disabled = false;
    }
  }

  loginTab.addEventListener("click", () => setMode("login"));
  signupTab.addEventListener("click", () => setMode("signup"));
  authForm.addEventListener("submit", handleAuthSubmit);
  setMode("login");

  // Already logged in — skip the auth screen entirely.
  if (getUser()) {
    window.location.href = "upload.html";
  }
}
+ }
270
+
271
// Upload page: previews the chosen file, submits it for analysis, stores the
// resulting payload in sessionStorage, and navigates to issues.html.
function initUploadPage() {
  if (!ensureAuth()) return;

  const uploadForm = document.getElementById("uploadForm");
  const legalFile = document.getElementById("legalFile");
  const scanMode = document.getElementById("scanMode");
  const uploadMessage = document.getElementById("uploadMessage");
  const loadingState = document.getElementById("loadingState");
  const analysisInputSummary = document.getElementById("analysisInputSummary");

  // Show a small summary card as soon as a file is picked.
  legalFile.addEventListener("change", () => {
    if (!legalFile.files || !legalFile.files[0]) return;
    const selectedFile = legalFile.files[0];
    analysisInputSummary.classList.remove("hidden");
    analysisInputSummary.innerHTML = `
      <p><strong>File:</strong> ${escapeHtml(selectedFile.name)}</p>
      <p><strong>Type:</strong> ${escapeHtml(selectedFile.type || "unknown")}</p>
      <p><strong>Size:</strong> ${escapeHtml((selectedFile.size / 1024).toFixed(2))} KB</p>
      <p><strong>Scan Mode:</strong> ${escapeHtml(scanMode.value)}</p>
    `;
    setText(uploadMessage, `Selected: ${selectedFile.name}`, "success");
  });

  uploadForm.addEventListener("submit", async (event) => {
    event.preventDefault();
    setText(uploadMessage, "", null);

    if (!legalFile.files || legalFile.files.length === 0) {
      setText(uploadMessage, "Please choose a file to continue.", "error");
      return;
    }

    const selectedFile = legalFile.files[0];
    const selectedScanMode = scanMode.value;

    const formData = new FormData();
    formData.append("file", selectedFile);
    formData.append("scanMode", selectedScanMode);

    // Swap the form out for the loading indicator while analysis runs.
    uploadForm.classList.add("hidden");
    loadingState.classList.remove("hidden");

    try {
      const payload = await runDocumentAnalysis(formData);
      // Attach client-side file metadata for the summary page.
      payload._meta = {
        fileName: selectedFile.name,
        fileType: selectedFile.type || "unknown",
        fileSizeKb: Number((selectedFile.size / 1024).toFixed(2)),
      };
      setAnalysisPayload(payload);
      window.location.href = "issues.html";
    } catch (error) {
      // Restore the form so the user can retry.
      loadingState.classList.add("hidden");
      uploadForm.classList.remove("hidden");
      setText(uploadMessage, error.message || "Analysis failed.", "error");
    }
  });
}
329
+
330
// Issues page: renders the three per-category stat cards and the per-line
// issue tables from the stored analysis payload; redirects when state is
// missing.
function initIssuesPage() {
  if (!ensureAuth()) return;

  const payload = getAnalysisPayload();
  if (!payload) {
    // No analysis has been run in this session — send back to upload.
    window.location.href = "upload.html";
    return;
  }

  const summary = payload.summary || {};
  const lineIssues = Array.isArray(payload.lineIssues) ? payload.lineIssues : [];

  const issueStats = document.getElementById("issueStats");
  issueStats.innerHTML = `
    <article class="stat-card stat-dup">
      <h3>Duplication</h3>
      <p>${escapeHtml(summary.duplicationCount ?? 0)}</p>
    </article>
    <article class="stat-card stat-inc">
      <h3>Inconsistency</h3>
      <p>${escapeHtml(summary.inconsistencyCount ?? 0)}</p>
    </article>
    <article class="stat-card stat-con">
      <h3>Contradiction</h3>
      <p>${escapeHtml(summary.contradictionCount ?? 0)}</p>
    </article>
  `;

  const lineIssueTables = document.getElementById("lineIssueTables");
  lineIssueTables.innerHTML = `
    <section class="result-card">
      <h4>Duplication Lines</h4>
      ${buildIssueRows(lineIssues, "duplication")}
    </section>
    <section class="result-card">
      <h4>Inconsistency Lines</h4>
      ${buildIssueRows(lineIssues, "inconsistency")}
    </section>
    <section class="result-card">
      <h4>Contradiction Lines</h4>
      ${buildIssueRows(lineIssues, "contradiction")}
    </section>
  `;
}
374
+
375
// Render the "Final Summary" page: header metadata, the detailed summary,
// per-page summaries, the line-error dashboard, and the top findings.
//
// FIX: the previous version returned early when `findings` was empty (and
// again after rendering the dashboard's empty state), so the line-error
// dashboard was never populated whenever there were no major findings —
// even if line-level issues existed. The dashboard is now rendered
// unconditionally, before the findings guard.
function initSummaryPage() {
  if (!ensureAuth()) return;

  const payload = getAnalysisPayload();
  if (!payload) {
    // No stored analysis — send the user back to the upload step.
    window.location.href = "upload.html";
    return;
  }

  const summary = payload.summary || {};
  const findings = Array.isArray(payload.findings) ? payload.findings : [];
  const pageSummaries = Array.isArray(payload.pageSummaries) ? payload.pageSummaries : [];
  const lineIssues = Array.isArray(payload.lineIssues) ? payload.lineIssues : [];
  const detailedSummary = String(payload.detailedSummary || "").trim();
  const meta = payload._meta || {};

  // Document/scan metadata strip.
  const summaryDetails = document.getElementById("summaryDetails");
  summaryDetails.innerHTML = `
    <article class="summary-item"><span>File</span><strong>${escapeHtml(meta.fileName || "-")}</strong></article>
    <article class="summary-item"><span>Scan Mode</span><strong>${escapeHtml(summary.scanMode || "-")}</strong></article>
    <article class="summary-item"><span>Threshold</span><strong>${escapeHtml(summary.threshold ?? "-")}</strong></article>
    <article class="summary-item"><span>Vendor</span><strong>${escapeHtml(summary.vendor || "Not found")}</strong></article>
    <article class="summary-item"><span>Vendee</span><strong>${escapeHtml(summary.vendee || "Not found")}</strong></article>
    <article class="summary-item"><span>Clauses</span><strong>${escapeHtml(summary.clauses ?? 0)}</strong></article>
    <article class="summary-item"><span>Pairs Compared</span><strong>${escapeHtml(summary.pairsCompared ?? 0)}</strong></article>
    <article class="summary-item"><span>Total Issues</span><strong>${escapeHtml(summary.issuesFound ?? 0)}</strong></article>
  `;

  const findingsBoard = document.getElementById("findingsBoard");
  const pageSummaryBoard = document.getElementById("pageSummaryBoard");
  const detailedSummaryText = document.getElementById("detailedSummaryText");
  const lineErrorDashboard = document.getElementById("lineErrorDashboard");

  if (detailedSummaryText) {
    detailedSummaryText.textContent = detailedSummary || "Detailed summary is not available for this document.";
  }

  // Per-page cards: clause/issue counts, snippet, summary text, flagged lines.
  if (pageSummaryBoard) {
    if (pageSummaries.length === 0) {
      pageSummaryBoard.innerHTML =
        `<article class="result-card"><p class="result-muted">No page-wise summary available for this document.</p></article>`;
    } else {
      pageSummaryBoard.innerHTML = pageSummaries
        .map((item) => {
          const keyLines = Array.isArray(item.keyLines) ? item.keyLines : [];
          const keyLineHtml = keyLines.length
            ? keyLines.map((k) => `<li>${escapeHtml(k)}</li>`).join("")
            : "<li>No flagged lines on this page.</li>";
          return `
            <article class="result-card">
              <h4>Page ${escapeHtml(item.page)}</h4>
              <p><strong>Clauses:</strong> ${escapeHtml(item.clauseCount ?? 0)}</p>
              <p><strong>Issues:</strong> ${escapeHtml(item.issueCount ?? 0)} (Duplication: ${escapeHtml(item.duplicationCount ?? 0)}, Inconsistency: ${escapeHtml(item.inconsistencyCount ?? 0)}, Contradiction: ${escapeHtml(item.contradictionCount ?? 0)})</p>
              <p><strong>Page Snippet:</strong> ${escapeHtml(item.pageSnippet || "-")}</p>
              <p><strong>Summary:</strong> ${escapeHtml(item.summaryText || "-")}</p>
              <p><strong>Key Lines:</strong></p>
              <ul>${keyLineHtml}</ul>
            </article>
          `;
        })
        .join("");
    }
  }

  // Line-error dashboard — rendered regardless of whether findings exist
  // (previously unreachable when `findings` was empty).
  if (lineErrorDashboard) {
    if (lineIssues.length === 0) {
      lineErrorDashboard.innerHTML = `<p class="result-muted">No line-level errors detected.</p>`;
    } else {
      // Cap at 200 rows to keep the DOM responsive on large documents.
      const rows = lineIssues
        .slice(0, 200)
        .map(
          (item) => `
            <tr>
              <td>${escapeHtml(item.location || `Pg ${item.page}, Ln ${item.line}`)}</td>
              <td>${escapeHtml(item.category || "-")}</td>
              <td>${escapeHtml(item.issueType || "-")}</td>
              <td>${escapeHtml(item.confidence ?? "-")}</td>
              <td>${escapeHtml(item.reason || "-")}</td>
            </tr>
          `
        )
        .join("");

      lineErrorDashboard.innerHTML = `
        <div class="table-wrap">
          <table class="result-table">
            <thead>
              <tr>
                <th>Page/Line</th>
                <th>Category</th>
                <th>Issue Type</th>
                <th>Confidence</th>
                <th>Reason</th>
              </tr>
            </thead>
            <tbody>${rows}</tbody>
          </table>
        </div>
      `;
    }
  }

  // Top findings (capped at 20). Empty state short-circuits the map.
  if (findings.length === 0) {
    findingsBoard.innerHTML = `<article class="result-card"><p class="result-muted">No major findings detected for this document.</p></article>`;
    return;
  }

  const topFindings = findings.slice(0, 20);
  findingsBoard.innerHTML = topFindings
    .map(
      (item) => `
        <article class="result-card">
          <h4>${escapeHtml(item.category || "issue")} - ${escapeHtml(item.issueType || "-")}</h4>
          <p><strong>Confidence:</strong> ${escapeHtml(item.confidence ?? "-")}</p>
          <p><strong>Location A:</strong> ${escapeHtml(item.location1 || "-")}</p>
          <p><strong>Location B:</strong> ${escapeHtml(item.location2 || "-")}</p>
          <p><strong>Reason:</strong> ${escapeHtml(item.reason || "-")}</p>
        </article>
      `
    )
    .join("");
}
498
+
499
// Route the current page name (computed earlier in this file) to its
// initializer. Unknown pages are deliberately left untouched.
switch (page) {
  case "index.html":
  case "":
    initIndexPage();
    break;
  case "upload.html":
    initUploadPage();
    break;
  case "issues.html":
    initIssuesPage();
    break;
  case "summary.html":
    initSummaryPage();
    break;
  case "workflow.html":
    // Legacy workflow page — redirect into the new upload flow.
    window.location.href = "upload.html";
    break;
}
frontend/assets/legal-tech-bg.svg ADDED
frontend/index.html ADDED
@@ -0,0 +1,226 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8" />
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0" />
6
+ <title>Semantix • Legal Semantic Intelligence</title>
7
+ <script src="https://cdn.tailwindcss.com"></script>
8
+ <link rel="preconnect" href="https://fonts.googleapis.com" />
9
+ <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin />
10
+ <link href="https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600&family=Space+Grotesk:wght@500;600;700&display=swap" rel="stylesheet" />
11
+ <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.5.1/css/all.min.css" />
12
+ <style>
13
+ :root {
14
+ --navy: #0f172a;
15
+ }
16
+
17
+ .tail-container {
18
+ font-family: "Inter", system-ui, sans-serif;
19
+ }
20
+
21
+ .heading-font {
22
+ font-family: "Space Grotesk", sans-serif;
23
+ }
24
+
25
+ .hero-bg {
26
+ background: linear-gradient(135deg, #0f172a 0%, #1e293b 100%);
27
+ }
28
+
29
+ .card {
30
+ transition: all 0.3s cubic-bezier(0.4, 0, 0.2, 1);
31
+ }
32
+
33
+ .card:hover {
34
+ transform: translateY(-4px);
35
+ box-shadow: 0 20px 25px -5px rgb(15 23 42 / 0.1), 0 8px 10px -6px rgb(15 23 42 / 0.1);
36
+ }
37
+
38
+ .document-3d {
39
+ perspective: 1200px;
40
+ transition: transform 0.6s cubic-bezier(0.23, 1, 0.32, 1);
41
+ }
42
+
43
+ .document-3d:hover {
44
+ transform: rotateX(12deg) rotateY(12deg) scale(1.03);
45
+ }
46
+
47
+ .document-inner {
48
+ box-shadow: 0 25px 50px -12px rgb(0 0 0 / 0.4), 0 0 80px -20px rgb(129 140 248 / 0.6), inset 0 4px 12px rgba(255, 255, 255, 0.3);
49
+ }
50
+
51
+ .scan-line {
52
+ position: absolute;
53
+ top: 0;
54
+ left: 0;
55
+ width: 100%;
56
+ height: 4px;
57
+ background: linear-gradient(90deg, transparent, #a5b4fc, transparent);
58
+ animation: scan 4s linear infinite;
59
+ opacity: 0.6;
60
+ }
61
+
62
+ .switcher button.active {
63
+ background: #ffffff;
64
+ color: #111827;
65
+ box-shadow: 0 4px 10px rgba(15, 23, 42, 0.12);
66
+ }
67
+
68
+ @keyframes scan {
69
+ 0% {
70
+ transform: translateY(-100%);
71
+ }
72
+
73
+ 100% {
74
+ transform: translateY(380px);
75
+ }
76
+ }
77
+
78
+ @media (max-width: 768px) {
79
+ .nav-mobile-hide {
80
+ display: none;
81
+ }
82
+ }
83
+ </style>
84
+ </head>
85
+ <body class="tail-container bg-zinc-50 text-slate-900">
86
+ <header class="bg-white border-b border-slate-200 sticky top-0 z-50">
87
+ <div class="max-w-7xl mx-auto px-6 md:px-8 py-5 flex items-center justify-between gap-4">
88
+ <div class="flex items-center gap-x-3">
89
+ <div class="w-9 h-9 bg-gradient-to-br from-indigo-600 to-violet-600 rounded-2xl flex items-center justify-center text-white font-bold text-2xl leading-none pt-0.5">S</div>
90
+ <a href="#home" class="heading-font text-3xl font-semibold tracking-tighter text-slate-900">Semantix</a>
91
+ </div>
92
+
93
+ <nav class="nav-mobile-hide md:flex items-center gap-x-10 text-sm font-medium">
94
+ <a href="#home" class="hover:text-indigo-600 transition-colors">Home</a>
95
+ <a href="#about" class="hover:text-indigo-600 transition-colors">About</a>
96
+ <a href="#service" class="hover:text-indigo-600 transition-colors">Service</a>
97
+ <a href="#contact" class="hover:text-indigo-600 transition-colors">Contact</a>
98
+ </nav>
99
+
100
+ <a href="#authView" class="px-5 py-2.5 text-sm font-semibold bg-indigo-600 hover:bg-indigo-700 text-white rounded-2xl transition-colors">Get Started</a>
101
+ </div>
102
+ </header>
103
+
104
+ <main>
105
+ <section id="home" class="hero-bg min-h-screen flex items-center relative overflow-hidden">
106
+ <div class="absolute inset-0 bg-[radial-gradient(at_50%_30%,rgba(129,140,248,0.15),transparent)]"></div>
107
+
108
+ <div class="max-w-7xl mx-auto px-6 md:px-8 grid md:grid-cols-12 gap-14 items-center relative z-10 py-16">
109
+ <div class="md:col-span-7">
110
+ <div class="inline-flex items-center gap-x-2 bg-white/10 backdrop-blur-md border border-white/20 text-white text-xs font-medium px-4 py-2 rounded-3xl mb-6">
111
+ <span class="relative flex h-3 w-3">
112
+ <span class="animate-ping absolute inline-flex h-full w-full rounded-full bg-emerald-400 opacity-75"></span>
113
+ <span class="relative inline-flex rounded-full h-3 w-3 bg-emerald-500"></span>
114
+ </span>
115
+ AI LEGAL INTELLIGENCE
116
+ </div>
117
+
118
+ <h1 class="heading-font text-5xl md:text-7xl leading-none font-semibold tracking-tighter text-white max-w-2xl">
119
+ Legal Documents,<br />Deeply Understood
120
+ </h1>
121
+
122
+ <p class="mt-8 text-lg md:text-xl text-slate-300 max-w-xl">
123
+ Advanced semantic analysis that uncovers hidden risks and delivers crystal-clear clarity in every contract.
124
+ </p>
125
+
126
+ <div class="mt-12 flex justify-center md:justify-start">
127
+ <div class="document-3d relative inline-block">
128
+ <div class="document-inner w-[300px] md:w-[320px] h-[360px] md:h-[380px] bg-white rounded-3xl overflow-hidden border border-white/40 relative">
129
+ <div class="h-12 bg-gradient-to-r from-indigo-600 to-violet-600 flex items-center px-6 text-white text-sm font-medium">
130
+ CONTRACT • PAGE 1
131
+ </div>
132
+
133
+ <div class="p-6 space-y-3 text-[10px] leading-tight text-slate-700 font-mono">
134
+ <div class="h-2.5 bg-slate-200 rounded w-3/4"></div>
135
+ <div class="h-2.5 bg-slate-200 rounded w-11/12"></div>
136
+ <div class="h-2.5 bg-slate-200 rounded w-5/6"></div>
137
+ <div class="h-2.5 bg-slate-200 rounded w-full"></div>
138
+ <div class="h-2.5 bg-slate-200 rounded w-3/4"></div>
139
+ <div class="h-2.5 bg-slate-200 rounded w-10/12"></div>
140
+ </div>
141
+
142
+ <div class="absolute inset-0 bg-gradient-to-br from-indigo-400/10 to-violet-400/10 flex items-center justify-center">
143
+ <i class="fa-solid fa-wand-magic-sparkles text-white text-[120px] opacity-30"></i>
144
+ </div>
145
+ <div class="scan-line"></div>
146
+ </div>
147
+ </div>
148
+ </div>
149
+ </div>
150
+
151
+ <div class="md:col-span-5">
152
+ <section id="authView" class="bg-white rounded-3xl shadow-2xl p-8 md:p-10 card">
153
+ <div class="form-header mb-8">
154
+ <div class="switcher grid grid-cols-2 bg-slate-100 p-1 rounded-2xl mb-3" role="tablist" aria-label="Auth mode">
155
+ <button id="loginTab" class="active px-7 py-3 text-sm font-semibold rounded-[14px]" type="button">Login</button>
156
+ <button id="signupTab" class="px-7 py-3 text-sm font-semibold rounded-[14px]" type="button">Sign Up</button>
157
+ </div>
158
+ <p id="formSubtitle" class="text-slate-500 text-sm">Enter your credentials to access your account.</p>
159
+ </div>
160
+
161
+ <form id="authForm" class="space-y-5" novalidate>
162
+ <div id="nameField" class="hidden">
163
+ <label class="text-xs uppercase tracking-widest text-slate-500 block mb-1" for="fullName">Full Name</label>
164
+ <input id="fullName" name="fullName" type="text" placeholder="Jayasree" class="w-full bg-zinc-50 border border-slate-200 focus:border-indigo-500 rounded-2xl px-5 py-4 outline-none" />
165
+ </div>
166
+ <div>
167
+ <label class="text-xs uppercase tracking-widest text-slate-500 block mb-1" for="email">Email</label>
168
+ <input id="email" name="email" type="email" placeholder="you@lawfirm.in" autocomplete="email" required class="w-full bg-zinc-50 border border-slate-200 focus:border-indigo-500 rounded-2xl px-5 py-4 outline-none" />
169
+ </div>
170
+ <div>
171
+ <label class="text-xs uppercase tracking-widest text-slate-500 block mb-1" for="password">Password</label>
172
+ <input id="password" name="password" type="password" placeholder="Minimum 6 characters" autocomplete="current-password" required class="w-full bg-zinc-50 border border-slate-200 focus:border-indigo-500 rounded-2xl px-5 py-4 outline-none" />
173
+ </div>
174
+ <button id="submitBtn" type="submit" class="w-full bg-indigo-600 hover:bg-indigo-700 transition-colors text-white font-semibold py-4 rounded-3xl">Login</button>
175
+ </form>
176
+ <p id="message" class="text-center text-sm mt-6 text-slate-500"></p>
177
+ </section>
178
+ </div>
179
+ </div>
180
+ </section>
181
+
182
+ <section id="about" class="py-20 bg-white">
183
+ <div class="max-w-4xl mx-auto px-8 text-center">
184
+ <h2 class="heading-font text-4xl md:text-5xl font-semibold tracking-tighter mb-6">Reliable. Precise. Intelligent.</h2>
185
+ <p class="text-lg text-slate-600 max-w-2xl mx-auto">
186
+ Semantix delivers clear, accurate semantic analysis of legal documents, helping you catch issues instantly and work with confidence.
187
+ </p>
188
+ </div>
189
+ </section>
190
+
191
+ <section id="service" class="py-24 bg-slate-50">
192
+ <div class="max-w-7xl mx-auto px-8">
193
+ <h2 class="heading-font text-center text-4xl md:text-5xl font-semibold tracking-tighter mb-16">Built for serious legal work</h2>
194
+ <div class="grid md:grid-cols-3 gap-8">
195
+ <div class="bg-white p-10 rounded-3xl card text-center">
196
+ <div class="text-5xl mb-6">🔐</div>
197
+ <h3 class="font-semibold text-xl">Enterprise Security</h3>
198
+ <p class="text-slate-500 mt-3">Your documents stay private and protected.</p>
199
+ </div>
200
+ <div class="bg-white p-10 rounded-3xl card text-center">
201
+ <div class="text-5xl mb-6">🧠</div>
202
+ <h3 class="font-semibold text-xl">Smart Analysis</h3>
203
+ <p class="text-slate-500 mt-3">Understands legal language like a senior counsel.</p>
204
+ </div>
205
+ <div class="bg-white p-10 rounded-3xl card text-center">
206
+ <div class="text-5xl mb-6">📈</div>
207
+ <h3 class="font-semibold text-xl">Instant Insights</h3>
208
+ <p class="text-slate-500 mt-3">Visual dashboard with line-level clarity.</p>
209
+ </div>
210
+ </div>
211
+ </div>
212
+ </section>
213
+
214
+ <section id="contact" class="py-20 bg-slate-900 text-white text-center">
215
+ <div class="max-w-7xl mx-auto px-8">
216
+ <p class="text-sm uppercase tracking-widest text-slate-400">Made for legal professionals</p>
217
+ <h2 class="heading-font text-4xl mt-4">Ready for flawless contracts?</h2>
218
+ <a href="mailto:hello@semantix.ai" class="inline-block mt-10 px-10 py-4 bg-white text-slate-900 font-semibold rounded-3xl hover:bg-indigo-50 transition-colors">Contact Us</a>
219
+ <p class="mt-20 text-xs text-slate-500">© 2026 Semantix • Legal Semantic Intelligence</p>
220
+ </div>
221
+ </section>
222
+ </main>
223
+
224
+ <script src="app.js"></script>
225
+ </body>
226
+ </html>
frontend/issues.html ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8" />
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0" />
6
+ <title>Issue Analysis | LegalSI</title>
7
+ <link rel="preconnect" href="https://fonts.googleapis.com" />
8
+ <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin />
9
+ <link
10
+ href="https://fonts.googleapis.com/css2?family=Manrope:wght@400;500;700;800&family=Space+Grotesk:wght@500;700&display=swap"
11
+ rel="stylesheet"
12
+ />
13
+ <link rel="stylesheet" href="styles.css" />
14
+ </head>
15
+ <body>
16
+ <header class="topbar">
17
+ <div class="container topbar-inner">
18
+ <a class="brand" href="index.html#home">LegalSI</a>
19
+ <div class="page-links">
20
+ <a class="page-link" href="upload.html">Upload</a>
21
+ <a class="page-link active" href="issues.html">Issue Analysis</a>
22
+ <a class="page-link" href="summary.html">Final Summary</a>
23
+ <button id="logoutBtn" class="logout-btn" type="button">Logout</button>
24
+ </div>
25
+ </div>
26
+ </header>
27
+
28
+ <main class="flow-main">
29
+ <section class="container flow-card">
30
+ <div class="upload-header">
31
+ <h1>Line-Level Issue Analysis</h1>
32
+ <span id="userBadge" class="user-badge"></span>
33
+ </div>
34
+ <p class="upload-subtitle">Inconsistencies, contradictions, and duplications with page and line references.</p>
35
+
36
+ <div id="issueStats" class="stats-grid"></div>
37
+ <div id="lineIssueTables"></div>
38
+
39
+ <div class="workflow-actions">
40
+ <a class="secondary-btn as-link" href="upload.html">Back to Upload</a>
41
+ <a class="submit-btn as-link submit-link" href="summary.html">Next: Final Summary</a>
42
+ </div>
43
+ </section>
44
+ </main>
45
+
46
+ <script src="app.js"></script>
47
+ </body>
48
+ </html>
frontend/styles.css ADDED
@@ -0,0 +1,957 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ :root {
2
+ --bg: #f3f5f8;
3
+ --surface: #ffffff;
4
+ --surface-soft: #f8fafc;
5
+ --ink: #0e2238;
6
+ --muted: #5b6f85;
7
+ --border: #d3dee9;
8
+ --navy: #12385f;
9
+ --navy-2: #1f4d79;
10
+ --gold: #b78a28;
11
+ --primary: #1f5fa6;
12
+ --primary-2: #2e79c8;
13
+ --teal: #1f8a75;
14
+ --danger: #b93f4f;
15
+ --ok: #166a47;
16
+ }
17
+
18
+ * {
19
+ box-sizing: border-box;
20
+ }
21
+
22
+ html {
23
+ scroll-behavior: smooth;
24
+ }
25
+
26
+ body {
27
+ margin: 0;
28
+ font-family: "Manrope", sans-serif;
29
+ color: var(--ink);
30
+ background:
31
+ radial-gradient(1000px 450px at -10% -8%, #dfe9f5 0%, rgba(223, 233, 245, 0) 60%),
32
+ radial-gradient(900px 420px at 110% -10%, #ece5d5 0%, rgba(236, 229, 213, 0) 58%),
33
+ linear-gradient(180deg, #eff3f7 0%, #f8fafd 42%, #ffffff 100%);
34
+ line-height: 1.45;
35
+ }
36
+
37
+ .container {
38
+ width: min(1180px, 92%);
39
+ margin: 0 auto;
40
+ }
41
+
42
+ .topbar {
43
+ position: sticky;
44
+ top: 0;
45
+ z-index: 20;
46
+ background: rgba(249, 251, 253, 0.94);
47
+ backdrop-filter: blur(6px);
48
+ border-bottom: 1px solid #cfd9e4;
49
+ box-shadow: 0 4px 18px rgba(14, 34, 56, 0.06);
50
+ }
51
+
52
+ .topbar-inner {
53
+ display: flex;
54
+ align-items: center;
55
+ justify-content: space-between;
56
+ min-height: 68px;
57
+ }
58
+
59
+ .brand {
60
+ font-family: "Space Grotesk", sans-serif;
61
+ font-size: 24px;
62
+ font-weight: 700;
63
+ color: var(--navy);
64
+ text-decoration: none;
65
+ }
66
+
67
+ .nav-links {
68
+ display: flex;
69
+ gap: 20px;
70
+ }
71
+
72
+ .nav-links a {
73
+ color: #264868;
74
+ text-decoration: none;
75
+ font-weight: 700;
76
+ font-size: 14px;
77
+ padding: 6px 8px;
78
+ border-radius: 8px;
79
+ }
80
+
81
+ .nav-links a:hover {
82
+ background: #e9f0f8;
83
+ color: var(--navy);
84
+ }
85
+
86
+ .hero {
87
+ position: relative;
88
+ padding: 48px 0 42px;
89
+ overflow: hidden;
90
+ }
91
+
92
+ .hero-bg {
93
+ position: absolute;
94
+ inset: 0;
95
+ background:
96
+ linear-gradient(120deg, rgba(18, 56, 95, 0.1), rgba(183, 138, 40, 0.09)),
97
+ url("assets/legal-tech-bg.svg") right center / cover no-repeat;
98
+ opacity: 0.95;
99
+ pointer-events: none;
100
+ }
101
+
102
+ .hero-grid {
103
+ position: relative;
104
+ display: grid;
105
+ grid-template-columns: 1.1fr 0.95fr;
106
+ gap: 24px;
107
+ align-items: start;
108
+ }
109
+
110
+ .hero-copy {
111
+ background: rgba(255, 255, 255, 0.82);
112
+ border: 1px solid var(--border);
113
+ border-radius: 20px;
114
+ padding: 24px;
115
+ box-shadow: 0 14px 34px rgba(15, 38, 66, 0.11);
116
+ animation: fadeInUp 0.45s ease-out;
117
+ }
118
+
119
+ .eyebrow {
120
+ margin: 0 0 10px;
121
+ font-size: 13px;
122
+ letter-spacing: 0.05em;
123
+ text-transform: uppercase;
124
+ color: var(--navy-2);
125
+ font-weight: 800;
126
+ }
127
+
128
+ .hero-copy h1 {
129
+ margin: 0;
130
+ font-size: clamp(30px, 4.6vw, 50px);
131
+ line-height: 1.08;
132
+ font-family: "Space Grotesk", sans-serif;
133
+ }
134
+
135
+ .hero-text {
136
+ margin: 14px 0 18px;
137
+ color: var(--muted);
138
+ line-height: 1.6;
139
+ max-width: 66ch;
140
+ }
141
+
142
+ .hero-cta-row {
143
+ display: flex;
144
+ gap: 10px;
145
+ flex-wrap: wrap;
146
+ margin: 8px 0 14px;
147
+ }
148
+
149
+ .hero-cta-primary,
150
+ .hero-cta-secondary {
151
+ text-decoration: none;
152
+ border-radius: 11px;
153
+ font-size: 14px;
154
+ font-weight: 800;
155
+ padding: 10px 14px;
156
+ }
157
+
158
+ .hero-cta-primary {
159
+ color: #ffffff;
160
+ background: linear-gradient(92deg, var(--navy), var(--primary-2) 58%, var(--teal));
161
+ box-shadow: 0 10px 18px rgba(17, 62, 110, 0.22);
162
+ }
163
+
164
+ .hero-cta-secondary {
165
+ color: #1c446b;
166
+ background: #ecf4ff;
167
+ border: 1px solid #bfd6f2;
168
+ }
169
+
170
+ .trust-strip {
171
+ display: flex;
172
+ flex-wrap: wrap;
173
+ gap: 8px;
174
+ margin: 0 0 14px;
175
+ }
176
+
177
+ .trust-strip span {
178
+ border: 1px solid #d0dded;
179
+ border-radius: 999px;
180
+ padding: 5px 10px;
181
+ font-size: 12px;
182
+ font-weight: 700;
183
+ color: #315579;
184
+ background: #f5f9ff;
185
+ }
186
+
187
+ .hero-metrics {
188
+ display: grid;
189
+ grid-template-columns: repeat(3, 1fr);
190
+ gap: 10px;
191
+ }
192
+
193
+ .hero-metrics > div {
194
+ border: 1px solid #d5e2f0;
195
+ background: #ffffff;
196
+ border-radius: 12px;
197
+ padding: 12px;
198
+ transition: transform 0.18s ease, box-shadow 0.18s ease;
199
+ }
200
+
201
+ .hero-metrics > div:hover {
202
+ transform: translateY(-2px);
203
+ box-shadow: 0 10px 18px rgba(16, 43, 74, 0.09);
204
+ }
205
+
206
+ .hero-metrics h3 {
207
+ margin: 0;
208
+ font-size: 14px;
209
+ }
210
+
211
+ .hero-metrics p {
212
+ margin: 6px 0 0;
213
+ color: var(--muted);
214
+ font-size: 12px;
215
+ }
216
+
217
+ .preview-card {
218
+ margin-top: 12px;
219
+ border: 1px solid #ccdaea;
220
+ border-radius: 14px;
221
+ padding: 12px;
222
+ background: linear-gradient(160deg, #f7fbff 0%, #edf5ff 100%);
223
+ }
224
+
225
+ .preview-card h3 {
226
+ margin: 0 0 10px;
227
+ font-size: 14px;
228
+ color: #163a60;
229
+ }
230
+
231
+ .preview-grid {
232
+ display: grid;
233
+ grid-template-columns: repeat(4, 1fr);
234
+ gap: 8px;
235
+ }
236
+
237
+ .preview-grid div {
238
+ border: 1px solid #c8d9ec;
239
+ border-radius: 10px;
240
+ background: #ffffff;
241
+ padding: 8px;
242
+ display: grid;
243
+ gap: 3px;
244
+ }
245
+
246
+ .preview-grid span {
247
+ font-size: 11px;
248
+ color: #5a7090;
249
+ }
250
+
251
+ .preview-grid strong {
252
+ font-size: 19px;
253
+ color: #15395e;
254
+ }
255
+
256
+ .panel {
257
+ background: var(--surface);
258
+ border: 1px solid var(--border);
259
+ border-radius: 18px;
260
+ box-shadow: 0 14px 30px rgba(12, 31, 53, 0.12);
261
+ animation: fadeInUp 0.5s ease-out;
262
+ }
263
+
264
+ .auth-panel {
265
+ padding: 22px;
266
+ }
267
+
268
+ .form-header {
269
+ margin-bottom: 18px;
270
+ }
271
+
272
+ .switcher {
273
+ display: grid;
274
+ grid-template-columns: 1fr 1fr;
275
+ background: #e9eff7;
276
+ border-radius: 12px;
277
+ padding: 4px;
278
+ margin-bottom: 12px;
279
+ }
280
+
281
+ .switcher button {
282
+ border: 0;
283
+ background: transparent;
284
+ border-radius: 9px;
285
+ padding: 10px;
286
+ font-weight: 800;
287
+ cursor: pointer;
288
+ color: #315579;
289
+ transition: background 0.2s ease, color 0.2s ease, transform 0.12s ease;
290
+ }
291
+
292
+ .switcher button.active {
293
+ color: #112a48;
294
+ background: #ffffff;
295
+ box-shadow: 0 6px 14px rgba(8, 26, 49, 0.08);
296
+ }
297
+
298
+ .switcher button:active {
299
+ transform: scale(0.98);
300
+ }
301
+
302
+ #formSubtitle {
303
+ margin: 0;
304
+ color: var(--muted);
305
+ font-size: 14px;
306
+ }
307
+
308
+ .auth-form {
309
+ display: grid;
310
+ gap: 14px;
311
+ }
312
+
313
+ .field {
314
+ display: grid;
315
+ gap: 7px;
316
+ }
317
+
318
+ .field label {
319
+ font-size: 14px;
320
+ font-weight: 700;
321
+ }
322
+
323
+ .field input,
324
+ .control {
325
+ border: 1px solid var(--border);
326
+ border-radius: 12px;
327
+ padding: 12px 13px;
328
+ font: inherit;
329
+ background: #ffffff;
330
+ outline: none;
331
+ width: 100%;
332
+ }
333
+
334
+ .field input:focus,
335
+ .control:focus {
336
+ border-color: var(--primary);
337
+ box-shadow: 0 0 0 4px rgba(31, 95, 166, 0.16);
338
+ }
339
+
340
+ .hidden {
341
+ display: none;
342
+ }
343
+
344
+ .submit-btn {
345
+ margin-top: 8px;
346
+ border: 0;
347
+ border-radius: 12px;
348
+ padding: 12px;
349
+ background: linear-gradient(92deg, var(--navy), var(--primary-2) 58%, var(--teal));
350
+ color: #ffffff;
351
+ font-weight: 800;
352
+ font-size: 15px;
353
+ cursor: pointer;
354
+ transition: transform 0.16s ease, box-shadow 0.16s ease, filter 0.16s ease;
355
+ }
356
+
357
+ .submit-btn:hover {
358
+ filter: brightness(1.03);
359
+ transform: translateY(-1px);
360
+ box-shadow: 0 10px 18px rgba(17, 62, 110, 0.22);
361
+ }
362
+
363
+ .message {
364
+ min-height: 22px;
365
+ margin: 14px 0 0;
366
+ font-size: 14px;
367
+ font-weight: 700;
368
+ }
369
+
370
+ .message.success {
371
+ color: var(--ok);
372
+ }
373
+
374
+ .message.error {
375
+ color: var(--danger);
376
+ }
377
+
378
+ .upload-header {
379
+ display: flex;
380
+ align-items: center;
381
+ justify-content: space-between;
382
+ }
383
+
384
+ .upload-header h2 {
385
+ margin: 0;
386
+ font-family: "Space Grotesk", sans-serif;
387
+ }
388
+
389
+ .upload-subtitle {
390
+ margin: 10px 0 18px;
391
+ color: var(--muted);
392
+ }
393
+
394
+ .stepper {
395
+ display: grid;
396
+ grid-template-columns: repeat(3, 1fr);
397
+ gap: 8px;
398
+ margin: 10px 0 16px;
399
+ }
400
+
401
+ .step-chip {
402
+ text-align: center;
403
+ border: 1px solid var(--border);
404
+ border-radius: 10px;
405
+ padding: 8px 10px;
406
+ font-size: 13px;
407
+ font-weight: 800;
408
+ color: #5d7190;
409
+ background: #f3f6fb;
410
+ transition: all 0.2s ease;
411
+ }
412
+
413
+ .step-chip.active {
414
+ color: #0f2d4e;
415
+ border-color: #b7cde7;
416
+ background: #e8f1fc;
417
+ box-shadow: inset 0 0 0 1px rgba(38, 97, 166, 0.15);
418
+ }
419
+
420
+ .workflow-step {
421
+ margin-top: 6px;
422
+ }
423
+
424
+ .summary-box {
425
+ border: 1px solid var(--border);
426
+ border-radius: 12px;
427
+ background: var(--surface-soft);
428
+ padding: 12px;
429
+ color: #25496f;
430
+ box-shadow: inset 0 0 0 1px rgba(255, 255, 255, 0.6);
431
+ }
432
+
433
+ .summary-box p {
434
+ margin: 5px 0;
435
+ font-size: 14px;
436
+ }
437
+
438
+ .workflow-actions {
439
+ display: flex;
440
+ gap: 10px;
441
+ margin-top: 12px;
442
+ flex-wrap: wrap;
443
+ }
444
+
445
+ .upload-zone-wrap {
446
+ margin-top: 2px;
447
+ }
448
+
449
+ .upload-zone {
450
+ border: 1.5px dashed #b8cbe0;
451
+ border-radius: 14px;
452
+ background: linear-gradient(180deg, #f8fbff 0%, #f3f8ff 100%);
453
+ min-height: 132px;
454
+ display: grid;
455
+ place-content: center;
456
+ text-align: center;
457
+ gap: 6px;
458
+ cursor: pointer;
459
+ padding: 14px;
460
+ transition: border-color 0.2s ease, background 0.2s ease, transform 0.18s ease;
461
+ }
462
+
463
+ .upload-zone:hover {
464
+ border-color: #7ca4cf;
465
+ background: linear-gradient(180deg, #fafdff 0%, #eef5ff 100%);
466
+ transform: translateY(-1px);
467
+ }
468
+
469
+ .upload-icon {
470
+ width: 34px;
471
+ height: 34px;
472
+ border-radius: 999px;
473
+ margin: 0 auto;
474
+ display: grid;
475
+ place-content: center;
476
+ font-size: 22px;
477
+ font-weight: 700;
478
+ color: #21507f;
479
+ background: #e5eef9;
480
+ }
481
+
482
+ .upload-title {
483
+ font-size: 14px;
484
+ font-weight: 800;
485
+ color: #1f4469;
486
+ }
487
+
488
+ .upload-hint {
489
+ font-size: 12px;
490
+ color: #5f7691;
491
+ }
492
+
493
+ .file-input-hidden {
494
+ position: absolute;
495
+ left: -10000px;
496
+ width: 1px;
497
+ height: 1px;
498
+ opacity: 0;
499
+ }
500
+
501
+ .chat-panel {
502
+ border: 1px solid var(--border);
503
+ border-radius: 12px;
504
+ background: #f7fbff;
505
+ padding: 12px;
506
+ margin-top: 10px;
507
+ display: grid;
508
+ gap: 10px;
509
+ max-height: 220px;
510
+ overflow-y: auto;
511
+ }
512
+
513
+ .chat-bubble {
514
+ padding: 10px 12px;
515
+ border-radius: 12px;
516
+ font-size: 13px;
517
+ line-height: 1.5;
518
+ }
519
+
520
+ .chat-bubble.user {
521
+ justify-self: end;
522
+ max-width: 92%;
523
+ background: #e8f1ff;
524
+ border: 1px solid #bfd6f4;
525
+ color: #1f4268;
526
+ }
527
+
528
+ .chat-bubble.bot {
529
+ justify-self: start;
530
+ max-width: 96%;
531
+ background: #ffffff;
532
+ border: 1px solid #d4e0ee;
533
+ color: #274968;
534
+ }
535
+
536
+ .logout-btn {
537
+ border: 1px solid var(--border);
538
+ background: #ffffff;
539
+ border-radius: 10px;
540
+ padding: 8px 12px;
541
+ font-weight: 700;
542
+ cursor: pointer;
543
+ }
544
+
545
+ .secondary-btn {
546
+ border: 1px solid #b8cbe0;
547
+ background: #f1f6fc;
548
+ color: #1f4469;
549
+ border-radius: 12px;
550
+ padding: 12px 14px;
551
+ font-weight: 800;
552
+ font-size: 14px;
553
+ cursor: pointer;
554
+ transition: background 0.18s ease, transform 0.14s ease;
555
+ }
556
+
557
+ .secondary-btn:hover {
558
+ background: #e7f0fa;
559
+ }
560
+
561
+ .secondary-btn:active {
562
+ transform: scale(0.98);
563
+ }
564
+
565
+ .section {
566
+ padding: 20px 0 26px;
567
+ }
568
+
569
+ .section-card {
570
+ background: var(--surface);
571
+ border: 1px solid var(--border);
572
+ border-radius: 18px;
573
+ padding: 24px;
574
+ box-shadow: 0 10px 24px rgba(12, 34, 58, 0.09);
575
+ transition: box-shadow 0.2s ease, transform 0.2s ease;
576
+ }
577
+
578
+ .section-card:hover {
579
+ box-shadow: 0 14px 26px rgba(12, 34, 58, 0.13);
580
+ transform: translateY(-1px);
581
+ }
582
+
583
+ .section-card h2 {
584
+ margin: 0 0 10px;
585
+ font-family: "Space Grotesk", sans-serif;
586
+ }
587
+
588
+ .section-card p {
589
+ margin: 0;
590
+ color: var(--muted);
591
+ line-height: 1.7;
592
+ }
593
+
594
+ .service-grid {
595
+ margin-top: 14px;
596
+ display: grid;
597
+ grid-template-columns: repeat(3, 1fr);
598
+ gap: 12px;
599
+ }
600
+
601
+ .service-grid article {
602
+ border: 1px solid var(--border);
603
+ border-radius: 12px;
604
+ padding: 14px;
605
+ background: var(--surface-soft);
606
+ }
607
+
608
+ .service-grid h3 {
609
+ margin: 0 0 8px;
610
+ font-size: 16px;
611
+ }
612
+
613
+ .contact-grid {
614
+ margin-top: 14px;
615
+ display: grid;
616
+ gap: 8px;
617
+ color: #193b61;
618
+ }
619
+
620
+ .analysis-result {
621
+ margin-top: 16px;
622
+ border-top: 1px solid var(--border);
623
+ padding-top: 14px;
624
+ }
625
+
626
+ .result-summary h3 {
627
+ margin: 0 0 8px;
628
+ font-family: "Space Grotesk", sans-serif;
629
+ }
630
+
631
+ .result-summary p {
632
+ margin: 4px 0;
633
+ color: #1d3352;
634
+ }
635
+
636
+ .result-visual {
637
+ margin-top: 12px;
638
+ border: 1px solid var(--border);
639
+ border-radius: 12px;
640
+ padding: 12px;
641
+ background: linear-gradient(180deg, #f8fbff 0%, #f4f8fd 100%);
642
+ }
643
+
644
+ .result-visual h3 {
645
+ margin: 0 0 10px;
646
+ }
647
+
648
+ .bar-row {
649
+ display: grid;
650
+ grid-template-columns: 170px 1fr 52px;
651
+ align-items: center;
652
+ gap: 8px;
653
+ margin-bottom: 8px;
654
+ }
655
+
656
+ .bar-label,
657
+ .bar-value {
658
+ font-size: 13px;
659
+ font-weight: 700;
660
+ }
661
+
662
+ .bar-track {
663
+ width: 100%;
664
+ height: 12px;
665
+ border-radius: 999px;
666
+ background: #dde5f1;
667
+ overflow: hidden;
668
+ }
669
+
670
+ .bar-fill {
671
+ height: 100%;
672
+ border-radius: 999px;
673
+ }
674
+
675
+ .bar-fill.dup {
676
+ background: #2d6ec8;
677
+ }
678
+
679
+ .bar-fill.inc {
680
+ background: #d08f28;
681
+ }
682
+
683
+ .bar-fill.con {
684
+ background: #bd4b58;
685
+ }
686
+
687
+ .result-list {
688
+ margin-top: 12px;
689
+ display: grid;
690
+ gap: 10px;
691
+ }
692
+
693
+ .result-card {
694
+ border: 1px solid var(--border);
695
+ border-radius: 12px;
696
+ padding: 10px 12px;
697
+ background: #f9fbfe;
698
+ transition: box-shadow 0.16s ease;
699
+ }
700
+
701
+ .result-card:hover {
702
+ box-shadow: 0 10px 20px rgba(12, 34, 58, 0.09);
703
+ }
704
+
705
+ .result-card h4 {
706
+ margin: 0 0 6px;
707
+ }
708
+
709
+ .result-muted {
710
+ color: var(--muted);
711
+ }
712
+
713
+ .table-wrap {
714
+ width: 100%;
715
+ overflow-x: auto;
716
+ }
717
+
718
+ .result-table {
719
+ width: 100%;
720
+ border-collapse: collapse;
721
+ margin-top: 8px;
722
+ }
723
+
724
+ .result-table th,
725
+ .result-table td {
726
+ border: 1px solid var(--border);
727
+ padding: 8px;
728
+ text-align: left;
729
+ font-size: 13px;
730
+ vertical-align: top;
731
+ }
732
+
733
+ .result-table th {
734
+ background: #eef4ff;
735
+ }
736
+
737
+ @keyframes fadeInUp {
738
+ from {
739
+ opacity: 0;
740
+ transform: translateY(8px);
741
+ }
742
+ to {
743
+ opacity: 1;
744
+ transform: translateY(0);
745
+ }
746
+ }
747
+
748
+ @media (max-width: 980px) {
749
+ .hero-grid {
750
+ grid-template-columns: 1fr;
751
+ }
752
+
753
+ .hero-metrics {
754
+ grid-template-columns: 1fr;
755
+ }
756
+
757
+ .preview-grid {
758
+ grid-template-columns: repeat(2, 1fr);
759
+ }
760
+
761
+ .service-grid {
762
+ grid-template-columns: 1fr;
763
+ }
764
+
765
+ .bar-row {
766
+ grid-template-columns: 1fr;
767
+ gap: 6px;
768
+ }
769
+
770
+ .nav-links {
771
+ gap: 12px;
772
+ flex-wrap: wrap;
773
+ justify-content: flex-end;
774
+ }
775
+
776
+ .topbar-inner {
777
+ padding-block: 8px;
778
+ }
779
+ }
780
+
781
+ .page-links {
782
+ display: flex;
783
+ align-items: center;
784
+ gap: 10px;
785
+ }
786
+
787
+ .page-link {
788
+ border: 1px solid #bfd0e3;
789
+ border-radius: 10px;
790
+ padding: 6px 10px;
791
+ font-size: 13px;
792
+ font-weight: 700;
793
+ color: #23496f;
794
+ text-decoration: none;
795
+ background: #f4f8fd;
796
+ }
797
+
798
+ .page-link.active {
799
+ background: #e7f1ff;
800
+ border-color: #98b9dc;
801
+ color: #14395f;
802
+ }
803
+
804
+ .flow-main {
805
+ padding: 28px 0 36px;
806
+ }
807
+
808
+ .flow-card {
809
+ background: var(--surface);
810
+ border: 1px solid var(--border);
811
+ border-radius: 18px;
812
+ box-shadow: 0 14px 30px rgba(12, 31, 53, 0.12);
813
+ padding: 22px;
814
+ }
815
+
816
+ .flow-card h1 {
817
+ margin: 0;
818
+ font-family: "Space Grotesk", sans-serif;
819
+ font-size: clamp(28px, 4vw, 40px);
820
+ }
821
+
822
+ .user-badge {
823
+ border: 1px solid #c6d9ee;
824
+ border-radius: 999px;
825
+ padding: 8px 12px;
826
+ background: #f2f8ff;
827
+ color: #24486d;
828
+ font-weight: 700;
829
+ font-size: 13px;
830
+ }
831
+
832
+ .loading-panel {
833
+ margin-top: 16px;
834
+ border: 1px solid var(--border);
835
+ border-radius: 12px;
836
+ padding: 18px;
837
+ background: #f5f9ff;
838
+ display: grid;
839
+ justify-items: center;
840
+ gap: 10px;
841
+ }
842
+
843
+ .spinner {
844
+ width: 30px;
845
+ height: 30px;
846
+ border: 3px solid #c8d8eb;
847
+ border-top-color: #1f5fa6;
848
+ border-radius: 50%;
849
+ animation: spin 0.8s linear infinite;
850
+ }
851
+
852
+ .stats-grid {
853
+ display: grid;
854
+ grid-template-columns: repeat(3, minmax(0, 1fr));
855
+ gap: 10px;
856
+ margin: 12px 0 14px;
857
+ }
858
+
859
+ .stat-card {
860
+ border: 1px solid var(--border);
861
+ border-radius: 12px;
862
+ padding: 12px;
863
+ background: #f9fbfe;
864
+ }
865
+
866
+ .stat-card h3 {
867
+ margin: 0;
868
+ font-size: 14px;
869
+ }
870
+
871
+ .stat-card p {
872
+ margin: 6px 0 0;
873
+ font-size: 28px;
874
+ font-weight: 800;
875
+ }
876
+
877
+ .stat-dup p {
878
+ color: #2d6ec8;
879
+ }
880
+
881
+ .stat-inc p {
882
+ color: #d08f28;
883
+ }
884
+
885
+ .stat-con p {
886
+ color: #bd4b58;
887
+ }
888
+
889
+ .summary-grid {
890
+ display: grid;
891
+ grid-template-columns: repeat(2, minmax(0, 1fr));
892
+ gap: 10px;
893
+ margin-bottom: 14px;
894
+ }
895
+
896
+ .summary-item {
897
+ border: 1px solid var(--border);
898
+ border-radius: 12px;
899
+ padding: 10px;
900
+ background: #f9fbfe;
901
+ display: grid;
902
+ gap: 4px;
903
+ }
904
+
905
+ .summary-item span {
906
+ color: var(--muted);
907
+ font-size: 13px;
908
+ }
909
+
910
+ .summary-item strong {
911
+ color: #1b3d63;
912
+ font-size: 14px;
913
+ }
914
+
915
+ .section-subtitle {
916
+ margin: 6px 0 10px;
917
+ font-family: "Space Grotesk", sans-serif;
918
+ font-size: 20px;
919
+ color: #183d62;
920
+ }
921
+
922
+ .detailed-summary-text {
923
+ white-space: pre-wrap;
924
+ line-height: 1.65;
925
+ color: #1d3552;
926
+ font-size: 14px;
927
+ }
928
+
929
+ .as-link {
930
+ text-decoration: none;
931
+ display: inline-flex;
932
+ align-items: center;
933
+ justify-content: center;
934
+ }
935
+
936
+ .submit-link {
937
+ min-width: 220px;
938
+ }
939
+
940
+ @keyframes spin {
941
+ to {
942
+ transform: rotate(360deg);
943
+ }
944
+ }
945
+
946
+ @media (max-width: 980px) {
947
+ .page-links {
948
+ gap: 6px;
949
+ flex-wrap: wrap;
950
+ justify-content: flex-end;
951
+ }
952
+
953
+ .stats-grid,
954
+ .summary-grid {
955
+ grid-template-columns: 1fr;
956
+ }
957
+ }
frontend/summary.html ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8" />
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0" />
6
+ <title>Final Summary | LegalSI</title>
7
+ <link rel="preconnect" href="https://fonts.googleapis.com" />
8
+ <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin />
9
+ <link
10
+ href="https://fonts.googleapis.com/css2?family=Manrope:wght@400;500;700;800&family=Space+Grotesk:wght@500;700&display=swap"
11
+ rel="stylesheet"
12
+ />
13
+ <link rel="stylesheet" href="styles.css" />
14
+ </head>
15
+ <body>
16
+ <header class="topbar">
17
+ <div class="container topbar-inner">
18
+ <a class="brand" href="index.html#home">LegalSI</a>
19
+ <div class="page-links">
20
+ <a class="page-link" href="upload.html">Upload</a>
21
+ <a class="page-link" href="issues.html">Issue Analysis</a>
22
+ <a class="page-link active" href="summary.html">Final Summary</a>
23
+ <button id="logoutBtn" class="logout-btn" type="button">Logout</button>
24
+ </div>
25
+ </div>
26
+ </header>
27
+
28
+ <main class="flow-main">
29
+ <section class="container flow-card">
30
+ <div class="upload-header">
31
+ <h1>Final Document Summary</h1>
32
+ <span id="userBadge" class="user-badge"></span>
33
+ </div>
34
+ <p class="upload-subtitle">Overall analysis result for the entire uploaded legal document.</p>
35
+
36
+ <div id="summaryDetails" class="summary-grid"></div>
37
+ <h3 class="section-subtitle">Detailed Document Summary</h3>
38
+ <article class="result-card">
39
+ <div id="detailedSummaryText" class="detailed-summary-text"></div>
40
+ </article>
41
+ <h3 class="section-subtitle">Page-wise Summary</h3>
42
+ <div id="pageSummaryBoard" class="result-list"></div>
43
+ <h3 class="section-subtitle">Top Findings</h3>
44
+ <div id="findingsBoard" class="result-list"></div>
45
+ <h3 class="section-subtitle">Line Error Dashboard</h3>
46
+ <article class="result-card">
47
+ <div id="lineErrorDashboard"></div>
48
+ </article>
49
+
50
+ <div class="workflow-actions">
51
+ <a class="secondary-btn as-link" href="issues.html">Back to Issue Analysis</a>
52
+ <a class="submit-btn as-link submit-link" href="upload.html">Analyze New Document</a>
53
+ </div>
54
+ </section>
55
+ </main>
56
+
57
+ <script src="app.js"></script>
58
+ </body>
59
+ </html>
frontend/upload.html ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8" />
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0" />
6
+ <title>Upload Document | LegalSI</title>
7
+ <link rel="preconnect" href="https://fonts.googleapis.com" />
8
+ <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin />
9
+ <link
10
+ href="https://fonts.googleapis.com/css2?family=Manrope:wght@400;500;700;800&family=Space+Grotesk:wght@500;700&display=swap"
11
+ rel="stylesheet"
12
+ />
13
+ <link rel="stylesheet" href="styles.css" />
14
+ </head>
15
+ <body>
16
+ <header class="topbar">
17
+ <div class="container topbar-inner">
18
+ <a class="brand" href="index.html#home">LegalSI</a>
19
+ <div class="page-links">
20
+ <a class="page-link active" href="upload.html">Upload</a>
21
+ <a class="page-link" href="issues.html">Issue Analysis</a>
22
+ <a class="page-link" href="summary.html">Final Summary</a>
23
+ <button id="logoutBtn" class="logout-btn" type="button">Logout</button>
24
+ </div>
25
+ </div>
26
+ </header>
27
+
28
+ <main class="flow-main">
29
+ <section class="container flow-card">
30
+ <div class="upload-header">
31
+ <h1>Upload Document</h1>
32
+ <span id="userBadge" class="user-badge"></span>
33
+ </div>
34
+ <p class="upload-subtitle">Upload a legal document, then continue to the issue analysis and final summary pages.</p>
35
+
36
+ <form id="uploadForm" class="auth-form" novalidate>
37
+ <div class="field">
38
+ <label for="scanMode">Scan Mode</label>
39
+ <select id="scanMode" class="control">
40
+ <option>Standard Scan (Recommended)</option>
41
+ <option>Deep Search (Fuzzy)</option>
42
+ <option>Strict (Duplicates Only)</option>
43
+ </select>
44
+ </div>
45
+
46
+ <div class="field upload-zone-wrap">
47
+ <label for="legalFile">Upload File (PDF/DOCX/TXT)</label>
48
+ <label class="upload-zone" for="legalFile">
49
+ <span class="upload-icon">+</span>
50
+ <span class="upload-title">Drop your document or click to browse</span>
51
+ <span class="upload-hint">Supported: PDF, DOCX, TXT</span>
52
+ </label>
53
+ <input id="legalFile" class="control file-input-hidden" type="file" accept=".pdf,.docx,.txt" required />
54
+ </div>
55
+
56
+ <div id="analysisInputSummary" class="summary-box hidden"></div>
57
+
58
+ <div class="workflow-actions">
59
+ <a class="secondary-btn as-link" href="index.html#home">Back to Home</a>
60
+ <button id="runUploadBtn" class="submit-btn" type="submit">Upload and Analyze</button>
61
+ </div>
62
+ </form>
63
+
64
+ <div id="loadingState" class="loading-panel hidden" aria-live="polite">
65
+ <div class="spinner"></div>
66
+ <p>Analyzing document. Please wait...</p>
67
+ </div>
68
+
69
+ <p id="uploadMessage" class="message" aria-live="polite"></p>
70
+ </section>
71
+ </main>
72
+
73
+ <script src="app.js"></script>
74
+ </body>
75
+ </html>
frontend/workflow.html ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8" />
5
+ <meta http-equiv="refresh" content="0;url=upload.html" />
6
+ <title>Redirecting</title>
7
+ </head>
8
+ <body>
9
+ <p>Redirecting to upload page...</p>
10
+ </body>
11
+ </html>
ingestion/docx_reader.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ from docx import Document
2
+
3
def extract_text_from_docx(path):
    """Return the full text of the .docx file at *path*.

    Every paragraph's text is taken in document order and the pieces are
    joined with newlines (empty paragraphs contribute empty lines).
    """
    document = Document(path)
    paragraph_texts = [paragraph.text for paragraph in document.paragraphs]
    return "\n".join(paragraph_texts)
ingestion/pdf_reader.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ import pdfplumber
2
+
3
def extract_text_from_pdf(path):
    """Extract plain text from every page of the PDF at *path*.

    Returns the page texts concatenated in order, each followed by a
    newline; pages with no extractable text are skipped. Returns "" for a
    PDF with no extractable text at all (same as the original behavior).
    """
    pages = []
    with pdfplumber.open(path) as pdf:
        for page in pdf.pages:
            # BUG FIX: extract_text() re-parses the page layout, so call it
            # once per page instead of twice (the original called it in the
            # condition and again in the body).
            page_text = page.extract_text()
            if page_text:
                pages.append(page_text + "\n")
    # join() instead of repeated "+=" avoids quadratic string building on
    # large documents; the resulting string is byte-identical.
    return "".join(pages)
main.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Command-line pipeline: extract clauses from a sample PDF, find
semantically similar clause pairs via FAISS, classify each pair with the
common analyzer, and emit a report."""

from ingestion.pdf_reader import extract_text_from_pdf
from preprocessing.clause_extraction import extract_clauses
from embeddings.sbert_encoder import generate_embeddings
from storage.faiss_index import create_faiss_index
from analysis.similarity_search import get_similar
from analysis.common_analyzer import analyze_pair
from output.report_generator import generate_report
import numpy as np

# Load document.
text = extract_text_from_pdf("data/sample_docs/policy.pdf")

# Clause extraction.
# BUG FIX: extract_text_from_pdf returns a plain string, but extract_clauses
# expects a list of {'text', 'page'} chunks (see its docstring in
# preprocessing/clause_extraction.py); the old call crashed on chunk.get().
clauses = extract_clauses([{"text": text, "page": 1}])

# Embeddings + exact L2 index over them.
embeddings = generate_embeddings(clauses)
index = create_faiss_index(embeddings)

results = []

for i, emb in enumerate(embeddings):
    idxs, dists = get_similar(index, emb)
    for j, dist in zip(idxs, dists):
        if i == j:
            # A clause is trivially similar to itself; skip self-pairs.
            continue

        # Map L2 distance into a (0, 1] similarity score.
        similarity = 1 / (1 + dist)

        # Use new Common Analyzer (Centralized Logic).
        # BUG FIX: analyze_pair returns (label, confidence, reason) — see
        # its use in reproduce_issue.py — so the old 2-value unpack raised
        # ValueError on every call. The reason is not used in this report.
        issue_type, score, _reason = analyze_pair(clauses[i]["text"], clauses[j]["text"], similarity)

        if issue_type:
            results.append({
                "type": issue_type,
                "confidence": score,
                "clause_1": clauses[i]["text"],
                "clause_2": clauses[j]["text"]
            })

generate_report(results)
print("✅ Analysis completed. Report generated.")
preprocessing/clause_extraction.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+
3
# Sentences at or below this length are treated as headings/noise, not clauses.
_MIN_CLAUSE_CHARS = 30


def _line_of(substring, source_text):
    """Return the 1-based line number where *substring* first occurs in
    *source_text*; defaults to 1 when it cannot be located."""
    idx = source_text.find(substring)
    if idx == -1:
        return 1
    return source_text[:idx].count('\n') + 1


def extract_clauses(text_data):
    """
    Extracts clauses from text chunks with location data.
    Args:
        text_data: List[Dict] with 'text' and 'page' keys.
    Returns:
        List[Dict]: [{'id', 'text', 'page', 'line'}]
    """
    unique_clauses = []
    seen = set()  # normalized sentences already emitted (dedup across pages)
    clause_id = 0

    for chunk in text_data:
        raw_text = chunk.get("text", "")
        page_num = chunk.get("page", 1)

        # Split on sentence-ending punctuation followed by whitespace; the
        # line number is then estimated from the sentence's offset inside
        # the chunk. (Helper hoisted to module scope — the original
        # re-created a closure on every chunk iteration.)
        for sentence in re.split(r'(?<=[.!?])\s+', raw_text):
            s_clean = sentence.strip()
            if len(s_clean) > _MIN_CLAUSE_CHARS and s_clean not in seen:
                seen.add(s_clean)
                unique_clauses.append({
                    "id": clause_id,
                    "text": s_clean,
                    "page": page_num,
                    "line": _line_of(s_clean, raw_text),
                })
                clause_id += 1

    return unique_clauses
preprocessing/text_extractor.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import pdfplumber
3
+ import docx
4
+ import io
5
+
6
def extract_text_from_file(file_obj, file_type):
    """
    Extracts text from various file formats with page/location tracking.
    Args:
        file_obj: The uploaded file object (bytes).
        file_type: 'pdf', 'docx', or 'txt'.
    Returns:
        List[Dict]: List of {'text': str, 'page': int}; empty list for
        unsupported types or when extraction raises.
    """
    chunks = []
    try:
        if file_type == "pdf":
            with pdfplumber.open(file_obj) as pdf:
                for page_no, page in enumerate(pdf.pages, start=1):
                    page_text = page.extract_text()
                    if page_text:
                        chunks.append({"text": page_text, "page": page_no})

        elif file_type == "docx":
            document = docx.Document(file_obj)
            # DOCX has no strict pagination; expose the whole document as a
            # single page-1 chunk of newline-terminated paragraphs.
            body = "".join(para.text + "\n" for para in document.paragraphs)
            chunks.append({"text": body, "page": 1})

        elif file_type == "txt":
            # Assuming utf-8 encoding
            chunks.append({
                "text": file_obj.read().decode("utf-8"),
                "page": 1
            })

    except Exception as e:
        print(f"Error extracting text: {e}")
        return []

    return chunks
reproduce_issue.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ import os
3
+ import numpy as np
4
+ from sentence_transformers import SentenceTransformer, util
5
+
6
+ sys.path.append(os.getcwd())
7
+ try:
8
+ from analysis.common_analyzer import analyze_pair
9
+ from preprocessing.clause_extraction import extract_clauses
10
+ except ImportError:
11
+ # Handle case where run from root
12
+ sys.path.append(os.path.join(os.getcwd(), 'analysis'))
13
+ sys.path.append(os.path.join(os.getcwd(), 'preprocessing'))
14
+ from analysis.common_analyzer import analyze_pair
15
+ from preprocessing.clause_extraction import extract_clauses
16
+
17
def test_reproduction():
    """Manual smoke test for the analysis pipeline.

    Section 1 pushes one deliberately contradictory clause pair
    (retain 3 years vs delete after 1 year) through embedding similarity
    and ``analyze_pair``, and — only when the pair is accepted as a
    CANDIDATE — through the phase-2 NLI verifier.
    Section 2 checks that ``extract_clauses`` preserves page/line metadata
    for structured chunk input. Results are printed, not asserted.
    """
    print("--- Section 1: Core Logic Test ---")
    t1 = "Audit reports must be retained for a minimum of three (3) years."
    t2 = "Audit reports shall be deleted after one (1) year to reduce storage overhead."

    print(f"Text 1: {t1}")
    print(f"Text 2: {t2}")

    # 1. Calculate Similarity
    print("Loading embedding model...")
    model = SentenceTransformer('all-MiniLM-L6-v2')
    e1 = model.encode(t1)
    e2 = model.encode(t2)

    sim = util.cos_sim(e1, e2).item()
    print(f"Similarity Score: {sim:.4f}")

    # 2. Test analyze_pair
    # analyze_pair returns a (label, confidence, reason) triple.
    print("Running analyze_pair...")
    label, conf, reason = analyze_pair(t1, t2, sim)
    print(f"Result: Label={label}, Conf={conf}, Reason={reason}")

    if label == "CANDIDATE":
        print("!!! PASSED Phase 1: ACCEPTED as CANDIDATE")

        # 3. Test NLI
        # Imported lazily so the heavy NLI model only loads when a
        # candidate pair actually reaches phase 2.
        from analysis.nli_verifier import NLIVerifier
        print("\nRunning NLI Verification (Phase 2)...")
        verifier = NLIVerifier()
        is_contra, nli_conf, nli_label = verifier.predict(t1, t2)
        print(f"NLI Result: IsContra={is_contra}, Conf={nli_conf}, Label={nli_label}")

    elif label:
        print(f"!!! PASSED Phase 1: ACCEPTED as {label} (No NLI needed usually, but logic might vary)")
    else:
        print("!!! PASSED Phase 1: REJECTED (None)")

    print("\n--- Section 2: Pipeline & Metadata Test ---")
    # extract_clauses expects a list of {'text', 'page'} chunks.
    mock_text = [
        {"text": "Section 1. This is a test clause on page 1.", "page": 1},
        {"text": "Section 2. This is another clause on page 2.", "page": 2}
    ]
    print("Testing extract_clauses with structured input...")
    clauses = extract_clauses(mock_text)
    if len(clauses) > 0 and 'page' in clauses[0] and 'line' in clauses[0]:
        print(f"SUCCESS: Extracted {len(clauses)} clauses with metadata.")
        print(f"Sample: {clauses[0]}")
    else:
        print("FAIL: Metadata extraction failed.")
66
+
67
+
68
+ if __name__ == "__main__":
69
+ test_reproduction()
requirements.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ pdfplumber
2
+ python-docx
3
+ spacy
4
+ sentence-transformers
5
+ faiss-cpu
6
+ numpy
7
+ streamlit
8
+ transformers
9
+ torch
10
+ huggingface_hub
11
+ reportlab
storage/faiss_index.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ import faiss
2
+ import numpy as np
3
+
4
def create_faiss_index(embeddings):
    """Build an exact (flat) L2 FAISS index containing *embeddings*.

    Args:
        embeddings: 2-D array-like of shape (n_vectors, dim).
    Returns:
        A populated faiss.IndexFlatL2 over all input vectors.
    """
    dimension = embeddings.shape[1]
    flat_index = faiss.IndexFlatL2(dimension)
    flat_index.add(np.array(embeddings))
    return flat_index
ui/app.py ADDED
@@ -0,0 +1,871 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ from pathlib import Path
4
+
5
+ import importlib
6
+ import json
7
+ import base64
8
+ import re
9
+
10
+ import pandas as pd
11
+ import plotly.express as px
12
+ import streamlit as st
13
+
14
+ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
15
+
16
+ from preprocessing.text_extractor import extract_text_from_file
17
+ from preprocessing.clause_extraction import extract_clauses
18
+ from embeddings.sbert_encoder import generate_embeddings
19
+ from storage.faiss_index import create_faiss_index
20
+ from analysis.similarity_search import get_similar
21
+
22
+ import analysis.common_analyzer
23
+ importlib.reload(analysis.common_analyzer)
24
+ from analysis.common_analyzer import analyze_pair
25
+
26
+ from analysis.nli_verifier import NLIVerifier
27
+ from analysis.llama_legal_verifier import LlamaLegalVerifier
28
+ from output.pdf_generator import generate_pdf_report
29
+ from auth.user_store import authenticate_user, create_user
30
+
31
+
32
+ APP_TITLE = "Legal Semantic Integrity"
33
+ DEFAULT_MODEL_PATH = "merged_tinyllama_instruction"
34
+ PROJECT_ROOT = Path(__file__).resolve().parents[1]
35
+
36
+
37
def init_state():
    """Seed st.session_state with every key the app reads, leaving any
    values already set by a previous rerun untouched."""
    defaults = {
        "is_authenticated": False,
        "username": "",
        "analysis_done": False,
        "results": [],
        "line_issues": [],
        "uploaded_name": "",
        "uploaded_ext": "",
        "uploaded_bytes": b"",
    }
    for key, value in defaults.items():
        st.session_state.setdefault(key, value)
46
+
47
+
48
+ def _extract_party_name(text: str, role: str) -> str:
49
+ """
50
+ Try to extract a nearby party name for vendor/vendee from clause text.
51
+ Falls back to role-present markers when exact name is not available.
52
+ """
53
+ if not text:
54
+ return "Not found"
55
+
56
+ t = " ".join(str(text).split())
57
+ role_l = role.lower()
58
+
59
+ # Pattern examples:
60
+ # "Vendor Mr. Ravi Kumar", "Vendee: Sita Devi", "the vendor, John Doe"
61
+ patterns = [
62
+ rf"\b{role_l}\b\s*[:,-]?\s*(?:mr\.?|mrs\.?|ms\.?)?\s*([A-Z][A-Za-z.\s]{{2,60}}?)(?=,|\.|;|\bson of\b|\bwife of\b|\bresiding\b|\baged\b|$)",
63
+ rf"\bthe\s+{role_l}\b\s*[:,-]?\s*(?:is\s+)?(?:mr\.?|mrs\.?|ms\.?)?\s*([A-Z][A-Za-z.\s]{{2,60}}?)(?=,|\.|;|\bson of\b|\bwife of\b|\bresiding\b|\baged\b|$)",
64
+ ]
65
+
66
+ for pat in patterns:
67
+ m = re.search(pat, t, flags=re.IGNORECASE)
68
+ if m:
69
+ name = " ".join(m.group(1).split())
70
+ # Filter generic captures like "hereinafter called"
71
+ if name and not re.search(r"hereinafter|called|referred|party|agreement", name, re.IGNORECASE):
72
+ return name[:80]
73
+
74
+ if re.search(rf"\b{role_l}\b", t, flags=re.IGNORECASE):
75
+ return f"{role.title()} mentioned (name not parsed)"
76
+ return "Not found"
77
+
78
+
79
+ def _clean_candidate_name(name: str) -> str:
80
+ name = re.sub(r"\s+", " ", str(name)).strip(" ,.;:-")
81
+ if not name:
82
+ return ""
83
+ banned = r"hereinafter|called|referred|party|agreement|vendor|vendee|purchaser|buyer|seller"
84
+ if re.search(banned, name, flags=re.IGNORECASE):
85
+ return ""
86
+ return name[:80]
87
+
88
+
89
def _extract_document_parties(text_data):
    """Scan the whole document text for Vendor/Vendee names.

    Tries common legal-intro phrasings first; when the role keyword exists
    but no clean name was captured, a role-present marker is used instead.
    """
    joined = "\n".join(chunk.get("text", "") for chunk in (text_data or []))
    compact = " ".join(joined.split())
    parties = {"Vendor": "Not found", "Vendee": "Not found"}

    # Typical intros:
    # "Mr. X ... hereinafter called the VENDOR"
    # "Y ... hereinafter called the VENDEE"
    role_patterns = {
        "Vendor": [
            r"(Mr\.?|Mrs\.?|Ms\.?)?\s*([A-Z][A-Za-z.\s]{2,80}?)\s+(?:son of|wife of|daughter of|residing at|aged about|hereinafter)\b[^.]{0,120}\bvendor\b",
            r"\bvendor\b\s*[:,-]?\s*(?:is\s+)?(?:Mr\.?|Mrs\.?|Ms\.?)?\s*([A-Z][A-Za-z.\s]{2,80})(?=,|\.|;|\bson of\b|\bwife of\b|\bresiding\b|\baged\b|$)",
        ],
        "Vendee": [
            r"(Mr\.?|Mrs\.?|Ms\.?)?\s*([A-Z][A-Za-z.\s]{2,80}?)\s+(?:son of|wife of|daughter of|residing at|aged about|hereinafter)\b[^.]{0,120}\bvendee\b",
            r"\bvendee\b\s*[:,-]?\s*(?:is\s+)?(?:Mr\.?|Mrs\.?|Ms\.?)?\s*([A-Z][A-Za-z.\s]{2,80})(?=,|\.|;|\bson of\b|\bwife of\b|\bresiding\b|\baged\b|$)",
        ],
    }

    for role, patterns in role_patterns.items():
        for pattern in patterns:
            hit = re.search(pattern, compact, flags=re.IGNORECASE)
            if hit is None:
                continue
            # Patterns differ in group count: take the name group, which is
            # group 2 when the honorific group participated in the match.
            raw = hit.group(2) if (hit.lastindex or 0) >= 2 else hit.group(1)
            cleaned = _clean_candidate_name(raw)
            if cleaned:
                parties[role] = cleaned
                break
        # Secondary fallback: explicit role in text without a parsable name.
        if parties[role] == "Not found" and re.search(rf"\b{role.lower()}\b", compact, flags=re.IGNORECASE):
            parties[role] = f"{role} mentioned (name not parsed)"

    return parties
123
+
124
+
125
def _extract_parties(text1: str, text2: str, doc_parties=None):
    """Resolve (vendor, vendee) names for a clause pair.

    Prefers names found in *text1*, then *text2*; when only a weak
    role-present marker was found, falls back to the document-level
    parties in *doc_parties* (as produced by _extract_document_parties).
    """
    def resolve(role: str) -> str:
        # Try the first clause, then the second.
        found = _extract_party_name(text1, role)
        if found != "Not found":
            return found
        return _extract_party_name(text2, role)

    vendor = resolve("vendor")
    vendee = resolve("vendee")

    if doc_parties:
        if vendor in ("Not found", "Vendor mentioned (name not parsed)") and doc_parties.get("Vendor"):
            vendor = doc_parties.get("Vendor")
        if vendee in ("Not found", "Vendee mentioned (name not parsed)") and doc_parties.get("Vendee"):
            vendee = doc_parties.get("Vendee")

    return vendor, vendee
141
+
142
+
143
@st.cache_resource
def load_verifier(backend: str, llama_model_path: str):
    """Construct — and cache across Streamlit reruns — the clause-pair
    verifier: 'llama' selects LlamaLegalVerifier, any other backend falls
    back to the cross-encoder NLI model."""
    if backend != "llama":
        return NLIVerifier(model_name="cross-encoder/nli-distilroberta-base")
    return LlamaLegalVerifier(model_path=llama_model_path)
148
+
149
+
150
def apply_theme():
    """Inject the app-wide CSS theme into the Streamlit page.

    Defines the font imports, color palette (:root variables), hero/card/
    tag/table styling and the fadeIn animation used throughout the UI.
    Call once per rerun before rendering themed markup; relies on
    unsafe_allow_html to pass the raw <style> block through st.markdown.
    """
    # The CSS is a single runtime string literal — editing it changes
    # presentation only, never app logic.
    st.markdown(
        """
        <style>
        @import url('https://fonts.googleapis.com/css2?family=Space+Grotesk:wght@400;500;600;700&display=swap');
        @import url('https://fonts.googleapis.com/css2?family=IBM+Plex+Mono:wght@400;500&display=swap');

        :root {
            --bg-soft: #f6fbff;
            --ink-900: #0b2f4a;
            --ink-700: #21506f;
            --accent-500: #0a84c6;
            --accent-700: #005b88;
            --mint-500: #2aa198;
            --warn-500: #c57b00;
            --danger-500: #c44736;
            --card-border: #dbeaf4;
        }

        html, body, [class*="css"] {
            font-family: 'Space Grotesk', sans-serif;
        }

        .stApp {
            background:
                radial-gradient(900px 420px at -15% -25%, #d7f0ff 0%, rgba(215,240,255,0) 62%),
                radial-gradient(900px 420px at 115% -20%, #fff2d8 0%, rgba(255,242,216,0) 62%),
                linear-gradient(180deg, #f8fcff 0%, #ffffff 55%);
        }

        .hero {
            border: 1px solid var(--card-border);
            background: linear-gradient(145deg, #f0f8ff 0%, #fffdf8 95%);
            border-radius: 18px;
            padding: 20px 22px;
            margin-bottom: 14px;
            box-shadow: 0 10px 24px rgba(9, 59, 102, 0.07);
            animation: fadeIn .45s ease-out;
        }

        .hero h2 {
            margin: 0;
            color: var(--ink-900);
            letter-spacing: .2px;
            font-weight: 700;
        }

        .hero p {
            margin: 8px 0 0 0;
            color: var(--ink-700);
        }

        .step {
            border-left: 4px solid var(--accent-500);
            background: #ffffff;
            border-radius: 8px;
            padding: 8px 12px;
            margin-bottom: 8px;
            font-weight: 500;
            color: #12344d;
            box-shadow: 0 6px 16px rgba(12, 53, 88, 0.05);
        }

        .mini-card {
            border: 1px solid var(--card-border);
            border-radius: 14px;
            background: #ffffff;
            padding: 14px 14px;
            margin-bottom: 10px;
            box-shadow: 0 6px 16px rgba(12, 53, 88, 0.04);
            animation: fadeIn .55s ease-out;
        }

        .mini-label {
            color: #43627c;
            font-size: 0.78rem;
            letter-spacing: .02em;
            text-transform: uppercase;
            margin-bottom: 6px;
        }

        .mini-value {
            color: #082d48;
            font-size: 1.45rem;
            font-weight: 700;
            line-height: 1.2;
        }

        .mono {
            font-family: 'IBM Plex Mono', monospace;
        }

        .tag {
            display: inline-block;
            border-radius: 999px;
            padding: 5px 10px;
            font-size: 0.75rem;
            font-weight: 600;
            margin-right: 6px;
            margin-top: 5px;
            border: 1px solid;
        }

        .tag-info { color: var(--accent-700); border-color: #b7def4; background: #ecf7ff; }
        .tag-ok { color: #186b64; border-color: #bceae5; background: #ecfffc; }
        .tag-warn { color: #8c5c00; border-color: #f2d9a4; background: #fff7e8; }
        .tag-risk { color: #9f3124; border-color: #efb5ad; background: #fff1ee; }

        [data-testid="stDataFrame"] div[role="table"] {
            border-radius: 12px;
            border: 1px solid #d6e8f4;
            overflow: hidden;
        }

        @keyframes fadeIn {
            from { opacity: 0; transform: translateY(8px); }
            to { opacity: 1; transform: translateY(0); }
        }
        </style>
        """,
        unsafe_allow_html=True,
    )
272
+
273
+
274
def login_page():
    """Render the landing screen: product intro on the left, auth forms on the right.

    On successful login this sets ``st.session_state.is_authenticated`` /
    ``username`` and triggers a rerun so ``main()`` routes to the upload page.
    """
    col_intro, col_auth = st.columns([1.15, 1], gap="large")
    with col_intro:
        st.markdown(
            """
            <div class="hero">
            <h2>Legal Semantic Integrity Portal</h2>
            <p>Interactive contract diagnostics with line-level visibility and legal conflict tracing.</p>
            <div>
            <span class="tag tag-info">Step 1: Secure Login</span>
            <span class="tag tag-ok">Step 2: Upload & Analyze</span>
            <span class="tag tag-warn">Step 3: Error-Line Dashboard</span>
            </div>
            </div>
            <div class="mini-card">
            <div class="mini-label">What You Get</div>
            <div class="mono">Duplicate clauses, legal contradictions, and exact page/line issue map.</div>
            </div>
            """,
            unsafe_allow_html=True,
        )

    with col_auth:
        st.markdown('<div class="step">Step 1 of 3: Login</div>', unsafe_allow_html=True)
        tab_login, tab_signup = st.tabs(["Sign In", "Create Account"])

        with tab_login:
            with st.form("login_form", clear_on_submit=False):
                username = st.text_input("Username")
                password = st.text_input("Password", type="password")
                submit = st.form_submit_button("Login")

            if submit:
                # authenticate_user returns (ok, message); message is shown either way.
                ok, message = authenticate_user(username, password)
                if ok:
                    st.session_state.is_authenticated = True
                    # Usernames are normalized (trimmed, lowercased) for session storage.
                    st.session_state.username = username.strip().lower()
                    st.success(message)
                    st.rerun()
                else:
                    st.error(message)

        with tab_signup:
            with st.form("signup_form", clear_on_submit=True):
                new_username = st.text_input("New Username")
                new_password = st.text_input("New Password", type="password")
                confirm_password = st.text_input("Confirm Password", type="password")
                create_submit = st.form_submit_button("Create Account")

            if create_submit:
                # Client-side confirmation check before hitting the user store.
                if new_password != confirm_password:
                    st.error("Passwords do not match.")
                else:
                    ok, message = create_user(new_username, new_password)
                    if ok:
                        st.success(message)
                    else:
                        st.error(message)

        st.caption("Local accounts are saved in data/users.db")
334
+
335
+
336
def run_analysis(uploaded_file, sensitivity: float, backend: str, llama_model_path: str):
    """Run the full clause-pair analysis pipeline over an uploaded document.

    Steps: extract text -> split into clauses -> embed + FAISS index ->
    compare each clause against its 5 nearest neighbours -> reconcile the
    rule-based label with the verifier (LLM or NLI) verdict.

    Args:
        uploaded_file: Streamlit UploadedFile; its name's extension selects the extractor.
        sensitivity: similarity threshold forwarded to analyze_pair.
        backend: "llama" for the local Llama verifier, otherwise NLI.
        llama_model_path: model path; resolved against PROJECT_ROOT when relative.

    Returns:
        (results, line_issues): per-pair finding dicts, and per-line issue rows
        (two rows per non-NO_CONFLICT pair, sorted by page then line).
        Returns ([], []) when no text or no clauses could be extracted.
    """
    file_ext = uploaded_file.name.split(".")[-1].lower()

    with st.spinner("Extracting text..."):
        text_data = extract_text_from_file(uploaded_file, file_ext)

    if not text_data:
        st.error("Could not extract text from this file.")
        return [], []

    with st.spinner("Extracting clauses..."):
        clauses = extract_clauses(text_data)
        # Document-level Vendor/Vendee fallback used when a clause pair
        # does not name the parties itself.
        doc_parties = _extract_document_parties(text_data)

    if not clauses:
        st.warning("No valid clauses were detected.")
        return [], []

    with st.spinner("Building semantic index..."):
        embeddings = generate_embeddings(clauses)
        index = create_faiss_index(embeddings)

    resolved_model_path = Path(llama_model_path)
    if not resolved_model_path.is_absolute():
        resolved_model_path = PROJECT_ROOT / resolved_model_path
    verifier = load_verifier(backend=backend, llama_model_path=str(resolved_model_path))

    results = []
    seen_pairs = set()  # (i, j) pairs already analyzed, i < j

    progress = st.progress(0)
    total = len(embeddings)

    for i, emb in enumerate(embeddings):
        # k=5 nearest neighbours per clause; returned as (indices, distances).
        idxs, dists = get_similar(index, emb, k=5)

        for j, dist in zip(idxs, dists):
            # Keep only i < j so each unordered pair (and self-matches) is
            # considered exactly once.
            if i >= j:
                continue
            if (i, j) in seen_pairs:
                continue
            seen_pairs.add((i, j))

            # Map a non-negative distance into (0, 1]; smaller distance -> higher similarity.
            similarity = 1 / (1 + dist)
            label, confidence, reason = analyze_pair(
                clauses[i]["text"],
                clauses[j]["text"],
                similarity,
                threshold=sensitivity,
            )

            # analyze_pair returning a falsy label means "no finding" for this pair.
            if not label:
                continue

            result = {
                "Label": label,
                "Confidence": float(confidence),
                "Reason": reason,
                "Clause 1": clauses[i]["text"],
                "Clause 2": clauses[j]["text"],
                "Page 1": clauses[i]["page"],
                "Line 1": clauses[i]["line"],
                "Page 2": clauses[j]["page"],
                "Line 2": clauses[j]["line"],
                "Location 1": f"Pg {clauses[i]['page']}, Ln {clauses[i]['line']}",
                "Location 2": f"Pg {clauses[j]['page']}, Ln {clauses[j]['line']}",
            }
            vendor_name, vendee_name = _extract_parties(
                result["Clause 1"], result["Clause 2"], doc_parties=doc_parties
            )
            result["Vendor"] = vendor_name
            result["Vendee"] = vendee_name

            # The llama verifier returns its own reason; the NLI backend only
            # returns a label, so a synthetic reason is built for it.
            if backend == "llama":
                _, llm_conf, llm_label, llm_reason = verifier.predict(result["Clause 1"], result["Clause 2"])
            else:
                _, llm_conf, llm_label = verifier.predict(result["Clause 1"], result["Clause 2"])
                llm_reason = f"NLI label: {llm_label}"

            if llm_label == "Neutral":
                # Do not erase strong rule-based findings just because LLM is neutral.
                if result["Label"] in ["NUMERIC_INCONSISTENCY", "LEGAL_CONFLICT"]:
                    result["Reason"] = f"{result['Reason']} | LLM neutral review"
                else:
                    result["Label"] = "NO_CONFLICT"
                    result["Reason"] = "LLM marked as neutral"
            elif llm_label == "Entailment":
                result["Label"] = "DUPLICATION"
                result["Reason"] = "LLM marked as entailment"
            elif llm_label == "Contradiction":
                # Only provisional rule labels are upgraded; confirmed labels keep
                # their type but adopt the verifier's reasoning.
                if result["Label"] in ["CANDIDATE", "QUALIFICATION"]:
                    result["Label"] = "LEGAL_CONFLICT"
                result["Reason"] = llm_reason

            # Final confidence is always the verifier's, overriding the rule score.
            result["Confidence"] = float(llm_conf)
            results.append(result)

        progress.progress((i + 1) / total)

    progress.empty()

    # Flatten pair findings into one row per involved line (both sides of the
    # pair), skipping pairs the verifier downgraded to NO_CONFLICT.
    line_issues = []
    for r in results:
        if r["Label"] == "NO_CONFLICT":
            continue
        line_issues.append(
            {
                "Issue Type": r["Label"],
                "Confidence": round(r["Confidence"], 4),
                "Page": r["Page 1"],
                "Line": r["Line 1"],
                "Snippet": r["Clause 1"][:160],
                "Reason": r["Reason"],
                "Vendor": r.get("Vendor", "Not found"),
                "Vendee": r.get("Vendee", "Not found"),
            }
        )
        line_issues.append(
            {
                "Issue Type": r["Label"],
                "Confidence": round(r["Confidence"], 4),
                "Page": r["Page 2"],
                "Line": r["Line 2"],
                "Snippet": r["Clause 2"][:160],
                "Reason": r["Reason"],
                "Vendor": r.get("Vendor", "Not found"),
                "Vendee": r.get("Vendee", "Not found"),
            }
        )

    line_issues.sort(key=lambda item: (item["Page"], item["Line"]))

    return results, line_issues
469
+
470
+
471
def upload_page():
    """Render the upload/scan screen (step 2) and kick off analysis.

    The sidebar maps a scan-mode radio to a sensitivity threshold; the verifier
    backend is intentionally pinned to the local llama model. On a successful
    run, results are stored in session state and the app reruns into the
    dashboard page.
    """
    st.markdown(
        """
        <div class="hero">
        <h2>Upload And Scan</h2>
        <p>Drop your legal document, choose model/backend, and run full semantic integrity analysis.</p>
        </div>
        """,
        unsafe_allow_html=True,
    )
    st.markdown('<div class="step">Step 2 of 3: Upload Document</div>', unsafe_allow_html=True)

    with st.sidebar:
        st.header("Scan Settings")
        scan_mode = st.radio(
            "Select scan mode",
            (
                "Standard Scan (Recommended)",
                "Deep Search (Fuzzy)",
                "Strict (Duplicates Only)",
            ),
            index=0,
        )

        # Scan mode -> similarity threshold: deeper scans use a lower
        # threshold (more candidate pairs), strict uses a higher one.
        if "Standard" in scan_mode:
            sensitivity = 0.60
        elif "Deep" in scan_mode:
            sensitivity = 0.50
        else:
            sensitivity = 0.85

        # Locked configuration requested by user:
        # always use local fine-tuned Llama verifier and hide controls.
        model_backend = "llama"
        llama_model_path = DEFAULT_MODEL_PATH
        st.caption("Verifier backend: llama (fixed)")
        st.caption("Local model: merged_tinyllama_instruction (fixed)")
        st.markdown(
            f"""
            <div class="mini-card">
            <div class="mini-label">Active Mode</div>
            <div class="mini-value">{scan_mode.split('(')[0].strip()}</div>
            <div class="mono">Sensitivity: {sensitivity} | Backend: {model_backend}</div>
            </div>
            """,
            unsafe_allow_html=True,
        )

    col_left, col_right = st.columns([1.35, 1], gap="large")
    with col_left:
        uploaded_file = st.file_uploader(
            "Upload a legal document",
            type=["pdf", "docx", "txt"],
            help="Supported files: PDF, DOCX, TXT",
        )
    with col_right:
        st.markdown(
            """
            <div class="mini-card">
            <div class="mini-label">Supported Inputs</div>
            <div class="mono">PDF / DOCX / TXT</div>
            </div>
            <div class="mini-card">
            <div class="mini-label">Output</div>
            <div class="mono">Pair Findings + Error-Line Dashboard + PDF/JSON Export</div>
            </div>
            """,
            unsafe_allow_html=True,
        )

    if uploaded_file is None:
        st.info("Upload a file to continue.")
        return

    # Persist the raw upload so the dashboard's PDF preview can re-render it
    # on later reruns without re-uploading.
    st.session_state.uploaded_name = uploaded_file.name
    st.session_state.uploaded_ext = uploaded_file.name.split(".")[-1].lower()
    st.session_state.uploaded_bytes = uploaded_file.getvalue()
    st.success(f"File ready: {uploaded_file.name}")

    if st.button("Run Full Analysis", type="primary"):
        try:
            results, line_issues = run_analysis(
                uploaded_file=uploaded_file,
                sensitivity=sensitivity,
                backend=model_backend,
                llama_model_path=llama_model_path,
            )
            st.session_state.results = results
            st.session_state.line_issues = line_issues
            st.session_state.analysis_done = True
            st.rerun()
        except Exception as exc:
            # Surface any pipeline failure in the UI instead of crashing the app.
            st.error(f"Analysis failed: {exc}")
564
+
565
+
566
def dashboard_page():
    """Render the findings dashboard (step 3) from session-state results.

    Sections: KPI cards, filterable issue analytics (pie chart + top-risk
    lines), a findings table, a line-level error map with jump/preview, and
    JSON/PDF export. Reads ``st.session_state.results`` / ``line_issues``
    produced by run_analysis.
    """
    st.markdown(
        """
        <div class="hero">
        <h2>Interactive Findings Dashboard</h2>
        <p>Trace conflicts by issue type, confidence, and exact line location.</p>
        </div>
        """,
        unsafe_allow_html=True,
    )
    st.markdown('<div class="step">Step 3 of 3: Dashboard</div>', unsafe_allow_html=True)

    results = st.session_state.results
    line_issues = st.session_state.line_issues

    if not results:
        st.warning("No results found.")
        return

    df = pd.DataFrame(results)
    df["Confidence"] = df["Confidence"].astype(float)

    # Everything except NO_CONFLICT counts as an actionable issue.
    issues_df = df[~df["Label"].isin(["NO_CONFLICT"])].copy()

    # --- KPI cards -------------------------------------------------------
    col1, col2, col3, col4 = st.columns(4)
    with col1:
        st.markdown(
            f"""
            <div class="mini-card">
            <div class="mini-label">User</div>
            <div class="mini-value">{st.session_state.username or "N/A"}</div>
            </div>
            """,
            unsafe_allow_html=True,
        )
    with col2:
        st.markdown(
            f"""
            <div class="mini-card">
            <div class="mini-label">Pairs Reviewed</div>
            <div class="mini-value">{len(df)}</div>
            </div>
            """,
            unsafe_allow_html=True,
        )
    with col3:
        st.markdown(
            f"""
            <div class="mini-card">
            <div class="mini-label">Detected Issues</div>
            <div class="mini-value">{len(issues_df)}</div>
            </div>
            """,
            unsafe_allow_html=True,
        )
    with col4:
        max_conf = float(df["Confidence"].max()) if not df.empty else 0.0
        st.markdown(
            f"""
            <div class="mini-card">
            <div class="mini-label">Max Confidence</div>
            <div class="mini-value">{max_conf:.2f}</div>
            </div>
            """,
            unsafe_allow_html=True,
        )

    # --- Analytics over line-level issues --------------------------------
    st.subheader("Issue Analytics Dashboard")
    if line_issues:
        line_df = pd.DataFrame(line_issues).copy()
        line_df["Page"] = line_df["Page"].astype(int)
        line_df["Line"] = line_df["Line"].astype(int)
        line_df["Confidence"] = line_df["Confidence"].astype(float)

        filter_col1, filter_col2, filter_col3 = st.columns([1.2, 1, 1], gap="large")
        with filter_col1:
            issue_types = sorted(line_df["Issue Type"].dropna().unique().tolist())
            issue_sel = st.multiselect("Issue Types", issue_types, default=issue_types)
        with filter_col2:
            conf_min = st.slider("Min Confidence (analytics)", 0.0, 1.0, 0.0, 0.01)
            page_min, page_max = int(line_df["Page"].min()), int(line_df["Page"].max())
            # A range slider with equal min and max would error; show a caption instead.
            if page_min == page_max:
                st.caption(f"Single issue page: {page_min}")
                page_sel = (page_min, page_max)
            else:
                page_sel = st.slider("Page Range (analytics)", page_min, page_max, (page_min, page_max))
        with filter_col3:
            vendors = ["All"] + sorted(line_df["Vendor"].dropna().astype(str).unique().tolist())
            vendees = ["All"] + sorted(line_df["Vendee"].dropna().astype(str).unique().tolist())
            vendor_sel = st.selectbox("Vendor", vendors, index=0)
            vendee_sel = st.selectbox("Vendee", vendees, index=0)

        # Apply all selected filters in sequence.
        filtered = line_df.copy()
        if issue_sel:
            filtered = filtered[filtered["Issue Type"].isin(issue_sel)]
        filtered = filtered[filtered["Confidence"] >= conf_min]
        filtered = filtered[(filtered["Page"] >= page_sel[0]) & (filtered["Page"] <= page_sel[1])]
        if vendor_sel != "All":
            filtered = filtered[filtered["Vendor"] == vendor_sel]
        if vendee_sel != "All":
            filtered = filtered[filtered["Vendee"] == vendee_sel]

        total_issues = len(filtered)
        # Conflict rate is over ALL analyzed pairs, not the filtered subset.
        conflict_rate = (len(issues_df) / len(df) * 100.0) if len(df) else 0.0
        top_issue = filtered["Issue Type"].mode().iloc[0] if not filtered.empty else "N/A"
        # Page whose issues have the highest mean confidence.
        highest_risk_page = (
            int(filtered.groupby("Page")["Confidence"].mean().idxmax()) if not filtered.empty else "N/A"
        )
        k1, k2, k3, k4 = st.columns(4)
        k1.metric("Filtered Issues", total_issues)
        k2.metric("Conflict Rate", f"{conflict_rate:.1f}%")
        k3.metric("Top Issue Type", top_issue)
        k4.metric("Highest Risk Page", highest_risk_page)

        if filtered.empty:
            st.warning("No analytics data for current filter.")
        else:
            pie_df = filtered["Issue Type"].value_counts().reset_index()
            pie_df.columns = ["Issue Type", "Count"]
            pie_fig = px.pie(
                pie_df,
                names="Issue Type",
                values="Count",
                title="Issue Type Split",
                hole=0.35,
            )
            pie_fig.update_layout(margin=dict(l=10, r=10, t=50, b=10))
            st.plotly_chart(pie_fig, use_container_width=True)

            top_lines = filtered.sort_values(by=["Confidence"], ascending=False).head(10)
            st.markdown("**Top 10 High-Risk Lines**")
            st.dataframe(
                top_lines[["Issue Type", "Confidence", "Page", "Line", "Vendor", "Vendee", "Snippet", "Reason"]],
                use_container_width=True,
            )
    else:
        st.info("No issue analytics data available.")

    tab_findings, tab_line_map, tab_export = st.tabs(
        ["Findings Table", "Error Line Map", "Export"]
    )

    # --- Findings table --------------------------------------------------
    with tab_findings:
        st.subheader("Detected Issues")
        left, right = st.columns([1, 1.1])
        with left:
            display_mode = st.radio(
                "Display mode",
                ["Issues Only", "All Analyzed Pairs"],
                horizontal=True,
            )
        with right:
            conf_threshold = st.slider("Minimum confidence", 0.0, 1.0, 0.0, 0.01)

        display_df = issues_df if display_mode == "Issues Only" else df
        display_df = display_df[display_df["Confidence"] >= conf_threshold]

        if display_mode == "Issues Only" and display_df.empty:
            st.warning("No issues match this filter.")
            st.info("Try lower confidence or switch to 'All Analyzed Pairs'.")
        elif display_df.empty:
            st.info("No analyzed pairs match this filter.")
        else:
            display_df = display_df.copy().reset_index(drop=True)
            display_df.insert(0, "S.No", range(1, len(display_df) + 1))
            cols = [
                "S.No",
                "Label",
                "Confidence",
                "Reason",
                "Location 1",
                "Location 2",
                "Clause 1",
                "Clause 2",
            ]
            st.dataframe(display_df[cols], use_container_width=True)

    # --- Line-level error map -------------------------------------------
    with tab_line_map:
        st.subheader("Error Line Dashboard")
        if line_issues:
            line_df = pd.DataFrame(line_issues)
            labels = sorted(line_df["Issue Type"].dropna().unique().tolist())
            selected = st.multiselect("Filter issue types", labels, default=labels)
            page_min = int(line_df["Page"].min()) if not line_df.empty else 1
            page_max = int(line_df["Page"].max()) if not line_df.empty else 1
            if page_min == page_max:
                st.caption(f"Only one page with issues: Page {page_min}")
                page_range = (page_min, page_max)
            else:
                page_range = st.slider("Page range", page_min, page_max, (page_min, page_max))

            if selected:
                line_df = line_df[line_df["Issue Type"].isin(selected)]
            line_df = line_df[(line_df["Page"] >= page_range[0]) & (line_df["Page"] <= page_range[1])]

            st.dataframe(line_df, use_container_width=True)

            st.markdown("**Issue Occurrence By Line With Parties**")
            by_line = line_df.copy()
            by_line = by_line.sort_values(by=["Page", "Line", "Confidence"], ascending=[True, True, False])
            st.dataframe(
                by_line[["Issue Type", "Page", "Line", "Vendor", "Vendee", "Confidence", "Reason"]],
                use_container_width=True,
            )

            st.subheader("Jump To Error Line")
            if not line_df.empty:
                line_df = line_df.reset_index(drop=True)
                line_df.insert(0, "Item", range(1, len(line_df) + 1))
                # Human-readable option label used as the selectbox key.
                line_df["Jump"] = line_df.apply(
                    lambda r: f"#{r['Item']} | Pg {int(r['Page'])}, Ln {int(r['Line'])} | {r['Issue Type']}",
                    axis=1,
                )
                selected_jump = st.selectbox("Select issue line", line_df["Jump"].tolist())
                chosen = line_df[line_df["Jump"] == selected_jump].iloc[0]

                c1, c2 = st.columns([1.1, 1], gap="large")
                with c1:
                    st.markdown(
                        f"""
                        <div class="mini-card">
                        <div class="mini-label">Selected Line</div>
                        <div class="mini-value">Pg {int(chosen['Page'])} · Ln {int(chosen['Line'])}</div>
                        <div class="mono">{chosen['Issue Type']} | Confidence: {float(chosen['Confidence']):.2f}</div>
                        </div>
                        """,
                        unsafe_allow_html=True,
                    )
                    st.caption("Snippet")
                    st.code(str(chosen["Snippet"]), language="text")
                    st.caption("Reason")
                    st.write(str(chosen["Reason"]))

                with c2:
                    is_pdf = st.session_state.uploaded_ext == "pdf"
                    if is_pdf and st.session_state.uploaded_bytes:
                        st.caption("PDF Preview (jumped to selected page)")
                        page_number = int(chosen["Page"])
                        # Embed the stored upload as a base64 data URI; the
                        # #page fragment asks the browser viewer to open there.
                        pdf_b64 = base64.b64encode(st.session_state.uploaded_bytes).decode("utf-8")
                        pdf_html = f"""
                        <iframe
                        src="data:application/pdf;base64,{pdf_b64}#page={page_number}&zoom=110"
                        width="100%"
                        height="520"
                        style="border:1px solid #d6e8f4; border-radius: 10px;"
                        ></iframe>
                        """
                        st.markdown(pdf_html, unsafe_allow_html=True)
                    else:
                        st.info("Inline PDF preview is available for PDF uploads. Current file is not PDF.")
        else:
            st.info("No line-level issues to display.")

    # --- Export ----------------------------------------------------------
    with tab_export:
        st.subheader("Download Reports")
        json_payload = json.dumps(results, indent=2)
        st.download_button(
            label="Download JSON Report",
            data=json_payload,
            file_name="semantic_integrity_report.json",
            mime="application/json",
        )
        # The PDF report only includes confirmed issues.
        pdf_bytes = generate_pdf_report([r for r in results if r["Label"] != "NO_CONFLICT"])
        st.download_button(
            label="Download PDF Report",
            data=pdf_bytes,
            file_name="semantic_integrity_report.pdf",
            mime="application/pdf",
        )

    # Reset the analysis portion of the session to return to the upload page.
    if st.button("Analyze Another Document"):
        st.session_state.analysis_done = False
        st.session_state.results = []
        st.session_state.line_issues = []
        st.rerun()
841
+
842
+
843
def main():
    """App entry point: configure the page, then route between the login,
    upload, and dashboard screens based on session state."""
    st.set_page_config(page_title=APP_TITLE, layout="wide")
    apply_theme()
    init_state()

    title_col, logout_col = st.columns([5, 1])
    with title_col:
        st.title(APP_TITLE)
    with logout_col:
        # Logout is only offered once authenticated; it wipes the whole
        # session back to the pre-login state and reruns.
        logged_in = st.session_state.is_authenticated
        if logged_in and st.button("Logout"):
            st.session_state.is_authenticated = False
            st.session_state.username = ""
            st.session_state.analysis_done = False
            st.session_state.results = []
            st.session_state.line_issues = []
            st.rerun()

    if not st.session_state.is_authenticated:
        login_page()
        return

    # Authenticated: show results if an analysis has completed, else upload.
    if st.session_state.analysis_done:
        dashboard_page()
    else:
        upload_page()
868
+
869
+
870
+ if __name__ == "__main__":
871
+ main()