Rivalcoder committed · Commit fbf4182
1 parent: 540d948

Use of Playwright
- __pycache__/app.cpython-312.pyc +0 -0
- __pycache__/kanon_api.cpython-312.pyc +0 -0
- __pycache__/predictor.cpython-312.pyc +0 -0
- __pycache__/vectorstore.cpython-312.pyc +0 -0
- app.py +1 -1
- kanon_api.py +46 -73
- predictor.py +17 -21
__pycache__/app.cpython-312.pyc
ADDED
Binary file (1.66 kB)

__pycache__/kanon_api.cpython-312.pyc
ADDED
Binary file (3.48 kB)

__pycache__/predictor.cpython-312.pyc
ADDED
Binary file (5.37 kB)

__pycache__/vectorstore.cpython-312.pyc
ADDED
Binary file (2.27 kB)
app.py
CHANGED
@@ -12,7 +12,7 @@ class CaseRequest(BaseModel):
 @app.post("/predict")
 async def predict(case_request: CaseRequest):
     user_case = case_request.case
-    result = predict_outcome(user_case)
+    result = await predict_outcome(user_case)
     return {"prediction": result}
 
 @app.get("/health")
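
Since predict now awaits the async predict_outcome, the handler runs as a coroutine on FastAPI's event loop instead of blocking a worker thread. A minimal sketch of exercising the updated endpoint; the localhost address and the uvicorn launch are assumptions, not part of the commit:

# Sketch only: assumes the app is served locally, e.g. `uvicorn app:app`.
import requests

resp = requests.post(
    "http://localhost:8000/predict",  # assumed default uvicorn host/port
    json={"case": "Compensation claim for injuries caused by a defective vehicle"},
)
print(resp.json())  # -> {"prediction": {...}}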
kanon_api.py
CHANGED
@@ -1,19 +1,33 @@
-import …
+from playwright.async_api import async_playwright
 from bs4 import BeautifulSoup
-
+import asyncio
 
 BASE_URL = "https://indiankanoon.org"
 
-def search_cases(query, max_results=10):
-    """…
-    …
+HEADERS = {
+    "User-Agent": (
+        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+        "AppleWebKit/537.36 (KHTML, like Gecko) "
+        "Chrome/122.0.0.0 Safari/537.36"
+    ),
+    "Accept-Language": "en-US,en;q=0.9",
+}
+
+async def safe_get_content(url: str) -> str:
+    async with async_playwright() as p:
+        browser = await p.chromium.launch(headless=True)
+        context = await browser.new_context()
+        page = await context.new_page()
+        await page.goto(url, wait_until="domcontentloaded")
+        html_content = await page.content()
+        await browser.close()
+        return html_content
+
+async def search_cases(query, max_results=10):
     search_url = f"{BASE_URL}/search/?formInput={query}"
-    …
-    response.raise_for_status()
+    html_content = await safe_get_content(search_url)
 
-    soup = BeautifulSoup(…
+    soup = BeautifulSoup(html_content, "html.parser")
     results = []
 
     for result in soup.select(".result_title")[:max_results]:
@@ -25,68 +39,27 @@ def search_cases(query, max_results=10):
         })
     return results
 
-def get_case_content(case_url):
-    """…
-    try:
-        …
-        if text:
-            return text
-
-        paragraphs = soup.find_all("p")
-        if paragraphs:
-            return "\n".join(p.get_text(strip=True) for p in paragraphs)
-
-    except Exception:
-        return None
+async def get_case_content(case_url):
+    html_content = await safe_get_content(case_url)
+    soup = BeautifulSoup(html_content, "html.parser")
+
+    selectors = [
+        "div#maincontent",
+        "div.content",
+        "pre",
+        "div.article_text",
+        "div.judgement-text",
+    ]
+
+    for sel in selectors:
+        content_div = soup.select_one(sel)
+        if content_div:
+            text = content_div.get_text(separator="\n", strip=True)
+            if text:
+                return text
+
+    paragraphs = soup.find_all("p")
+    if paragraphs:
+        return "\n".join(p.get_text(strip=True) for p in paragraphs)
 
     return "No content found."
-
-
-# =========================
-# Parallel Case Fetching
-# =========================
-def fetch_case_text(case):
-    """
-    Fetch case content safely for a single case dictionary.
-    """
-    case['text'] = get_case_content(case['url'])
-    return case
-
-def fetch_cases_parallel(cases, max_workers=5):
-    """
-    Fetch multiple cases in parallel using ThreadPoolExecutor.
-    """
-    results = []
-    with ThreadPoolExecutor(max_workers=max_workers) as executor:
-        futures = {executor.submit(fetch_case_text, case): case for case in cases}
-        for future in as_completed(futures):
-            results.append(future.result())
-    return results
-
-
-# # Example usage
-# query = "Cheat in Neet exam"
-# cases = search_cases(query, max_results=5)
-# # Fetch content in parallel
-# cases = fetch_cases_parallel(cases, max_workers=5)
-# for case in cases:
-#     print(f"Title: {case['title']}")
-#     print(f"Content snippet: {case['text'][:1000]}...\n")
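
Two notes on the rewritten module. HEADERS is defined but never passed to Playwright; if the custom user agent still matters, it could be applied when the context is created, e.g. browser.new_context(user_agent=HEADERS["User-Agent"], extra_http_headers={"Accept-Language": HEADERS["Accept-Language"]}). Also, every safe_get_content call launches and closes a fresh headless Chromium, which is expensive when many pages are fetched back to back. A quick smoke test of the new async API; the query string is an arbitrary example, and the title/url result keys are assumed to match the old commented example usage:

# Sketch only: requires `playwright install chromium` to have been run once.
import asyncio
from kanon_api import search_cases, get_case_content

async def main():
    # "motor accident compensation" is an arbitrary example query
    cases = await search_cases("motor accident compensation", max_results=3)
    for case in cases:
        text = await get_case_content(case["url"])
        print(case["title"], "->", len(text), "characters")

asyncio.run(main())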
predictor.py
CHANGED
@@ -1,14 +1,14 @@
-from kanon_api import search_cases, get_case_content
+from kanon_api import search_cases, get_case_content  # now async versions
 from vectorstore import create_vector_store
 from google import genai
 import os
 import re
 import json
-
+import asyncio
 
 client = genai.Client(api_key=os.getenv("GOOGLE_API_KEY"))
 
-def predict_outcome(user_case: str):
+async def predict_outcome(user_case: str):
     """
     Predict likely case outcome using AI based on related past cases.
     """
@@ -32,28 +32,28 @@ Example output:
 "Liability for defective vehicles and accident compensation."
 "About compensation for deaths and injuries due to a road accident caused by a vehicle defect"
 """
-
     search_chat = client.chats.create(model="gemini-2.5-flash-lite")
     query_response = search_chat.send_message(search_prompt)
-
     query = query_response.text.strip().replace("\n", " ").strip('"').strip("'")
     print("Generated legal search query:", query)
 
-    # 2️⃣ Search related cases
-    related_cases_data = search_cases(query, max_results=10)
+    # 2️⃣ Search related cases (async)
+    related_cases_data = await search_cases(query, max_results=10)
 
-    # 3️⃣ Fetch full text for each result
-    for case in related_cases_data…
-        …
+    # 3️⃣ Fetch full text for each result concurrently
+    tasks = [get_case_content(case["url"]) for case in related_cases_data]
+    texts = await asyncio.gather(*tasks)
+    for case, text in zip(related_cases_data, texts):
+        case["text"] = text
 
     related_cases_texts = [case["text"] for case in related_cases_data if case.get("text")]
     if not related_cases_texts:
-        return "No relevant cases found to analyze."
+        return {"error": "No relevant cases found to analyze."}
 
     # 4️⃣ Create vector store
     vectorstore = create_vector_store(related_cases_texts)
     if not vectorstore:
-        return "Vector store creation failed."
+        return {"error": "Vector store creation failed."}
 
     # 5️⃣ Retrieve relevant cases
     retriever = vectorstore.as_retriever()
@@ -61,7 +61,7 @@ Example output:
     combined_text = "\n".join([d.page_content for d in relevant_docs])
 
     if not combined_text.strip():
-        return "No relevant context could be found from retrieved cases."
+        return {"error": "No relevant context could be found from retrieved cases."}
 
     # 6️⃣ Generate final prediction
     prompt = f"""
@@ -94,19 +94,15 @@ Do **not** include any explanation outside the JSON.
 
     raw_text = response.text.strip()
 
-    # …
+    # Clean ```json``` or wrapping quotes
    raw_text = re.sub(r"^```json\s*|^```|```$", "", raw_text, flags=re.IGNORECASE).strip()
-
-    # 2️⃣ Remove wrapping quotes if present
     if (raw_text.startswith('"') and raw_text.endswith('"')) or (raw_text.startswith("'") and raw_text.endswith("'")):
-        raw_text = raw_text[1:-1].strip()
-        # Unescape quotes inside
-        raw_text = raw_text.replace('\\"', '"').replace("\\'", "'")
+        raw_text = raw_text[1:-1].strip().replace('\\"', '"').replace("\\'", "'")
 
-    # …
+    # Parse JSON
     try:
         result_json = json.loads(raw_text)
     except json.JSONDecodeError:
         result_json = {"error": "AI did not return valid JSON", "raw_response": raw_text}
 
-    return result_json
+    return result_json
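
One hardening worth considering, not in the commit: asyncio.gather propagates the first exception, so a single failed Playwright navigation in step 3️⃣ aborts the whole prediction. Passing return_exceptions=True keeps the successful fetches, and the existing case.get("text") filter then drops the failures naturally. A sketch of that step with the change:

# Sketch only: tolerate individual fetch failures instead of failing the batch.
tasks = [get_case_content(case["url"]) for case in related_cases_data]
texts = await asyncio.gather(*tasks, return_exceptions=True)
for case, text in zip(related_cases_data, texts):
    # exceptions come back as values; store None so downstream filtering skips them
    case["text"] = None if isinstance(text, BaseException) else text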