Rivalcoder committed
Commit fbf4182 · 1 Parent(s): 540d948

Use of Playwright

__pycache__/app.cpython-312.pyc ADDED
Binary file (1.66 kB)

__pycache__/kanon_api.cpython-312.pyc ADDED
Binary file (3.48 kB)

__pycache__/predictor.cpython-312.pyc ADDED
Binary file (5.37 kB)

__pycache__/vectorstore.cpython-312.pyc ADDED
Binary file (2.27 kB)
app.py CHANGED
@@ -12,7 +12,7 @@ class CaseRequest(BaseModel):
 @app.post("/predict")
 async def predict(case_request: CaseRequest):
     user_case = case_request.case
-    result = predict_outcome(user_case)
+    result = await predict_outcome(user_case)
     return {"prediction": result}
 
 @app.get("/health")
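Since predict_outcome is now a coroutine (see predictor.py below), the route must await it; FastAPI runs async def endpoints directly on its event loop, so this one-word change is all app.py needs. A quick smoke test of the endpoint, assuming the app is served locally with uvicorn app:app on the default port 8000 (the serve command is not part of this diff):

# Smoke test for POST /predict. Assumes uvicorn app:app is listening on
# localhost:8000 (not shown in this commit).
import requests

resp = requests.post(
    "http://localhost:8000/predict",
    json={"case": "Compensation for injuries caused by a defective vehicle"},
    timeout=300,  # headless-browser scraping plus two LLM calls is slow
)
resp.raise_for_status()
print(resp.json()["prediction"])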
kanon_api.py CHANGED
@@ -1,19 +1,33 @@
-import requests
+from playwright.async_api import async_playwright
 from bs4 import BeautifulSoup
-from concurrent.futures import ThreadPoolExecutor, as_completed
+import asyncio
 
 BASE_URL = "https://indiankanoon.org"
 
-def search_cases(query, max_results=10):
-    """
-    Scrape search results from Indian Kanoon website.
-    Returns a list of case URLs and titles.
-    """
+HEADERS = {
+    "User-Agent": (
+        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+        "AppleWebKit/537.36 (KHTML, like Gecko) "
+        "Chrome/122.0.0.0 Safari/537.36"
+    ),
+    "Accept-Language": "en-US,en;q=0.9",
+}
+
+async def safe_get_content(url: str) -> str:
+    async with async_playwright() as p:
+        browser = await p.chromium.launch(headless=True)
+        context = await browser.new_context()
+        page = await context.new_page()
+        await page.goto(url, wait_until="domcontentloaded")
+        html_content = await page.content()
+        await browser.close()
+        return html_content
+
+async def search_cases(query, max_results=10):
     search_url = f"{BASE_URL}/search/?formInput={query}"
-    response = requests.get(search_url)
-    response.raise_for_status()
+    html_content = await safe_get_content(search_url)
 
-    soup = BeautifulSoup(response.text, "html.parser")
+    soup = BeautifulSoup(html_content, "html.parser")
    results = []
 
     for result in soup.select(".result_title")[:max_results]:
@@ -25,68 +39,27 @@ def search_cases(query, max_results=10):
         })
     return results
 
-
-def get_case_content(case_url):
-    """
-    Scrape the full text of a case from its URL.
-    """
-    try:
-        response = requests.get(case_url)
-        response.raise_for_status()
-        soup = BeautifulSoup(response.text, "html.parser")
-
-        selectors = [
-            "div#maincontent",
-            "div.content",
-            "pre",
-            "div.article_text",
-            "div.judgement-text"
-        ]
-
-        for sel in selectors:
-            content_div = soup.select_one(sel)
-            if content_div:
-                text = content_div.get_text(separator="\n", strip=True)
-                if text:
-                    return text
-
-        paragraphs = soup.find_all("p")
-        if paragraphs:
-            return "\n".join(p.get_text(strip=True) for p in paragraphs)
-
-    except Exception:
-        return None
+async def get_case_content(case_url):
+    html_content = await safe_get_content(case_url)
+    soup = BeautifulSoup(html_content, "html.parser")
+
+    selectors = [
+        "div#maincontent",
+        "div.content",
+        "pre",
+        "div.article_text",
+        "div.judgement-text",
+    ]
+
+    for sel in selectors:
+        content_div = soup.select_one(sel)
+        if content_div:
+            text = content_div.get_text(separator="\n", strip=True)
+            if text:
+                return text
+
+    paragraphs = soup.find_all("p")
+    if paragraphs:
+        return "\n".join(p.get_text(strip=True) for p in paragraphs)
 
     return "No content found."
-
-
-# =========================
-# Parallel Case Fetching
-# =========================
-def fetch_case_text(case):
-    """
-    Fetch case content safely for a single case dictionary.
-    """
-    case['text'] = get_case_content(case['url'])
-    return case
-
-def fetch_cases_parallel(cases, max_workers=5):
-    """
-    Fetch multiple cases in parallel using ThreadPoolExecutor.
-    """
-    results = []
-    with ThreadPoolExecutor(max_workers=max_workers) as executor:
-        futures = {executor.submit(fetch_case_text, case): case for case in cases}
-        for future in as_completed(futures):
-            results.append(future.result())
-    return results
-
-
-# # Example usage
-# query = "Cheat in Neet exam"
-# cases = search_cases(query, max_results=5)
-# # Fetch content in parallel
-# cases = fetch_cases_parallel(cases, max_workers=5)
-# for case in cases:
-#     print(f"Title: {case['title']}")
-#     print(f"Content snippet: {case['text'][:1000]}...\n")
predictor.py CHANGED
@@ -1,14 +1,14 @@
-from kanon_api import search_cases, get_case_content
+from kanon_api import search_cases, get_case_content  # now async versions
 from vectorstore import create_vector_store
 from google import genai
 import os
 import re
 import json
-
+import asyncio
 
 client = genai.Client(api_key=os.getenv("GOOGLE_API_KEY"))
 
-def predict_outcome(user_case: str):
+async def predict_outcome(user_case: str):
     """
     Predict likely case outcome using AI based on related past cases.
     """
@@ -32,28 +32,28 @@ Example output:
 "Liability for defective vehicles and accident compensation."
 "About compensation for deaths and injuries due to a road accident caused by a vehicle defect"
 """
-
     search_chat = client.chats.create(model="gemini-2.5-flash-lite")
     query_response = search_chat.send_message(search_prompt)
-
     query = query_response.text.strip().replace("\n", " ").strip('"').strip("'")
     print("Generated legal search query:", query)
 
-    # 2️⃣ Search related cases
-    related_cases_data = search_cases(query, max_results=10)
+    # 2️⃣ Search related cases (async)
+    related_cases_data = await search_cases(query, max_results=10)
 
-    # 3️⃣ Fetch full text for each result
-    for case in related_cases_data:
-        case['text'] = get_case_content(case['url'])
+    # 3️⃣ Fetch full text for each result concurrently
+    tasks = [get_case_content(case["url"]) for case in related_cases_data]
+    texts = await asyncio.gather(*tasks)
+    for case, text in zip(related_cases_data, texts):
+        case["text"] = text
 
     related_cases_texts = [case["text"] for case in related_cases_data if case.get("text")]
     if not related_cases_texts:
-        return "No relevant cases found to analyze."
+        return {"error": "No relevant cases found to analyze."}
 
     # 4️⃣ Create vector store
     vectorstore = create_vector_store(related_cases_texts)
     if not vectorstore:
-        return "Vector store creation failed."
+        return {"error": "Vector store creation failed."}
 
     # 5️⃣ Retrieve relevant cases
     retriever = vectorstore.as_retriever()
@@ -61,7 +61,7 @@ Example output:
     combined_text = "\n".join([d.page_content for d in relevant_docs])
 
     if not combined_text.strip():
-        return "No relevant context could be found from retrieved cases."
+        return {"error": "No relevant context could be found from retrieved cases."}
 
     # 6️⃣ Generate final prediction
     prompt = f"""
@@ -94,19 +94,15 @@ Do **not** include any explanation outside the JSON.
 
     raw_text = response.text.strip()
 
-    # 1️⃣ Remove ```json or ``` at start/end
+    # Clean ```json``` or wrapping quotes
    raw_text = re.sub(r"^```json\s*|^```|```$", "", raw_text, flags=re.IGNORECASE).strip()
-
-    # 2️⃣ Remove wrapping quotes if present
     if (raw_text.startswith('"') and raw_text.endswith('"')) or (raw_text.startswith("'") and raw_text.endswith("'")):
-        raw_text = raw_text[1:-1].strip()
-        # Unescape quotes inside
-        raw_text = raw_text.replace('\\"', '"').replace("\\'", "'")
+        raw_text = raw_text[1:-1].strip().replace('\\"', '"').replace("\\'", "'")
 
-    # 3️⃣ Try parsing as JSON
+    # Parse JSON
     try:
         result_json = json.loads(raw_text)
     except json.JSONDecodeError:
         result_json = {"error": "AI did not return valid JSON", "raw_response": raw_text}
 
-    return result_json
+    return result_json
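One behavioral note on the concurrent fetch: asyncio.gather fails fast by default, so a single navigation timeout inside get_case_content aborts the whole prediction. A small self-contained sketch of the more forgiving pattern, using return_exceptions=True (standard asyncio); fetch here is a stand-in for get_case_content, not code from this commit:

# Demonstrates gather(return_exceptions=True): failures come back as values
# instead of cancelling the remaining tasks, so one bad page cannot sink
# the whole batch.
import asyncio

async def fetch(url: str) -> str:
    # Stand-in for get_case_content; real fetches can raise on timeouts.
    if "bad" in url:
        raise TimeoutError(url)
    return f"<case text from {url}>"

async def main():
    urls = ["https://indiankanoon.org/doc/1/", "https://bad.example/doc/2/"]
    texts = await asyncio.gather(*(fetch(u) for u in urls), return_exceptions=True)
    ok = [t for t in texts if isinstance(t, str)]
    print(f"{len(ok)}/{len(urls)} pages fetched")

asyncio.run(main())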