Rivalcoder committed
Commit fbf4182 · 1 Parent(s): 540d948

Use of Playwright

__pycache__/app.cpython-312.pyc ADDED
Binary file (1.66 kB)

__pycache__/kanon_api.cpython-312.pyc ADDED
Binary file (3.48 kB)

__pycache__/predictor.cpython-312.pyc ADDED
Binary file (5.37 kB)

__pycache__/vectorstore.cpython-312.pyc ADDED
Binary file (2.27 kB)
app.py CHANGED
@@ -12,7 +12,7 @@ class CaseRequest(BaseModel):
 @app.post("/predict")
 async def predict(case_request: CaseRequest):
     user_case = case_request.case
-    result = predict_outcome(user_case)
+    result = await predict_outcome(user_case)
     return {"prediction": result}
 
 @app.get("/health")
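Since predict_outcome is now a coroutine (see predictor.py below), the route must await it; FastAPI runs async def endpoints directly on its event loop, so this one-word change is all app.py needs. A quick smoke test of the endpoint, assuming the app is served locally with uvicorn app:app on the default port 8000 (the serve command is not part of this diff):

# Smoke test for POST /predict. Assumes uvicorn app:app is listening on
# localhost:8000 (not shown in this commit).
import requests

resp = requests.post(
    "http://localhost:8000/predict",
    json={"case": "Compensation for injuries caused by a defective vehicle"},
    timeout=300,  # headless-browser scraping plus two LLM calls is slow
)
resp.raise_for_status()
print(resp.json()["prediction"])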
kanon_api.py CHANGED
@@ -1,19 +1,33 @@
-import requests
+from playwright.async_api import async_playwright
 from bs4 import BeautifulSoup
-from concurrent.futures import ThreadPoolExecutor, as_completed
+import asyncio
 
 BASE_URL = "https://indiankanoon.org"
 
-def search_cases(query, max_results=10):
-    """
-    Scrape search results from Indian Kanoon website.
-    Returns a list of case URLs and titles.
-    """
+HEADERS = {
+    "User-Agent": (
+        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+        "AppleWebKit/537.36 (KHTML, like Gecko) "
+        "Chrome/122.0.0.0 Safari/537.36"
+    ),
+    "Accept-Language": "en-US,en;q=0.9",
+}
+
+async def safe_get_content(url: str) -> str:
+    async with async_playwright() as p:
+        browser = await p.chromium.launch(headless=True)
+        context = await browser.new_context()
+        page = await context.new_page()
+        await page.goto(url, wait_until="domcontentloaded")
+        html_content = await page.content()
+        await browser.close()
+        return html_content
+
+async def search_cases(query, max_results=10):
     search_url = f"{BASE_URL}/search/?formInput={query}"
-    response = requests.get(search_url)
-    response.raise_for_status()
+    html_content = await safe_get_content(search_url)
 
-    soup = BeautifulSoup(response.text, "html.parser")
+    soup = BeautifulSoup(html_content, "html.parser")
    results = []
 
     for result in soup.select(".result_title")[:max_results]:
@@ -25,68 +39,27 @@ def search_cases(query, max_results=10):
         })
     return results
 
-
-def get_case_content(case_url):
-    """
-    Scrape the full text of a case from its URL.
-    """
-    try:
-        response = requests.get(case_url)
-        response.raise_for_status()
-        soup = BeautifulSoup(response.text, "html.parser")
-
-        selectors = [
-            "div#maincontent",
-            "div.content",
-            "pre",
-            "div.article_text",
-            "div.judgement-text"
-        ]
-
-        for sel in selectors:
-            content_div = soup.select_one(sel)
-            if content_div:
-                text = content_div.get_text(separator="\n", strip=True)
-                if text:
-                    return text
-
-        paragraphs = soup.find_all("p")
-        if paragraphs:
-            return "\n".join(p.get_text(strip=True) for p in paragraphs)
-
-    except Exception:
-        return None
+async def get_case_content(case_url):
+    html_content = await safe_get_content(case_url)
+    soup = BeautifulSoup(html_content, "html.parser")
+
+    selectors = [
+        "div#maincontent",
+        "div.content",
+        "pre",
+        "div.article_text",
+        "div.judgement-text",
+    ]
+
+    for sel in selectors:
+        content_div = soup.select_one(sel)
+        if content_div:
+            text = content_div.get_text(separator="\n", strip=True)
+            if text:
+                return text
+
+    paragraphs = soup.find_all("p")
+    if paragraphs:
+        return "\n".join(p.get_text(strip=True) for p in paragraphs)
 
     return "No content found."
-
-
-# =========================
-# Parallel Case Fetching
-# =========================
-def fetch_case_text(case):
-    """
-    Fetch case content safely for a single case dictionary.
-    """
-    case['text'] = get_case_content(case['url'])
-    return case
-
-def fetch_cases_parallel(cases, max_workers=5):
-    """
-    Fetch multiple cases in parallel using ThreadPoolExecutor.
-    """
-    results = []
-    with ThreadPoolExecutor(max_workers=max_workers) as executor:
-        futures = {executor.submit(fetch_case_text, case): case for case in cases}
-        for future in as_completed(futures):
-            results.append(future.result())
-    return results
-
-
-# # Example usage
-# query = "Cheat in Neet exam"
-# cases = search_cases(query, max_results=5)
-# # Fetch content in parallel
-# cases = fetch_cases_parallel(cases, max_workers=5)
-# for case in cases:
-#     print(f"Title: {case['title']}")
-#     print(f"Content snippet: {case['text'][:1000]}...\n")
predictor.py CHANGED
@@ -1,14 +1,14 @@
-from kanon_api import search_cases, get_case_content
+from kanon_api import search_cases, get_case_content  # now async versions
 from vectorstore import create_vector_store
 from google import genai
 import os
 import re
 import json
-
+import asyncio
 
 client = genai.Client(api_key=os.getenv("GOOGLE_API_KEY"))
 
-def predict_outcome(user_case: str):
+async def predict_outcome(user_case: str):
     """
     Predict likely case outcome using AI based on related past cases.
     """
@@ -32,28 +32,28 @@ Example output:
 "Liability for defective vehicles and accident compensation."
 "About compensation for deaths and injuries due to a road accident caused by a vehicle defect"
 """
-
     search_chat = client.chats.create(model="gemini-2.5-flash-lite")
     query_response = search_chat.send_message(search_prompt)
-
     query = query_response.text.strip().replace("\n", " ").strip('"').strip("'")
     print("Generated legal search query:", query)
 
-    # 2️⃣ Search related cases
-    related_cases_data = search_cases(query, max_results=10)
+    # 2️⃣ Search related cases (async)
+    related_cases_data = await search_cases(query, max_results=10)
 
-    # 3️⃣ Fetch full text for each result
-    for case in related_cases_data:
-        case['text'] = get_case_content(case['url'])
+    # 3️⃣ Fetch full text for each result concurrently
+    tasks = [get_case_content(case["url"]) for case in related_cases_data]
+    texts = await asyncio.gather(*tasks)
+    for case, text in zip(related_cases_data, texts):
+        case["text"] = text
 
     related_cases_texts = [case["text"] for case in related_cases_data if case.get("text")]
     if not related_cases_texts:
-        return "No relevant cases found to analyze."
+        return {"error": "No relevant cases found to analyze."}
 
     # 4️⃣ Create vector store
     vectorstore = create_vector_store(related_cases_texts)
     if not vectorstore:
-        return "Vector store creation failed."
+        return {"error": "Vector store creation failed."}
 
     # 5️⃣ Retrieve relevant cases
     retriever = vectorstore.as_retriever()
@@ -61,7 +61,7 @@ Example output:
     combined_text = "\n".join([d.page_content for d in relevant_docs])
 
     if not combined_text.strip():
-        return "No relevant context could be found from retrieved cases."
+        return {"error": "No relevant context could be found from retrieved cases."}
 
     # 6️⃣ Generate final prediction
     prompt = f"""
@@ -94,19 +94,15 @@ Do **not** include any explanation outside the JSON.
 
     raw_text = response.text.strip()
 
-    # 1️⃣ Remove ```json or ``` at start/end
+    # Clean ```json``` or wrapping quotes
    raw_text = re.sub(r"^```json\s*|^```|```$", "", raw_text, flags=re.IGNORECASE).strip()
-
-    # 2️⃣ Remove wrapping quotes if present
     if (raw_text.startswith('"') and raw_text.endswith('"')) or (raw_text.startswith("'") and raw_text.endswith("'")):
-        raw_text = raw_text[1:-1].strip()
-        # Unescape quotes inside
-        raw_text = raw_text.replace('\\"', '"').replace("\\'", "'")
+        raw_text = raw_text[1:-1].strip().replace('\\"', '"').replace("\\'", "'")
 
-    # 3️⃣ Try parsing as JSON
+    # Parse JSON
     try:
         result_json = json.loads(raw_text)
     except json.JSONDecodeError:
         result_json = {"error": "AI did not return valid JSON", "raw_response": raw_text}
 
-    return result_json
+    return result_json
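One behavioral note on the concurrent fetch: asyncio.gather fails fast by default, so a single navigation timeout inside get_case_content aborts the whole prediction. A small self-contained sketch of the more forgiving pattern, using return_exceptions=True (standard asyncio); fetch here is a stand-in for get_case_content, not code from this commit:

# Demonstrates gather(return_exceptions=True): failures come back as values
# instead of cancelling the remaining tasks, so one bad page cannot sink
# the whole batch.
import asyncio

async def fetch(url: str) -> str:
    # Stand-in for get_case_content; real fetches can raise on timeouts.
    if "bad" in url:
        raise TimeoutError(url)
    return f"<case text from {url}>"

async def main():
    urls = ["https://indiankanoon.org/doc/1/", "https://bad.example/doc/2/"]
    texts = await asyncio.gather(*(fetch(u) for u in urls), return_exceptions=True)
    ok = [t for t in texts if isinstance(t, str)]
    print(f"{len(ok)}/{len(urls)} pages fetched")

asyncio.run(main())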