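"""Government Tender Search Chatbot.

Scrapes tender listings from the GeM CPPP portal (https://gem.gov.in/cppp),
optionally paraphrases/corrects user queries with a T5-small model, and serves
a Gradio chat interface for searching by keyword, organization, and date range.
"""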
import os
import re
import requests
import pandas as pd
import gradio as gr
import time
import random
from bs4 import BeautifulSoup
from dateutil.parser import parse
from datetime import datetime, timedelta
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
# ─── 1. OPTIONAL: LLM FOR CORRECTION & PARAPHRASING ──────────────────────────
try:
    from transformers import T5ForConditionalGeneration, T5Tokenizer

    tokenizer = T5Tokenizer.from_pretrained("t5-small")
    model = T5ForConditionalGeneration.from_pretrained("t5-small")

    def correct_text(raw_text: str) -> str:
        """Paraphrase & correct via T5-small, with fallback on error."""
        try:
            prompt = "paraphrase and correct: " + raw_text.strip()
            inputs = tokenizer(prompt, return_tensors="pt", truncation=True)
            outputs = model.generate(**inputs, max_length=128)
            return tokenizer.decode(outputs[0], skip_special_tokens=True)
        except Exception:
            return raw_text
except ImportError:
    def correct_text(raw_text: str) -> str:
        # If transformers is not installed, return the raw text unchanged
        return raw_text
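
# Usage sketch (illustrative): correct_text("find solar pannel tenders") asks
# T5-small to paraphrase/clean up the query; if transformers is unavailable,
# the input string is returned unchanged.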
# ─── 2. CREATE REQUESTS SESSION WITH RETRY LOGIC ─────────────────────────────
def create_robust_session():
    """Create a requests session with retry logic."""
    session = requests.Session()
    # Configure retry strategy
    retry_strategy = Retry(
        total=5,                                     # Total number of retries
        backoff_factor=1,                            # Exponential backoff
        status_forcelist=[429, 500, 502, 503, 504],  # Retry on these HTTP status codes
        allowed_methods=["GET", "POST"]              # Allow retrying on POST requests
    )
    # Mount adapter with retry strategy
    adapter = HTTPAdapter(max_retries=retry_strategy)
    session.mount("http://", adapter)
    session.mount("https://", adapter)
    return session
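
# Usage sketch (illustrative): a transient 429/5xx from the portal is retried
# up to 5 times with exponential backoff before the caller sees an error, e.g.
#   session = create_robust_session()
#   resp = session.get("https://gem.gov.in/cppp", timeout=(30, 60))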
# ─── 3. SCRAPER FOR GeM CPPP ─────────────────────────────────────────────────
def scrape_gem_cppp(keyword="", org_name="", start_date=None, end_date=None, max_pages=10):
    """Scrape tender data from the GeM CPPP portal with robust error handling."""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Content-Type': 'application/x-www-form-urlencoded',
        'Referer': 'https://gem.gov.in/cppp',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Connection': 'keep-alive'
    }
    # Create a robust session with retry logic
    session = create_robust_session()
    tenders = []
    page = 1
    total_pages = max_pages
    while page <= total_pages and page <= max_pages:
        try:
            print(f"Fetching page {page} of maximum {max_pages}")
            # Prepare form data for the request
            form_data = {
                'page': str(page),
                'tid': '',
                'title': keyword,
                'orgname': org_name,
                'startdate': start_date.strftime('%d-%m-%Y') if start_date else '',
                'enddate': end_date.strftime('%d-%m-%Y') if end_date else '',
                't_outrefid': '',
                'search': '1',
            }
            # Add a small random delay to avoid rate limiting
            time.sleep(random.uniform(0.5, 1.5))
            # Make POST request with increased timeouts
            resp = session.post(
                "https://gem.gov.in/cppp",
                headers=headers,
                data=form_data,
                timeout=(30, 60)  # (connect timeout, read timeout)
            )
            # Check if the request was successful
            if resp.status_code != 200:
                print(f"Error: Received status code {resp.status_code}")
                break
            # Parse the response
            soup = BeautifulSoup(resp.text, "html.parser")
            # Find the tender table
            table = soup.find("table", {"class": "table"})
            if not table:
                print(f"No tender table found on page {page}")
                break
            # Extract data from rows (skip header row)
            rows = table.find_all("tr")[1:]
            if not rows:
                print(f"No tender rows found on page {page}")
                break
            print(f"Found {len(rows)} tender rows on page {page}")
            # Process each row
            for row in rows:
                cols = row.find_all("td")
                if len(cols) < 8:
                    continue
                try:
                    # Extract fields with detailed error handling
                    closing = cols[0].get_text(strip=True)
                    opening_date = cols[1].get_text(strip=True)
                    publish_date = cols[2].get_text(strip=True)
                    # Extract title and link with careful error handling
                    title_el = cols[3].find("a")
                    title = title_el.get_text(strip=True) if title_el else cols[3].get_text(strip=True)
                    # Extract full link with proper domain
                    link = ""
                    if title_el and title_el.has_attr("href"):
                        link = title_el["href"]
                        if link and link.startswith("/"):
                            link = "https://gem.gov.in" + link
                    # Extract organization
                    org = cols[4].get_text(strip=True)
                    # Extract reference ID with better parsing
                    full_text = cols[3].get_text(strip=True)
                    ref_id = ""
                    if title in full_text:
                        ref_id = full_text.replace(title, "").strip("/").strip()
                    else:
                        # Try to extract any alphanumeric ID patterns
                        id_match = re.search(r'[A-Za-z0-9_-]+/\d+', full_text)
                        if id_match:
                            ref_id = id_match.group(0)
                    # Extract download link with proper error handling
                    dl_el = cols[7].find("a")
                    dl_link = ""
                    if dl_el and dl_el.has_attr("href"):
                        dl_link = dl_el["href"]
                        # Ensure it's a complete URL
                        if dl_link and dl_link.startswith("/"):
                            dl_link = "https://gem.gov.in" + dl_link
                    # Apply date filters if specified
                    try:
                        if closing:
                            cdate = parse(closing)
                            if start_date and cdate < start_date:
                                continue
                            if end_date and cdate > end_date:
                                continue
                    except Exception:
                        # If date parsing fails, include the tender anyway
                        pass
                    # Add to results
                    tenders.append({
                        "Title": title,
                        "Organization": org,
                        "Closing Date": closing,
                        "Opening Date": opening_date,
                        "Published Date": publish_date,
                        "Reference/Tender ID": ref_id,
                        "Tender Link": link,
                        "Download Link": dl_link
                    })
                except Exception as row_err:
                    print(f"Error processing row on page {page}: {row_err}")
                    continue
            # Check for pagination
            pag = soup.find("ul", {"class": "pagination"})
            next_page_exists = False
            if pag:
                # Look for a "Next" button or links to next pages
                next_link = pag.find("a", string=re.compile(r"Next", re.I))
                if next_link:
                    next_page_exists = True
                # Also check for numbered page links
                page_links = pag.find_all("a")
                for link in page_links:
                    try:
                        page_num = int(link.get_text(strip=True))
                        total_pages = max(total_pages, page_num)
                    except (ValueError, TypeError):
                        pass
            if not next_page_exists:
                print(f"No next page found after page {page}")
                break
            # Move to the next page
            page += 1
        except requests.Timeout:
            print(f"Timeout error on page {page}. Retrying...")
            continue
        except requests.RequestException as e:
            print(f"Request error on page {page}: {e}")
            # Wait before retrying
            time.sleep(5)
            continue
        except Exception as e:
            print(f"Unexpected error on page {page}: {e}")
            break
    print(f"Scraping completed: found {len(tenders)} tenders across {page} pages")
    return tenders
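
# Usage sketch (illustrative, assuming the portal markup still matches the
# selectors above):
#   rows = scrape_gem_cppp(keyword="solar", max_pages=2)
#   pd.DataFrame(rows).to_csv("tenders.csv", index=False)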
# ─── 4. SUMMARY GENERATOR (ALL RESULTS) ──────────────────────────────────────
def summarize_tenders(tenders: list[dict]) -> str:
    if not tenders:
        return "No tenders were found matching those criteria."
    lines = [f"I found {len(tenders)} tenders matching your criteria:\n"]
    # Sort tenders by closing date (newest first)
    try:
        tenders = sorted(tenders,
                         key=lambda x: parse(x.get("Closing Date", "01-01-2000")),
                         reverse=True)
    except Exception:
        # If sorting fails, continue with unsorted data
        pass
    # Generate the summary
    for idx, t in enumerate(tenders, 1):
        # Format title with link if available
        title_line = f"{idx}. "
        if t.get("Tender Link"):
            title_line += f"[{t['Title']}]({t['Tender Link']})"
        else:
            title_line += t['Title']
        lines.append(title_line)
        # Add organization info
        lines.append(f"   • Organization: {t['Organization']}")
        # Add date information
        lines.append(f"   • Closing Date: {t['Closing Date']}")
        if t.get("Opening Date") and t["Opening Date"].strip():
            lines.append(f"   • Opening Date: {t['Opening Date']}")
        if t.get("Published Date") and t["Published Date"].strip():
            lines.append(f"   • Published Date: {t['Published Date']}")
        # Add reference ID
        if t.get("Reference/Tender ID") and t["Reference/Tender ID"].strip():
            lines.append(f"   • Ref ID: {t['Reference/Tender ID']}")
        # Add download link if available
        if t.get("Download Link") and t["Download Link"].strip():
            lines.append(f"   • [Download Tender Document]({t['Download Link']})")
        lines.append("")  # Add a blank line between tenders
    # Return the formatted summary
    return "\n".join(lines)
# ─── 5. CHAT FUNCTION ────────────────────────────────────────────────────────
def chat_fn(user_message: str, history):
    """Process chat messages and extract search parameters."""
    # Debug output
    print(f"User Message: {user_message}")
    try:
        # Clean up and potentially correct the user message
        corrected = correct_text(user_message)
        print(f"Corrected Text: {corrected}")
        # Extract date ranges with flexible patterns
        date_patterns = [
            # Format: "from DD/MM/YYYY to DD/MM/YYYY"
            r"from\s+(\d{1,2}[/-]\d{1,2}[/-]\d{4})\s+to\s+(\d{1,2}[/-]\d{1,2}[/-]\d{4})",
            # Format: "between DD/MM/YYYY and DD/MM/YYYY"
            r"between\s+(\d{1,2}[/-]\d{1,2}[/-]\d{4})\s+and\s+(\d{1,2}[/-]\d{1,2}[/-]\d{4})"
        ]
        start_date = end_date = None
        for pattern in date_patterns:
            match = re.search(pattern, corrected, re.I)
            if match:
                try:
                    # Parse day-first to match the DD/MM/YYYY format assumed above
                    start_date = parse(match.group(1), dayfirst=True)
                    end_date = parse(match.group(2), dayfirst=True)
                    print(f"Dates extracted: {start_date} to {end_date}")
                    break
                except Exception as e:
                    print(f"Date parsing error: {e}")
        # Extract organization with multiple patterns
        org_patterns = [
            r"from\s+ministry\s+of\s+(\w+)",
            r"from\s+(\w+)\s+ministry",
            r"by\s+(\w+\s+\w+)",
            r"organization\s+(\w+\s+\w+)"
        ]
        org = ""
        for pattern in org_patterns:
            org_match = re.search(pattern, corrected.lower())
            if org_match:
                org = org_match.group(1)
                print(f"Organization extracted: {org}")
                break
        # Extract keywords with smarter filtering
        stops = {"find", "search", "get", "tenders", "tender", "from", "to",
                 "between", "after", "before", "the", "and", "of", "in"}
        # Try pattern matching first
        keyword = ""
        kw_match = re.search(r"(?:get|find|search)\s+(.*?)\s+tenders?", corrected.lower())
        if kw_match:
            keyword = kw_match.group(1).strip()
        else:
            # Fall back to word filtering
            words = re.findall(r"\b\w+\b", corrected.lower())
            keyword = " ".join(w for w in words if w not in stops and len(w) > 2)
        print(f"Final keyword: '{keyword}'")
        # Search for tenders
        results = scrape_gem_cppp(
            keyword=keyword.strip(),
            org_name=org,
            start_date=start_date,
            end_date=end_date,
            max_pages=10  # Maximum number of pages to scan
        )
        # Generate the reply
        bot_reply = summarize_tenders(results)
    except Exception as e:
        import traceback
        print(f"Error in chat function: {e}")
        print(traceback.format_exc())
        bot_reply = f"Sorry, an error occurred while processing your request: {str(e)}"
    return bot_reply
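
# Worked example (illustrative): for the query
#   "Get construction tenders from 01/05/2025 to 30/06/2025"
# the keyword pattern captures "construction", the first date pattern captures
# "01/05/2025" and "30/06/2025" (parsed day-first as 1 May and 30 June 2025),
# and no organization pattern matches, so org stays "".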
# ─── 6. GRADIO APP ───────────────────────────────────────────────────────────
with gr.Blocks() as demo:
    gr.Markdown("## Government Tender Search Chatbot")
    gr.Markdown("Ask me to find tenders by keyword, organization, or date range.")
    gr.ChatInterface(
        fn=chat_fn,
        title="TenderBot",
        description="E.g. Search solar panel tenders from 01/06/2025 to 30/06/2025",
        examples=[
            "Find solar panel tenders",
            "Search for IT tenders from Ministry of Defense",
            "Get construction tenders from 01/05/2025 to 30/06/2025"
        ],
    )

if __name__ == "__main__":
    # Launch with appropriate parameters
    demo.launch(debug=True, share=False)