|
import os |
|
import re |
|
import requests |
|
import pandas as pd |
|
import gradio as gr |
|
import time |
|
import random |
|
from bs4 import BeautifulSoup |
|
from dateutil.parser import parse |
|
from datetime import datetime, timedelta |
|
from requests.adapters import HTTPAdapter |
|
from urllib3.util.retry import Retry |
|
|
|
|
|
try:
    from transformers import T5ForConditionalGeneration, T5Tokenizer

    # Loaded once at import time; both calls hit the network on first use.
    tokenizer = T5Tokenizer.from_pretrained("t5-small")
    model = T5ForConditionalGeneration.from_pretrained("t5-small")

    def correct_text(raw_text: str) -> str:
        """Paraphrase & correct via T5-small, with fallback on error."""
        try:
            prompt = "paraphrase and correct: " + raw_text.strip()
            inputs = tokenizer(prompt, return_tensors="pt", truncation=True)
            outputs = model.generate(**inputs, max_length=128)
            return tokenizer.decode(outputs[0], skip_special_tokens=True)
        except Exception:
            # Inference failed; degrade gracefully to the unmodified input.
            return raw_text

except Exception:
    # Covers both a missing `transformers` package AND a failed model
    # download (e.g. offline host).  The original caught only ImportError,
    # which crashed the whole app at import time whenever the weights
    # could not be fetched.
    def correct_text(raw_text: str) -> str:
        """Identity fallback used when the T5 model is unavailable."""
        return raw_text
|
|
|
|
|
def create_robust_session():
    """Build a requests.Session that transparently retries transient failures.

    Up to 5 attempts with exponential backoff are made for GET/POST requests
    that fail with 429 or common 5xx status codes.
    """
    retry_policy = Retry(
        total=5,
        backoff_factor=1,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=["GET", "POST"]
    )

    http_session = requests.Session()
    adapter = HTTPAdapter(max_retries=retry_policy)

    # Install the same retrying adapter for both URL schemes.
    for scheme in ("http://", "https://"):
        http_session.mount(scheme, adapter)

    return http_session
|
|
|
|
|
def scrape_gem_cppp(keyword="", org_name="", start_date=None, end_date=None, max_pages=10):
    """Scrape tender data from the GeM CPPP portal.

    Args:
        keyword: Free-text filter matched against tender titles.
        org_name: Organization-name filter passed to the portal.
        start_date: Optional ``datetime`` — tenders closing before it are skipped.
        end_date: Optional ``datetime`` — tenders closing after it are skipped.
        max_pages: Hard cap on the number of result pages fetched.

    Returns:
        list[dict]: One dict per tender with title, organization, dates,
        reference ID and links.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Content-Type': 'application/x-www-form-urlencoded',
        'Referer': 'https://gem.gov.in/cppp',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Connection': 'keep-alive'
    }

    session = create_robust_session()

    tenders = []
    page = 1
    total_pages = max_pages

    # Bound consecutive failures on one page: the original handlers did a
    # bare `continue` on Timeout/RequestException without ever advancing
    # `page`, so a persistently failing server spun this loop forever.
    attempts = 0
    max_attempts_per_page = 3

    while page <= total_pages and page <= max_pages:
        try:
            print(f"Fetching page {page} of maximum {max_pages}")

            form_data = {
                'page': str(page),
                'tid': '',
                'title': keyword,
                'orgname': org_name,
                'startdate': start_date.strftime('%d-%m-%Y') if start_date else '',
                'enddate': end_date.strftime('%d-%m-%Y') if end_date else '',
                't_outrefid': '',
                'search': '1',
            }

            # Small random delay between requests to stay polite to the server.
            time.sleep(random.uniform(0.5, 1.5))

            resp = session.post(
                "https://gem.gov.in/cppp",
                headers=headers,
                data=form_data,
                timeout=(30, 60)  # (connect, read) timeouts in seconds
            )

            if resp.status_code != 200:
                print(f"Error: Received status code {resp.status_code}")
                break

            soup = BeautifulSoup(resp.text, "html.parser")

            table = soup.find("table", {"class": "table"})
            if not table:
                print(f"No tender table found on page {page}")
                break

            rows = table.find_all("tr")[1:]  # skip the header row
            if not rows:
                print(f"No tender rows found on page {page}")
                break

            print(f"Found {len(rows)} tender rows on page {page}")

            for row in rows:
                cols = row.find_all("td")
                if len(cols) < 8:
                    continue  # malformed or spacer row

                try:
                    closing = cols[0].get_text(strip=True)
                    opening_date = cols[1].get_text(strip=True)
                    publish_date = cols[2].get_text(strip=True)

                    title_el = cols[3].find("a")
                    title = title_el.get_text(strip=True) if title_el else cols[3].get_text(strip=True)

                    link = ""
                    if title_el and title_el.has_attr("href"):
                        link = title_el["href"]
                    if link and link.startswith("/"):
                        link = "https://gem.gov.in" + link

                    org = cols[4].get_text(strip=True)

                    # The title cell typically holds "<title>/<ref id>";
                    # strip the title to recover the reference ID, falling
                    # back to a pattern search.
                    full_text = cols[3].get_text(strip=True)
                    ref_id = ""
                    if title in full_text:
                        ref_id = full_text.replace(title, "").strip("/").strip()
                    else:
                        id_match = re.search(r'[A-Za-z0-9_-]+/\d+', full_text)
                        if id_match:
                            ref_id = id_match.group(0)

                    dl_el = cols[7].find("a")
                    dl_link = ""
                    if dl_el and dl_el.has_attr("href"):
                        dl_link = dl_el["href"]
                    if dl_link and dl_link.startswith("/"):
                        dl_link = "https://gem.gov.in" + dl_link

                    # Client-side closing-date filter; rows whose date cannot
                    # be parsed are kept rather than silently dropped.
                    try:
                        if closing:
                            cdate = parse(closing)
                            if start_date and cdate < start_date:
                                continue
                            if end_date and cdate > end_date:
                                continue
                    except Exception:
                        pass

                    tenders.append({
                        "Title": title,
                        "Organization": org,
                        "Closing Date": closing,
                        "Opening Date": opening_date,
                        "Published Date": publish_date,
                        "Reference/Tender ID": ref_id,
                        "Tender Link": link,
                        "Download Link": dl_link
                    })

                except Exception as row_err:
                    print(f"Error processing row on page {page}: {row_err}")
                    continue

            # Pagination discovery: a "Next" anchor means more pages exist,
            # and numbered anchors may raise the known page total.
            pag = soup.find("ul", {"class": "pagination"})
            next_page_exists = False

            if pag:
                next_link = pag.find("a", string=re.compile(r"Next", re.I))
                if next_link:
                    next_page_exists = True

                # NOTE: renamed from `link` — the original shadowed the
                # row-level tender-link variable here.
                for page_anchor in pag.find_all("a"):
                    try:
                        page_num = int(page_anchor.get_text(strip=True))
                        total_pages = max(total_pages, page_num)
                    except (ValueError, TypeError):
                        pass

            if not next_page_exists:
                print(f"No next page found after page {page}")
                break

            page += 1
            attempts = 0  # page fetched successfully; reset the retry budget

        except requests.Timeout:
            attempts += 1
            print(f"Timeout error on page {page}. Retrying...")
            if attempts >= max_attempts_per_page:
                print(f"Giving up on page {page} after {attempts} attempts")
                break
            continue

        except requests.RequestException as e:
            attempts += 1
            print(f"Request error on page {page}: {e}")
            if attempts >= max_attempts_per_page:
                print(f"Giving up on page {page} after {attempts} attempts")
                break
            time.sleep(5)
            continue

        except Exception as e:
            print(f"Unexpected error on page {page}: {e}")
            break

    print(f"Scraping completed: found {len(tenders)} tenders across {page} pages")
    return tenders
|
|
|
|
|
def summarize_tenders(tenders: list[dict]) -> str:
    """Render scraped tender dicts as a Markdown summary string.

    Args:
        tenders: Dicts as produced by ``scrape_gem_cppp``; missing keys are
            tolerated and their lines simply omitted.

    Returns:
        A Markdown-formatted listing, newest closing date first, or a
        "no results" sentence when *tenders* is empty.
    """
    if not tenders:
        return "No tenders were found matching those criteria."

    lines = [f"I found {len(tenders)} tenders matching your criteria:\n"]

    # Sort newest closing date first; if any date fails to parse, keep the
    # original order rather than failing the whole summary.
    try:
        tenders = sorted(tenders,
                         key=lambda x: parse(x.get("Closing Date", "01-01-2000")),
                         reverse=True)
    except Exception:
        pass

    for idx, t in enumerate(tenders, 1):
        title = t.get("Title", "")

        # Render the title as a Markdown link when we have a URL.
        title_line = f"{idx}. "
        if t.get("Tender Link"):
            title_line += f"[{title}]({t['Tender Link']})"
        else:
            title_line += title
        lines.append(title_line)

        # Use .get() for every field so one malformed row cannot raise
        # KeyError and lose the entire reply (the original indexed Title,
        # Organization and Closing Date directly).  Bullet characters were
        # also repaired from mis-encoded mojibake.
        lines.append(f"   • Organization: {t.get('Organization', '')}")
        lines.append(f"   • Closing Date: {t.get('Closing Date', '')}")

        if (t.get("Opening Date") or "").strip():
            lines.append(f"   • Opening Date: {t['Opening Date']}")

        if (t.get("Published Date") or "").strip():
            lines.append(f"   • Published Date: {t['Published Date']}")

        if (t.get("Reference/Tender ID") or "").strip():
            lines.append(f"   • Ref ID: {t['Reference/Tender ID']}")

        if (t.get("Download Link") or "").strip():
            lines.append(f"   • [Download Tender Document]({t['Download Link']})")

        lines.append("")  # blank separator line between tenders

    return "\n".join(lines)
|
|
|
|
|
def chat_fn(user_message: str, history):
    """Turn a free-form chat message into tender-search parameters and reply.

    Pulls an optional date range, an organization name and a keyword out of
    the (spell-corrected) message, runs the scraper, and returns a Markdown
    summary.  Any failure is reported back as an apology string.
    """
    print(f"User Message: {user_message}")

    try:
        corrected = correct_text(user_message)
        print(f"Corrected Text: {corrected}")

        lowered = corrected.lower()

        # --- date range: "from D to D" or "between D and D" ---------------
        range_patterns = (
            r"from\s+(\d{1,2}[/-]\d{1,2}[/-]\d{4})\s+to\s+(\d{1,2}[/-]\d{1,2}[/-]\d{4})",
            r"between\s+(\d{1,2}[/-]\d{1,2}[/-]\d{4})\s+and\s+(\d{1,2}[/-]\d{1,2}[/-]\d{4})",
        )

        start_date = end_date = None
        for rx in range_patterns:
            hit = re.search(rx, corrected, re.I)
            if not hit:
                continue
            try:
                start_date = parse(hit.group(1))
                end_date = parse(hit.group(2))
                print(f"Dates extracted: {start_date} to {end_date}")
                break
            except Exception as e:
                print(f"Date parsing error: {e}")

        # --- organization name --------------------------------------------
        org = ""
        for rx in (r"from\s+ministry\s+of\s+(\w+)",
                   r"from\s+(\w+)\s+ministry",
                   r"by\s+(\w+\s+\w+)",
                   r"organization\s+(\w+\s+\w+)"):
            hit = re.search(rx, lowered)
            if hit:
                org = hit.group(1)
                print(f"Organization extracted: {org}")
                break

        # --- keyword -------------------------------------------------------
        stops = {"find", "search", "get", "tenders", "tender", "from", "to",
                 "between", "after", "before", "the", "and", "of", "in"}

        kw_hit = re.search(r"(?:get|find|search)\s+(.*?)\s+tenders?", lowered)
        if kw_hit:
            keyword = kw_hit.group(1).strip()
        else:
            # No "find X tenders" phrasing: keep every non-stopword token.
            tokens = re.findall(r"\b\w+\b", lowered)
            keyword = " ".join(t for t in tokens if t not in stops and len(t) > 2)

        print(f"Final keyword: '{keyword}'")

        results = scrape_gem_cppp(
            keyword=keyword.strip(),
            org_name=org,
            start_date=start_date,
            end_date=end_date,
            max_pages=10
        )

        bot_reply = summarize_tenders(results)

    except Exception as e:
        import traceback
        print(f"Error in chat function: {e}")
        print(traceback.format_exc())
        bot_reply = f"Sorry, an error occurred while processing your request: {str(e)}"

    return bot_reply
|
|
|
|
|
# Build the Gradio UI: a Blocks page wrapping a chat interface whose
# messages are routed straight into chat_fn(message, history).
with gr.Blocks() as demo:
    gr.Markdown("## Government Tender Search Chatbot")
    gr.Markdown("Ask me to find tenders by keyword, organization, or date range.")
    # Clickable example prompts are shown beneath the chat box.
    gr.ChatInterface(
        fn=chat_fn,
        title="TenderBot",
        description="E.g. Search solar panel tenders from 01/06/2025 to 30/06/2025",
        examples=[
            "Find solar panel tenders",
            "Search for IT tenders from Ministry of Defense",
            "Get construction tenders from 01/05/2025 to 30/06/2025"
        ],

    )
|
|
|
if __name__ == "__main__":

    # Launch the app locally; debug=True surfaces tracebacks in the browser,
    # share=False keeps it off Gradio's public tunnel.
    demo.launch(debug=True, share=False)