import re
import requests
import gradio as gr
import time
import random
from bs4 import BeautifulSoup
from dateutil.parser import parse
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
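
# Dependencies (sketch): pip install requests beautifulsoup4 python-dateutil gradio
# Optional, for the LLM correction step: pip install transformers sentencepiece torch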

# ─── 1. OPTIONAL: LLM FOR CORRECTION & PARAPHRASING ────────────────────────────
try:
    from transformers import T5ForConditionalGeneration, T5Tokenizer
    tokenizer = T5Tokenizer.from_pretrained("t5-small")
    model     = T5ForConditionalGeneration.from_pretrained("t5-small")

    def correct_text(raw_text: str) -> str:
        """Paraphrase & correct via t5-small, falling back to the raw text on error.

        Note: t5-small has no "paraphrase and correct" training prefix, so this
        is a best-effort heuristic rather than a reliable grammar fixer.
        """
        try:
            prompt = "paraphrase and correct: " + raw_text.strip()
            inputs = tokenizer(prompt, return_tensors="pt", truncation=True)
            outputs = model.generate(**inputs, max_length=128)
            return tokenizer.decode(outputs[0], skip_special_tokens=True)
        except Exception:
            return raw_text
except ImportError:
    def correct_text(raw_text: str) -> str:
        # If transformers not installed, return raw text
        return raw_text
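
# Illustrative: correct_text("find solar pannel tenders") may return a cleaned-up
# paraphrase when transformers is installed; otherwise the input is returned as-is.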

# ─── 2. CREATE REQUESTS SESSION WITH RETRY LOGIC ──────────────────────────────
def create_robust_session():
    """Create a requests session with retry logic"""
    session = requests.Session()
    
    # Configure retry strategy
    retry_strategy = Retry(
        total=5,  # Total number of retries
        backoff_factor=1,  # Exponential backoff
        status_forcelist=[429, 500, 502, 503, 504],  # Retry on these HTTP status codes
        allowed_methods=["GET", "POST"]  # Allow retrying on POST requests
    )
    
    # Mount adapter with retry strategy
    adapter = HTTPAdapter(max_retries=retry_strategy)
    session.mount("http://", adapter)
    session.mount("https://", adapter)
    
    return session
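
# Usage sketch: with this Retry policy, a 429/5xx response is retried up to
# 5 times with exponential backoff before the request finally raises.
# Note: Retry(allowed_methods=...) requires urllib3 >= 1.26; older releases
# named this parameter method_whitelist.
#
#   session = create_robust_session()
#   resp = session.post("https://gem.gov.in/cppp", data=form_data, timeout=(30, 60))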

# ─── 3. SCRAPER FOR GeM CPPP ────────────────────────────────────────────────────
def scrape_gem_cppp(keyword="", org_name="", start_date=None, end_date=None, max_pages=10):
    """Scrape tender data from GeM CPPP portal with robust error handling"""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Content-Type': 'application/x-www-form-urlencoded',
        'Referer': 'https://gem.gov.in/cppp',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Connection': 'keep-alive'
    }
    
    # Create a robust session with retry logic
    session = create_robust_session()
    
    tenders = []
    page = 1
    consecutive_failures = 0  # guard against retrying the same page forever

    while page <= max_pages:
        try:
            print(f"Fetching page {page} of maximum {max_pages}")
            
            # Prepare form data for the request
            form_data = {
                'page': str(page),
                'tid': '',
                'title': keyword,
                'orgname': org_name,
                'startdate': start_date.strftime('%d-%m-%Y') if start_date else '',
                'enddate': end_date.strftime('%d-%m-%Y') if end_date else '',
                't_outrefid': '',
                'search': '1',
            }
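            # NOTE: these field names mirror the portal's search form as observed;
            # the endpoint is not a documented API and may change without notice.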
            
            # Add a small random delay to avoid rate limiting
            time.sleep(random.uniform(0.5, 1.5))
            
            # Make POST request with increased timeouts
            resp = session.post(
                "https://gem.gov.in/cppp",
                headers=headers,
                data=form_data,
                timeout=(30, 60)  # (Connect timeout, Read timeout)
            )
            
            # Check if request was successful
            if resp.status_code != 200:
                print(f"Error: Received status code {resp.status_code}")
                break
                
            # Parse the response
            soup = BeautifulSoup(resp.text, "html.parser")
            
            # Find the tender table
            table = soup.find("table", {"class": "table"})
            if not table:
                print(f"No tender table found on page {page}")
                break
                
            # Extract data from rows (skip header row)
            rows = table.find_all("tr")[1:]
            if not rows:
                print(f"No tender rows found on page {page}")
                break
                
            print(f"Found {len(rows)} tender rows on page {page}")
            
            # Process each row
            for row in rows:
                cols = row.find_all("td")
                if len(cols) < 8:
                    continue
                    
                try:
                    # Extract fields with detailed error handling
                    closing = cols[0].get_text(strip=True)
                    opening_date = cols[1].get_text(strip=True)
                    publish_date = cols[2].get_text(strip=True)
                    
                    # Extract title and link with careful error handling
                    title_el = cols[3].find("a")
                    title = title_el.get_text(strip=True) if title_el else cols[3].get_text(strip=True)
                    
                    # Extract full link with proper domain
                    link = ""
                    if title_el and title_el.has_attr("href"):
                        link = title_el["href"]
                        if link and link.startswith("/"):
                            link = "https://gem.gov.in" + link
                    
                    # Extract organization 
                    org = cols[4].get_text(strip=True)
                    
                    # Extract reference ID with better parsing
                    full_text = cols[3].get_text(strip=True)
                    ref_id = ""
                    if title in full_text:
                        ref_id = full_text.replace(title, "").strip("/").strip()
                    else:
                        # Try to extract any alphanumeric ID patterns
                        id_match = re.search(r'[A-Za-z0-9_-]+/\d+', full_text)
                        if id_match:
                            ref_id = id_match.group(0)
                    
                    # Extract download link with proper error handling
                    dl_el = cols[7].find("a")
                    dl_link = ""
                    if dl_el and dl_el.has_attr("href"):
                        dl_link = dl_el["href"]
                        # Ensure it's a complete URL
                        if dl_link and dl_link.startswith("/"):
                            dl_link = "https://gem.gov.in" + dl_link
                            
                    # Apply date filters if specified (portal dates are day-first, DD-MM-YYYY)
                    try:
                        if closing:
                            cdate = parse(closing, dayfirst=True)
                            if start_date and cdate < start_date:
                                continue
                            if end_date and cdate > end_date:
                                continue
                    except Exception:
                        # If date parsing fails, include the tender anyway
                        pass
                        
                    # Add to results
                    tenders.append({
                        "Title": title,
                        "Organization": org,
                        "Closing Date": closing,
                        "Opening Date": opening_date,
                        "Published Date": publish_date,
                        "Reference/Tender ID": ref_id,
                        "Tender Link": link,
                        "Download Link": dl_link
                    })
                    
                except Exception as row_err:
                    print(f"Error processing row on page {page}: {row_err}")
                    continue
                    
            # Check for pagination: stop when there is no "Next" link
            pag = soup.find("ul", {"class": "pagination"})
            next_page_exists = False

            if pag:
                next_link = pag.find("a", string=re.compile(r"Next", re.I))
                if next_link:
                    next_page_exists = True
                        
            if not next_page_exists:
                print(f"No next page found after page {page}")
                break
                
            # Move to the next page and reset the failure counter
            page += 1
            consecutive_failures = 0
            
        except requests.Timeout:
            consecutive_failures += 1
            if consecutive_failures >= 3:
                print(f"Giving up on page {page} after repeated timeouts")
                break
            print(f"Timeout on page {page}; retrying ({consecutive_failures}/3)...")
            continue

        except requests.RequestException as e:
            consecutive_failures += 1
            if consecutive_failures >= 3:
                print(f"Giving up on page {page} after repeated request errors: {e}")
                break
            print(f"Request error on page {page}: {e}; retrying in 5s...")
            # Wait before retrying
            time.sleep(5)
            continue
            
        except Exception as e:
            print(f"Unexpected error on page {page}: {e}")
            break
            
    print(f"Scraping completed: found {len(tenders)} tenders across {page} pages")
    return tenders
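
# Illustrative call (assumes the portal is reachable and its markup unchanged):
#   rows = scrape_gem_cppp(keyword="solar", max_pages=2)
#   print(f"{len(rows)} tenders found")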

# ─── 4. SUMMARY GENERATOR (ALL RESULTS) ────────────────────────────────────────
def summarize_tenders(tenders: list[dict]) -> str:
    if not tenders:
        return "No tenders were found matching those criteria."
        
    count = len(tenders)
    lines = [f"I found {count} tender{'s' if count != 1 else ''} matching your criteria:\n"]
    
    # Sort tenders by closing date, latest first (portal dates are day-first)
    try:
        tenders = sorted(tenders,
                         key=lambda x: parse(x.get("Closing Date", "01-01-2000"), dayfirst=True),
                         reverse=True)
    except Exception:
        # If sorting fails, continue with unsorted data
        pass
        
    # Generate the summary
    for idx, t in enumerate(tenders, 1):
        # Format title with link if available
        title_line = f"{idx}. "
        if t.get("Tender Link"):
            title_line += f"[{t['Title']}]({t['Tender Link']})"
        else:
            title_line += t['Title']
            
        lines.append(title_line)
        
        # Add organization info
        lines.append(f"   • Organization: {t['Organization']}")

        # Add date information
        lines.append(f"   • Closing Date: {t['Closing Date']}")

        if t.get("Opening Date") and t["Opening Date"].strip():
            lines.append(f"   • Opening Date: {t['Opening Date']}")

        if t.get("Published Date") and t["Published Date"].strip():
            lines.append(f"   • Published Date: {t['Published Date']}")

        # Add Reference ID
        if t.get("Reference/Tender ID") and t["Reference/Tender ID"].strip():
            lines.append(f"   • Ref ID: {t['Reference/Tender ID']}")

        # Add download link if available
        if t.get("Download Link") and t["Download Link"].strip():
            lines.append(f"   • [Download Tender Document]({t['Download Link']})")
            
        lines.append("")  # Add a blank line between tenders
        
    # Return the formatted summary
    return "\n".join(lines)
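
# Illustrative output (shape only; real values come from the portal):
#
#   I found 1 tender matching your criteria:
#
#   1. [Supply of Solar Panels](https://gem.gov.in/...)
#      • Organization: Ministry of Power
#      • Closing Date: 30-06-2025
#      • [Download Tender Document](https://gem.gov.in/...)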

# ─── 5. CHAT FUNCTION ──────────────────────────────────────────────────────────
def chat_fn(user_message: str, history):
    """Process chat messages and extract search parameters"""
    # Debug output
    print(f"User Message: {user_message}")
    
    try:
        # Clean and potentially correct user message
        corrected = correct_text(user_message)
        print(f"Corrected Text: {corrected}")
        
        # Extract date ranges with flexible patterns
        date_patterns = [
            # Format: "from DD/MM/YYYY to DD/MM/YYYY"
            r"from\s+(\d{1,2}[/-]\d{1,2}[/-]\d{4})\s+to\s+(\d{1,2}[/-]\d{1,2}[/-]\d{4})",
            # Format: "between DD/MM/YYYY and DD/MM/YYYY" 
            r"between\s+(\d{1,2}[/-]\d{1,2}[/-]\d{4})\s+and\s+(\d{1,2}[/-]\d{1,2}[/-]\d{4})"
        ]
        
        start_date = end_date = None
        
        for pattern in date_patterns:
            match = re.search(pattern, corrected, re.I)
            if match:
                try:
                    # Queries use day-first dates (e.g. 30/06/2025)
                    start_date = parse(match.group(1), dayfirst=True)
                    end_date = parse(match.group(2), dayfirst=True)
                    print(f"Dates extracted: {start_date} to {end_date}")
                    break
                except Exception as e:
                    print(f"Date parsing error: {e}")
        
        # Extract organization with multiple patterns
        org_patterns = [
            r"from\s+ministry\s+of\s+(\w+)",
            r"from\s+(\w+)\s+ministry", 
            r"by\s+(\w+\s+\w+)",
            r"organization\s+(\w+\s+\w+)"
        ]
        
        org = ""
        for pattern in org_patterns:
            org_match = re.search(pattern, corrected.lower())
            if org_match:
                org = org_match.group(1)
                print(f"Organization extracted: {org}")
                break
        
        # Extract keywords with smarter filtering
        stops = {"find", "search", "get", "tenders", "tender", "from", "to",
                 "between", "after", "before", "the", "and", "of", "in"}
                
        # Try pattern matching first ("search for X tenders", "find X tenders", ...)
        keyword = ""
        kw_match = re.search(r"(?:get|find|search)(?:\s+for)?\s+(.*?)\s+tenders?", corrected.lower())
        if kw_match:
            keyword = kw_match.group(1).strip()
        else:
            # Fallback to word filtering
            words = re.findall(r"\b\w+\b", corrected.lower())
            keyword = " ".join(w for w in words if w not in stops and len(w) > 2)
            
        print(f"Final keyword: '{keyword}'")
        
        # Search for tenders
        results = scrape_gem_cppp(
            keyword=keyword.strip(),
            org_name=org, 
            start_date=start_date,
            end_date=end_date,
            max_pages=10  # upper bound on result pages to fetch
        )
        
        # Generate reply
        bot_reply = summarize_tenders(results)
        
    except Exception as e:
        import traceback
        print(f"Error in chat function: {e}")
        print(traceback.format_exc())
        bot_reply = f"Sorry, an error occurred while processing your request: {str(e)}"
        
    return bot_reply
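
# Illustrative flow (with the no-op correct_text fallback):
#   chat_fn("Find solar tenders from 01/06/2025 to 30/06/2025", [])
# extracts keyword="solar" and the two dates, scrapes the portal, and returns
# a Markdown summary string for the chat UI.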

# ─── 6. GRADIO APP ─────────────────────────────────────────────────────────────
with gr.Blocks() as demo:
    gr.Markdown("## Government Tender Search Chatbot")
    gr.Markdown("Ask me to find tenders by keyword, organization, or date range.")
    gr.ChatInterface(
        fn=chat_fn,
        title="TenderBot",
        description="E.g. Search solar panel tenders from 01/06/2025 to 30/06/2025",
        examples=[
            "Find solar panel tenders",
            "Search for IT tenders from Ministry of Defence",
            "Get construction tenders from 01/05/2025 to 30/06/2025"
        ],
    )

if __name__ == "__main__":
    # Launch with appropriate parameters
    demo.launch(debug=True, share=False)