open-navigator / scripts /development /debug_eboard.py
jcbowyer's picture
Clean HuggingFace deployment without binary files
61d29fc
#!/usr/bin/env python3
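"""
Debug script to examine eBoard page structure.

Loads a Simbli eBoard meeting-listing page in headless Chromium, saves the
rendered HTML to /tmp/eboard_page.html, and prints a summary of the links,
tables, and script blocks found on the page.
"""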
import asyncio
from playwright.async_api import async_playwright
from playwright_stealth import Stealth
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import re
# Where the rendered HTML is dumped for manual inspection.
_HTML_DUMP_PATH = '/tmp/eboard_page.html'


async def _fetch_rendered_html(url):
    """Load *url* in headless Chromium and return the rendered HTML.

    Prints the HTTP status of the main navigation. The browser is always
    closed, even when navigation or rendering raises.

    Args:
        url: Address of the page to load.

    Returns:
        The page's serialized HTML (``page.content()``) as a string.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(
            headless=True,
            args=[
                '--disable-blink-features=AutomationControlled',
                '--disable-dev-shm-usage',
                '--no-sandbox'
            ]
        )
        try:
            user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36'
            context = await browser.new_context(
                viewport={'width': 1920, 'height': 1080},
                user_agent=user_agent,
                locale='en-US',
                timezone_id='America/Chicago',
            )
            page = await context.new_page()

            # Apply stealth
            stealth = Stealth()
            await stealth.apply_stealth_async(page)

            # Navigate
            response = await page.goto(url, wait_until='networkidle', timeout=60000)
            # goto() may return None (no main-resource response), so do not
            # dereference it unconditionally.
            status = response.status if response is not None else 'unknown (no response)'
            print(f"Response status: {status}")

            # Give client-side JavaScript extra time after network idle.
            await page.wait_for_timeout(5000)
            return await page.content()
        finally:
            await browser.close()


def _categorize_links(anchors):
    """Sort anchor tags into MID, meeting-detail, PDF and other buckets.

    Each anchor goes into the first bucket whose test matches, checked in
    that order. Anchors matching none of the first three tests are kept as
    "other" unless the href is empty, a ``#`` fragment or ``javascript:``.

    Args:
        anchors: Iterable of BeautifulSoup ``<a>`` tags that have an ``href``.

    Returns:
        A 4-tuple ``(mid, meetingdetail, pdf, other)`` of lists holding
        ``(href, text)`` pairs; the text in ``other`` is cut to 50 characters.
    """
    mid_links = []
    meetingdetail_links = []
    pdf_links = []
    other_links = []

    for link in anchors:
        href = link.get('href', '')
        text = link.get_text().strip()
        lowered = href.lower()

        if 'MID=' in href.upper():
            mid_links.append((href, text))
        elif 'meetingdetail' in lowered:
            meetingdetail_links.append((href, text))
        elif lowered.endswith('.pdf'):
            pdf_links.append((href, text))
        elif href and not href.startswith(('#', 'javascript:')):
            other_links.append((href, text[:50]))

    return mid_links, meetingdetail_links, pdf_links, other_links


def _print_link_group(heading, links, limit):
    """Print the size of a link bucket followed by its first *limit* entries."""
    print(f"{heading}: {len(links)}")
    for href, text in links[:limit]:
        print(f" - {text[:60]}: {href[:80]}")


def _print_page_analysis(soup):
    """Print ViewState, table, script and meeting-list findings for *soup*."""
    print("\n" + "="*80)
    print("Page Analysis:")
    print("="*80)

    viewstate = soup.find('input', {'id': '__VIEWSTATE'})
    if viewstate is not None:
        print(f"✓ ASP.NET ViewState present ({len(viewstate.get('value', ''))} chars)")

    # Look for tables or grids that might contain meetings
    tables = soup.find_all('table')
    print(f"Tables found: {len(tables)}")
    for i, table in enumerate(tables[:5]):
        rows = table.find_all('tr')
        print(f" Table {i+1}: {len(rows)} rows")
        if rows:
            first_row_text = rows[0].get_text().strip()[:100]
            print(f" First row: {first_row_text}")

    # Look for JavaScript-rendered content
    scripts = soup.find_all('script')
    print(f"\nJavaScript blocks: {len(scripts)}")

    # Any element whose id contains "meeting" followed later by "list"
    # (case-insensitive).
    meeting_list_elem = soup.find(id=re.compile(r'meeting.*list', re.I))
    if meeting_list_elem is not None:
        print(f"✓ Found element with 'meeting' and 'list' in ID: {meeting_list_elem.get('id')}")


async def main():
    """Fetch the eBoard meeting listing and print a structural summary.

    Renders the page in headless Chromium, writes the HTML to
    ``/tmp/eboard_page.html``, then reports link categories, tables, script
    blocks and ASP.NET markers found in the markup.
    """
    url = "https://simbli.eboardsolutions.com/SB_Meetings/SB_MeetingListing.aspx?S=2088"
    print(f"Loading: {url}\n")

    content = await _fetch_rendered_html(url)
    # Report real bytes (UTF-8), matching what is written to disk below.
    print(f"Page size: {len(content.encode('utf-8'))} bytes\n")

    # Save full HTML for inspection; explicit encoding avoids locale-dependent
    # failures on non-ASCII page content.
    with open(_HTML_DUMP_PATH, 'w', encoding='utf-8') as f:
        f.write(content)
    print(f"Saved full HTML to {_HTML_DUMP_PATH}\n")

    # Parse with BeautifulSoup
    soup = BeautifulSoup(content, 'html.parser')

    # Find all links
    all_links = soup.find_all('a', href=True)
    print(f"Total links found: {len(all_links)}\n")

    mid_links, meetingdetail_links, pdf_links, other_links = _categorize_links(all_links)
    _print_link_group("Links with MID=", mid_links, 10)
    _print_link_group("\nLinks with 'meetingdetail'", meetingdetail_links, 10)
    _print_link_group("\nPDF links", pdf_links, 10)
    _print_link_group("\nOther significant links", other_links, 20)

    _print_page_analysis(soup)
if __name__ == "__main__":
    # Script entry point: drive the async debug routine to completion.
    asyncio.run(main())