'''
This script fetches the bill text from Congress.gov using alternative methods.

Each URL in URLS is tried in order; the first page that yields a substantial
amount of text is written to OUTPUT_PATH as plain text.
'''

import os
import time
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# Working URLs for bill text
URLS = [
    "https://www.congress.gov/119/bills/hr1/BILLS-119hr1eh.htm",
    "https://www.congress.gov/119/bills/hr1/BILLS-119hr1ih.htm"
]

# Output directory and file path
OUTPUT_DIR = "books"
FILE_NAME = "one_big_beautiful_bill.txt"
OUTPUT_PATH = os.path.join(OUTPUT_DIR, FILE_NAME)

# Browser-like request headers.
# 'Accept-Encoding' is deliberately NOT set here: Requests only decodes
# Brotli ("br") bodies when an optional Brotli package is installed, so
# advertising "br" unconditionally can leave response.content compressed.
# The session default already lists exactly the encodings that can be decoded.
_BROWSER_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'Accept-Language': 'en-US,en;q=0.9',
    'DNT': '1',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'none',
    'Sec-Fetch-User': '?1',
    'Cache-Control': 'max-age=0',
}

# CSS selectors tried, in order, to locate the element holding the bill text
_CONTENT_SELECTORS = (
    '.bill-text-container',
    '.generated-html-container',
    '.main-content',
    '#main-content',
    '.bill-text',
    'main',
)

# Extracted text must be longer than this (after stripping) to be saved
_MIN_CONTENT_LENGTH = 100

# Seconds to wait before each request, to be respectful to the server
_REQUEST_DELAY_SECONDS = 2

# Seconds before a request is abandoned
_REQUEST_TIMEOUT_SECONDS = 30


def _extract_bill_text(soup):
    '''Returns the text of the first matching content element, else the whole page text.'''
    bill_text = None

    # The first selector that matches wins, even if its text turns out empty
    for selector in _CONTENT_SELECTORS:
        content = soup.select_one(selector)
        if content is not None:
            bill_text = content.get_text(separator='\n', strip=True)
            print(f"Found content using selector: {selector}")
            break

    # If no specific selector produced text, fall back to <body> (or the
    # whole document when there is no <body>)
    if not bill_text:
        root = soup.body if soup.body else soup
        bill_text = root.get_text(separator='\n', strip=True)
        print("Using general body text extraction")

    return bill_text


def _save_bill_text(url, bill_text):
    '''Writes bill_text to OUTPUT_PATH, preceded by a header naming the source URL.'''
    print(f"Saving text to {OUTPUT_PATH}...")
    with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
        f.write(f"Fetched from: {url}\n")
        f.write("=" * 50 + "\n\n")
        f.write(bill_text)
    print(f"Successfully saved bill text to {OUTPUT_PATH}")
    print(f"Content length: {len(bill_text)} characters")


def fetch_doc():
    '''Fetches the bill text from Congress.gov and saves it as plain text.

    Tries each URL in URLS in order and stops at the first one whose extracted
    text is longer than the minimum length. Progress and errors are reported
    with print(); nothing is raised to the caller and the function always
    returns None.
    '''
    try:
        # Create the output directory if it doesn't exist
        if not os.path.exists(OUTPUT_DIR):
            os.makedirs(OUTPUT_DIR, exist_ok=True)
            print(f"Created directory: {OUTPUT_DIR}")

        # One session for all attempts keeps cookies and connection state,
        # and the context manager guarantees its connections are released
        with requests.Session() as session:
            session.headers.update(_BROWSER_HEADERS)

            # Try each working URL
            for attempt, url in enumerate(URLS, start=1):
                print(f"Fetching from URL {attempt}: {url}...")

                try:
                    # Add a small delay to be respectful
                    time.sleep(_REQUEST_DELAY_SECONDS)

                    response = session.get(url, timeout=_REQUEST_TIMEOUT_SECONDS)
                    response.raise_for_status()  # Raise an exception for HTTP errors
                except requests.exceptions.RequestException as e:
                    print(f"Error fetching from {url}: {e}")
                    continue

                print(f"Content fetched successfully from {url}")

                # Parse the HTML using BeautifulSoup
                print("Parsing HTML content...")
                soup = BeautifulSoup(response.content, 'html.parser')
                print("HTML parsed successfully.")

                bill_text = _extract_bill_text(soup)

                # Make sure we got substantial content before writing anything
                if bill_text and len(bill_text.strip()) > _MIN_CONTENT_LENGTH:
                    _save_bill_text(url, bill_text)
                    return  # Success, exit the function

                print(f"Content too short or empty from {url}, trying next URL...")

        # If we get here, all working URLs failed
        print("All working URLs failed. Please check:")
        print("1. Your internet connection")
        print("2. If Congress.gov is accessible")
        print("3. Try running the script again later")

    except IOError as e:
        print(f"Error writing file: {e}")
    except Exception as e:
        # Top-level boundary of the script: report the failure instead of
        # letting a traceback escape
        print(f"An unexpected error occurred: {e}")


if __name__ == "__main__":
    fetch_doc()