from bs4 import BeautifulSoup
import requests

# TODO:
# - Error handling.
# - Look if an alternative to the main tag is needed. Provide an error
#   message if the main tag is not found.
# - Menus are li tags with a tags within.
# - li tags with text and tags should be exported.
# - Find divs that have text or p tags, maybe other tags like divs.
# - Export the text.


def get_soup(url, timeout=30):
    """Fetch *url* and return its body parsed as a BeautifulSoup document.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests.get`` may block indefinitely.

    Returns:
        A ``BeautifulSoup`` object built with the ``html.parser`` backend.

    Raises:
        requests.exceptions.RequestException: If the request itself fails
            (connection error, timeout, invalid URL, ...).
    """
    # TODO: the HTTP status code is not checked, so the body of an error
    # page (e.g. a 404) is parsed like any other page. Decide whether such
    # URLs should be skipped or reported.
    response = requests.get(url, timeout=timeout)
    return BeautifulSoup(response.content, 'html.parser')


def get_main(soup):
    """Return the first ``<main>`` tag of *soup*, or ``None`` if absent."""
    return soup.main


def is_childless(tag):
    """Return True when *tag* has no ``<div>`` descendants."""
    return tag.find('div') is None


def get_divs(tag):
    """Return an iterator over the ``<div>`` descendants of *tag* that do
    not themselves contain any further ``<div>``.
    """
    return filter(is_childless, tag.find_all('div'))


def extract_content(url):
    """Download *url* and collect paragraph text from its ``<main>`` tag.

    Only ``<p>`` tags located inside "leaf" divs (divs without nested divs)
    of the ``<main>`` tag are considered.

    Args:
        url: Address of the page to scrape.

    Returns:
        A list with the text of every matching ``<p>`` tag, in document
        order.

    Raises:
        ValueError: If the page has no ``<main>`` tag.
        requests.exceptions.RequestException: If the page cannot be
            downloaded.
    """
    soup = get_soup(url)
    main = get_main(soup)
    if main is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise ValueError('No <main> tag found at {}'.format(url))

    # TODO: also export the text of <li> tags and the text placed directly
    # inside the leaf divs (not wrapped in <p>).
    return [
        p.get_text()
        for div in get_divs(main)
        for p in div.find_all('p')
    ]


if __name__ == '__main__':
    url = 'https://www.cms.gov/Medicare/Billing/ElectronicBillingEDITrans'
    print(extract_content(url))