"""Fetch a web page (with a local file cache) and extract the readable text
from its ``<main>`` tag.

Pages are cached under ``web-pages/`` keyed by a UUID5 of the URL so repeated
runs do not re-download.
"""
import os
import uuid
from os.path import exists

import requests
from bs4 import BeautifulSoup

'''
TODO:
- Error handling
- Look if an alternative to the main tag is needed. Provide an error message
  if the main tag is not found.
- Menus are li tags with a tags within.
- li tags with text and tags should be exported.
- Find divs that have text or p tags, maybe other tags like divs.
- Export the text.
'''

# Cache directory for downloaded pages.
CACHE_DIR = 'web-pages'


def get_soup( url ):
    """Return a BeautifulSoup for *url*, serving from the on-disk cache when
    possible; download and cache the page otherwise.

    Returns None (after printing a message) when the page cannot be retrieved.
    """
    file_path = CACHE_DIR + '/' + uuid.uuid5( uuid.NAMESPACE_URL, url ).hex + '.html'
    if exists( file_path ):
        # The cache is written as raw bytes, so read it back as bytes too —
        # BeautifulSoup detects the encoding itself. (The original opened the
        # file in text mode with the platform default encoding, which could
        # mangle non-ASCII pages.)
        with open( file_path, 'rb' ) as web_page:
            html = web_page.read()
    else:
        try:
            # Catch only requests' own errors — a bare except would also
            # swallow KeyboardInterrupt/SystemExit.
            request = requests.get(url)
        except requests.exceptions.RequestException:
            print('Unable to retrieve content, skipping URL')
            return
        if not request.ok:
            print( "Unable to retrieve content, skipping URL. Status code: {}".format( request.status_code ) )
            return
        if not request.content:
            print(request.content)
            return
        html = request.content
        # Make sure the cache directory exists before writing (the original
        # crashed with FileNotFoundError on a fresh checkout).
        os.makedirs( CACHE_DIR, exist_ok=True )
        with open( file_path, 'wb' ) as file:
            file.write( html )
    return BeautifulSoup(html, 'html.parser')


def get_main( soup ):
    """Return the first <main> tag of the document, or None if absent."""
    return soup.main


def get_deepest_divs( tag ):
    """Return all <div>s inside *tag* that contain no nested <div>."""
    # find_all is the modern spelling of the deprecated findAll alias.
    return [div for div in tag.find_all('div') if not div.find('div')]


def get_tag_text( tags ):
    """Return the concatenated text of every <p> within *tags*.

    (Replaces the earlier debug stub, which printed the <li> tags and
    always returned an empty string.)
    """
    text = ''
    for tag in tags:
        text += ''.join(p.get_text() for p in tag.find_all('p'))
    return text


def get_list_text( tags ):
    """Return the text-bearing tags (li/p/h2/h3) found in all of *tags*.

    Bug fix: the original reassigned the accumulator on every iteration,
    so only the matches from the last tag were returned.
    """
    list_items = []
    for tag in tags:
        list_items.extend( tag.find_all(find_direct_text) )
    return list_items


def find_direct_text( tag ):
    """BeautifulSoup filter: match tags that carry directly-exportable text."""
    return tag.name in ('li', 'p', 'h2', 'h3')


def extract_content( url ):
    """Return the whitespace-joined text of the <main> tag of *url*.

    Returns None when the page could not be fetched, or the string
    'No main tag found.' when the page has no <main> element.
    """
    soup = get_soup( url )
    if soup is None:
        return None
    main = get_main( soup )
    if main is None:
        return 'No main tag found.'
    return ''.join([' ' + tag.get_text().strip() for tag in main.find_all( find_direct_text )])


if __name__ == '__main__':
    url = 'https://www.cms.gov/Medicare/Billing/ElectronicBillingEDITrans'
    print(extract_content(url))