grapplerulrich's picture
initial version with small test for Beautiful Soup
8b32433
from bs4 import BeautifulSoup
import requests
'''
- Error handling
- Check whether an alternative to the main tag is needed. Provide an error message if the main tag is not found.
- Menus are li tags with a tags within.
- li tags that contain text and nested tags should be exported
- Find divs that have text or p tags, and maybe other tags such as divs
- Export the text
'''
# Make request and get html content.
def get_soup(url, timeout=10):
    """Fetch ``url`` and parse the response body with BeautifulSoup.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on an unresponsive
            host.

    Returns:
        A ``BeautifulSoup`` object built from the raw response body using
        the ``html.parser`` backend.

    Raises:
        requests.exceptions.RequestException: If the request fails, times
            out, or the server answers with a 4xx/5xx status code.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on error statuses instead of parsing an error page as if
    # it were the requested content.
    response.raise_for_status()
    return BeautifulSoup(response.content, 'html.parser')
# Extract content from main tag.
def get_main(soup):
    """Return the ``main`` attribute of ``soup``.

    For a BeautifulSoup document this is the first ``<main>`` tag, or
    ``None`` when the document has no such tag.
    """
    main_tag = soup.main
    return main_tag
def is_childless(tag):
    """Tell whether ``tag`` contains no nested ``div`` elements.

    Only ``div`` elements are considered; other kinds of nested tags do not
    affect the result.
    """
    nested_divs = tag.find_all('div')
    # An empty result means there is no div inside this tag.
    return not nested_divs
def get_divs(tag):
    """Return a lazy iterator over the innermost ``div`` elements of ``tag``.

    Every ``div`` found inside ``tag`` is a candidate; only those for which
    ``is_childless`` is true (no further ``div`` inside) are yielded. The
    result is a one-shot ``filter`` iterator, not a list.
    """
    return filter(is_childless, tag.find_all('div'))
def extract_content(url):
    """Download ``url`` and return the paragraph text found in its main tag.

    Only ``p`` tags located inside ``div`` elements that contain no further
    ``div`` are considered.

    Args:
        url: Address of the page to scrape.

    Returns:
        A list with the text of each matching ``p`` tag, in the order the
        divs are produced by ``get_divs``.

    Raises:
        ValueError: If the page has no ``main`` tag.
    """
    soup = get_soup(url)
    main = get_main(soup)
    if main is None:
        # Without this guard the failure would surface later as an opaque
        # AttributeError on None.
        raise ValueError('No main tag found at {}'.format(url))

    paragraphs = []
    # Walk the divs that were actually collected; referencing a bare ``div``
    # outside of a loop is a NameError.
    for div in get_divs(main):
        paragraphs.extend(p.get_text() for p in div.find_all('p'))
    return paragraphs
# # Get all the divs from within the main tag.
# divs = soup.main.find_all('div')
# for div in divs:
# # Get all of the divs that do not have further divs within.
# no_child_div = len(div.find_all('div')) == 0
# if no_child_div:
# # Find all p tags in the div.
# content += [p.get_text() for p in div.find_all('p')]
# # Find all li in the div.
# for li in div.find_all('li'):
# #
# content += ''.join(li.find_all(text=True, recursive=False))
# content += ''.join(div.find_all(text=True, recursive=False))
# return content
# Manual smoke test against a sample page when the file is run as a script.
# The module name of a script is '__main__' (underscores on both sides); a
# guard comparing against '__main' never matches and silently does nothing.
if __name__ == '__main__':
    url = 'https://www.cms.gov/Medicare/Billing/ElectronicBillingEDITrans'
    print(extract_content(url))