Spaces:

grapplerulrich
/

raccoon

Sleeping

File size: 1,984 Bytes

8b32433

from bs4 import BeautifulSoup
import requests

'''
 - Error handling
 - Check whether an alternative to the main tag is needed. Provide an error message if the main tag is not found.
 - Menus are li tags with a tags within.
 - li tags with text and tags should be exported
 - Find divs that have text or p tags maybe other tags like divs
 - Export the text
'''

def get_soup(url, timeout=10):
    """Fetch *url* and return its HTML parsed as a BeautifulSoup tree.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely.

    Returns:
        A ``BeautifulSoup`` object built with the ``html.parser`` backend.

    Raises:
        requests.RequestException: If the request fails or times out, or if
            the server answers with a 4xx/5xx status code.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on HTTP errors instead of silently scraping an error page.
    response.raise_for_status()
    return BeautifulSoup(response.content, 'html.parser')

def get_main(soup):
    """Return the ``main`` attribute of *soup* (the page's <main> element).

    Whatever the attribute lookup yields is passed straight through to the
    caller, including ``None``.
    """
    main_tag = soup.main
    return main_tag

def is_childless(tag):
    """Return True when *tag* has no <div> descendants."""
    nested_divs = tag.find_all('div')
    return not nested_divs

def get_divs(tag):
    """Return the <div> descendants of *tag* that contain no nested <div>.

    The result is a lazy ``filter`` iterator, so it can be consumed only once.
    """
    # Keep only the innermost divs; is_childless rejects any div wrapping another.
    return filter(is_childless, tag.find_all('div'))


def extract_content(url):
    """Return the text of every <p> found in the innermost <div>s of <main>.

    The page at *url* is downloaded and parsed, its <main> element is
    located, and each <div> inside it that holds no further <div> is searched
    for <p> tags.

    Args:
        url: Address of the page to scrape.

    Returns:
        A list of strings, one per <p> tag, in document order.

    Raises:
        ValueError: If the page has no <main> tag.
    """
    soup = get_soup(url)
    main = get_main(soup)
    if main is None:
        # Without this guard the failure shows up later as an opaque
        # AttributeError on None.
        raise ValueError('No <main> tag found at {}'.format(url))
    # Iterate over the filtered divs; the previous code referenced an
    # undefined name and never used them.
    return [
        p.get_text()
        for div in get_divs(main)
        for p in div.find_all('p')
    ]


#   # Get all the divs from within the main tag.
#   divs = soup.main.find_all('div')
#   for div in divs:
#     # Get all of the divs that do not have further divs within.
#     no_child_div = len(div.find_all('div')) == 0
#     if no_child_div:
#       # Find all p tags in the div.
#       content += [p.get_text() for p in div.find_all('p')]
#       # Find all li in the div.
#       for li in div.find_all('li'):
#         # 
#         content += ''.join(li.find_all(text=True, recursive=False))
#       content += ''.join(div.find_all(text=True, recursive=False))
#   return content

# Script entry point. The guard must compare against '__main__' (with the
# trailing underscores), otherwise this body never runs.
if __name__ == '__main__':
    url = 'https://www.cms.gov/Medicare/Billing/ElectronicBillingEDITrans'
    print(extract_content(url))