"""Fetch a web page (with a local file cache) and extract the readable text
from its ``<main>`` tag.

Pages are cached under ``web-pages/`` keyed by a UUID5 of the URL so repeated
runs do not re-download.
"""
import os
import uuid
from os.path import exists

import requests
from bs4 import BeautifulSoup

'''
TODO:
- Error handling
- Look if an alternative to the main tag is needed. Provide an error message
  if the main tag is not found.
- Menus are li tags with a tags within.
- li tags with text and tags should be exported.
- Find divs that have text or p tags, maybe other tags like divs.
- Export the text.
'''

# Cache directory for downloaded pages.
CACHE_DIR = 'web-pages'


def get_soup( url ):
    """Return a BeautifulSoup for *url*, serving from the on-disk cache when
    possible; download and cache the page otherwise.

    Returns None (after printing a message) when the page cannot be retrieved.
    """
    file_path = CACHE_DIR + '/' + uuid.uuid5( uuid.NAMESPACE_URL, url ).hex + '.html'
    if exists( file_path ):
        # The cache is written as raw bytes, so read it back as bytes too —
        # BeautifulSoup detects the encoding itself. (The original opened the
        # file in text mode with the platform default encoding, which could
        # mangle non-ASCII pages.)
        with open( file_path, 'rb' ) as web_page:
            html = web_page.read()
    else:
        try:
            # Catch only requests' own errors — a bare except would also
            # swallow KeyboardInterrupt/SystemExit.
            request = requests.get(url)
        except requests.exceptions.RequestException:
            print('Unable to retrieve content, skipping URL')
            return
        if not request.ok:
            print( "Unable to retrieve content, skipping URL. Status code: {}".format( request.status_code ) )
            return
        if not request.content:
            print(request.content)
            return
        html = request.content
        # Make sure the cache directory exists before writing (the original
        # crashed with FileNotFoundError on a fresh checkout).
        os.makedirs( CACHE_DIR, exist_ok=True )
        with open( file_path, 'wb' ) as file:
            file.write( html )
    return BeautifulSoup(html, 'html.parser')


def get_main( soup ):
    """Return the first <main> tag of the document, or None if absent."""
    return soup.main


def get_deepest_divs( tag ):
    """Return all <div>s inside *tag* that contain no nested <div>."""
    # find_all is the modern spelling of the deprecated findAll alias.
    return [div for div in tag.find_all('div') if not div.find('div')]


def get_tag_text( tags ):
    """Return the concatenated text of every <p> within *tags*.

    (Replaces the earlier debug stub, which printed the <li> tags and
    always returned an empty string.)
    """
    text = ''
    for tag in tags:
        text += ''.join(p.get_text() for p in tag.find_all('p'))
    return text


def get_list_text( tags ):
    """Return the text-bearing tags (li/p/h2/h3) found in all of *tags*.

    Bug fix: the original reassigned the accumulator on every iteration,
    so only the matches from the last tag were returned.
    """
    list_items = []
    for tag in tags:
        list_items.extend( tag.find_all(find_direct_text) )
    return list_items


def find_direct_text( tag ):
    """BeautifulSoup filter: match tags that carry directly-exportable text."""
    return tag.name in ('li', 'p', 'h2', 'h3')


def extract_content( url ):
    """Return the whitespace-joined text of the <main> tag of *url*.

    Returns None when the page could not be fetched, or the string
    'No main tag found.' when the page has no <main> element.
    """
    soup = get_soup( url )
    if soup is None:
        return None
    main = get_main( soup )
    if main is None:
        return 'No main tag found.'
    return ''.join([' ' + tag.get_text().strip() for tag in main.find_all( find_direct_text )])


if __name__ == '__main__':
    url = 'https://www.cms.gov/Medicare/Billing/ElectronicBillingEDITrans'
    print(extract_content(url))