import uuid
import json
from os import makedirs, remove
from os.path import exists, dirname
from bs4 import BeautifulSoup
import requests

'''
 - Error handling
 - Look if alternative to main tag is needed. Provide error message if main tag is not found.
 - Menus are li tags with a tags within.
 - li tags with text and tags should be exported
 - Find divs that have text or p tags maybe other tags like divs
 - Export the text
'''

def get_url_content( url ):
    """Return the extracted strings for ``url``, backed by an on-disk JSON cache.

    The cache file lives under ``page-content/`` and is named after the hex
    form of the UUIDv5 (URL namespace) of ``url``. On a cache hit the stored
    JSON is returned as-is. On a miss the strings come from
    ``extract_strings``; they are written to the cache and then returned.
    Any exception raised by ``extract_strings`` propagates to the caller and
    nothing is cached in that case.
    """
    cache_name = uuid.uuid5( uuid.NAMESPACE_URL, url ).hex
    cache_path = 'page-content/' + cache_name + '.json'
    makedirs( dirname( cache_path ), exist_ok=True )

    # Cache hit: hand back what was stored on a previous run.
    if exists( cache_path ):
        with open( cache_path, 'r' ) as cached:
            return json.load( cached )

    # Cache miss: extract first, persist only after extraction succeeded.
    extracted = extract_strings( url )
    with open( cache_path, 'w' ) as cached:
        json.dump( extracted, cached )
    return extracted

def extract_strings( url ):
    """Fetch ``url`` and return the list of text strings found in its main content.

    The page is loaded through ``get_soup``, ``<script>`` and ``<style>``
    elements are removed, the main content container is located with
    ``get_main_content`` and its text is collected with ``get_tags_text``.

    Raises:
        Exception: 'No HTML content found.' when ``get_soup`` returns None,
            'No main content found.' when no content container is located,
            'No text found.' when the container yields no text at all.
        Any exception raised by ``get_soup`` propagates unchanged.
    """
    soup = get_soup( url )
    if soup is None:
        raise Exception('No HTML content found.')

    # Remove scripts and styles so their source never ends up in the text.
    for unwanted in soup( ["script", "style"] ):
        unwanted.decompose()

    content = get_main_content( soup )
    if content is None:
        raise Exception('No main content found.')

    strings = get_tags_text( content )
    # get_tags_text always returns a list, so "nothing found" shows up as an
    # empty list rather than None; test for emptiness so this error can fire.
    if not strings:
        raise Exception('No text found.')
    return strings

# Make request and get html content.
def get_soup( url ):
    """Return a BeautifulSoup tree for ``url``, caching the raw HTML on disk.

    The HTML is stored under ``web-pages/`` in a file named after the hex
    form of the UUIDv5 (URL namespace) of ``url``. When that file exists it
    is parsed and no request is made; otherwise the page is downloaded with
    a browser-like User-Agent header, saved, and parsed.

    Raises:
        requests.RequestException: on connection errors, timeouts, or an
            HTTP error status (via ``raise_for_status``).
        Exception: 'HTML empty.' when the response body is empty.
    """
    file_path = 'web-pages/' + uuid.uuid5( uuid.NAMESPACE_URL, url ).hex + '.html'
    makedirs( dirname( file_path ), exist_ok=True )
    if exists( file_path ):
        # Read with an explicit encoding so the result does not depend on the
        # platform default; errors='replace' keeps cache files written under a
        # different default encoding readable instead of raising.
        with open( file_path, 'r', encoding='utf-8', errors='replace' ) as web_page:
            html = web_page.read()
    else:
        headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.74 Safari/537.36'}
        # Without a timeout requests waits indefinitely on an unresponsive server.
        response = requests.get( url, headers=headers, timeout=30 )
        response.raise_for_status()
        if not response.text:
            raise Exception('HTML empty.')
        html = response.text
        # UTF-8 can represent any character in the decoded response, unlike
        # some platform default encodings (e.g. cp1252).
        with open( file_path, 'w', encoding='utf-8' ) as file:
            file.write( html )

    return BeautifulSoup( html, 'html.parser' )

def get_main_content( soup ):
    """Return the element most likely to hold the page's main content.

    Candidates are tried in a fixed priority order and the first match wins:
    a ``div`` with one of several known classes or ids, then ``soup.main``,
    then the first ``article`` element, then ``body``. A short message naming
    the matched candidate is printed. Returns None when nothing matches.
    """
    # (attribute, value) pairs for div lookups, highest priority first.
    div_selectors = (
        ( 'class', 'post-body' ),
        ( 'class', 'article-content' ),
        ( 'class', 'blog-post-content' ),
        ( 'class', 'region-content' ),
        ( 'class', 'entry-content' ),
        ( 'class', 'region--content' ),
        ( 'class', 'article' ),
        ( 'class', 'article-inner_html' ),
        ( 'id', 'bmdDetail-Content' ),
        ( 'id', 'main' ),
    )
    for attribute, value in div_selectors:
        content = soup.find( "div", { attribute: value } )
        if content is not None:
            # The message names the selector that actually matched.
            print( 'Has .{} {}.'.format( value, attribute ) )
            return content

    content = soup.main
    if content is not None:
        print('Has main tag.')
        return content

    # Generic fallbacks when no known container is present.
    for tag_name in ( 'article', 'body' ):
        content = soup.find( tag_name )
        if content is not None:
            print( 'Has {} tag.'.format( tag_name ) )
            return content

    return None

def get_tags_text( soup ):
    """Collect the non-empty text of every element accepted by ``allowed_tags``.

    For ``div`` elements only the text nodes that are direct children are
    taken, one entry per node. For every other accepted element the whole
    text of the element is taken as a single entry. Each entry is produced
    with ``get_text( ' ', strip=True )`` and empty results are skipped.
    Returns a list of strings in document order.
    """
    collected = []
    for element in soup.find_all( allowed_tags ):
        if element.name == 'div':
            # Direct text nodes only; nested elements are matched on their own.
            sources = element.find_all( text=True, recursive=False )
        else:
            sources = [ element ]

        for source in sources:
            cleaned = source.get_text( ' ', strip=True )
            if cleaned:
                collected.append( cleaned )
    return collected

def allowed_tags( tag ):
    """Return True when ``tag.name`` is one of the element names whose text is exported."""
    exported_names = ( 'li', 'p', 'h1', 'h2', 'h3', 'span', 'div' )
    return tag.name in exported_names

# -------------------------------------- #

# Extract content from main tag.
def get_main( soup ):
    """Return the ``main`` attribute of ``soup`` (whatever ``soup.main`` evaluates to)."""
    main_element = soup.main
    return main_element

def get_deepest_divs( tag ):
    """Return the ``div`` descendants of ``tag`` that contain no nested ``div``.

    Only the innermost ("leaf") divs are kept; a div that has another div
    anywhere inside it is skipped. Order follows ``tag.find_all( 'div' )``.
    """
    # find_all is the current spelling; findAll is the legacy BS3 alias.
    return [ div for div in tag.find_all( 'div' ) if not div.find( 'div' ) ]

def get_tag_text( tags ):
    """Print the ``li`` matches of each tag in ``tags`` and return an empty string.

    No text is collected yet: the return value is always ``''``.
    """
    for current in tags:
        li_matches = current.find_all( 'li' )
        print( li_matches )
    # TODO: gather the text of each tag's <p> descendants into the result.
    return ''

def get_list_text( tags ):
    """Return every element matched by ``find_direct_text`` across all ``tags``.

    Matches from each tag are appended in order, so the result covers the
    whole input and not only its last tag. Returns an empty list when
    ``tags`` is empty.
    """
    list_items = []
    for tag in tags:
        # NOTE(review): find_direct_text is not defined in the visible part of
        # this file - confirm it exists elsewhere, or substitute the intended
        # predicate (find_div_text looks like the closest candidate).
        list_items.extend( tag.find_all( find_direct_text ) )
    return list_items

def find_div_text( tag ):
    """Tell whether ``tag`` is a ``div`` that has non-blank text as a direct child.

    Returns False when ``tag`` is not a div. Otherwise looks up the first
    direct text node: if that lookup is falsy (for example None) the lookup
    result itself is returned, else the stripped text is returned, which is
    an empty string for whitespace-only text. The result is meant to be used
    for its truthiness.
    """
    if tag.name != 'div':
        return False

    direct_text = tag.find( text=True, recursive=False )
    if not direct_text:
        return direct_text
    return direct_text.strip()

if __name__ == '__main__':
    # Manual smoke test: extract and print the strings of a sample page.
    # extract_strings is the extraction entry point defined in this module;
    # the previously referenced extract_content does not exist here.
    url = 'https://www.cms.gov/Medicare/Billing/ElectronicBillingEDITrans'
    print( extract_strings( url ) )