import uuid
from os.path import exists
from bs4 import BeautifulSoup
import requests

'''
TODO:
- Error handling.
- Check whether an alternative to the main tag is needed. Provide an error message if the main tag is not found.
- Menus are li tags with a tags inside them (see the sketch below).
- li tags that contain both text and other tags should be exported.
- Find divs that have text or p tags, and maybe other tags such as nested divs.
- Export the text.
'''
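
# A minimal sketch of the li handling described in the TODO list above. The
# helper name is_menu_item() is an assumption and is not wired into the
# extraction flow: an li whose visible text comes only from nested a tags is
# treated as a menu entry and can be skipped, while an li that carries its own
# text should be exported.
def is_menu_item( li ):
    # Direct text nodes of the li, ignoring text inside nested tags.
    own_text = [ text for text in li.find_all( text=True, recursive=False ) if text.strip() ]
    return not own_text and li.find( 'a' ) is not None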

# Return the extracted text for a URL, caching it under page-content/.
# Assumes the page-content/ directory already exists.
def get_url_content( url ):
    file_path = 'page-content/' + uuid.uuid5( uuid.NAMESPACE_URL, url ).hex + '.txt'
    if exists( file_path ):
        with open( file_path, 'r' ) as file_content:
            content = file_content.read()
    else:
        content = extract_content( url )
        with open( file_path, 'w' ) as file:
            file.write( content.strip() )
    return content

# Extract the readable text for a URL, raising if no usable content is found.
def extract_content( url ):
    soup = get_soup( url )
    if soup is None:
        raise Exception( 'No HTML content found.' )
    content = get_main_content( soup )
    if content is None:
        raise Exception( 'No main content found.' )
    text = get_tags_text( content )
    if not text:
        raise Exception( 'No text found.' )
    return text

# Make the request and return the HTML as a BeautifulSoup object, caching the
# raw page under web-pages/. Assumes the web-pages/ directory already exists.
def get_soup( url ):
    file_path = 'web-pages/' + uuid.uuid5( uuid.NAMESPACE_URL, url ).hex + '.html'
    print( file_path )
    if exists( file_path ):
        with open( file_path, 'r' ) as web_page:
            html = web_page.read()
    else:
        headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.74 Safari/537.36'}
        response = requests.get( url, headers=headers )
        response.raise_for_status()
        if not response.text:
            raise Exception( 'HTML empty.' )
        html = response.text
        with open( file_path, 'w' ) as file:
            file.write( html )
    return BeautifulSoup( html, 'html.parser' )

# Find the tag that most likely wraps the main content of the page. Checked in
# priority order: known content wrapper classes and ids, then the generic main
# and article tags.
def get_main_content( soup ):
    selectors = (
        'div.post-body',
        'div.article-content',
        'div.entry-content',
        'div.region--content',
        'div.article',
        'div.article-inner_html',
        'div#bmdDetail-Content',
        'div#main',
        'main',
        'article',
    )
    for selector in selectors:
        content = soup.select_one( selector )
        if content is not None:
            print( 'Matched ' + selector + '.' )
            return content
    return None

# Collect the text of the matched tags. For a div, keep only its direct text so
# the text of nested tags is not added twice.
def get_tags_text( soup ):
    text = ''
    tags = soup.find_all( find_direct_text )
    for tag in tags:
        if tag.name == 'div' and tag.find( text=True, recursive=False ):
            for text_node in tag.find_all( text=True, recursive=False ):
                text += text_node.strip() + ' '
        else:
            text += tag.get_text().strip() + ' '
    return text
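
# Illustrative example (assumed markup, not from a real page): for
# '<div>Intro <p>Body text.</p></div>', the div contributes only its direct
# text 'Intro' and the p contributes 'Body text.', so get_tags_text() returns
# 'Intro Body text. '.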

# -------------------------------------- #
# Extract content from the main tag.
def get_main( soup ):
    return soup.main

# Get all of the deepest divs, i.e. divs that do not contain another div.
def get_deepest_divs( tag ):
    return [ div for div in tag.find_all( 'div' ) if not div.find( 'div' ) ]

def get_tag_text( tags ):
    text = ''
    for tag in tags:
        print( tag.find_all( 'li' ) )
        # text += ' '.join( p.get_text() for p in tag.find_all( 'p' ) )
    return text

def get_list_text( tags ):
    list_items = []
    for tag in tags:
        # Collect the matches from every tag instead of overwriting the list on each pass.
        list_items += tag.find_all( find_direct_text )
    return list_items

# Match tags whose text should be extracted directly.
def find_direct_text( tag ):
    return tag.name in ( 'li', 'p', 'h2', 'h3', 'span' ) or find_div_text( tag )

# Match divs that have non-whitespace text directly inside them.
def find_div_text( tag ):
    direct_text = tag.find( text=True, recursive=False )
    return tag.name == 'div' and direct_text is not None and direct_text.strip() != ''

if __name__ == '__main__':
    url = 'https://www.cms.gov/Medicare/Billing/ElectronicBillingEDITrans'
    print( extract_content( url ) )