import uuid
import json
from os import makedirs, remove
from os.path import exists, dirname

import requests
from bs4 import BeautifulSoup

'''
- Error handling.
- Look if an alternative to the main tag is needed. Provide an error message if the main tag is not found.
- Menus are li tags with a tags within.
- li tags with text and tags should be exported.
- Find divs that have text or p tags, maybe other tags like divs.
- Export the text.
'''


# Get a list of strings from the page at the given URL, using a JSON cache.
def get_url_content( url ):
    file_path = 'page-content/' + uuid.uuid5( uuid.NAMESPACE_URL, url ).hex + '.json'

    # Create the cache directory if it doesn't exist.
    makedirs( dirname( file_path ), exist_ok=True )

    # If a cache file exists, get the content from the cache.
    if exists( file_path ):
        with open( file_path, 'r' ) as file:
            strings = json.load( file )
    else:
        strings = extract_strings( url )

        # Write the strings to the cache.
        with open( file_path, 'w' ) as file:
            json.dump( strings, file )

    return strings


# Extract text from the page at the given URL.
def extract_strings( url ):
    # Fetch and parse the HTML content using BeautifulSoup.
    soup = get_soup( url )

    if soup is None:
        raise Exception( 'No HTML content found.' )

    # Remove scripts and styles.
    for script in soup( [ "script", "style" ] ):
        script.decompose()

    # Get the main content of the HTML page.
    content = get_main_content( soup )

    if content is None:
        raise Exception( 'No main content found.' )

    # Extract strings from the main content based on allowed tags.
    strings = get_tags_text( content )

    if not strings:
        raise Exception( 'No text found.' )

    return strings


# Make the request and get the HTML content, using an HTML cache.
def get_soup( url ):
    file_path = 'web-pages/' + uuid.uuid5( uuid.NAMESPACE_URL, url ).hex + '.html'

    makedirs( dirname( file_path ), exist_ok=True )

    # If a cache file exists, get the content from the cache.
    if exists( file_path ):
        with open( file_path, 'r' ) as web_page:
            html = web_page.read()
    else:
        # Add a user agent header to make the request look more realistic.
        headers = { 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.74 Safari/537.36' }
        response = requests.get( url, headers=headers )

        # Raise an exception if the response status is not 200.
        response.raise_for_status()

        if not response.text:
            raise Exception( 'HTML empty.' )

        html = response.text

        # Save the HTML to the cache.
        with open( file_path, 'w' ) as file:
            file.write( html )

    return BeautifulSoup( html, 'html.parser' )
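
# The os.remove import above is otherwise unused; a plausible use for it is a
# cache-clearing helper. This is a hypothetical sketch, not part of the original
# script: clear_cache is an assumed name, and it deletes both cache files for a
# URL so that the next get_url_content call re-fetches and re-extracts the page.
def clear_cache( url ):
    key = uuid.uuid5( uuid.NAMESPACE_URL, url ).hex

    for file_path in ( 'page-content/' + key + '.json', 'web-pages/' + key + '.html' ):
        if exists( file_path ):
            remove( file_path )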

# Find the main content of the HTML page based on a list of rules, checked in
# order from most site-specific to most generic.
def get_main_content( soup ):
    # Known site-specific content wrappers.
    div_rules = [
        { "class": "post-body" },
        { "class": "article-content" },
        { "class": "blog-post-content" },
        { "class": "region-content" },
        { "class": "entry-content" },
        { "class": "region--content" },
        { "class": "article" },
        { "class": "article-inner_html" },
        { "id": "bmdDetail-Content" },
        { "id": "main" },
    ]

    for attrs in div_rules:
        content = soup.find( "div", attrs )

        if content is not None:
            return content

    # Generic fallbacks: the main tag, then article, then the whole body.
    for tag_name in ( "main", "article", "body" ):
        content = soup.find( tag_name )

        if content is not None:
            return content

    return None


# Extract text from allowed tags.
def get_tags_text( soup ):
    text = []

    # Find all tags that are allowed.
    tags = soup.find_all( allowed_tags )

    # Loop through the tags and extract their text.
    for tag in tags:
        # For divs, only take text nodes sitting directly inside the div, so
        # text in nested tags (which are matched separately) isn't collected twice.
        if tag.name == 'div':
            nodes = tag.find_all( string=True, recursive=False )
        else:
            nodes = [ tag ]

        for node in nodes:
            found_text = node.get_text( ' ', strip=True )

            if found_text != '':
                found_text = found_text.replace( '\n', ' ' )
                found_text = found_text.replace( '\r', ' ' )
                text.append( found_text )

    return text


# Filter for the allowed tags.
def allowed_tags( tag ):
    return tag.name in ( 'li', 'p', 'h1', 'h2', 'h3', 'span', 'div' )


## To be deleted.
# -------------------------------------- #

# Extract content from the main tag.
def get_main( soup ):
    return soup.main


# Get all the deepest divs from within a tag, i.e. divs with no divs inside them.
def get_deepest_divs( tag ):
    return [ div for div in tag.find_all( 'div' ) if not div.find( 'div' ) ]


def get_tag_text( tags ):
    text = ''

    for tag in tags:
        print( tag.find_all( 'li' ) )
        # text += [ p.get_text() for p in tag.find_all( 'p' ) ]

    return text
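
# A minimal usage sketch, assuming the script is run directly. The URL below is
# a placeholder for illustration, not one taken from the original script; the
# first run fetches and caches the page, and later runs read the JSON cache.
if __name__ == '__main__':
    example_url = 'https://example.com/blog/post'  # Hypothetical URL.

    try:
        for line in get_url_content( example_url ):
            print( line )
    except Exception as error:
        print( 'Failed to extract content:', error )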