File size: 4,990 Bytes
151c2dd
 
1f95777
 
8b32433
 
 
 
 
 
 
 
 
 
561abab
 
 
 
 
 
 
 
 
 
 
ea2fb58
561abab
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ea2fb58
 
 
 
 
561abab
8b32433
 
151c2dd
1ec143e
1f95777
151c2dd
 
 
1f95777
 
 
 
 
 
 
151c2dd
 
 
8b32433
1ec143e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37ee6a5
 
 
 
 
 
 
 
 
 
 
561abab
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37ee6a5
 
 
 
1ec143e
 
 
561abab
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
151c2dd
 
 
 
 
 
 
 
 
 
8b32433
151c2dd
 
 
 
 
8b32433
151c2dd
1f95777
 
 
 
 
151c2dd
8b32433
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
import uuid
from os.path import exists
from bs4 import BeautifulSoup
import requests

'''
 - Error handling
 - Look if alternative to main tag is needed. Provide error message if main tag is not found.
 - Menus are li tags with a tags within.
 - li tags with text and tags should be exported
 - Find divs that have text or p tags maybe other tags like divs
 - Export the text
'''

def get_url_content( url ):
    """Return the extracted text for *url*, using a local file cache.

    Text is cached under page-content/<uuid5-of-url>.txt; on a cache miss
    the text is extracted from the live page and written back (stripped).

    Raises whatever extract_content() raises on a cache miss (e.g. the
    generic 'No main content found.' Exceptions or requests HTTP errors).
    """
    file_path = 'page-content/' + uuid.uuid5( uuid.NAMESPACE_URL, url ).hex + '.txt'
    if exists( file_path ):
        with open( file_path, 'r' ) as file_content:
            content = file_content.read()
    else:
        # Original wrapped this in `try/except ... raise` which re-raised
        # unchanged — let the exception propagate naturally instead.
        content = extract_content( url )
        with open( file_path, 'w' ) as file:
            file.write( content.strip() )

    return content

def extract_content( url ):
    """Fetch *url*, locate its main content region, and return its text.

    Raises:
        Exception: 'No HTML content found.' / 'No main content found.' /
            'No text found.' when a stage yields nothing, plus anything
            get_soup() raises (e.g. requests HTTP errors).
    """
    # Original wrapped this call in `try/except ... raise` which re-raised
    # unchanged — let get_soup()'s exceptions propagate naturally.
    soup = get_soup( url )

    if soup is None:
        raise Exception('No HTML content found.')

    content = get_main_content( soup )
    if content is None :
        raise Exception('No main content found.')

    text = get_tags_text( content )
    if text is None :
        raise Exception('No text found.')

    return text

# Make request and get html content.
def get_soup( url ):
    """Return a BeautifulSoup parse of *url*, caching the raw HTML locally.

    HTML is cached under web-pages/<uuid5-of-url>.html; on a cache miss the
    page is fetched and the body written back for next time.

    Raises:
        requests.HTTPError: on a non-2xx response (raise_for_status).
        Exception: 'HTML empty.' when the response body is empty.
    """
    file_path = 'web-pages/' + uuid.uuid5( uuid.NAMESPACE_URL, url ).hex + '.html'
    if exists( file_path ):
        # utf-8 pinned so cached pages round-trip identically on any platform.
        with open( file_path, 'r', encoding='utf-8' ) as web_page:
            html = web_page.read()
    else:
        # Some sites block the default requests UA; present a desktop browser.
        headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.74 Safari/537.36'}
        response = requests.get( url, headers=headers )
        response.raise_for_status()
        if not response.text:
            raise Exception('HTML empty.')
        html = response.text
        with open( file_path, 'w', encoding='utf-8' ) as file:
            file.write( html )

    return BeautifulSoup(html, 'html.parser')

def get_main_content( soup ):
    """Return the tag holding the page's main content, or None.

    Tries known site-specific <div> containers first (most specific
    wins), then falls back to the generic <main> and <article> tags.
    Prints which selector matched, as the original per-selector code did.
    """
    # (attrs, message) pairs, tried in order. The original repeated the
    # find/print/return stanza per selector and had a copy-paste bug: the
    # {"id": "main"} hit printed the bmdDetail-Content message. Fixed here.
    div_selectors = (
        ( { "class": "post-body" },          'Has .post-body class.' ),
        ( { "class": "article-content" },    'Has .article-content class.' ),
        ( { "class": "entry-content" },      'Has .entry-content class.' ),
        ( { "class": "region--content" },    'Has .region--content class.' ),
        ( { "class": "article" },            'Has .article class.' ),
        ( { "class": "article-inner_html" }, 'Has .article-inner_html class.' ),
        ( { "id": "bmdDetail-Content" },     'Has .bmdDetail-Content id.' ),
        ( { "id": "main" },                  'Has #main id.' ),
    )
    for attrs, message in div_selectors:
        content = soup.find( "div", attrs )
        if content is not None:
            print( message )
            return content

    content = soup.main
    if content is not None:
        print('Has main tag.')
        return content

    content = soup.find( "article" )
    if content is not None:
        print('Has article tag.')
        return content

    return None

def get_tags_text( soup ):
    """Concatenate the text of every tag matched by find_direct_text.

    A <div> contributes only its direct text nodes (its child tags are
    matched separately); any other matched tag contributes its full text.
    Each piece is stripped and followed by a single space.
    """
    pieces = []
    for tag in soup.find_all( find_direct_text ):
        direct_text = tag.name == 'div' and tag.find( text=True, recursive=False )
        if direct_text:
            # Only the div's own text nodes, not nested tags' text.
            pieces.extend( node.get_text().strip() for node in tag.find_all( text=True, recursive=False ) )
        else:
            pieces.append( tag.get_text().strip() )
    return ''.join( piece + ' ' for piece in pieces )

# -------------------------------------- #

def get_main( soup ):
    """Return the document's <main> element (None when absent)."""
    return getattr( soup, 'main' )

def get_deepest_divs( tag ):
    """Return the <div>s under *tag* that contain no nested <div>."""
    # find_all is the modern spelling of the legacy findAll alias.
    return [div for div in tag.find_all('div') if not div.find('div')]

def get_tag_text( tags ):
    """Debug helper: print each tag's <li> elements; always returns ''.

    NOTE(review): looks like abandoned work-in-progress — text is never
    accumulated, and the commented-out line below is syntactically broken
    (unterminated quote in 'p)'). Confirm intent before reviving.
    """
    text = ''
    for tag in tags:
        print(tag.find_all('li'))
        # text += [p.get_text() for p in tag.find_all('p)]
    return text

def get_list_text( tags ):
    """Return the direct-text tags found under every tag in *tags*.

    Bug fixed: the original rebound list_items on every loop iteration,
    so only the LAST tag's matches were returned; matches are now
    accumulated across all tags.
    """
    list_items = []
    for tag in tags:
        list_items.extend( tag.find_all( find_direct_text ) )
    return list_items

def find_direct_text( tag ):
    """find_all filter: match the common text-bearing tags.

    True for li/p/h2/h3/span; otherwise defers to find_div_text().
    """
    if tag.name in ( 'li', 'p', 'h2', 'h3', 'span' ):
        return True
    return find_div_text( tag )

def find_div_text( tag ):
    # NOTE(review): Python precedence parses this as
    #   (tag.name == 'div') or (tag.find(...) and tag.find(...).strip())
    # so EVERY div matches — even one with no direct text — and any
    # non-div tag with non-blank direct text also matches. If "divs that
    # have direct text" was intended (see module TODO), 'or' should be
    # 'and'. Left as-is because get_tags_text relies on the current
    # matches — confirm intent before changing.
    return tag.name == 'div' or tag.find( text=True, recursive=False ) and tag.find( text=True, recursive=False ).strip()

if __name__ == '__main__':
    # Manual smoke test: extract and print the content of one sample page.
    sample_url = 'https://www.cms.gov/Medicare/Billing/ElectronicBillingEDITrans'
    print(extract_content(sample_url))