import uuid
import json
from os import makedirs
from os.path import exists, dirname
from bs4 import BeautifulSoup
import requests

'''
TODO:
 - Error handling.
 - Check whether an alternative to the main tag is needed; raise a clear error when no main tag is found.
 - Menus are li tags with a tags inside them.
 - li tags containing both text and tags should be exported.
 - Find divs that contain text or p tags, and possibly other tags such as divs.
 - Export the text.
'''

# Get a list of strings from the page at the given URL, caching the result.
def get_url_content( url ):
    file_path = 'page-content/' + uuid.uuid5( uuid.NAMESPACE_URL, url ).hex + '.json'

    # Create the cache directory if it doesn't exist.
    makedirs( dirname( file_path ), exist_ok=True )

    # If a cache file exists, get the content from the cache.
    if exists( file_path ):
        with open( file_path, 'r' ) as file:
            strings = json.load( file )
    else:
        strings = extract_strings( url )

        # Write the strings to the cache.
        with open( file_path, 'w' ) as file:
            json.dump( strings, file )

    return strings
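
# A minimal usage sketch (the URL is a placeholder): the first call fetches the
# page and writes the cache, the second is served from page-content/<uuid5>.json.
#
#   strings = get_url_content( 'https://example.com/blog/post' )
#   strings = get_url_content( 'https://example.com/blog/post' )  # cache hit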

# Extract text from the page at the given URL.
def extract_strings( url ):
    # Parse the html content using BeautifulSoup.
    soup = get_soup( url )

    if soup is None:
        raise Exception('No HTML content found.')

    # Remove scripts and styles.
    for script in soup(["script", "style"]):
        script.decompose()

    # Get main content of html page.
    content = get_main_content( soup )
    if content is None:
        raise Exception('No main content found.')

    # Extract strings from main content based on allowed tags.
    strings = get_tags_text( content )
    if not strings:
        raise Exception('No text found.')
    return strings

# Make the request and get the html content.
def get_soup( url ):
    file_path = 'web-pages/' + uuid.uuid5( uuid.NAMESPACE_URL, url ).hex + '.html'
    makedirs( dirname( file_path ), exist_ok=True )
    # If a cache file exists, get the content from the cache.
    if exists( file_path ):
        with open( file_path, 'r', encoding='utf-8' ) as web_page:
            html = web_page.read()
    else:
        # Add a user agent header to make the request look like a regular browser.
        headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.74 Safari/537.36'}
        response = requests.get( url, headers=headers )
        # Raise an exception if the response is not 200.
        response.raise_for_status()
        if not response.text:
            raise Exception('HTML empty.')
        html = response.text
        # Save the html to the cache; pin the encoding so the cache round-trips
        # on platforms where the default encoding is not utf-8.
        with open( file_path, 'w', encoding='utf-8' ) as file:
            file.write( html )

    return BeautifulSoup(html, 'html.parser')

# Find the main content of the html page based on an ordered list of rules.
def get_main_content( soup ):
    # Most specific selectors first, falling back to main, article and body.
    rules = [
        ( 'div', { 'class': 'post-body' } ),
        ( 'div', { 'class': 'article-content' } ),
        ( 'div', { 'class': 'blog-post-content' } ),
        ( 'div', { 'class': 'region-content' } ),
        ( 'div', { 'class': 'entry-content' } ),
        ( 'div', { 'class': 'region--content' } ),
        ( 'div', { 'class': 'article' } ),
        ( 'div', { 'class': 'article-inner_html' } ),
        ( 'div', { 'id': 'bmdDetail-Content' } ),
        ( 'div', { 'id': 'main' } ),
        ( 'main', {} ),
        ( 'article', {} ),
        ( 'body', {} ),
    ]
    for name, attrs in rules:
        content = soup.find( name, attrs )
        if content is not None:
            return content

    return None
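
# For example, on a page with none of the recognized content divs but a <main>
# element, every div rule misses and the loop falls through to the 'main' rule;
# a page with neither returns its <body>. Illustrative snippet:
#
#   soup = BeautifulSoup( '<html><body><main><p>Hi</p></main></body></html>', 'html.parser' )
#   get_main_content( soup ).name  # -> 'main'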

# Extract text from allowed tags.
def get_tags_text( soup ):
    text = []
    # Find all tags that are allowed.
    tags = soup.find_all( allowed_tags )
    # Loop through the tags and extract their text.
    for tag in tags:
        # For div tags only take the direct text nodes, so text inside nested
        # allowed tags is not exported twice.
        if tag.name == 'div':
            found_strings = [ str( string ) for string in tag.find_all( string=True, recursive=False ) ]
        else:
            found_strings = [ tag.get_text( ' ', strip=True ) ]
        for found_text in found_strings:
            found_text = found_text.replace( '\n', ' ' ).replace( '\r', ' ' ).strip()
            if found_text != '':
                text.append( found_text )
    return text

# Filter for the tags whose text should be exported.
def allowed_tags( tag ):
    return tag.name in ( 'li', 'p', 'h1', 'h2', 'h3', 'span', 'div' )
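
# A minimal end-to-end sketch on an inline snippet (made up for illustration),
# useful for checking get_main_content and get_tags_text without a request:
#
#   html = '<main><h1>Title</h1><p>First paragraph.</p>' \
#          '<div>Loose text<ul><li>Item one</li></ul></div></main>'
#   soup = BeautifulSoup( html, 'html.parser' )
#   get_tags_text( get_main_content( soup ) )
#   # -> ['Title', 'First paragraph.', 'Loose text', 'Item one']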

## To be deleted.
# -------------------------------------- #

# Extract content from main tag.
def get_main( soup ):
    return soup.main

def get_deepest_divs( tag ):
    # Get all the divs within a tag that contain no nested divs.
    return [ div for div in tag.find_all('div') if not div.find('div') ]

def get_tag_text( tags ):
    text = ''
    for tag in tags:
        print( tag.find_all('li') )
        # text += ' '.join( p.get_text() for p in tag.find_all('p') )
    return text
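
# A minimal sketch of running the module directly. The URL is a placeholder;
# swap in a real page. Fetched pages are cached under web-pages/ and the
# extracted strings under page-content/.
if __name__ == '__main__':
    test_url = 'https://example.com/'
    try:
        for line in get_url_content( test_url ):
            print( line )
    except Exception as error:
        print( 'Extraction failed:', error )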