Spaces:
Sleeping
Sleeping
grapplerulrich
committed on
Commit
•
1ec143e
1
Parent(s):
1f95777
Use classes to get main content
Browse files- beautiful_soup/app.py +36 -5
- main.py +2 -2
beautiful_soup/app.py
CHANGED
@@ -15,6 +15,7 @@ import requests
|
|
15 |
# Make request and get html content.
|
16 |
def get_soup( url ):
|
17 |
file_path = 'web-pages/' + uuid.uuid5( uuid.NAMESPACE_URL, url ).hex + '.html'
|
|
|
18 |
if exists( file_path ):
|
19 |
with open( file_path, 'r' ) as web_page:
|
20 |
html = web_page.read()
|
@@ -34,6 +35,35 @@ def get_soup( url ):
|
|
34 |
def get_main( soup ):
|
35 |
return soup.main
|
36 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
37 |
def get_deepest_divs( tag ):
|
38 |
# Get all the divs from within a tag.
|
39 |
return [div for div in tag.findAll('div') if not div.find('div')]
|
@@ -75,13 +105,14 @@ def extract_content( url ):
|
|
75 |
raise exception
|
76 |
|
77 |
if soup is None:
|
78 |
-
raise Exception('No content found.')
|
79 |
|
80 |
-
|
81 |
-
if
|
82 |
-
|
|
|
83 |
|
84 |
-
return get_tags_text(
|
85 |
# return ''.join([' ' + tag.get_text().strip() for tag in main.find_all( find_direct_text )])
|
86 |
|
87 |
if __name__ == '__main__':
|
|
|
15 |
# Make request and get html content.
|
16 |
def get_soup( url ):
|
17 |
file_path = 'web-pages/' + uuid.uuid5( uuid.NAMESPACE_URL, url ).hex + '.html'
|
18 |
+
print(file_path)
|
19 |
if exists( file_path ):
|
20 |
with open( file_path, 'r' ) as web_page:
|
21 |
html = web_page.read()
|
|
|
35 |
def get_main( soup ):
|
36 |
return soup.main
|
37 |
|
38 |
+
def get_main_content( soup ):
|
39 |
+
content = soup.main
|
40 |
+
|
41 |
+
if content is not None:
|
42 |
+
print('Has main tag.')
|
43 |
+
return content
|
44 |
+
|
45 |
+
content = soup.find( "div", { "class": "post-body" } )
|
46 |
+
if content is not None:
|
47 |
+
print('Has .post-body class.')
|
48 |
+
return content
|
49 |
+
|
50 |
+
content = soup.find( "div", { "class": "article-content" } )
|
51 |
+
if content is not None:
|
52 |
+
print('Has .article-content class.')
|
53 |
+
return content
|
54 |
+
|
55 |
+
content = soup.find( "div", { "class": "entry-content" } )
|
56 |
+
if content is not None:
|
57 |
+
print('Has .entry-content class.')
|
58 |
+
return content
|
59 |
+
|
60 |
+
content = soup.find( "div", { "class": "region--content" } )
|
61 |
+
if content is not None:
|
62 |
+
print('Has .region--content class.')
|
63 |
+
return content
|
64 |
+
|
65 |
+
return None
|
66 |
+
|
67 |
def get_deepest_divs( tag ):
|
68 |
# Get all the divs from within a tag.
|
69 |
return [div for div in tag.findAll('div') if not div.find('div')]
|
|
|
105 |
raise exception
|
106 |
|
107 |
if soup is None:
|
108 |
+
raise Exception('No HTML content found.')
|
109 |
|
110 |
+
content = get_main_content( soup )
|
111 |
+
if content is None :
|
112 |
+
# content = soup.body
|
113 |
+
raise Exception('No main content found.')
|
114 |
|
115 |
+
return get_tags_text( content )
|
116 |
# return ''.join([' ' + tag.get_text().strip() for tag in main.find_all( find_direct_text )])
|
117 |
|
118 |
if __name__ == '__main__':
|
main.py
CHANGED
@@ -19,13 +19,13 @@ def google_search_api_request( query ):
|
|
19 |
"customsearch",
|
20 |
"v1",
|
21 |
developerKey=api_key,
|
22 |
-
cache_discovery=False
|
23 |
-
num=5
|
24 |
)
|
25 |
|
26 |
return service.cse().list(
|
27 |
q=query,
|
28 |
cx='05048cc2df6134a06',
|
|
|
29 |
).execute()
|
30 |
|
31 |
def search_results( query ):
|
|
|
19 |
"customsearch",
|
20 |
"v1",
|
21 |
developerKey=api_key,
|
22 |
+
cache_discovery=False
|
|
|
23 |
)
|
24 |
|
25 |
return service.cse().list(
|
26 |
q=query,
|
27 |
cx='05048cc2df6134a06',
|
28 |
+
num=5,
|
29 |
).execute()
|
30 |
|
31 |
def search_results( query ):
|