grapplerulrich committed on
Commit
1ec143e
1 Parent(s): 1f95777

Use classes to get main content

Browse files
Files changed (2) hide show
  1. beautiful_soup/app.py +36 -5
  2. main.py +2 -2
beautiful_soup/app.py CHANGED
@@ -15,6 +15,7 @@ import requests
15
  # Make request and get html content.
16
  def get_soup( url ):
17
  file_path = 'web-pages/' + uuid.uuid5( uuid.NAMESPACE_URL, url ).hex + '.html'
 
18
  if exists( file_path ):
19
  with open( file_path, 'r' ) as web_page:
20
  html = web_page.read()
@@ -34,6 +35,35 @@ def get_soup( url ):
34
  def get_main( soup ):
35
  return soup.main
36
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
  def get_deepest_divs( tag ):
38
  # Get all the divs from within a tag.
39
  return [div for div in tag.findAll('div') if not div.find('div')]
@@ -75,13 +105,14 @@ def extract_content( url ):
75
  raise exception
76
 
77
  if soup is None:
78
- raise Exception('No content found.')
79
 
80
- main = get_main( soup )
81
- if main is None :
82
- raise Exception('No main tag found.')
 
83
 
84
- return get_tags_text( main )
85
  # return ''.join([' ' + tag.get_text().strip() for tag in main.find_all( find_direct_text )])
86
 
87
  if __name__ == '__main__':
 
15
  # Make request and get html content.
16
  def get_soup( url ):
17
  file_path = 'web-pages/' + uuid.uuid5( uuid.NAMESPACE_URL, url ).hex + '.html'
18
+ print(file_path)
19
  if exists( file_path ):
20
  with open( file_path, 'r' ) as web_page:
21
  html = web_page.read()
 
35
  def get_main( soup ):
36
  return soup.main
37
 
38
def get_main_content( soup, class_names=( 'post-body', 'article-content', 'entry-content', 'region--content' ) ):
    """Return the element that holds the page's main content.

    The <main> element is preferred. When the document has none, the first
    <div> carrying one of ``class_names`` is used instead. The names are
    tried in the order given, so earlier entries take priority.

    Args:
        soup: Parsed document exposing a ``main`` attribute and a
            ``find( name, attrs )`` method (presumably the BeautifulSoup
            object built by get_soup - confirm against the caller).
        class_names: Class names of candidate content wrappers, in priority
            order. Defaults to the wrappers this project already recognises.

    Returns:
        The first matching element, or None when the document has neither a
        <main> element nor a <div> with any of the candidate classes.
    """
    content = soup.main
    if content is not None:
        print('Has main tag.')
        return content

    # One loop replaces the per-class copy-pasted lookups; supporting a new
    # wrapper class only needs a new entry in class_names.
    for class_name in class_names:
        content = soup.find( 'div', { 'class': class_name } )
        if content is not None:
            print('Has .' + class_name + ' class.')
            return content

    return None
66
+
67
  def get_deepest_divs( tag ):
68
  # Get all the divs from within a tag.
69
  return [div for div in tag.findAll('div') if not div.find('div')]
 
105
  raise exception
106
 
107
  if soup is None:
108
+ raise Exception('No HTML content found.')
109
 
110
+ content = get_main_content( soup )
111
+ if content is None :
112
+ # content = soup.body
113
+ raise Exception('No main content found.')
114
 
115
+ return get_tags_text( content )
116
  # return ''.join([' ' + tag.get_text().strip() for tag in main.find_all( find_direct_text )])
117
 
118
  if __name__ == '__main__':
main.py CHANGED
@@ -19,13 +19,13 @@ def google_search_api_request( query ):
19
  "customsearch",
20
  "v1",
21
  developerKey=api_key,
22
- cache_discovery=False,
23
- num=5
24
  )
25
 
26
  return service.cse().list(
27
  q=query,
28
  cx='05048cc2df6134a06',
 
29
  ).execute()
30
 
31
  def search_results( query ):
 
19
  "customsearch",
20
  "v1",
21
  developerKey=api_key,
22
+ cache_discovery=False
 
23
  )
24
 
25
  return service.cse().list(
26
  q=query,
27
  cx='05048cc2df6134a06',
28
+ num=5,
29
  ).execute()
30
 
31
  def search_results( query ):