grapplerulrich committed on
Commit
2f05319
1 Parent(s): ad98547

Fix strings caching

Browse files

Add the body tag as the last fallback when looking for the main content.

Files changed (1) hide show
  1. beautiful_soup/beautiful_soup.py +10 -5
beautiful_soup/beautiful_soup.py CHANGED
@@ -15,17 +15,17 @@ import requests
15
  '''
16
 
17
  def get_url_content( url ):
18
- file = 'page-content/' + uuid.uuid5( uuid.NAMESPACE_URL, url ).hex + '.json'
19
- makedirs(dirname(file), exist_ok=True)
20
- if exists( file ):
21
- with open( file, 'r' ) as file_content:
22
  strings = json.load( file )
23
  else:
24
  try:
25
  strings = extract_strings( url )
26
  except Exception as exception:
27
  raise exception
28
- with open( file, 'w' ) as file:
29
  json.dump( strings, file )
30
 
31
  return strings
@@ -133,6 +133,11 @@ def get_main_content( soup ):
133
  print('Has article tag.')
134
  return content
135
 
 
 
 
 
 
136
  return None
137
 
138
  def get_tags_text( soup ):
 
15
  '''
16
 
17
  def get_url_content( url ):
18
+ file_path = 'page-content/' + uuid.uuid5( uuid.NAMESPACE_URL, url ).hex + '.json'
19
+ makedirs(dirname(file_path), exist_ok=True)
20
+ if exists( file_path ):
21
+ with open( file_path, 'r' ) as file:
22
  strings = json.load( file )
23
  else:
24
  try:
25
  strings = extract_strings( url )
26
  except Exception as exception:
27
  raise exception
28
+ with open( file_path, 'w' ) as file:
29
  json.dump( strings, file )
30
 
31
  return strings
 
133
  print('Has article tag.')
134
  return content
135
 
136
+ content = soup.find( "body" )
137
+ if content is not None:
138
+ print('Has body tag.')
139
+ return content
140
+
141
  return None
142
 
143
  def get_tags_text( soup ):