grapplerulrich committed
Commit 526644d
1 Parent(s): 37ee6a5

Cache page content

Files changed (3):
  1. .gitignore +1 -0
  2. beautiful_soup/app.py +15 -0
  3. main.py +2 -2
.gitignore CHANGED
@@ -3,3 +3,4 @@
 __pycache__
 /search-results
 /web-pages
+/page-content
beautiful_soup/app.py CHANGED
@@ -130,6 +130,21 @@ def extract_content( url ):
     return get_tags_text( content )
     # return ''.join([' ' + tag.get_text().strip() for tag in main.find_all( find_direct_text )])
 
+def get_url_content( url ):
+    file_path = 'page-content/' + uuid.uuid5( uuid.NAMESPACE_URL, url ).hex + '.txt'
+    if exists( file_path ):
+        with open( file_path, 'r' ) as file_content:
+            content = file_content.read()
+    else:
+        try:
+            content = extract_content( url )
+        except Exception as exception:
+            raise exception
+        with open( file_path, 'w' ) as file:
+            file.write( content )
+
+    return content
+
 if __name__ == '__main__':
     url = 'https://www.cms.gov/Medicare/Billing/ElectronicBillingEDITrans'
     print(extract_content(url))
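
The new helper caches extracted page text on disk, keyed by a deterministic UUID5 of the URL. The hunk starts at line 130, so the names it relies on (uuid, and exists, presumably from os.path) are imported further up in beautiful_soup/app.py, outside this diff. Below is a minimal self-contained sketch of the same pattern under those assumptions; the extract_content stub is a placeholder for the real scraper, the cache directory is created explicitly so the sketch runs on a clean checkout, and the try/except that only re-raises is dropped since it changes nothing:

    import os
    import uuid
    from os.path import exists

    def extract_content( url ):
        # Placeholder for the real BeautifulSoup-based extractor in this repo.
        return 'page text for ' + url

    def get_url_content( url ):
        # uuid5 is deterministic, so the same URL always maps to the same file.
        file_path = 'page-content/' + uuid.uuid5( uuid.NAMESPACE_URL, url ).hex + '.txt'
        if exists( file_path ):
            # Cache hit: reuse the text extracted on an earlier run.
            with open( file_path, 'r' ) as file_content:
                content = file_content.read()
        else:
            # Cache miss: extract once, then persist for future runs.
            os.makedirs( 'page-content', exist_ok=True )
            content = extract_content( url )
            with open( file_path, 'w' ) as file:
                file.write( content )
        return content

A useful side effect of writing only after extract_content succeeds: a URL that fails is retried on the next run instead of being cached as an error.
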
main.py CHANGED
@@ -8,7 +8,7 @@ from dotenv import load_dotenv
 from googleapiclient.discovery import build
 from slugify import slugify
 
-from beautiful_soup.app import extract_content
+from beautiful_soup.app import get_url_content
 
 @cache
 def google_search_api_request( query ):
@@ -60,7 +60,7 @@ def main():
     for result in results:
         st.write(result['link'])
         try:
-            st.write( get_url_content( result['link'] ) )
+            st.write( get_url_content( result['link'] ) )
         except Exception as exception:
             st.exception(exception)
 
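
The main.py change swaps the direct extract_content call for the cached helper. That matters because Streamlit re-executes the whole script on every interaction, and uuid.uuid5( uuid.NAMESPACE_URL, url ) is stable across runs and processes (UUID version 5 is a SHA-1 hash of namespace plus name), so a link extracted once keeps resolving to the same cache file. A quick check of that stability:

    import uuid

    url = 'https://www.cms.gov/Medicare/Billing/ElectronicBillingEDITrans'
    key_one = uuid.uuid5( uuid.NAMESPACE_URL, url ).hex
    key_two = uuid.uuid5( uuid.NAMESPACE_URL, url ).hex
    print( key_one == key_two )                   # True: same URL, same key
    print( 'page-content/' + key_one + '.txt' )   # the file get_url_content reads/writes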