grapplerulrich committed
Commit 526644d
1 Parent(s): 37ee6a5

Cache page content

Files changed (3):
  1. .gitignore +1 -0
  2. beautiful_soup/app.py +15 -0
  3. main.py +2 -2
.gitignore CHANGED
@@ -3,3 +3,4 @@
 __pycache__
 /search-results
 /web-pages
+/page-content
beautiful_soup/app.py CHANGED
@@ -130,6 +130,21 @@ def extract_content( url ):
     return get_tags_text( content )
     # return ''.join([' ' + tag.get_text().strip() for tag in main.find_all( find_direct_text )])
 
+def get_url_content( url ):
+    file_path = 'page-content/' + uuid.uuid5( uuid.NAMESPACE_URL, url ).hex + '.txt'
+    if exists( file_path ):
+        with open( file_path, 'r' ) as file_content:
+            content = file_content.read()
+    else:
+        try:
+            content = extract_content( url )
+        except Exception as exception:
+            raise exception
+        with open( file_path, 'w' ) as file:
+            file.write( content )
+
+    return content
+
 if __name__ == '__main__':
     url = 'https://www.cms.gov/Medicare/Billing/ElectronicBillingEDITrans'
     print(extract_content(url))
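
The new helper caches extracted page text on disk, keyed by a deterministic UUID5 of the URL. The hunk starts at line 130, so the names it relies on (uuid, and exists, presumably from os.path) are imported further up in beautiful_soup/app.py, outside this diff. Below is a minimal self-contained sketch of the same pattern under those assumptions; the extract_content stub is a placeholder for the real scraper, the cache directory is created explicitly so the sketch runs on a clean checkout, and the try/except that only re-raises is dropped since it changes nothing:

    import os
    import uuid
    from os.path import exists

    def extract_content( url ):
        # Placeholder for the real BeautifulSoup-based extractor in this repo.
        return 'page text for ' + url

    def get_url_content( url ):
        # uuid5 is deterministic, so the same URL always maps to the same file.
        file_path = 'page-content/' + uuid.uuid5( uuid.NAMESPACE_URL, url ).hex + '.txt'
        if exists( file_path ):
            # Cache hit: reuse the text extracted on an earlier run.
            with open( file_path, 'r' ) as file_content:
                content = file_content.read()
        else:
            # Cache miss: extract once, then persist for future runs.
            os.makedirs( 'page-content', exist_ok=True )
            content = extract_content( url )
            with open( file_path, 'w' ) as file:
                file.write( content )
        return content

A useful side effect of writing only after extract_content succeeds: a URL that fails is retried on the next run instead of being cached as an error.
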
main.py CHANGED
@@ -8,7 +8,7 @@ from dotenv import load_dotenv
 from googleapiclient.discovery import build
 from slugify import slugify
 
-from beautiful_soup.app import extract_content
+from beautiful_soup.app import get_url_content
 
 @cache
 def google_search_api_request( query ):
@@ -60,7 +60,7 @@ def main():
     for result in results:
         st.write(result['link'])
         try:
-            st.write( get_url_content( result['link'] ) )
+            st.write( get_url_content( result['link'] ) )
         except Exception as exception:
             st.exception(exception)
 
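
The main.py change swaps the direct extract_content call for the cached helper. That matters because Streamlit re-executes the whole script on every interaction, and uuid.uuid5( uuid.NAMESPACE_URL, url ) is stable across runs and processes (UUID version 5 is a SHA-1 hash of namespace plus name), so a link extracted once keeps resolving to the same cache file. A quick check of that stability:

    import uuid

    url = 'https://www.cms.gov/Medicare/Billing/ElectronicBillingEDITrans'
    key_one = uuid.uuid5( uuid.NAMESPACE_URL, url ).hex
    key_two = uuid.uuid5( uuid.NAMESPACE_URL, url ).hex
    print( key_one == key_two )                   # True: same URL, same key
    print( 'page-content/' + key_one + '.txt' )   # the file get_url_content reads/writes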