Spaces:
Sleeping
Sleeping
grapplerulrich
committed on
Commit
•
1ec143e
1
Parent(s):
1f95777
Use classes to get main content
Browse files- beautiful_soup/app.py +36 -5
- main.py +2 -2
beautiful_soup/app.py
CHANGED
@@ -15,6 +15,7 @@ import requests
|
|
15 |
# Make request and get html content.
|
16 |
def get_soup( url ):
|
17 |
file_path = 'web-pages/' + uuid.uuid5( uuid.NAMESPACE_URL, url ).hex + '.html'
|
|
|
18 |
if exists( file_path ):
|
19 |
with open( file_path, 'r' ) as web_page:
|
20 |
html = web_page.read()
|
@@ -34,6 +35,35 @@ def get_soup( url ):
|
|
34 |
def get_main( soup ):
|
35 |
return soup.main
|
36 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
37 |
def get_deepest_divs( tag ):
|
38 |
# Get all the divs from within a tag.
|
39 |
return [div for div in tag.findAll('div') if not div.find('div')]
|
@@ -75,13 +105,14 @@ def extract_content( url ):
|
|
75 |
raise exception
|
76 |
|
77 |
if soup is None:
|
78 |
-
raise Exception('No content found.')
|
79 |
|
80 |
-
|
81 |
-
if
|
82 |
-
|
|
|
83 |
|
84 |
-
return get_tags_text(
|
85 |
# return ''.join([' ' + tag.get_text().strip() for tag in main.find_all( find_direct_text )])
|
86 |
|
87 |
if __name__ == '__main__':
|
|
|
15 |
# Make request and get html content.
|
16 |
def get_soup( url ):
|
17 |
file_path = 'web-pages/' + uuid.uuid5( uuid.NAMESPACE_URL, url ).hex + '.html'
|
18 |
+
print(file_path)
|
19 |
if exists( file_path ):
|
20 |
with open( file_path, 'r' ) as web_page:
|
21 |
html = web_page.read()
|
|
|
35 |
def get_main( soup ):
|
36 |
return soup.main
|
37 |
|
38 |
+
def get_main_content( soup ):
|
39 |
+
content = soup.main
|
40 |
+
|
41 |
+
if content is not None:
|
42 |
+
print('Has main tag.')
|
43 |
+
return content
|
44 |
+
|
45 |
+
content = soup.find( "div", { "class": "post-body" } )
|
46 |
+
if content is not None:
|
47 |
+
print('Has .post-body class.')
|
48 |
+
return content
|
49 |
+
|
50 |
+
content = soup.find( "div", { "class": "article-content" } )
|
51 |
+
if content is not None:
|
52 |
+
print('Has .article-content class.')
|
53 |
+
return content
|
54 |
+
|
55 |
+
content = soup.find( "div", { "class": "entry-content" } )
|
56 |
+
if content is not None:
|
57 |
+
print('Has .entry-content class.')
|
58 |
+
return content
|
59 |
+
|
60 |
+
content = soup.find( "div", { "class": "region--content" } )
|
61 |
+
if content is not None:
|
62 |
+
print('Has .region--content class.')
|
63 |
+
return content
|
64 |
+
|
65 |
+
return None
|
66 |
+
|
67 |
def get_deepest_divs( tag ):
|
68 |
# Get all the divs from within a tag.
|
69 |
return [div for div in tag.findAll('div') if not div.find('div')]
|
|
|
105 |
raise exception
|
106 |
|
107 |
if soup is None:
|
108 |
+
raise Exception('No HTML content found.')
|
109 |
|
110 |
+
content = get_main_content( soup )
|
111 |
+
if content is None :
|
112 |
+
# content = soup.body
|
113 |
+
raise Exception('No main content found.')
|
114 |
|
115 |
+
return get_tags_text( content )
|
116 |
# return ''.join([' ' + tag.get_text().strip() for tag in main.find_all( find_direct_text )])
|
117 |
|
118 |
if __name__ == '__main__':
|
main.py
CHANGED
@@ -19,13 +19,13 @@ def google_search_api_request( query ):
|
|
19 |
"customsearch",
|
20 |
"v1",
|
21 |
developerKey=api_key,
|
22 |
-
cache_discovery=False
|
23 |
-
num=5
|
24 |
)
|
25 |
|
26 |
return service.cse().list(
|
27 |
q=query,
|
28 |
cx='05048cc2df6134a06',
|
|
|
29 |
).execute()
|
30 |
|
31 |
def search_results( query ):
|
|
|
19 |
"customsearch",
|
20 |
"v1",
|
21 |
developerKey=api_key,
|
22 |
+
cache_discovery=False
|
|
|
23 |
)
|
24 |
|
25 |
return service.cse().list(
|
26 |
q=query,
|
27 |
cx='05048cc2df6134a06',
|
28 |
+
num=5,
|
29 |
).execute()
|
30 |
|
31 |
def search_results( query ):
|