Spaces:

grapplerulrich
/

raccoon

Sleeping

App Files Files

grapplerulrich committed on May 8, 2022

Commit

d834e4c

•

1 Parent(s): 14029bd

Strip scripts and styles

Browse files

Improve getting inline text from div

Files changed (1) hide show

beautiful_soup/beautiful_soup.py +25 -9

beautiful_soup/beautiful_soup.py CHANGED Viewed

@@ -38,6 +38,10 @@ def extract_content( url ):
     if soup is None:
         raise Exception('No HTML content found.')
     content = get_main_content( soup )
     if content is None :
         raise Exception('No main content found.')
@@ -73,12 +77,22 @@ def get_main_content( soup ):
     if content is not None:
         print('Has .post-body class.')
         return content
     content = soup.find( "div", { "class": "article-content" } )
     if content is not None:
         print('Has .article-content class.')
         return content
     content = soup.find( "div", { "class": "entry-content" } )
     if content is not None:
         print('Has .entry-content class.')
@@ -123,15 +137,20 @@ def get_main_content( soup ):
 def get_tags_text( soup ):
     text = ''
-    tags = soup.find_all( find_direct_text )
     for tag in tags:
-        if tag.name == 'div' and tag.find( text=True, recursive=False ) :
             for div in tag.find_all(text=True, recursive=False):
-                text += div.get_text().strip() + ' '
         else :
-            text += tag.get_text().strip() + ' '
     return text
 # -------------------------------------- #
 # Extract content from main tag.
@@ -155,11 +174,8 @@ def get_list_text( tags ):
         list_items = tag.find_all(find_direct_text)
     return list_items
-def find_direct_text( tag ):
-    return tag.name == 'li' or tag.name == 'p' or tag.name == 'h2' or tag.name == 'h3' or tag.name == 'span' or find_div_text( tag )
 def find_div_text( tag ):
-    return tag.name == 'div' or tag.find( text=True, recursive=False ) and tag.find( text=True, recursive=False ).strip()
 if __name__ == '__main__':
   url = 'https://www.cms.gov/Medicare/Billing/ElectronicBillingEDITrans'

     if soup is None:
         raise Exception('No HTML content found.')
+    # Remove scripts and styles.
+    for script in soup(["script", "style"]):
+        script.decompose()
     content = get_main_content( soup )
     if content is None :
         raise Exception('No main content found.')
     if content is not None:
         print('Has .post-body class.')
         return content
     content = soup.find( "div", { "class": "article-content" } )
     if content is not None:
         print('Has .article-content class.')
         return content
+    content = soup.find( "div", { "class": "blog-post-content" } )
+    if content is not None:
+        print('Has .blog-post-content class.')
+        return content
+    content = soup.find( "div", { "class": "region-content" } )
+    if content is not None:
+        print('Has .region-content class.')
+        return content
     content = soup.find( "div", { "class": "entry-content" } )
     if content is not None:
         print('Has .entry-content class.')
 def get_tags_text( soup ):
     text = ''
+    tags = soup.find_all( allowed_tags )
     for tag in tags:
+        if tag.name == 'div' :
             for div in tag.find_all(text=True, recursive=False):
+                found_text = div.get_text( ' ', strip=True )
+                if found_text != '':
+                    text += found_text
         else :
+            text += tag.get_text( ' ', strip=True ) + ' '
     return text
+def allowed_tags( tag ):
+    return tag.name == 'li' or tag.name == 'p' or tag.name == 'h1' or tag.name == 'h2' or tag.name == 'h3' or tag.name == 'span' or tag.name == 'div'
 # -------------------------------------- #
 # Extract content from main tag.
         list_items = tag.find_all(find_direct_text)
     return list_items
 def find_div_text( tag ):
+    return tag.name == 'div' and tag.find( text=True, recursive=False ) and tag.find( text=True, recursive=False ).strip()
 if __name__ == '__main__':
   url = 'https://www.cms.gov/Medicare/Billing/ElectronicBillingEDITrans'