grapplerulrich committed on
Commit
d834e4c
1 Parent(s): 14029bd

Strip scripts and styles

Browse files

Improve getting inline text from div

Files changed (1) hide show
  1. beautiful_soup/beautiful_soup.py +25 -9
beautiful_soup/beautiful_soup.py CHANGED
@@ -38,6 +38,10 @@ def extract_content( url ):
38
  if soup is None:
39
  raise Exception('No HTML content found.')
40
 
 
 
 
 
41
  content = get_main_content( soup )
42
  if content is None :
43
  raise Exception('No main content found.')
@@ -73,12 +77,22 @@ def get_main_content( soup ):
73
  if content is not None:
74
  print('Has .post-body class.')
75
  return content
76
-
77
  content = soup.find( "div", { "class": "article-content" } )
78
  if content is not None:
79
  print('Has .article-content class.')
80
  return content
81
 
 
 
 
 
 
 
 
 
 
 
82
  content = soup.find( "div", { "class": "entry-content" } )
83
  if content is not None:
84
  print('Has .entry-content class.')
@@ -123,15 +137,20 @@ def get_main_content( soup ):
123
 
124
  def get_tags_text( soup ):
125
  text = ''
126
- tags = soup.find_all( find_direct_text )
127
  for tag in tags:
128
- if tag.name == 'div' and tag.find( text=True, recursive=False ) :
129
  for div in tag.find_all(text=True, recursive=False):
130
- text += div.get_text().strip() + ' '
 
 
131
  else :
132
- text += tag.get_text().strip() + ' '
133
  return text
134
 
 
 
 
135
  # -------------------------------------- #
136
 
137
  # Extract content from main tag.
@@ -155,11 +174,8 @@ def get_list_text( tags ):
155
  list_items = tag.find_all(find_direct_text)
156
  return list_items
157
 
158
- def find_direct_text( tag ):
159
- return tag.name == 'li' or tag.name == 'p' or tag.name == 'h2' or tag.name == 'h3' or tag.name == 'span' or find_div_text( tag )
160
-
161
  def find_div_text( tag ):
162
- return tag.name == 'div' or tag.find( text=True, recursive=False ) and tag.find( text=True, recursive=False ).strip()
163
 
164
  if __name__ == '__main__':
165
  url = 'https://www.cms.gov/Medicare/Billing/ElectronicBillingEDITrans'
 
38
  if soup is None:
39
  raise Exception('No HTML content found.')
40
 
41
+ # Remove scripts and styles.
42
+ for script in soup(["script", "style"]):
43
+ script.decompose()
44
+
45
  content = get_main_content( soup )
46
  if content is None :
47
  raise Exception('No main content found.')
 
77
  if content is not None:
78
  print('Has .post-body class.')
79
  return content
80
+
81
  content = soup.find( "div", { "class": "article-content" } )
82
  if content is not None:
83
  print('Has .article-content class.')
84
  return content
85
 
86
+ content = soup.find( "div", { "class": "blog-post-content" } )
87
+ if content is not None:
88
+ print('Has .blog-post-content class.')
89
+ return content
90
+
91
+ content = soup.find( "div", { "class": "region-content" } )
92
+ if content is not None:
93
+ print('Has .region-content class.')
94
+ return content
95
+
96
  content = soup.find( "div", { "class": "entry-content" } )
97
  if content is not None:
98
  print('Has .entry-content class.')
 
137
 
138
  def get_tags_text( soup ):
139
  text = ''
140
+ tags = soup.find_all( allowed_tags )
141
  for tag in tags:
142
+ if tag.name == 'div' :
143
  for div in tag.find_all(text=True, recursive=False):
144
+ found_text = div.get_text( ' ', strip=True )
145
+ if found_text != '':
146
+ text += found_text
147
  else :
148
+ text += tag.get_text( ' ', strip=True ) + ' '
149
  return text
150
 
151
+ def allowed_tags( tag ):
152
+ return tag.name == 'li' or tag.name == 'p' or tag.name == 'h1' or tag.name == 'h2' or tag.name == 'h3' or tag.name == 'span' or tag.name == 'div'
153
+
154
  # -------------------------------------- #
155
 
156
  # Extract content from main tag.
 
174
  list_items = tag.find_all(find_direct_text)
175
  return list_items
176
 
 
 
 
177
  def find_div_text( tag ):
178
+ return tag.name == 'div' and tag.find( text=True, recursive=False ) and tag.find( text=True, recursive=False ).strip()
179
 
180
  if __name__ == '__main__':
181
  url = 'https://www.cms.gov/Medicare/Billing/ElectronicBillingEDITrans'