Spaces:
Sleeping
Sleeping
grapplerulrich
committed on
Commit
•
d834e4c
1
Parent(s):
14029bd
Strip scripts and styles
Browse files
Improve getting inline text from div
beautiful_soup/beautiful_soup.py
CHANGED
@@ -38,6 +38,10 @@ def extract_content( url ):
|
|
38 |
if soup is None:
|
39 |
raise Exception('No HTML content found.')
|
40 |
|
|
|
|
|
|
|
|
|
41 |
content = get_main_content( soup )
|
42 |
if content is None :
|
43 |
raise Exception('No main content found.')
|
@@ -73,12 +77,22 @@ def get_main_content( soup ):
|
|
73 |
if content is not None:
|
74 |
print('Has .post-body class.')
|
75 |
return content
|
76 |
-
|
77 |
content = soup.find( "div", { "class": "article-content" } )
|
78 |
if content is not None:
|
79 |
print('Has .article-content class.')
|
80 |
return content
|
81 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
82 |
content = soup.find( "div", { "class": "entry-content" } )
|
83 |
if content is not None:
|
84 |
print('Has .entry-content class.')
|
@@ -123,15 +137,20 @@ def get_main_content( soup ):
|
|
123 |
|
124 |
def get_tags_text( soup ):
|
125 |
text = ''
|
126 |
-
tags = soup.find_all(
|
127 |
for tag in tags:
|
128 |
-
if tag.name == 'div'
|
129 |
for div in tag.find_all(text=True, recursive=False):
|
130 |
-
|
|
|
|
|
131 |
else :
|
132 |
-
text += tag.get_text(
|
133 |
return text
|
134 |
|
|
|
|
|
|
|
135 |
# -------------------------------------- #
|
136 |
|
137 |
# Extract content from main tag.
|
@@ -155,11 +174,8 @@ def get_list_text( tags ):
|
|
155 |
list_items = tag.find_all(find_direct_text)
|
156 |
return list_items
|
157 |
|
158 |
-
def find_direct_text( tag ):
|
159 |
-
return tag.name == 'li' or tag.name == 'p' or tag.name == 'h2' or tag.name == 'h3' or tag.name == 'span' or find_div_text( tag )
|
160 |
-
|
161 |
def find_div_text( tag ):
|
162 |
-
return tag.name == 'div'
|
163 |
|
164 |
if __name__ == '__main__':
|
165 |
url = 'https://www.cms.gov/Medicare/Billing/ElectronicBillingEDITrans'
|
|
|
38 |
if soup is None:
|
39 |
raise Exception('No HTML content found.')
|
40 |
|
41 |
+
# Remove scripts and styles.
|
42 |
+
for script in soup(["script", "style"]):
|
43 |
+
script.decompose()
|
44 |
+
|
45 |
content = get_main_content( soup )
|
46 |
if content is None :
|
47 |
raise Exception('No main content found.')
|
|
|
77 |
if content is not None:
|
78 |
print('Has .post-body class.')
|
79 |
return content
|
80 |
+
|
81 |
content = soup.find( "div", { "class": "article-content" } )
|
82 |
if content is not None:
|
83 |
print('Has .article-content class.')
|
84 |
return content
|
85 |
|
86 |
+
content = soup.find( "div", { "class": "blog-post-content" } )
|
87 |
+
if content is not None:
|
88 |
+
print('Has .blog-post-content class.')
|
89 |
+
return content
|
90 |
+
|
91 |
+
content = soup.find( "div", { "class": "region-content" } )
|
92 |
+
if content is not None:
|
93 |
+
print('Has .region-content class.')
|
94 |
+
return content
|
95 |
+
|
96 |
content = soup.find( "div", { "class": "entry-content" } )
|
97 |
if content is not None:
|
98 |
print('Has .entry-content class.')
|
|
|
137 |
|
138 |
def get_tags_text( soup ):
    """Return the text of every allowed tag under *soup* as one string.

    Tags are selected with the ``allowed_tags`` predicate. For a ``div``
    only its direct text nodes are used; every other allowed tag
    contributes its full ``get_text`` output. Each fragment is followed by
    a single space so neighbouring fragments never run together.

    NOTE(review): nested allowed tags (e.g. a ``p`` inside an ``li``) are
    each matched by ``find_all``, so their text appears to be emitted once
    per matching ancestor - confirm whether that duplication is intended.
    """
    pieces = []
    for tag in soup.find_all( allowed_tags ):
        if tag.name == 'div':
            # Only text sitting directly inside the div; text of child tags
            # is picked up when those tags are matched themselves.
            for node in tag.find_all( text=True, recursive=False ):
                found_text = node.get_text( ' ', strip=True )
                if found_text:
                    pieces.append( found_text )
        else:
            pieces.append( tag.get_text( ' ', strip=True ) )
    # A trailing space after every fragment (div text included) keeps words
    # from adjacent fragments separated.
    return ''.join( piece + ' ' for piece in pieces )
|
150 |
|
151 |
+
# Tag names whose text is collected when extracting page content.
_ALLOWED_TAG_NAMES = frozenset( { 'li', 'p', 'h1', 'h2', 'h3', 'span', 'div' } )


def allowed_tags( tag ):
    """Return True when *tag* is one of the tag types text is extracted from.

    Intended as a predicate for ``find_all``; only ``tag.name`` is read.
    """
    return tag.name in _ALLOWED_TAG_NAMES
|
153 |
+
|
154 |
# -------------------------------------- #
|
155 |
|
156 |
# Extract content from main tag.
|
|
|
174 |
list_items = tag.find_all(find_direct_text)
|
175 |
return list_items
|
176 |
|
|
|
|
|
|
|
177 |
def find_div_text( tag ):
    """Return True when *tag* is a ``div`` that directly contains text.

    Every direct (non-recursive) text node is checked, so a div whose first
    direct string is only whitespace still matches when a later direct
    string holds real text. Always returns a bool, which makes it usable as
    a ``find_all`` predicate.
    """
    if tag.name != 'div':
        return False
    # Look at all direct text nodes once instead of querying the first one
    # twice; whitespace-only strings do not count as text.
    return any( node.strip() for node in tag.find_all( text=True, recursive=False ) )
|
179 |
|
180 |
if __name__ == '__main__':
|
181 |
url = 'https://www.cms.gov/Medicare/Billing/ElectronicBillingEDITrans'
|