Spaces:
Sleeping
Sleeping
import unittest | |
from bs4 import BeautifulSoup | |
import beautiful_soup | |
class BeautifulSoupTest(unittest.TestCase):
    """Unit tests for the helper functions in the ``beautiful_soup`` module.

    Each test parses a small inline HTML fixture with ``html.parser`` and
    passes the resulting tree (or part of it) to one of the helpers.
    """

    def setUp(self):
        """Create the sample HTML document stored on ``self.html``."""
        self.html = '''
        <html>
        <head></head>
        <body>
        <main>
        <div>
        <ul>
        <li><a href="https://www.cms.gov/Medicare/Billing/ElectronicBillingEDITrans">Electronic Billing</a></li>
        <li><a href="https://www.cms.gov/Medicare/Billing/BillingFAQs">Billing FAQs</a></li>
        </ul>
        </div>
        <div>
        <div>
        <p>Paragraph</p>
        <ul>
        <li>List Item</li>
        </ul>
        Text within div
        </div>
        </div>
        </main>
        </body>
        </html>
        '''

    def test_main_tag(self):
        """``get_main`` returns an element named ``main``."""
        soup = BeautifulSoup(self.html, 'html.parser')
        self.assertEqual(beautiful_soup.get_main(soup).name, 'main')

        # The same result is asserted for an empty document.
        # NOTE(review): presumably get_main creates or falls back to a <main>
        # element when none exists -- confirm against beautiful_soup.get_main.
        soup = BeautifulSoup("", 'html.parser')
        self.assertEqual(beautiful_soup.get_main(soup).name, 'main')

    def test_has_no_div_childre(self):
        """Parse the fixtures intended for ``has_no_div_children``.

        All assertions in this test are currently disabled, so it only checks
        that both fixtures parse without raising.
        """
        # TODO: re-enable the assertions below once has_no_div_children is
        # confirmed to exist and behave as described.
        childless = '''
        <html>
        <body>
        <div><p>Text in div.</p></div>
        </body>
        </html>
        '''
        soup = BeautifulSoup(childless, 'html.parser')
        # self.assertFalse( beautiful_soup.has_no_div_children( soup.body ) )
        # self.assertTrue( beautiful_soup.has_no_div_children( soup.body.div ) )

        nested_div = '''
        <html>
        <body>
        <div>
        <div>Text in paragraph.</div>
        </div>
        </body>
        </html>
        '''
        soup = BeautifulSoup(nested_div, 'html.parser')
        # self.assertFalse( beautiful_soup.has_no_div_children( soup.body.div ) )

    def test_get_deepest_divs(self):
        """The first div from ``get_deepest_divs`` holds the inner text."""
        nested_div = '''
        <html>
        <body>
        <div>
        <div><p>Text in paragraph.</p></div>
        </div>
        </body>
        </html>
        '''
        soup = BeautifulSoup(nested_div, 'html.parser')
        deepest = beautiful_soup.get_deepest_divs(soup.body)
        self.assertEqual(deepest[0].text, 'Text in paragraph.')

    def test_list(self):
        """Call ``get_deepest_divs`` on a div that wraps a list.

        The ``get_list_text`` assertion is disabled, so this test only checks
        that ``get_deepest_divs`` runs on the fixture without raising.
        """
        # NOTE(review): the anchors below are written as `href""` (no `=`).
        # Confirm whether this malformed markup is intentional.
        nested_div = '''
        <html>
        <body>
        <div>
        <ul>
        <li>Text in list.</li>
        <li><a href"">Link in list.</a></li>
        <li>Text with <a href"">Link</a> in list.</li>
        </ul>
        </div>
        </body>
        </html>
        '''
        soup = BeautifulSoup(nested_div, 'html.parser')
        divs = beautiful_soup.get_deepest_divs(soup.body)
        # self.assertEqual( beautiful_soup.get_list_text( divs )[0], 'Text in list.' )

    def test_exlcude_links(self):
        """``find_direct_text`` selects every element except the link-only ``<li>``.

        The fixture has four ``<li>`` and four ``<p>`` elements. Only the
        first ``<li>``, whose sole content is a link, is expected to be
        filtered out.
        """
        nested_div = '''
        <li><a href='somelink'>I DONT WANT THIS</a></li>
        <li>blablalba <a href='both'>I WANT THIS</a> blalba</li>
        <li><a href='right'>I WANT THIS</a> blalba</li>
        <li>blablalba <a href='left'>I WANT THIS</a></li>
        <p><a href='somelink'>I WANT THIS</a></p>
        <p>blablalba <a href='both'>I WANT THIS</a> blalba</p>
        <p><a href='right'>I WANT THIS</a> blalba</p>
        <p>blablalba <a href='left'>I WANT THIS</a></p>
        '''
        soup = BeautifulSoup(nested_div, 'html.parser')
        list_items = soup.find_all(beautiful_soup.find_direct_text)
        expected = [
            'blablalba I WANT THIS blalba',
            'I WANT THIS blalba',
            'blablalba I WANT THIS',
            'I WANT THIS',
            'blablalba I WANT THIS blalba',
            'I WANT THIS blalba',
            'blablalba I WANT THIS',
        ]
        # Compare the whole list at once: an index-based loop would pass
        # silently when too few elements match and raise IndexError when too
        # many do.
        self.assertEqual([item.get_text() for item in list_items], expected)
if __name__ == '__main__':
    # Run the tests in this module when the file is executed directly.
    unittest.main()