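# Non-code header text (appears to be file-viewer page residue), kept as
# comments so that this module parses as Python:
#   grapplerulrich's picture
#   Add caching and save search results url and HTML
#   151c2dd
#   raw
#   history blame
#   4.54 kB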
import unittest
from bs4 import BeautifulSoup
import beautiful_soup
class BeautifulSoupTest(unittest.TestCase):
    """Unit tests for the helper functions of the local ``beautiful_soup`` module."""

    def setUp(self):
        """Store a shared HTML document with a ``<main>`` element on ``self.html``.

        The document contains a list of links and a nested ``<div>`` that mixes
        a paragraph, a list and bare text.
        """
        self.html = '''
<html>
<head></head>
<body>
<main>
<div>
<ul>
<li><a href="https://www.cms.gov/Medicare/Billing/ElectronicBillingEDITrans">Electronic Billing</a></li>
<li><a href="https://www.cms.gov/Medicare/Billing/BillingFAQs">Billing FAQs</a></li>
</ul>
</div>
<div>
<div>
<p>Paragraph</p>
<ul>
<li>List Item</li>
</ul>
Text within div
</div>
</div>
</main>
</body>
</html>
'''

    def test_main_tag(self):
        """``get_main`` must return a tag named ``main``."""
        soup = BeautifulSoup( self.html, 'html.parser' )
        self.assertEqual( beautiful_soup.get_main( soup ).name, 'main' )

        # The same result is expected for a document with no markup at all.
        soup = BeautifulSoup( "", 'html.parser' )
        self.assertEqual( beautiful_soup.get_main( soup ).name, 'main' )

    def test_has_no_div_childre(self):
        """Parse the fixtures intended for ``has_no_div_children``.

        TODO(review): every assertion here is disabled, so this test currently
        only checks that both fixtures parse. Re-enable the assertions once
        ``beautiful_soup.has_no_div_children`` is confirmed to exist.
        """
        childless = '''
<html>
<body>
<div><p>Text in div.</p></div>
</body>
</html>
'''
        soup = BeautifulSoup( childless, 'html.parser' )
        # self.assertFalse( beautiful_soup.has_no_div_children( soup.body ) )
        # self.assertTrue( beautiful_soup.has_no_div_children( soup.body.div ) )

        nested_div = '''
<html>
<body>
<div>
<div>Text in paragraph.</div>
</div>
</body>
</html>
'''
        soup = BeautifulSoup( nested_div, 'html.parser' )
        # self.assertFalse( beautiful_soup.has_no_div_children( soup.body.div ) )

    def test_get_deepest_divs(self):
        """The first deepest div of a two-level nesting carries the paragraph text."""
        nested_div = '''
<html>
<body>
<div>
<div><p>Text in paragraph.</p></div>
</div>
</body>
</html>
'''
        soup = BeautifulSoup( nested_div, 'html.parser' )
        deepest = beautiful_soup.get_deepest_divs( soup.body )
        self.assertEqual( deepest[0].text, 'Text in paragraph.' )

    def test_list(self):
        """Run ``get_deepest_divs`` on a div that wraps a list with links.

        TODO(review): the only assertion is disabled, so this test currently
        just checks that the call does not raise. Re-enable it once
        ``beautiful_soup.get_list_text`` is confirmed to exist.
        """
        # The anchors previously read `<a href"">` (missing `=`), which is not
        # a valid attribute; they now carry a proper empty href.
        nested_div = '''
<html>
<body>
<div>
<ul>
<li>Text in list.</li>
<li><a href="">Link in list.</a></li>
<li>Text with <a href="">Link</a> in list.</li>
</ul>
</div>
</body>
</html>
'''
        soup = BeautifulSoup( nested_div, 'html.parser' )
        divs = beautiful_soup.get_deepest_divs( soup.body )
        # self.assertEqual( beautiful_soup.get_list_text( divs )[0], 'Text in list.' )

    def test_exlcude_links(self):
        """``find_direct_text`` used as a ``find_all`` filter selects the wanted tags.

        Per the fixture labels, only the ``<li>`` whose sole content is a link
        is unwanted; the other seven elements are expected, in document order.
        """
        nested_div = '''
<li><a href='somelink'>I DONT WANT THIS</a></li>
<li>blablalba <a href='both'>I WANT THIS</a> blalba</li>
<li><a href='right'>I WANT THIS</a> blalba</li>
<li>blablalba <a href='left'>I WANT THIS</a></li>
<p><a href='somelink'>I WANT THIS</a></p>
<p>blablalba <a href='both'>I WANT THIS</a> blalba</p>
<p><a href='right'>I WANT THIS</a> blalba</p>
<p>blablalba <a href='left'>I WANT THIS</a></p>
'''
        soup = BeautifulSoup( nested_div, 'html.parser' )
        list_items = soup.find_all( beautiful_soup.find_direct_text )
        expected = [
            'blablalba I WANT THIS blalba',
            'I WANT THIS blalba',
            'blablalba I WANT THIS',
            'I WANT THIS',
            'blablalba I WANT THIS blalba',
            'I WANT THIS blalba',
            'blablalba I WANT THIS',
        ]
        # Compare the whole list at once: an index-by-index loop would pass
        # silently when fewer tags match (even none) and would raise
        # IndexError instead of a readable failure when more tags match.
        self.assertEqual( [ item.get_text() for item in list_items ], expected )
# Run the whole suite when this file is executed directly as a script.
if __name__ == '__main__':
    unittest.main()