grapplerulrich commited on
Commit
8b32433
1 Parent(s): 35d7624

initial version with a small test for Beautiful Soup

Browse files
.gitignore CHANGED
@@ -1,2 +1,3 @@
1
  /.venv
2
  .env
 
 
1
  /.venv
2
  .env
3
+ __pycache__
beautiful-soup/app.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from bs4 import BeautifulSoup
2
+ import requests
3
+
4
+ '''
5
+ - Error handling
6
+ - Look if alternative to main tag is needed. Provide error message if main tag is not found.
7
+ - Menus are li tags with a tags within.
8
+ - li tags with text and tags should be exported
9
+ - Find divs that have text or p tags maybe other tags like divs
10
+ - Export the text
11
+ '''
12
+
13
# Make request and get html content.
def get_soup( url ):
    """Fetch `url` and return a BeautifulSoup of its HTML, or None on failure.

    Implements the error handling the module TODO asks for (and that the
    original left commented out): network errors and non-2xx responses are
    reported and yield None so callers can skip the URL.
    """
    try:
        request = requests.get(url)
    except requests.RequestException:
        # Covers connection errors, timeouts, invalid URLs, etc.
        print('Unable to retrieve content, skipping URL')
        return None

    if not request.ok:
        # Server answered, but with an error status (4xx/5xx).
        print("Unable to retrieve content, skipping URL. Status code: {}".format( request.status_code ))
        return None

    html = request.content
    soup = BeautifulSoup(html, 'html.parser')
    return soup
29
+
30
# Extract content from main tag.
def get_main( soup ):
    # Explicit lookup; equivalent to the `soup.main` attribute shortcut,
    # returning the first <main> element or None when absent.
    return soup.find('main')
33
+
34
def is_childless( tag ):
    # True when the tag contains no nested <div> elements
    # (find_all returns an empty, falsy list in that case).
    return not tag.find_all('div')
36
+
37
def get_divs( tag ):
    # Get all the divs from within the given tag, lazily keeping only
    # those without further divs inside (same elements the original
    # filter(...) yielded).
    return ( div for div in tag.find_all('div') if is_childless( div ) )
41
+
42
+
43
def extract_content( url ):
    """Fetch `url` and return the text of every <p> found in the childless
    divs of the page's <main> tag, as a list of strings."""
    soup = get_soup( url )
    if soup is None:
        # Request failed; nothing to extract (harmless guard even if
        # get_soup raises instead of returning None).
        return []
    main = get_main( soup )
    if main is None:
        # Page has no <main> tag (see module TODO) - nothing to extract.
        return []
    divs = get_divs( main )
    # Bug fix: the original comprehension referenced an undefined name
    # `div` (NameError); iterate over every childless div instead.
    return [p.get_text() for div in divs for p in div.find_all('p')]
48
+
49
+
50
+ # # Get all the divs from within the main tag.
51
+ # divs = soup.main.find_all('div')
52
+ # for div in divs:
53
+ # # Get all of the divs that do not have further divs within.
54
+ # no_child_div = len(div.find_all('div')) == 0
55
+ # if no_child_div:
56
+ # # Find all p tags in the div.
57
+ # content += [p.get_text() for p in div.find_all('p')]
58
+ # # Find all li in the div.
59
+ # for li in div.find_all('li'):
60
+ # #
61
+ # content += ''.join(li.find_all(text=True, recursive=False))
62
+ # content += ''.join(div.find_all(text=True, recursive=False))
63
+ # return content
64
+
65
# Bug fix: the original compared against '__main' (missing trailing
# underscore), so the demo below never ran when executed as a script.
if __name__ == '__main__':
    url = 'https://www.cms.gov/Medicare/Billing/ElectronicBillingEDITrans'
    print(extract_content(url))
beautiful-soup/requirements.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ beautifulsoup4
beautiful-soup/test.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import unittest
2
+ from bs4 import BeautifulSoup
3
+ import app
4
+
5
class BeautifulSoupTest(unittest.TestCase):
    """Unit tests for the helpers in app.py."""

    def test_beautiful_soup(self):
        # Smoke test: confirms the unittest harness itself runs.
        self.assertTrue(True)

    def test_main_tag(self):
        # app.get_main should return the parsed <main> element of a document.
        # NOTE(review): the fixture's internal indentation was lost in this
        # diff view - the string content below is best-effort; it does not
        # affect the assertion, which only checks the tag name.
        html = '''
<html>
<head> </head>
<body>
<main>
<div>
<ul>
<li><a href="https://www.cms.gov/Medicare/Billing/ElectronicBillingEDITrans">Electronic Billing</a></li>
<li><a href="https://www.cms.gov/Medicare/Billing/BillingFAQs">Billing FAQs</a></li>
</ul>
</div>
<div>
<div>
<p>Paragraph</p>
<ul>
<li>List Item</li>
</ul>
Text within div
</div>
</div>
</main>
</body>
</html>
'''
        soup = BeautifulSoup(html, 'html.parser')
        self.assertEqual( app.get_main( soup ).name, 'main' )
36
+
37
# Run the test suite when this file is executed directly.
if __name__ == '__main__':
    unittest.main()
google-search/requirements.txt CHANGED
@@ -1,3 +1,4 @@
1
  streamlit
2
  google
3
  python-dotenv
 
 
1
  streamlit
2
  google
3
  python-dotenv
4
+ beautifulsoup4