grapplerulrich committed
Commit
151c2dd
1 Parent(s): 152531a

Add caching and save search result URLs and HTML

Files changed (5)
  1. .gitignore +2 -0
  2. beautiful_soup/app.py +46 -37
  3. beautiful_soup/test.py +104 -8
  4. main.py +59 -0
  5. requirements.txt +5 -90
.gitignore CHANGED
@@ -1,3 +1,5 @@
  /.venv
  .env
  __pycache__
+ /search-urls
+ /web-pages
beautiful_soup/app.py CHANGED
@@ -1,5 +1,7 @@
  from bs4 import BeautifulSoup
  import requests
+ import uuid
+ from os.path import exists

  '''
  - Error handing
@@ -12,56 +14,63 @@ import requests

  # Make request and get html content.
  def get_soup( url ):
-     # try:
-     #     request = requests.get(url)
-     # except:
-     #     print('Unable to retrieve content, skipping URL')
-     #     return

-     # if not request.ok:
-     #     print("Unable to retrieve content, skipping URL. Status code: {}".format( request.status_code ))
-     #     return

-     request = requests.get(url)
-     html = request.content
-     soup = BeautifulSoup(html, 'html.parser')
-     return soup
+     file_path = 'web-pages/' + uuid.uuid5( uuid.NAMESPACE_URL, url ).hex + '.html'
+     if ( exists( file_path ) ):
+         with open( file_path, 'r' ) as web_page:
+             html = web_page.read()
+     else:
+         try:
+             request = requests.get(url)
+         except:
+             print('Unable to retrieve content, skipping URL')
+             return
+         if not request.ok:
+             print( "Unable to retrieve content, skipping URL. Status code: {}".format( request.status_code ) )
+             return
+         if not request.content:
+             print(request.content)
+             return
+         html = request.content
+         with open( file_path, 'wb' ) as file:
+             file.write( html )
+
+     return BeautifulSoup(html, 'html.parser')

  # Extract content from main tag.
  def get_main( soup ):
      return soup.main

- def is_childless( tag ):
-     return len( tag.find_all('div') ) == 0
+ def get_deepest_divs( tag ):
+     # Get all the divs from within a tag.
+     return [div for div in tag.findAll('div') if not div.find('div')]
+
+ def get_tag_text( tags ):
+     text = ''
+     for tag in tags:
+         print(tag.find_all('li'))
+         # text += [p.get_text() for p in tag.find_all('p')]
+     return text

- def get_divs( tag ):
-     # Get all the divs from within the main tag.
-     divs = tag.find_all('div')
-     return filter( is_childless, divs )
+ def get_list_text( tags ):
+     list_items = []
+     for tag in tags:
+         list_items = tag.find_all(find_direct_text)
+     return list_items

+ def find_direct_text( tag ):
+     return tag.name == 'li' or tag.name == 'p' or tag.name == 'h2' or tag.name == 'h3'

  def extract_content( url ):
      soup = get_soup( url )
+     if ( soup == None ):
+         return None
      main = get_main( soup )
-     divs = get_divs( main )
-     return [p.get_text() for p in div.find_all('p')]
-
-
-     # # Get all the divs from within the main tag.
-     # divs = soup.main.find_all('div')
-     # for div in divs:
-     #     # Get all of the divs that do not have further divs within.
-     #     no_child_div = len(div.find_all('div')) == 0
-     #     if no_child_div:
-     #         # Find all p tags in the div.
-     #         content += [p.get_text() for p in div.find_all('p')]
-     #         # Find all li in the div.
-     #         for li in div.find_all('li'):
-     #             #
-     #             content += ''.join(li.find_all(text=True, recursive=False))
-     #     content += ''.join(div.find_all(text=True, recursive=False))
-     # return content
+     if ( main == None ):
+         return 'No main tag found.'
+     return ''.join([' ' + tag.get_text().strip() for tag in main.find_all( find_direct_text )])

- if __name__ == '__main':
+ if __name__ == '__main__':
      url = 'https://www.cms.gov/Medicare/Billing/ElectronicBillingEDITrans'
      print(extract_content(url))
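
A quick sketch of how the new caching in get_soup behaves, assuming the script runs from the repository root and the web-pages/ directory already exists (the commit ignores it in git but the code never creates it). The cache file name is a UUIDv5 of the URL, so repeated calls for the same URL read the saved HTML from disk instead of re-fetching:

# Hypothetical usage sketch, not part of the commit.
from beautiful_soup.app import extract_content

url = 'https://www.cms.gov/Medicare/Billing/ElectronicBillingEDITrans'

text = extract_content( url )   # first call fetches the page and writes web-pages/<uuid5-of-url>.html
text = extract_content( url )   # second call reads the saved HTML instead of hitting the network again
print( text )
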
beautiful_soup/test.py CHANGED
@@ -1,15 +1,13 @@
  import unittest
  from bs4 import BeautifulSoup
- import app
+ import beautiful_soup

  class BeautifulSoupTest(unittest.TestCase):
-     def test_beautiful_soup(self):
-         self.assertTrue(True)

-     def test_main_tag(self):
-         html = '''
+     def setUp(self):
+         self.html = '''
          <html>
-             <head> </head>
+             <head></head>
              <body>
                  <main>
                      <div>
@@ -31,8 +29,106 @@ class BeautifulSoupTest(unittest.TestCase):
              </body>
          </html>
          '''
-         soup = BeautifulSoup(html, 'html.parser')
-         self.assertEqual( app.get_main( soup ).name, 'main' )
+
+     def test_main_tag(self):
+         soup = BeautifulSoup( self.html, 'html.parser' )
+         self.assertEqual( beautiful_soup.get_main( soup ).name, 'main' )
+
+         soup = BeautifulSoup( "", 'html.parser' )
+         self.assertEqual( beautiful_soup.get_main( soup ).name, 'main' )
+
+     def test_has_no_div_children(self):
+         childless = '''
+         <html>
+             <body>
+                 <div><p>Text in div.</p></div>
+             </body>
+         </html>
+         '''
+         soup = BeautifulSoup( childless, 'html.parser' )
+         # self.assertFalse( beautiful_soup.has_no_div_children( soup.body ) )
+         # self.assertTrue( beautiful_soup.has_no_div_children( soup.body.div ) )
+
+         nested_div = '''
+         <html>
+             <body>
+                 <div>
+                     <div>Text in paragraph.</div>
+                 </div>
+             </body>
+         </html>
+         '''
+         soup = BeautifulSoup( nested_div, 'html.parser' )
+         # self.assertFalse( beautiful_soup.has_no_div_children( soup.body.div ) )
+
+     def test_get_deepest_divs(self):
+         nested_div = '''
+         <html>
+             <body>
+                 <div>
+                     <div><p>Text in paragraph.</p></div>
+                 </div>
+             </body>
+         </html>
+         '''
+         soup = BeautifulSoup( nested_div, 'html.parser' )
+         self.assertEqual( beautiful_soup.get_deepest_divs( soup.body )[0].text, 'Text in paragraph.' )
+
+
+     def test_list(self):
+         nested_div = '''
+         <html>
+             <body>
+                 <div>
+                     <ul>
+                         <li>Text in list.</li>
+                         <li><a href="">Link in list.</a></li>
+                         <li>Text with <a href="">Link</a> in list.</li>
+                     </ul>
+                 </div>
+             </body>
+         </html>
+         '''
+         soup = BeautifulSoup( nested_div, 'html.parser' )
+         divs = beautiful_soup.get_deepest_divs( soup.body )
+         # self.assertEqual( beautiful_soup.get_list_text( divs )[0], 'Text in list.' )
+
+     def test_exclude_links(self):
+         nested_div = '''
+         <li><a href='somelink'>I DONT WANT THIS</a></li>
+         <li>blablalba <a href='both'>I WANT THIS</a> blalba</li>
+         <li><a href='right'>I WANT THIS</a> blalba</li>
+         <li>blablalba <a href='left'>I WANT THIS</a></li>
+
+         <p><a href='somelink'>I WANT THIS</a></p>
+         <p>blablalba <a href='both'>I WANT THIS</a> blalba</p>
+         <p><a href='right'>I WANT THIS</a> blalba</p>
+         <p>blablalba <a href='left'>I WANT THIS</a></p>
+         '''
+         soup = BeautifulSoup( nested_div, 'html.parser' )
+
+         list_items = soup.find_all(beautiful_soup.find_direct_text)
+         results = [
+             'blablalba I WANT THIS blalba',
+             'I WANT THIS blalba',
+             'blablalba I WANT THIS',
+             'I WANT THIS',
+             'blablalba I WANT THIS blalba',
+             'I WANT THIS blalba',
+             'blablalba I WANT THIS'
+         ]
+
+         print(list_items)
+         # for item in list_items:
+         #     print('item.get_text(): ' + item.get_text())
+
+         # help(list_items)
+         for i, item in enumerate(list_items):
+             self.assertEqual( item.get_text(), results[i] )
+
+         # self.assertEqual( list_items[0].get_text(), 'blablalba I WANT THIS blalba' )
+         # self.assertEqual( list_items[1].get_text(), 'I WANT THIS blalba' )
+         # self.assertEqual( list_items[2].get_text(), 'blablalba I WANT THIS' )

  if __name__ == '__main__':
      unittest.main()
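
The new tests rely on BeautifulSoup accepting a plain function as a find_all filter, which is how find_direct_text collects li, p, h2 and h3 tags in one pass. A minimal standalone sketch of that mechanism (using a tuple membership test, equivalent to the chained or in the committed code):

from bs4 import BeautifulSoup

def find_direct_text( tag ):
    # Keep only the tag names extract_content treats as text carriers.
    return tag.name in ( 'li', 'p', 'h2', 'h3' )

soup = BeautifulSoup( '<main><h2>Title</h2><div><p>Body</p><ul><li>Item</li></ul></div></main>', 'html.parser' )
print( [tag.get_text() for tag in soup.find_all( find_direct_text )] )  # ['Title', 'Body', 'Item']
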
main.py ADDED
@@ -0,0 +1,59 @@
+ import streamlit as st
+ from dotenv import load_dotenv
+ from googleapiclient.discovery import build
+ from functools import cache
+ from slugify import slugify
+ from os import getenv
+ from os.path import exists
+ import json
+
+ from beautiful_soup.app import extract_content
+
+ @cache
+ def google_search( query ):
+     api_key = getenv('GOOGLE_SEARCH_API_KEY')
+     # cx = os.getenv('GOOGLE_SEARCH_ENGIN_ID')
+     service = build(
+         "customsearch",
+         "v1",
+         developerKey=api_key,
+         cache_discovery=False
+     )
+
+     return service.cse().list(
+         q=query,
+         cx='05048cc2df6134a06',
+     ).execute()
+
+ def main():
+     load_dotenv()
+     st.title('Google Search')
+     query = st.text_input('Search query')
+
+     if ( query ):
+         file_path = 'search-urls/' + slugify( query ) + '.json'
+
+         if ( exists( file_path ) ):
+             with open( file_path, 'r' ) as results_file:
+                 results = json.load(results_file)
+         else:
+             search_result = google_search( query )
+             if( int( search_result['searchInformation']['totalResults'] ) > 0 ):
+                 results = search_result['items']
+                 with open( file_path, 'w' ) as results_file:
+                     json.dump( results, results_file )
+             else:
+                 results = []
+
+         if ( len( results ) == 0 ):
+             st.write( 'No results found.' )
+
+         try:
+             for item in results:
+                 st.write(item['link'])
+                 st.write(extract_content( item['link'] ))
+         except Exception as e:
+             st.exception(e)
+
+ if __name__ == '__main__':
+     main()
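
To try the Streamlit app locally (assumed setup, not spelled out in the commit): put GOOGLE_SEARCH_API_KEY in a .env file, create the search-urls/ and web-pages/ directories, and start the UI with "streamlit run main.py". Search results are cached on disk keyed by the slugified query, for example:

from slugify import slugify  # python-slugify, as listed in requirements.txt

# Hypothetical query; main.py would cache its results at this path.
query = 'medicare electronic billing'
print( 'search-urls/' + slugify( query ) + '.json' )  # search-urls/medicare-electronic-billing.json
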
requirements.txt CHANGED
@@ -1,90 +1,5 @@
- altair==4.2.0
- appnope==0.1.2
- argon2-cffi==21.3.0
- argon2-cffi-bindings==21.2.0
- asttokens==2.0.5
- attrs==21.4.0
- backcall==0.2.0
- beautifulsoup4==4.10.0
- bleach==4.1.0
- blinker==1.4
- cachetools==5.0.0
- certifi==2021.10.8
- cffi==1.15.0
- charset-normalizer==2.0.12
- click==8.0.4
- debugpy==1.6.0
- decorator==5.1.1
- defusedxml==0.7.1
- entrypoints==0.4
- executing==0.8.3
- gitdb==4.0.9
- GitPython==3.1.27
- idna==3.3
- importlib-metadata==4.11.3
- ipykernel==6.11.0
- ipython==8.2.0
- ipython-genutils==0.2.0
- ipywidgets==7.7.0
- jedi==0.18.1
- Jinja2==3.1.1
- jsonschema==4.4.0
- jupyter-client==7.2.1
- jupyter-core==4.9.2
- jupyterlab-pygments==0.1.2
- jupyterlab-widgets==1.1.0
- MarkupSafe==2.1.1
- matplotlib-inline==0.1.3
- mistune==0.8.4
- nbclient==0.5.13
- nbconvert==6.4.5
- nbformat==5.2.0
- nest-asyncio==1.5.4
- notebook==6.4.10
- numpy==1.22.3
- packaging==21.3
- pandas==1.4.1
- pandocfilters==1.5.0
- parso==0.8.3
- pexpect==4.8.0
- pickleshare==0.7.5
- Pillow==9.0.1
- prometheus-client==0.13.1
- prompt-toolkit==3.0.28
- protobuf==3.19.4
- psutil==5.9.0
- ptyprocess==0.7.0
- pure-eval==0.2.2
- pyarrow==7.0.0
- pycparser==2.21
- pydeck==0.7.1
- Pygments==2.11.2
- Pympler==1.0.1
- pyparsing==3.0.7
- pyrsistent==0.18.1
- python-dateutil==2.8.2
- pytz==2022.1
- pytz-deprecation-shim==0.1.0.post0
- pyzmq==22.3.0
- requests==2.27.1
- semver==2.13.0
- Send2Trash==1.8.0
- six==1.16.0
- smmap==5.0.0
- soupsieve==2.3.1
- stack-data==0.2.0
- streamlit==1.8.1
- terminado==0.13.3
- testpath==0.6.0
- toml==0.10.2
- toolz==0.11.2
- tornado==6.1
- traitlets==5.1.1
- tzdata==2022.1
- tzlocal==4.1
- urllib3==1.26.9
- validators==0.18.2
- wcwidth==0.2.5
- webencodings==0.5.1
- widgetsnbextension==3.6.0
- zipp==3.7.0
+ streamlit
+ google
+ python-dotenv
+ beautifulsoup4
+ python-slugify