grapplerulrich committed
Commit 1f95777
Parent: 151c2dd

Better exception handling

Files changed (3)
  1. .gitignore +1 -1
  2. beautiful_soup/app.py +38 -25
  3. main.py +40 -30
.gitignore CHANGED
@@ -1,5 +1,5 @@
  /.venv
  .env
  __pycache__
- /search-urls
+ /search-results
  /web-pages
beautiful_soup/app.py CHANGED
@@ -1,7 +1,7 @@
- from bs4 import BeautifulSoup
- import requests
  import uuid
  from os.path import exists
+ from bs4 import BeautifulSoup
+ import requests

  '''
  - Error handling
@@ -14,26 +14,18 @@ from os.path import exists

  # Make request and get html content.
  def get_soup( url ):
-
-
      file_path = 'web-pages/' + uuid.uuid5( uuid.NAMESPACE_URL, url ).hex + '.html'
-     if ( exists( file_path ) ):
+     if exists( file_path ):
          with open( file_path, 'r' ) as web_page:
              html = web_page.read()
      else:
-         try:
-             request = requests.get(url)
-         except:
-             print('Unable to retrieve content, skipping URL')
-             return
-         if not request.ok:
-             print( "Unable to retrieve content, skipping URL. Status code: {}".format( request.status_code ) )
-             return
-         if not request.content:
-             print(request.content)
-             return
-         html = request.content
-         with open( file_path, 'wb' ) as file:
+         headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.74 Safari/537.36'}
+         response = requests.get( url, headers=headers )
+         response.raise_for_status()
+         if not response.text:
+             raise Exception('HTML empty.')
+         html = response.text
+         with open( file_path, 'w' ) as file:
              file.write( html )

      return BeautifulSoup(html, 'html.parser')
@@ -60,16 +52,37 @@ def get_list_text( tags ):
      return list_items

  def find_direct_text( tag ):
-     return tag.name == 'li' or tag.name == 'p' or tag.name == 'h2' or tag.name == 'h3'
+     return tag.name == 'li' or tag.name == 'p' or tag.name == 'h2' or tag.name == 'h3' or tag.name == 'span' or find_div_text( tag )
+
+ def find_div_text( tag ):
+     return tag.name == 'div' or tag.find( text=True, recursive=False ) and tag.find( text=True, recursive=False ).strip()
+
+ def get_tags_text( soup ):
+     text = ''
+     tags = soup.find_all( find_direct_text )
+     for tag in tags:
+         if tag.name == 'div' and tag.find( text=True, recursive=False ):
+             for div in tag.find_all( text=True, recursive=False ):
+                 text += div.get_text().strip() + ' '
+         else:
+             text += tag.get_text().strip() + ' '
+     return text

  def extract_content( url ):
-     soup = get_soup( url )
-     if ( soup == None ):
-         return None
+     try:
+         soup = get_soup( url )
+     except Exception as exception:
+         raise exception
+
+     if soup is None:
+         raise Exception('No content found.')
+
      main = get_main( soup )
-     if ( main == None ):
-         return 'No main tag found.'
-     return ''.join([' ' + tag.get_text().strip() for tag in main.find_all( find_direct_text )])
+     if main is None:
+         raise Exception('No main tag found.')
+
+     return get_tags_text( main )
+     # return ''.join([' ' + tag.get_text().strip() for tag in main.find_all( find_direct_text )])

  if __name__ == '__main__':
      url = 'https://www.cms.gov/Medicare/Billing/ElectronicBillingEDITrans'
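
The app.py rewrite swaps print-and-return-None error handling for exceptions: response.raise_for_status() raises requests.HTTPError on any 4xx/5xx status, an empty body raises instead of returning silently, and extract_content now raises rather than returning None or an error string, which is why main.py wraps each call in try/except. A minimal standalone sketch of the same fetch-then-cache pattern (the fetch_html name and the shortened User-Agent are illustrative, not from the commit; a web-pages/ directory is assumed to exist):

import uuid
from os.path import exists

import requests
from bs4 import BeautifulSoup

def fetch_html( url ):
    # Deterministic cache key: UUIDv5 derived from the URL.
    file_path = 'web-pages/' + uuid.uuid5( uuid.NAMESPACE_URL, url ).hex + '.html'
    if exists( file_path ):
        with open( file_path, 'r' ) as web_page:
            return web_page.read()
    response = requests.get( url, headers={'User-Agent': 'Mozilla/5.0'} )
    response.raise_for_status()  # requests.HTTPError on 4xx/5xx
    if not response.text:
        raise Exception('HTML empty.')  # mirrors the commit's empty-body check
    with open( file_path, 'w' ) as file:
        file.write( response.text )
    return response.text

# Callers handle the exception instead of checking for None:
soup = BeautifulSoup( fetch_html( 'https://example.com' ), 'html.parser' )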
main.py CHANGED
@@ -1,23 +1,26 @@
+
+ from os import getenv
+ from os.path import exists
+ from functools import cache
+ import json
  import streamlit as st
  from dotenv import load_dotenv
  from googleapiclient.discovery import build
- from functools import cache
  from slugify import slugify
- from os import getenv
- from os.path import exists
- import json

  from beautiful_soup.app import extract_content

  @cache
- def google_search( query ):
+ def google_search_api_request( query ):
+     load_dotenv()
      api_key = getenv('GOOGLE_SEARCH_API_KEY')
-     # cx = os.getenv('GOOGLE_SEARCH_ENGIN_ID')
+     # cx = os.getenv('GOOGLE_SEARCH_ENGINE_ID')
      service = build(
          "customsearch",
          "v1",
          developerKey=api_key,
-         cache_discovery=False
+         cache_discovery=False,
+         num=5
      )

      return service.cse().list(
@@ -25,35 +28,42 @@ def google_search( query ):
          cx='05048cc2df6134a06',
      ).execute()

+ def search_results( query ):
+     file_path = 'search-results/' + slugify( query ) + '.json'
+
+     results = []
+     if exists( file_path ):
+         with open( file_path, 'r' ) as results_file:
+             results = json.load( results_file )
+     else:
+         search_result = google_search_api_request( query )
+         if ( int( search_result['searchInformation']['totalResults'] ) > 0 ):
+             results = search_result['items']
+             with open( file_path, 'w' ) as results_file:
+                 json.dump( results, results_file )
+
+     if ( len( results ) == 0 ):
+         raise Exception('No results found.')
+
+     return results
+
  def main():
-     load_dotenv()
      st.title('Google Search')
      query = st.text_input('Search query')

-     if ( query ):
-         file_path = 'search-urls/' + slugify( query ) + '.json'
-
-         if ( exists( file_path ) ):
-             with open( file_path, 'r' ) as results_file:
-                 results = json.load(results_file)
-         else:
-             search_result = google_search( query )
-             if( int( search_result['searchInformation']['totalResults'] ) > 0 ):
-                 results = search_result['items']
-                 with open( file_path, 'w' ) as results_file:
-                     json.dump( results, results_file )
-             else:
-                 results = []
+     if query:
+         try:
+             results = search_results( query )
+         except Exception as exception:
+             st.exception(exception)

-         if ( len( results ) == 0 ):
-             st.write( 'No results found.' )
+         for result in results:
+             st.write(result['link'])
+             try:
+                 st.write( extract_content( result['link'] ) )
+             except Exception as exception:
+                 st.exception(exception)

-         try:
-             for item in results:
-                 st.write(item['link'])
-                 st.write(extract_content( item['link'] ))
-         except Exception as e:
-             st.exception(e)

  if __name__ == '__main__':
      main()
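
Two caveats with the new main.py, offered as suggestions rather than as anything present in this commit. First, googleapiclient.discovery.build() takes no num parameter, so passing num=5 to it raises a TypeError; num is a parameter of the cse().list() request. Second, if search_results() raises, st.exception() renders the error but execution continues into the for loop with results unbound, producing a NameError. A sketch of both corrections (imports and search_results() assumed unchanged from the diff above; the q=query argument is assumed from the elided line of the list() call):

@cache
def google_search_api_request( query ):
    load_dotenv()
    api_key = getenv('GOOGLE_SEARCH_API_KEY')
    service = build(
        "customsearch",
        "v1",
        developerKey=api_key,
        cache_discovery=False,  # build() accepts no num argument
    )
    return service.cse().list(
        q=query,  # assumed; this line is elided in the diff above
        cx='05048cc2df6134a06',
        num=5,    # the result limit belongs here, on cse().list()
    ).execute()

def main():
    st.title('Google Search')
    query = st.text_input('Search query')
    if not query:
        return
    try:
        results = search_results( query )
    except Exception as exception:
        st.exception(exception)
        return  # without this, the loop below hits an unbound results variable
    for result in results:
        st.write(result['link'])
        try:
            st.write( extract_content( result['link'] ) )
        except Exception as exception:
            st.exception(exception)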