grapplerulrich committed on
Commit
561abab
1 Parent(s): 9c1234d

Exclude pdf from search results

- reorder beautiful soup code
- add extra content classes / id

Files changed (2)
  1. beautiful_soup/app.py +62 -52
  2. main.py +31 -23
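
Editor's note: the headline change lands in main.py, where Google's negated filetype operator is appended to the query before the Custom Search request is issued, so PDF results never come back. A minimal sketch of that call, assuming a `service` client already built elsewhere with googleapiclient.discovery.build(); `search_without_pdfs` is an illustrative name, not a function in the repo:

# Minimal sketch of the query tweak, assuming `service` was created elsewhere with
# googleapiclient.discovery.build('customsearch', 'v1', developerKey=...).
# search_without_pdfs is illustrative and does not exist in main.py.
def search_without_pdfs( service, query ):
    # The negated filetype operator asks Google to drop results served as PDFs.
    query = query + ' -filetype:pdf'
    return service.cse().list(
        q=query,
        cx='05048cc2df6134a06',  # search engine ID taken from main.py
    ).execute()
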
beautiful_soup/app.py CHANGED
@@ -12,6 +12,36 @@ import requests
 - Export the text
 '''

+def get_url_content( url ):
+    file_path = 'page-content/' + uuid.uuid5( uuid.NAMESPACE_URL, url ).hex + '.txt'
+    if exists( file_path ):
+        with open( file_path, 'r' ) as file_content:
+            content = file_content.read()
+    else:
+        try:
+            content = extract_content( url )
+        except Exception as exception:
+            raise exception
+        with open( file_path, 'w' ) as file:
+            file.write( content )
+
+    return content
+
+def extract_content( url ):
+    try :
+        soup = get_soup( url )
+    except Exception as exception:
+        raise exception
+
+    if soup is None:
+        raise Exception('No HTML content found.')
+
+    content = get_main_content( soup )
+    if content is None :
+        raise Exception('No main content found.')
+
+    return get_tags_text( content )
+
# Make request and get html content.
def get_soup( url ):
    file_path = 'web-pages/' + uuid.uuid5( uuid.NAMESPACE_URL, url ).hex + '.html'
@@ -31,16 +61,7 @@ def get_soup( url ):

    return BeautifulSoup(html, 'html.parser')

-# Extract content from main tag.
-def get_main( soup ):
-    return soup.main
-
def get_main_content( soup ):
-    content = soup.main
-
-    if content is not None:
-        print('Has main tag.')
-        return content

    content = soup.find( "div", { "class": "post-body" } )
    if content is not None:
@@ -72,6 +93,21 @@ def get_main_content( soup ):
        print('Has .article-inner_html class.')
        return content

+    content = soup.find( "div", { "id": "bmdDetail-Content" } )
+    if content is not None:
+        print('Has .bmdDetail-Content id.')
+        return content
+
+    content = soup.find( "div", { "id": "main" } )
+    if content is not None:
+        print('Has #main id.')
+        return content
+
+    content = soup.main
+    if content is not None:
+        print('Has main tag.')
+        return content
+
    content = soup.find( "article" )
    if content is not None:
        print('Has article tag.')
@@ -79,6 +115,23 @@ def get_main_content( soup ):

    return None

+def get_tags_text( soup ):
+    text = ''
+    tags = soup.find_all( find_direct_text )
+    for tag in tags:
+        if tag.name == 'div' and tag.find( text=True, recursive=False ) :
+            for div in tag.find_all(text=True, recursive=False):
+                text += div.get_text().strip() + ' '
+        else :
+            text += tag.get_text().strip() + ' '
+    return text
+
+# -------------------------------------- #
+
+# Extract content from main tag.
+def get_main( soup ):
+    return soup.main
+
def get_deepest_divs( tag ):
    # Get all the divs from within a tag.
    return [div for div in tag.findAll('div') if not div.find('div')]
@@ -102,49 +155,6 @@ def find_direct_text( tag ):
def find_div_text( tag ):
    return tag.name == 'div' or tag.find( text=True, recursive=False ) and tag.find( text=True, recursive=False ).strip()

-def get_tags_text( soup ):
-    text = ''
-    tags = soup.find_all( find_direct_text )
-    for tag in tags:
-        if tag.name == 'div' and tag.find( text=True, recursive=False ) :
-            for div in tag.find_all(text=True, recursive=False):
-                text += div.get_text().strip() + ' '
-        else :
-            text += tag.get_text().strip() + ' '
-    return text
-
-def extract_content( url ):
-    try :
-        soup = get_soup( url )
-    except Exception as exception:
-        raise exception
-
-    if soup is None:
-        raise Exception('No HTML content found.')
-
-    content = get_main_content( soup )
-    if content is None :
-        # content = soup.body
-        raise Exception('No main content found.')
-
-    return get_tags_text( content )
-    # return ''.join([' ' + tag.get_text().strip() for tag in main.find_all( find_direct_text )])
-
-def get_url_content( url ):
-    file_path = 'page-content/' + uuid.uuid5( uuid.NAMESPACE_URL, url ).hex + '.txt'
-    if exists( file_path ):
-        with open( file_path, 'r' ) as file_content:
-            content = file_content.read()
-    else:
-        try:
-            content = extract_content( url )
-        except Exception as exception:
-            raise exception
-        with open( file_path, 'w' ) as file:
-            file.write( content )
-
-    return content
-
if __name__ == '__main__':
    url = 'https://www.cms.gov/Medicare/Billing/ElectronicBillingEDITrans'
    print(extract_content(url))
 
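Editor's note: the app.py changes reorder the helpers so the entry points (get_url_content, extract_content) sit at the top, and teach get_main_content to look for two extra containers (div#bmdDetail-Content and div#main) before falling back to the <main> and <article> tags. A condensed sketch of that fallback order, not code from the commit; find_main_content and the `candidates` tuple are illustrative, only the selectors are real:

# Illustrative sketch of get_main_content()'s selector fallback after this commit.
from bs4 import BeautifulSoup

def find_main_content( soup: BeautifulSoup ):
    candidates = (
        { "class": "post-body" },       # existing class-based selectors come first...
        { "id": "bmdDetail-Content" },  # new in this commit
        { "id": "main" },               # new in this commit
    )
    for attrs in candidates:
        content = soup.find( "div", attrs )
        if content is not None:
            return content
    # Then the bare <main> tag, then <article>, as the reordered code does.
    return soup.main or soup.find( "article" )
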
main.py CHANGED
@@ -24,6 +24,9 @@ def google_search_api_request( query ):
        cache_discovery=False
    )

+    # Exclude PDFs from search results.
+    query = query + ' -filetype:pdf'
+
    return service.cse().list(
        q=query,
        cx='05048cc2df6134a06',
@@ -52,36 +55,41 @@ def search_results( query ):
def main():
    st.title('Google Search')
    query = st.text_input('Search query')
-
    if query :
-        try:
-            results = search_results( query )
-        except Exception as exception:
-            st.exception(exception)
-
-        for result in results:
-            st.write(result['link'])
-
-            try:
-                content = get_url_content( result['link'] )
-            except Exception as exception:
-                st.exception(exception)
-
-            file_path = 'summaries/' + uuid.uuid5( uuid.NAMESPACE_URL, result['link'] ).hex + '.json'
-            if exists( file_path ):
-                with open( file_path, 'r' ) as file:
-                    summary = json.load( file )
-            else:
-                try:
-                    summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
-                    summary = summarizer(content, max_length=130, min_length=30, do_sample=False, truncation=True)
-                except Exception as exception:
-                    raise exception
-                with open( file_path, 'w' ) as file:
-                    json.dump( summary, file )
-
-                for sentence in summary:
-                    st.write(sentence['summary_text'])
+        with st.spinner('Loading search results...'):
+            try:
+                results = search_results( query )
+            except Exception as exception:
+                st.exception(exception)
+                return
+
+        for result in results:
+            url_id = uuid.uuid5( uuid.NAMESPACE_URL, result['link'] ).hex
+            st.write(result['link'])
+            st.write(url_id)
+
+            try:
+                content = get_url_content( result['link'] )
+            except Exception as exception:
+                st.exception(exception)
+                continue
+
+            file_path = 'summaries/' + url_id + '.json'
+            if exists( file_path ):
+                with open( file_path, 'r' ) as file:
+                    summary = json.load( file )
+            else:
+                try:
+                    summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
+                    summary = summarizer(content, max_length=130, min_length=30, do_sample=False, truncation=True)
+                except Exception as exception:
+                    raise exception
+                with open( file_path, 'w' ) as file:
+                    json.dump( summary, file )
+
+            for sentence in summary:
+                st.write(sentence['summary_text'])

if __name__ == '__main__':
    main()
 
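Editor's note: both files share the same cache convention — every artefact for a URL is stored under a filename derived from uuid.uuid5(uuid.NAMESPACE_URL, url).hex, with raw HTML in web-pages/, extracted text in page-content/, and summaries in summaries/. A small read-through sketch of that pattern; cache_path and read_through are illustrative helpers, not part of the repo:

# Sketch of the shared caching convention; cache_path/read_through are hypothetical.
import uuid
from os.path import exists

def cache_path( directory, url, extension ):
    # Same key both files use: the hex UUID5 of the URL.
    return directory + '/' + uuid.uuid5( uuid.NAMESPACE_URL, url ).hex + '.' + extension

def read_through( directory, url, extension, produce ):
    # Return the cached text if present, otherwise produce it and store it.
    file_path = cache_path( directory, url, extension )
    if exists( file_path ):
        with open( file_path, 'r' ) as cached:
            return cached.read()
    content = produce( url )
    with open( file_path, 'w' ) as file:
        file.write( content )
    return content

# get_url_content() in app.py is roughly: read_through( 'page-content', url, 'txt', extract_content )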