grapplerulrich committed
Commit 1f86974
1 Parent(s): b72fb4c

Save strings as json instead of text


Allows for better processing of sentences
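The point of the switch: the old cache held one flat text blob per page, while the JSON cache holds a list of strings, so downstream code can work on individual sentences (e.g. the README's "find sentences that contain the search keywords" task). A minimal consumer sketch, assuming the cache layout from the diff below; both helper names are hypothetical, not repo code:

```python
import json
import uuid

# Hypothetical helper (not part of the commit): reads a cached page back as
# a list of strings, using the same cache-key scheme as get_url_content().
def cached_strings( url ):
    path = 'page-content/' + uuid.uuid5( uuid.NAMESPACE_URL, url ).hex + '.json'
    with open( path, 'r' ) as cache:
        return json.load( cache )

# Because entries stay separate strings, filtering by keyword is trivial.
def strings_with_keyword( url, keyword ):
    return [ s for s in cached_strings( url ) if keyword.lower() in s.lower() ]
```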

Files changed (2)
  1. README.md +1 -0
  2. beautiful_soup/beautiful_soup.py +17 -17
README.md CHANGED
@@ -39,6 +39,7 @@ google_search_engine_id = "search-engine-id"
 - [ ] Improve fetched content.
 - [ ] Get some content from every search result.
 - [ ] Find sentences that contain the search keywords.
+- [ ] Divs with text & tags: extract text from the tags, then decompose the tags.
 - [ ] Summarization requires truncation. Find a solution where it is not needed.
 - [ ] Support German content.
 - [ ] Improve queries to include more keywords (expand abbreviations & define context).
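The TODO item added above targets divs that mix direct text with child tags. One hedged reading of it, as a BeautifulSoup sketch (the function is hypothetical, not repo code): pull the text out of each child tag first, then `decompose()` the tag so only the div's own text remains.

```python
from bs4 import BeautifulSoup

# Hypothetical sketch for the TODO above: extract text from a div's child
# tags, then decompose the tags so the div keeps only its direct text.
def split_div_text( div ):
    child_texts = []
    for child in div.find_all( True, recursive=False ):
        text = child.get_text( ' ', strip=True )
        if text != '':
            child_texts.append( text )
        child.decompose()  # remove the tag so its text is not counted twice
    return div.get_text( ' ', strip=True ), child_texts

soup = BeautifulSoup( '<div>Intro text <p>paragraph</p> outro</div>', 'html.parser' )
print( split_div_text( soup.div ) )  # ('Intro text outro', ['paragraph'])
```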
beautiful_soup/beautiful_soup.py CHANGED
@@ -1,4 +1,5 @@
 import uuid
+import json
 from os import makedirs, remove
 from os.path import exists, dirname
 from bs4 import BeautifulSoup
@@ -14,22 +15,22 @@ import requests
 '''
 
 def get_url_content( url ):
-    file_path = 'page-content/' + uuid.uuid5( uuid.NAMESPACE_URL, url ).hex + '.txt'
-    makedirs(dirname(file_path), exist_ok=True)
-    if exists( file_path ):
-        with open( file_path, 'r' ) as file_content:
-            content = file_content.read()
+    file = 'page-content/' + uuid.uuid5( uuid.NAMESPACE_URL, url ).hex + '.json'
+    makedirs(dirname(file), exist_ok=True)
+    if exists( file ):
+        with open( file, 'r' ) as file_content:
+            strings = json.load( file_content )
     else:
         try:
-            content = extract_content( url )
+            strings = extract_strings( url )
         except Exception as exception:
             raise exception
-        with open( file_path, 'w' ) as file:
-            file.write( content.strip() )
+        with open( file, 'w' ) as file:
+            json.dump( strings, file )
 
-    return content
+    return strings
 
-def extract_content( url ):
+def extract_strings( url ):
     try :
         soup = get_soup( url )
     except Exception as exception:
@@ -46,11 +47,10 @@ def extract_content( url ):
     if content is None :
         raise Exception('No main content found.')
 
-    text = get_tags_text( content )
-    if text is None :
+    strings = get_tags_text( content )
+    if strings is None :
         raise Exception('No text found.')
-
-    return text
+    return strings
 
 # Make request and get html content.
 def get_soup( url ):
@@ -136,16 +136,16 @@ def get_main_content( soup ):
     return None
 
 def get_tags_text( soup ):
-    text = ''
+    text = []
     tags = soup.find_all( allowed_tags )
     for tag in tags:
         if tag.name == 'div' :
             for div in tag.find_all(text=True, recursive=False):
                 found_text = div.get_text( ' ', strip=True )
                 if found_text != '':
-                    text += found_text
+                    text.append( found_text )
         else :
-            text += tag.get_text( ' ', strip=True ) + ' '
+            text.append( tag.get_text( ' ', strip=True ))
     return text
 
 def allowed_tags( tag ):
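After this commit, callers of get_url_content() receive a list of strings rather than one text blob, cached on disk as JSON. A minimal usage sketch, assuming the repo layout makes the module importable as below; the URL is a placeholder:

```python
from beautiful_soup.beautiful_soup import get_url_content

# First call fetches the page, extracts the strings, and writes
# page-content/<uuid5-of-url>.json; repeat calls load the cached JSON.
strings = get_url_content( 'https://example.com/article' )
for sentence in strings:
    print( sentence )
```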