# Cpp4App_test/SEM/get_text.py
import re

import bs4
from bs4 import BeautifulSoup

from SEM.paragraph_bayesian import clf, tf

# Maps each classifier label to the output file for that policy category.
mark_txt = {'0': "/data_types.txt",
            '1': "/data_types.txt",
            '2': "/personal_information_type.txt",
            '3': "/share_information.txt",
            '4': "/protect_information.txt",
            '5': "/advertising.txt",
            '6': "/user_right.txt",
            '7': "/children.txt",
            '8': "/region.txt",
            '9': "/update.txt",
            '10': "/way_to_collect.txt",
            '11': "/provider.txt",
            '12': "/data_retention.txt",
            '13': "/data_types.txt",
            '14': "/thrid_party.txt",  # spelling kept as-is to match the on-disk file name
            '15': "/data_types.txt"}
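
# Illustrative note (assumption: `clf` and `tf` from SEM.paragraph_bayesian
# behave like scikit-learn style objects, as their use below suggests):
# clf.predict(...) returns an array of string labels, and the first label
# selects the per-category output file, e.g.
#     mark = clf.predict(tf.transform(["We collect your email address"]))
#     out_path = './SEM/txt/' + pathName[:-5] + mark_txt.get(mark[0])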


def process_content_outside_heading(content, pathName):
    # Convert the HTML fragment to plain text and normalise whitespace.
    soup = BeautifulSoup(content, 'html.parser')
    text_content = soup.get_text(separator=' ', strip=True)
    # If anything remains, classify it with the same classifier used for
    # headings and append it to the matching category file.
    if text_content:
        mark = clf.predict(tf.transform([text_content]))
        with open('./SEM/txt/' + pathName[:-5] + mark_txt.get(mark[0]), "a", encoding='utf-8') as f:
            f.write(text_content)
            f.write("\n")


def write_text(title_list, pathName, soup):
    # Coverage flags: each is set to 1 once a heading of that category is seen.
    type = 0
    security = 0
    right = 0
    specialGroup = 0
    specialArea = 0
    update = 0
    retention = 0
    useData = 0
    # Drop decorative bullet "headings".
    clean_title_list = []
    for title in title_list:
        if title.text != "•":
            clean_title_list.append(title)
    # # Handle content that appears before the first heading.
    # if clean_title_list:
    #     first_title = clean_title_list[0]
    #     content_before_first_title = []
    #     for element in first_title.find_previous_siblings():
    #         content_before_first_title.insert(0, element)  # insert in reverse to keep document order
    #     content_before_first_title = ''.join([str(elem) for elem in content_before_first_title])
    #     # Classify and store the content that sits outside any heading.
    #     process_content_outside_heading(content_before_first_title, pathName)
    # print("title list:" + str(clean_title_list))
    lastMark = ""
    for title in clean_title_list:
        if title is None:
            continue
        # Flatten the heading to plain text.
        title_Str = re.sub(r'\s+', ' ', str(title))
        title_Str = re.sub(r'<[^<]+?>', '', title_Str).replace('\n', '').strip()
        try:
            mark = clf.predict(tf.transform([title_Str]))
        except Exception:
            continue
        # Record which categories are covered by at least one heading.
        if mark == "1":
            type = 1
        if mark == "4":
            security = 1
        if mark == "6":
            right = 1
        if mark == "13":
            useData = 1
        if mark == "8":
            specialArea = 1
        if mark == "9":
            update = 1
        if mark == "12":
            retention = 1
        if mark == "7":
            specialGroup = 1
        # Label "0" means "unclassified": inherit the previous heading's label.
        if mark == "0":
            if lastMark != "":
                mark = lastMark
        lastMark = mark
        # Collect everything between this heading and the next one. The last
        # heading has no successor, so its content runs to the end of the
        # document. (The original duplicated this block for the last-heading
        # case behind an index lookup that always raised, so that branch never
        # ran; the single loop below handles both cases.)
        idx = clean_title_list.index(title)
        next_title = clean_title_list[idx + 1] if idx + 1 < len(clean_title_list) else None
        for sibling in title.next_elements:
            if next_title is not None and sibling == next_title:
                break
            if str(sibling) == '\n':
                continue
            if sibling == title.string:
                continue
            # Keep only text-bearing <li>, <p> and <br> elements.
            if sibling.name is None or (isinstance(sibling, bs4.element.Tag)
                                        and sibling.name not in ('li', 'p', 'br')):
                continue
            if sibling.name == 'li' and sibling.find_previous('p'):
                # Prefix each list item with its introductory paragraph so the
                # item stays meaningful out of context.
                parent = ' '.join(sibling.find_previous('p').text.split())
                text = ' '.join(sibling.get_text().split())
                currentSibling = f"{parent} {text}"
            else:
                currentSibling = ' '.join(sibling.get_text().split())
            if len(currentSibling) != 0:
                # Normalise trailing punctuation so every stored segment reads
                # as a sentence.
                if currentSibling[-1].isalpha() or currentSibling[-1] == ")":
                    currentSibling = currentSibling + "."
                elif currentSibling[-1] in (";", ":", ","):
                    currentSibling = currentSibling[:-1] + "."
                with open('./SEM/txt/' + pathName[:-5] + mark_txt.get(mark[0]), "a", encoding='utf-8') as f:
                    f.write(currentSibling)
                    f.write("\n")
    # Handle paragraphs that are not covered by any heading: remove the
    # processed headings from the soup, then classify whatever text remains.
    remaining_soup = soup  # keep the soup structure for the whole page
    for title in clean_title_list:
        title.extract()  # remove each processed heading
    removeUnneccessaryElements(remaining_soup)
    remaining_segments = makeCoarseSegments(remaining_soup)
    for seg in remaining_segments:
        seg_clean = ' '.join(seg.split())
        if len(seg_clean) != 0:
            try:
                mark = clf.predict(tf.transform([seg_clean]))
                with open('./SEM/txt/' + pathName[:-5] + mark_txt.get(mark[0]), "a", encoding='utf-8') as f:
                    f.write(seg_clean)
                    f.write("\n")
            except Exception:
                continue
    # # Handle the content after the last heading.
    # if clean_title_list:
    #     last_title = clean_title_list[-1]
    #     content_after_last_title = []
    #     for element in last_title.next_siblings:
    #         content_after_last_title.append(element)
    #     content_after_last_title = ''.join([str(elem) for elem in content_after_last_title])
    #     # Classify and store the content that sits outside any heading.
    #     process_content_outside_heading(content_after_last_title, pathName)
    return type, security, right, specialArea, specialGroup, update, retention, useData


def write_text_without_label(text, pathName):
    # NOTE: this writer appends under './txt/...' while the writers above use
    # './SEM/txt/...'; the differing base directory is kept as in the original.
    with open('./txt/' + pathName[:-5] + '/data_types.txt', "a", encoding='utf-8') as f:
        currentSibling = str(text)
        if len(currentSibling) != 0:
            if currentSibling[-1].isalpha() or currentSibling[-1] == ")":
                currentSibling = currentSibling + "."
            elif currentSibling[-1] == ";":
                # Strings are immutable, so rebuild instead of assigning to [-1].
                currentSibling = currentSibling[:-1] + "."
            f.write(currentSibling)


def removeUnneccessaryElements(soup):
    # (Function name, including its original spelling, is kept so existing
    # callers keep working.)
    # Strip whole element types that never carry policy text.
    for element in soup(["script", "style", "nav", "footer", "header", "img",
                         "option", "select", "head", "button"]):
        element.extract()  # rip it out
    # Strip navigation chrome matched by class / id / role / data attributes.
    removal_rules = [
        ("div", {'class': 'footer'}),
        ("div", {'class': re.compile(r"sidebar")}),
        ("div", {'data-testid': re.compile(r"ax-navigation-menubar")}),
        ("div", {'class': re.compile(r"menu")}),
        ("li", {'class': re.compile(r"menu")}),
        ("p", {'class': re.compile(r"heading")}),
        ("p", {'class': re.compile(r"fw-bold")}),
        ("ul", {'class': re.compile(r"menu")}),
        ("div", {'class': re.compile(r"header")}),
        ("div", {'data-referrer': re.compile(r"page_footer")}),
        ("div", {'id': 'footer'}),
        ("div", {'id': re.compile(r"sidebar")}),
        ("div", {'id': re.compile(r"menu")}),
        ("li", {'id': re.compile(r"menu")}),
        ("ul", {'id': re.compile(r"menu")}),
        ("div", {'id': re.compile(r"header")}),
        ("div", {'id': re.compile(r"breadcrumbs")}),
        ("div", {'id': re.compile(r"instagram")}),
        ("div", {'role': re.compile(r"navigation")}),
        ("div", {'role': re.compile(r"banner")}),
        ("div", {'role': re.compile(r"button")}),
        ("ul", {'role': re.compile(r"navigation")}),
    ]
    for tag, attrs in removal_rules:
        for element in soup.find_all(tag, attrs):
            element.decompose()


def makeCoarseSegments(soup):
    def endAsSentence(text):
        # Normalise trailing punctuation so every segment reads as a sentence.
        if text[-1].isalpha() or text[-1] == ")":
            return text + "."
        if text[-1] in (";", ":", ","):
            return text[:-1] + "."
        return text

    segments = []
    # Paragraphs not immediately followed by a <ul> become standalone segments;
    # list-introducing paragraphs are merged with their items below instead.
    for p in soup.find_all("p"):
        if p.find_next() is not None:
            if p.find_next().name != "ul":
                text = ' '.join(p.get_text().split())
                if len(text) != 0:
                    segments.append(endAsSentence(text))
    # Each <li> becomes one segment, prefixed with its introductory paragraph
    # (when one exists) so the item stays meaningful out of context.
    for ul in soup.find_all("ul"):
        if ul.find_previous('p') is not None:
            parent = ' '.join(ul.find_previous('p').text.split())
            for element in ul.findChildren('li'):
                text = ' '.join(element.get_text().split())
                listElement = f"{parent} {text}"
                if len(listElement) != 0:
                    segments.append(endAsSentence(listElement))
        else:
            for element in ul.findChildren('li'):
                text = ' '.join(element.get_text().split())
                if len(text) != 0:
                    segments.append(endAsSentence(text))
    # if not segments:
    #     text = soup.getText().replace('\n', '').replace('↵', '')
    #     result = useAlgorithm(text)
    # else:
    #     result = segments
    return segments
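

if __name__ == "__main__":
    # Minimal usage sketch. Assumptions (not prescribed by this module): the
    # file name "example_policy.html" and the h1-h3 heading tags are
    # illustrative only, and the './SEM/txt/<name>/' output directory must
    # already exist, since the writers open files in append mode without
    # creating directories.
    with open("example_policy.html", encoding="utf-8") as fh:
        page_soup = BeautifulSoup(fh.read(), "html.parser")
    removeUnneccessaryElements(page_soup)
    headings = page_soup.find_all(["h1", "h2", "h3"])
    flags = write_text(headings, "example_policy.html", page_soup)
    print("coverage flags (type, security, right, specialArea, specialGroup,"
          " update, retention, useData):", flags)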