# Cpp4App_test/SEM/get_text.py
import re

import bs4
from bs4 import BeautifulSoup

from SEM.paragraph_bayesian import clf, tf

# Maps each classifier label to the output file for that policy category.
mark_txt = {'0': "/data_types.txt",
            '1': "/data_types.txt",
            '2': "/personal_information_type.txt",
            '3': "/share_information.txt",
            '4': "/protect_information.txt",
            '5': "/advertising.txt",
            '6': "/user_right.txt",
            '7': "/children.txt",
            '8': "/region.txt",
            '9': "/update.txt",
            '10': "/way_to_collect.txt",
            '11': "/provider.txt",
            '12': "/data_retention.txt",
            '13': "/data_types.txt",
            '14': "/thrid_party.txt",  # spelling kept as-is to match the on-disk file name
            '15': "/data_types.txt"}
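
# Illustrative note (assumption: `clf` and `tf` from SEM.paragraph_bayesian
# behave like scikit-learn style objects, as their use below suggests):
# clf.predict(...) returns an array of string labels, and the first label
# selects the per-category output file, e.g.
#     mark = clf.predict(tf.transform(["We collect your email address"]))
#     out_path = './SEM/txt/' + pathName[:-5] + mark_txt.get(mark[0])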


def process_content_outside_heading(content, pathName):
    # Convert the HTML fragment to plain text and normalise whitespace.
    soup = BeautifulSoup(content, 'html.parser')
    text_content = soup.get_text(separator=' ', strip=True)
    # If anything remains, classify it with the same classifier used for
    # headings and append it to the matching category file.
    if text_content:
        mark = clf.predict(tf.transform([text_content]))
        with open('./SEM/txt/' + pathName[:-5] + mark_txt.get(mark[0]), "a", encoding='utf-8') as f:
            f.write(text_content)
            f.write("\n")


def write_text(title_list, pathName, soup):
    # Coverage flags: each is set to 1 once a heading of that category is seen.
    type = 0
    security = 0
    right = 0
    specialGroup = 0
    specialArea = 0
    update = 0
    retention = 0
    useData = 0
    # Drop decorative bullet "headings".
    clean_title_list = []
    for title in title_list:
        if title.text != "•":
            clean_title_list.append(title)
    # # Handle content that appears before the first heading.
    # if clean_title_list:
    #     first_title = clean_title_list[0]
    #     content_before_first_title = []
    #     for element in first_title.find_previous_siblings():
    #         content_before_first_title.insert(0, element)  # insert in reverse to keep document order
    #     content_before_first_title = ''.join([str(elem) for elem in content_before_first_title])
    #     # Classify and store the content that sits outside any heading.
    #     process_content_outside_heading(content_before_first_title, pathName)
    # print("title list:" + str(clean_title_list))
    lastMark = ""
    for title in clean_title_list:
        if title is None:
            continue
        # Flatten the heading to plain text.
        title_Str = re.sub(r'\s+', ' ', str(title))
        title_Str = re.sub(r'<[^<]+?>', '', title_Str).replace('\n', '').strip()
        try:
            mark = clf.predict(tf.transform([title_Str]))
        except Exception:
            continue
        # Record which categories are covered by at least one heading.
        if mark == "1":
            type = 1
        if mark == "4":
            security = 1
        if mark == "6":
            right = 1
        if mark == "13":
            useData = 1
        if mark == "8":
            specialArea = 1
        if mark == "9":
            update = 1
        if mark == "12":
            retention = 1
        if mark == "7":
            specialGroup = 1
        # Label "0" means "unclassified": inherit the previous heading's label.
        if mark == "0":
            if lastMark != "":
                mark = lastMark
        lastMark = mark
        # Collect everything between this heading and the next one. The last
        # heading has no successor, so its content runs to the end of the
        # document. (The original duplicated this block for the last-heading
        # case behind an index lookup that always raised, so that branch never
        # ran; the single loop below handles both cases.)
        idx = clean_title_list.index(title)
        next_title = clean_title_list[idx + 1] if idx + 1 < len(clean_title_list) else None
        for sibling in title.next_elements:
            if next_title is not None and sibling == next_title:
                break
            if str(sibling) == '\n':
                continue
            if sibling == title.string:
                continue
            # Keep only text-bearing <li>, <p> and <br> elements.
            if sibling.name is None or (isinstance(sibling, bs4.element.Tag)
                                        and sibling.name not in ('li', 'p', 'br')):
                continue
            if sibling.name == 'li' and sibling.find_previous('p'):
                # Prefix each list item with its introductory paragraph so the
                # item stays meaningful out of context.
                parent = ' '.join(sibling.find_previous('p').text.split())
                text = ' '.join(sibling.get_text().split())
                currentSibling = f"{parent} {text}"
            else:
                currentSibling = ' '.join(sibling.get_text().split())
            if len(currentSibling) != 0:
                # Normalise trailing punctuation so every stored segment reads
                # as a sentence.
                if currentSibling[-1].isalpha() or currentSibling[-1] == ")":
                    currentSibling = currentSibling + "."
                elif currentSibling[-1] in (";", ":", ","):
                    currentSibling = currentSibling[:-1] + "."
                with open('./SEM/txt/' + pathName[:-5] + mark_txt.get(mark[0]), "a", encoding='utf-8') as f:
                    f.write(currentSibling)
                    f.write("\n")
    # Handle paragraphs that are not covered by any heading: remove the
    # processed headings from the soup, then classify whatever text remains.
    remaining_soup = soup  # keep the soup structure for the whole page
    for title in clean_title_list:
        title.extract()  # remove each processed heading
    removeUnneccessaryElements(remaining_soup)
    remaining_segments = makeCoarseSegments(remaining_soup)
    for seg in remaining_segments:
        seg_clean = ' '.join(seg.split())
        if len(seg_clean) != 0:
            try:
                mark = clf.predict(tf.transform([seg_clean]))
                with open('./SEM/txt/' + pathName[:-5] + mark_txt.get(mark[0]), "a", encoding='utf-8') as f:
                    f.write(seg_clean)
                    f.write("\n")
            except Exception:
                continue
    # # Handle the content after the last heading.
    # if clean_title_list:
    #     last_title = clean_title_list[-1]
    #     content_after_last_title = []
    #     for element in last_title.next_siblings:
    #         content_after_last_title.append(element)
    #     content_after_last_title = ''.join([str(elem) for elem in content_after_last_title])
    #     # Classify and store the content that sits outside any heading.
    #     process_content_outside_heading(content_after_last_title, pathName)
    return type, security, right, specialArea, specialGroup, update, retention, useData


def write_text_without_label(text, pathName):
    # NOTE: this writer appends under './txt/...' while the writers above use
    # './SEM/txt/...'; the differing base directory is kept as in the original.
    with open('./txt/' + pathName[:-5] + '/data_types.txt', "a", encoding='utf-8') as f:
        currentSibling = str(text)
        if len(currentSibling) != 0:
            if currentSibling[-1].isalpha() or currentSibling[-1] == ")":
                currentSibling = currentSibling + "."
            elif currentSibling[-1] == ";":
                # Strings are immutable, so rebuild instead of assigning to [-1].
                currentSibling = currentSibling[:-1] + "."
            f.write(currentSibling)


def removeUnneccessaryElements(soup):
    # (Function name, including its original spelling, is kept so existing
    # callers keep working.)
    # Strip whole element types that never carry policy text.
    for element in soup(["script", "style", "nav", "footer", "header", "img",
                         "option", "select", "head", "button"]):
        element.extract()  # rip it out
    # Strip navigation chrome matched by class / id / role / data attributes.
    removal_rules = [
        ("div", {'class': 'footer'}),
        ("div", {'class': re.compile(r"sidebar")}),
        ("div", {'data-testid': re.compile(r"ax-navigation-menubar")}),
        ("div", {'class': re.compile(r"menu")}),
        ("li", {'class': re.compile(r"menu")}),
        ("p", {'class': re.compile(r"heading")}),
        ("p", {'class': re.compile(r"fw-bold")}),
        ("ul", {'class': re.compile(r"menu")}),
        ("div", {'class': re.compile(r"header")}),
        ("div", {'data-referrer': re.compile(r"page_footer")}),
        ("div", {'id': 'footer'}),
        ("div", {'id': re.compile(r"sidebar")}),
        ("div", {'id': re.compile(r"menu")}),
        ("li", {'id': re.compile(r"menu")}),
        ("ul", {'id': re.compile(r"menu")}),
        ("div", {'id': re.compile(r"header")}),
        ("div", {'id': re.compile(r"breadcrumbs")}),
        ("div", {'id': re.compile(r"instagram")}),
        ("div", {'role': re.compile(r"navigation")}),
        ("div", {'role': re.compile(r"banner")}),
        ("div", {'role': re.compile(r"button")}),
        ("ul", {'role': re.compile(r"navigation")}),
    ]
    for tag, attrs in removal_rules:
        for element in soup.find_all(tag, attrs):
            element.decompose()


def makeCoarseSegments(soup):
    def endAsSentence(text):
        # Normalise trailing punctuation so every segment reads as a sentence.
        if text[-1].isalpha() or text[-1] == ")":
            return text + "."
        if text[-1] in (";", ":", ","):
            return text[:-1] + "."
        return text

    segments = []
    # Paragraphs not immediately followed by a <ul> become standalone segments;
    # list-introducing paragraphs are merged with their items below instead.
    for p in soup.find_all("p"):
        if p.find_next() is not None:
            if p.find_next().name != "ul":
                text = ' '.join(p.get_text().split())
                if len(text) != 0:
                    segments.append(endAsSentence(text))
    # Each <li> becomes one segment, prefixed with its introductory paragraph
    # (when one exists) so the item stays meaningful out of context.
    for ul in soup.find_all("ul"):
        if ul.find_previous('p') is not None:
            parent = ' '.join(ul.find_previous('p').text.split())
            for element in ul.findChildren('li'):
                text = ' '.join(element.get_text().split())
                listElement = f"{parent} {text}"
                if len(listElement) != 0:
                    segments.append(endAsSentence(listElement))
        else:
            for element in ul.findChildren('li'):
                text = ' '.join(element.get_text().split())
                if len(text) != 0:
                    segments.append(endAsSentence(text))
    # if not segments:
    #     text = soup.getText().replace('\n', '').replace('↵', '')
    #     result = useAlgorithm(text)
    # else:
    #     result = segments
    return segments
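

if __name__ == "__main__":
    # Minimal usage sketch. Assumptions (not prescribed by this module): the
    # file name "example_policy.html" and the h1-h3 heading tags are
    # illustrative only, and the './SEM/txt/<name>/' output directory must
    # already exist, since the writers open files in append mode without
    # creating directories.
    with open("example_policy.html", encoding="utf-8") as fh:
        page_soup = BeautifulSoup(fh.read(), "html.parser")
    removeUnneccessaryElements(page_soup)
    headings = page_soup.find_all(["h1", "h2", "h3"])
    flags = write_text(headings, "example_policy.html", page_soup)
    print("coverage flags (type, security, right, specialArea, specialGroup,"
          " update, retention, useData):", flags)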