Spaces:

unb-lamfo-sgd
/

prototipo-1-rag

Sleeping

App Files Files Community

prototipo-1-rag / config /estruturado /html_extraction.py

dauid64

adicionando arquivos completos para funcionar o agente

d5116f8 3 months ago

raw

history blame contribute delete

2.07 kB

	#################################
	### Web Scraping sites Gov.Br ###
	#################################

	from bs4 import BeautifulSoup


	def clean_html(file_path):
	    """Return the cleaned main content of a Gov.br HTML page as a string.

	    The file at ``file_path`` is read as UTF-8 and parsed with the ``lxml``
	    parser. The ``<div id="main">`` element is located, a fixed set of
	    ``<div>`` and ``<dl>`` elements is removed from it, and the remaining
	    markup is returned prettified.

	    Args:
	        file_path: Path of the HTML file to read.

	    Returns:
	        The prettified HTML of the cleaned ``main`` div, or the message
	        "A div com id 'main' não foi encontrada." when the document has no
	        such div (the message is returned, not raised).
	    """
	    # Read the whole HTML file.
	    with open(file_path, 'r', encoding='utf-8') as file:
	        content = file.read()

	    soup = BeautifulSoup(content, 'lxml')

	    # Gov.br pages are expected to keep their content in <div id="main">.
	    main_div = soup.find('div', id='main')
	    if main_div is None:
	        return "A div com id 'main' não foi encontrada."

	    # When the div with this class is found, every sibling that follows it
	    # is dropped as well, truncating the content at that point.
	    truncate_after_class = 'canais-atendimento'
	    classes_to_remove = [truncate_after_class, 'listagem-servicos',
	                         'social-links', 'avaliacao-container',
	                         'govbr-recom-servico']
	    # Each entry is a space-separated set of classes a <dl> must carry.
	    dl_classes_to_remove = ['portalMessage info']

	    # Remove the first div found for each listed class.
	    for class_name in classes_to_remove:
	        div_tag = main_div.find('div', class_=class_name)
	        if div_tag is None:
	            continue
	        if class_name == truncate_after_class:
	            for sibling in div_tag.find_next_siblings():
	                sibling.extract()
	        div_tag.extract()

	    # Remove the first dl found for each listed class set. A CSS selector is
	    # used so the match does not depend on the order of the classes in the
	    # attribute or on extra classes being present; passing a string with a
	    # space to class_ only matches that exact attribute value.
	    for dl_classes in dl_classes_to_remove:
	        selector = 'dl.' + '.'.join(dl_classes.split())
	        dl_tag = main_div.select_one(selector)
	        if dl_tag is not None:
	            dl_tag.extract()

	    # Serialize what is left of the main div.
	    return main_div.prettify()


	def save_html(content, output_file_path):
	    """Write ``content`` to ``output_file_path`` as UTF-8 text.

	    The file is opened in write mode, so an existing file at that path is
	    overwritten.
	    """
	    destination = open(output_file_path, mode='w', encoding='utf-8')
	    with destination:
	        destination.write(content)


	if __name__ == '__main__':
	    # Script entry point: clean the sample page and write the extracted
	    # main content to a new HTML file.
	    source_html = './exemplo.html'
	    destination_html = './extraido_main_content.html'

	    save_html(clean_html(source_html), destination_html)