import os import re import csv import json import requests from bs4 import BeautifulSoup def clean_text(text): """ Limpa o texto removendo quebras de linha e espaços extras. """ text = re.sub(r'\s+', ' ', text) text = re.sub(r'\s*([\.:])\s*', r'\1', text) text = re.sub(r'\.(?![A-Za-z])', '.\n', text) text = re.sub(r';', ';\n', text) text = re.sub(r'(#+)', r'\n\1', text) text = re.sub(r'\$\$\$', '\n', text) return text def format_for_markdown(soup: BeautifulSoup): """ Formata um arquivo soup para markdown. """ for tag in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']): if tag.name == 'h1': tag.insert(0, '#') text = tag.text text = re.sub(r'#\s+', '# ', text) tag.string = text.strip() + "$$$" elif tag.name == 'h2': tag.insert(0, '##') text = tag.text text = re.sub(r'#\s+', '## ', text) tag.string = text.strip() + "$$$" elif tag.name == 'h3': tag.insert(0, '###') text = tag.text text = re.sub(r'#\s+', '### ', text) tag.string = text.strip() + "$$$" elif tag.name == 'h4': tag.insert(0, '####') text = tag.text text = re.sub(r'#\s+', '#### ', text) tag.string = text.strip() + "$$$" elif tag.name == 'h5': tag.insert(0, '#####') text = tag.text.trim() text = re.sub(r'#\s+', '##### ', text) tag.string = text.strip() + "$$$" elif tag.name == 'h6': tag.insert(0, '######') text = tag.text text = re.sub(r'#\s+', '###### ', text) tag.string = text.strip() + "$$$" for tag in soup.find_all(['li']): tag.insert(0, '-') text = tag.text text = re.sub(r'-\s+', '- ', text) tag.string = text.strip() for tag in soup.find_all('img'): tag.insert(0, f'![{tag.get("alt", "")}](') tag.append(tag["src"]) tag.append(')') for tag in soup.find_all('a'): tag.insert(0, '[') tag.append(f']({tag["href"]})') for tag in soup.find_all(['p', 'span']): text = tag.text tag.string = text.strip() + "$$$" text = soup.get_text() cleaned_text = clean_text(text) # for tag in soup.find_all('video'): # tag.insert(0, f'![{tag.get("alt", "")}](') # tag.append(f')') # for tag in soup.find_all('iframe'): # tag.insert(0, f'![{tag.get("alt", "")}](') # tag.append(f')') return cleaned_text def youtube_link(src): """ Gera um link do YouTube a partir de um identificador de vídeo no link. """ youtube_id_match = re.search(r'/([a-zA-Z0-9_-]{11})\.html$', src) return f'https://www.youtube.com/watch?v={youtube_id_match.group(1)}' if youtube_id_match else '' def save_links_to_csv(links, csv_file): """ Salva uma lista de links em um arquivo CSV. """ with open(csv_file, 'w', newline='', encoding='utf-8') as file: writer = csv.writer(file) writer.writerow(["Link"]) for link in links: writer.writerow([link]) def find_best_match(link, all_paths): """ Encontra o melhor caminho que corresponde ao link fornecido. """ best_match = "" max_overlap = 0 for path in all_paths: max_len = min(len(link), len(path)) overlap_length = 0 for i in range(1, max_len + 1): if link[-i:] == path[-i:]: overlap_length = i else: break if overlap_length > max_overlap: max_overlap = overlap_length best_match = path return best_match if best_match else link def extract_html_info(all_paths, root_dir, html_file, base_url): """ Extrai informações de um arquivo HTML, como imagens, vídeos, arquivos e links governamentais. """ # Remove './' ou '../' do início do diretório raiz root_dir = root_dir.lstrip('./').lstrip('../') absolute_path = os.path.abspath(html_file) # Encontra o caminho relativo a partir do diretório raiz folder_index = absolute_path.find(root_dir) folder_path = absolute_path[folder_index:] if folder_index != -1 else absolute_path # Separa o caminho da pasta e o nome do arquivo folder_path, file_name = os.path.split(folder_path) folder_path = folder_path.rstrip('/') with open(html_file, 'r', encoding='utf-8') as file: html_content = file.read() soup = BeautifulSoup(html_content, 'html.parser') images, images_names = [], [] files, gov_links = [], [] file_extensions = ['.pdf', '.docx', '.xlsx', '.pptx'] # Procura por links dentro do HTML for a_tag in soup.find_all('a', href=True): href = a_tag['href'] if 'gov.br' in href: gov_links.append(href) img_tag = a_tag.find('img') if img_tag: src = img_tag['src'] path = find_best_match(src, all_paths) # Armazena informações da imagem image_info = { "name": img_tag.get('alt', ''), "path": path, "url": src, "hyperlink": a_tag.get('href'), "alt": img_tag.get('alt', '') } images.append(image_info) images_names.append(img_tag.get('alt', '')) # Verifica se o link é um arquivo com extensão conhecida if any(ext in href for ext in file_extensions): file_info = { "name": a_tag.get_text().strip(), "url": href, "hyperlink": '', "alt": a_tag.get('alt', '') } files.append(file_info) # Procura por imagens não encapsuladas em links for img_tag in soup.find_all('img'): name = img_tag.get('alt', '') if name not in images_names: src = img_tag['src'] path = find_best_match(src, all_paths) image_info = { "name": name, "path": path, "url": src, "hyperlink": img_tag.get('href', ''), "alt": img_tag.get('alt', '') } images.append(image_info) videos = [] # Procura por vídeos e iframes para identificar links de vídeo for video_tag in soup.find_all(['video', 'iframe']): src = video_tag.get('src') hyperlink = youtube_link(src) video_info = { "name": video_tag.get('alt', video_tag.get('title', '')), "url": src, "hyperlink": hyperlink, "alt": video_tag.get('alt', '') } videos.append(video_info) text = format_for_markdown(soup) # Constrói a URL absoluta para os links relative_path = os.path.relpath(absolute_path, 'downloaded_files').replace('cleaned_', '') absolute_url = os.path.join(base_url, relative_path).replace("\\", "/").rstrip('index.html').rstrip('.html') absolute_url = re.sub(r'.*?gov\.br/(.*)', r'\1', absolute_url) absolute_url = re.sub(r'^.*?gov\.br/', '', absolute_url) if not absolute_url.startswith(('www.gov.br/', 'gov.br/')): absolute_url = 'https://www.gov.br/' + absolute_url elif absolute_url.startswith('www.gov.br/'): absolute_url = 'https://' + absolute_url elif absolute_url.startswith('gov.br/'): absolute_url = 'https://www.' + absolute_url # Atualiza links governamentais com URLs absolutas for i, link in enumerate(gov_links): if link.endswith('.html'): link = link.lstrip('./').lstrip('../').rstrip('index.html').rstrip('.html') if requests.get(absolute_url + link.lstrip('/')).status_code != 404: gov_links[i] = absolute_url + link.lstrip('/') # Cria um dicionário com todos os dados extraídos data = { "absolute_url": absolute_url, "gov_links": gov_links, "images": images, "videos": videos, "files": files, "text": text } return data, gov_links def save_to_json(data, json_file): """ Salva dados em um arquivo JSON. """ with open(json_file, 'w', encoding='utf-8') as file: json.dump(data, file, indent=4, ensure_ascii=False) if __name__ == '__main__': # Define os parâmetros para o processamento root_dir = './downloaded_files' base_url = "https://www.gov.br" html_file = './extraido_main_content.html' json_file = './saida.json' csv_file = './links.csv' all_paths = ['/downloaded_files'] # Extrai informações do HTML e salva os resultados html_data, gov_links = extract_html_info(all_paths, root_dir, html_file, base_url) save_to_json(html_data, json_file) save_links_to_csv(gov_links, csv_file)