import os
import re
import csv
import json
import requests
from bs4 import BeautifulSoup


def clean_text(text):
    """
    Limpa o texto removendo quebras de linha e espaços extras.
    """
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'\s*([\.:])\s*', r'\1', text)
    text = re.sub(r'\.(?![A-Za-z])', '.\n', text)
    text = re.sub(r';', ';\n', text)
    text = re.sub(r'(#+)', r'\n\1', text)
    text = re.sub(r'\$\$\$', '\n', text)
    return text


def format_for_markdown(soup: BeautifulSoup):
    """
    Formata um arquivo soup para markdown.
    """
    for tag in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']):
        if tag.name == 'h1':
            tag.insert(0, '#')
            text = tag.text
            text = re.sub(r'#\s+', '# ', text)
            tag.string = text.strip() + "$$$"
        elif tag.name == 'h2':
            tag.insert(0, '##')
            text = tag.text
            text = re.sub(r'#\s+', '## ', text)
            tag.string = text.strip() + "$$$"
        elif tag.name == 'h3':
            tag.insert(0, '###')
            text = tag.text
            text = re.sub(r'#\s+', '### ', text)
            tag.string = text.strip() + "$$$"
        elif tag.name == 'h4': 
            tag.insert(0, '####')
            text = tag.text
            text = re.sub(r'#\s+', '#### ', text)
            tag.string = text.strip() + "$$$"
        elif tag.name == 'h5':
            tag.insert(0, '#####')
            text = tag.text.trim()
            text = re.sub(r'#\s+', '##### ', text)
            tag.string = text.strip() + "$$$"
        elif tag.name == 'h6':
            tag.insert(0, '######')
            text = tag.text
            text = re.sub(r'#\s+', '###### ', text)
            tag.string = text.strip() + "$$$"

    for tag in soup.find_all(['li']):
        tag.insert(0, '-')
        text = tag.text
        text = re.sub(r'-\s+', '- ', text)
        tag.string = text.strip()

    for tag in soup.find_all('img'):
        tag.insert(0, f'![{tag.get("alt", "")}](')
        tag.append(tag["src"])
        tag.append(')')
 
    for tag in soup.find_all('a'):
        tag.insert(0, '[')
        tag.append(f']({tag["href"]})')
 
    for tag in soup.find_all(['p', 'span']):
        text = tag.text
        tag.string = text.strip() + "$$$"

    text = soup.get_text()

    cleaned_text = clean_text(text)

    # for tag in soup.find_all('video'):
    #     tag.insert(0, f'![{tag.get("alt", "")}](')
    #     tag.append(f')')

    # for tag in soup.find_all('iframe'):
    #     tag.insert(0, f'![{tag.get("alt", "")}](')
    #     tag.append(f')')

    return cleaned_text


def youtube_link(src):
    """
    Gera um link do YouTube a partir de um identificador de vídeo no link.
    """
    youtube_id_match = re.search(r'/([a-zA-Z0-9_-]{11})\.html$', src)
    return f'https://www.youtube.com/watch?v={youtube_id_match.group(1)}' if youtube_id_match else ''


def save_links_to_csv(links, csv_file):
    """
    Salva uma lista de links em um arquivo CSV.
    """
    with open(csv_file, 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(["Link"])
        for link in links:
            writer.writerow([link])


def find_best_match(link, all_paths):
    """
    Encontra o melhor caminho que corresponde ao link fornecido.
    """
    best_match = ""
    max_overlap = 0

    for path in all_paths:
        max_len = min(len(link), len(path))
        overlap_length = 0
        for i in range(1, max_len + 1):
            if link[-i:] == path[-i:]:
                overlap_length = i
            else:
                break

        if overlap_length > max_overlap:
            max_overlap = overlap_length
            best_match = path

    return best_match if best_match else link


def extract_html_info(all_paths, root_dir, html_file, base_url):
    """
    Extrai informações de um arquivo HTML, como imagens, vídeos, arquivos e links governamentais.
    """
    # Remove './' ou '../' do início do diretório raiz
    root_dir = root_dir.lstrip('./').lstrip('../')
    absolute_path = os.path.abspath(html_file)

    # Encontra o caminho relativo a partir do diretório raiz
    folder_index = absolute_path.find(root_dir)
    folder_path = absolute_path[folder_index:] if folder_index != -1 else absolute_path

    # Separa o caminho da pasta e o nome do arquivo
    folder_path, file_name = os.path.split(folder_path)
    folder_path = folder_path.rstrip('/')

    with open(html_file, 'r', encoding='utf-8') as file:
        html_content = file.read()

    soup = BeautifulSoup(html_content, 'html.parser')

    images, images_names = [], []
    files, gov_links = [], []
    file_extensions = ['.pdf', '.docx', '.xlsx', '.pptx']

    # Procura por links dentro do HTML
    for a_tag in soup.find_all('a', href=True):
        href = a_tag['href']
        if 'gov.br' in href:
            gov_links.append(href)

        img_tag = a_tag.find('img')
        if img_tag:
            src = img_tag['src']
            path = find_best_match(src, all_paths)

            # Armazena informações da imagem
            image_info = {
                "name": img_tag.get('alt', ''),
                "path": path,
                "url": src,
                "hyperlink": a_tag.get('href'),
                "alt": img_tag.get('alt', '')
            }
            images.append(image_info)
            images_names.append(img_tag.get('alt', ''))

        # Verifica se o link é um arquivo com extensão conhecida
        if any(ext in href for ext in file_extensions):
            file_info = {
                "name": a_tag.get_text().strip(),
                "url": href,
                "hyperlink": '',
                "alt": a_tag.get('alt', '')
            }
            files.append(file_info)

    # Procura por imagens não encapsuladas em links
    for img_tag in soup.find_all('img'):
        name = img_tag.get('alt', '')
        if name not in images_names:
            src = img_tag['src']
            path = find_best_match(src, all_paths)
            image_info = {
                "name": name,
                "path": path,
                "url": src,
                "hyperlink": img_tag.get('href', ''),
                "alt": img_tag.get('alt', '')
            }
            images.append(image_info)

    videos = []
    # Procura por vídeos e iframes para identificar links de vídeo
    for video_tag in soup.find_all(['video', 'iframe']):
        src = video_tag.get('src')
        hyperlink = youtube_link(src)
        video_info = {
            "name": video_tag.get('alt', video_tag.get('title', '')),
            "url": src,
            "hyperlink": hyperlink,
            "alt": video_tag.get('alt', '')
        }
        videos.append(video_info)
    
    text = format_for_markdown(soup)

    # Constrói a URL absoluta para os links
    relative_path = os.path.relpath(absolute_path, 'downloaded_files').replace('cleaned_', '')
    absolute_url = os.path.join(base_url, relative_path).replace("\\", "/").rstrip('index.html').rstrip('.html')
    absolute_url = re.sub(r'.*?gov\.br/(.*)', r'\1', absolute_url)
    absolute_url = re.sub(r'^.*?gov\.br/', '', absolute_url)

    if not absolute_url.startswith(('www.gov.br/', 'gov.br/')):
        absolute_url = 'https://www.gov.br/' + absolute_url
    elif absolute_url.startswith('www.gov.br/'):
        absolute_url = 'https://' + absolute_url
    elif absolute_url.startswith('gov.br/'):
        absolute_url = 'https://www.' + absolute_url

    # Atualiza links governamentais com URLs absolutas
    for i, link in enumerate(gov_links):
        if link.endswith('.html'):
            link = link.lstrip('./').lstrip('../').rstrip('index.html').rstrip('.html')
            if requests.get(absolute_url + link.lstrip('/')).status_code != 404:
                gov_links[i] = absolute_url + link.lstrip('/')

    # Cria um dicionário com todos os dados extraídos
    data = {
        "absolute_url": absolute_url,
        "gov_links": gov_links,
        "images": images,
        "videos": videos,
        "files": files,
        "text": text
    }

    return data, gov_links


def save_to_json(data, json_file):
    """
    Salva dados em um arquivo JSON.
    """
    with open(json_file, 'w', encoding='utf-8') as file:
        json.dump(data, file, indent=4, ensure_ascii=False)


if __name__ == '__main__':
    # Define os parâmetros para o processamento
    root_dir = './downloaded_files'
    base_url = "https://www.gov.br"
    html_file = './extraido_main_content.html'
    json_file = './saida.json'
    csv_file = './links.csv'
    all_paths = ['/downloaded_files']

    # Extrai informações do HTML e salva os resultados
    html_data, gov_links = extract_html_info(all_paths, root_dir, html_file, base_url)
    save_to_json(html_data, json_file)
    save_links_to_csv(gov_links, csv_file)