import os
import re
from urllib.parse import urlparse, unquote

import requests
from bs4 import BeautifulSoup

import streamlit as st
from openai import OpenAI
from qdrant_client import QdrantClient
import edgedb


def extract_content(text, tag):
    """Extract the content enclosed by the given tag from a text."""
    pattern = rf'<{tag}>(.*?)</{tag}>'
    match = re.search(pattern, text, re.DOTALL)
    return match.group(1).strip() if match else None
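# Illustrative example (not part of the original flow; tag name and text are made up):
#   extract_content("<answer>42</answer>", "answer")  -> "42"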


def fetch_webpage_content(url):
    """Fetch the HTML of a URL and return its main content, with specific card
    links disabled in place while their original hrefs are kept for building
    suggestions."""
    try:
        # Timeout added so a slow or unresponsive page cannot hang the app.
        response = requests.get(url, timeout=30)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

        # The useful page content is expected inside <div id="main">.
        main_content = soup.find('div', id='main')

        if main_content:
            # Keep the original card hrefs before disabling them (href=True skips
            # anchors without an href attribute).
            original_links = [
                link['href']
                for link in main_content.find_all('a', class_='govbr-card-content', href=True)
            ]

            # Disable the cards so they cannot be clicked inside the app.
            for link in main_content.find_all('a', class_='govbr-card-content'):
                link['href'] = 'javascript:void(0);'
                link['style'] = 'pointer-events: none; cursor: not-allowed;'

            # Drop decorative header blocks that add no useful text.
            for unwanted_element in main_content.find_all('div', class_='outstanding-header'):
                unwanted_element.decompose()

            # Replace the whole <body> with just the main content.
            body_tag = soup.find('body')
            if body_tag:
                body_tag.clear()
                body_tag.append(main_content)

            return str(soup), original_links

        return "<html><body><p>Could not find main content on the page.</p></body></html>", []

    except requests.RequestException as e:
        return f"<html><body><p>Error fetching the webpage: {str(e)}</p></body></html>", []


def extract_links(html_content):
    """Extract all link URLs (href values) from an HTML document."""
    soup = BeautifulSoup(html_content, 'html.parser')
    return [a_tag['href'] for a_tag in soup.find_all('a', href=True)]


def url_to_suggestion(url):
    """Turn a long URL into a short, human-friendly suggestion."""
    path = unquote(urlparse(url).path).strip('/').split('/')

    # Only URLs deep enough to have both a section and a subsection become suggestions.
    if len(path) > 2:
        section = path[-2].replace('-', ' ').title()
        subsection = path[-1].replace('-', ' ').title()

        return f"Acesso à seção '{section}' sobre '{subsection}'"
    return None
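# Illustrative example (the URL below is a made-up gov.br-style path):
#   url_to_suggestion("https://www.gov.br/pt-br/servicos/obter-passaporte-comum")
#   -> "Acesso à seção 'Servicos' sobre 'Obter Passaporte Comum'"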


@st.cache_resource
def connect_to_services():
    """Create and cache the OpenAI, Qdrant and EdgeDB clients used by the app."""
    oa_client = OpenAI(
        api_key=os.environ.get("OPENAI_API_KEY")
    )

    qdrant_client = QdrantClient(
        url=os.environ.get("QDRANT_URL"),
        api_key=os.environ.get("QDRANT_KEY")
    )

    # EdgeDB resolves its connection settings from the environment/project config.
    edgedb_client = edgedb.create_client()

    return oa_client, qdrant_client, edgedb_client
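

# A minimal sketch (not part of the original module) of how these helpers can be
# combined: fetch a page, keep the original card links, and turn each link into a
# friendly suggestion. The URL is a placeholder, and no OpenAI/Qdrant/EdgeDB
# connection is needed for this part.
if __name__ == "__main__":
    html, links = fetch_webpage_content("https://www.gov.br/pt-br/servicos")
    suggestions = [s for s in (url_to_suggestion(link) for link in links) if s]
    print(suggestions)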