# NOTE(review): "Spaces: Sleeping Sleeping" is Hugging Face Spaces page residue
# from the copy/paste, not Python code — commented out so the module parses.
# Standard library
import os
from typing import Any, Dict

# Third-party
import requests
import streamlit as st
from dotenv import load_dotenv

# Local
from models import LinkNode, Status

# Load environment variables (e.g. BASE_URI) from a local .env file.
load_dotenv()
def display_map(link_map: Dict[str, Any]) -> None:
    """Render the entire link map as collapsible Streamlit sections.

    Args:
        link_map: Mapping of href -> serialized ``LinkNode`` dict, as returned
            by the crawl API. Entries that fail Pydantic validation are
            reported with ``st.error`` and skipped.
    """
    st.header("π Full Exploration Map")
    if not link_map:
        st.info("The exploration map is empty.")
        return

    # Re-hydrate the raw dicts into LinkNode models, skipping bad entries.
    validated_map = {}
    for href, dict_node in link_map.items():
        try:
            validated_map[href] = LinkNode.model_validate(dict_node)
        except Exception as e:
            st.error(f"Failed to validate data for {href}. Skipping. Error: {e}")

    # Shallow pages first, so the map reads top-down from the start URL.
    sorted_map = sorted(validated_map.items(), key=lambda item: item[1].depth)
    for href, node in sorted_map:
        st.divider()
        st.subheader(f"π [{href}]({href})")
        if node.parent:
            st.caption(f"Found on: {node.parent}")

        # One colored banner per relevance status (see models.Status).
        status = node.overview.status
        if status == Status.RELEVANT:
            st.success("**Status: RELEVANT** β ")
        elif status == Status.IRRELEVANT:
            st.warning("**Status: IRRELEVANT** β οΈ - Page deemed not relevant to search criteria.")
        elif status == Status.FAILED:
            st.error("**Status: FAILED** β - Could not scrape or analyze this page.")
        else:
            st.info("**Status: UNKNOWN** π‘")

        st.markdown("**π Summary**")
        st.info(node.overview.summary)

        with st.expander("View Full Extracted Data and Found Links"):
            st.markdown("##### π Full Extracted Data")
            overview_data = node.overview.model_dump()
            # Fixed display order; keys with falsy/missing values are omitted.
            display_order = ['details', 'required_docs', 'price', 'SLA']
            items_to_display = [
                (key.replace('_', ' ').capitalize(), str(overview_data[key]))
                for key in display_order
                if overview_data.get(key)
            ]
            for i, (title, value) in enumerate(items_to_display):
                st.markdown(f"**{title}**")
                st.markdown(value)
                if i < len(items_to_display) - 1:
                    st.markdown("---")  # separator between items, not after the last

            st.markdown("##### π Links Found on This Page")
            if node.child:
                st.write(f"Found **{len(node.child)}** link(s):")
                links_text = "\n".join(f"- {link}" for link in node.child)
                # Unique widget key per href so Streamlit state doesn't collide.
                st.text_area("Links", links_text, height=150, key=f"links_{href}")
            else:
                st.write("No valid links were found on this page.")
def main() -> None:
    """Streamlit entry point: collect settings, call the crawl API, render results.

    Posts ``{url, max_depth}`` to ``$BASE_URI/scrape`` and feeds the returned
    ``link_map`` to :func:`display_map`, then prints accumulated token usage.
    """
    st.title("π€ Browser Agent: Visa Data Extractor (Streamlit Demo)")
    st.markdown("Enter an API Key and a URL to start a recursive web crawl for structured visa information.")

    with st.sidebar:
        st.header("Configuration")
        default_url = "https://www.netherlandsworldwide.nl/visa-the-netherlands/visa-application-form"
        url = st.text_input("Starting URL (e.g., website.com)", default_url)
        max_depth = st.slider("Max Exploration Depth", min_value=1, max_value=5, value=1)
        st.markdown("""
        **Note:** Depth 1 is fast. Depth 2 or 3 can be **very slow** and consume many tokens.
        """)

    # --- Main Execution ---
    if st.button("Start Exploration and Extraction"):
        print(f"starting crawl for {url} with depth {max_depth}")
        if not url:
            st.error("Please enter a valid Starting URL.")
            return
        with st.spinner(f"Crawling {url} up to depth {max_depth}... (This may take a while)"):
            BASE_URI = os.getenv("BASE_URI", "http://localhost:5000")
            print(f"{BASE_URI}/scrape")
            try:
                result = requests.post(
                    f"{BASE_URI}/scrape",
                    headers={"Content-Type": "application/json"},
                    json={
                        "url": url,
                        "max_depth": max_depth,
                    },
                    # Deep crawls are slow, but bound the wait instead of
                    # hanging the UI forever (requests has no default timeout).
                    timeout=600,
                )
            except requests.exceptions.ConnectionError:
                st.error(f"Connection Error: Could not connect to the Flask API at {BASE_URI}. Please ensure your Flask app is running (e.g., `flask run`).")
                return
            except Exception as e:
                st.exception(f"An unexpected error occurred during the crawl: {e}")
                return

            if result.status_code != 200:
                st.error(f"Exploration failed with status {result.status_code}: {result.text}")
                return

            data = result.json()
            display_map(data.get("link_map", {}))

            st.subheader("π° Accumulated Token Usage (All LLM Calls)")
            token_usage = data.get("token_usage", {"input": 0, "output": 0, "total": 0})
            # Use .get with defaults so a partial payload (missing keys)
            # can't raise KeyError after we already defaulted the dict above.
            st.write(f"**Input Tokens:** {token_usage.get('input', 0)}")
            st.write(f"**Output Tokens:** {token_usage.get('output', 0)}")
            st.write(f"**Total Tokens:** {token_usage.get('total', 0)}")
# Script entry point: run the Streamlit demo when executed directly.
if __name__ == "__main__":
    main()