"""Streamlit demo: HTML cleaning with HtmlRAG and a mocked query/search view.

The page has two independent sections:

1. Upload an HTML file, show it, and on request clean it with
   ``htmlrag.clean_html`` and list the blocks returned by
   ``htmlrag.build_block_tree``.
2. Enter a query and show it next to a fixed list of sample search results.
   No query rewriting or web search is performed here; both are placeholders.
"""

import streamlit as st
from htmlrag import clean_html, build_block_tree
# NOTE: the two pruners and torch are not used by the code below; the imports
# are retained as-is in case other code relies on them.
from htmlrag import EmbedHTMLPruner
from htmlrag import GenHTMLPruner
import torch

# Title
st.title("HtmlRAG Demo - HTML Cleaning and Query Rewriting")

# ---------------------------------------------------------------------------
# HTML document comparison before and after cleaning
# ---------------------------------------------------------------------------
st.header("HTML Document Comparison Before and After Cleaning")

# Upload an HTML document (both common extensions are accepted).
html_file = st.file_uploader("Upload an HTML file", type=["html", "htm"])

if html_file is not None:
    # Decode the upload. Real-world HTML is not always UTF-8, so fall back to
    # replacement decoding instead of crashing the app with UnicodeDecodeError.
    raw_bytes = html_file.getvalue()
    try:
        raw_html = raw_bytes.decode("utf-8")
    except UnicodeDecodeError:
        st.warning(
            "The uploaded file is not valid UTF-8; "
            "undecodable bytes were replaced."
        )
        raw_html = raw_bytes.decode("utf-8", errors="replace")

    # Display the original HTML content
    st.subheader("Original HTML Content")
    st.code(raw_html, language="html")

    # HtmlRAG cleaning.
    # NOTE(review): st.button is only True on the rerun triggered by the
    # click, so this output disappears on the next interaction (e.g. typing a
    # query below). Persist it in st.session_state if it should stay visible.
    if st.button("Clean HTML"):
        # Clean the HTML using the clean_html function
        simplified_html = clean_html(raw_html)
        st.subheader("Cleaned HTML Content")
        st.code(simplified_html, language="html")

        # Compare the original HTML and the cleaned HTML.
        # NOTE(review): only this caption is shown; no diff or highlighting
        # is computed by this script.
        st.subheader("Comparison")
        st.write(
            "The parts removed or compressed will be highlighted in the cleaned version."
        )

        # Build the HTML block tree and display it. This must live inside the
        # button branch because it consumes the cleaned HTML produced above.
        block_tree, simplified_html = build_block_tree(
            simplified_html, max_node_words=10
        )
        st.subheader("Block Tree")
        for block in block_tree:
            # Each block is indexed positionally: content, path, leaf flag.
            st.write(f"Block Content: {block[0]}")
            st.write(f"Block Path: {block[1]}")
            st.write(f"Is Leaf: {block[2]}")
            st.write("---")

# ---------------------------------------------------------------------------
# Query rewriting and web search results visualization
# ---------------------------------------------------------------------------
st.header("Query Rewriting and Web Search Results Visualization")

# Input a query
query = st.text_input("Enter a query:")

if query:
    # Display the rewritten query. Placeholder: the query is passed through
    # unchanged, no rewriting step is applied.
    rewritten_query = query
    st.subheader("Rewritten Query")
    st.write(rewritten_query)

    # Simulate fetching search results (fixed sample data, independent of
    # the query).
    search_results = [
        "Result 1: Bellagio is a luxury hotel.",
        "Result 2: It was built in 1998.",
        "Result 3: The Bellagio is on the Las Vegas Strip.",
    ]

    # Display the search results
    st.subheader("Search Results")
    for position, result in enumerate(search_results, start=1):
        st.write(f"Result {position}:")
        st.write(result)
        st.write("---")

    # Display a comparison of the original query and rewritten query
    st.subheader("Query Comparison")
    st.write(f"Original Query: {query}")
    st.write(f"Rewritten Query: {rewritten_query}")