|
import streamlit as st |
|
from htmlrag import clean_html, build_block_tree |
|
from htmlrag import EmbedHTMLPruner |
|
from htmlrag import GenHTMLPruner |
|
import torch |
|
|
|
|
|
st.title("HtmlRAG Demo - HTML Cleaning and Query Rewriting")

# ---------------------------------------------------------------------------
# Section 1: upload an HTML document, clean it, and list its block tree.
# ---------------------------------------------------------------------------
st.header("HTML Document Comparison Before and After Cleaning")

# Accept both common HTML extensions so ".htm" pages are not rejected.
html_file = st.file_uploader("Upload an HTML file", type=["html", "htm"])

if html_file is not None:
    # Uploaded pages are not guaranteed to be valid UTF-8. Substitute
    # undecodable bytes instead of letting UnicodeDecodeError abort the run.
    raw_html = html_file.getvalue().decode("utf-8", errors="replace")
    st.subheader("Original HTML Content")
    st.code(raw_html, language="html")

    # Everything below is rendered only when st.button(...) evaluates truthy.
    if st.button("Clean HTML"):
        simplified_html = clean_html(raw_html)
        st.subheader("Cleaned HTML Content")
        st.code(simplified_html, language="html")

        st.subheader("Comparison")
        # NOTE(review): no code in this section highlights the differences
        # this caption promises; the original and cleaned documents are only
        # shown one after the other. Confirm whether a real diff is intended.
        st.write(
            "The parts removed or compressed will be highlighted in the cleaned version."
        )

        # build_block_tree returns two values; the second is rebound to
        # simplified_html. Each block is indexed positionally below:
        # [0] content, [1] path, [2] leaf flag.
        block_tree, simplified_html = build_block_tree(
            simplified_html, max_node_words=10
        )
        st.subheader("Block Tree")
        for block in block_tree:
            st.write(f"Block Content: {block[0]}")
            st.write(f"Block Path: {block[1]}")
            st.write(f"Is Leaf: {block[2]}")
            st.write("---")
|
|
|
|
|
# ---------------------------------------------------------------------------
# Section 2: show a query, its rewritten form, and sample search results.
# ---------------------------------------------------------------------------
st.header("Query Rewriting and Web Search Results Visualization")

query = st.text_input("Enter a query:")

# Ignore whitespace-only input so a blank-looking query renders nothing.
if query.strip():
    # Placeholder: the query is passed through unchanged; no rewriting is
    # performed in this script.
    rewritten_query = query
    st.subheader("Rewritten Query")
    st.write(rewritten_query)

    # Hard-coded sample results. The "Result N:" label is added by the loop
    # below, so the entries hold only the result text; previously each entry
    # carried its own label and the label was displayed twice.
    search_results = [
        "Bellagio is a luxury hotel.",
        "It was built in 1998.",
        "The Bellagio is on the Las Vegas Strip.",
    ]

    st.subheader("Search Results")
    for rank, result in enumerate(search_results, start=1):
        st.write(f"Result {rank}:")
        st.write(result)
        st.write("---")

    st.subheader("Query Comparison")
    st.write(f"Original Query: {query}")
    st.write(f"Rewritten Query: {rewritten_query}")
|
|