Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
from htmlrag import clean_html, build_block_tree
|
3 |
+
from htmlrag import EmbedHTMLPruner
|
4 |
+
from htmlrag import GenHTMLPruner
|
5 |
+
import torch
|
6 |
+
|
7 |
+
# Page title shown at the top of the Streamlit app.
st.title("HtmlRAG Demo - HTML Cleaning and Query Rewriting")

# Section 1: show an uploaded HTML document before and after HtmlRAG cleaning.
st.header("HTML Document Comparison Before and After Cleaning")

# Upload an HTML document; nothing below runs until a file has been chosen.
html_file = st.file_uploader("Upload an HTML file", type=["html"])

if html_file is not None:
    # Decode leniently: uploads are not guaranteed to be valid UTF-8, and a
    # strict decode would abort the whole page with UnicodeDecodeError.
    raw_html = html_file.getvalue().decode("utf-8", errors="replace")
    st.subheader("Original HTML Content")
    st.code(raw_html, language="html")

    # The cleaning pipeline only runs on the rerun triggered by the button.
    if st.button("Clean HTML"):
        # Clean the HTML using HtmlRAG's clean_html function.
        simplified_html = clean_html(raw_html)
        st.subheader("Cleaned HTML Content")
        st.code(simplified_html, language="html")

        # Report what cleaning actually changed. The app does no highlighting,
        # so state measurable facts instead of promising a visual diff.
        st.subheader("Comparison")
        original_len = len(raw_html)
        cleaned_len = len(simplified_html)
        st.write(f"Original length: {original_len:,} characters")
        st.write(f"Cleaned length: {cleaned_len:,} characters")
        if original_len:  # avoid dividing by zero on an empty upload
            reduction = 1 - cleaned_len / original_len
            st.write(f"Size reduction: {reduction:.1%}")

        # Build the HTML block tree and display one entry per block.
        block_tree, simplified_html = build_block_tree(
            simplified_html, max_node_words=10
        )
        st.subheader("Block Tree")
        for block in block_tree:
            # Each entry is indexed as (content, path, is_leaf).
            content, path, is_leaf = block[:3]
            st.write(f"Block Content: {content}")
            st.write(f"Block Path: {path}")
            st.write(f"Is Leaf: {is_leaf}")
            st.write("---")
|
41 |
+
|
42 |
+
# Section 2: query rewriting and (simulated) web search results.
st.header("Query Rewriting and Web Search Results Visualization")

# Input a query. Surrounding whitespace is stripped so that an entry made only
# of spaces is treated as "no query" instead of triggering a search.
query = st.text_input("Enter a query:").strip()

if query:
    # NOTE: no rewriting is implemented yet; the query is passed through
    # unchanged so the rest of the page can be demonstrated.
    rewritten_query = query
    st.subheader("Rewritten Query")
    st.write(rewritten_query)

    # Simulated search results (static demo data, not a real web search).
    # The "Result N" label is added at display time, so the data itself
    # carries no numbering and each result is labelled exactly once.
    search_results = [
        "Bellagio is a luxury hotel.",
        "It was built in 1998.",
        "The Bellagio is on the Las Vegas Strip.",
    ]

    # Display the search results, numbered from 1.
    st.subheader("Search Results")
    for position, result in enumerate(search_results, start=1):
        st.write(f"Result {position}:")
        st.write(result)
        st.write("---")

    # Show the original and rewritten query side by side for comparison.
    st.subheader("Query Comparison")
    st.write(f"Original Query: {query}")
    st.write(f"Rewritten Query: {rewritten_query}")
|