Update app.py
Browse files
app.py
CHANGED
@@ -1,30 +1,23 @@
|
|
1 |
import streamlit as st
|
2 |
import advertools as adv
|
3 |
import pandas as pd
|
4 |
-
import os
|
5 |
|
6 |
def extract_headers(url):
|
7 |
try:
|
8 |
# Define the output file path
|
9 |
-
output_file = "
|
10 |
-
|
11 |
-
#
|
12 |
-
adv.crawl(url, output_file=output_file, follow_links=
|
13 |
-
|
14 |
-
# Load the crawl data
|
15 |
-
|
16 |
-
|
17 |
-
# Extract headers from
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
for header_tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
|
23 |
-
headers.extend(pd.read_html(f'<{header_tag}>{html_content}</{header_tag}>', header=0)[0].values.flatten())
|
24 |
-
|
25 |
-
# Remove duplicate headers and empty values
|
26 |
-
headers = [header for header in headers if header and isinstance(header, str)]
|
27 |
-
return list(set(headers))
|
28 |
|
29 |
except Exception as e:
|
30 |
return str(e)
|
@@ -35,4 +28,14 @@ def main():
|
|
35 |
url = st.text_input("Enter the URL of the web page:")
|
36 |
if st.button("Extract Headers"):
|
37 |
if url:
|
38 |
-
headers =
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import os
import tempfile

import advertools as adv
import pandas as pd
import streamlit as st
|
|
|
4 |
|
5 |
def extract_headers(url):
    """Crawl *url* with advertools and return the page headings (h1-h6).

    Parameters:
        url (str): The URL of the page to crawl.

    Returns:
        pd.DataFrame: One column per heading level found, with the
            individual heading strings exploded into rows, on success.
        str: The error message if the crawl or parsing fails (the
            Streamlit caller treats a plain string as the error signal).
    """
    try:
        # Write the crawl output into a throwaway directory: advertools
        # *appends* to an existing .jl output file, so reusing a fixed
        # path would mix results from previous runs into this one.
        with tempfile.TemporaryDirectory() as tmp_dir:
            output_file = os.path.join(tmp_dir, "crawl_output.jl")

            # Perform the crawl.  NOTE(review): follow_links=True walks
            # the whole site, not just the entered page — confirm this
            # is intended for a single-page header extractor.
            adv.crawl(url, output_file=output_file, follow_links=True)

            # Load the crawl data (one JSON object per line).
            crawl_df = pd.read_json(output_file, lines=True)

        # Select only the true heading columns.  A bare startswith('h')
        # filter would also pick up any other crawl column that happens
        # to begin with 'h'.
        heading_levels = {'h1', 'h2', 'h3', 'h4', 'h5', 'h6'}
        headers_columns = [col for col in crawl_df.columns
                           if col in heading_levels]

        # advertools joins multiple headings of one level with '@@';
        # split and explode them into one heading per row.  Drop only
        # all-NaN rows (how='all'): dropping rows with *any* NaN would
        # discard nearly everything, because pages rarely have the same
        # number of headings at every level.
        headers = (crawl_df[headers_columns]
                   .apply(lambda col: col.str.split('@@').explode())
                   .dropna(how='all')
                   .reset_index(drop=True))

        return headers
    except Exception as e:
        return str(e)
|
|
|
28 |
url = st.text_input("Enter the URL of the web page:")
|
29 |
if st.button("Extract Headers"):
|
30 |
if url:
|
31 |
+
headers = extract_headers(url)
|
32 |
+
if isinstance(headers, pd.DataFrame) and not headers.empty:
|
33 |
+
st.write("Extracted Headers:")
|
34 |
+
st.write(headers)
|
35 |
+
else:
|
36 |
+
st.error("No headers found or an error occurred.")
|
37 |
+
else:
|
38 |
+
st.error("Please enter a valid URL.")
|
39 |
+
|
40 |
+
# Launch the Streamlit app only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|