blazingbunny committed on
Commit
c2d57f1
·
verified ·
1 Parent(s): 6a96128

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +24 -21
app.py CHANGED
@@ -1,30 +1,23 @@
1
  import streamlit as st
2
  import advertools as adv
3
  import pandas as pd
4
- import os
5
 
6
  def extract_headers(url):
7
  try:
8
  # Define the output file path
9
- output_file = "crawl_data.jl"
10
-
11
- # Crawl the webpage
12
- adv.crawl(url, output_file=output_file, follow_links=False)
13
-
14
- # Load the crawl data from the output file
15
- crawl_data = pd.read_json(output_file, lines=True)
16
-
17
- # Extract headers from the HTML content
18
- headers = []
19
- for _, row in crawl_data.iterrows():
20
- html_content = row['body']
21
- # Using Pandas to parse headers
22
- for header_tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
23
- headers.extend(pd.read_html(f'<{header_tag}>{html_content}</{header_tag}>', header=0)[0].values.flatten())
24
-
25
- # Remove duplicate headers and empty values
26
- headers = [header for header in headers if header and isinstance(header, str)]
27
- return list(set(headers))
28
 
29
  except Exception as e:
30
  return str(e)
@@ -35,4 +28,14 @@ def main():
35
  url = st.text_input("Enter the URL of the web page:")
36
  if st.button("Extract Headers"):
37
  if url:
38
- headers = extract_headers(url)
 
 
 
 
 
 
 
 
 
 
 
1
  import streamlit as st
2
  import advertools as adv
3
  import pandas as pd
 
def extract_headers(url):
    """Crawl *url* and return the h1-h6 heading texts found on the page.

    Returns a single-column pandas DataFrame of header strings on success
    (so callers can test ``isinstance(result, pd.DataFrame)``), or the
    error message as a plain ``str`` if the crawl or parsing fails.
    """
    # Local imports keep the module's top-level import block unchanged.
    import os
    import tempfile

    try:
        # Crawl into a fresh temp file: advertools *appends* to an
        # existing .jl output file, so reusing a fixed name would mix
        # stale results from earlier runs into this one.
        with tempfile.TemporaryDirectory() as tmp_dir:
            output_file = os.path.join(tmp_dir, "crawl_output.jl")

            # NOTE(review): follow_links=True walked the entire site for
            # what the UI presents as a single-page header extractor;
            # restrict the crawl to the requested URL only.
            adv.crawl(url, output_file=output_file, follow_links=False)

            # Load the crawl results (one JSON object per line).
            crawl_df = pd.read_json(output_file, lines=True)

        # advertools stores each heading level in an 'h1'..'h6' column,
        # joining multiple headings of the same level with '@@'.
        # Match exactly h1-h6: a bare startswith('h') would also pick up
        # unrelated columns (e.g. 'hreflang').
        wanted = {"h1", "h2", "h3", "h4", "h5", "h6"}
        header_cols = [col for col in crawl_df.columns if col in wanted]
        if not header_cols:
            return pd.DataFrame(columns=["header"])

        # stack() flattens the heading columns into one Series and drops
        # missing levels; then split the '@@'-joined values and explode.
        # (The previous per-column apply + DataFrame.dropna() discarded a
        # whole row whenever any single level was missing.)
        headers = (
            crawl_df[header_cols]
            .stack()
            .str.split("@@")
            .explode()
            .dropna()
            .reset_index(drop=True)
        )
        return headers.to_frame(name="header")

    except Exception as e:
        # Surface the failure as a string; main() distinguishes success
        # from error by the returned object's type.
        return str(e)
 
28
  url = st.text_input("Enter the URL of the web page:")
29
  if st.button("Extract Headers"):
30
  if url:
31
+ headers = extract_headers(url)
32
+ if isinstance(headers, pd.DataFrame) and not headers.empty:
33
+ st.write("Extracted Headers:")
34
+ st.write(headers)
35
+ else:
36
+ st.error("No headers found or an error occurred.")
37
+ else:
38
+ st.error("Please enter a valid URL.")
# Launch the Streamlit app only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()