blazingbunny committed on
Commit
c2d57f1
·
verified ·
1 Parent(s): 6a96128

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +24 -21
app.py CHANGED
@@ -1,30 +1,23 @@
1
  import streamlit as st
2
  import advertools as adv
3
  import pandas as pd
4
- import os
5
 
6
  def extract_headers(url):
7
  try:
8
  # Define the output file path
9
- output_file = "crawl_data.jl"
10
-
11
- # Crawl the webpage
12
- adv.crawl(url, output_file=output_file, follow_links=False)
13
-
14
- # Load the crawl data from the output file
15
- crawl_data = pd.read_json(output_file, lines=True)
16
-
17
- # Extract headers from the HTML content
18
- headers = []
19
- for _, row in crawl_data.iterrows():
20
- html_content = row['body']
21
- # Using Pandas to parse headers
22
- for header_tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
23
- headers.extend(pd.read_html(f'<{header_tag}>{html_content}</{header_tag}>', header=0)[0].values.flatten())
24
-
25
- # Remove duplicate headers and empty values
26
- headers = [header for header in headers if header and isinstance(header, str)]
27
- return list(set(headers))
28
 
29
  except Exception as e:
30
  return str(e)
@@ -35,4 +28,14 @@ def main():
35
  url = st.text_input("Enter the URL of the web page:")
36
  if st.button("Extract Headers"):
37
  if url:
38
- headers = extract_headers(url)
 
 
 
 
 
 
 
 
 
 
 
1
  import streamlit as st
2
  import advertools as adv
3
  import pandas as pd
 
def extract_headers(url):
    """Crawl *url* and return the h1-h6 heading texts found on the page.

    Returns a single-column pandas DataFrame of header strings on success
    (so callers can test ``isinstance(result, pd.DataFrame)``), or the
    error message as a plain ``str`` if the crawl or parsing fails.
    """
    # Local imports keep the module's top-level import block unchanged.
    import os
    import tempfile

    try:
        # Crawl into a fresh temp file: advertools *appends* to an
        # existing .jl output file, so reusing a fixed name would mix
        # stale results from earlier runs into this one.
        with tempfile.TemporaryDirectory() as tmp_dir:
            output_file = os.path.join(tmp_dir, "crawl_output.jl")

            # NOTE(review): follow_links=True walked the entire site for
            # what the UI presents as a single-page header extractor;
            # restrict the crawl to the requested URL only.
            adv.crawl(url, output_file=output_file, follow_links=False)

            # Load the crawl results (one JSON object per line).
            crawl_df = pd.read_json(output_file, lines=True)

        # advertools stores each heading level in an 'h1'..'h6' column,
        # joining multiple headings of the same level with '@@'.
        # Match exactly h1-h6: a bare startswith('h') would also pick up
        # unrelated columns (e.g. 'hreflang').
        wanted = {"h1", "h2", "h3", "h4", "h5", "h6"}
        header_cols = [col for col in crawl_df.columns if col in wanted]
        if not header_cols:
            return pd.DataFrame(columns=["header"])

        # stack() flattens the heading columns into one Series and drops
        # missing levels; then split the '@@'-joined values and explode.
        # (The previous per-column apply + DataFrame.dropna() discarded a
        # whole row whenever any single level was missing.)
        headers = (
            crawl_df[header_cols]
            .stack()
            .str.split("@@")
            .explode()
            .dropna()
            .reset_index(drop=True)
        )
        return headers.to_frame(name="header")

    except Exception as e:
        # Surface the failure as a string; main() distinguishes success
        # from error by the returned object's type.
        return str(e)
 
28
  url = st.text_input("Enter the URL of the web page:")
29
  if st.button("Extract Headers"):
30
  if url:
31
+ headers = extract_headers(url)
32
+ if isinstance(headers, pd.DataFrame) and not headers.empty:
33
+ st.write("Extracted Headers:")
34
+ st.write(headers)
35
+ else:
36
+ st.error("No headers found or an error occurred.")
37
+ else:
38
+ st.error("Please enter a valid URL.")
# Launch the Streamlit app only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()