sitemap-to-df-advertoolsv3

Sleeping

File size: 4,227 Bytes

415b0ee
9179066
415b0ee
00af42a
415b0ee
3ad5c72
 
415b0ee
284c1ee
 
 
 
 
98a4de8
284c1ee
 
 
 
 
415b0ee
284c1ee
 
 
 
415b0ee
 
 
 
 
 
 
cfa673a
c66f051
 
415b0ee
3ad5c72
 
b0f65bb
 
3ad5c72
 
6ef28f0
9179066
b0f65bb
3ad5c72
 
6ef28f0
 
 
 
 
 
9179066
415b0ee
 
0bc9d2e
 
 
 
 
 
 
 
 
 
 
 
 
 
415b0ee

import streamlit as st
from advertools import sitemap_to_df, word_frequency
import pandas as pd
import matplotlib.pyplot as plt

# Generic section words (author / category / product / blog) that are removed
# from URL slugs before word frequencies are counted.
common_words = {
    "author", "authors",
    "category", "categories",
    "product", "products",
    "blog", "blogs",
}

# Everything below is rendered in the Streamlit sidebar, in this order.
sidebar = st.sidebar

# Usage instructions and credits
sidebar.markdown("### How to use this Sitemap Analyzer")
usage_text = """
This sitemap analyzer shows you how many pages each domain has published over a period of time.
To use it, input the client's sitemap on "Input client sitemap here" and put up to 3 competitor sitemaps below it, pressing enter after every time you put the sitemap URL.
Credits to [Advertools](https://github.com/eliasdabbas/advertools) and [holisticseo.digital](https://www.holisticseo.digital/python-seo/content-analysis-with-sitemaps/)"""
sidebar.markdown(usage_text)
sidebar.markdown("You can use this tool to detect or guess where the sitemap of each domain can be: [Free Sitemap Finder & Checker Tool](https://seomator.com/sitemap-finder)")

sidebar.markdown("## Tool uploaded and maintained by: [Blazing SEO](http://blazing-seo.com/)")

# Four text inputs: the client sitemap first, then competitors 1-3.
# Each field defaults to an empty string.
input_labels = ["Input client sitemap here:"]
input_labels += [f"Enter the competitor sitemap URL {n}:" for n in (1, 2, 3)]
sitemap_urls = [sidebar.text_input(label, "") for label in input_labels]

# Weekday labels in pandas' ``dayofweek`` order (0 = Monday ... 6 = Sunday).
_WEEKDAY_LABELS = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']


def _slug_from_url(url):
    """Return the last non-empty path segment of *url* with hyphens as spaces.

    Trailing slashes are stripped first, so ``.../my-post`` and
    ``.../my-post/`` both yield ``"my post"`` (indexing ``split("/")[-2]``
    returned the parent folder for URLs without a trailing slash).
    """
    return url.rstrip("/").rsplit("/", 1)[-1].replace("-", " ")


def _drop_common_words(text, stop_words):
    """Return *text* without the words whose lowercase form is in *stop_words*."""
    return ' '.join(word for word in text.split() if word.lower() not in stop_words)


def _show_trend_chart(trend_name, trends):
    """Draw *trends* (a Series of counts) as a bar chart in the Streamlit page.

    The figure is always closed afterwards so that figures do not pile up in
    matplotlib's global registry across Streamlit reruns.
    """
    fig, ax = plt.subplots(figsize=(10, 6))
    try:
        ax.bar(trends.index, trends.values)
        ax.set_ylabel("Count")
        ax.set_title(f"{trend_name} Trends")
        st.pyplot(fig)
    finally:
        plt.close(fig)


# Analyze every sitemap the user filled in; empty inputs are skipped.
for sitemap_url in sitemap_urls:
    if not sitemap_url:
        continue

    try:
        sitemap_data = sitemap_to_df(sitemap_url)

        # Words from the last path segment of every URL, minus the common words.
        # Rows without a URL are ignored instead of crashing the slug extraction.
        slugs_filtered = [
            _drop_common_words(_slug_from_url(url), common_words)
            for url in sitemap_data['loc'].dropna()
        ]

        # Frequency tables for single words, two-word and three-word phrases.
        for phrase_len, phrase_label in [(1, "words"), (2, "two-word phrases"), (3, "three-word phrases")]:
            word_freq = word_frequency(slugs_filtered, phrase_len=phrase_len)
            st.subheader(f"Most-frequently used {phrase_label} in article titles for {sitemap_url} (excluding common words)")
            st.dataframe(word_freq.head(100))

        # Trend charts need modification dates; a sitemap without any should
        # still get its word-frequency tables and URL total.
        dated = None
        if 'lastmod' in sitemap_data.columns:
            sitemap_data['lastmod'] = pd.to_datetime(sitemap_data['lastmod'])
            dated = sitemap_data.dropna(subset=['lastmod'])

        if dated is None or dated.empty:
            st.write(f"No lastmod dates found for {sitemap_url}; trend charts are skipped.")
        else:
            # NOTE(review): newer pandas releases deprecate the "A"/"M" aliases
            # in favour of "YE"/"ME"; kept as-is for compatibility with older pandas.
            for trend_name, resample_rule in [("Yearly", "A"), ("Monthly", "M")]:
                st.subheader(f"{trend_name} Trends for {sitemap_url}")
                trends = dated.resample(resample_rule, on='lastmod').size()
                trends.index = trends.index.strftime('%Y-%m-%d')  # Format datetime index
                _show_trend_chart(trend_name, trends)

            # "Weekly" is the distribution over weekdays. Reindex to all seven
            # days so weekdays with no URLs count as 0 rather than breaking the
            # label assignment with a length mismatch.
            st.subheader(f"Weekly Trends for {sitemap_url}")
            weekday_counts = dated['lastmod'].dt.dayofweek.value_counts().reindex(range(7), fill_value=0)
            weekday_counts.index = _WEEKDAY_LABELS
            _show_trend_chart("Weekly", weekday_counts)

        # Total number of URLs
        st.subheader(f"Total Number of URLs for {sitemap_url}")
        total_urls = len(sitemap_data)
        st.write(f"The total number of URLs in the sitemap is {total_urls}.")

    except Exception as e:
        # Top-level UI boundary: report the failure for this sitemap and keep
        # processing the remaining ones.
        st.write(f"An error occurred for {sitemap_url}:", str(e))