File size: 4,227 Bytes
415b0ee 9179066 415b0ee 00af42a 415b0ee 3ad5c72 415b0ee 284c1ee 98a4de8 284c1ee 415b0ee 284c1ee 415b0ee cfa673a c66f051 415b0ee 3ad5c72 b0f65bb 3ad5c72 6ef28f0 9179066 b0f65bb 3ad5c72 6ef28f0 9179066 415b0ee 0bc9d2e 415b0ee |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 |
import streamlit as st
from advertools import sitemap_to_df, word_frequency
import pandas as pd
import matplotlib.pyplot as plt
# Generic slug words that are filtered out before word frequencies are counted.
common_words = {
    "author",
    "category",
    "product",
    "authors",
    "categories",
    "products",
    "blog",
    "blogs",
}
# Sidebar help text, rendered top to bottom: heading, usage notes and credits,
# a link to a sitemap finder, and the maintainer line.
for _sidebar_text in (
    "### How to use this Sitemap Analyzer",
    """
This sitemap analyzer shows you how many pages each domain has published over a period of time.
To use it, input the client's sitemap on "Input client sitemap here" and put up to 3 competitor sitemaps below it, pressing enter after every time you put the sitemap URL.
Credits to [Advertools](https://github.com/eliasdabbas/advertools) and [holisticseo.digital](https://www.holisticseo.digital/python-seo/content-analysis-with-sitemaps/)""",
    "You can use this tool to detect or guess where the sitemap of each domain can be: [Free Sitemap Finder & Checker Tool](https://seomator.com/sitemap-finder)",
    "## Tool uploaded and maintained by: [Blazing SEO](http://blazing-seo.com/)",
):
    st.sidebar.markdown(_sidebar_text)
# One sidebar text box per sitemap: the client's first, then three competitors.
# Every box starts empty; the entered values are collected in display order.
_sitemap_labels = ["Input client sitemap here:"] + [
    f"Enter the competitor sitemap URL {number}:" for number in range(1, 4)
]
sitemap_urls = [st.sidebar.text_input(label, "") for label in _sitemap_labels]
# Weekday labels in pandas ``dayofweek`` order (Monday=0 ... Sunday=6).
_WEEKDAY_LABELS = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']

# (chart name, resample rule). Offset objects are used instead of the "A"/"M"
# string aliases, which newer pandas releases deprecate; a rule of None marks
# the day-of-week distribution, which is not a resample.
_TRENDS = (
    ("Yearly", pd.offsets.YearEnd()),
    ("Monthly", pd.offsets.MonthEnd()),
    ("Weekly", None),
)


def _slug_phrase(url):
    """Return the last non-empty path segment of *url* as filtered words.

    Trailing slashes are ignored, so ``.../blog/my-post`` and
    ``.../blog/my-post/`` both yield ``"my post"``. Hyphens become spaces and
    any word found in ``common_words`` (case-insensitive) is dropped.
    """
    slug = url.rstrip("/").rsplit("/", 1)[-1].replace("-", " ")
    return ' '.join(word for word in slug.split() if word.lower() not in common_words)


def _show_word_frequencies(sitemap_url, phrases):
    """Render the 1-, 2- and 3-word frequency tables (top 100 rows each)."""
    for phrase_len, label in ((1, "words"), (2, "two-word phrases"), (3, "three-word phrases")):
        # Compute first so a failure does not leave an orphaned heading.
        frequencies = word_frequency(phrases, phrase_len=phrase_len)
        st.subheader(f"Most-frequently used {label} in article titles for {sitemap_url} (excluding common words)")
        st.dataframe(frequencies.head(100))


def _trend_counts(dated, rule):
    """Count rows of *dated* per period of *rule*, or per weekday if rule is None."""
    if rule is None:
        # reindex guarantees all seven weekdays are present (zero-filled), so
        # assigning the seven labels can never hit a length mismatch.
        counts = dated['lastmod'].dt.dayofweek.value_counts().reindex(range(7), fill_value=0)
        counts.index = _WEEKDAY_LABELS
        return counts
    counts = dated.resample(rule, on='lastmod').size()
    counts.index = counts.index.strftime('%Y-%m-%d')  # Format datetime index
    return counts


def _show_trends(sitemap_url, sitemap_data):
    """Render yearly, monthly and day-of-week bar charts of ``lastmod`` dates.

    Sitemaps without usable ``lastmod`` values get an informational note
    instead of charts, so the rest of the analysis is still shown.
    """
    if 'lastmod' not in sitemap_data.columns:
        st.info(f"No lastmod dates found for {sitemap_url}; trend charts skipped.")
        return

    # Unparseable dates become NaT and are dropped; utc=True keeps mixed
    # UTC offsets from producing an object column that has no ``.dt``.
    lastmod = pd.to_datetime(sitemap_data['lastmod'], errors='coerce', utc=True)
    dated = sitemap_data.assign(lastmod=lastmod).dropna(subset=['lastmod'])
    if dated.empty:
        st.info(f"No lastmod dates found for {sitemap_url}; trend charts skipped.")
        return

    for trend_name, rule in _TRENDS:
        st.subheader(f"{trend_name} Trends for {sitemap_url}")
        fig, ax = plt.subplots(figsize=(10, 6))
        try:
            trends = _trend_counts(dated, rule)
            ax.bar(trends.index, trends.values)
            ax.set_ylabel("Count")
            ax.set_title(f"{trend_name} Trends")
            st.pyplot(fig)
        finally:
            # Streamlit reruns the script on every interaction; unclosed
            # figures would otherwise pile up in pyplot's global registry.
            plt.close(fig)


# Analyze every sitemap that was entered; one failing sitemap must not stop
# the others, so each is wrapped in its own error boundary.
for raw_url in sitemap_urls:
    sitemap_url = raw_url.strip()  # tolerate whitespace from copy/paste
    if not sitemap_url:
        continue
    try:
        sitemap_data = sitemap_to_df(sitemap_url)

        # Rows without a URL are skipped rather than crashing the slug extraction.
        phrases = [_slug_phrase(loc) for loc in sitemap_data['loc'].dropna().astype(str)]
        _show_word_frequencies(sitemap_url, phrases)

        _show_trends(sitemap_url, sitemap_data)

        # Total number of URLs
        st.subheader(f"Total Number of URLs for {sitemap_url}")
        total_urls = len(sitemap_data)
        st.write(f"The total number of URLs in the sitemap is {total_urls}.")
    except Exception as e:
        st.write(f"An error occurred for {sitemap_url}:", str(e))
|