blazingbunny committed on
Commit
22822be
0 Parent(s):

Duplicate from blazingbunny/sitemap-to-df-advertools

Browse files
Files changed (4) hide show
  1. .gitattributes +35 -0
  2. README.md +13 -0
  3. app.py +48 -0
  4. requirements.txt +5 -0
.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Sitemap To Df Advertools
3
+ emoji: 🔥
4
+ colorFrom: pink
5
+ colorTo: green
6
+ sdk: streamlit
7
+ sdk_version: 1.25.0
8
+ app_file: app.py
9
+ pinned: false
10
+ duplicated_from: blazingbunny/sitemap-to-df-advertools
11
+ ---
12
+
13
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Importing libraries
2
+ import streamlit as st
3
+ from advertools import sitemap_to_df
4
+
5
+ # Function to safely extract the category from the sitemap URL
6
def extract_category(url, index):
    """Return the '/'-separated segment of *url* at *index*, or 'Unknown'.

    The URL is split on '/', so ``https://example.com/blog/post`` yields
    ``['https:', '', 'example.com', 'blog', 'post']`` and index 3 selects
    ``'blog'``.

    Args:
        url: The URL to inspect. Non-string values (for example the NaN a
            DataFrame holds for a missing cell) are treated as having no
            category instead of raising.
        index: Zero-based position of the wanted segment.

    Returns:
        The segment at *index*, or 'Unknown' when *url* is not a string,
        has too few segments, or the selected segment is empty (as happens
        right after a trailing slash).
    """
    if not isinstance(url, str):
        return 'Unknown'
    parts = url.split('/')
    if len(parts) <= index:
        return 'Unknown'
    # An empty segment carries no category information, so report it the
    # same way as a URL that is too short.
    return parts[index] or 'Unknown'
9
+
10
# Page heading
st.title("Sitemap Analyzer")

# Sidebar controls: the sitemap to fetch, and the position of the URL
# segment that should be treated as the category.
sitemap_url = st.sidebar.text_input("Enter the sitemap URL:", value="")
category_index = st.sidebar.number_input(
    "Enter the index of the category folder in the URL:",
    min_value=0,
    value=4,
    step=1,
)
16
+
17
+ # Check if the URL is provided
18
if sitemap_url:
    # Any failure while fetching or analysing the sitemap is reported on the
    # page instead of crashing the app.
    try:
        sitemap_data = sitemap_to_df(sitemap_url)
        st.write("Sitemap successfully retrieved!")

        # Derive a category for every URL from the segment of "loc" at the
        # user-selected index.
        sitemap_data['sitemap_cat'] = sitemap_data['loc'].apply(
            lambda loc: extract_category(loc, category_index)
        )
        st.dataframe(sitemap_data)

        # Number of URLs per category
        category_count = sitemap_data['sitemap_cat'].value_counts().reset_index()
        category_count.columns = ['Category', 'Number of URLs']
        st.subheader("Number of articles per category:")
        st.dataframe(category_count)

        # Categories where "lastmod" is not available. The column may be
        # missing altogether when no sitemap entry declares a <lastmod> tag;
        # in that case every row counts as lacking it, rather than failing
        # with a KeyError.
        if 'lastmod' in sitemap_data.columns:
            missing_lastmod = sitemap_data[sitemap_data['lastmod'].isna()]
        else:
            missing_lastmod = sitemap_data
        nat_values = missing_lastmod['sitemap_cat'].value_counts().reset_index()
        nat_values.columns = ['Category', 'Number of NaT']
        st.subheader("Categories where 'lastmod' is not available:")
        st.dataframe(nat_values)

        # Unique categories where "lastmod" is not available
        unique_nat_categories = nat_values['Category'].nunique()
        st.subheader("Number of unique categories where 'lastmod' is not available:")
        st.write(unique_nat_categories)

    except Exception as e:
        # Top-level UI boundary: show the message to the user.
        st.write("An error occurred:", str(e))
else:
    st.write("Please enter a sitemap URL to analyze.")
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ streamlit
2
+ advertools
3
+ plotly
4
+ pandas
5
+ matplotlib