blog-audit / utils /data_processor.py
vijaykumaredstellar's picture
Upload 3 files
6d28094 verified
import pandas as pd
import numpy as np
# Fixed non-click columns
NON_CLICK_COLS = {"date", "month", "year", "blogs", "h1", "meta title",
"meta description", "total clicks", "total_clicks"}
TIER_CONFIG = {
"takedown_zero_max": 0,
"takedown_low_max": 5,
"takedown_low_monthly": 2,
"monitor_min": 6,
"monitor_max": 20,
"performing_min": 21,
"performing_max": 100,
"strong_min": 101,
"strong_max": 500,
"top_min": 501,
}
def detect_columns(df: pd.DataFrame) -> dict:
"""
Auto-detect URL, title, month, and total_clicks columns
from any CSV with the expected structure.
Returns a dict with keys: url_col, title_col, month_cols, total_col
"""
cols_lower = {c.lower(): c for c in df.columns}
# URL column β€” 'Blogs' or any col containing http values
url_col = cols_lower.get("blogs") or cols_lower.get("url") or cols_lower.get("urls")
if not url_col:
for col in df.columns:
sample = df[col].dropna().astype(str).head(5)
if sample.str.startswith("http").any():
url_col = col
break
# Title column β€” 'H1' or 'Title'
title_col = cols_lower.get("h1") or cols_lower.get("title") or cols_lower.get("meta title")
# Total clicks column
total_col = cols_lower.get("total clicks") or cols_lower.get("total_clicks")
# Month columns β€” numeric columns not in the known set
month_cols = []
for col in df.columns:
if col.lower() in NON_CLICK_COLS:
continue
if pd.api.types.is_numeric_dtype(df[col]) or _is_mostly_numeric(df[col]):
month_cols.append(col)
# Remove total_col from month_cols if accidentally included
if total_col and total_col in month_cols:
month_cols.remove(total_col)
return {
"url_col": url_col,
"title_col": title_col,
"month_cols": month_cols,
"total_col": total_col,
}
def _is_mostly_numeric(series: pd.Series, threshold=0.7) -> bool:
converted = pd.to_numeric(series, errors="coerce")
valid = converted.notna().sum()
return valid / max(len(series), 1) >= threshold
def clean_and_tier(df: pd.DataFrame, col_map: dict) -> pd.DataFrame:
"""
Build a clean analysis dataframe with tier, trend, and slug columns.
"""
url_col = col_map["url_col"]
title_col = col_map["title_col"]
month_cols = col_map["month_cols"]
total_col = col_map["total_col"]
out = pd.DataFrame()
out["url"] = df[url_col].astype(str).str.strip()
out["title"] = df[title_col].astype(str).str.strip() if title_col else out["url"]
out["slug"] = out["url"].apply(
lambda x: x.split("/blog/")[-1] if "/blog/" in x else x.rstrip("/").split("/")[-1]
)
for col in month_cols:
out[col] = pd.to_numeric(df[col], errors="coerce").fillna(0).astype(int)
if total_col:
out["total_clicks"] = pd.to_numeric(df[total_col], errors="coerce").fillna(0).astype(int)
else:
out["total_clicks"] = out[month_cols].sum(axis=1)
# Trend
out["trend"] = out.apply(lambda r: _get_trend(r, month_cols), axis=1)
# Tier
out["tier"] = out.apply(lambda r: _assign_tier(r, month_cols), axis=1)
# Recommended action
out["action"] = out.apply(lambda r: _get_action(r["tier"], r["total_clicks"]), axis=1)
return out
def _get_trend(row, month_cols):
if len(month_cols) < 2:
return "➑️ Stable"
vals = [row[c] for c in month_cols]
if vals[-1] > vals[0]:
return "πŸ“ˆ Growing"
elif vals[-1] < vals[0]:
return "πŸ“‰ Declining"
return "➑️ Stable"
def _assign_tier(row, month_cols):
c = row["total_clicks"]
max_m = max([row[m] for m in month_cols]) if month_cols else 0
if c == 0:
return "TAKEDOWN_ZERO"
elif c <= TIER_CONFIG["takedown_low_max"] and max_m <= TIER_CONFIG["takedown_low_monthly"]:
return "TAKEDOWN_LOW"
elif TIER_CONFIG["monitor_min"] <= c <= TIER_CONFIG["monitor_max"]:
return "MONITOR"
elif TIER_CONFIG["performing_min"] <= c <= TIER_CONFIG["performing_max"]:
return "PERFORMING"
elif TIER_CONFIG["strong_min"] <= c <= TIER_CONFIG["strong_max"]:
return "STRONG"
elif c >= TIER_CONFIG["top_min"]:
return "TOP"
return "PERFORMING"
def _get_action(tier, clicks):
actions = {
"TAKEDOWN_ZERO": "Remove immediately. Zero organic traction across all months. Set up 410 or 301 redirect.",
"TAKEDOWN_LOW": "Merge into a stronger related article or remove. Implement 301 redirect.",
"MONITOR": "Optimize meta title, description & keywords. Review in 90 days. Merge if no improvement.",
"PERFORMING": "Refresh content, strengthen internal links, add FAQ schema. Push for top 50 clicks.",
"STRONG": "Update statistics & examples. Add lead gen CTA. Build backlinks to reach 500+ clicks.",
"TOP": "Priority asset. Add lead magnets, improve CTAs, build backlinks. Protect rankings.",
}
if tier == "MONITOR" and clicks >= 15:
return "Good potential. Optimize meta description and add 2–3 internal links to improve CTR."
return actions.get(tier, "Review manually.")
def get_tier_summary(df: pd.DataFrame) -> dict:
counts = df["tier"].value_counts().to_dict()
return {
"TAKEDOWN_ZERO": counts.get("TAKEDOWN_ZERO", 0),
"TAKEDOWN_LOW": counts.get("TAKEDOWN_LOW", 0),
"MONITOR": counts.get("MONITOR", 0),
"PERFORMING": counts.get("PERFORMING", 0),
"STRONG": counts.get("STRONG", 0),
"TOP": counts.get("TOP", 0),
}
TIER_META = {
"TAKEDOWN_ZERO": {"label": "πŸ”΄ Take Down (0 clicks)", "color": "#FF4B4B"},
"TAKEDOWN_LOW": {"label": "🟠 Take Down (1–5 clicks)", "color": "#FF8C00"},
"MONITOR": {"label": "🟑 Monitor (6–20 clicks)", "color": "#FFC300"},
"PERFORMING": {"label": "βœ… Performing (21–100 clicks)", "color": "#2ECC71"},
"STRONG": {"label": "πŸ’ͺ Strong (101–500 clicks)", "color": "#1ABC9C"},
"TOP": {"label": "πŸ† Top Performers (500+ clicks)","color": "#9B59B6"},
}