Spaces:

edstellar
/

blog-audit

Sleeping

App Files Files Community

blog-audit / utils /data_processor.py

vijaykumaredstellar

Upload 3 files

6d28094 verified 10 days ago

raw

history blame contribute delete

6.33 kB

	import pandas as pd
	import numpy as np

	# Fixed non-click columns
	NON_CLICK_COLS = {"date", "month", "year", "blogs", "h1", "meta title",
	"meta description", "total clicks", "total_clicks"}

	TIER_CONFIG = {
	"takedown_zero_max": 0,
	"takedown_low_max": 5,
	"takedown_low_monthly": 2,
	"monitor_min": 6,
	"monitor_max": 20,
	"performing_min": 21,
	"performing_max": 100,
	"strong_min": 101,
	"strong_max": 500,
	"top_min": 501,
	}


	def detect_columns(df: pd.DataFrame) -> dict:
	"""
	Auto-detect URL, title, month, and total_clicks columns
	from any CSV with the expected structure.
	Returns a dict with keys: url_col, title_col, month_cols, total_col
	"""
	cols_lower = {c.lower(): c for c in df.columns}

	# URL column — 'Blogs' or any col containing http values
	url_col = cols_lower.get("blogs") or cols_lower.get("url") or cols_lower.get("urls")
	if not url_col:
	for col in df.columns:
	sample = df[col].dropna().astype(str).head(5)
	if sample.str.startswith("http").any():
	url_col = col
	break

	# Title column — 'H1' or 'Title'
	title_col = cols_lower.get("h1") or cols_lower.get("title") or cols_lower.get("meta title")

	# Total clicks column
	total_col = cols_lower.get("total clicks") or cols_lower.get("total_clicks")

	# Month columns — numeric columns not in the known set
	month_cols = []
	for col in df.columns:
	if col.lower() in NON_CLICK_COLS:
	continue
	if pd.api.types.is_numeric_dtype(df[col]) or _is_mostly_numeric(df[col]):
	month_cols.append(col)

	# Remove total_col from month_cols if accidentally included
	if total_col and total_col in month_cols:
	month_cols.remove(total_col)

	return {
	"url_col": url_col,
	"title_col": title_col,
	"month_cols": month_cols,
	"total_col": total_col,
	}


	def _is_mostly_numeric(series: pd.Series, threshold=0.7) -> bool:
	converted = pd.to_numeric(series, errors="coerce")
	valid = converted.notna().sum()
	return valid / max(len(series), 1) >= threshold


	def clean_and_tier(df: pd.DataFrame, col_map: dict) -> pd.DataFrame:
	"""
	Build a clean analysis dataframe with tier, trend, and slug columns.
	"""
	url_col = col_map["url_col"]
	title_col = col_map["title_col"]
	month_cols = col_map["month_cols"]
	total_col = col_map["total_col"]

	out = pd.DataFrame()
	out["url"] = df[url_col].astype(str).str.strip()
	out["title"] = df[title_col].astype(str).str.strip() if title_col else out["url"]
	out["slug"] = out["url"].apply(
	lambda x: x.split("/blog/")[-1] if "/blog/" in x else x.rstrip("/").split("/")[-1]
	)

	for col in month_cols:
	out[col] = pd.to_numeric(df[col], errors="coerce").fillna(0).astype(int)

	if total_col:
	out["total_clicks"] = pd.to_numeric(df[total_col], errors="coerce").fillna(0).astype(int)
	else:
	out["total_clicks"] = out[month_cols].sum(axis=1)

	# Trend
	out["trend"] = out.apply(lambda r: _get_trend(r, month_cols), axis=1)

	# Tier
	out["tier"] = out.apply(lambda r: _assign_tier(r, month_cols), axis=1)

	# Recommended action
	out["action"] = out.apply(lambda r: _get_action(r["tier"], r["total_clicks"]), axis=1)

	return out


	def _get_trend(row, month_cols):
	if len(month_cols) < 2:
	return "➡️ Stable"
	vals = [row[c] for c in month_cols]
	if vals[-1] > vals[0]:
	return "📈 Growing"
	elif vals[-1] < vals[0]:
	return "📉 Declining"
	return "➡️ Stable"


	def _assign_tier(row, month_cols):
	c = row["total_clicks"]
	max_m = max([row[m] for m in month_cols]) if month_cols else 0
	if c == 0:
	return "TAKEDOWN_ZERO"
	elif c <= TIER_CONFIG["takedown_low_max"] and max_m <= TIER_CONFIG["takedown_low_monthly"]:
	return "TAKEDOWN_LOW"
	elif TIER_CONFIG["monitor_min"] <= c <= TIER_CONFIG["monitor_max"]:
	return "MONITOR"
	elif TIER_CONFIG["performing_min"] <= c <= TIER_CONFIG["performing_max"]:
	return "PERFORMING"
	elif TIER_CONFIG["strong_min"] <= c <= TIER_CONFIG["strong_max"]:
	return "STRONG"
	elif c >= TIER_CONFIG["top_min"]:
	return "TOP"
	return "PERFORMING"


	def _get_action(tier, clicks):
	actions = {
	"TAKEDOWN_ZERO": "Remove immediately. Zero organic traction across all months. Set up 410 or 301 redirect.",
	"TAKEDOWN_LOW": "Merge into a stronger related article or remove. Implement 301 redirect.",
	"MONITOR": "Optimize meta title, description & keywords. Review in 90 days. Merge if no improvement.",
	"PERFORMING": "Refresh content, strengthen internal links, add FAQ schema. Push for top 50 clicks.",
	"STRONG": "Update statistics & examples. Add lead gen CTA. Build backlinks to reach 500+ clicks.",
	"TOP": "Priority asset. Add lead magnets, improve CTAs, build backlinks. Protect rankings.",
	}
	if tier == "MONITOR" and clicks >= 15:
	return "Good potential. Optimize meta description and add 2–3 internal links to improve CTR."
	return actions.get(tier, "Review manually.")


	def get_tier_summary(df: pd.DataFrame) -> dict:
	counts = df["tier"].value_counts().to_dict()
	return {
	"TAKEDOWN_ZERO": counts.get("TAKEDOWN_ZERO", 0),
	"TAKEDOWN_LOW": counts.get("TAKEDOWN_LOW", 0),
	"MONITOR": counts.get("MONITOR", 0),
	"PERFORMING": counts.get("PERFORMING", 0),
	"STRONG": counts.get("STRONG", 0),
	"TOP": counts.get("TOP", 0),
	}


	TIER_META = {
	"TAKEDOWN_ZERO": {"label": "🔴 Take Down (0 clicks)", "color": "#FF4B4B"},
	"TAKEDOWN_LOW": {"label": "🟠 Take Down (1–5 clicks)", "color": "#FF8C00"},
	"MONITOR": {"label": "🟡 Monitor (6–20 clicks)", "color": "#FFC300"},
	"PERFORMING": {"label": "✅ Performing (21–100 clicks)", "color": "#2ECC71"},
	"STRONG": {"label": "💪 Strong (101–500 clicks)", "color": "#1ABC9C"},
	"TOP": {"label": "🏆 Top Performers (500+ clicks)","color": "#9B59B6"},
	}