cjber's picture
fix: dont use covers docx
9df9069
raw
history blame contribute delete
20.9 kB
import logging
import re
from collections import Counter
import geopandas as gpd
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import numpy as np
import pandas as pd
import polars as pl
import requests
from matplotlib.patches import Patch
from polars.dependencies import subprocess
from planning_ai.common.utils import Paths
# Use Liberation Sans as the default sans-serif font for every figure.
plt.rcParams["font.family"] = "sans-serif"
plt.rcParams["font.sans-serif"] = ["Liberation Sans"]
def _process_postcodes(final):
    """Processes respondent postcodes from the final document data.

    Counts occurrences of each postcode, joins the counts onto the ONSPD
    geography lookup (ward, LSOA, OA), and resolves postcodes with no ONSPD
    match via the postcodes.io API.

    Args:
        final (dict): A dictionary containing document data under "documents".

    Returns:
        tuple[pl.DataFrame, pl.DataFrame]: A pair of
        (postcodes matched to ONSPD wards,
        postcodes outside the extract — plus a single "Missing" row counting
        respondents who gave no postcode).
    """
    documents = final["documents"]
    postcodes = [doc["document"].metadata["respondentpostcode"] for doc in documents]
    # Respondents with an empty/None postcode are reported as one "Missing"
    # row carrying the total count (previously one row per respondent, each
    # holding the full count, which inflated any downstream sum).
    n_missing = sum(1 for pcd in postcodes if not pcd)
    postcodes = (
        pl.DataFrame({"postcode": postcodes})["postcode"]
        .value_counts()
        .with_columns(pl.col("postcode").str.replace_all(" ", ""))
    )
    onspd = pl.read_parquet(
        Paths.RAW / "onspd_cambridge.parquet",
        columns=["pcd", "osward", "lsoa11", "oa21"],
    ).with_columns(pl.col("pcd").str.replace_all(" ", "").alias("postcode"))
    postcodes = postcodes.join(onspd, on="postcode", how="left")

    # Postcodes with no ONSPD match fall outside the extract; look them up
    # on postcodes.io to report their ward and local authority.
    outside_pcs = postcodes.filter(pl.col("osward").is_null()).drop_nulls("postcode")
    pcs_url = "https://api.postcodes.io/postcodes"

    def get_pcs(postcode):
        """Return "ward, district" for a postcode, or "Unknown" on failure."""
        try:
            response = requests.get(f"{pcs_url}/{postcode}", timeout=10)
        except requests.RequestException:
            # Network failure is treated the same as an unrecognised postcode.
            return "Unknown"
        if response.status_code != 200:
            return "Unknown"
        out = response.json()["result"]
        admin_ward = out.get("admin_ward")
        admin_district = out.get("admin_district")
        return f"{admin_ward}, {admin_district}"

    frames = [
        outside_pcs.with_columns(
            pl.col("postcode")
            .map_elements(get_pcs, return_dtype=pl.String)
            .alias("osward")
        )
        .select(["postcode", "osward", "count"])
        .with_columns(pl.col("count").cast(pl.Int32))
    ]
    if n_missing:
        frames.append(
            pl.DataFrame(
                {"postcode": ["Missing"], "osward": ["Unknown"], "count": [n_missing]}
            ).with_columns(pl.col("count").cast(pl.Int32))
        )
    outside_pcs = pl.concat(frames)
    return postcodes.drop_nulls(subset=["osward"]), outside_pcs
def _process_policies(final):
"""Processes policies from the final document data.
Args:
final (dict): A dictionary containing document data.
Returns:
tuple: A tuple containing strings of support, object, and other policies.
"""
def process_policy_group(policy_group):
details = "".join(
f"\n### {row['themes']} - {row['stance']}\n\n"
+ "".join(
f"- {detail} {doc_id}\n"
for detail, doc_id in zip(row["detail"], row["doc_id"])
)
for row in policy_group.rows(named=True)[:1]
)
return details
policies_df = final["policies"]
support_policies = ""
object_policies = ""
other_policies = ""
for (_, stance), policy in policies_df.group_by(
["themes", "stance"], maintain_order=True
):
if stance == "Support":
support_policies += process_policy_group(policy)
elif stance == "Object":
object_policies += process_policy_group(policy)
else:
other_policies += process_policy_group(policy)
return support_policies, object_policies, other_policies
def _process_stances(responses):
"""Processes stances from the final document data.
Args:
final (dict): A dictionary containing document data.
Returns:
str: A formatted string of stances with their percentages and counts.
"""
stances = responses["representations_support/object"].to_list()
value_counts = Counter(stances)
total_values = sum(value_counts.values())
percentages = {
key: {"count": count, "percentage": (count / total_values)}
for key, count in value_counts.items()
}
stances_top = sorted(
percentages.items(), key=lambda x: x[1]["percentage"], reverse=True
)
return " | ".join(
[
f"**{item}**: {stance['percentage']:.1%} _({stance['count']})_"
for item, stance in stances_top
]
)
def _process_themes(final):
    """Processes themes from the final document data.

    Args:
        final (dict): A dictionary containing document data under
            "documents"; each document carries a "themes" list of dicts with
            a "chapter" key.

    Returns:
        str: A markdown table of themes with their counts and percentages.
    """
    documents = final["documents"]
    # Count how many times each chapter is mentioned across all documents.
    themes = Counter([theme["chapter"] for doc in documents for theme in doc["themes"]])
    # Transposing the single-row frame produces a "column" column (chapter
    # names, from include_header=True) and a "column_0" column (counts).
    themes = pl.DataFrame(themes).transpose(include_header=True)
    themes_breakdown = themes.with_columns(
        ((pl.col("column_0") / pl.sum("column_0")) * 100).round(2).alias("percentage")
    ).sort("percentage", descending=True)
    themes_breakdown = themes_breakdown.rename(
        {"column": "Theme", "column_0": "Count", "percentage": "Percentage"}
    )
    # NOTE(review): this mutates a *global* pandas display option; to_markdown
    # delegates formatting to tabulate, so this may have no effect here —
    # confirm before relying on it.
    pd.set_option("display.precision", 1)
    return themes_breakdown.to_pandas().to_markdown(index=False)
def fig_oa(postcodes, rep):
    """Generates a figure for Output Area (OA) classifications.

    For each OAC supergroup, plots the difference between the share of
    representations and the share of population, and saves the figure to
    ``Paths.SUMMARY / "figs" / f"oas-{rep}.pdf"``.

    Args:
        postcodes (pl.DataFrame): A DataFrame containing postcode data with
            ``oa21`` and ``count`` columns.
        rep (str): Name of the consultation document; used in the filename.
    """
    oa_pop = pl.read_parquet(Paths.RAW / "camb_oa_pop.parquet")
    oa_pop = (
        oa_pop.group_by(pl.col("Output Areas Code"))
        .sum()
        .rename({"Output Areas Code": "OA2021", "Observation": "population"})
        .select(["OA2021", "population"])
    )
    oac = pl.read_parquet(Paths.RAW / "camb_oac.parquet")
    oac_names = pl.read_csv(Paths.RAW / "classification_codes_and_names.csv")
    oac = (
        oac.with_columns(pl.col("supergroup").cast(str))
        .join(oac_names, left_on="supergroup", right_on="Classification Code")
        .select(["oa21cd", "Classification Name", "supergroup"])
        .rename(
            {
                "Classification Name": "supergroup_name",
            }
        )
    )
    oac = oac.join(oa_pop, left_on="oa21cd", right_on="OA2021")
    # Right join keeps every OA, so supergroups with zero representations
    # still contribute population to the percentage denominators.
    oac = (
        postcodes.join(oac, left_on="oa21", right_on="oa21cd", how="right")
        .group_by(["supergroup", "supergroup_name"])
        .sum()
        .select(["supergroup", "supergroup_name", "population", "count"])
        .sort("supergroup")
        .with_columns(
            ((pl.col("count") / pl.col("count").sum()) * 100).alias("perc_count"),
            ((pl.col("population") / pl.col("population").sum()) * 100).alias(
                "perc_pop"
            ),
        )
        .with_columns((pl.col("perc_count") - pl.col("perc_pop")).alias("perc_diff"))
    )
    oa_pd = oac.to_pandas()
    _, ax1 = plt.subplots(figsize=(8, 8))
    # One colour per supergroup; rows are sorted by supergroup code above,
    # so colours line up with the sorted bar order.
    colors = [
        "#7f7f7f",  # retired
        "#2ca02c",  # suburbanites
        "#d62728",  # multicultural
        "#e377c2",  # low skilled
        "#ff7f0e",  # ethnically diverse
        "#bcbd22",  # baseline uk
        "#1f77b4",  # semi unskilled
        "#9467bd",  # legacy
    ]
    # Plot bars for difference from the national average
    bars1 = ax1.bar(
        oa_pd["supergroup"],
        oa_pd["perc_diff"],
        label="Percentage of Representations (%)",
        color=colors[: len(oa_pd)],
        edgecolor="none",
        width=0.9,
    )
    # Add centerline at y=0
    ax1.axhline(0, color="black", linewidth=1)
    # Annotate bars with percentage values, above positive bars and below
    # negative ones.
    for bar in bars1:
        height = bar.get_height()
        if height > 0:
            ax1.annotate(
                f"{height:.0f}%",
                xy=(bar.get_x() + bar.get_width() / 2, height),
                xytext=(0, 3),  # 3 points vertical offset
                textcoords="offset points",
                ha="center",
                va="bottom",
            )
        else:
            ax1.annotate(
                f"{height:.0f}%",
                xy=(bar.get_x() + bar.get_width() / 2, height),
                xytext=(0, -6),
                textcoords="offset points",
                ha="center",
                va="top",
            )
    ax1.set_xlabel("Output Area Classification (OAC) Supergroup")
    ax1.set_ylabel("Difference from national average (%)")
    supergroup_names = [
        f"{i}: {name}"
        for i, name in enumerate(oa_pd["supergroup_name"].unique(), start=1)
    ]
    legend_patches = [
        Patch(color=colors[i], label=supergroup_names[i])
        for i in range(len(supergroup_names))
    ]
    ax1.legend(handles=legend_patches, title="Supergroup", frameon=False)
    plt.tight_layout()
    plt.savefig(Paths.SUMMARY / "figs" / f"oas-{rep}.pdf")
    # Close the figure so repeated report builds don't accumulate open
    # matplotlib figures (leaks memory and triggers warnings).
    plt.close()
def fig_wards(postcodes, rep):
    """Generates a choropleth of representation counts by ward.

    Saves the figure to ``Paths.SUMMARY / "figs" / f"wards-{rep}.pdf"``.

    Args:
        postcodes (pl.DataFrame): Postcode data with ``osward`` and ``count``
            columns.
        rep (str): Name of the consultation document; used in the filename.
    """
    ward_boundaries = gpd.read_parquet(Paths.RAW / "camb_wards.parquet")
    ward_pcs = postcodes.group_by("osward").sum()
    # Inner merge: only wards that received representations get a fill.
    ward_boundaries_prop = ward_boundaries.merge(
        ward_pcs.to_pandas(), left_on="WD21CD", right_on="osward"
    )
    _, ax = plt.subplots(figsize=(8, 8))
    # Layering: thick black outer boundaries, then a white base with thin
    # grey edges, then the choropleth on top.
    ward_boundaries.plot(ax=ax, color="none", edgecolor="black", linewidth=1.5)
    ward_boundaries.plot(ax=ax, color="white", edgecolor="grey", linewidth=0.5)
    ward_boundaries_prop.plot(
        ax=ax,
        column="count",
        edgecolor="none",
        legend=True,
        legend_kwds={"label": "Number of Representations"},
    )
    # Force integer tick labels on the colorbar (last axis on the figure).
    cbar = ax.get_figure().axes[-1]
    cbar.yaxis.set_major_formatter(mticker.FuncFormatter(lambda x, _: f"{int(x)}"))
    plt.axis("off")
    plt.tight_layout()
    plt.savefig(Paths.SUMMARY / "figs" / f"wards-{rep}.pdf")
    # Close the figure so repeated report builds don't accumulate open
    # matplotlib figures (leaks memory and triggers warnings).
    plt.close()
def fig_imd(postcodes, rep):
    """Generates a bar chart of representations by deprivation band.

    For each ``SOA_decile`` band, plots the difference between the share of
    representations and the share of population, and saves the figure to
    ``Paths.SUMMARY / "figs" / f"imd_decile-{rep}.pdf"``.

    Args:
        postcodes (pl.DataFrame): Postcode data with ``lsoa11`` and ``count``
            columns.
        rep (str): Name of the consultation document; used in the filename.
    """
    imd = pl.read_parquet(Paths.RAW / "imd_camb.parquet")
    pops = pl.read_parquet(Paths.RAW / "pops_camb.parquet")
    # Right join keeps every LSOA so bands with zero representations still
    # contribute population to the percentage denominators.
    imd = (
        postcodes.join(imd, left_on="lsoa11", right_on="LSOA", how="right")
        .join(pops, left_on="LSOA", right_on="LSOA 2021 Code")
        .group_by("SOA_decile")
        .agg(pl.col("count").sum(), pl.col("LSOA").count(), pl.col("Total").sum())
        .sort("SOA_decile")
        .with_columns(
            ((pl.col("count") / pl.col("count").sum()) * 100).alias("perc_count"),
            ((pl.col("Total") / pl.col("Total").sum()) * 100).alias("perc_pop"),
        )
        .with_columns((pl.col("perc_count") - pl.col("perc_pop")).alias("perc_diff"))
    )
    postcodes_pd = imd.to_pandas()
    # Red (most deprived) through blue (least deprived).
    colors = [
        "#d62728",
        "#9f4e64",
        "#6f76a0",
        "#478dbf",
        "#1f77b4",
    ]
    _, ax1 = plt.subplots(figsize=(8, 8))
    x = np.arange(len(postcodes_pd))
    ax1.bar(
        x,
        postcodes_pd["perc_diff"],
        edgecolor="none",
        color=colors,
        width=0.9,
    )
    # Set labels and ticks
    ax1.set_xlabel("Deprivation Quintile")
    ax1.set_ylabel("Difference from national average (%)")
    ax1.set_xticks(x)
    ax1.axhline(0, color="black", linewidth=1)
    plt.tight_layout()
    # NOTE(review): the column is named "SOA_decile" but exactly five labels
    # are set — presumably the data holds quintiles; confirm upstream, as a
    # 10-band input would leave bars unlabelled/mislabelled.
    ax1.set_xticklabels(["1 - Most Deprived", "2", "3", "4", "5 - Least Deprived"])
    plt.savefig(Paths.SUMMARY / "figs" / f"imd_decile-{rep}.pdf")
    # Close the figure so repeated report builds don't accumulate open
    # matplotlib figures (leaks memory and triggers warnings).
    plt.close()
def load_txt(file_path):
    """Read a UTF-8 text file and return its entire contents as a string."""
    with open(file_path, encoding="utf-8") as handle:
        return handle.read()
def build_final_report(out, rep):
    """Build the "Overview of Public Submissions" report for one document.

    Assembles a pandoc/Quarto markdown document from the pipeline output,
    writes it under ``Paths.SUMMARY``, renders it to PDF (prepending a cover
    page with pdftk) and to docx.

    Args:
        out (dict): Pipeline output; ``out["generate_final_report"]`` holds
            the summarised documents and policies.
        rep (str): Name of the consultation document being reported on.
    """
    introduction_paragraph = load_txt("planning_ai/documents/introduction.txt")
    figures_paragraph = load_txt("planning_ai/documents/figures.txt")
    themes_paragraph = load_txt("planning_ai/documents/themes.txt")

    final = out["generate_final_report"]
    responses = (
        pl.scan_parquet(Paths.STAGING / "gcpt3.parquet")
        .filter(pl.col("representations_document") == rep)
        .unique("id")
        .collect()
    )

    support_policies, object_policies, other_policies = _process_policies(final)
    postcodes, outside_pcs = _process_postcodes(final)
    stances = _process_stances(responses)
    themes = _process_themes(final)

    fig_wards(postcodes, rep)
    fig_oa(postcodes, rep)
    fig_imd(postcodes, rep)

    # Markdown table of postcodes that fall outside Greater Cambridge wards.
    outside_pcs = (
        outside_pcs.group_by("osward")
        .sum()[["osward", "count"]]
        .filter(pl.col("osward") != "Unknown")
        .rename(
            {"osward": "Ward, Local Authority", "count": "Number of Representations"}
        )
        .to_pandas()
        .to_markdown(index=False)
    )
    quarto_doc = (
        f"---\ntitle: '**Overview of Public Submissions: {rep}**'\n"
        r"""
mainfont: Liberation Sans
fontsize: 12pt
margin: 2cm
geometry: a4paper
header-includes: |
  \usepackage{graphicx}
  \usepackage{fancyhdr}
  \usepackage{geometry}
  \usepackage{sectsty}
  \geometry{top=1in, bottom=1in, left=1in, right=1in}
  \pagestyle{fancy}
  \fancyhead[L]{}
  \fancyhead[C]{}
  \fancyhead[R]{\includegraphics[width=3cm]{logo.png}}
  \fancyfoot[L]{}
  \fancyfoot[C]{}
  \fancyfoot[R]{\thepage}
  \renewcommand{\headrulewidth}{0pt}
  \renewcommand{\footrulewidth}{0pt}
  \linespread{1.2}
  \usepackage{titlesec}
  \usepackage{xcolor}
  \definecolor{uolblue}{HTML}{1F2B7D}
  \titleformat{\section}[block]{\normalfont\Large\bfseries\color{uolblue}}{}{0em}{}
  \titleformat{\subsection}[block]{\normalfont\large\bfseries\color{uolblue}}{}{0em}{}
  \fancypagestyle{plain}{\fancyhf{}\fancyfoot[R]{\thepage}\fancyhead[R]{\includegraphics[width=3cm]{logo.png}}}
  \usepackage{titling}
  \pretitle{\begin{flushleft}\Huge\color{uolblue}}
  \posttitle{\end{flushleft}\vspace{-2em}}
---
"""
        "\n# Executive Summary\n\n"
        f"{final['executive']}\n\n"
        f"There were a total of {len(responses):,} responses. Of these, representations left "
        "comment, or indicated the following support and objection of the consultation document:\n\n"
        f"{stances}\n\n"
        "# Introduction\n\n"
        f"{introduction_paragraph}\n\n"
        "\n# Profile of Submissions\n\n"
        f"{figures_paragraph}\n\n"
        f"![Total number of representations submitted by Ward within Greater Cambridgeshire](./data/out/summary/figs/wards-{rep}.pdf)\n\n"
        # Fixed: a stray literal "n" used to follow the table ("{outside_pcs}n\n").
        f": Postcodes outside the Greater Cambridge Ward areas\n\n{outside_pcs}\n\n"
        f"![The proportion of representations submitted by 2021 Output Area Classification^[Wyszomierski, J., Longley, P.A., Singleton, A.D., Gale, C. & O’Brien, O. (2024) A neighbourhood Output Area Classification from the 2021 and 2022 UK censuses. The Geographical Journal, 190, e12550. Available from: https://doi.org/10.1111/geoj.12550] relative to the national average](./data/out/summary/figs/oas-{rep}.pdf)\n\n"
        f"![The proportion of representations submitted by the Index of Multiple Deprivation (2019) relative to the national average](./data/out/summary/figs/imd_decile-{rep}.pdf)\n\n"
        r"\newpage"
        "\n\n# Themes and Policies\n\n"
        f"{themes_paragraph}\n\n"
        f": Breakdown of representations by section\n\n{themes}\n\n"
        "## Supporting Representations\n\n"
        "The following section presents a list of all points raised in representations that support the consultation document, grouped by sections.\n\n"
        f"{support_policies or '_No supporting representations._'}\n\n"
        "## Objecting Representations\n\n"
        "The following section presents a list of all points raised in representations that object to the consultation document, grouped by sections.\n\n"
        f"{object_policies or '_No objecting representations._'}\n\n"
        "## Comment\n\n"
        "The following section presents a list of all points raised in representations that do not support or object to the consultation document, grouped by sections.\n\n"
        f"{other_policies or '_No other representations._'}\n\n"
    )
    rep_joined = rep.replace(" ", "_")
    out_path = Paths.SUMMARY / f"Overview_of_Public_Submissions-{rep_joined}.md"
    out_file = Paths.SUMMARY / f"Overview_of_Public_Submissions-{rep_joined}"
    # Explicit UTF-8: the document contains non-ASCII characters and must not
    # depend on the platform's default encoding.
    with open(out_path, "w", encoding="utf-8") as f:
        f.write(quarto_doc)
    try:
        # Render markdown -> PDF; xelatex is required for the custom font.
        command = [
            "pandoc",
            f"{out_path}",
            "-o",
            f"{out_file}.pdf",
            "--pdf-engine=xelatex",
        ]
        subprocess.run(command, check=True, capture_output=True)
        # Prepend the cover page, then move the combined PDF into place.
        command = [
            "pdftk",
            "data/covers/Overview_of_Public_Submissions.pdf",
            f"{out_file}.pdf",
            "cat",
            "output",
            f"{out_file}-cover.pdf",
        ]
        subprocess.run(command, check=True, capture_output=True)
        command = ["mv", "-f", f"{out_file}-cover.pdf", f"{out_file}.pdf"]
        subprocess.run(command, check=True, capture_output=True)
        # Also render a docx (no cover page; styled via the reference doc).
        command = [
            "pandoc",
            f"{out_path}",
            "-o",
            f"{out_file}.docx",
            "--reference-doc=data/covers/reference.docx",
        ]
        subprocess.run(command, check=True, capture_output=True)
    except subprocess.CalledProcessError as e:
        # Include captured stderr; the exception alone hides the tool's error.
        logging.error(
            "Error during Overview_of_Public_Submissions render: %s; stderr: %s",
            e,
            e.stderr,
        )
def build_summaries_document(out, rep):
    """Build the "Summaries of Public Submissions" companion document.

    Concatenates the per-representation summaries (stripping internal
    "Document ID: [n]" markers), writes the markdown under ``Paths.SUMMARY``,
    and renders it to PDF (prepending a cover page with pdftk) and to docx.

    Args:
        out (dict): Pipeline output; ``out["generate_final_report"]`` holds
            the summarised documents.
        rep (str): Name of the consultation document being reported on.
    """
    # Compile once; the pattern is applied to every summary below.
    doc_id_pattern = re.compile(r"Document ID: \[\d+\]\n\n")
    summary_intro = load_txt("planning_ai/documents/summary_intro.txt")
    full_text = "".join(
        f"**Part ID**: {document['doc_id']}\n\n"
        f"**Representations ID**: {document['document'].metadata['id']}\n\n"
        f"\n\n{doc_id_pattern.sub('', document['summary'])}\n\n"
        "---\n\n"
        for document in out["generate_final_report"]["documents"]
    )
    header = (
        f"---\ntitle: '**Summaries of Public Submissions: {rep}**'\n"
        r"""
mainfont: Liberation Sans
fontsize: 12pt
margin: 2cm
geometry: a4paper
header-includes: |
  \usepackage{graphicx}
  \usepackage{fancyhdr}
  \usepackage{geometry}
  \usepackage{sectsty}
  \geometry{top=1in, bottom=1in, left=1in, right=1in}
  \pagestyle{fancy}
  \fancyhead[L]{}
  \fancyhead[C]{}
  \fancyhead[R]{\includegraphics[width=3cm]{logo.png}}
  \fancyfoot[L]{}
  \fancyfoot[C]{}
  \fancyfoot[R]{\thepage}
  \renewcommand{\headrulewidth}{0pt}
  \renewcommand{\footrulewidth}{0pt}
  \linespread{1.2}
  \usepackage{titlesec}
  \usepackage{xcolor}
  \definecolor{uolblue}{HTML}{1F2B7D}
  \titleformat{\section}[block]{\normalfont\Large\bfseries\color{uolblue}}{}{0em}{}
  \titleformat{\subsection}[block]{\normalfont\large\bfseries\color{uolblue}}{}{0em}{}
  \fancypagestyle{plain}{\fancyhf{}\fancyfoot[R]{\thepage}\fancyhead[R]{\includegraphics[width=3cm]{logo.png}}}
  \usepackage{titling}
  \pretitle{\begin{flushleft}\Huge\color{uolblue}}
  \posttitle{\end{flushleft}\vspace{-2em}}
---
"""
        f"\n{summary_intro}\n\n"
    )
    rep_joined = rep.replace(" ", "_")
    out_path = Paths.SUMMARY / f"Summaries_of_Public_Submissions-{rep_joined}.md"
    out_file = Paths.SUMMARY / f"Summaries_of_Public_Submissions-{rep_joined}"
    # Explicit UTF-8: summaries may contain non-ASCII characters and must not
    # depend on the platform's default encoding.
    with open(out_path, "w", encoding="utf-8") as f:
        f.write(f"{header}{full_text}")
    try:
        # Render markdown -> PDF; xelatex is required for the custom font.
        command = [
            "pandoc",
            f"{out_path}",
            "-o",
            f"{out_file}.pdf",
            "--pdf-engine=xelatex",
        ]
        subprocess.run(command, check=True, capture_output=True)
        # Prepend the cover page, then move the combined PDF into place.
        command = [
            "pdftk",
            "data/covers/Summaries_of_Public_Submissions.pdf",
            f"{out_file}.pdf",
            "cat",
            "output",
            f"{out_file}-cover.pdf",
        ]
        subprocess.run(command, check=True, capture_output=True)
        command = ["mv", "-f", f"{out_file}-cover.pdf", f"{out_file}.pdf"]
        subprocess.run(command, check=True, capture_output=True)
        # Also render a docx (no cover page; styled via the reference doc).
        command = [
            "pandoc",
            f"{out_path}",
            "-o",
            f"{out_file}.docx",
            "--reference-doc=data/covers/reference.docx",
        ]
        subprocess.run(command, check=True, capture_output=True)
    except subprocess.CalledProcessError as e:
        # Include captured stderr; the exception alone hides the tool's error.
        logging.error("Error during render: %s; stderr: %s", e, e.stderr)