Spaces:
Running
Running
import logging | |
import re | |
from collections import Counter | |
import geopandas as gpd | |
import matplotlib.pyplot as plt | |
import matplotlib.ticker as mticker | |
import numpy as np | |
import pandas as pd | |
import polars as pl | |
import requests | |
from matplotlib.patches import Patch | |
from polars.dependencies import subprocess | |
from planning_ai.common.utils import Paths | |
plt.rcParams.update( | |
{"font.family": "sans-serif", "font.sans-serif": ["Liberation Sans"]} | |
) | |
def _process_postcodes(final): | |
"""Processes postcodes from the final document data. | |
Args: | |
final (dict): A dictionary containing document data. | |
Returns: | |
pl.DataFrame: A DataFrame with postcodes and their counts joined with ONSPD data. | |
""" | |
documents = final["documents"] | |
postcodes = [doc["document"].metadata["respondentpostcode"] for doc in documents] | |
missing = ["Missing" for pcd in postcodes if not pcd] | |
postcodes = ( | |
pl.DataFrame({"postcode": postcodes})["postcode"] | |
.value_counts() | |
.with_columns(pl.col("postcode").str.replace_all(" ", "")) | |
) | |
onspd = pl.read_parquet( | |
Paths.RAW / "onspd_cambridge.parquet", | |
columns=["pcd", "osward", "lsoa11", "oa21"], | |
).with_columns(pl.col("pcd").str.replace_all(" ", "").alias("postcode")) | |
postcodes = postcodes.join(onspd, on="postcode", how="left") | |
outside_pcs = postcodes.filter(pl.col("osward").is_null()).drop_nulls("postcode") | |
pcs_url = "https://api.postcodes.io/postcodes" | |
def get_pcs(postcode): | |
response = requests.get(f"{pcs_url}/{postcode}") | |
if response.status_code == 200: | |
out = response.json()["result"] | |
admin_ward = out.get("admin_ward") | |
admin_district = out.get("admin_district") | |
return f"{admin_ward}, {admin_district}" | |
else: | |
return "Unknown" | |
outside_pcs = pl.concat( | |
[ | |
outside_pcs.with_columns( | |
pl.col("postcode") | |
.map_elements(lambda x: get_pcs(x), return_dtype=pl.String) | |
.alias("osward") | |
) | |
.select(["postcode", "osward", "count"]) | |
.with_columns(pl.col("count").cast(pl.Int32)), | |
pl.DataFrame( | |
{"postcode": missing, "osward": "Unknown", "count": len(missing)} | |
).with_columns(pl.col("count").cast(pl.Int32)), | |
], | |
) | |
return postcodes.drop_nulls(subset=["osward"]), outside_pcs | |
def _process_policies(final): | |
"""Processes policies from the final document data. | |
Args: | |
final (dict): A dictionary containing document data. | |
Returns: | |
tuple: A tuple containing strings of support, object, and other policies. | |
""" | |
def process_policy_group(policy_group): | |
details = "".join( | |
f"\n### {row['themes']} - {row['stance']}\n\n" | |
+ "".join( | |
f"- {detail} {doc_id}\n" | |
for detail, doc_id in zip(row["detail"], row["doc_id"]) | |
) | |
for row in policy_group.rows(named=True)[:1] | |
) | |
return details | |
policies_df = final["policies"] | |
support_policies = "" | |
object_policies = "" | |
other_policies = "" | |
for (_, stance), policy in policies_df.group_by( | |
["themes", "stance"], maintain_order=True | |
): | |
if stance == "Support": | |
support_policies += process_policy_group(policy) | |
elif stance == "Object": | |
object_policies += process_policy_group(policy) | |
else: | |
other_policies += process_policy_group(policy) | |
return support_policies, object_policies, other_policies | |
def _process_stances(responses): | |
"""Processes stances from the final document data. | |
Args: | |
final (dict): A dictionary containing document data. | |
Returns: | |
str: A formatted string of stances with their percentages and counts. | |
""" | |
stances = responses["representations_support/object"].to_list() | |
value_counts = Counter(stances) | |
total_values = sum(value_counts.values()) | |
percentages = { | |
key: {"count": count, "percentage": (count / total_values)} | |
for key, count in value_counts.items() | |
} | |
stances_top = sorted( | |
percentages.items(), key=lambda x: x[1]["percentage"], reverse=True | |
) | |
return " | ".join( | |
[ | |
f"**{item}**: {stance['percentage']:.1%} _({stance['count']})_" | |
for item, stance in stances_top | |
] | |
) | |
def _process_themes(final): | |
"""Processes themes from the final document data. | |
Args: | |
final (dict): A dictionary containing document data. | |
Returns: | |
str: A markdown table of themes with their counts and percentages. | |
""" | |
documents = final["documents"] | |
themes = Counter([theme["chapter"] for doc in documents for theme in doc["themes"]]) | |
themes = pl.DataFrame(themes).transpose(include_header=True) | |
themes_breakdown = themes.with_columns( | |
((pl.col("column_0") / pl.sum("column_0")) * 100).round(2).alias("percentage") | |
).sort("percentage", descending=True) | |
themes_breakdown = themes_breakdown.rename( | |
{"column": "Theme", "column_0": "Count", "percentage": "Percentage"} | |
) | |
pd.set_option("display.precision", 1) | |
return themes_breakdown.to_pandas().to_markdown(index=False) | |
def fig_oa(postcodes, rep): | |
"""Generates a figure for Output Area (OA) classifications. | |
Args: | |
postcodes (pl.DataFrame): A DataFrame containing postcode data. | |
""" | |
oa_pop = pl.read_parquet(Paths.RAW / "camb_oa_pop.parquet") | |
oa_pop = ( | |
oa_pop.group_by(pl.col("Output Areas Code")) | |
.sum() | |
.rename({"Output Areas Code": "OA2021", "Observation": "population"}) | |
.select(["OA2021", "population"]) | |
) | |
oac = pl.read_parquet(Paths.RAW / "camb_oac.parquet") | |
oac_names = pl.read_csv(Paths.RAW / "classification_codes_and_names.csv") | |
oac = ( | |
oac.with_columns(pl.col("supergroup").cast(str)) | |
.join(oac_names, left_on="supergroup", right_on="Classification Code") | |
.select(["oa21cd", "Classification Name", "supergroup"]) | |
.rename( | |
{ | |
"Classification Name": "supergroup_name", | |
} | |
) | |
) | |
oac = oac.join(oa_pop, left_on="oa21cd", right_on="OA2021") | |
oac = ( | |
postcodes.join(oac, left_on="oa21", right_on="oa21cd", how="right") | |
.group_by(["supergroup", "supergroup_name"]) | |
.sum() | |
.select(["supergroup", "supergroup_name", "population", "count"]) | |
.sort("supergroup") | |
.with_columns( | |
((pl.col("count") / pl.col("count").sum()) * 100).alias("perc_count"), | |
((pl.col("population") / pl.col("population").sum()) * 100).alias( | |
"perc_pop" | |
), | |
) | |
.with_columns((pl.col("perc_count") - pl.col("perc_pop")).alias("perc_diff")) | |
) | |
oa_pd = oac.to_pandas() | |
_, ax1 = plt.subplots(figsize=(8, 8)) | |
# Define a list of colors for each supergroup | |
colors = [ | |
"#7f7f7f", # retired | |
"#2ca02c", # suburbanites | |
"#d62728", # multicultural | |
"#e377c2", # low skilled | |
"#ff7f0e", # ethnically diverse | |
"#bcbd22", # baseline uk | |
"#1f77b4", # semi unskilled | |
"#9467bd", # legacy | |
] | |
# Plot bars for percentage of representations | |
bars1 = ax1.bar( | |
oa_pd["supergroup"], | |
oa_pd["perc_diff"], | |
label="Percentage of Representations (%)", | |
color=colors[: len(oa_pd)], | |
edgecolor="none", | |
width=0.9, | |
) | |
# Add centerline at y=0 | |
ax1.axhline(0, color="black", linewidth=1) | |
# Annotate bars with percentage values | |
for bar in bars1: | |
height = bar.get_height() | |
if height > 0: | |
ax1.annotate( | |
f"{height:.0f}%", | |
xy=(bar.get_x() + bar.get_width() / 2, height), | |
xytext=(0, 3), # 3 points vertical offset | |
textcoords="offset points", | |
ha="center", | |
va="bottom", | |
) | |
else: | |
ax1.annotate( | |
f"{height:.0f}%", | |
xy=(bar.get_x() + bar.get_width() / 2, height), | |
xytext=(0, -6), | |
textcoords="offset points", | |
ha="center", | |
va="top", | |
) | |
ax1.set_xlabel("Output Area Classification (OAC) Supergroup") | |
ax1.set_ylabel("Difference from national average (%)") | |
supergroup_names = [ | |
f"{i}: {name}" | |
for i, name in enumerate(oa_pd["supergroup_name"].unique(), start=1) | |
] | |
legend_patches = [ | |
Patch(color=colors[i], label=supergroup_names[i]) | |
for i in range(len(supergroup_names)) | |
] | |
ax1.legend(handles=legend_patches, title="Supergroup", frameon=False) | |
plt.tight_layout() | |
plt.savefig(Paths.SUMMARY / "figs" / f"oas-{rep}.pdf") | |
def fig_wards(postcodes, rep): | |
ward_boundaries = gpd.read_parquet(Paths.RAW / "camb_wards.parquet") | |
ward_pcs = postcodes.group_by("osward").sum() | |
ward_boundaries_prop = ward_boundaries.merge( | |
ward_pcs.to_pandas(), left_on="WD21CD", right_on="osward" | |
) | |
_, ax = plt.subplots(figsize=(8, 8)) | |
ward_boundaries.plot(ax=ax, color="none", edgecolor="black", linewidth=1.5) | |
ward_boundaries.plot(ax=ax, color="white", edgecolor="grey", linewidth=0.5) | |
ward_boundaries_prop.plot( | |
ax=ax, | |
column="count", | |
edgecolor="none", | |
legend=True, | |
legend_kwds={"label": "Number of Representations"}, | |
) | |
cbar = ax.get_figure().axes[-1] # Get the colorbar axis | |
cbar.yaxis.set_major_formatter(mticker.FuncFormatter(lambda x, _: f"{int(x)}")) | |
plt.axis("off") | |
plt.tight_layout() | |
plt.savefig(Paths.SUMMARY / "figs" / f"wards-{rep}.pdf") | |
def fig_imd(postcodes, rep): | |
imd = pl.read_parquet(Paths.RAW / "imd_camb.parquet") | |
pops = pl.read_parquet(Paths.RAW / "pops_camb.parquet") | |
imd = ( | |
postcodes.join(imd, left_on="lsoa11", right_on="LSOA", how="right") | |
.join(pops, left_on="LSOA", right_on="LSOA 2021 Code") | |
.group_by("SOA_decile") | |
.agg(pl.col("count").sum(), pl.col("LSOA").count(), pl.col("Total").sum()) | |
.sort("SOA_decile") | |
.with_columns( | |
((pl.col("count") / pl.col("count").sum()) * 100).alias("perc_count"), | |
((pl.col("Total") / pl.col("Total").sum()) * 100).alias("perc_pop"), | |
) | |
.with_columns((pl.col("perc_count") - pl.col("perc_pop")).alias("perc_diff")) | |
) | |
postcodes_pd = imd.to_pandas() | |
colors = [ | |
"#d62728", | |
"#9f4e64", | |
"#6f76a0", | |
"#478dbf", | |
"#1f77b4", | |
] | |
_, ax1 = plt.subplots(figsize=(8, 8)) | |
x = np.arange(len(postcodes_pd)) | |
ax1.bar( | |
x, # Shift to the left | |
postcodes_pd["perc_diff"], | |
edgecolor="none", | |
color=colors, | |
width=0.9, | |
) | |
# Set labels and ticks | |
ax1.set_xlabel("Deprivation Quintile") | |
ax1.set_ylabel("Difference from national average (%)") | |
ax1.set_xticks(x) | |
ax1.axhline(0, color="black", linewidth=1) | |
# ax1.legend(loc="upper center", bbox_to_anchor=(0.5, 1.1), ncol=5, frameon=False) | |
plt.tight_layout() | |
ax1.set_xticklabels(["1 - Most Deprived", "2", "3", "4", "5 - Least Deprived"]) | |
plt.savefig(Paths.SUMMARY / "figs" / f"imd_decile-{rep}.pdf") | |
def load_txt(file_path): | |
with open(file_path, "r", encoding="utf-8") as file: | |
return file.read() | |
def build_final_report(out, rep): | |
introduction_paragraph = load_txt("planning_ai/documents/introduction.txt") | |
figures_paragraph = load_txt("planning_ai/documents/figures.txt") | |
themes_paragraph = load_txt("planning_ai/documents/themes.txt") | |
final = out["generate_final_report"] | |
responses = ( | |
pl.scan_parquet(Paths.STAGING / "gcpt3.parquet") | |
.filter(pl.col("representations_document") == rep) | |
.unique("id") | |
.collect() | |
) | |
support_policies, object_policies, other_policies = _process_policies(final) | |
postcodes, outside_pcs = _process_postcodes(final) | |
stances = _process_stances(responses) | |
themes = _process_themes(final) | |
fig_wards(postcodes, rep) | |
fig_oa(postcodes, rep) | |
fig_imd(postcodes, rep) | |
outside_pcs = ( | |
outside_pcs.group_by("osward") | |
.sum()[["osward", "count"]] | |
.filter(pl.col("osward") != "Unknown") | |
.rename( | |
{"osward": "Ward, Local Authority", "count": "Number of Representations"} | |
) | |
.to_pandas() | |
.to_markdown(index=False) | |
) | |
quarto_doc = ( | |
f"---\ntitle: '**Overview of Public Submissions: {rep}**'\n" | |
r""" | |
mainfont: Liberation Sans | |
fontsize: 12pt | |
margin: 2cm | |
geometry: a4paper | |
header-includes: | | |
\usepackage{graphicx} | |
\usepackage{fancyhdr} | |
\usepackage{geometry} | |
\usepackage{sectsty} | |
\geometry{top=1in, bottom=1in, left=1in, right=1in} | |
\pagestyle{fancy} | |
\fancyhead[L]{} | |
\fancyhead[C]{} | |
\fancyhead[R]{\includegraphics[width=3cm]{logo.png}} | |
\fancyfoot[L]{} | |
\fancyfoot[C]{} | |
\fancyfoot[R]{\thepage} | |
\renewcommand{\headrulewidth}{0pt} | |
\renewcommand{\footrulewidth}{0pt} | |
\linespread{1.2} | |
\usepackage{titlesec} | |
\usepackage{xcolor} | |
\definecolor{uolblue}{HTML}{1F2B7D} | |
\titleformat{\section}[block]{\normalfont\Large\bfseries\color{uolblue}}{}{0em}{} | |
\titleformat{\subsection}[block]{\normalfont\large\bfseries\color{uolblue}}{}{0em}{} | |
\fancypagestyle{plain}{\fancyhf{}\fancyfoot[R]{\thepage}\fancyhead[R]{\includegraphics[width=3cm]{logo.png}}} | |
\usepackage{titling} | |
\pretitle{\begin{flushleft}\Huge\color{uolblue}} | |
\posttitle{\end{flushleft}\vspace{-2em}} | |
--- | |
""" | |
"\n# Executive Summary\n\n" | |
f"{final['executive']}\n\n" | |
f"There were a total of {len(responses):,} responses. Of these, representations left " | |
"comment, or indicated the following support and objection of the consultation document:\n\n" | |
f"{stances}\n\n" | |
"# Introduction\n\n" | |
f"{introduction_paragraph}\n\n" | |
"\n# Profile of Submissions\n\n" | |
f"{figures_paragraph}\n\n" | |
f"\n\n" | |
f": Postcodes outside the Greater Cambridge Ward areas\n\n{outside_pcs}n\n" | |
f"![The proportion of representations submitted by 2021 Output Area Classification^[Wyszomierski, J., Longley, P.A., Singleton, A.D., Gale, C. & O’Brien, O. (2024) A neighbourhood Output Area Classification from the 2021 and 2022 UK censuses. The Geographical Journal, 190, e12550. Available from: https://doi.org/10.1111/geoj.12550] relative to the national average](./data/out/summary/figs/oas-{rep}.pdf)\n\n" | |
f"\n\n" | |
r"\newpage" | |
"\n\n# Themes and Policies\n\n" | |
f"{themes_paragraph}\n\n" | |
f": Breakdown of representations by section\n\n{themes}\n\n" | |
"## Supporting Representations\n\n" | |
"The following section presents a list of all points raised in representations that support the consultation document, grouped by sections.\n\n" | |
f"{support_policies or '_No supporting representations._'}\n\n" | |
"## Objecting Representations\n\n" | |
"The following section presents a list of all points raised in representations that object to the consultation document, grouped by sections.\n\n" | |
f"{object_policies or '_No objecting representations._'}\n\n" | |
"## Comment\n\n" | |
"The following section presents a list of all points raised in representations that do not support or object to the consultation document, grouped by sections.\n\n" | |
f"{other_policies or '_No other representations._'}\n\n" | |
) | |
rep_joined = rep.replace(" ", "_") | |
out_path = Paths.SUMMARY / f"Overview_of_Public_Submissions-{rep_joined}.md" | |
out_file = Paths.SUMMARY / f"Overview_of_Public_Submissions-{rep_joined}" | |
with open(out_path, "w") as f: | |
f.write(quarto_doc) | |
try: | |
command = [ | |
"pandoc", | |
f"{out_path}", | |
"-o", | |
f"{out_file}.pdf", | |
"--pdf-engine=xelatex", | |
] | |
subprocess.run(command, check=True, capture_output=True) | |
command = [ | |
"pdftk", | |
"data/covers/Overview_of_Public_Submissions.pdf", | |
f"{out_file}.pdf", | |
"cat", | |
"output", | |
f"{out_file}-cover.pdf", | |
] | |
subprocess.run(command, check=True, capture_output=True) | |
command = ["mv", "-f", f"{out_file}-cover.pdf", f"{out_file}.pdf"] | |
subprocess.run(command, check=True, capture_output=True) | |
command = [ | |
"pandoc", | |
f"{out_path}", | |
"-o", | |
f"{out_file}.docx", | |
"--reference-doc=data/covers/reference.docx", | |
] | |
subprocess.run(command, check=True, capture_output=True) | |
# command = [ | |
# "pandoc", | |
# "data/covers/Overview_of_Public_Submissions.docx", | |
# f"{out_file}.docx", | |
# "-o", | |
# f"{out_file}.docx", | |
# "--reference-doc=data/covers/reference.docx", | |
# ] | |
# subprocess.run(command, check=True, capture_output=True) | |
except subprocess.CalledProcessError as e: | |
logging.error(f"Error during Overview_of_Public_Submissions render: {e}") | |
def build_summaries_document(out, rep): | |
sub = r"Document ID: \[\d+\]\n\n" | |
summary_intro = load_txt("planning_ai/documents/summary_intro.txt") | |
full_text = "".join( | |
f"**Part ID**: {document['doc_id']}\n\n" | |
f"**Representations ID**: {document['document'].metadata['id']}\n\n" | |
f"\n\n{re.sub(sub, '', document['summary'])}\n\n" | |
"---\n\n" | |
for document in out["generate_final_report"]["documents"] | |
) | |
header = ( | |
f"---\ntitle: '**Summaries of Public Submissions: {rep}**'\n" | |
r""" | |
mainfont: Liberation Sans | |
fontsize: 12pt | |
margin: 2cm | |
geometry: a4paper | |
header-includes: | | |
\usepackage{graphicx} | |
\usepackage{fancyhdr} | |
\usepackage{geometry} | |
\usepackage{sectsty} | |
\geometry{top=1in, bottom=1in, left=1in, right=1in} | |
\pagestyle{fancy} | |
\fancyhead[L]{} | |
\fancyhead[C]{} | |
\fancyhead[R]{\includegraphics[width=3cm]{logo.png}} | |
\fancyfoot[L]{} | |
\fancyfoot[C]{} | |
\fancyfoot[R]{\thepage} | |
\renewcommand{\headrulewidth}{0pt} | |
\renewcommand{\footrulewidth}{0pt} | |
\linespread{1.2} | |
\usepackage{titlesec} | |
\usepackage{xcolor} | |
\definecolor{uolblue}{HTML}{1F2B7D} | |
\titleformat{\section}[block]{\normalfont\Large\bfseries\color{uolblue}}{}{0em}{} | |
\titleformat{\subsection}[block]{\normalfont\large\bfseries\color{uolblue}}{}{0em}{} | |
\fancypagestyle{plain}{\fancyhf{}\fancyfoot[R]{\thepage}\fancyhead[R]{\includegraphics[width=3cm]{logo.png}}} | |
\usepackage{titling} | |
\pretitle{\begin{flushleft}\Huge\color{uolblue}} | |
\posttitle{\end{flushleft}\vspace{-2em}} | |
--- | |
""" | |
f"\n{summary_intro}\n\n" | |
) | |
rep_joined = rep.replace(" ", "_") | |
out_path = Paths.SUMMARY / f"Summaries_of_Public_Submissions-{rep_joined}.md" | |
out_file = Paths.SUMMARY / f"Summaries_of_Public_Submissions-{rep_joined}" | |
with open(out_path, "w") as f: | |
f.write(f"{header}{full_text}") | |
try: | |
command = [ | |
"pandoc", | |
f"{out_path}", | |
"-o", | |
f"{out_file}.pdf", | |
"--pdf-engine=xelatex", | |
] | |
subprocess.run(command, check=True, capture_output=True) | |
command = [ | |
"pdftk", | |
"data/covers/Summaries_of_Public_Submissions.pdf", | |
f"{out_file}.pdf", | |
"cat", | |
"output", | |
f"{out_file}-cover.pdf", | |
] | |
subprocess.run(command, check=True, capture_output=True) | |
command = ["mv", "-f", f"{out_file}-cover.pdf", f"{out_file}.pdf"] | |
subprocess.run(command, check=True, capture_output=True) | |
command = [ | |
"pandoc", | |
f"{out_path}", | |
"-o", | |
f"{out_file}.docx", | |
"--reference-doc=data/covers/reference.docx", | |
] | |
subprocess.run(command, check=True, capture_output=True) | |
# command = [ | |
# "pandoc", | |
# "data/covers/Summaries_of_Public_Submissions.docx", | |
# f"{out_file}.docx", | |
# "-o", | |
# f"{out_file}.docx", | |
# "--reference-doc=data/covers/reference.docx", | |
# ] | |
# subprocess.run(command, check=True, capture_output=True) | |
except subprocess.CalledProcessError as e: | |
logging.error(f"Error during render: {e}") | |