cjber's picture
fix: dont use covers docx
9df9069
raw
history blame contribute delete
20.9 kB
import logging
import re
from collections import Counter
import geopandas as gpd
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import numpy as np
import pandas as pd
import polars as pl
import requests
from matplotlib.patches import Patch
from polars.dependencies import subprocess
from planning_ai.common.utils import Paths
# Use Liberation Sans as the default sans-serif font for every figure.
plt.rcParams["font.family"] = "sans-serif"
plt.rcParams["font.sans-serif"] = ["Liberation Sans"]
def _process_postcodes(final):
    """Processes respondent postcodes from the final document data.

    Counts occurrences of each postcode, joins the counts onto the ONSPD
    geography lookup (ward, LSOA, OA), and resolves postcodes with no ONSPD
    match via the postcodes.io API.

    Args:
        final (dict): A dictionary containing document data under "documents".

    Returns:
        tuple[pl.DataFrame, pl.DataFrame]: A pair of
        (postcodes matched to ONSPD wards,
        postcodes outside the extract — plus a single "Missing" row counting
        respondents who gave no postcode).
    """
    documents = final["documents"]
    postcodes = [doc["document"].metadata["respondentpostcode"] for doc in documents]
    # Respondents with an empty/None postcode are reported as one "Missing"
    # row carrying the total count (previously one row per respondent, each
    # holding the full count, which inflated any downstream sum).
    n_missing = sum(1 for pcd in postcodes if not pcd)
    postcodes = (
        pl.DataFrame({"postcode": postcodes})["postcode"]
        .value_counts()
        .with_columns(pl.col("postcode").str.replace_all(" ", ""))
    )
    onspd = pl.read_parquet(
        Paths.RAW / "onspd_cambridge.parquet",
        columns=["pcd", "osward", "lsoa11", "oa21"],
    ).with_columns(pl.col("pcd").str.replace_all(" ", "").alias("postcode"))
    postcodes = postcodes.join(onspd, on="postcode", how="left")

    # Postcodes with no ONSPD match fall outside the extract; look them up
    # on postcodes.io to report their ward and local authority.
    outside_pcs = postcodes.filter(pl.col("osward").is_null()).drop_nulls("postcode")
    pcs_url = "https://api.postcodes.io/postcodes"

    def get_pcs(postcode):
        """Return "ward, district" for a postcode, or "Unknown" on failure."""
        try:
            response = requests.get(f"{pcs_url}/{postcode}", timeout=10)
        except requests.RequestException:
            # Network failure is treated the same as an unrecognised postcode.
            return "Unknown"
        if response.status_code != 200:
            return "Unknown"
        out = response.json()["result"]
        admin_ward = out.get("admin_ward")
        admin_district = out.get("admin_district")
        return f"{admin_ward}, {admin_district}"

    frames = [
        outside_pcs.with_columns(
            pl.col("postcode")
            .map_elements(get_pcs, return_dtype=pl.String)
            .alias("osward")
        )
        .select(["postcode", "osward", "count"])
        .with_columns(pl.col("count").cast(pl.Int32))
    ]
    if n_missing:
        frames.append(
            pl.DataFrame(
                {"postcode": ["Missing"], "osward": ["Unknown"], "count": [n_missing]}
            ).with_columns(pl.col("count").cast(pl.Int32))
        )
    outside_pcs = pl.concat(frames)
    return postcodes.drop_nulls(subset=["osward"]), outside_pcs
def _process_policies(final):
"""Processes policies from the final document data.
Args:
final (dict): A dictionary containing document data.
Returns:
tuple: A tuple containing strings of support, object, and other policies.
"""
def process_policy_group(policy_group):
details = "".join(
f"\n### {row['themes']} - {row['stance']}\n\n"
+ "".join(
f"- {detail} {doc_id}\n"
for detail, doc_id in zip(row["detail"], row["doc_id"])
)
for row in policy_group.rows(named=True)[:1]
)
return details
policies_df = final["policies"]
support_policies = ""
object_policies = ""
other_policies = ""
for (_, stance), policy in policies_df.group_by(
["themes", "stance"], maintain_order=True
):
if stance == "Support":
support_policies += process_policy_group(policy)
elif stance == "Object":
object_policies += process_policy_group(policy)
else:
other_policies += process_policy_group(policy)
return support_policies, object_policies, other_policies
def _process_stances(responses):
"""Processes stances from the final document data.
Args:
final (dict): A dictionary containing document data.
Returns:
str: A formatted string of stances with their percentages and counts.
"""
stances = responses["representations_support/object"].to_list()
value_counts = Counter(stances)
total_values = sum(value_counts.values())
percentages = {
key: {"count": count, "percentage": (count / total_values)}
for key, count in value_counts.items()
}
stances_top = sorted(
percentages.items(), key=lambda x: x[1]["percentage"], reverse=True
)
return " | ".join(
[
f"**{item}**: {stance['percentage']:.1%} _({stance['count']})_"
for item, stance in stances_top
]
)
def _process_themes(final):
    """Processes themes from the final document data.

    Args:
        final (dict): A dictionary containing document data under
            "documents"; each document carries a "themes" list of dicts with
            a "chapter" key.

    Returns:
        str: A markdown table of themes with their counts and percentages.
    """
    documents = final["documents"]
    # Count how many times each chapter is mentioned across all documents.
    themes = Counter([theme["chapter"] for doc in documents for theme in doc["themes"]])
    # Transposing the single-row frame produces a "column" column (chapter
    # names, from include_header=True) and a "column_0" column (counts).
    themes = pl.DataFrame(themes).transpose(include_header=True)
    themes_breakdown = themes.with_columns(
        ((pl.col("column_0") / pl.sum("column_0")) * 100).round(2).alias("percentage")
    ).sort("percentage", descending=True)
    themes_breakdown = themes_breakdown.rename(
        {"column": "Theme", "column_0": "Count", "percentage": "Percentage"}
    )
    # NOTE(review): this mutates a *global* pandas display option; to_markdown
    # delegates formatting to tabulate, so this may have no effect here —
    # confirm before relying on it.
    pd.set_option("display.precision", 1)
    return themes_breakdown.to_pandas().to_markdown(index=False)
def fig_oa(postcodes, rep):
    """Generates a figure for Output Area (OA) classifications.

    For each OAC supergroup, plots the difference between the share of
    representations and the share of population, and saves the figure to
    ``Paths.SUMMARY / "figs" / f"oas-{rep}.pdf"``.

    Args:
        postcodes (pl.DataFrame): A DataFrame containing postcode data with
            ``oa21`` and ``count`` columns.
        rep (str): Name of the consultation document; used in the filename.
    """
    oa_pop = pl.read_parquet(Paths.RAW / "camb_oa_pop.parquet")
    oa_pop = (
        oa_pop.group_by(pl.col("Output Areas Code"))
        .sum()
        .rename({"Output Areas Code": "OA2021", "Observation": "population"})
        .select(["OA2021", "population"])
    )
    oac = pl.read_parquet(Paths.RAW / "camb_oac.parquet")
    oac_names = pl.read_csv(Paths.RAW / "classification_codes_and_names.csv")
    oac = (
        oac.with_columns(pl.col("supergroup").cast(str))
        .join(oac_names, left_on="supergroup", right_on="Classification Code")
        .select(["oa21cd", "Classification Name", "supergroup"])
        .rename(
            {
                "Classification Name": "supergroup_name",
            }
        )
    )
    oac = oac.join(oa_pop, left_on="oa21cd", right_on="OA2021")
    # Right join keeps every OA, so supergroups with zero representations
    # still contribute population to the percentage denominators.
    oac = (
        postcodes.join(oac, left_on="oa21", right_on="oa21cd", how="right")
        .group_by(["supergroup", "supergroup_name"])
        .sum()
        .select(["supergroup", "supergroup_name", "population", "count"])
        .sort("supergroup")
        .with_columns(
            ((pl.col("count") / pl.col("count").sum()) * 100).alias("perc_count"),
            ((pl.col("population") / pl.col("population").sum()) * 100).alias(
                "perc_pop"
            ),
        )
        .with_columns((pl.col("perc_count") - pl.col("perc_pop")).alias("perc_diff"))
    )
    oa_pd = oac.to_pandas()
    _, ax1 = plt.subplots(figsize=(8, 8))
    # One colour per supergroup; rows are sorted by supergroup code above,
    # so colours line up with the sorted bar order.
    colors = [
        "#7f7f7f",  # retired
        "#2ca02c",  # suburbanites
        "#d62728",  # multicultural
        "#e377c2",  # low skilled
        "#ff7f0e",  # ethnically diverse
        "#bcbd22",  # baseline uk
        "#1f77b4",  # semi unskilled
        "#9467bd",  # legacy
    ]
    # Plot bars for difference from the national average
    bars1 = ax1.bar(
        oa_pd["supergroup"],
        oa_pd["perc_diff"],
        label="Percentage of Representations (%)",
        color=colors[: len(oa_pd)],
        edgecolor="none",
        width=0.9,
    )
    # Add centerline at y=0
    ax1.axhline(0, color="black", linewidth=1)
    # Annotate bars with percentage values, above positive bars and below
    # negative ones.
    for bar in bars1:
        height = bar.get_height()
        if height > 0:
            ax1.annotate(
                f"{height:.0f}%",
                xy=(bar.get_x() + bar.get_width() / 2, height),
                xytext=(0, 3),  # 3 points vertical offset
                textcoords="offset points",
                ha="center",
                va="bottom",
            )
        else:
            ax1.annotate(
                f"{height:.0f}%",
                xy=(bar.get_x() + bar.get_width() / 2, height),
                xytext=(0, -6),
                textcoords="offset points",
                ha="center",
                va="top",
            )
    ax1.set_xlabel("Output Area Classification (OAC) Supergroup")
    ax1.set_ylabel("Difference from national average (%)")
    supergroup_names = [
        f"{i}: {name}"
        for i, name in enumerate(oa_pd["supergroup_name"].unique(), start=1)
    ]
    legend_patches = [
        Patch(color=colors[i], label=supergroup_names[i])
        for i in range(len(supergroup_names))
    ]
    ax1.legend(handles=legend_patches, title="Supergroup", frameon=False)
    plt.tight_layout()
    plt.savefig(Paths.SUMMARY / "figs" / f"oas-{rep}.pdf")
    # Close the figure so repeated report builds don't accumulate open
    # matplotlib figures (leaks memory and triggers warnings).
    plt.close()
def fig_wards(postcodes, rep):
    """Generates a choropleth of representation counts by ward.

    Saves the figure to ``Paths.SUMMARY / "figs" / f"wards-{rep}.pdf"``.

    Args:
        postcodes (pl.DataFrame): Postcode data with ``osward`` and ``count``
            columns.
        rep (str): Name of the consultation document; used in the filename.
    """
    ward_boundaries = gpd.read_parquet(Paths.RAW / "camb_wards.parquet")
    ward_pcs = postcodes.group_by("osward").sum()
    # Inner merge: only wards that received representations get a fill.
    ward_boundaries_prop = ward_boundaries.merge(
        ward_pcs.to_pandas(), left_on="WD21CD", right_on="osward"
    )
    _, ax = plt.subplots(figsize=(8, 8))
    # Layering: thick black outer boundaries, then a white base with thin
    # grey edges, then the choropleth on top.
    ward_boundaries.plot(ax=ax, color="none", edgecolor="black", linewidth=1.5)
    ward_boundaries.plot(ax=ax, color="white", edgecolor="grey", linewidth=0.5)
    ward_boundaries_prop.plot(
        ax=ax,
        column="count",
        edgecolor="none",
        legend=True,
        legend_kwds={"label": "Number of Representations"},
    )
    # Force integer tick labels on the colorbar (last axis on the figure).
    cbar = ax.get_figure().axes[-1]
    cbar.yaxis.set_major_formatter(mticker.FuncFormatter(lambda x, _: f"{int(x)}"))
    plt.axis("off")
    plt.tight_layout()
    plt.savefig(Paths.SUMMARY / "figs" / f"wards-{rep}.pdf")
    # Close the figure so repeated report builds don't accumulate open
    # matplotlib figures (leaks memory and triggers warnings).
    plt.close()
def fig_imd(postcodes, rep):
    """Generates a bar chart of representations by deprivation band.

    For each ``SOA_decile`` band, plots the difference between the share of
    representations and the share of population, and saves the figure to
    ``Paths.SUMMARY / "figs" / f"imd_decile-{rep}.pdf"``.

    Args:
        postcodes (pl.DataFrame): Postcode data with ``lsoa11`` and ``count``
            columns.
        rep (str): Name of the consultation document; used in the filename.
    """
    imd = pl.read_parquet(Paths.RAW / "imd_camb.parquet")
    pops = pl.read_parquet(Paths.RAW / "pops_camb.parquet")
    # Right join keeps every LSOA so bands with zero representations still
    # contribute population to the percentage denominators.
    imd = (
        postcodes.join(imd, left_on="lsoa11", right_on="LSOA", how="right")
        .join(pops, left_on="LSOA", right_on="LSOA 2021 Code")
        .group_by("SOA_decile")
        .agg(pl.col("count").sum(), pl.col("LSOA").count(), pl.col("Total").sum())
        .sort("SOA_decile")
        .with_columns(
            ((pl.col("count") / pl.col("count").sum()) * 100).alias("perc_count"),
            ((pl.col("Total") / pl.col("Total").sum()) * 100).alias("perc_pop"),
        )
        .with_columns((pl.col("perc_count") - pl.col("perc_pop")).alias("perc_diff"))
    )
    postcodes_pd = imd.to_pandas()
    # Red (most deprived) through blue (least deprived).
    colors = [
        "#d62728",
        "#9f4e64",
        "#6f76a0",
        "#478dbf",
        "#1f77b4",
    ]
    _, ax1 = plt.subplots(figsize=(8, 8))
    x = np.arange(len(postcodes_pd))
    ax1.bar(
        x,
        postcodes_pd["perc_diff"],
        edgecolor="none",
        color=colors,
        width=0.9,
    )
    # Set labels and ticks
    ax1.set_xlabel("Deprivation Quintile")
    ax1.set_ylabel("Difference from national average (%)")
    ax1.set_xticks(x)
    ax1.axhline(0, color="black", linewidth=1)
    plt.tight_layout()
    # NOTE(review): the column is named "SOA_decile" but exactly five labels
    # are set — presumably the data holds quintiles; confirm upstream, as a
    # 10-band input would leave bars unlabelled/mislabelled.
    ax1.set_xticklabels(["1 - Most Deprived", "2", "3", "4", "5 - Least Deprived"])
    plt.savefig(Paths.SUMMARY / "figs" / f"imd_decile-{rep}.pdf")
    # Close the figure so repeated report builds don't accumulate open
    # matplotlib figures (leaks memory and triggers warnings).
    plt.close()
def load_txt(file_path):
    """Read a UTF-8 text file and return its entire contents as a string."""
    with open(file_path, encoding="utf-8") as handle:
        return handle.read()
def build_final_report(out, rep):
    """Build the "Overview of Public Submissions" report for one document.

    Assembles a pandoc/Quarto markdown document from the pipeline output,
    writes it under ``Paths.SUMMARY``, renders it to PDF (prepending a cover
    page with pdftk) and to docx.

    Args:
        out (dict): Pipeline output; ``out["generate_final_report"]`` holds
            the summarised documents and policies.
        rep (str): Name of the consultation document being reported on.
    """
    introduction_paragraph = load_txt("planning_ai/documents/introduction.txt")
    figures_paragraph = load_txt("planning_ai/documents/figures.txt")
    themes_paragraph = load_txt("planning_ai/documents/themes.txt")

    final = out["generate_final_report"]
    responses = (
        pl.scan_parquet(Paths.STAGING / "gcpt3.parquet")
        .filter(pl.col("representations_document") == rep)
        .unique("id")
        .collect()
    )

    support_policies, object_policies, other_policies = _process_policies(final)
    postcodes, outside_pcs = _process_postcodes(final)
    stances = _process_stances(responses)
    themes = _process_themes(final)

    fig_wards(postcodes, rep)
    fig_oa(postcodes, rep)
    fig_imd(postcodes, rep)

    # Markdown table of postcodes that fall outside Greater Cambridge wards.
    outside_pcs = (
        outside_pcs.group_by("osward")
        .sum()[["osward", "count"]]
        .filter(pl.col("osward") != "Unknown")
        .rename(
            {"osward": "Ward, Local Authority", "count": "Number of Representations"}
        )
        .to_pandas()
        .to_markdown(index=False)
    )
    quarto_doc = (
        f"---\ntitle: '**Overview of Public Submissions: {rep}**'\n"
        r"""
mainfont: Liberation Sans
fontsize: 12pt
margin: 2cm
geometry: a4paper
header-includes: |
  \usepackage{graphicx}
  \usepackage{fancyhdr}
  \usepackage{geometry}
  \usepackage{sectsty}
  \geometry{top=1in, bottom=1in, left=1in, right=1in}
  \pagestyle{fancy}
  \fancyhead[L]{}
  \fancyhead[C]{}
  \fancyhead[R]{\includegraphics[width=3cm]{logo.png}}
  \fancyfoot[L]{}
  \fancyfoot[C]{}
  \fancyfoot[R]{\thepage}
  \renewcommand{\headrulewidth}{0pt}
  \renewcommand{\footrulewidth}{0pt}
  \linespread{1.2}
  \usepackage{titlesec}
  \usepackage{xcolor}
  \definecolor{uolblue}{HTML}{1F2B7D}
  \titleformat{\section}[block]{\normalfont\Large\bfseries\color{uolblue}}{}{0em}{}
  \titleformat{\subsection}[block]{\normalfont\large\bfseries\color{uolblue}}{}{0em}{}
  \fancypagestyle{plain}{\fancyhf{}\fancyfoot[R]{\thepage}\fancyhead[R]{\includegraphics[width=3cm]{logo.png}}}
  \usepackage{titling}
  \pretitle{\begin{flushleft}\Huge\color{uolblue}}
  \posttitle{\end{flushleft}\vspace{-2em}}
---
"""
        "\n# Executive Summary\n\n"
        f"{final['executive']}\n\n"
        f"There were a total of {len(responses):,} responses. Of these, representations left "
        "comment, or indicated the following support and objection of the consultation document:\n\n"
        f"{stances}\n\n"
        "# Introduction\n\n"
        f"{introduction_paragraph}\n\n"
        "\n# Profile of Submissions\n\n"
        f"{figures_paragraph}\n\n"
        f"![Total number of representations submitted by Ward within Greater Cambridgeshire](./data/out/summary/figs/wards-{rep}.pdf)\n\n"
        # Fixed: a stray literal "n" used to follow the table ("{outside_pcs}n\n").
        f": Postcodes outside the Greater Cambridge Ward areas\n\n{outside_pcs}\n\n"
        f"![The proportion of representations submitted by 2021 Output Area Classification^[Wyszomierski, J., Longley, P.A., Singleton, A.D., Gale, C. & O’Brien, O. (2024) A neighbourhood Output Area Classification from the 2021 and 2022 UK censuses. The Geographical Journal, 190, e12550. Available from: https://doi.org/10.1111/geoj.12550] relative to the national average](./data/out/summary/figs/oas-{rep}.pdf)\n\n"
        f"![The proportion of representations submitted by the Index of Multiple Deprivation (2019) relative to the national average](./data/out/summary/figs/imd_decile-{rep}.pdf)\n\n"
        r"\newpage"
        "\n\n# Themes and Policies\n\n"
        f"{themes_paragraph}\n\n"
        f": Breakdown of representations by section\n\n{themes}\n\n"
        "## Supporting Representations\n\n"
        "The following section presents a list of all points raised in representations that support the consultation document, grouped by sections.\n\n"
        f"{support_policies or '_No supporting representations._'}\n\n"
        "## Objecting Representations\n\n"
        "The following section presents a list of all points raised in representations that object to the consultation document, grouped by sections.\n\n"
        f"{object_policies or '_No objecting representations._'}\n\n"
        "## Comment\n\n"
        "The following section presents a list of all points raised in representations that do not support or object to the consultation document, grouped by sections.\n\n"
        f"{other_policies or '_No other representations._'}\n\n"
    )
    rep_joined = rep.replace(" ", "_")
    out_path = Paths.SUMMARY / f"Overview_of_Public_Submissions-{rep_joined}.md"
    out_file = Paths.SUMMARY / f"Overview_of_Public_Submissions-{rep_joined}"
    # Explicit UTF-8: the document contains non-ASCII characters and must not
    # depend on the platform's default encoding.
    with open(out_path, "w", encoding="utf-8") as f:
        f.write(quarto_doc)
    try:
        # Render markdown -> PDF; xelatex is required for the custom font.
        command = [
            "pandoc",
            f"{out_path}",
            "-o",
            f"{out_file}.pdf",
            "--pdf-engine=xelatex",
        ]
        subprocess.run(command, check=True, capture_output=True)
        # Prepend the cover page, then move the combined PDF into place.
        command = [
            "pdftk",
            "data/covers/Overview_of_Public_Submissions.pdf",
            f"{out_file}.pdf",
            "cat",
            "output",
            f"{out_file}-cover.pdf",
        ]
        subprocess.run(command, check=True, capture_output=True)
        command = ["mv", "-f", f"{out_file}-cover.pdf", f"{out_file}.pdf"]
        subprocess.run(command, check=True, capture_output=True)
        # Also render a docx (no cover page; styled via the reference doc).
        command = [
            "pandoc",
            f"{out_path}",
            "-o",
            f"{out_file}.docx",
            "--reference-doc=data/covers/reference.docx",
        ]
        subprocess.run(command, check=True, capture_output=True)
    except subprocess.CalledProcessError as e:
        # Include captured stderr; the exception alone hides the tool's error.
        logging.error(
            "Error during Overview_of_Public_Submissions render: %s; stderr: %s",
            e,
            e.stderr,
        )
def build_summaries_document(out, rep):
    """Build the "Summaries of Public Submissions" companion document.

    Concatenates the per-representation summaries (stripping internal
    "Document ID: [n]" markers), writes the markdown under ``Paths.SUMMARY``,
    and renders it to PDF (prepending a cover page with pdftk) and to docx.

    Args:
        out (dict): Pipeline output; ``out["generate_final_report"]`` holds
            the summarised documents.
        rep (str): Name of the consultation document being reported on.
    """
    # Compile once; the pattern is applied to every summary below.
    doc_id_pattern = re.compile(r"Document ID: \[\d+\]\n\n")
    summary_intro = load_txt("planning_ai/documents/summary_intro.txt")
    full_text = "".join(
        f"**Part ID**: {document['doc_id']}\n\n"
        f"**Representations ID**: {document['document'].metadata['id']}\n\n"
        f"\n\n{doc_id_pattern.sub('', document['summary'])}\n\n"
        "---\n\n"
        for document in out["generate_final_report"]["documents"]
    )
    header = (
        f"---\ntitle: '**Summaries of Public Submissions: {rep}**'\n"
        r"""
mainfont: Liberation Sans
fontsize: 12pt
margin: 2cm
geometry: a4paper
header-includes: |
  \usepackage{graphicx}
  \usepackage{fancyhdr}
  \usepackage{geometry}
  \usepackage{sectsty}
  \geometry{top=1in, bottom=1in, left=1in, right=1in}
  \pagestyle{fancy}
  \fancyhead[L]{}
  \fancyhead[C]{}
  \fancyhead[R]{\includegraphics[width=3cm]{logo.png}}
  \fancyfoot[L]{}
  \fancyfoot[C]{}
  \fancyfoot[R]{\thepage}
  \renewcommand{\headrulewidth}{0pt}
  \renewcommand{\footrulewidth}{0pt}
  \linespread{1.2}
  \usepackage{titlesec}
  \usepackage{xcolor}
  \definecolor{uolblue}{HTML}{1F2B7D}
  \titleformat{\section}[block]{\normalfont\Large\bfseries\color{uolblue}}{}{0em}{}
  \titleformat{\subsection}[block]{\normalfont\large\bfseries\color{uolblue}}{}{0em}{}
  \fancypagestyle{plain}{\fancyhf{}\fancyfoot[R]{\thepage}\fancyhead[R]{\includegraphics[width=3cm]{logo.png}}}
  \usepackage{titling}
  \pretitle{\begin{flushleft}\Huge\color{uolblue}}
  \posttitle{\end{flushleft}\vspace{-2em}}
---
"""
        f"\n{summary_intro}\n\n"
    )
    rep_joined = rep.replace(" ", "_")
    out_path = Paths.SUMMARY / f"Summaries_of_Public_Submissions-{rep_joined}.md"
    out_file = Paths.SUMMARY / f"Summaries_of_Public_Submissions-{rep_joined}"
    # Explicit UTF-8: summaries may contain non-ASCII characters and must not
    # depend on the platform's default encoding.
    with open(out_path, "w", encoding="utf-8") as f:
        f.write(f"{header}{full_text}")
    try:
        # Render markdown -> PDF; xelatex is required for the custom font.
        command = [
            "pandoc",
            f"{out_path}",
            "-o",
            f"{out_file}.pdf",
            "--pdf-engine=xelatex",
        ]
        subprocess.run(command, check=True, capture_output=True)
        # Prepend the cover page, then move the combined PDF into place.
        command = [
            "pdftk",
            "data/covers/Summaries_of_Public_Submissions.pdf",
            f"{out_file}.pdf",
            "cat",
            "output",
            f"{out_file}-cover.pdf",
        ]
        subprocess.run(command, check=True, capture_output=True)
        command = ["mv", "-f", f"{out_file}-cover.pdf", f"{out_file}.pdf"]
        subprocess.run(command, check=True, capture_output=True)
        # Also render a docx (no cover page; styled via the reference doc).
        command = [
            "pandoc",
            f"{out_path}",
            "-o",
            f"{out_file}.docx",
            "--reference-doc=data/covers/reference.docx",
        ]
        subprocess.run(command, check=True, capture_output=True)
    except subprocess.CalledProcessError as e:
        # Include captured stderr; the exception alone hides the tool's error.
        logging.error("Error during render: %s; stderr: %s", e, e.stderr)