Spaces:

ds460
/

docker_streamlit

Sleeping

App Files Files Community

docker_streamlit / scripts_build /01_read.py

hathawayj

ismall edits

d3b39b6 6 months ago

raw

history blame

1.99 kB

	# %%
	import polars as pl
	# Notice that the world health leaves missing as blanks in the csv. We need to explain that blanks aren't strings but missing values.

	dat = pl.read_csv("../API_Download_DS2_en_csv_v2_5657328.csv", skip_rows=4, null_values = "")
	# We don't like the World Banks wide format. Let's clean it upt to long format.
	dat_long = dat.melt(id_vars=["Country Name", "Country Code", "Indicator Name", "Indicator Code"])
	# no we need to fix the year column and give it a better name.
	# we could have fixed the name as an argument in `.melt()` as well.
	# https://docs.pola.rs/user-guide/expressions/casting/#overflow
	dat_long = dat_long\
	.with_columns(
	pl.col("variable").cast(pl.Int64, strict=False).alias("variable"),
	pl.col("value").cast(pl.Float32, strict=False).alias("value"))\
	.rename({"variable":"year"})\
	.filter(pl.col("value").is_not_null())


	# %%
	# Can we split out the information in the indicator Code
	indicator_columns = dat_long\
	.select(
	pl.col("Indicator Code"),
	pl.col("Indicator Code")\
	# split string: example VC.IDP.TOCV into list object ["VC", "IDP", "TOCV"]
	.str.split_exact(".", 6).alias("split")).unnest("split")\
	.unique()


	# What should we call these columns?
	# https://datahelpdesk.worldbank.org/knowledgebase/articles/201175-how-does-the-world-bank-code-its-indicators
	new_names = {"field_0":"topic", "field_1":"general_subj", "field_2":"specific_subj",
	"field_3":"ext_1", "field_4":"ext_2", "field_5":"ext_3", "field_6":"ext_4"}
	indicator_columns = indicator_columns.rename(new_names)

	# %%
	# now we need to finalize our munge and write our data
	dat_final = dat_long.join(indicator_columns, how="left", on="Indicator Code")
	# Now I want to reorder the columns
	names = dat_final.columns
	new_order = [1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 3, 0]
	name_order = [names[i] for i in new_order]
	dat_final = dat_final.select(name_order)

	# %%
	# write data
	dat_final.write_csv("../dat_munged.csv")



	# %%