hathawayj's picture
ismall edits
d3b39b6
raw
history blame
1.99 kB
# %%
import polars as pl
# Notice that the world health leaves missing as blanks in the csv. We need to explain that blanks aren't strings but missing values.
dat = pl.read_csv("../API_Download_DS2_en_csv_v2_5657328.csv", skip_rows=4, null_values = "")
# We don't like the World Banks wide format. Let's clean it upt to long format.
dat_long = dat.melt(id_vars=["Country Name", "Country Code", "Indicator Name", "Indicator Code"])
# no we need to fix the year column and give it a better name.
# we could have fixed the name as an argument in `.melt()` as well.
# https://docs.pola.rs/user-guide/expressions/casting/#overflow
dat_long = dat_long\
.with_columns(
pl.col("variable").cast(pl.Int64, strict=False).alias("variable"),
pl.col("value").cast(pl.Float32, strict=False).alias("value"))\
.rename({"variable":"year"})\
.filter(pl.col("value").is_not_null())
# %%
# Can we split out the information in the indicator Code
indicator_columns = dat_long\
.select(
pl.col("Indicator Code"),
pl.col("Indicator Code")\
# split string: example VC.IDP.TOCV into list object ["VC", "IDP", "TOCV"]
.str.split_exact(".", 6).alias("split")).unnest("split")\
.unique()
# What should we call these columns?
# https://datahelpdesk.worldbank.org/knowledgebase/articles/201175-how-does-the-world-bank-code-its-indicators
new_names = {"field_0":"topic", "field_1":"general_subj", "field_2":"specific_subj",
"field_3":"ext_1", "field_4":"ext_2", "field_5":"ext_3", "field_6":"ext_4"}
indicator_columns = indicator_columns.rename(new_names)
# %%
# now we need to finalize our munge and write our data
dat_final = dat_long.join(indicator_columns, how="left", on="Indicator Code")
# Now I want to reorder the columns
names = dat_final.columns
new_order = [1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 3, 0]
name_order = [names[i] for i in new_order]
dat_final = dat_final.select(name_order)
# %%
# write data
dat_final.write_csv("../dat_munged.csv")
# %%