LocationFinder / src /subset_data.py
mattupson's picture
new: First version
65e9efa unverified
raw
history blame
604 Bytes
from pathlib import Path

import numpy as np
import pandas as pd
# Raw CSV to read and the processed CSV the sampled descriptions are written to.
INPUT_FILE = "data/raw/Wellcome-grants-awarded-1-October-2005-to-04-05-2022.csv"
OUTPUT_FILE = "data/processed/wellcome_grant_descriptions.csv"
# Upper bound on the number of descriptions kept in the subset.
SAMPLE_SIZE = 1000
# Seed passed to DataFrame.sample. None keeps the draw non-deterministic;
# set an int to make the subset reproducible between runs.
RANDOM_STATE = None

print(f"Reading data from {INPUT_FILE}")
data = pd.read_csv(INPUT_FILE)

# Keep only the "Description" column, treat the "Not available" placeholder as
# missing, then drop missing and duplicated descriptions.
data = (
    data[["Description"]]
    .replace("Not available", np.nan)
    .dropna()
    .drop_duplicates()
    .reset_index(drop=True)
)

# DataFrame.sample (without replacement) raises ValueError when asked for more
# rows than exist, so cap the request at the number of rows available. The
# "Number of rows" line below reports how many were actually kept.
data = data.sample(n=min(SAMPLE_SIZE, len(data)), random_state=RANDOM_STATE)

print(f"Number of rows: {data.shape[0]}")
print(f"Number of unique rows: {data['Description'].nunique()}")

print(f"Saving file to {OUTPUT_FILE}")
# to_csv does not create missing directories; make sure the target exists.
Path(OUTPUT_FILE).parent.mkdir(parents=True, exist_ok=True)
data.to_csv(OUTPUT_FILE, index=False)