Spaces:
Runtime error
Runtime error
# Build a random sample of unique Wellcome grant descriptions.
#
# Reads the raw grants CSV, keeps only the "Description" column, discards
# placeholder/missing/duplicate descriptions, draws a random sample of at
# most SAMPLE_SIZE rows and writes the result to OUTPUT_FILE.
from pathlib import Path

import numpy as np
import pandas as pd

INPUT_FILE = "data/raw/Wellcome-grants-awarded-1-October-2005-to-04-05-2022.csv"
OUTPUT_FILE = "data/processed/wellcome_grant_descriptions.csv"
# Upper bound on the number of descriptions written to OUTPUT_FILE.
SAMPLE_SIZE = 1000

print(f"Reading data from {INPUT_FILE}")
data = pd.read_csv(INPUT_FILE)

# Treat the "Not available" placeholder as missing so that dropna() removes
# it together with genuinely empty cells, then keep one copy of each text.
data = (
    data[["Description"]]
    .replace("Not available", np.nan)
    .dropna()
    .drop_duplicates()
)

# DataFrame.sample draws without replacement by default and raises
# ValueError when asked for more rows than exist, so cap the request at the
# number of rows that survived cleaning. No random_state is passed, so the
# selection differs between runs. The index is reset after sampling so the
# resulting frame is numbered 0..n-1.
data = data.sample(min(SAMPLE_SIZE, len(data))).reset_index(drop=True)

print(f"Number of rows: {data.shape[0]}")
print(f"Number of unique rows: {data['Description'].nunique()}")

print(f"Saving file to {OUTPUT_FILE}")
# to_csv does not create missing directories; make sure the target exists.
Path(OUTPUT_FILE).parent.mkdir(parents=True, exist_ok=True)
data.to_csv(OUTPUT_FILE, index=False)