"""Sample unique grant descriptions from a raw CSV into a processed CSV.

Run as a script, this reads ``n_docs`` from ``params.yaml`` and writes a
random sample of that many cleaned ``Description`` values from
``INPUT_FILE`` to ``OUTPUT_FILE``.
"""
from pathlib import Path
from typing import Optional

import numpy as np
import pandas as pd
import yaml


def load_config(config_file: str) -> dict:
    """Load a YAML configuration file.

    Args:
        config_file: Path to the YAML file to read.

    Returns:
        The parsed document, exactly as returned by ``yaml.safe_load``.
    """
    # Explicit encoding so parsing does not depend on the platform locale.
    with open(config_file, encoding="utf-8") as f:
        config = yaml.safe_load(f)
    return config


INPUT_FILE = "data/raw/Wellcome-grants-awarded-1-October-2005-to-04-05-2022.csv"
OUTPUT_FILE = "data/processed/wellcome_grant_descriptions.csv"


def subset_docs(
    input_file: str,
    output_file: str,
    sample: int,
    random_state: Optional[int] = None,
) -> None:
    """Write a random sample of unique descriptions from a CSV to a new CSV.

    Only the ``Description`` column is kept. Values equal to
    ``"Not available"`` are treated as missing; missing and duplicate rows
    are dropped before sampling.

    Args:
        input_file: Path of the CSV to read; must have a ``Description``
            column.
        output_file: Path of the CSV to write. Its parent directory is
            created if it does not exist.
        sample: Number of rows to draw, without replacement.
        random_state: Optional seed forwarded to ``DataFrame.sample`` to make
            the draw reproducible. ``None`` (the default) gives a different
            sample on each run.

    Raises:
        ValueError: If ``sample`` exceeds the number of rows left after
            cleaning.
    """
    print(f"Reading data from {input_file}")
    data = pd.read_csv(input_file)
    data = (
        data[["Description"]]
        .replace("Not available", np.nan)
        .dropna()
        .drop_duplicates()
        .reset_index(drop=True)
    )

    # Fail before sampling with a message that names both counts; pandas
    # would raise ValueError here too, but without saying how many rows
    # were actually available.
    available = len(data)
    if sample > available:
        raise ValueError(
            f"Cannot sample {sample} rows: only {available} unique "
            "descriptions are available after cleaning"
        )
    data = data.sample(sample, random_state=random_state)

    print(f"Number of rows: {data.shape[0]}")
    print(f"Number of unique rows: {data['Description'].nunique()}")
    print(f"Saving file to {output_file}")
    # to_csv does not create missing directories, so make sure the target
    # directory exists first.
    Path(output_file).parent.mkdir(parents=True, exist_ok=True)
    data.to_csv(output_file, index=False)


if __name__ == "__main__":
    params = load_config("params.yaml")
    subset_docs(INPUT_FILE, OUTPUT_FILE, sample=params["n_docs"])