defmodule MedicalTranscription.Coding.VectorPrecomputation do
  @moduledoc """
  Populates the database with vector embeddings from the downloaded ICD-9 code list.
  """

  alias MedicalTranscription.Coding

  @doc """
  Downloads the ICD-9 code list, calculates vector embeddings for each, and adds them to the
  database.

  The download step only runs when the cached CSV file does not exist yet. Rows whose code
  is missing or empty are ignored, and rows for which `Coding.exists_for_code?/1` returns a
  truthy value are not recomputed.

  Progress and the total elapsed time are printed to standard output. Returns `:ok`.
  """
  @spec run() :: :ok
  def run do
    if File.exists?(csv_file()) do
      IO.puts("CSV file found. Precomputing vectors...")
    else
      IO.puts("CSV file not found. Downloading and preparing...")
      AudioTagger.SampleData.get_icd9_code_list_csv()
      IO.puts("Precomputing vectors...")
    end

    precompute_vectors()

    :ok
  end

  # Walks every row of the code list, stores a vector for each code that does not have one
  # yet, and reports progress plus the elapsed wall-clock time.
  defp precompute_vectors do
    time_start = System.monotonic_time()

    df = load_dataframe_from_csv()
    num_rows = Explorer.DataFrame.n_rows(df)

    ProgressBar.render(0, num_rows, suffix: :count)

    df
    |> Explorer.DataFrame.to_rows_stream()
    |> Stream.filter(&code_present?/1)
    |> Stream.with_index(1)
    |> Enum.each(fn {%{"code" => code, "long_description" => description}, position} ->
      if !Coding.exists_for_code?(code) do
        compute_vector_for_code(code, description)
      end

      ProgressBar.render(position, num_rows, suffix: :count)
    end)

    # Filtered-out rows never advance the counter above, so force the bar to its final state.
    ProgressBar.render(num_rows, num_rows, suffix: :count)

    time_end = System.monotonic_time()

    IO.puts(
      "Finished in #{System.convert_time_unit(time_end - time_start, :native, :millisecond)}ms"
    )
  end

  # Returns true only for rows carrying a non-empty string code. A blank CSV cell may be
  # loaded as `nil` rather than `""`; `String.length/1` would raise on `nil`, so the type is
  # checked explicitly and such rows are skipped instead of aborting the whole run.
  defp code_present?(%{"code" => code}), do: is_binary(code) and code != ""

  # Reads the cached CSV into a dataframe, forcing both columns used here to strings.
  # Crashes with a match error if the file cannot be read or parsed.
  defp load_dataframe_from_csv do
    {:ok, df} =
      Explorer.DataFrame.from_csv(
        csv_file(),
        dtypes: [
          {"code", :string},
          {"long_description", :string}
        ]
      )

    df
  end

  # Computes the vector for `description` and inserts it alongside the code and description.
  defp compute_vector_for_code(code, description) do
    vector_for_db = Coding.compute_vector_as_list(description)

    Coding.insert_vector(%{
      code: code,
      description: description,
      description_vector: vector_for_db
    })
  end

  # Path of the ICD-9 code list CSV inside the sample-data cache directory.
  defp csv_file do
    Path.join(AudioTagger.SampleData.cache_dir(), "icd9_codelist.csv")
  end
end