# medicode/lib/medical_transcription/coding/vector_precomputation.ex
# noahsettersten's picture
# chore: Address Credo messages
# 59fddbd
# raw
# history blame
# 2.04 kB
defmodule MedicalTranscription.Coding.VectorPrecomputation do
  @moduledoc """
  Populates the database with vector embeddings computed from the downloaded ICD-9 code list.
  """

  alias MedicalTranscription.Coding

  @doc """
  Downloads the ICD-9 code list, calculates vector embeddings for each, and adds them to the
  database.

  If the CSV file is not yet present in the cache directory it is fetched first via
  `AudioTagger.SampleData.get_icd9_code_list_csv/0`. Codes for which
  `Coding.exists_for_code?/1` already reports an entry are skipped, so the task can be
  re-run after an interruption.

  Progress and the elapsed time are printed to standard output. Returns `:ok`.
  """
  @spec run() :: :ok
  def run do
    if File.exists?(csv_file()) do
      IO.puts("CSV file found. Precomputing vectors...")
    else
      IO.puts("CSV file not found. Downloading and preparing...")
      AudioTagger.SampleData.get_icd9_code_list_csv()
      IO.puts("Precomputing vectors...")
    end

    precompute_vectors()

    :ok
  end

  # Walks every row of the code list, inserting a vector for each code that is not yet stored,
  # and reports progress plus the total elapsed time.
  defp precompute_vectors do
    time_start = System.monotonic_time()

    df = load_dataframe_from_csv()
    num_rows = Explorer.DataFrame.n_rows(df)

    ProgressBar.render(0, num_rows, suffix: :count)

    df
    |> Explorer.DataFrame.to_rows_stream()
    |> Stream.filter(&code_present?/1)
    |> Stream.with_index(1)
    |> Enum.each(fn {%{"code" => code, "long_description" => description}, position} ->
      if !Coding.exists_for_code?(code) do
        compute_vector_for_code(code, description)
      end

      ProgressBar.render(position, num_rows, suffix: :count)
    end)

    # Rows without a code are filtered out above, so the counter can finish below
    # `num_rows`; render once more so the bar always ends at 100%.
    ProgressBar.render(num_rows, num_rows, suffix: :count)

    time_end = System.monotonic_time()

    IO.puts(
      "Finished in #{System.convert_time_unit(time_end - time_start, :native, :millisecond)}ms"
    )
  end

  # Returns true when the row carries a non-empty code.
  #
  # Explorer represents missing values as `nil` (a blank CSV field may be read that way), so
  # `nil` is handled explicitly instead of being handed to a `String` function, which would
  # raise and abort the whole run.
  defp code_present?(%{"code" => nil}), do: false
  defp code_present?(%{"code" => code}) when is_binary(code), do: code != ""

  # Reads the cached CSV into a dataframe, forcing both columns to strings.
  # Failure to read the file is treated as a bug and crashes via the assertive match.
  defp load_dataframe_from_csv do
    {:ok, df} =
      Explorer.DataFrame.from_csv(
        csv_file(),
        dtypes: [
          {"code", :string},
          {"long_description", :string}
        ]
      )

    df
  end

  # Computes the embedding for `description` and stores it alongside the code.
  defp compute_vector_for_code(code, description) do
    vector_for_db = Coding.compute_vector_as_list(description)

    Coding.insert_vector(%{
      code: code,
      description: description,
      description_vector: vector_for_db
    })
  end

  # Location of the ICD-9 code list inside the sample-data cache directory.
  defp csv_file do
    AudioTagger.SampleData.cache_dir()
    |> Path.join("icd9_codelist.csv")
  end
end