|
defmodule MedicalTranscription.Coding.VectorPrecomputation do
  @moduledoc """
  Populates the database with vector embeddings computed from the downloaded ICD-9 code list.

  The code list is read from `icd9_codelist.csv` inside the directory returned by
  `AudioTagger.SampleData.cache_dir/0`. Codes for which
  `MedicalTranscription.Coding.exists_for_code?/1` returns a truthy value are skipped.
  """

  alias MedicalTranscription.Coding

  @doc """
  Downloads the ICD-9 code list, calculates vector embeddings for each code, and adds them
  to the database.

  When the CSV file is not present yet, `AudioTagger.SampleData.get_icd9_code_list_csv/0` is
  called first to obtain it. Status messages, a progress bar, and the elapsed time are
  written to standard output.

  Returns `:ok`.
  """
  @spec run() :: :ok
  def run do
    if File.exists?(csv_file()) do
      IO.puts("CSV file found. Precomputing vectors...")
    else
      IO.puts("CSV file not found. Downloading and preparing...")
      AudioTagger.SampleData.get_icd9_code_list_csv()

      IO.puts("Precomputing vectors...")
    end

    precompute_vectors()

    :ok
  end

  # Walks every row of the CSV that has a code, stores a vector for each code that is not
  # already known, and reports progress plus the total elapsed time.
  defp precompute_vectors do
    time_start = System.monotonic_time()
    df = load_dataframe_from_csv()

    # The progress total is the raw row count, taken before rows without a code are dropped.
    num_rows = Explorer.DataFrame.n_rows(df)
    ProgressBar.render(0, num_rows, suffix: :count)

    df
    |> Explorer.DataFrame.to_rows_stream()
    |> Stream.filter(&code_present?/1)
    |> Stream.with_index()
    |> Enum.each(fn {%{"code" => code, "long_description" => description}, index} ->
      if !Coding.exists_for_code?(code) do
        compute_vector_for_code(code, description)
      end

      ProgressBar.render(index + 1, num_rows, suffix: :count)
    end)

    # Dropped rows can leave the loop short of the total, so complete the bar explicitly.
    ProgressBar.render(num_rows, num_rows, suffix: :count)
    time_end = System.monotonic_time()

    IO.puts(
      "Finished in #{System.convert_time_unit(time_end - time_start, :native, :millisecond)}ms"
    )
  end

  # Returns true only when the row's "code" value is a non-empty string.
  #
  # The `is_binary/1` check keeps a missing (`nil`) code from raising, which
  # `String.length/1` would do; comparing against "" also avoids counting graphemes.
  # NOTE(review): presumably Explorer yields `nil` for empty CSV fields - confirm.
  defp code_present?(%{"code" => code}), do: is_binary(code) and code != ""

  # Reads the cached CSV into a dataframe, forcing both columns to be parsed as strings.
  # Raises a `MatchError` if the CSV cannot be loaded.
  defp load_dataframe_from_csv do
    {:ok, df} =
      Explorer.DataFrame.from_csv(
        csv_file(),
        dtypes: [
          {"code", :string},
          {"long_description", :string}
        ]
      )

    df
  end

  # Computes the vector for `description` and inserts it together with the code and the
  # description itself.
  defp compute_vector_for_code(code, description) do
    vector_for_db = Coding.compute_vector_as_list(description)

    Coding.insert_vector(%{
      code: code,
      description: description,
      description_vector: vector_for_db
    })
  end

  # Path of the ICD-9 code list CSV inside the sample-data cache directory.
  defp csv_file do
    AudioTagger.SampleData.cache_dir()
    |> Path.join("icd9_codelist.csv")
  end
end
|
|