noahsettersten committed on
Commit
fd1b984
1 Parent(s): e64366f

chore: Move vector precomputation into separate module

Browse files
lib/medical_transcription/coding/vector_precomputation.ex ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
defmodule MedicalTranscription.Coding.VectorPrecomputation do
  @moduledoc """
  Populate database with vector embeddings from downloaded ICD-9 code list.

  The code list is read from `icd9_codelist.csv` inside
  `AudioTagger.SampleData.cache_dir/0`. Each row that carries a code without an
  existing `MedicalTranscription.CodeVector` entry is embedded and inserted.
  """

  alias MedicalTranscription.CodeVector

  @doc """
  Downloads the ICD-9 codelist, calculates vector embeddings for each, and adds them to the database.

  The download step only runs when the CSV file is not already present on disk.
  Progress and total elapsed time are written to standard output.

  Always returns `:ok`; failures in the underlying steps raise.
  """
  @spec run() :: :ok
  def run() do
    if File.exists?(csv_file()) do
      IO.puts("CSV file found. Precomputing vectors...")
    else
      IO.puts("CSV file not found. Downloading and preparing...")
      AudioTagger.SampleData.get_icd9_code_list_csv()

      IO.puts("Precomputing vectors...")
    end

    precompute_vectors()

    :ok
  end

  # Loads the CSV, prepares the embedding model once, then walks every row,
  # inserting a vector for each code that does not have one yet.
  defp precompute_vectors() do
    time_start = System.monotonic_time()
    df = load_dataframe_from_csv()
    model_tuple = AudioTagger.Classifier.SemanticSearch.prepare_model()

    # The total counts every row in the file, including rows the filter below
    # drops, so the bar may only reach 100% on the final render after the loop.
    num_rows = Explorer.DataFrame.n_rows(df)
    render_progress(0, num_rows)

    df
    |> Explorer.DataFrame.to_rows_stream()
    |> Stream.filter(&code_present?/1)
    |> Stream.with_index()
    |> Enum.each(fn {%{"code" => code, "long_description" => description}, index} ->
      # Skip codes that are already stored so the task can be re-run cheaply.
      if !CodeVector.exists_for_code?(code) do
        compute_vector_for_code(model_tuple, code, description)
      end

      render_progress(index + 1, num_rows)
    end)

    render_progress(num_rows, num_rows)
    time_end = System.monotonic_time()

    IO.puts(
      "Finished in #{System.convert_time_unit(time_end - time_start, :native, :millisecond)}ms"
    )
  end

  # True when the row has a non-empty string in its "code" column.
  # NOTE(review): presumably blank CSV cells arrive as `nil` rather than "" -
  # confirm against the CSV reader; both are rejected here either way.
  defp code_present?(%{"code" => code}), do: is_binary(code) and code != ""

  # Draws the progress bar with a count suffix.
  # NOTE(review): a zero total (empty CSV) is skipped on the assumption that the
  # renderer computes a percentage by dividing by the total - confirm.
  defp render_progress(_current, 0), do: :ok
  defp render_progress(current, total), do: ProgressBar.render(current, total, suffix: :count)

  # Reads the cached CSV into a dataframe, forcing both columns to strings.
  # The assertive match raises if the file cannot be parsed.
  defp load_dataframe_from_csv() do
    {:ok, df} =
      Explorer.DataFrame.from_csv(
        csv_file(),
        dtypes: [
          {"code", :string},
          {"long_description", :string}
        ]
      )

    df
  end

  # Embeds a single description and stores it, with its code, via
  # `CodeVector.insert_vector/1`.
  defp compute_vector_for_code({model_info, tokenizer}, code, description) do
    vector =
      AudioTagger.Vectors.embed_with_model(model_info, tokenizer, [description])

    # The pooled state is flattened to a plain list before insertion.
    vector_for_db = Nx.to_flat_list(vector.pooled_state)

    CodeVector.insert_vector(%{
      code: code,
      description: description,
      description_vector: vector_for_db
    })
  end

  # Path of the ICD-9 code list inside the sample-data cache directory.
  defp csv_file() do
    Path.join(AudioTagger.SampleData.cache_dir(), "icd9_codelist.csv")
  end
end
lib/mix/build_code_vectors.ex CHANGED
@@ -2,83 +2,12 @@ defmodule Mix.Tasks.BuildCodeVectors do
2
  @moduledoc "Populate database with vector embeddings from downloaded ICD-9 code list"
3
 
4
  use Mix.Task
5
- alias MedicalTranscription.CodeVector
6
 
7
  @shortdoc "Downloads the ICD-9 codelist, calculates vector embeddings for each, and adds them to the database"
8
  def run(_args) do
9
  Mix.Task.run("app.start")
10
  Logger.configure(level: :info)
11
 
12
- if File.exists?(csv_file()) do
13
- IO.puts("CSV file found. Precomputing vectors...")
14
- else
15
- IO.puts("CSV file not found. Downloading and preparing...")
16
- AudioTagger.SampleData.get_icd9_code_list_csv()
17
-
18
- IO.puts("Precomputing vectors...")
19
- end
20
-
21
- precompute_vectors()
22
-
23
- :ok
24
- end
25
-
26
- defp precompute_vectors() do
27
- time_start = System.monotonic_time()
28
- df = load_dataframe_from_csv()
29
- model_tuple = AudioTagger.Classifier.SemanticSearch.prepare_model()
30
-
31
- num_rows = Explorer.DataFrame.n_rows(df)
32
- ProgressBar.render(0, num_rows, suffix: :count)
33
-
34
- df
35
- |> Explorer.DataFrame.to_rows_stream()
36
- |> Stream.filter(fn %{"code" => code} -> String.length(code) > 0 end)
37
- |> Stream.with_index()
38
- |> Enum.each(fn {%{"code" => code, "long_description" => description}, index} ->
39
- if !CodeVector.exists_for_code?(code) do
40
- compute_vector_for_code(model_tuple, code, description)
41
- end
42
-
43
- ProgressBar.render(index + 1, num_rows, suffix: :count)
44
- end)
45
-
46
- ProgressBar.render(num_rows, num_rows, suffix: :count)
47
- time_end = System.monotonic_time()
48
-
49
- IO.puts(
50
- "Finished in #{System.convert_time_unit(time_end - time_start, :native, :millisecond)}ms"
51
- )
52
- end
53
-
54
- defp load_dataframe_from_csv() do
55
- {:ok, df} =
56
- Explorer.DataFrame.from_csv(
57
- csv_file(),
58
- dtypes: [
59
- {"code", :string},
60
- {"long_description", :string}
61
- ]
62
- )
63
-
64
- df
65
- end
66
-
67
- defp compute_vector_for_code({model_info, tokenizer}, code, description) do
68
- vector =
69
- AudioTagger.Vectors.embed_with_model(model_info, tokenizer, [description])
70
-
71
- vector_for_db = Nx.to_flat_list(vector.pooled_state)
72
-
73
- CodeVector.insert_vector(%{
74
- code: code,
75
- description: description,
76
- description_vector: vector_for_db
77
- })
78
- end
79
-
80
- defp csv_file() do
81
- AudioTagger.SampleData.cache_dir()
82
- |> Path.join("icd9_codelist.csv")
83
  end
84
  end
 
2
  @moduledoc "Populate database with vector embeddings from downloaded ICD-9 code list"
3
 
4
  use Mix.Task
 
5
 
6
  @shortdoc "Downloads the ICD-9 codelist, calculates vector embeddings for each, and adds them to the database"
7
  def run(_args) do
8
  Mix.Task.run("app.start")
9
  Logger.configure(level: :info)
10
 
11
+ MedicalTranscription.Coding.VectorPrecomputation.run()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  end
13
  end