noahsettersten committed on
Commit
a7feaeb
1 Parent(s): b01bac2

refactor: Move CodeSearcher and CodeVector into Coding context

Browse files
lib/medical_transcription/code_searcher.ex DELETED
@@ -1,20 +0,0 @@
1
- defmodule MedicalTranscription.CodeSearcher do
2
- @moduledoc """
3
- Takes a portion of text and searches for closely matching results within a list of vectors, by using AudioTagger's
4
- SemanticSearch module.
5
- """
6
-
7
- alias MedicalTranscription.{CodeVector, Utilities}
8
-
9
- def process_chunk(text, opts \\ []) do
10
- k = Keyword.get(opts, :num_results, 5)
11
-
12
- search_vector_for_db = Utilities.compute_vector_as_list(text)
13
-
14
- CodeVector.find_similar(search_vector_for_db, k)
15
-
16
- # -- Remove matches that don't exceed a given threshold
17
- # TODO: This depends on receiving a similarity score from the Postgres query
18
- # |> Enum.filter(fn {_index, score} -> score >= similarity_threshold end)
19
- end
20
- end
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
lib/medical_transcription/{code_vector.ex → coding.ex} RENAMED
@@ -1,28 +1,19 @@
1
- defmodule MedicalTranscription.CodeVector do
2
  @moduledoc """
3
- Represents a code and its description, along with a vector embedding for its description.
 
4
  """
5
- use Ecto.Schema
6
- alias Ecto.Changeset
7
- alias MedicalTranscription.Repo
8
 
 
9
  import Ecto.Query
10
  import Pgvector.Ecto.Query
11
 
12
- schema "code_vectors" do
13
- field :code, :string
14
- field :description, :string
15
- field :description_vector, Pgvector.Ecto.Vector
16
- end
17
-
18
- def changeset(code_vector, params \\ %{}) do
19
- code_vector
20
- |> Changeset.cast(params, [:code, :description, :description_vector])
21
- |> Changeset.validate_required([:code, :description, :description_vector])
22
- end
23
 
24
  def insert_vector(params) do
25
- changeset = changeset(%__MODULE__{}, params)
 
 
26
 
27
  case Repo.insert(changeset) do
28
  {:ok, _} ->
@@ -42,20 +33,40 @@ defmodule MedicalTranscription.CodeVector do
42
 
43
  def exists_for_code?(code) do
44
  Repo.exists?(
45
- from v in __MODULE__,
46
  where: v.code == ^code
47
  )
48
  end
49
 
50
  def icd9_present?() do
51
- num_rows = Repo.aggregate(__MODULE__, :count)
52
 
53
  num_rows >= 14_567
54
  end
55
 
56
- def find_similar(search_vector, limit \\ 5) do
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
  Repo.all(
58
- from v in __MODULE__,
59
  order_by: cosine_distance(v.description_vector, ^search_vector),
60
  limit: ^limit
61
  )
 
1
+ defmodule MedicalTranscription.Coding do
2
  @moduledoc """
3
+ Takes a portion of text and searches for closely matching results within a list of vectors, by using AudioTagger's
4
+ SemanticSearch module.
5
  """
 
 
 
6
 
7
+ alias MedicalTranscription.Repo
8
  import Ecto.Query
9
  import Pgvector.Ecto.Query
10
 
11
+ alias MedicalTranscription.Coding.CodeVector
 
 
 
 
 
 
 
 
 
 
12
 
13
  def insert_vector(params) do
14
+ changeset =
15
+ %CodeVector{}
16
+ |> CodeVector.changeset(params)
17
 
18
  case Repo.insert(changeset) do
19
  {:ok, _} ->
 
33
 
34
  def exists_for_code?(code) do
35
  Repo.exists?(
36
+ from v in CodeVector,
37
  where: v.code == ^code
38
  )
39
  end
40
 
41
  def icd9_present?() do
42
+ num_rows = Repo.aggregate(CodeVector, :count)
43
 
44
  num_rows >= 14_567
45
  end
46
 
47
+ def process_chunk(text, opts \\ []) do
48
+ k = Keyword.get(opts, :num_results, 5)
49
+
50
+ search_vector_for_db = compute_vector_as_list(text)
51
+
52
+ find_similar(search_vector_for_db, k)
53
+
54
+ # -- Remove matches that don't exceed a given threshold
55
+ # TODO: This depends on receiving a similarity score from the Postgres query
56
+ # |> Enum.filter(fn {_index, score} -> score >= similarity_threshold end)
57
+ end
58
+
59
+ @doc "Creates a vector embedding for text using the text embedding serving in the application's supervision tree."
60
+ def compute_vector_as_list(text) do
61
+ MedicalTranscription.TextEmbeddingServing
62
+ |> Nx.Serving.batched_run(text)
63
+ |> Map.get(:embedding)
64
+ |> Nx.to_flat_list()
65
+ end
66
+
67
+ defp find_similar(search_vector, limit \\ 5) do
68
  Repo.all(
69
+ from v in CodeVector,
70
  order_by: cosine_distance(v.description_vector, ^search_vector),
71
  limit: ^limit
72
  )
lib/medical_transcription/coding/code_vector.ex ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ defmodule MedicalTranscription.Coding.CodeVector do
2
+ @moduledoc """
3
+ Represents a code and its description, along with a vector embedding for its description.
4
+ """
5
+ use Ecto.Schema
6
+ alias Ecto.Changeset
7
+
8
+ schema "code_vectors" do
9
+ field :code, :string
10
+ field :description, :string
11
+ field :description_vector, Pgvector.Ecto.Vector
12
+ end
13
+
14
+ def changeset(code_vector, params \\ %{}) do
15
+ code_vector
16
+ |> Changeset.cast(params, [:code, :description, :description_vector])
17
+ |> Changeset.validate_required([:code, :description, :description_vector])
18
+ end
19
+ end
lib/medical_transcription/coding/vector_precomputation.ex CHANGED
@@ -1,7 +1,7 @@
1
  defmodule MedicalTranscription.Coding.VectorPrecomputation do
2
  @moduledoc "Populate database with vector embeddings from downloaded ICD-9 code list"
3
 
4
- alias MedicalTranscription.CodeVector
5
 
6
  @doc "Downloads the ICD-9 codelist, calculates vector embeddings for each, and adds them to the database"
7
  def run() do
@@ -31,7 +31,7 @@ defmodule MedicalTranscription.Coding.VectorPrecomputation do
31
  |> Stream.filter(fn %{"code" => code} -> String.length(code) > 0 end)
32
  |> Stream.with_index()
33
  |> Enum.each(fn {%{"code" => code, "long_description" => description}, index} ->
34
- if !CodeVector.exists_for_code?(code) do
35
  compute_vector_for_code(code, description)
36
  end
37
 
@@ -60,9 +60,9 @@ defmodule MedicalTranscription.Coding.VectorPrecomputation do
60
  end
61
 
62
  defp compute_vector_for_code(code, description) do
63
- vector_for_db = MedicalTranscription.Utilities.compute_vector_as_list(description)
64
 
65
- CodeVector.insert_vector(%{
66
  code: code,
67
  description: description,
68
  description_vector: vector_for_db
 
1
  defmodule MedicalTranscription.Coding.VectorPrecomputation do
2
  @moduledoc "Populate database with vector embeddings from downloaded ICD-9 code list"
3
 
4
+ alias MedicalTranscription.Coding
5
 
6
  @doc "Downloads the ICD-9 codelist, calculates vector embeddings for each, and adds them to the database"
7
  def run() do
 
31
  |> Stream.filter(fn %{"code" => code} -> String.length(code) > 0 end)
32
  |> Stream.with_index()
33
  |> Enum.each(fn {%{"code" => code, "long_description" => description}, index} ->
34
+ if !Coding.exists_for_code?(code) do
35
  compute_vector_for_code(code, description)
36
  end
37
 
 
60
  end
61
 
62
  defp compute_vector_for_code(code, description) do
63
+ vector_for_db = Coding.compute_vector_as_list(description)
64
 
65
+ Coding.insert_vector(%{
66
  code: code,
67
  description: description,
68
  description_vector: vector_for_db
lib/medical_transcription/transcriber.ex CHANGED
@@ -4,10 +4,10 @@ defmodule MedicalTranscription.Transcriber do
4
  to look for possible matching codes.
5
  """
6
 
7
- alias MedicalTranscription.CodeSearcher
8
 
9
  defp get_tags_and_send_result(chunk, index, live_view_pid) do
10
- tags = CodeSearcher.process_chunk(chunk.text)
11
  result = build_result(index, chunk, tags)
12
 
13
  send(live_view_pid, {:transcription_row, result})
 
4
  to look for possible matching codes.
5
  """
6
 
7
+ alias MedicalTranscription.Coding
8
 
9
  defp get_tags_and_send_result(chunk, index, live_view_pid) do
10
+ tags = Coding.process_chunk(chunk.text)
11
  result = build_result(index, chunk, tags)
12
 
13
  send(live_view_pid, {:transcription_row, result})
lib/medical_transcription/utilities.ex DELETED
@@ -1,13 +0,0 @@
1
- defmodule MedicalTranscription.Utilities do
2
- @moduledoc """
3
- Holds general utility functions. Over time, consider if there are other modules that make for a more relevant home.
4
- """
5
-
6
- @doc "Creates a vector embedding for text using the text embedding serving in the application's supervision tree."
7
- def compute_vector_as_list(text) do
8
- MedicalTranscription.TextEmbeddingServing
9
- |> Nx.Serving.batched_run(text)
10
- |> Map.get(:embedding)
11
- |> Nx.to_flat_list()
12
- end
13
- end
 
 
 
 
 
 
 
 
 
 
 
 
 
 
lib/medical_transcription_web/components/layouts/app.html.heex CHANGED
@@ -14,7 +14,7 @@
14
  </div>
15
 
16
  <div class="px-6 flex flex-col items-center">
17
- <%= if MedicalTranscription.CodeVector.icd9_present?() do %>
18
  <div
19
  class="w-full px-3 py-2 bg-emerald-600 text-white text-center rounded-lg"
20
  title="Precalculated vector embeddings for classification labels were found."
 
14
  </div>
15
 
16
  <div class="px-6 flex flex-col items-center">
17
+ <%= if MedicalTranscription.Coding.icd9_present?() do %>
18
  <div
19
  class="w-full px-3 py-2 bg-emerald-600 text-white text-center rounded-lg"
20
  title="Precalculated vector embeddings for classification labels were found."
lib/medical_transcription_web/components/transcription_text_component.ex CHANGED
@@ -6,7 +6,7 @@ defmodule MedicalTranscriptionWeb.Components.TranscriptionTextComponent do
6
  use MedicalTranscriptionWeb, :live_component
7
  import MedicalTranscriptionWeb.Components
8
  import MedicalTranscriptionWeb.Components.KeywordHighlighter
9
- alias MedicalTranscription.CodeVector
10
 
11
  @impl Phoenix.LiveComponent
12
  def update(assigns, socket) do
 
6
  use MedicalTranscriptionWeb, :live_component
7
  import MedicalTranscriptionWeb.Components
8
  import MedicalTranscriptionWeb.Components.KeywordHighlighter
9
+ alias MedicalTranscription.Coding.CodeVector
10
 
11
  @impl Phoenix.LiveComponent
12
  def update(assigns, socket) do
lib/medical_transcription_web/live/home_live/index.ex CHANGED
@@ -82,7 +82,7 @@ defmodule MedicalTranscriptionWeb.HomeLive.Index do
82
 
83
  @impl true
84
  def handle_event("add_feedback", params, socket) do
85
- text_vector = MedicalTranscription.Utilities.compute_vector_as_list(params["text"])
86
 
87
  result =
88
  params
@@ -135,7 +135,7 @@ defmodule MedicalTranscriptionWeb.HomeLive.Index do
135
 
136
  @impl true
137
  def handle_info({:received_audio_payload, transcribed_text}, socket) do
138
- tags = MedicalTranscription.CodeSearcher.process_chunk(transcribed_text)
139
 
140
  result = %{
141
  id: socket.assigns.current_recording_id + 1,
 
82
 
83
  @impl true
84
  def handle_event("add_feedback", params, socket) do
85
+ text_vector = MedicalTranscription.Coding.compute_vector_as_list(params["text"])
86
 
87
  result =
88
  params
 
135
 
136
  @impl true
137
  def handle_info({:received_audio_payload, transcribed_text}, socket) do
138
+ tags = MedicalTranscription.Coding.process_chunk(transcribed_text)
139
 
140
  result = %{
141
  id: socket.assigns.current_recording_id + 1,