noahsettersten
committed on
Commit
•
a7feaeb
1
Parent(s):
b01bac2
refactor: Move CodeSearcher and CodeVector into Coding context
Browse files- lib/medical_transcription/code_searcher.ex +0 -20
- lib/medical_transcription/{code_vector.ex → coding.ex} +32 -21
- lib/medical_transcription/coding/code_vector.ex +19 -0
- lib/medical_transcription/coding/vector_precomputation.ex +4 -4
- lib/medical_transcription/transcriber.ex +2 -2
- lib/medical_transcription/utilities.ex +0 -13
- lib/medical_transcription_web/components/layouts/app.html.heex +1 -1
- lib/medical_transcription_web/components/transcription_text_component.ex +1 -1
- lib/medical_transcription_web/live/home_live/index.ex +2 -2
lib/medical_transcription/code_searcher.ex
DELETED
@@ -1,20 +0,0 @@
|
|
1 |
-
defmodule MedicalTranscription.CodeSearcher do
|
2 |
-
@moduledoc """
|
3 |
-
Takes a portion of text and searches for closely matching results within a list of vectors, by using AudioTagger's
|
4 |
-
SemanticSearch module.
|
5 |
-
"""
|
6 |
-
|
7 |
-
alias MedicalTranscription.{CodeVector, Utilities}
|
8 |
-
|
9 |
-
def process_chunk(text, opts \\ []) do
|
10 |
-
k = Keyword.get(opts, :num_results, 5)
|
11 |
-
|
12 |
-
search_vector_for_db = Utilities.compute_vector_as_list(text)
|
13 |
-
|
14 |
-
CodeVector.find_similar(search_vector_for_db, k)
|
15 |
-
|
16 |
-
# -- Remove matches that don't exceed a given threshold
|
17 |
-
# TODO: This depends on receiving a similarity score from the Postgres query
|
18 |
-
# |> Enum.filter(fn {_index, score} -> score >= similarity_threshold end)
|
19 |
-
end
|
20 |
-
end
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
lib/medical_transcription/{code_vector.ex → coding.ex}
RENAMED
@@ -1,28 +1,19 @@
|
|
1 |
-
defmodule MedicalTranscription.CodeVector do
|
2 |
@moduledoc """
|
3 |
-
|
|
|
4 |
"""
|
5 |
-
use Ecto.Schema
|
6 |
-
alias Ecto.Changeset
|
7 |
-
alias MedicalTranscription.Repo
|
8 |
|
|
|
9 |
import Ecto.Query
|
10 |
import Pgvector.Ecto.Query
|
11 |
|
12 |
-
|
13 |
-
field :code, :string
|
14 |
-
field :description, :string
|
15 |
-
field :description_vector, Pgvector.Ecto.Vector
|
16 |
-
end
|
17 |
-
|
18 |
-
def changeset(code_vector, params \\ %{}) do
|
19 |
-
code_vector
|
20 |
-
|> Changeset.cast(params, [:code, :description, :description_vector])
|
21 |
-
|> Changeset.validate_required([:code, :description, :description_vector])
|
22 |
-
end
|
23 |
|
24 |
def insert_vector(params) do
|
25 |
-
changeset =
|
|
|
|
|
26 |
|
27 |
case Repo.insert(changeset) do
|
28 |
{:ok, _} ->
|
@@ -42,20 +33,40 @@ defmodule MedicalTranscription.CodeVector do
|
|
42 |
|
43 |
def exists_for_code?(code) do
|
44 |
Repo.exists?(
|
45 |
-
from v in
|
46 |
where: v.code == ^code
|
47 |
)
|
48 |
end
|
49 |
|
50 |
def icd9_present?() do
|
51 |
-
num_rows = Repo.aggregate(
|
52 |
|
53 |
num_rows >= 14_567
|
54 |
end
|
55 |
|
56 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
57 |
Repo.all(
|
58 |
-
from v in
|
59 |
order_by: cosine_distance(v.description_vector, ^search_vector),
|
60 |
limit: ^limit
|
61 |
)
|
|
|
1 |
+
defmodule MedicalTranscription.Coding do
|
2 |
@moduledoc """
|
3 |
+
Takes a portion of text and searches for closely matching results within a list of vectors, by using AudioTagger's
|
4 |
+
SemanticSearch module.
|
5 |
"""
|
|
|
|
|
|
|
6 |
|
7 |
+
alias MedicalTranscription.Repo
|
8 |
import Ecto.Query
|
9 |
import Pgvector.Ecto.Query
|
10 |
|
11 |
+
alias MedicalTranscription.Coding.CodeVector
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
12 |
|
13 |
def insert_vector(params) do
|
14 |
+
changeset =
|
15 |
+
%CodeVector{}
|
16 |
+
|> CodeVector.changeset(params)
|
17 |
|
18 |
case Repo.insert(changeset) do
|
19 |
{:ok, _} ->
|
|
|
33 |
|
34 |
def exists_for_code?(code) do
|
35 |
Repo.exists?(
|
36 |
+
from v in CodeVector,
|
37 |
where: v.code == ^code
|
38 |
)
|
39 |
end
|
40 |
|
41 |
def icd9_present?() do
|
42 |
+
num_rows = Repo.aggregate(CodeVector, :count)
|
43 |
|
44 |
num_rows >= 14_567
|
45 |
end
|
46 |
|
47 |
+
def process_chunk(text, opts \\ []) do
|
48 |
+
k = Keyword.get(opts, :num_results, 5)
|
49 |
+
|
50 |
+
search_vector_for_db = compute_vector_as_list(text)
|
51 |
+
|
52 |
+
find_similar(search_vector_for_db, k)
|
53 |
+
|
54 |
+
# -- Remove matches that don't exceed a given threshold
|
55 |
+
# TODO: This depends on receiving a similarity score from the Postgres query
|
56 |
+
# |> Enum.filter(fn {_index, score} -> score >= similarity_threshold end)
|
57 |
+
end
|
58 |
+
|
59 |
+
@doc "Creates a vector embedding for text using the text embedding serving in the application's supervision tree."
|
60 |
+
def compute_vector_as_list(text) do
|
61 |
+
MedicalTranscription.TextEmbeddingServing
|
62 |
+
|> Nx.Serving.batched_run(text)
|
63 |
+
|> Map.get(:embedding)
|
64 |
+
|> Nx.to_flat_list()
|
65 |
+
end
|
66 |
+
|
67 |
+
defp find_similar(search_vector, limit \\ 5) do
|
68 |
Repo.all(
|
69 |
+
from v in CodeVector,
|
70 |
order_by: cosine_distance(v.description_vector, ^search_vector),
|
71 |
limit: ^limit
|
72 |
)
|
lib/medical_transcription/coding/code_vector.ex
ADDED
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
defmodule MedicalTranscription.Coding.CodeVector do
|
2 |
+
@moduledoc """
|
3 |
+
Represents a code and its description, along with a vector embedding for its description.
|
4 |
+
"""
|
5 |
+
use Ecto.Schema
|
6 |
+
alias Ecto.Changeset
|
7 |
+
|
8 |
+
schema "code_vectors" do
|
9 |
+
field :code, :string
|
10 |
+
field :description, :string
|
11 |
+
field :description_vector, Pgvector.Ecto.Vector
|
12 |
+
end
|
13 |
+
|
14 |
+
def changeset(code_vector, params \\ %{}) do
|
15 |
+
code_vector
|
16 |
+
|> Changeset.cast(params, [:code, :description, :description_vector])
|
17 |
+
|> Changeset.validate_required([:code, :description, :description_vector])
|
18 |
+
end
|
19 |
+
end
|
lib/medical_transcription/coding/vector_precomputation.ex
CHANGED
@@ -1,7 +1,7 @@
|
|
1 |
defmodule MedicalTranscription.Coding.VectorPrecomputation do
|
2 |
@moduledoc "Populate database with vector embeddings from downloaded ICD-9 code list"
|
3 |
|
4 |
-
alias MedicalTranscription.
|
5 |
|
6 |
@doc "Downloads the ICD-9 codelist, calculates vector embeddings for each, and adds them to the database"
|
7 |
def run() do
|
@@ -31,7 +31,7 @@ defmodule MedicalTranscription.Coding.VectorPrecomputation do
|
|
31 |
|> Stream.filter(fn %{"code" => code} -> String.length(code) > 0 end)
|
32 |
|> Stream.with_index()
|
33 |
|> Enum.each(fn {%{"code" => code, "long_description" => description}, index} ->
|
34 |
-
if !
|
35 |
compute_vector_for_code(code, description)
|
36 |
end
|
37 |
|
@@ -60,9 +60,9 @@ defmodule MedicalTranscription.Coding.VectorPrecomputation do
|
|
60 |
end
|
61 |
|
62 |
defp compute_vector_for_code(code, description) do
|
63 |
-
vector_for_db =
|
64 |
|
65 |
-
|
66 |
code: code,
|
67 |
description: description,
|
68 |
description_vector: vector_for_db
|
|
|
1 |
defmodule MedicalTranscription.Coding.VectorPrecomputation do
|
2 |
@moduledoc "Populate database with vector embeddings from downloaded ICD-9 code list"
|
3 |
|
4 |
+
alias MedicalTranscription.Coding
|
5 |
|
6 |
@doc "Downloads the ICD-9 codelist, calculates vector embeddings for each, and adds them to the database"
|
7 |
def run() do
|
|
|
31 |
|> Stream.filter(fn %{"code" => code} -> String.length(code) > 0 end)
|
32 |
|> Stream.with_index()
|
33 |
|> Enum.each(fn {%{"code" => code, "long_description" => description}, index} ->
|
34 |
+
if !Coding.exists_for_code?(code) do
|
35 |
compute_vector_for_code(code, description)
|
36 |
end
|
37 |
|
|
|
60 |
end
|
61 |
|
62 |
defp compute_vector_for_code(code, description) do
|
63 |
+
vector_for_db = Coding.compute_vector_as_list(description)
|
64 |
|
65 |
+
Coding.insert_vector(%{
|
66 |
code: code,
|
67 |
description: description,
|
68 |
description_vector: vector_for_db
|
lib/medical_transcription/transcriber.ex
CHANGED
@@ -4,10 +4,10 @@ defmodule MedicalTranscription.Transcriber do
|
|
4 |
to look for possible matching codes.
|
5 |
"""
|
6 |
|
7 |
-
alias MedicalTranscription.
|
8 |
|
9 |
defp get_tags_and_send_result(chunk, index, live_view_pid) do
|
10 |
-
tags =
|
11 |
result = build_result(index, chunk, tags)
|
12 |
|
13 |
send(live_view_pid, {:transcription_row, result})
|
|
|
4 |
to look for possible matching codes.
|
5 |
"""
|
6 |
|
7 |
+
alias MedicalTranscription.Coding
|
8 |
|
9 |
defp get_tags_and_send_result(chunk, index, live_view_pid) do
|
10 |
+
tags = Coding.process_chunk(chunk.text)
|
11 |
result = build_result(index, chunk, tags)
|
12 |
|
13 |
send(live_view_pid, {:transcription_row, result})
|
lib/medical_transcription/utilities.ex
DELETED
@@ -1,13 +0,0 @@
|
|
1 |
-
defmodule MedicalTranscription.Utilities do
|
2 |
-
@moduledoc """
|
3 |
-
Holds general utility functions. Over time, consider if there are other modules that make for a more relevant home.
|
4 |
-
"""
|
5 |
-
|
6 |
-
@doc "Creates a vector embedding for text using the text embedding serving in the application's supervision tree."
|
7 |
-
def compute_vector_as_list(text) do
|
8 |
-
MedicalTranscription.TextEmbeddingServing
|
9 |
-
|> Nx.Serving.batched_run(text)
|
10 |
-
|> Map.get(:embedding)
|
11 |
-
|> Nx.to_flat_list()
|
12 |
-
end
|
13 |
-
end
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
lib/medical_transcription_web/components/layouts/app.html.heex
CHANGED
@@ -14,7 +14,7 @@
|
|
14 |
</div>
|
15 |
|
16 |
<div class="px-6 flex flex-col items-center">
|
17 |
-
<%= if MedicalTranscription.
|
18 |
<div
|
19 |
class="w-full px-3 py-2 bg-emerald-600 text-white text-center rounded-lg"
|
20 |
title="Precalculated vector embeddings for classification labels were found."
|
|
|
14 |
</div>
|
15 |
|
16 |
<div class="px-6 flex flex-col items-center">
|
17 |
+
<%= if MedicalTranscription.Coding.icd9_present?() do %>
|
18 |
<div
|
19 |
class="w-full px-3 py-2 bg-emerald-600 text-white text-center rounded-lg"
|
20 |
title="Precalculated vector embeddings for classification labels were found."
|
lib/medical_transcription_web/components/transcription_text_component.ex
CHANGED
@@ -6,7 +6,7 @@ defmodule MedicalTranscriptionWeb.Components.TranscriptionTextComponent do
|
|
6 |
use MedicalTranscriptionWeb, :live_component
|
7 |
import MedicalTranscriptionWeb.Components
|
8 |
import MedicalTranscriptionWeb.Components.KeywordHighlighter
|
9 |
-
alias MedicalTranscription.CodeVector
|
10 |
|
11 |
@impl Phoenix.LiveComponent
|
12 |
def update(assigns, socket) do
|
|
|
6 |
use MedicalTranscriptionWeb, :live_component
|
7 |
import MedicalTranscriptionWeb.Components
|
8 |
import MedicalTranscriptionWeb.Components.KeywordHighlighter
|
9 |
+
alias MedicalTranscription.Coding.CodeVector
|
10 |
|
11 |
@impl Phoenix.LiveComponent
|
12 |
def update(assigns, socket) do
|
lib/medical_transcription_web/live/home_live/index.ex
CHANGED
@@ -82,7 +82,7 @@ defmodule MedicalTranscriptionWeb.HomeLive.Index do
|
|
82 |
|
83 |
@impl true
|
84 |
def handle_event("add_feedback", params, socket) do
|
85 |
-
text_vector = MedicalTranscription.
|
86 |
|
87 |
result =
|
88 |
params
|
@@ -135,7 +135,7 @@ defmodule MedicalTranscriptionWeb.HomeLive.Index do
|
|
135 |
|
136 |
@impl true
|
137 |
def handle_info({:received_audio_payload, transcribed_text}, socket) do
|
138 |
-
tags = MedicalTranscription.
|
139 |
|
140 |
result = %{
|
141 |
id: socket.assigns.current_recording_id + 1,
|
|
|
82 |
|
83 |
@impl true
|
84 |
def handle_event("add_feedback", params, socket) do
|
85 |
+
text_vector = MedicalTranscription.Coding.compute_vector_as_list(params["text"])
|
86 |
|
87 |
result =
|
88 |
params
|
|
|
135 |
|
136 |
@impl true
|
137 |
def handle_info({:received_audio_payload, transcribed_text}, socket) do
|
138 |
+
tags = MedicalTranscription.Coding.process_chunk(transcribed_text)
|
139 |
|
140 |
result = %{
|
141 |
id: socket.assigns.current_recording_id + 1,
|