timgremore
committed on
Commit
•
5676fb5
1
Parent(s):
7625517
wip: Correlate code vectors with chunks
Browse files
- lib/medical_transcription/classification_server.ex +4 -1
- lib/medical_transcription/coding.ex +6 -2
- lib/medical_transcription/coding/code_vector.ex +6 -1
- lib/medical_transcription/feedback/code_feedback.ex +3 -1
- lib/medical_transcription/transcriptions.ex +2 -2
- lib/medical_transcription/transcriptions/transcription_chunk.ex +2 -0
- lib/medical_transcription/transcriptions/transcription_chunk_code_vector.ex +20 -0
- priv/repo/migrations/20240116164032_create_code_feedbacks.exs +2 -1
- priv/repo/migrations/20240125151833_create_code_vectors.exs +2 -1
- priv/repo/migrations/20240125172837_migrate_to_vector_for_code_feedbacks.exs +4 -1
- priv/repo/migrations/20240209203409_create_transcription_chunk_code_vectors.exs +17 -0
- test/medical_transcription/classification_server_test.exs +5 -2
lib/medical_transcription/classification_server.ex
CHANGED
@@ -6,6 +6,8 @@ defmodule MedicalTranscription.ClassificationServer do
|
|
6 |
|
7 |
alias AudioTagger.KeywordFinder
|
8 |
|
|
|
|
|
9 |
alias MedicalTranscription.Transcriptions
|
10 |
alias MedicalTranscription.Transcriptions.TranscriptionChunk
|
11 |
|
@@ -20,6 +22,7 @@ defmodule MedicalTranscription.ClassificationServer do
|
|
20 |
|
21 |
@impl GenServer
|
22 |
def handle_continue(:start, {:chunk, chunk} = state) do
|
|
|
23 |
find_keywords(chunk)
|
24 |
|
25 |
Phoenix.PubSub.broadcast(
|
@@ -66,7 +69,7 @@ defmodule MedicalTranscription.ClassificationServer do
|
|
66 |
end
|
67 |
|
68 |
defp classify_chunk(chunk) do
|
69 |
-
|
70 |
end
|
71 |
|
72 |
defp find_keywords(chunk) do
|
|
|
6 |
|
7 |
alias AudioTagger.KeywordFinder
|
8 |
|
9 |
+
alias MedicalTranscription.Coding
|
10 |
+
alias MedicalTranscription.Coding.CodeVectorMatch
|
11 |
alias MedicalTranscription.Transcriptions
|
12 |
alias MedicalTranscription.Transcriptions.TranscriptionChunk
|
13 |
|
|
|
22 |
|
23 |
@impl GenServer
|
24 |
def handle_continue(:start, {:chunk, chunk} = state) do
|
25 |
+
classify_chunk(chunk)
|
26 |
find_keywords(chunk)
|
27 |
|
28 |
Phoenix.PubSub.broadcast(
|
|
|
69 |
end
|
70 |
|
71 |
defp classify_chunk(chunk) do
|
72 |
+
Coding.process_chunk(chunk)
|
73 |
end
|
74 |
|
75 |
defp find_keywords(chunk) do
|
lib/medical_transcription/coding.ex
CHANGED
@@ -9,6 +9,7 @@ defmodule MedicalTranscription.Coding do
|
|
9 |
import Pgvector.Ecto.Query
|
10 |
|
11 |
alias MedicalTranscription.Coding.{CodeVector, CodeVectorMatch}
|
|
|
12 |
|
13 |
def insert_vector(params) do
|
14 |
changeset = CodeVector.changeset(%CodeVector{}, params)
|
@@ -53,16 +54,18 @@ defmodule MedicalTranscription.Coding do
|
|
53 |
3. Then, look to see if there is any related user-provided feedback in the `code_feedbacks` table.
|
54 |
4. Pass through the found `code_vectors` and modify the similarity scores based on any relevant previous feedback.
|
55 |
"""
|
56 |
-
def process_chunk(text, opts \\ []) do
|
57 |
k = Keyword.get(opts, :num_results, 5)
|
58 |
similarity_threshold = Keyword.get(opts, :similarity_threshold, 0.80)
|
59 |
|
60 |
search_vector_for_db = compute_vector_as_list(text)
|
61 |
|
|
|
|
|
62 |
past_feedbacks =
|
63 |
MedicalTranscription.Feedback.find_related_feedback(search_vector_for_db, opts)
|
64 |
|
65 |
-
code_vectors = find_similar(search_vector_for_db, k)
|
66 |
code_vectors_for_feedback = find_for_feedback(search_vector_for_db, past_feedbacks)
|
67 |
|
68 |
code_vectors
|
@@ -71,6 +74,7 @@ defmodule MedicalTranscription.Coding do
|
|
71 |
|> weight_code_vectors(past_feedbacks)
|
72 |
|> filter_below_threshold(similarity_threshold)
|
73 |
|> sort_by_similarity()
|
|
|
74 |
end
|
75 |
|
76 |
@doc "Creates a vector embedding for text using the text embedding serving in the application's supervision tree."
|
|
|
9 |
import Pgvector.Ecto.Query
|
10 |
|
11 |
alias MedicalTranscription.Coding.{CodeVector, CodeVectorMatch}
|
12 |
+
alias MedicalTranscription.Transcriptions.TranscriptionChunk
|
13 |
|
14 |
def insert_vector(params) do
|
15 |
changeset = CodeVector.changeset(%CodeVector{}, params)
|
|
|
54 |
3. Then, look to see if there is any related user-provided feedback in the `code_feedbacks` table.
|
55 |
4. Pass through the found `code_vectors` and modify the similarity scores based on any relevant previous feedback.
|
56 |
"""
|
57 |
+
def process_chunk(%TranscriptionChunk{text: text}, opts \\ []) do
|
58 |
k = Keyword.get(opts, :num_results, 5)
|
59 |
similarity_threshold = Keyword.get(opts, :similarity_threshold, 0.80)
|
60 |
|
61 |
search_vector_for_db = compute_vector_as_list(text)
|
62 |
|
63 |
+
dbg(text)
|
64 |
+
|
65 |
past_feedbacks =
|
66 |
MedicalTranscription.Feedback.find_related_feedback(search_vector_for_db, opts)
|
67 |
|
68 |
+
code_vectors = find_similar(search_vector_for_db, k) |> dbg()
|
69 |
code_vectors_for_feedback = find_for_feedback(search_vector_for_db, past_feedbacks)
|
70 |
|
71 |
code_vectors
|
|
|
74 |
|> weight_code_vectors(past_feedbacks)
|
75 |
|> filter_below_threshold(similarity_threshold)
|
76 |
|> sort_by_similarity()
|
77 |
+
|> dbg()
|
78 |
end
|
79 |
|
80 |
@doc "Creates a vector embedding for text using the text embedding serving in the application's supervision tree."
|
lib/medical_transcription/coding/code_vector.ex
CHANGED
@@ -5,12 +5,17 @@ defmodule MedicalTranscription.Coding.CodeVector do
|
|
5 |
use Ecto.Schema
|
6 |
alias Ecto.Changeset
|
7 |
|
|
|
|
|
8 |
schema "code_vectors" do
|
9 |
field :code, :string
|
10 |
field :description, :string
|
11 |
field :description_vector, Pgvector.Ecto.Vector
|
12 |
|
13 |
-
|
|
|
|
|
|
|
14 |
end
|
15 |
|
16 |
def changeset(code_vector, params \\ %{}) do
|
|
|
5 |
use Ecto.Schema
|
6 |
alias Ecto.Changeset
|
7 |
|
8 |
+
@primary_key {:id, :binary_id, autogenerate: true}
|
9 |
+
@foreign_key_type :binary_id
|
10 |
schema "code_vectors" do
|
11 |
field :code, :string
|
12 |
field :description, :string
|
13 |
field :description_vector, Pgvector.Ecto.Vector
|
14 |
|
15 |
+
has_many :transcription_chunk_code_vectors, MedicalTranscription.Transcriptions.TranscriptionChunkCodeVector
|
16 |
+
has_many :transcription_chunks, through: [:transcription_chunk_code_vectors, :transcription_chunk]
|
17 |
+
|
18 |
+
timestamps(type: :utc_datetime)
|
19 |
end
|
20 |
|
21 |
def changeset(code_vector, params \\ %{}) do
|
lib/medical_transcription/feedback/code_feedback.ex
CHANGED
@@ -4,6 +4,8 @@ defmodule MedicalTranscription.Feedback.CodeFeedback do
|
|
4 |
"""
|
5 |
use Ecto.Schema
|
6 |
|
|
|
|
|
7 |
schema "code_feedbacks" do
|
8 |
field :text, :string
|
9 |
field :text_vector, Pgvector.Ecto.Vector
|
@@ -11,7 +13,7 @@ defmodule MedicalTranscription.Feedback.CodeFeedback do
|
|
11 |
|
12 |
belongs_to :code_vector, MedicalTranscription.CodeVector
|
13 |
|
14 |
-
timestamps()
|
15 |
end
|
16 |
|
17 |
def changeset(code_feedback, params \\ %{}) do
|
|
|
4 |
"""
|
5 |
use Ecto.Schema
|
6 |
|
7 |
+
@primary_key {:id, :binary_id, autogenerate: true}
|
8 |
+
@foreign_key_type :binary_id
|
9 |
schema "code_feedbacks" do
|
10 |
field :text, :string
|
11 |
field :text_vector, Pgvector.Ecto.Vector
|
|
|
13 |
|
14 |
belongs_to :code_vector, MedicalTranscription.CodeVector
|
15 |
|
16 |
+
timestamps(type: :utc_datetime)
|
17 |
end
|
18 |
|
19 |
def changeset(code_feedback, params \\ %{}) do
|
lib/medical_transcription/transcriptions.ex
CHANGED
@@ -57,7 +57,7 @@ defmodule MedicalTranscription.Transcriptions do
|
|
57 |
query =
|
58 |
if preload_transcription_chunks do
|
59 |
Transcription
|
60 |
-
|> preload(chunks: :keywords)
|
61 |
else
|
62 |
Transcription
|
63 |
end
|
@@ -83,7 +83,7 @@ defmodule MedicalTranscription.Transcriptions do
|
|
83 |
query =
|
84 |
if preload_transcription_chunks do
|
85 |
Transcription
|
86 |
-
|> preload(chunks: :keywords)
|
87 |
else
|
88 |
Transcription
|
89 |
end
|
|
|
57 |
query =
|
58 |
if preload_transcription_chunks do
|
59 |
Transcription
|
60 |
+
|> preload(chunks: [:keywords, :code_vectors])
|
61 |
else
|
62 |
Transcription
|
63 |
end
|
|
|
83 |
query =
|
84 |
if preload_transcription_chunks do
|
85 |
Transcription
|
86 |
+
|> preload(chunks: [:keywords, :code_vectors])
|
87 |
else
|
88 |
Transcription
|
89 |
end
|
lib/medical_transcription/transcriptions/transcription_chunk.ex
CHANGED
@@ -12,6 +12,8 @@ defmodule MedicalTranscription.Transcriptions.TranscriptionChunk do
|
|
12 |
belongs_to :transcription, MedicalTranscription.Transcriptions.Transcription
|
13 |
|
14 |
has_many :keywords, MedicalTranscription.Transcriptions.TranscriptionChunkKeyword
|
|
|
|
|
15 |
|
16 |
timestamps(type: :utc_datetime)
|
17 |
end
|
|
|
12 |
belongs_to :transcription, MedicalTranscription.Transcriptions.Transcription
|
13 |
|
14 |
has_many :keywords, MedicalTranscription.Transcriptions.TranscriptionChunkKeyword
|
15 |
+
has_many :transcription_chunk_code_vectors, MedicalTranscription.Transcriptions.TranscriptionChunkCodeVector
|
16 |
+
has_many :code_vectors, through: [:transcription_chunk_code_vectors, :code_vector]
|
17 |
|
18 |
timestamps(type: :utc_datetime)
|
19 |
end
|
lib/medical_transcription/transcriptions/transcription_chunk_code_vector.ex
ADDED
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
defmodule MedicalTranscription.Transcriptions.TranscriptionChunkCodeVector do
|
2 |
+
use Ecto.Schema
|
3 |
+
import Ecto.Changeset
|
4 |
+
|
5 |
+
@primary_key {:id, :binary_id, autogenerate: true}
|
6 |
+
@foreign_key_type :binary_id
|
7 |
+
schema "transcription_chunk_code_vectors" do
|
8 |
+
belongs_to :transcription_chunk, MedicalTranscription.Transcriptions.TranscriptionChunk
|
9 |
+
belongs_to :code_vector, MedicalTranscription.Coding.CodeVector
|
10 |
+
|
11 |
+
timestamps(type: :utc_datetime)
|
12 |
+
end
|
13 |
+
|
14 |
+
@doc false
|
15 |
+
def changeset(transcription_chunk_code_vector, attrs) do
|
16 |
+
transcription_chunk_code_vector
|
17 |
+
|> cast(attrs, [:transcription_chunk_id, :code_vector_id])
|
18 |
+
|> validate_required([:transcription_chunk_id, :code_vector_id])
|
19 |
+
end
|
20 |
+
end
|
priv/repo/migrations/20240116164032_create_code_feedbacks.exs
CHANGED
@@ -2,7 +2,8 @@ defmodule MedicalTranscription.Repo.Migrations.CreateCodeFeedbacks do
|
|
2 |
use Ecto.Migration
|
3 |
|
4 |
def change do
|
5 |
-
create table(:code_feedbacks) do
|
|
|
6 |
add :text, :string
|
7 |
add :code, :string
|
8 |
add :response, :boolean
|
|
|
2 |
use Ecto.Migration
|
3 |
|
4 |
def change do
|
5 |
+
create table(:code_feedbacks, primary_key: false) do
|
6 |
+
add :id, :binary_id, primary_key: true
|
7 |
add :text, :string
|
8 |
add :code, :string
|
9 |
add :response, :boolean
|
priv/repo/migrations/20240125151833_create_code_vectors.exs
CHANGED
@@ -2,7 +2,8 @@ defmodule MedicalTranscription.Repo.Migrations.CreateLabelVectors do
|
|
2 |
use Ecto.Migration
|
3 |
|
4 |
def change do
|
5 |
-
create table(
|
|
|
6 |
add :code, :string
|
7 |
add :description, :string
|
8 |
add :description_vector, :vector, size: 384
|
|
|
2 |
use Ecto.Migration
|
3 |
|
4 |
def change do
|
5 |
+
create table(:code_vectors, primary_key: false) do
|
6 |
+
add :id, :binary_id, primary_key: true
|
7 |
add :code, :string
|
8 |
add :description, :string
|
9 |
add :description_vector, :vector, size: 384
|
priv/repo/migrations/20240125172837_migrate_to_vector_for_code_feedbacks.exs
CHANGED
@@ -4,7 +4,10 @@ defmodule MedicalTranscription.Repo.Migrations.MigrateToVectorForCodeFeedbacks d
|
|
4 |
def change do
|
5 |
alter table("code_feedbacks") do
|
6 |
add :text_vector, :vector, size: 384
|
7 |
-
|
|
|
|
|
|
|
8 |
|
9 |
# These two columns can be found on the `code_vectors` table referenced above
|
10 |
remove :code, :string
|
|
|
4 |
def change do
|
5 |
alter table("code_feedbacks") do
|
6 |
add :text_vector, :vector, size: 384
|
7 |
+
|
8 |
+
add :code_vector_id,
|
9 |
+
references(:code_vectors, type: :binary_id, on_delete: :delete_all),
|
10 |
+
null: false
|
11 |
|
12 |
# These two columns can be found on the `code_vectors` table referenced above
|
13 |
remove :code, :string
|
priv/repo/migrations/20240209203409_create_transcription_chunk_code_vectors.exs
ADDED
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
defmodule MedicalTranscription.Repo.Migrations.CreateTranscriptionChunkCodeVectors do
|
2 |
+
use Ecto.Migration
|
3 |
+
|
4 |
+
def change do
|
5 |
+
create table(:transcription_chunk_code_vectors, primary_key: false) do
|
6 |
+
add :id, :binary_id, primary_key: true
|
7 |
+
add :transcription_chunk_id,
|
8 |
+
references(:transcription_chunks, type: :binary_id, on_delete: :delete_all),
|
9 |
+
null: false
|
10 |
+
add :code_vector_id,
|
11 |
+
references(:code_vectors, type: :binary_id, on_delete: :delete_all),
|
12 |
+
null: false
|
13 |
+
|
14 |
+
timestamps(type: :utc_datetime)
|
15 |
+
end
|
16 |
+
end
|
17 |
+
end
|
test/medical_transcription/classification_server_test.exs
CHANGED
@@ -31,12 +31,15 @@ defmodule MedicalTranscription.ClassificationServerTest do
|
|
31 |
ref = Process.monitor(pid)
|
32 |
assert_receive({:DOWN, ^ref, :process, _object, _pid}, 5_000)
|
33 |
|
34 |
-
|
35 |
transcription.id
|
36 |
|> Transcriptions.get_transcription!(true)
|
37 |
|> Map.fetch!(:chunks)
|
38 |
-
|
|
|
|
|
39 |
|
40 |
assert 2 == Enum.count(keywords)
|
|
|
41 |
end
|
42 |
end
|
|
|
31 |
ref = Process.monitor(pid)
|
32 |
assert_receive({:DOWN, ^ref, :process, _object, _pid}, 5_000)
|
33 |
|
34 |
+
chunks =
|
35 |
transcription.id
|
36 |
|> Transcriptions.get_transcription!(true)
|
37 |
|> Map.fetch!(:chunks)
|
38 |
+
|
39 |
+
keywords = Enum.flat_map(chunks, & &1.keywords)
|
40 |
+
code_vectors = Enum.flat_map(chunks, & &1.code_vectors)
|
41 |
|
42 |
assert 2 == Enum.count(keywords)
|
43 |
+
assert 2 == Enum.count(code_vectors)
|
44 |
end
|
45 |
end
|