timgremore committed on
Commit
5676fb5
1 Parent(s): 7625517

wip: Correlate code vectors with chunks

Browse files
lib/medical_transcription/classification_server.ex CHANGED
@@ -6,6 +6,8 @@ defmodule MedicalTranscription.ClassificationServer do
6
 
7
  alias AudioTagger.KeywordFinder
8
 
 
 
9
  alias MedicalTranscription.Transcriptions
10
  alias MedicalTranscription.Transcriptions.TranscriptionChunk
11
 
@@ -20,6 +22,7 @@ defmodule MedicalTranscription.ClassificationServer do
20
 
21
  @impl GenServer
22
  def handle_continue(:start, {:chunk, chunk} = state) do
 
23
  find_keywords(chunk)
24
 
25
  Phoenix.PubSub.broadcast(
@@ -66,7 +69,7 @@ defmodule MedicalTranscription.ClassificationServer do
66
  end
67
 
68
  defp classify_chunk(chunk) do
69
- find_keywords(chunk)
70
  end
71
 
72
  defp find_keywords(chunk) do
 
6
 
7
  alias AudioTagger.KeywordFinder
8
 
9
+ alias MedicalTranscription.Coding
10
+ alias MedicalTranscription.Coding.CodeVectorMatch
11
  alias MedicalTranscription.Transcriptions
12
  alias MedicalTranscription.Transcriptions.TranscriptionChunk
13
 
 
22
 
23
  @impl GenServer
24
  def handle_continue(:start, {:chunk, chunk} = state) do
25
+ classify_chunk(chunk)
26
  find_keywords(chunk)
27
 
28
  Phoenix.PubSub.broadcast(
 
69
  end
70
 
71
  defp classify_chunk(chunk) do
72
+ Coding.process_chunk(chunk)
73
  end
74
 
75
  defp find_keywords(chunk) do
lib/medical_transcription/coding.ex CHANGED
@@ -9,6 +9,7 @@ defmodule MedicalTranscription.Coding do
9
  import Pgvector.Ecto.Query
10
 
11
  alias MedicalTranscription.Coding.{CodeVector, CodeVectorMatch}
 
12
 
13
  def insert_vector(params) do
14
  changeset = CodeVector.changeset(%CodeVector{}, params)
@@ -53,16 +54,18 @@ defmodule MedicalTranscription.Coding do
53
  3. Then, look to see if there is any related user-provided feedback in the `code_feedbacks` table.
54
  4. Pass through the found `code_vectors` and modify the similarity scores based on any relevant previous feedback.
55
  """
56
- def process_chunk(text, opts \\ []) do
57
  k = Keyword.get(opts, :num_results, 5)
58
  similarity_threshold = Keyword.get(opts, :similarity_threshold, 0.80)
59
 
60
  search_vector_for_db = compute_vector_as_list(text)
61
 
 
 
62
  past_feedbacks =
63
  MedicalTranscription.Feedback.find_related_feedback(search_vector_for_db, opts)
64
 
65
- code_vectors = find_similar(search_vector_for_db, k)
66
  code_vectors_for_feedback = find_for_feedback(search_vector_for_db, past_feedbacks)
67
 
68
  code_vectors
@@ -71,6 +74,7 @@ defmodule MedicalTranscription.Coding do
71
  |> weight_code_vectors(past_feedbacks)
72
  |> filter_below_threshold(similarity_threshold)
73
  |> sort_by_similarity()
 
74
  end
75
 
76
  @doc "Creates a vector embedding for text using the text embedding serving in the application's supervision tree."
 
9
  import Pgvector.Ecto.Query
10
 
11
  alias MedicalTranscription.Coding.{CodeVector, CodeVectorMatch}
12
+ alias MedicalTranscription.Transcriptions.TranscriptionChunk
13
 
14
  def insert_vector(params) do
15
  changeset = CodeVector.changeset(%CodeVector{}, params)
 
54
  3. Then, look to see if there is any related user-provided feedback in the `code_feedbacks` table.
55
  4. Pass through the found `code_vectors` and modify the similarity scores based on any relevant previous feedback.
56
  """
57
+ def process_chunk(%TranscriptionChunk{text: text}, opts \\ []) do
58
  k = Keyword.get(opts, :num_results, 5)
59
  similarity_threshold = Keyword.get(opts, :similarity_threshold, 0.80)
60
 
61
  search_vector_for_db = compute_vector_as_list(text)
62
 
63
+ dbg(text)
64
+
65
  past_feedbacks =
66
  MedicalTranscription.Feedback.find_related_feedback(search_vector_for_db, opts)
67
 
68
+ code_vectors = find_similar(search_vector_for_db, k) |> dbg()
69
  code_vectors_for_feedback = find_for_feedback(search_vector_for_db, past_feedbacks)
70
 
71
  code_vectors
 
74
  |> weight_code_vectors(past_feedbacks)
75
  |> filter_below_threshold(similarity_threshold)
76
  |> sort_by_similarity()
77
+ |> dbg()
78
  end
79
 
80
  @doc "Creates a vector embedding for text using the text embedding serving in the application's supervision tree."
lib/medical_transcription/coding/code_vector.ex CHANGED
@@ -5,12 +5,17 @@ defmodule MedicalTranscription.Coding.CodeVector do
5
  use Ecto.Schema
6
  alias Ecto.Changeset
7
 
 
 
8
  schema "code_vectors" do
9
  field :code, :string
10
  field :description, :string
11
  field :description_vector, Pgvector.Ecto.Vector
12
 
13
- timestamps()
 
 
 
14
  end
15
 
16
  def changeset(code_vector, params \\ %{}) do
 
5
  use Ecto.Schema
6
  alias Ecto.Changeset
7
 
8
+ @primary_key {:id, :binary_id, autogenerate: true}
9
+ @foreign_key_type :binary_id
10
  schema "code_vectors" do
11
  field :code, :string
12
  field :description, :string
13
  field :description_vector, Pgvector.Ecto.Vector
14
 
15
+ has_many :transcription_chunk_code_vectors, MedicalTranscription.Transcriptions.TranscriptionChunkCodeVector
16
+ has_many :transcription_chunks, through: [:transcription_chunk_code_vectors, :transcription_chunk]
17
+
18
+ timestamps(type: :utc_datetime)
19
  end
20
 
21
  def changeset(code_vector, params \\ %{}) do
lib/medical_transcription/feedback/code_feedback.ex CHANGED
@@ -4,6 +4,8 @@ defmodule MedicalTranscription.Feedback.CodeFeedback do
4
  """
5
  use Ecto.Schema
6
 
 
 
7
  schema "code_feedbacks" do
8
  field :text, :string
9
  field :text_vector, Pgvector.Ecto.Vector
@@ -11,7 +13,7 @@ defmodule MedicalTranscription.Feedback.CodeFeedback do
11
 
12
  belongs_to :code_vector, MedicalTranscription.CodeVector
13
 
14
- timestamps()
15
  end
16
 
17
  def changeset(code_feedback, params \\ %{}) do
 
4
  """
5
  use Ecto.Schema
6
 
7
+ @primary_key {:id, :binary_id, autogenerate: true}
8
+ @foreign_key_type :binary_id
9
  schema "code_feedbacks" do
10
  field :text, :string
11
  field :text_vector, Pgvector.Ecto.Vector
 
13
 
14
  belongs_to :code_vector, MedicalTranscription.CodeVector
15
 
16
+ timestamps(type: :utc_datetime)
17
  end
18
 
19
  def changeset(code_feedback, params \\ %{}) do
lib/medical_transcription/transcriptions.ex CHANGED
@@ -57,7 +57,7 @@ defmodule MedicalTranscription.Transcriptions do
57
  query =
58
  if preload_transcription_chunks do
59
  Transcription
60
- |> preload(chunks: :keywords)
61
  else
62
  Transcription
63
  end
@@ -83,7 +83,7 @@ defmodule MedicalTranscription.Transcriptions do
83
  query =
84
  if preload_transcription_chunks do
85
  Transcription
86
- |> preload(chunks: :keywords)
87
  else
88
  Transcription
89
  end
 
57
  query =
58
  if preload_transcription_chunks do
59
  Transcription
60
+ |> preload(chunks: [:keywords, :code_vectors])
61
  else
62
  Transcription
63
  end
 
83
  query =
84
  if preload_transcription_chunks do
85
  Transcription
86
+ |> preload(chunks: [:keywords, :code_vectors])
87
  else
88
  Transcription
89
  end
lib/medical_transcription/transcriptions/transcription_chunk.ex CHANGED
@@ -12,6 +12,8 @@ defmodule MedicalTranscription.Transcriptions.TranscriptionChunk do
12
  belongs_to :transcription, MedicalTranscription.Transcriptions.Transcription
13
 
14
  has_many :keywords, MedicalTranscription.Transcriptions.TranscriptionChunkKeyword
 
 
15
 
16
  timestamps(type: :utc_datetime)
17
  end
 
12
  belongs_to :transcription, MedicalTranscription.Transcriptions.Transcription
13
 
14
  has_many :keywords, MedicalTranscription.Transcriptions.TranscriptionChunkKeyword
15
+ has_many :transcription_chunk_code_vectors, MedicalTranscription.Transcriptions.TranscriptionChunkCodeVector
16
+ has_many :code_vectors, through: [:transcription_chunk_code_vectors, :code_vector]
17
 
18
  timestamps(type: :utc_datetime)
19
  end
lib/medical_transcription/transcriptions/transcription_chunk_code_vector.ex ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
defmodule MedicalTranscription.Transcriptions.TranscriptionChunkCodeVector do
  @moduledoc """
  Join schema backing the `transcription_chunk_code_vectors` table.

  Each row associates one transcription chunk with one code vector. Both the
  primary key and the foreign keys are binary ids.
  """
  use Ecto.Schema
  import Ecto.Changeset

  @primary_key {:id, :binary_id, autogenerate: true}
  @foreign_key_type :binary_id
  schema "transcription_chunk_code_vectors" do
    belongs_to :transcription_chunk, MedicalTranscription.Transcriptions.TranscriptionChunk
    belongs_to :code_vector, MedicalTranscription.Coding.CodeVector

    timestamps(type: :utc_datetime)
  end

  @doc false
  def changeset(transcription_chunk_code_vector, attrs) do
    transcription_chunk_code_vector
    |> cast(attrs, [:transcription_chunk_id, :code_vector_id])
    |> validate_required([:transcription_chunk_id, :code_vector_id])
    # Turn a database foreign-key violation (an id that references no existing
    # row) into a changeset error instead of a raised Ecto.ConstraintError.
    # The default constraint names (`<table>_<column>_fkey`) are what
    # `references/2` generates when no explicit `:name` is given.
    |> foreign_key_constraint(:transcription_chunk_id)
    |> foreign_key_constraint(:code_vector_id)
  end
end
priv/repo/migrations/20240116164032_create_code_feedbacks.exs CHANGED
@@ -2,7 +2,8 @@ defmodule MedicalTranscription.Repo.Migrations.CreateCodeFeedbacks do
2
  use Ecto.Migration
3
 
4
  def change do
5
- create table(:code_feedbacks) do
 
6
  add :text, :string
7
  add :code, :string
8
  add :response, :boolean
 
2
  use Ecto.Migration
3
 
4
  def change do
5
+ create table(:code_feedbacks, primary_key: false) do
6
+ add :id, :binary_id, primary_key: true
7
  add :text, :string
8
  add :code, :string
9
  add :response, :boolean
priv/repo/migrations/20240125151833_create_code_vectors.exs CHANGED
@@ -2,7 +2,8 @@ defmodule MedicalTranscription.Repo.Migrations.CreateLabelVectors do
2
  use Ecto.Migration
3
 
4
  def change do
5
- create table("code_vectors") do
 
6
  add :code, :string
7
  add :description, :string
8
  add :description_vector, :vector, size: 384
 
2
  use Ecto.Migration
3
 
4
  def change do
5
+ create table(:code_vectors, primary_key: false) do
6
+ add :id, :binary_id, primary_key: true
7
  add :code, :string
8
  add :description, :string
9
  add :description_vector, :vector, size: 384
priv/repo/migrations/20240125172837_migrate_to_vector_for_code_feedbacks.exs CHANGED
@@ -4,7 +4,10 @@ defmodule MedicalTranscription.Repo.Migrations.MigrateToVectorForCodeFeedbacks d
4
  def change do
5
  alter table("code_feedbacks") do
6
  add :text_vector, :vector, size: 384
7
- add :code_vector_id, references("code_vectors")
 
 
 
8
 
9
  # These two columns can be found on the `code_vectors` table referenced above
10
  remove :code, :string
 
4
  def change do
5
  alter table("code_feedbacks") do
6
  add :text_vector, :vector, size: 384
7
+
8
+ add :code_vector_id,
9
+ references(:code_vectors, type: :binary_id, on_delete: :delete_all),
10
+ null: false
11
 
12
  # These two columns can be found on the `code_vectors` table referenced above
13
  remove :code, :string
priv/repo/migrations/20240209203409_create_transcription_chunk_code_vectors.exs ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
defmodule MedicalTranscription.Repo.Migrations.CreateTranscriptionChunkCodeVectors do
  @moduledoc """
  Creates the `transcription_chunk_code_vectors` join table, which links rows in
  `transcription_chunks` to rows in `code_vectors`.
  """
  use Ecto.Migration

  def change do
    create table(:transcription_chunk_code_vectors, primary_key: false) do
      add :id, :binary_id, primary_key: true

      # Deleting either side of the association removes the join row as well.
      add :transcription_chunk_id,
          references(:transcription_chunks, type: :binary_id, on_delete: :delete_all),
          null: false

      add :code_vector_id,
          references(:code_vectors, type: :binary_id, on_delete: :delete_all),
          null: false

      timestamps(type: :utc_datetime)
    end

    # PostgreSQL does not automatically index the referencing side of a foreign
    # key. Without these indexes, lookups by either id and the cascading deletes
    # above have to scan the whole join table.
    create index(:transcription_chunk_code_vectors, [:transcription_chunk_id])
    create index(:transcription_chunk_code_vectors, [:code_vector_id])
  end
end
test/medical_transcription/classification_server_test.exs CHANGED
@@ -31,12 +31,15 @@ defmodule MedicalTranscription.ClassificationServerTest do
31
  ref = Process.monitor(pid)
32
  assert_receive({:DOWN, ^ref, :process, _object, _pid}, 5_000)
33
 
34
- keywords =
35
  transcription.id
36
  |> Transcriptions.get_transcription!(true)
37
  |> Map.fetch!(:chunks)
38
- |> Enum.flat_map(& &1.keywords)
 
 
39
 
40
  assert 2 == Enum.count(keywords)
 
41
  end
42
  end
 
31
  ref = Process.monitor(pid)
32
  assert_receive({:DOWN, ^ref, :process, _object, _pid}, 5_000)
33
 
34
+ chunks =
35
  transcription.id
36
  |> Transcriptions.get_transcription!(true)
37
  |> Map.fetch!(:chunks)
38
+
39
+ keywords = Enum.flat_map(chunks, & &1.keywords)
40
+ code_vectors = Enum.flat_map(chunks, & &1.code_vectors)
41
 
42
  assert 2 == Enum.count(keywords)
43
+ assert 2 == Enum.count(code_vectors)
44
  end
45
  end