defmodule MedicalTranscription.Coding do
  @moduledoc """
  Stores vector embeddings for medical codes and matches transcribed text against them.

  A chunk of text is embedded with `MedicalTranscription.TextEmbeddingServing`, compared with the
  stored `CodeVector` rows by cosine distance (pgvector), and the resulting `CodeVectorMatch`
  structs are re-weighted using related feedback from `MedicalTranscription.Feedback`.
  """

  import Ecto.Query
  import Pgvector.Ecto.Query

  alias MedicalTranscription.Coding.{CodeVector, CodeVectorMatch}
  alias MedicalTranscription.Repo

  # Row count at or above which the code table is treated as fully loaded.
  # NOTE(review): presumably the size of the complete ICD-9 code list - confirm.
  @min_icd9_row_count 14_567

  @default_num_results 5
  @default_similarity_threshold 0.80

  # Multipliers applied to a match's similarity for each related feedback entry.
  @positive_response_factor 1.1
  @negative_response_factor 0.9

  @doc """
  Inserts a new `CodeVector` built from `params`.

  Returns `{:ok, "Success!"}` when the row is inserted, or `{:error, errors}` where `errors` is
  the result of `Repo.collect_errors/1` for the invalid changeset.
  """
  @spec insert_vector(term()) :: {:ok, String.t()} | {:error, term()}
  def insert_vector(params) do
    changeset = CodeVector.changeset(%CodeVector{}, params)

    case Repo.insert(changeset) do
      {:ok, _code_vector} -> {:ok, "Success!"}
      {:error, invalid_changeset} -> {:error, Repo.collect_errors(invalid_changeset)}
    end
  end

  @doc """
  Returns `true` if at least one `CodeVector` row has the given `code`.
  """
  @spec exists_for_code?(term()) :: boolean()
  def exists_for_code?(code) do
    CodeVector
    |> where([v], v.code == ^code)
    |> Repo.exists?()
  end

  @doc """
  Returns `true` when the `CodeVector` table holds at least #{@min_icd9_row_count} rows.
  """
  @spec icd9_present?() :: boolean()
  def icd9_present? do
    Repo.aggregate(CodeVector, :count) >= @min_icd9_row_count
  end

  @doc """
  Takes a chunk of transcribed text and classifies it against the code vectors in the database.

  1. Creates a vector embedding for the passed `text`.
  2. Looks up related user-provided feedback via
     `MedicalTranscription.Feedback.find_related_feedback/2`.
  3. Fetches the nearest code vectors, plus the code vectors referenced by that feedback.
  4. Re-weights each match's similarity according to the feedback, drops matches below the
     threshold and returns the rest sorted by descending similarity.

  ## Options

    * `:num_results` - how many nearest code vectors to fetch (default `#{@default_num_results}`).
    * `:similarity_threshold` - minimum weighted similarity a match must reach to be returned
      (default `#{@default_similarity_threshold}`).

  The full `opts` list is also forwarded to `find_related_feedback/2`.
  """
  @spec process_chunk(String.t(), keyword()) :: [%CodeVectorMatch{}]
  def process_chunk(text, opts \\ []) do
    num_results = Keyword.get(opts, :num_results, @default_num_results)
    similarity_threshold = Keyword.get(opts, :similarity_threshold, @default_similarity_threshold)

    search_vector = compute_vector_as_list(text)
    past_feedbacks = MedicalTranscription.Feedback.find_related_feedback(search_vector, opts)

    search_vector
    |> find_similar(num_results)
    |> Enum.concat(find_for_feedback(search_vector, past_feedbacks))
    # A code can be both a nearest neighbour and a feedback target; keep it once.
    |> Enum.uniq_by(& &1.id)
    |> weight_code_vectors(past_feedbacks)
    |> filter_below_threshold(similarity_threshold)
    |> sort_by_similarity()
  end

  @doc """
  Creates a vector embedding for `text` and returns it as a flat list.

  The embedding is computed by the `MedicalTranscription.TextEmbeddingServing` serving through
  `Nx.Serving.batched_run/2`.
  """
  @spec compute_vector_as_list(String.t()) :: [number()]
  def compute_vector_as_list(text) do
    MedicalTranscription.TextEmbeddingServing
    |> Nx.Serving.batched_run(text)
    |> Map.get(:embedding)
    |> Nx.to_flat_list()
  end

  # Applies feedback-based weighting to every match.
  defp weight_code_vectors(code_vector_matches, past_feedbacks) do
    Enum.map(code_vector_matches, &weight_code_vector(&1, past_feedbacks))
  end

  # Weights a single match using only the feedback entries that reference it. A match without
  # any feedback is tagged `[:none]`; otherwise each entry scales the similarity in turn, so
  # several entries compound.
  defp weight_code_vector(code_vector_match, past_feedbacks) do
    case Enum.filter(past_feedbacks, &(&1.code_vector_id == code_vector_match.id)) do
      [] ->
        %{code_vector_match | weighting: [:none]}

      relevant_feedbacks ->
        Enum.reduce(relevant_feedbacks, code_vector_match, fn feedback, match ->
          apply_feedback(match, feedback)
        end)
    end
  end

  # Folds one feedback entry into a match: scales the similarity and appends a weighting tag.
  defp apply_feedback(match, %{response: true}) do
    reweight(match, @positive_response_factor, :positive)
  end

  defp apply_feedback(match, %{response: false}) do
    reweight(match, @negative_response_factor, :negative)
  end

  # Defensive clause: a missing feedback entry leaves the score untouched.
  defp apply_feedback(match, nil) do
    %{match | weighting: match.weighting ++ [:none]}
  end

  defp reweight(match, factor, tag) do
    %{
      match
      | cosine_similarity: match.cosine_similarity * factor,
        weighting: match.weighting ++ [tag]
    }
  end

  # Keeps the matches whose (weighted) similarity is at or above the threshold.
  defp filter_below_threshold(code_vector_matches, similarity_threshold) do
    Enum.filter(code_vector_matches, &(&1.cosine_similarity >= similarity_threshold))
  end

  # Orders matches from most to least similar.
  defp sort_by_similarity(code_vector_matches) do
    Enum.sort_by(code_vector_matches, & &1.cosine_similarity, :desc)
  end

  # Fetches the `count` code vectors closest to `search_vector`.
  defp find_similar(search_vector, count) do
    search_vector
    |> matches_query()
    |> limit(^count)
    |> Repo.all()
  end

  # Fetches the code vectors referenced by `past_feedbacks`, scored against `search_vector`.
  # With no feedback there is nothing to look up, so the database is not queried.
  defp find_for_feedback(_search_vector, []), do: []

  defp find_for_feedback(search_vector, past_feedbacks) do
    code_vector_ids =
      past_feedbacks
      |> Enum.map(& &1.code_vector_id)
      |> Enum.uniq()

    search_vector
    |> matches_query()
    |> where([v], v.id in ^code_vector_ids)
    |> Repo.all()
  end

  # Base query shared by the lookups above: selects `CodeVectorMatch` structs ordered by
  # cosine distance to `search_vector`, with similarity computed as `1 - distance`.
  defp matches_query(search_vector) do
    from v in CodeVector,
      order_by: cosine_distance(v.description_vector, ^search_vector),
      select: %CodeVectorMatch{
        id: v.id,
        code: v.code,
        description: v.description,
        cosine_similarity: 1 - cosine_distance(v.description_vector, ^search_vector),
        weighting: []
      }
  end
end