noahsettersten's picture
feat: Search for codes by code or description
3280f20
raw
history blame
6.01 kB
defmodule MedicalTranscription.Coding do
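@moduledoc """
Matches transcribed text against the medical codes stored as `CodeVector` records.

Text is turned into a vector embedding by the application's text embedding serving and compared against the stored
description vectors using pgvector cosine distance. The resulting similarity scores are then adjusted using related
past user feedback. Codes can also be looked up with a plain text search on their code or description.
"""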
alias MedicalTranscription.Repo
import Ecto.Query
import Pgvector.Ecto.Query
alias MedicalTranscription.Coding.{CodeVector, CodeVectorMatch}
@doc """
Builds a `CodeVector` changeset from `params` and inserts it.

Returns `{:ok, "Success!"}` when the insert succeeds, or `{:error, errors}` where `errors` is the result of
`Repo.collect_errors/1` for the failed changeset.
"""
def insert_vector(params) do
  insert_result =
    %CodeVector{}
    |> CodeVector.changeset(params)
    |> Repo.insert()

  case insert_result do
    {:error, invalid_changeset} -> {:error, Repo.collect_errors(invalid_changeset)}
    {:ok, _inserted} -> {:ok, "Success!"}
  end
end
@doc "Returns `true` when at least one `CodeVector` row has the given `code`, and `false` otherwise."
def exists_for_code?(code) do
  CodeVector
  |> where([v], v.code == ^code)
  |> Repo.exists?()
end
@doc """
Returns `true` once the `CodeVector` table holds at least 14,567 rows.

NOTE(review): 14,567 is presumably the size of the complete ICD-9 code list loaded into this table - confirm against
the code that seeds it.
"""
def icd9_present? do
  Repo.aggregate(CodeVector, :count) >= 14_567
end
@doc """
Finds up to 5 `CodeVector` records whose code or description contains `text` (case-insensitive).

An empty string returns `[]` without querying the database. The text is matched literally: the `LIKE` wildcard
characters `%` and `_`, and the escape character `\\`, are escaped before the pattern is built, so searching for
`"%"` only finds rows that actually contain a percent sign.
"""
def search_for_code_vector(""), do: []

def search_for_code_vector(text) when is_binary(text) do
  pattern = "%" <> escape_like_pattern(text) <> "%"

  Repo.all(
    from v in CodeVector,
      where: ilike(v.code, ^pattern) or ilike(v.description, ^pattern),
      limit: 5
  )
end

# Prefixes every `LIKE` metacharacter in `text` with a backslash (PostgreSQL's default `LIKE` escape character) so that
# user input cannot act as a wildcard.
defp escape_like_pattern(text) do
  String.replace(text, ["\\", "%", "_"], &("\\" <> &1))
end
@doc """
Classifies a chunk of transcribed `text` against the code vectors stored in the database.

Steps:

1. Create a vector embedding for `text`.
2. Look up related user-provided feedback for that embedding.
3. Fetch the closest code vectors, plus the code vectors referenced by that feedback.
4. De-duplicate the combined matches, adjust their similarity scores based on the feedback, drop matches below the
   threshold and return the rest sorted from most to least similar.

## Options

  * `:num_results` - how many nearest code vectors to fetch (default `5`).
  * `:similarity_threshold` - minimum weighted similarity a match needs to be returned (default `0.80`).

The full `opts` list is also passed to `MedicalTranscription.Feedback.find_related_feedback/2`.
"""
def process_chunk(text, opts \\ []) do
  num_results = Keyword.get(opts, :num_results, 5)
  min_similarity = Keyword.get(opts, :similarity_threshold, 0.80)

  embedding = compute_vector_as_list(text)
  feedbacks = MedicalTranscription.Feedback.find_related_feedback(embedding, opts)

  nearest_matches = find_similar(embedding, num_results)
  feedback_matches = find_for_feedback(embedding, feedbacks)

  all_matches = Enum.uniq_by(nearest_matches ++ feedback_matches, & &1.id)

  all_matches
  |> weight_code_vectors(feedbacks)
  |> filter_below_threshold(min_similarity)
  |> sort_by_similarity()
end
@doc """
Creates a vector embedding for `text` and returns it as a flat list.

The embedding is produced by the `MedicalTranscription.TextEmbeddingServing` serving in the application's supervision
tree.
"""
def compute_vector_as_list(text) do
  serving_output = Nx.Serving.batched_run(MedicalTranscription.TextEmbeddingServing, text)

  serving_output
  |> Map.get(:embedding)
  |> Nx.to_flat_list()
end
# Multipliers applied to a match's cosine similarity for each related piece of past feedback: a positive response
# raises the score, a negative response lowers it.
@positive_response_factor 1.1
@negative_response_factor 0.9

# Applies feedback weighting to every `%CodeVectorMatch{}` in `code_vector_matches`. The matches and the
# `%CodeFeedback{}`s in `past_feedbacks` are expected to have been found from the same input text.
defp weight_code_vectors(code_vector_matches, past_feedbacks) do
  for match <- code_vector_matches, do: weight_code_vector(match, past_feedbacks)
end
# Adjusts one match using the feedback entries whose `code_vector_id` equals the match's `id`.
#
# With no such feedback the match only gets `weighting: [:none]`. Otherwise every relevant feedback is applied in turn:
# each one rescales the running `cosine_similarity` and appends its tag to `weighting`.
defp weight_code_vector(code_vector_match, past_feedbacks) do
  case Enum.filter(past_feedbacks, &(&1.code_vector_id == code_vector_match.id)) do
    [] ->
      %{code_vector_match | weighting: [:none]}

    matching_feedbacks ->
      Enum.reduce(matching_feedbacks, code_vector_match, fn feedback, weighted ->
        adjustment = weight_code_vector_similarity(weighted.cosine_similarity, feedback)
        Map.merge(weighted, adjustment, &merge_code_vector_match_attributes/3)
      end)
  end
end
# Builds the attributes to merge into a match for a single feedback entry.
#
# `nil` feedback yields only `weighting: [:none]`. Feedback with a boolean `response` yields the similarity multiplied
# by the positive or negative factor, together with the matching `:positive` / `:negative` tag.
defp weight_code_vector_similarity(_similarity, nil), do: %{weighting: [:none]}

defp weight_code_vector_similarity(similarity, %{response: positive?}) when is_boolean(positive?) do
  {factor, tag} =
    if positive? do
      {@positive_response_factor, :positive}
    else
      {@negative_response_factor, :negative}
    end

  %{cosine_similarity: similarity * factor, weighting: [tag]}
end
# Conflict resolver for `Map.merge/3` when applying a feedback adjustment to a match: `:weighting` lists are
# concatenated so every applied tag is kept; for any other key the new value replaces the old one.
defp merge_code_vector_match_attributes(key, current, incoming) do
  case key do
    :weighting -> current ++ incoming
    _other -> incoming
  end
end
# Keeps only the matches whose `cosine_similarity` meets or exceeds `similarity_threshold`.
defp filter_below_threshold(code_vector_matches, similarity_threshold) do
  for match <- code_vector_matches,
      match.cosine_similarity >= similarity_threshold,
      do: match
end
# Orders matches from highest to lowest `cosine_similarity`. Weighting can change the scores after the database has
# already ordered the rows, so the list is sorted again here. Using `>=` keeps matches with equal scores in their
# existing relative order.
defp sort_by_similarity(code_vector_matches) do
  Enum.sort(code_vector_matches, fn left, right ->
    left.cosine_similarity >= right.cosine_similarity
  end)
end
# Returns the `max_results` code vectors closest to `search_vector` as `%CodeVectorMatch{}` structs, nearest first.
# Closeness is the cosine distance between `search_vector` and each row's `description_vector`; the reported
# `cosine_similarity` is `1 - distance`, and `weighting` starts out empty.
defp find_similar(search_vector, max_results) do
  CodeVector
  |> order_by([v], cosine_distance(v.description_vector, ^search_vector))
  |> limit(^max_results)
  |> select([v], %CodeVectorMatch{
    id: v.id,
    code: v.code,
    description: v.description,
    cosine_similarity: 1 - cosine_distance(v.description_vector, ^search_vector),
    weighting: []
  })
  |> Repo.all()
end
# Retrieves the code vectors referenced by past feedback for a given search vector, as `%CodeVectorMatch{}` structs
# ordered nearest first. This lets the weighting step consider codes that are not among the closest results returned
# by `find_similar/2`.
#
# When there is no past feedback there is nothing to look up, so the database query is skipped entirely.
defp find_for_feedback(_search_vector, []), do: []

defp find_for_feedback(search_vector, past_feedbacks) do
  code_vector_ids = Enum.map(past_feedbacks, & &1.code_vector_id)

  Repo.all(
    from v in CodeVector,
      where: v.id in ^code_vector_ids,
      order_by: cosine_distance(v.description_vector, ^search_vector),
      select: %CodeVectorMatch{
        id: v.id,
        code: v.code,
        description: v.description,
        cosine_similarity: 1 - cosine_distance(v.description_vector, ^search_vector),
        weighting: []
      }
  )
end
end