defmodule MedicalTranscription.Coding do
  @moduledoc """
  Classifies portions of transcribed text against the code vectors stored in the database.

  A text chunk is turned into a vector embedding, the closest `CodeVector` rows are looked up using
  pgvector's cosine distance, and the resulting similarity scores are then adjusted based on past user
  feedback before being filtered and sorted.
  """

  import Ecto.Query
  import Pgvector.Ecto.Query

  alias MedicalTranscription.Coding.{CodeVector, CodeVectorMatch}
  alias MedicalTranscription.Repo

  # Minimum number of rows in the code vectors table for `icd9_present?/0` to report `true`.
  # NOTE(review): presumably the size of the full ICD-9 code list that gets imported - confirm.
  @icd9_min_row_count 14_567

  # Multipliers applied to a match's cosine similarity for each piece of related past feedback.
  @positive_response_factor 1.1
  @negative_response_factor 0.9

  @doc """
  Inserts a code vector built from `params`.

  Returns `{:ok, "Success!"}` when the insert succeeds, or `{:error, errors}` where `errors` is the result
  of `Repo.collect_errors/1` on the failed changeset.
  """
  def insert_vector(params) do
    changeset = CodeVector.changeset(%CodeVector{}, params)

    case Repo.insert(changeset) do
      {:ok, _} -> {:ok, "Success!"}
      {:error, changeset} -> {:error, Repo.collect_errors(changeset)}
    end
  end

  @doc "Returns `true` if a code vector with the given `code` exists in the database."
  @spec exists_for_code?(term()) :: boolean()
  def exists_for_code?(code) do
    Repo.exists?(from v in CodeVector, where: v.code == ^code)
  end

  @doc "Returns `true` if the code vectors table holds at least #{@icd9_min_row_count} rows."
  @spec icd9_present?() :: boolean()
  def icd9_present? do
    Repo.aggregate(CodeVector, :count) >= @icd9_min_row_count
  end

  @doc """
  Takes a chunk of transcribed text and classifies it based on the code vectors in the database.

  1. First, create a vector embedding for the passed `text`.
  2. Then, look up related past feedback via `MedicalTranscription.Feedback.find_related_feedback/2`.
  3. Then, search for the most similar code vectors, and additionally fetch every code vector referenced
     by that feedback.
  4. Finally, adjust the similarity scores based on the feedback, drop matches below the threshold, and
     sort the remainder by descending similarity.

  ## Options

    * `:num_results` - how many nearest code vectors to fetch (default `5`).
    * `:similarity_threshold` - minimum (inclusive) weighted similarity for a match to be kept
      (default `0.80`).

  The full `opts` list is also forwarded to `MedicalTranscription.Feedback.find_related_feedback/2`.
  """
  def process_chunk(text, opts \\ []) do
    k = Keyword.get(opts, :num_results, 5)
    similarity_threshold = Keyword.get(opts, :similarity_threshold, 0.80)

    search_vector_for_db = compute_vector_as_list(text)

    past_feedbacks =
      MedicalTranscription.Feedback.find_related_feedback(search_vector_for_db, opts)

    code_vectors = find_similar(search_vector_for_db, k)
    code_vectors_for_feedback = find_for_feedback(search_vector_for_db, past_feedbacks)

    code_vectors
    |> Enum.concat(code_vectors_for_feedback)
    |> Enum.uniq_by(& &1.id)
    |> weight_code_vectors(past_feedbacks)
    |> filter_below_threshold(similarity_threshold)
    |> sort_by_similarity()
  end

  @doc """
  Creates a vector embedding for `text`.

  Runs `text` through the `MedicalTranscription.TextEmbeddingServing` serving with
  `Nx.Serving.batched_run/2` and returns the resulting `:embedding` as a flat list.
  """
  def compute_vector_as_list(text) do
    MedicalTranscription.TextEmbeddingServing
    |> Nx.Serving.batched_run(text)
    |> Map.get(:embedding)
    |> Nx.to_flat_list()
  end

  # Takes a list of `%CodeVectorMatch{}`es and a list of past feedbacks, both found based on the same input
  # `text`. Then, modifies the similarity score of each match based on whether there was past feedback with
  # a positive or negative response.
  defp weight_code_vectors(code_vector_matches, past_feedbacks) do
    Enum.map(code_vector_matches, &weight_code_vector(&1, past_feedbacks))
  end

  # Applies every feedback that references this match, one after another, so several feedbacks for the
  # same code compound. A match without any feedback is tagged with `weighting: [:none]`.
  defp weight_code_vector(code_vector_match, past_feedbacks) do
    case Enum.filter(past_feedbacks, &(&1.code_vector_id == code_vector_match.id)) do
      [] ->
        %{code_vector_match | weighting: [:none]}

      relevant_feedbacks ->
        Enum.reduce(relevant_feedbacks, code_vector_match, fn feedback, acc ->
          new_attributes = weight_code_vector_similarity(acc.cosine_similarity, feedback)
          Map.merge(acc, new_attributes, &merge_code_vector_match_attributes/3)
        end)
    end
  end

  defp weight_code_vector_similarity(_similarity, nil), do: %{weighting: [:none]}

  defp weight_code_vector_similarity(similarity, %{response: true}) do
    %{
      cosine_similarity: similarity * @positive_response_factor,
      weighting: [:positive]
    }
  end

  defp weight_code_vector_similarity(similarity, %{response: false}) do
    %{
      cosine_similarity: similarity * @negative_response_factor,
      weighting: [:negative]
    }
  end

  # Conflict resolver for `Map.merge/3`: weightings accumulate, every other attribute takes the new value.
  defp merge_code_vector_match_attributes(:weighting, value1, value2), do: value1 ++ value2
  defp merge_code_vector_match_attributes(_key, _value1, value2), do: value2

  # Keeps only the matches whose similarity is at or above the given threshold.
  defp filter_below_threshold(code_vector_matches, similarity_threshold) do
    Enum.filter(code_vector_matches, &(&1.cosine_similarity >= similarity_threshold))
  end

  # We sort by similarity again after weighting, to ensure that the results are sorted after processing.
  # `Enum.sort_by/3` is stable, so matches with equal similarity keep their relative order.
  defp sort_by_similarity(code_vector_matches) do
    Enum.sort_by(code_vector_matches, & &1.cosine_similarity, :desc)
  end

  # Finds the `limit` closest records using cosine distance on the vector embeddings in the database.
  defp find_similar(search_vector, limit) do
    Repo.all(from v in matches_query(search_vector), limit: ^limit)
  end

  # Retrieves code vectors referenced in past feedback for a given search text. This allows including
  # additional codes for the weighting process outside of the closest ones returned by `find_similar/2`.
  # Without any past feedback there is nothing to look up, so the database round trip is skipped.
  defp find_for_feedback(_search_vector, []), do: []

  defp find_for_feedback(search_vector, past_feedbacks) do
    code_vector_ids_for_past_feedback = Enum.map(past_feedbacks, & &1.code_vector_id)

    Repo.all(
      from v in matches_query(search_vector),
        where: v.id in ^code_vector_ids_for_past_feedback
    )
  end

  # Base query shared by both lookups: orders code vectors by cosine distance to `search_vector` and
  # selects each row as a `%CodeVectorMatch{}` whose similarity is `1 - cosine_distance`.
  defp matches_query(search_vector) do
    from v in CodeVector,
      order_by: cosine_distance(v.description_vector, ^search_vector),
      select: %CodeVectorMatch{
        id: v.id,
        code: v.code,
        description: v.description,
        cosine_similarity: 1 - cosine_distance(v.description_vector, ^search_vector),
        weighting: []
      }
  end
end