defmodule MedicalTranscription.Coding do
  @moduledoc """
  Takes a portion of transcribed text, computes a vector embedding for it, and searches the `CodeVector` table for
  the most closely matching codes by cosine similarity, adjusting the scores using related user feedback.
  """

  alias MedicalTranscription.Repo
  import Ecto.Query
  import Pgvector.Ecto.Query

  alias MedicalTranscription.Coding.{CodeVector, CodeVectorMatch}

  @doc """
  Builds a `CodeVector` changeset from `params` and inserts it through the repo.

  Returns `{:ok, "Success!"}` when the insert succeeds. On failure returns `{:error, errors}`, where `errors` is
  whatever `Repo.collect_errors/1` produces for the rejected changeset.
  """
  def insert_vector(params) do
    insert_result =
      %CodeVector{}
      |> CodeVector.changeset(params)
      |> Repo.insert()

    case insert_result do
      {:ok, _record} -> {:ok, "Success!"}
      {:error, failed_changeset} -> {:error, Repo.collect_errors(failed_changeset)}
    end
  end

  @doc "Returns `true` when at least one `CodeVector` row has the given `code`, and `false` otherwise."
  def exists_for_code?(code) do
    CodeVector
    |> where([v], v.code == ^code)
    |> Repo.exists?()
  end

  # Row count at or above which the `CodeVector` table is treated as populated.
  # NOTE(review): presumably the size of the complete ICD-9 code list - confirm against the seed data.
  @icd9_minimum_row_count 14_567

  @doc "Returns `true` when the `CodeVector` table contains at least 14,567 rows."
  def icd9_present? do
    Repo.aggregate(CodeVector, :count) >= @icd9_minimum_row_count
  end

  @doc """
  Classifies a chunk of transcribed text against the code vectors stored in the database.

  The steps are:

  1. Compute a vector embedding for `text`.
  2. Look up related user-provided feedback for that embedding.
  3. Fetch the nearest code vectors, plus the code vectors referenced by that feedback.
  4. De-duplicate the combined matches by `id` and adjust their similarity scores using the feedback.
  5. Drop matches scoring below the threshold and return the rest, highest similarity first.

  ## Options

    * `:num_results` - how many nearest code vectors to fetch (default `5`).
    * `:similarity_threshold` - minimum weighted similarity a match needs to be returned (default `0.80`).

  The full `opts` list is also passed on to `MedicalTranscription.Feedback.find_related_feedback/2`.
  """
  def process_chunk(text, opts \\ []) do
    num_results = Keyword.get(opts, :num_results, 5)
    minimum_similarity = Keyword.get(opts, :similarity_threshold, 0.80)

    embedding = compute_vector_as_list(text)
    feedbacks = MedicalTranscription.Feedback.find_related_feedback(embedding, opts)

    nearest_matches = find_similar(embedding, num_results)
    feedback_matches = find_for_feedback(embedding, feedbacks)

    # Nearest matches come first, so they win when the same code vector appears in both lists.
    candidates = Enum.uniq_by(nearest_matches ++ feedback_matches, & &1.id)

    candidates
    |> weight_code_vectors(feedbacks)
    |> filter_below_threshold(minimum_similarity)
    |> sort_by_similarity()
  end

  @doc """
  Runs `text` through `MedicalTranscription.TextEmbeddingServing` and returns the `:embedding` tensor from the
  result, flattened into a plain list.
  """
  def compute_vector_as_list(text) do
    serving_result = Nx.Serving.batched_run(MedicalTranscription.TextEmbeddingServing, text)
    embedding = Map.get(serving_result, :embedding)

    Nx.to_flat_list(embedding)
  end

  # Re-scores each `%CodeVectorMatch{}` using the `%CodeFeedback{}`s that were found for the same input text.
  # A past positive response scales a match's similarity by `@positive_response_factor`; a past negative response
  # scales it by `@negative_response_factor`.
  @positive_response_factor 1.1
  @negative_response_factor 0.9
  defp weight_code_vectors(code_vector_matches, past_feedbacks) do
    for match <- code_vector_matches, do: weight_code_vector(match, past_feedbacks)
  end

  # Applies every feedback whose `code_vector_id` equals this match's `id`, one after another, so the factors
  # compound on the running similarity. A match without any such feedback is tagged with `weighting: [:none]`.
  defp weight_code_vector(code_vector_match, past_feedbacks) do
    matching_feedbacks =
      for feedback <- past_feedbacks, feedback.code_vector_id == code_vector_match.id, do: feedback

    case matching_feedbacks do
      [] ->
        %{code_vector_match | weighting: [:none]}

      _at_least_one ->
        Enum.reduce(matching_feedbacks, code_vector_match, fn feedback, weighted_match ->
          adjustment = weight_code_vector_similarity(weighted_match.cosine_similarity, feedback)

          Map.merge(weighted_match, adjustment, &merge_code_vector_match_attributes/3)
        end)
    end
  end

  # Returns the attributes to merge into a match for a single feedback:
  #   * `nil` feedback       -> only `weighting: [:none]`, similarity untouched
  #   * `response: true`     -> similarity scaled by `@positive_response_factor`, tagged `:positive`
  #   * `response: false`    -> similarity scaled by `@negative_response_factor`, tagged `:negative`
  defp weight_code_vector_similarity(_similarity, nil), do: %{weighting: [:none]}

  defp weight_code_vector_similarity(similarity, %{response: true}),
    do: scaled_similarity_attributes(similarity, @positive_response_factor, :positive)

  defp weight_code_vector_similarity(similarity, %{response: false}),
    do: scaled_similarity_attributes(similarity, @negative_response_factor, :negative)

  # Builds the attribute map for a similarity multiplied by `factor` and labelled with `tag`.
  defp scaled_similarity_attributes(similarity, factor, tag) do
    %{cosine_similarity: similarity * factor, weighting: [tag]}
  end

  # Conflict resolver for `Map.merge/3`: `:weighting` lists are concatenated (existing entries first); for every
  # other key the incoming value replaces the existing one.
  defp merge_code_vector_match_attributes(key, existing, incoming) do
    case key do
      :weighting -> existing ++ incoming
      _other -> incoming
    end
  end

  # Keeps only the matches whose similarity is at or above the threshold (a score equal to it is kept).
  defp filter_below_threshold(code_vector_matches, similarity_threshold) do
    for match <- code_vector_matches,
        match.cosine_similarity >= similarity_threshold,
        do: match
  end

  # Weighting can change the relative order of matches, so order them again here: highest similarity first, with
  # equally scored matches keeping their incoming order.
  defp sort_by_similarity(code_vector_matches) do
    Enum.sort_by(code_vector_matches, & &1.cosine_similarity, &>=/2)
  end

  # Base query shared by `find_similar/2` and `find_for_feedback/2`. It orders code vectors by cosine distance to
  # `search_vector` (closest first) and selects each row as a `%CodeVectorMatch{}` whose `cosine_similarity` is
  # `1 - cosine_distance` and whose `weighting` starts out empty.
  defp code_vector_match_query(search_vector) do
    from v in CodeVector,
      order_by: cosine_distance(v.description_vector, ^search_vector),
      select: %CodeVectorMatch{
        id: v.id,
        code: v.code,
        description: v.description,
        cosine_similarity: 1 - cosine_distance(v.description_vector, ^search_vector),
        weighting: []
      }
  end

  # Finds the `limit` records closest to `search_vector`, using cosine similarity on the vector embeddings in the
  # database.
  defp find_similar(search_vector, limit) do
    base_query = code_vector_match_query(search_vector)

    Repo.all(from v in base_query, limit: ^limit)
  end

  # Retrieves the code vectors referenced in past feedback for a given search text. This allows including additional
  # codes for the weighting process beyond the closest ones returned by `find_similar/2`.
  #
  # With no past feedback there is nothing to look up, so the database is not queried at all.
  defp find_for_feedback(_search_vector, []), do: []

  defp find_for_feedback(search_vector, past_feedbacks) do
    # Several feedbacks may point at the same code vector; each id only needs to be sent once.
    code_vector_ids_for_past_feedback =
      past_feedbacks
      |> Enum.map(& &1.code_vector_id)
      |> Enum.uniq()

    base_query = code_vector_match_query(search_vector)

    Repo.all(from v in base_query, where: v.id in ^code_vector_ids_for_past_feedback)
  end
end