defmodule MedicalTranscription.Transcriber do
  @moduledoc """
  Transcribes an audio file chunk by chunk and tags each chunk of text through semantic search
  against a table of diagnosis codes, sending every finished row to a given process.

  The code table is read from the `CMS32_DESC_LONG_SHORT_DX` CSV file and the label vectors
  from the `.bin` file of the same name; both paths are resolved relative to this source file.
  """

  alias AudioTagger.Classifier.{SemanticSearch, SemanticSearchInput}

  @input_filename "CMS32_DESC_LONG_SHORT_DX"

  # Handed to the semantic search as its `:similarity_threshold` option.
  @similarity_threshold 0.8

  @doc """
  Transcribes the audio at `audio_file_path` and sends one message per transcribed chunk to
  `live_view_pid`.

  Each message has the shape `{:transcription_row, row}`, where `row` is a map with the keys
  `:id` (zero-based chunk index), `:start_mark`, `:end_mark`, `:text` and `:tags`.

  Returns the list of messages that were sent, in chunk order.
  """
  @spec stream_transcription_and_search(pid(), Path.t()) :: [{:transcription_row, map()}]
  def stream_transcription_and_search(live_view_pid, audio_file_path) do
    {model_info, tokenizer} = SemanticSearch.prepare_model()

    labels_df = read_labels_from_csv!()

    # TODO: We could explore storing these vectors within pgvector or Pinecone.io
    label_embeddings =
      __DIR__
      |> Path.join("../../#{@input_filename}.bin")
      |> SemanticSearch.load_label_vectors()

    # None of these fields depend on the chunk, so the search input is built once instead of
    # once per chunk.
    input = %SemanticSearchInput{
      labels_df: labels_df,
      label_embeddings: label_embeddings,
      model_info: model_info,
      tokenizer: tokenizer,
      opts: [similarity_threshold: @similarity_threshold]
    }

    # Audio transcription + semantic search
    for {chunk, index} <-
          TranscriptionServing
          |> Nx.Serving.batched_run({:file, audio_file_path})
          |> Stream.with_index() do
      # TODO: A potential improvement would be to not code each chunk of transcribed audio, but
      # to instead gather complete sentences based on punctuation.
      send(live_view_pid, {:transcription_row, process_chunk(input, index, chunk)})
    end
  end

  # Reads the code table from the CSV next to the embeddings file, forcing the three named
  # columns to strings, keeping the first three columns and renaming them to snake_case.
  # Raises if the file cannot be read or parsed.
  defp read_labels_from_csv! do
    __DIR__
    |> Path.join("../../#{@input_filename}.csv")
    |> Explorer.DataFrame.from_csv!(
      dtypes: [
        {"DIAGNOSIS CODE", :string},
        {"LONG DESCRIPTION", :string},
        {"SHORT DESCRIPTION", :string}
      ]
    )
    |> Explorer.DataFrame.select([0, 1, 2])
    |> Explorer.DataFrame.rename(["code", "long_description", "short_description"])
  end

  # Builds the row sent to the receiving process for a single transcribed chunk: the chunk's
  # text, its formatted start/end marks and the tags found for the text.
  defp process_chunk(%SemanticSearchInput{} = input, index, chunk) do
    tags = SemanticSearch.tag_one(input, chunk.text)

    [start_mark, end_mark] = process_timestamps(chunk)

    %{
      id: index,
      start_mark: start_mark,
      end_mark: end_mark,
      text: chunk.text,
      tags: tags
    }
  end

  # Returns `[start_mark, end_mark]` for the chunk, each formatted by `format_timestamp/1`.
  defp process_timestamps(chunk) do
    Enum.map([chunk.start_timestamp_seconds, chunk.end_timestamp_seconds], &format_timestamp/1)
  end

  # Formats a number of seconds as an "HH:MM:SS" string, rounding to the nearest whole second.
  #
  # A missing timestamp yields `nil` rather than raising in `round/1`, so one incomplete chunk
  # does not abort the rest of the transcription.
  # NOTE(review): the chunk keys match Bumblebee's Whisper output, whose timestamps are typed
  # `number() | nil` - confirm `TranscriptionServing` is such a serving and that the receiving
  # LiveView tolerates a `nil` mark.
  defp format_timestamp(nil), do: nil

  defp format_timestamp(seconds) when is_number(seconds) do
    seconds |> round() |> Time.from_seconds_after_midnight() |> Time.to_string()
  end
end