defmodule MedicalTranscription.Transcriber do
  @moduledoc """
  Transcribes an audio file chunk by chunk and tags each transcribed chunk via
  semantic search against a set of labels loaded from a CSV of diagnosis codes.

  Every processed chunk is sent to a caller-supplied process as a
  `{:transcription_row, row}` message.
  """

  alias AudioTagger.Classifier.{SemanticSearch, SemanticSearchInput}

  # Base name (without extension) shared by the labels CSV and the label-embeddings
  # `.bin` file; both are resolved two directories above this source file.
  @input_filename "CMS32_DESC_LONG_SHORT_DX"

  # Value passed to the semantic search as its `:similarity_threshold` option.
  @similarity_threshold 0.8

  @doc """
  Transcribes the audio file at `audio_file_path` and sends one message per
  transcribed chunk to `live_view_pid`.

  Each message has the shape `{:transcription_row, row}`, where `row` is a map with
  the keys `:id` (zero-based chunk index), `:start_mark`, `:end_mark` (timestamps
  rendered as time-of-day strings), `:text` and `:tags`.

  Returns the list of messages that were sent, in chunk order.
  """
  @spec stream_transcription_and_search(pid(), Path.t()) :: [{:transcription_row, map()}]
  def stream_transcription_and_search(live_view_pid, audio_file_path) do
    {model_info, tokenizer} = SemanticSearch.prepare_model()

    labels_df = read_labels_from_csv!()

    # TODO: We could explore storing these vectors within pgvector or Pinecone.io
    label_embeddings_path = Path.join(__DIR__, "../../#{@input_filename}.bin")
    label_embeddings = SemanticSearch.load_label_vectors(label_embeddings_path)

    # Nothing in the search input depends on the chunk being processed, so it is
    # built once here instead of once per chunk.
    input = %SemanticSearchInput{
      labels_df: labels_df,
      label_embeddings: label_embeddings,
      model_info: model_info,
      tokenizer: tokenizer,
      opts: [similarity_threshold: @similarity_threshold]
    }

    # Audio transcription + semantic search
    #
    # NOTE(review): enumerating the `batched_run/2` result directly assumes
    # `TranscriptionServing` is configured to stream its chunks — confirm.
    #
    # TODO: A potential improvement would be to not code each chunk of transcribed audio, but to instead gather
    # complete sentences based on punctuation.
    for {chunk, index} <-
          TranscriptionServing
          |> Nx.Serving.batched_run({:file, audio_file_path})
          |> Stream.with_index() do
      chunk_result = process_chunk(input, index, chunk)

      send(live_view_pid, {:transcription_row, chunk_result})
    end
  end

  # Loads the labels CSV that sits next to the embeddings file, keeps its first three
  # columns and renames them to `code`, `long_description` and `short_description`.
  # Raises if the CSV cannot be read.
  defp read_labels_from_csv! do
    __DIR__
    |> Path.join("../../#{@input_filename}.csv")
    |> Explorer.DataFrame.from_csv!(
      dtypes: [
        {"DIAGNOSIS CODE", :string},
        {"LONG DESCRIPTION", :string},
        {"SHORT DESCRIPTION", :string}
      ]
    )
    |> Explorer.DataFrame.select([0, 1, 2])
    |> Explorer.DataFrame.rename(["code", "long_description", "short_description"])
  end

  # Tags a single chunk's text and assembles the row map that is sent to the caller.
  defp process_chunk(%SemanticSearchInput{} = input, index, chunk) do
    tags = SemanticSearch.tag_one(input, chunk.text)
    [start_mark, end_mark] = process_timestamps(chunk)

    %{
      id: index,
      start_mark: start_mark,
      end_mark: end_mark,
      text: chunk.text,
      tags: tags
    }
  end

  # Renders the chunk's start and end offsets (in seconds) as time-of-day strings,
  # returned as `[start, end]`.
  #
  # NOTE(review): `round/1` raises when given `nil`, so this assumes the serving always
  # emits both timestamps for every chunk — confirm against the serving configuration.
  defp process_timestamps(chunk) do
    for seconds <- [chunk.start_timestamp_seconds, chunk.end_timestamp_seconds] do
      seconds
      |> round()
      |> Time.from_seconds_after_midnight()
      |> Time.to_string()
    end
  end
end