Spaces:

headway
/

medicode

Runtime error

App Files Files Community

noahsettersten committed on Jan 19

Commit

9f13621

•

1 Parent(s): 7ae16f7

refactor: Separate transcription and code searching

Browse files

- Also, make updates for the renaming of AudioTagger modules (e.g.
`Structs` namespace, `SemanticSearchConfiguration` struct).

Files changed (2) hide show

lib/medical_transcription/code_searcher.ex +47 -0
lib/medical_transcription/transcriber.ex +27 -55

lib/medical_transcription/code_searcher.ex ADDED Viewed

	@@ -0,0 +1,47 @@

+defmodule MedicalTranscription.CodeSearcher do
+  @moduledoc """
+  Takes a portion of text and searches for closely matching results within a list of vectors, by using AudioTagger's
+  SemanticSearch module.
+  """
+  @input_filename "icd9_codelist" # Base filename (no extension) shared by the label CSV and the vector binary.
+  alias AudioTagger.Classifier.SemanticSearch
+  alias AudioTagger.Structs.SemanticSearchConfiguration
+  def prepare_search_configuration() do # Builds the %SemanticSearchConfiguration{} that process_chunk/2 pattern-matches on.
+    {model_info, tokenizer} = SemanticSearch.prepare_model()
+    labels_df = read_labels_from_csv!() # Data frame with string "code" and "long_description" columns.
+    label_embeddings = SemanticSearch.load_label_vectors(vectors_filepath()) # Read from the ".bin" file next to the CSV.
+    %SemanticSearchConfiguration{
+      labels_df: labels_df,
+      label_embeddings: label_embeddings,
+      model_info: model_info,
+      tokenizer: tokenizer,
+      opts: [similarity_threshold: 0.8] # NOTE(review): presumably the minimum similarity for a match - confirm in AudioTagger.
+    }
+  end
+  def process_chunk(%SemanticSearchConfiguration{} = input, text) do # Returns the result of SemanticSearch.tag_one/2 for `text`.
+    SemanticSearch.tag_one(input, text)
+  end
+  defp read_labels_from_csv! do # Loads the label CSV via Explorer; the bang variant raises instead of returning an error tuple.
+    Explorer.DataFrame.from_csv!(
+      labels_filepath(),
+      dtypes: [
+        {"code", :string},
+        {"long_description", :string}
+      ]
+    )
+  end
+  defp input_filepath() do # Extension-less path inside AudioTagger's sample-data cache directory.
+    Path.join(AudioTagger.SampleData.cache_dir(), @input_filename)
+  end
+  defp vectors_filepath(), do: "#{input_filepath()}.bin" # Path handed to SemanticSearch.load_label_vectors/1.
+  defp labels_filepath(), do: "#{input_filepath()}.csv" # Path handed to Explorer.DataFrame.from_csv!/2.
+end

lib/medical_transcription/transcriber.ex CHANGED Viewed

@@ -1,74 +1,46 @@
 defmodule MedicalTranscription.Transcriber do
-  @input_filename "icd9_codelist" # Removed here; the same attribute is now defined in MedicalTranscription.CodeSearcher.
-  alias AudioTagger.Classifier.{SemanticSearch, SemanticSearchInput} # Removed; CodeSearcher aliases SemanticSearch and the renamed SemanticSearchConfiguration struct.
   def stream_transcription_and_search(live_view_pid, audio_file_path) do
-    {model_info, tokenizer} = AudioTagger.Classifier.SemanticSearch.prepare_model() # Model, label and vector loading moved to CodeSearcher.prepare_search_configuration/0.
-    labels_df = read_labels_from_csv!()
-    # TODO: We could explore storing these vectors within pgvector or Pinecone.io
-    label_embeddings =
-      AudioTagger.Classifier.SemanticSearch.load_label_vectors(vectors_filepath())
     # Audio transcription + semantic search
-    for {chunk, index} <-
-          TranscriptionServing
-          |> Nx.Serving.batched_run({:file, audio_file_path})
-          |> Stream.with_index() do
-      # TODO: A potential improvement would be to not code each chunk of transcribed audio, but to instead gather
-      # complete sentences based on punctuation.
-      input = %SemanticSearchInput{ # Was rebuilt on every loop iteration; the replacement builds the configuration once before the loop.
-        labels_df: labels_df,
-        label_embeddings: label_embeddings,
-        model_info: model_info,
-        tokenizer: tokenizer,
-        opts: [similarity_threshold: 0.8]
-      }
-      chunk_result = process_chunk(input, index, chunk)
-      send(live_view_pid, {:transcription_row, chunk_result})
     end
   end
-  defp read_labels_from_csv! do # Removed; an equivalent private function now lives in CodeSearcher.
-    column_definitions = [
-      {"code", :string},
-      {"long_description", :string}
-    ]
-    Explorer.DataFrame.from_csv!(labels_filepath(), dtypes: column_definitions)
   end
-  # defp process_chunk(model_tuple, labels_df, label_embeddings, index, chunk) do
-  defp process_chunk(%SemanticSearchInput{} = input, index, chunk) do # Removed; tagging now goes through CodeSearcher.process_chunk/2 and the map is built by build_result/3.
-    tags = SemanticSearch.tag_one(input, chunk.text)
-    [start_mark, end_mark] = process_timestamps(chunk)
     %{
       id: index,
-      start_mark: start_mark,
-      end_mark: end_mark,
       text: chunk.text,
       tags: tags
     }
   end
-  defp process_timestamps(chunk) do # Replaced by format_timestamp/1, which formats one value instead of a two-element list.
-    for seconds <- [chunk.start_timestamp_seconds, chunk.end_timestamp_seconds] do
-      seconds |> round() |> Time.from_seconds_after_midnight() |> Time.to_string()
-    end
-  end
-  defp input_filepath() do # This and the two path helpers below were removed; CodeSearcher now defines them.
-    AudioTagger.SampleData.cache_dir()
-    |> Path.join(@input_filename)
   end
-  defp vectors_filepath(), do: "#{input_filepath()}.bin"
-  defp labels_filepath(), do: "#{input_filepath()}.csv"
 end

 defmodule MedicalTranscription.Transcriber do
+  @moduledoc """
+  Takes a path to an audio file and transcribes it to text. As each chunk is available, it passes it to `CodeSearcher`
+  to look for possible matching codes.
+  """
+  alias MedicalTranscription.CodeSearcher
+  # Ideas for future exploration:
+  # - Instead of storing the long description vectors in a binary file on disk, we could store them within a vector DB
+  #   (such as pgvector or Pinecone.io)
+  # - A potential improvement would be to not code each chunk of transcribed audio separately, but to instead gather
+  #   complete sentences based on punctuation. We may want to suggest codes for the entire audio as a single piece as
+  #   well
   def stream_transcription_and_search(live_view_pid, audio_file_path) do # Sends one {:transcription_row, result} message to live_view_pid per transcribed chunk.
+    search_configuration = CodeSearcher.prepare_search_configuration() # Built once, before the loop, and reused for every chunk.
     # Audio transcription + semantic search
+    for {chunk, index} <- stream_transcription(audio_file_path) do
+      tags = CodeSearcher.process_chunk(search_configuration, chunk.text) # Only the chunk's text is searched; timestamps are not passed on.
+      result = build_result(index, chunk, tags)
+      send(live_view_pid, {:transcription_row, result})
     end
   end
+  defp stream_transcription(audio_file_path) do # Runs the file through the TranscriptionServing serving and pairs each chunk with its index.
+    TranscriptionServing
+    |> Nx.Serving.batched_run({:file, audio_file_path})
+    |> Stream.with_index() # Yields {chunk, index} tuples, matching the generator pattern in the caller.
   end
+  defp build_result(index, chunk, tags) do # Shapes the map that is sent to the LiveView process for one chunk.
     %{
       id: index,
+      start_mark: format_timestamp(chunk.start_timestamp_seconds),
+      end_mark: format_timestamp(chunk.end_timestamp_seconds),
       text: chunk.text,
       tags: tags
     }
   end
+  defp format_timestamp(seconds) do # Rounds to whole seconds and renders a time-of-day string; values of a day or more wrap around.
+    seconds |> round() |> Time.from_seconds_after_midnight() |> Time.to_string()
   end
 end