Spaces:

headway
/

medicode

Runtime error

App Files Files Community

noahsettersten committed on Jan 10

Commit

bae3e66

•

1 Parent(s): fc29140

chore: Break processing flow into functions

Browse files

Files changed (1) hide show

lib/medical_transcription/transcriber.ex +30 -22

lib/medical_transcription/transcriber.ex CHANGED Viewed

@@ -2,11 +2,11 @@ defmodule MedicalTranscription.Transcriber do
   @input_filename "CMS32_DESC_LONG_SHORT_DX"
   def stream_transcription_and_search(live_view_pid, audio_file_path) do
-    {model_info, tokenizer} = AudioTagger.Classifier.SemanticSearch.prepare_model()
     labels_df = read_labels_from_csv!()
-    label_embeddings_path = Path.join(__DIR__, "../../#{@input_filename}.bin")
     # TODO: We could explore storing these vectors within pgvector or Pinecone.io
     label_embeddings =
       AudioTagger.Classifier.SemanticSearch.load_label_vectors(label_embeddings_path)
@@ -17,26 +17,8 @@ defmodule MedicalTranscription.Transcriber do
           |> Stream.with_index() do
       # TODO: A potential improvement would be to not code each chunk of transcribed audio, but to instead gather
       # complete sentences based on punctuation.
-      tags =
-        AudioTagger.Classifier.SemanticSearch.tag_one(
-          {model_info, tokenizer},
-          labels_df,
-          label_embeddings,
-          chunk.text
-        )
-      [start_mark, end_mark] =
-        for seconds <- [chunk.start_timestamp_seconds, chunk.end_timestamp_seconds] do
-          seconds |> round() |> Time.from_seconds_after_midnight() |> Time.to_string()
-        end
-      chunk_result = %{
-        id: index,
-        start_mark: start_mark,
-        end_mark: end_mark,
-        text: chunk.text,
-        tags: tags
-      }
       send(live_view_pid, {:transcription_row, chunk_result})
     end
@@ -55,4 +37,30 @@ defmodule MedicalTranscription.Transcriber do
     |> Explorer.DataFrame.select([0, 1, 2])
     |> Explorer.DataFrame.rename(["code", "long_description", "short_description"])
   end
 end

   @input_filename "CMS32_DESC_LONG_SHORT_DX"
   def stream_transcription_and_search(live_view_pid, audio_file_path) do
+    model_tuple = AudioTagger.Classifier.SemanticSearch.prepare_model()
     labels_df = read_labels_from_csv!()
     # TODO: We could explore storing these vectors within pgvector or Pinecone.io
+    label_embeddings_path = Path.join(__DIR__, "../../#{@input_filename}.bin")
     label_embeddings =
       AudioTagger.Classifier.SemanticSearch.load_label_vectors(label_embeddings_path)
           |> Stream.with_index() do
       # TODO: A potential improvement would be to not code each chunk of transcribed audio, but to instead gather
       # complete sentences based on punctuation.
+      chunk_result = process_chunk(model_tuple, labels_df, label_embeddings, index, chunk)
       send(live_view_pid, {:transcription_row, chunk_result})
     end
     |> Explorer.DataFrame.select([0, 1, 2])
     |> Explorer.DataFrame.rename(["code", "long_description", "short_description"])
   end
+  defp process_chunk(model_tuple, labels_df, label_embeddings, index, chunk) do
+    tags =
+      AudioTagger.Classifier.SemanticSearch.tag_one(
+        model_tuple,
+        labels_df,
+        label_embeddings,
+        chunk.text
+      )
+    [start_mark, end_mark] = process_timestamps(chunk)
+    %{
+      id: index,
+      start_mark: start_mark,
+      end_mark: end_mark,
+      text: chunk.text,
+      tags: tags
+    }
+  end
+  defp process_timestamps(chunk) do
+      for seconds <- [chunk.start_timestamp_seconds, chunk.end_timestamp_seconds] do
+        seconds |> round() |> Time.from_seconds_after_midnight() |> Time.to_string()
+      end
+  end
 end