noahsettersten committed on
Commit
bae3e66
1 Parent(s): fc29140

chore: Break processing flow into functions

Browse files
lib/medical_transcription/transcriber.ex CHANGED
@@ -2,11 +2,11 @@ defmodule MedicalTranscription.Transcriber do
2
  @input_filename "CMS32_DESC_LONG_SHORT_DX"
3
 
4
  def stream_transcription_and_search(live_view_pid, audio_file_path) do
5
- {model_info, tokenizer} = AudioTagger.Classifier.SemanticSearch.prepare_model()
6
  labels_df = read_labels_from_csv!()
7
- label_embeddings_path = Path.join(__DIR__, "../../#{@input_filename}.bin")
8
 
9
  # TODO: We could explore storing these vectors within pgvector or Pinecone.io
 
10
  label_embeddings =
11
  AudioTagger.Classifier.SemanticSearch.load_label_vectors(label_embeddings_path)
12
 
@@ -17,26 +17,8 @@ defmodule MedicalTranscription.Transcriber do
17
  |> Stream.with_index() do
18
  # TODO: A potential improvement would be to not code each chunk of transcribed audio, but to instead gather
19
  # complete sentences based on punctuation.
20
- tags =
21
- AudioTagger.Classifier.SemanticSearch.tag_one(
22
- {model_info, tokenizer},
23
- labels_df,
24
- label_embeddings,
25
- chunk.text
26
- )
27
-
28
- [start_mark, end_mark] =
29
- for seconds <- [chunk.start_timestamp_seconds, chunk.end_timestamp_seconds] do
30
- seconds |> round() |> Time.from_seconds_after_midnight() |> Time.to_string()
31
- end
32
-
33
- chunk_result = %{
34
- id: index,
35
- start_mark: start_mark,
36
- end_mark: end_mark,
37
- text: chunk.text,
38
- tags: tags
39
- }
40
 
41
  send(live_view_pid, {:transcription_row, chunk_result})
42
  end
@@ -55,4 +37,30 @@ defmodule MedicalTranscription.Transcriber do
55
  |> Explorer.DataFrame.select([0, 1, 2])
56
  |> Explorer.DataFrame.rename(["code", "long_description", "short_description"])
57
  end
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
  end
 
2
  @input_filename "CMS32_DESC_LONG_SHORT_DX"
3
 
4
  def stream_transcription_and_search(live_view_pid, audio_file_path) do
5
+ model_tuple = AudioTagger.Classifier.SemanticSearch.prepare_model()
6
  labels_df = read_labels_from_csv!()
 
7
 
8
  # TODO: We could explore storing these vectors within pgvector or Pinecone.io
9
+ label_embeddings_path = Path.join(__DIR__, "../../#{@input_filename}.bin")
10
  label_embeddings =
11
  AudioTagger.Classifier.SemanticSearch.load_label_vectors(label_embeddings_path)
12
 
 
17
  |> Stream.with_index() do
18
  # TODO: A potential improvement would be to not code each chunk of transcribed audio, but to instead gather
19
  # complete sentences based on punctuation.
20
+
21
+ chunk_result = process_chunk(model_tuple, labels_df, label_embeddings, index, chunk)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
 
23
  send(live_view_pid, {:transcription_row, chunk_result})
24
  end
 
37
  |> Explorer.DataFrame.select([0, 1, 2])
38
  |> Explorer.DataFrame.rename(["code", "long_description", "short_description"])
39
  end
40
+
41
+ defp process_chunk(model_tuple, labels_df, label_embeddings, index, chunk) do
42
+ tags =
43
+ AudioTagger.Classifier.SemanticSearch.tag_one(
44
+ model_tuple,
45
+ labels_df,
46
+ label_embeddings,
47
+ chunk.text
48
+ )
49
+
50
+ [start_mark, end_mark] = process_timestamps(chunk)
51
+
52
+ %{
53
+ id: index,
54
+ start_mark: start_mark,
55
+ end_mark: end_mark,
56
+ text: chunk.text,
57
+ tags: tags
58
+ }
59
+ end
60
+
61
+ defp process_timestamps(chunk) do
62
+ for seconds <- [chunk.start_timestamp_seconds, chunk.end_timestamp_seconds] do
63
+ seconds |> round() |> Time.from_seconds_after_midnight() |> Time.to_string()
64
+ end
65
+ end
66
  end