noahsettersten committed on
Commit
9f13621
1 Parent(s): 7ae16f7

refactor: Separate transcription and code searching

Browse files

- Also, update call sites for the renamed AudioTagger modules (e.g. the
`Structs` namespace and the `SemanticSearchConfiguration` struct).

lib/medical_transcription/code_searcher.ex ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ defmodule MedicalTranscription.CodeSearcher do
2
+ @moduledoc """
3
+ Takes a portion of text and searches for closely matching results within a list of vectors, by using AudioTagger's
4
+ SemanticSearch module.
5
+ """
6
+
7
+ @input_filename "icd9_codelist"
8
+
9
+ alias AudioTagger.Classifier.SemanticSearch
10
+ alias AudioTagger.Structs.SemanticSearchConfiguration
11
+
12
+ def prepare_search_configuration() do
13
+ {model_info, tokenizer} = SemanticSearch.prepare_model()
14
+ labels_df = read_labels_from_csv!()
15
+
16
+ label_embeddings = SemanticSearch.load_label_vectors(vectors_filepath())
17
+
18
+ %SemanticSearchConfiguration{
19
+ labels_df: labels_df,
20
+ label_embeddings: label_embeddings,
21
+ model_info: model_info,
22
+ tokenizer: tokenizer,
23
+ opts: [similarity_threshold: 0.8]
24
+ }
25
+ end
26
+
27
+ def process_chunk(%SemanticSearchConfiguration{} = input, text) do
28
+ SemanticSearch.tag_one(input, text)
29
+ end
30
+
31
+ defp read_labels_from_csv! do
32
+ Explorer.DataFrame.from_csv!(
33
+ labels_filepath(),
34
+ dtypes: [
35
+ {"code", :string},
36
+ {"long_description", :string}
37
+ ]
38
+ )
39
+ end
40
+
41
+ defp input_filepath() do
42
+ Path.join(AudioTagger.SampleData.cache_dir(), @input_filename)
43
+ end
44
+
45
+ defp vectors_filepath(), do: "#{input_filepath()}.bin"
46
+ defp labels_filepath(), do: "#{input_filepath()}.csv"
47
+ end
lib/medical_transcription/transcriber.ex CHANGED
@@ -1,74 +1,46 @@
1
  defmodule MedicalTranscription.Transcriber do
2
- @input_filename "icd9_codelist"
3
-
4
- alias AudioTagger.Classifier.{SemanticSearch, SemanticSearchInput}
5
-
 
 
 
 
 
 
 
 
 
6
  def stream_transcription_and_search(live_view_pid, audio_file_path) do
7
- {model_info, tokenizer} = AudioTagger.Classifier.SemanticSearch.prepare_model()
8
-
9
- labels_df = read_labels_from_csv!()
10
-
11
- # TODO: We could explore storing these vectors within pgvector or Pinecone.io
12
- label_embeddings =
13
- AudioTagger.Classifier.SemanticSearch.load_label_vectors(vectors_filepath())
14
 
15
  # Audio transcription + semantic search
16
- for {chunk, index} <-
17
- TranscriptionServing
18
- |> Nx.Serving.batched_run({:file, audio_file_path})
19
- |> Stream.with_index() do
20
- # TODO: A potential improvement would be to not code each chunk of transcribed audio, but to instead gather
21
- # complete sentences based on punctuation.
22
 
23
- input = %SemanticSearchInput{
24
- labels_df: labels_df,
25
- label_embeddings: label_embeddings,
26
- model_info: model_info,
27
- tokenizer: tokenizer,
28
- opts: [similarity_threshold: 0.8]
29
- }
30
-
31
- chunk_result = process_chunk(input, index, chunk)
32
-
33
- send(live_view_pid, {:transcription_row, chunk_result})
34
  end
35
  end
36
 
37
- defp read_labels_from_csv! do
38
- column_definitions = [
39
- {"code", :string},
40
- {"long_description", :string}
41
- ]
42
-
43
- Explorer.DataFrame.from_csv!(labels_filepath(), dtypes: column_definitions)
44
  end
45
 
46
- # defp process_chunk(model_tuple, labels_df, label_embeddings, index, chunk) do
47
- defp process_chunk(%SemanticSearchInput{} = input, index, chunk) do
48
- tags = SemanticSearch.tag_one(input, chunk.text)
49
-
50
- [start_mark, end_mark] = process_timestamps(chunk)
51
-
52
  %{
53
  id: index,
54
- start_mark: start_mark,
55
- end_mark: end_mark,
56
  text: chunk.text,
57
  tags: tags
58
  }
59
  end
60
 
61
- defp process_timestamps(chunk) do
62
- for seconds <- [chunk.start_timestamp_seconds, chunk.end_timestamp_seconds] do
63
- seconds |> round() |> Time.from_seconds_after_midnight() |> Time.to_string()
64
- end
65
- end
66
-
67
- defp input_filepath() do
68
- AudioTagger.SampleData.cache_dir()
69
- |> Path.join(@input_filename)
70
  end
71
-
72
- defp vectors_filepath(), do: "#{input_filepath()}.bin"
73
- defp labels_filepath(), do: "#{input_filepath()}.csv"
74
  end
 
1
  defmodule MedicalTranscription.Transcriber do
2
+ @moduledoc """
3
+ Takes a path to an audio file and transcribes it to text. As each chunk is available, it passes it to `CodeSearcher`
4
+ to look for possible matching codes.
5
+ """
6
+
7
+ alias MedicalTranscription.CodeSearcher
8
+
9
+ # Ideas for future exploration:
10
+ # - Instead of storing the long description vectors in a binary file on disk, we could store them within a vector DB
11
+ # (such as pgvector or Pinecone.io)
12
+ # - A potential improvement would be to not code each chunk of transcribed audio separately, but to instead gather
13
+ # complete sentences based on punctuation. We may want to suggest codes for the entire audio as a single piece as
14
+ # well
15
  def stream_transcription_and_search(live_view_pid, audio_file_path) do
16
+ search_configuration = CodeSearcher.prepare_search_configuration()
 
 
 
 
 
 
17
 
18
  # Audio transcription + semantic search
19
+ for {chunk, index} <- stream_transcription(audio_file_path) do
20
+ tags = CodeSearcher.process_chunk(search_configuration, chunk.text)
21
+ result = build_result(index, chunk, tags)
 
 
 
22
 
23
+ send(live_view_pid, {:transcription_row, result})
 
 
 
 
 
 
 
 
 
 
24
  end
25
  end
26
 
27
+ defp stream_transcription(audio_file_path) do
28
+ TranscriptionServing
29
+ |> Nx.Serving.batched_run({:file, audio_file_path})
30
+ |> Stream.with_index()
 
 
 
31
  end
32
 
33
+ defp build_result(index, chunk, tags) do
 
 
 
 
 
34
  %{
35
  id: index,
36
+ start_mark: format_timestamp(chunk.start_timestamp_seconds),
37
+ end_mark: format_timestamp(chunk.end_timestamp_seconds),
38
  text: chunk.text,
39
  tags: tags
40
  }
41
  end
42
 
43
+ defp format_timestamp(seconds) do
44
+ seconds |> round() |> Time.from_seconds_after_midnight() |> Time.to_string()
 
 
 
 
 
 
 
45
  end
 
 
 
46
  end