noahsettersten's picture
refactor: Separate transcription and code searching
9f13621
raw
history blame
1.31 kB
defmodule MedicalTranscription.CodeSearcher do
  @moduledoc """
  Takes a portion of text and searches for closely matching results within a list of vectors, by
  using AudioTagger's `SemanticSearch` module.

  The labels and their precomputed vectors are read from `icd9_codelist.csv` and
  `icd9_codelist.bin`, both located in the directory returned by
  `AudioTagger.SampleData.cache_dir/0`.
  """

  alias AudioTagger.Classifier.SemanticSearch
  alias AudioTagger.Structs.SemanticSearchConfiguration

  # Base name (without extension) shared by the labels CSV and the vectors binary.
  @input_filename "icd9_codelist"

  # Options used when the caller does not override them.
  @default_opts [similarity_threshold: 0.8]

  @doc """
  Builds the `SemanticSearchConfiguration` used by `process_chunk/2`.

  Loads the model and tokenizer through `SemanticSearch.prepare_model/0`, reads the labels CSV
  into a data frame, and loads the label vectors from the `.bin` file.

  ## Options

  `opts` is a keyword list merged over the defaults (`#{inspect(@default_opts)}`); any key given
  by the caller wins. The merged list is stored as-is in the configuration's `:opts` field, so
  which keys are honoured is up to `SemanticSearch`.

  Raises if the labels CSV cannot be read (see `Explorer.DataFrame.from_csv!/2`).
  """
  @spec prepare_search_configuration(keyword()) :: %SemanticSearchConfiguration{}
  def prepare_search_configuration(opts \\ []) do
    {model_info, tokenizer} = SemanticSearch.prepare_model()

    %SemanticSearchConfiguration{
      labels_df: read_labels_from_csv!(),
      label_embeddings: SemanticSearch.load_label_vectors(vectors_filepath()),
      model_info: model_info,
      tokenizer: tokenizer,
      # Caller-supplied options take precedence over the defaults.
      opts: Keyword.merge(@default_opts, opts)
    }
  end

  @doc """
  Searches for labels matching `text` using a configuration built by
  `prepare_search_configuration/1`.

  Delegates directly to `SemanticSearch.tag_one/2` and returns its result unchanged.
  """
  def process_chunk(%SemanticSearchConfiguration{} = input, text) do
    SemanticSearch.tag_one(input, text)
  end

  # Reads the labels CSV, forcing both columns to strings. Raises on failure.
  defp read_labels_from_csv!() do
    Explorer.DataFrame.from_csv!(
      labels_filepath(),
      dtypes: [
        {"code", :string},
        {"long_description", :string}
      ]
    )
  end

  # Extension-less path of the input files inside AudioTagger's cache directory.
  defp input_filepath() do
    Path.join(AudioTagger.SampleData.cache_dir(), @input_filename)
  end

  defp vectors_filepath(), do: "#{input_filepath()}.bin"
  defp labels_filepath(), do: "#{input_filepath()}.csv"
end