|
defmodule MedicalTranscription.CodeSearcher do
  @moduledoc """
  Takes a portion of text and searches for closely matching results within a list of vectors,
  by using AudioTagger's `SemanticSearch` module.

  The labels (a CSV file) and their precomputed vectors (a `.bin` file) are both looked up in
  `AudioTagger.SampleData.cache_dir/0` under the shared base name `icd9_codelist`.
  """

  alias AudioTagger.Classifier.SemanticSearch
  alias AudioTagger.Structs.SemanticSearchConfiguration

  # Base file name (without extension) shared by the label CSV and the vector binary.
  @input_filename "icd9_codelist"

  # Options placed in the configuration unless the caller overrides them.
  @default_opts [similarity_threshold: 0.8]

  @doc """
  Builds the `SemanticSearchConfiguration` used by `process_chunk/2`.

  Prepares the model and tokenizer via `SemanticSearch.prepare_model/0`, reads the labels CSV
  into a data frame, and loads the label vectors via `SemanticSearch.load_label_vectors/1`.

  ## Options

  `opts` is a keyword list merged over the defaults (`similarity_threshold: 0.8`) and stored in
  the `:opts` field of the configuration. Calling this function without arguments yields the
  default options only.

  The labels are read with `Explorer.DataFrame.from_csv!/2`, so a missing or unreadable CSV
  raises instead of returning an error tuple.
  """
  @spec prepare_search_configuration(keyword()) :: %SemanticSearchConfiguration{}
  def prepare_search_configuration(opts \\ []) when is_list(opts) do
    {model_info, tokenizer} = SemanticSearch.prepare_model()
    labels_df = read_labels_from_csv!()
    label_embeddings = SemanticSearch.load_label_vectors(vectors_filepath())

    %SemanticSearchConfiguration{
      labels_df: labels_df,
      label_embeddings: label_embeddings,
      model_info: model_info,
      tokenizer: tokenizer,
      opts: Keyword.merge(@default_opts, opts)
    }
  end

  @doc """
  Runs the semantic search for a single chunk of `text` using a prepared configuration.

  Delegates to `SemanticSearch.tag_one/2` and returns its result unchanged.
  """
  @spec process_chunk(%SemanticSearchConfiguration{}, String.t()) :: term()
  def process_chunk(%SemanticSearchConfiguration{} = input, text) do
    SemanticSearch.tag_one(input, text)
  end

  # Reads the labels CSV into a data frame, forcing both columns to be parsed as strings.
  defp read_labels_from_csv! do
    Explorer.DataFrame.from_csv!(
      labels_filepath(),
      dtypes: [
        {"code", :string},
        {"long_description", :string}
      ]
    )
  end

  # Extension-less path inside the sample-data cache directory.
  defp input_filepath do
    Path.join(AudioTagger.SampleData.cache_dir(), @input_filename)
  end

  defp vectors_filepath, do: "#{input_filepath()}.bin"
  defp labels_filepath, do: "#{input_filepath()}.csv"
end
|
|