defmodule MedicalTranscription.CodeSearcher do
  @moduledoc """
  Takes a portion of text and searches for closely matching results within a list of vectors,
  by using AudioTagger's SemanticSearch module.

  The label list (`icd9_codelist.csv`) and its vectors (`icd9_codelist.bin`) are looked up in
  the directory returned by `AudioTagger.SampleData.cache_dir/0`.
  """

  alias AudioTagger.Classifier.SemanticSearch
  alias AudioTagger.Structs.SemanticSearchConfiguration

  # Base name (without extension) shared by the labels CSV and the vectors file.
  @input_filename "icd9_codelist"

  # Options used when the caller does not override them.
  @default_opts [similarity_threshold: 0.8]

  @doc """
  Builds the `SemanticSearchConfiguration` used by `process_chunk/2`.

  Prepares the model and tokenizer via `SemanticSearch.prepare_model/0`, reads the labels CSV
  into a data frame, and loads the label vectors from the `.bin` file.

  ## Options

  `opts` is merged over the defaults (`#{inspect(@default_opts)}`) and stored in the
  configuration's `:opts` field, so calling this function without arguments yields the same
  configuration as before.

    * `:similarity_threshold` - defaults to `0.8`.

  Any other keys are passed through unchanged; whether they are honoured depends on
  `AudioTagger.Classifier.SemanticSearch` (not verified here).

  Raises if the labels CSV cannot be read.
  """
  @spec prepare_search_configuration(keyword()) :: %SemanticSearchConfiguration{}
  def prepare_search_configuration(opts \\ []) when is_list(opts) do
    {model_info, tokenizer} = SemanticSearch.prepare_model()

    %SemanticSearchConfiguration{
      labels_df: read_labels_from_csv!(),
      label_embeddings: SemanticSearch.load_label_vectors(vectors_filepath()),
      model_info: model_info,
      tokenizer: tokenizer,
      # Caller-supplied values win over the defaults.
      opts: Keyword.merge(@default_opts, opts)
    }
  end

  @doc """
  Searches for labels matching `text` using the given search configuration.

  Delegates to `SemanticSearch.tag_one/2` and returns its result unchanged.
  """
  @spec process_chunk(%SemanticSearchConfiguration{}, term()) :: term()
  def process_chunk(%SemanticSearchConfiguration{} = input, text) do
    SemanticSearch.tag_one(input, text)
  end

  # Reads the labels CSV into a data frame, forcing both columns to be parsed as strings.
  # Raises (bang variant) when the file cannot be read.
  defp read_labels_from_csv! do
    Explorer.DataFrame.from_csv!(
      labels_filepath(),
      dtypes: [
        {"code", :string},
        {"long_description", :string}
      ]
    )
  end

  # Extension-less path of the input files inside AudioTagger's cache directory.
  defp input_filepath do
    Path.join(AudioTagger.SampleData.cache_dir(), @input_filename)
  end

  # The vectors and the labels share the same base path and differ only by extension.
  defp vectors_filepath, do: "#{input_filepath()}.bin"

  defp labels_filepath, do: "#{input_filepath()}.csv"
end