timgremore committed on
Commit
7f7d174
1 Parent(s): 7e656f0

feat: Support ICD-10 bulk inserts with notebook

Browse files
livebooks/icd-10-codes.livemd ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ICD-10 Codes
2
+
3
+ ## Section
4
+
5
+ ```elixir
6
defmodule VectorPrecomputation do
  @moduledoc """
  Notebook helpers that compute description vectors for ICD-10 codes and store
  them as `Medicode.Coding.CodeVector` rows.
  """

  @doc """
  Computes a vector for every row of `df` and bulk-upserts the results.

  `df` is an `Explorer.DataFrame` with `"code"` and `"long_description"`
  columns. Rows whose code is missing (`nil`) or empty are skipped. The second
  argument is accepted for call-site compatibility and is not used.

  Rows are upserted on `:code`. When a code already exists, its description,
  vector and `updated_at` are replaced, while the existing `id` and
  `inserted_at` are kept so re-running the import does not change the identity
  of rows that are already stored.

  Returns the result of `Medicode.Repo.insert_all/3`.
  """
  def precompute_vectors(df, _frame) do
    num_rows = Explorer.DataFrame.n_rows(df)
    now = DateTime.utc_now() |> DateTime.truncate(:second)

    ProgressBar.render(0, num_rows, suffix: :count)

    params =
      df
      |> Explorer.DataFrame.to_rows_stream()
      # Guard against nil as well as "" so an empty CSV field cannot crash the batch.
      |> Stream.filter(fn %{"code" => code} -> is_binary(code) and code != "" end)
      |> Stream.with_index(1)
      |> Enum.map(fn {%{"code" => code, "long_description" => description}, position} ->
        vector_for_db = Medicode.Coding.compute_vector_as_list(description)

        ProgressBar.render(position, num_rows, suffix: :count)

        %{
          id: Ecto.UUID.generate(),
          code: code,
          description: description,
          description_vector: vector_for_db,
          inserted_at: now,
          updated_at: now
        }
      end)

    # `:replace_all` would also overwrite the primary key and `inserted_at` of
    # an existing row with the freshly generated values above.
    Medicode.Repo.insert_all(Medicode.Coding.CodeVector, params,
      on_conflict: {:replace_all_except, [:id, :inserted_at]},
      conflict_target: [:code]
    )
  end

  @doc """
  Computes the vector for a single `description` and inserts one
  `Medicode.Coding.CodeVector` row for `code` through its changeset.

  Returns `{:ok, "Success!"}` when the insert succeeds, or
  `{:error, changeset}` with the failed changeset otherwise.
  """
  def foo_compute_vector_for_code(code, description) do
    params = %{
      code: code,
      description: description,
      description_vector: Medicode.Coding.compute_vector_as_list(description)
    }

    changeset = Medicode.Coding.CodeVector.changeset(%Medicode.Coding.CodeVector{}, params)

    case Medicode.Repo.insert(changeset) do
      {:ok, _} ->
        {:ok, "Success!"}

      {:error, changeset} ->
        {:error, changeset}
    end
  end
end
60
+ ```
61
+
62
+ ```elixir
63
# Download the CDC ICD-10-CM archive. The raw bytes are written to disk below,
# so Req's automatic response-body decoding is switched off.
%Req.Response{body: body} =
  "https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Publications/ICD10CM/2024-Update/icd10cm-Codes-Descriptions-April-2024.zip"
  |> Req.get!(decode_body: false)

frame = Kino.Frame.new() |> Kino.render()

tmp_dir = System.tmp_dir!()
tmp_file = Path.join(tmp_dir, "icd10cm-Codes-Descriptions.zip")

File.write!(tmp_file, body)

{:ok, files} = :zip.unzip(String.to_charlist(tmp_file), [{:cwd, tmp_dir}])

# Pick the extracted file that holds the "<code> <description>" lines.
icd10cm_codes_file =
  Enum.find(files, fn item ->
    String.contains?(to_string(item), "icd10cm-codes-April-2024")
  end)

# Double any embedded quote so a quoted CSV field stays well-formed.
escape_quotes = fn value -> String.replace(value, "\"", "\"\"") end

csv_data =
  icd10cm_codes_file
  |> File.read!()
  |> String.split("\n")
  |> Enum.map(&String.trim/1)
  # Drop blank lines (e.g. the one after the final newline) so they do not
  # become empty rows in the data frame.
  |> Enum.reject(&(&1 == ""))
  |> Enum.map(fn line ->
    # The code is everything up to the first space; the rest is the description.
    {code, description} =
      case String.split(line, " ", parts: 2) do
        [code, rest] -> {code, String.trim(rest)}
        [code] -> {code, ""}
      end

    "\"#{escape_quotes.(code)}\",\"#{escape_quotes.(description)}\""
  end)

csv_data =
  ["\"code\",\"long_description\"" | csv_data]

{:ok, df} =
  csv_data
  |> Enum.join("\n")
  |> Explorer.DataFrame.load_csv(header: true, delimiter: ",")

n_rows = Explorer.DataFrame.n_rows(df)
number_in_group = 500

# Ceiling division: a trailing partial group is counted, and an exact multiple
# of the group size does not add an empty group.
number_of_groups = div(n_rows + number_in_group - 1, number_in_group)

# Zero-based index of the first group to process. Leave at 0 for a full
# import; raise it to resume an interrupted run without redoing earlier groups.
start_group = 0

# Row offset of each group; the explicit step keeps the range empty when there
# are no rows instead of counting down from 0 to -1.
offsets = 0..(n_rows - 1)//number_in_group

offsets
|> Enum.with_index(1)
|> Enum.drop(start_group)
|> Enum.each(fn {offset, group} ->
  Kino.Frame.clear(frame)
  Kino.Frame.append(frame, "Processing group #{group} of #{number_of_groups}")

  df
  |> Explorer.DataFrame.slice(offset, number_in_group)
  |> VectorPrecomputation.precompute_vectors(frame)
end)

Kino.DataTable.new(df)
121
+ ```
priv/repo/migrations/20240306143720_add_unique_constraint_to_code_vectors_code.exs ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
defmodule Medicode.Repo.Migrations.AddUniqueConstraintToCodeVectorsCode do
  @moduledoc false
  use Ecto.Migration

  # Adds a unique index on `code_vectors.code` under an explicit index name.
  def change do
    create unique_index(:code_vectors, [:code], name: :index_for_code_vector_code_uniqueness)
  end
end