timgremore committed on
Commit
d8e387b
1 Parent(s): 5aee375

feat: Notebook for training a model

Browse files

This notebook assumes 2 files exist in your Livebook: fraudTest.csv and fraudTrain.csv. These files can be found in the "Credit Card Transactions Fraud Detection Dataset" on Kaggle.com.

Files changed (1) hide show
  1. livebooks/training.livemd +190 -0
livebooks/training.livemd ADDED
@@ -0,0 +1,190 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!-- livebook:{"file_entries":[{"name":"fraudTest.csv","type":"attachment"},{"name":"fraudTrain.csv","type":"attachment"}]} -->
2
+
3
+ # Training
4
+
5
+ ```elixir
6
+ Mix.install(
7
+ [
8
+ {:kino_bumblebee, "~> 0.4.0"},
9
+ {:exla, ">= 0.0.0"},
10
+ {:kino, "~> 0.11.0"},
11
+ {:kino_explorer, "~> 0.1.11"}
12
+ ],
13
+ config: [nx: [default_backend: EXLA.Backend]]
14
+ )
15
+ ```
16
+
17
+ ## Section
18
+
19
+ ```elixir
20
+ # {:ok, spec} = Bumblebee.load_spec({:hf, ""})
21
+ ```
22
+
23
+ ```elixir
24
+ training_df =
25
+ Kino.FS.file_path("fraudTrain.csv")
26
+ |> Explorer.DataFrame.from_csv!()
27
+ |> Explorer.DataFrame.select(["merchant", "category"])
28
+ ```
29
+
30
+ ```elixir
31
+ test_df =
32
+ Kino.FS.file_path("fraudTest.csv")
33
+ |> Explorer.DataFrame.from_csv!()
34
+ |> Explorer.DataFrame.select(["merchant", "category"])
35
+ ```
36
+
37
+ ```elixir
38
+ labels =
39
+ training_df
40
+ |> Explorer.DataFrame.distinct(["category"])
41
+ |> Explorer.DataFrame.to_series()
42
+ |> Map.get("category")
43
+ |> Explorer.Series.to_list()
44
+ ```
45
+
46
+ ```elixir
47
+ model_name = "facebook/bart-large-mnli"
48
+
49
+ {:ok, spec} =
50
+ Bumblebee.load_spec({:hf, model_name},
51
+ architecture: :for_sequence_classification
52
+ )
53
+
54
+ num_labels = Enum.count(labels)
55
+
56
+ id_to_label =
57
+ labels
58
+ |> Enum.with_index(fn item, index -> {index, item} end)
59
+ |> Enum.into(%{})
60
+
61
+ spec =
62
+ Bumblebee.configure(spec, num_labels: num_labels, id_to_label: id_to_label)
63
+
64
+ {:ok, model_info} = Bumblebee.load_model({:hf, model_name}, spec: spec)
65
+ {:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, model_name})
66
+
67
+ # serving =
68
+ # Bumblebee.Text.zero_shot_classification(model_info, tokenizer, labels,
69
+ # compile: [batch_size: 1, sequence_length: 100],
70
+ # defn_options: [compiler: EXLA]
71
+ # )
72
+ ```
73
+
74
+ ```elixir
75
+ defmodule Finance do
76
+ def load(df, tokenizer, opts \\ []) do
77
+ df
78
+ |> stream()
79
+ |> tokenize_and_batch(
80
+ tokenizer,
81
+ opts[:batch_size],
82
+ opts[:sequence_length],
83
+ opts[:id_to_label]
84
+ )
85
+ end
86
+
87
+ def stream(df) do
88
+ xs = df["merchant"]
89
+ ys = df["category"]
90
+
91
+ xs
92
+ |> Explorer.Series.to_enum()
93
+ |> Stream.zip(Explorer.Series.to_enum(ys))
94
+ end
95
+
96
+ def tokenize_and_batch(stream, tokenizer, batch_size, sequence_length, id_to_label) do
97
+ stream
98
+ |> Stream.chunk_every(batch_size)
99
+ |> Stream.map(fn batch ->
100
+ {text, labels} = Enum.unzip(batch)
101
+
102
+ id_to_label_values = id_to_label |> Map.values()
103
+
104
+ label_ids =
105
+ Enum.map(labels, fn item ->
106
+ Enum.find_index(id_to_label_values, fn label_value -> label_value == item end)
107
+ end)
108
+
109
+ tokenized = Bumblebee.apply_tokenizer(tokenizer, text, length: sequence_length)
110
+ {tokenized, Nx.stack(label_ids)}
111
+ end)
112
+ end
113
+ end
114
+ ```
115
+
116
+ ```elixir
117
+ batch_size = 32
118
+ sequence_length = 64
119
+
120
+ train_data =
121
+ training_df
122
+ |> Finance.load(tokenizer,
123
+ batch_size: batch_size,
124
+ sequence_length: sequence_length,
125
+ id_to_label: id_to_label
126
+ )
127
+
128
+ test_data =
129
+ test_df
130
+ |> Finance.load(tokenizer,
131
+ batch_size: batch_size,
132
+ sequence_length: sequence_length,
133
+ id_to_label: id_to_label
134
+ )
135
+ ```
136
+
137
+ ```elixir
138
+ train_data = Enum.take(train_data, 250)
139
+ test_data = Enum.take(test_data, 50)
140
+ :ok
141
+ ```
142
+
143
+ ```elixir
144
+ %{model: model, params: params} = model_info
145
+
146
+ model
147
+ ```
148
+
149
+ ```elixir
150
+ [{input, _}] = Enum.take(train_data, 1)
151
+ Axon.get_output_shape(model, input)
152
+ ```
153
+
154
+ ```elixir
155
+ logits_model = Axon.nx(model, & &1.logits)
156
+ ```
157
+
158
+ ```elixir
159
+ loss =
160
+ &Axon.Losses.categorical_cross_entropy(&1, &2,
161
+ reduction: :mean,
162
+ from_logits: true,
163
+ sparse: true
164
+ )
165
+
166
+ optimizer = Polaris.Optimizers.adam(learning_rate: 5.0e-5)
167
+
168
+ loop = Axon.Loop.trainer(logits_model, loss, optimizer, log: 1)
169
+ ```
170
+
171
+ ```elixir
172
+ accuracy = &Axon.Metrics.accuracy(&1, &2, from_logits: true, sparse: true)
173
+
174
+ loop = Axon.Loop.metric(loop, accuracy, "accuracy")
175
+ ```
176
+
177
+ ```elixir
178
+ loop = Axon.Loop.checkpoint(loop, event: :epoch_completed)
179
+ ```
180
+
181
+ ```elixir
182
+ trained_model_state =
183
+ logits_model
184
+ |> Axon.Loop.trainer(loss, optimizer, log: 1)
185
+ |> Axon.Loop.metric(accuracy, "accuracy")
186
+ |> Axon.Loop.checkpoint(event: :epoch_completed)
187
+ |> Axon.Loop.run(train_data, params, epochs: 3, compiler: EXLA, strict?: false)
188
+
189
+ :ok
190
+ ```