# Training

```elixir
Mix.install(
  [
    {:kino_bumblebee, "~> 0.4.0"},
    {:exla, ">= 0.0.0"},
    {:kino, "~> 0.11.0"},
    {:kino_explorer, "~> 0.1.11"}
  ],
  config: [nx: [default_backend: EXLA.Backend]]
)
```

## Section

Load the training and test splits of the transactions dataset, keeping only the `merchant` column as the input text and the `category` column as the label.

```elixir
training_df =
  Kino.FS.file_path("fraudTrain.csv")
  |> Explorer.DataFrame.from_csv!()
  |> Explorer.DataFrame.select(["merchant", "category"])
```

```elixir
test_df =
  Kino.FS.file_path("fraudTest.csv")
  |> Explorer.DataFrame.from_csv!()
  |> Explorer.DataFrame.select(["merchant", "category"])
```

The distinct values of `category` become the label set for classification.

```elixir
labels =
  training_df
  |> Explorer.DataFrame.distinct(["category"])
  |> Explorer.DataFrame.to_series()
  |> Map.get("category")
  |> Explorer.Series.to_list()
```

Load `facebook/bart-large-mnli` with a sequence-classification architecture, reconfiguring the spec so the classification head matches our label set.

```elixir
model_name = "facebook/bart-large-mnli"

{:ok, spec} =
  Bumblebee.load_spec({:hf, model_name},
    architecture: :for_sequence_classification
  )

num_labels = Enum.count(labels)

id_to_label =
  labels
  |> Enum.with_index(fn item, index -> {index, item} end)
  |> Enum.into(%{})

spec = Bumblebee.configure(spec, num_labels: num_labels, id_to_label: id_to_label)

{:ok, model_info} = Bumblebee.load_model({:hf, model_name}, spec: spec)
{:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, model_name})

# serving =
#   Bumblebee.Text.zero_shot_classification(model_info, tokenizer, labels,
#     compile: [batch_size: 1, sequence_length: 100],
#     defn_options: [compiler: EXLA]
#   )
```

The `Finance` module turns a dataframe into a stream of `{tokenized_batch, label_ids}` tuples that the training loop can consume.

```elixir
defmodule Finance do
  # Stream the dataframe as {merchant, category} pairs, then tokenize
  # and batch them into {tokenized_inputs, label_ids} tuples for Axon.
  def load(df, tokenizer, opts \\ []) do
    df
    |> stream()
    |> tokenize_and_batch(
      tokenizer,
      opts[:batch_size],
      opts[:sequence_length],
      opts[:id_to_label]
    )
  end

  def stream(df) do
    xs = df["merchant"]
    ys = df["category"]

    xs
    |> Explorer.Series.to_enum()
    |> Stream.zip(Explorer.Series.to_enum(ys))
  end

  def tokenize_and_batch(stream, tokenizer, batch_size, sequence_length, id_to_label) do
    # Invert the id => label map once, so label lookup is a direct fetch
    # rather than a scan that depends on map iteration order
    label_to_id = Map.new(id_to_label, fn {id, label} -> {label, id} end)

    stream
    |> Stream.chunk_every(batch_size)
    |> Stream.map(fn batch ->
      {text, labels} = Enum.unzip(batch)
      label_ids = Enum.map(labels, &Map.fetch!(label_to_id, &1))
      tokenized = Bumblebee.apply_tokenizer(tokenizer, text, length: sequence_length)
      {tokenized, Nx.stack(label_ids)}
    end)
  end
end
```

```elixir
batch_size = 32
sequence_length = 64

train_data =
  training_df
  |> Finance.load(tokenizer,
    batch_size: batch_size,
    sequence_length: sequence_length,
    id_to_label: id_to_label
  )

test_data =
  test_df
  |> Finance.load(tokenizer,
    batch_size: batch_size,
    sequence_length: sequence_length,
    id_to_label: id_to_label
  )
```

To keep the run short, use only the first 250 training batches (8,000 examples at a batch size of 32) and 50 test batches.

```elixir
train_data = Enum.take(train_data, 250)
test_data = Enum.take(test_data, 50)
:ok
```

```elixir
%{model: model, params: params} = model_info
model
```

```elixir
[{input, _}] = Enum.take(train_data, 1)
Axon.get_output_shape(model, input)
```

The model returns a map of outputs; wrap it so the graph emits just the logits, which is what the loss and metric expect.

```elixir
logits_model = Axon.nx(model, & &1.logits)
```

```elixir
loss =
  &Axon.Losses.categorical_cross_entropy(&1, &2,
    reduction: :mean,
    from_logits: true,
    sparse: true
  )

optimizer = Polaris.Optimizers.adam(learning_rate: 5.0e-5)

loop = Axon.Loop.trainer(logits_model, loss, optimizer, log: 1)
```

```elixir
accuracy = &Axon.Metrics.accuracy(&1, &2, from_logits: true, sparse: true)

loop = Axon.Loop.metric(loop, accuracy, "accuracy")
```

```elixir
loop = Axon.Loop.checkpoint(loop, event: :epoch_completed)
```
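The cells above build the loop up step by step; the final cell composes the trainer, metric, and checkpoint into a single pipeline and runs it for three epochs under the EXLA compiler, seeding the loop with the pre-trained `params`.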
```elixir
trained_model_state =
  logits_model
  |> Axon.Loop.trainer(loss, optimizer, log: 1)
  |> Axon.Loop.metric(accuracy, "accuracy")
  |> Axon.Loop.checkpoint(event: :epoch_completed)
  |> Axon.Loop.run(train_data, params, epochs: 3, compiler: EXLA, strict?: false)

:ok
```
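Once training finishes, `trained_model_state` holds the fine-tuned parameters and can be used for inference directly. Below is a minimal sketch, reusing the `logits_model`, `tokenizer`, `sequence_length`, and `id_to_label` bindings from above; the merchant string is a made-up example, not a value from the dataset.

```elixir
# Tokenize a single merchant name the same way the training batches were prepared
inputs =
  Bumblebee.apply_tokenizer(tokenizer, ["fraud_Example and Sons"], length: sequence_length)

# Forward pass with the fine-tuned parameters, then pick the highest-scoring class
logits = Axon.predict(logits_model, trained_model_state, inputs, compiler: EXLA)

predicted_id = logits |> Nx.argmax(axis: -1) |> Nx.to_flat_list() |> hd()
id_to_label[predicted_id]
```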