<!-- livebook:{"file_entries":[{"name":"fraudTest.csv","type":"attachment"},{"name":"fraudTrain.csv","type":"attachment"}]} -->
# Training
```elixir
Mix.install(
  [
    {:kino_bumblebee, "~> 0.4.0"},
    {:exla, ">= 0.0.0"},
    {:kino, "~> 0.11.0"},
    {:kino_explorer, "~> 0.1.11"}
  ],
  config: [nx: [default_backend: EXLA.Backend]]
)
```
## Section
```elixir
# {:ok, spec} = Bumblebee.load_spec({:hf, ""})
```
```elixir
# Load the training split and keep only the two columns we need
training_df =
  Kino.FS.file_path("fraudTrain.csv")
  |> Explorer.DataFrame.from_csv!()
  |> Explorer.DataFrame.select(["merchant", "category"])
```
```elixir
# Load the held-out test split with the same columns
test_df =
  Kino.FS.file_path("fraudTest.csv")
  |> Explorer.DataFrame.from_csv!()
  |> Explorer.DataFrame.select(["merchant", "category"])
```
```elixir
# The distinct category values become the label set for classification
labels =
  training_df
  |> Explorer.DataFrame.distinct(["category"])
  |> Explorer.DataFrame.to_series()
  |> Map.get("category")
  |> Explorer.Series.to_list()
```
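Before configuring the model, it can help to sanity-check the label distribution. A minimal sketch, assuming the `training_df` cell above has run; `Explorer.DataFrame.frequencies/2` counts rows per distinct value:

```elixir
# Count how many training rows fall into each category
# (illustrative check, not part of the original notebook)
Explorer.DataFrame.frequencies(training_df, ["category"])
```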
```elixir
model_name = "facebook/bart-large-mnli"

# Load the spec with a sequence classification head, then resize it to our label set
{:ok, spec} =
  Bumblebee.load_spec({:hf, model_name},
    architecture: :for_sequence_classification
  )

num_labels = Enum.count(labels)

id_to_label =
  labels
  |> Enum.with_index(fn item, index -> {index, item} end)
  |> Enum.into(%{})

spec = Bumblebee.configure(spec, num_labels: num_labels, id_to_label: id_to_label)

{:ok, model_info} = Bumblebee.load_model({:hf, model_name}, spec: spec)
{:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, model_name})

# serving =
#   Bumblebee.Text.zero_shot_classification(model_info, tokenizer, labels,
#     compile: [batch_size: 1, sequence_length: 100],
#     defn_options: [compiler: EXLA]
#   )
```
```elixir
defmodule Finance do
  def load(df, tokenizer, opts \\ []) do
    df
    |> stream()
    |> tokenize_and_batch(
      tokenizer,
      opts[:batch_size],
      opts[:sequence_length],
      opts[:id_to_label]
    )
  end

  # Zip merchant names with their category labels as a lazy stream
  def stream(df) do
    xs = df["merchant"]
    ys = df["category"]

    xs
    |> Explorer.Series.to_enum()
    |> Stream.zip(Explorer.Series.to_enum(ys))
  end

  # Chunk the stream into batches, tokenize the text, and turn each
  # category label into its integer id
  def tokenize_and_batch(stream, tokenizer, batch_size, sequence_length, id_to_label) do
    stream
    |> Stream.chunk_every(batch_size)
    |> Stream.map(fn batch ->
      {text, labels} = Enum.unzip(batch)
      id_to_label_values = Map.values(id_to_label)

      label_ids =
        Enum.map(labels, fn item ->
          Enum.find_index(id_to_label_values, fn label_value -> label_value == item end)
        end)

      tokenized = Bumblebee.apply_tokenizer(tokenizer, text, length: sequence_length)
      {tokenized, Nx.stack(label_ids)}
    end)
  end
end
```
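To see what the loader emits, here is a quick hedged check (assumes the tokenizer and `id_to_label` cells above have run): each element is a `{tokenized_inputs, label_ids}` tuple.

```elixir
# Peek at a single tiny batch produced by Finance.load/3 (illustrative only)
training_df
|> Finance.load(tokenizer, batch_size: 2, sequence_length: 16, id_to_label: id_to_label)
|> Enum.take(1)
```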
```elixir
batch_size = 32
sequence_length = 64

train_data =
  training_df
  |> Finance.load(tokenizer,
    batch_size: batch_size,
    sequence_length: sequence_length,
    id_to_label: id_to_label
  )

test_data =
  test_df
  |> Finance.load(tokenizer,
    batch_size: batch_size,
    sequence_length: sequence_length,
    id_to_label: id_to_label
  )
```
```elixir
# Cap the number of batches so the demo run stays short
train_data = Enum.take(train_data, 250)
test_data = Enum.take(test_data, 50)
:ok
```
```elixir
%{model: model, params: params} = model_info
model
```
```elixir
[{input, _}] = Enum.take(train_data, 1)
Axon.get_output_shape(model, input)
```
```elixir
# Wrap the model so it outputs the logits tensor directly
logits_model = Axon.nx(model, & &1.logits)
```
```elixir
loss =
  &Axon.Losses.categorical_cross_entropy(&1, &2,
    reduction: :mean,
    from_logits: true,
    sparse: true
  )

optimizer = Polaris.Optimizers.adam(learning_rate: 5.0e-5)

loop = Axon.Loop.trainer(logits_model, loss, optimizer, log: 1)
```
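The `sparse: true` and `from_logits: true` options mean the targets are integer class ids and the predictions are raw, unnormalized logits. A toy check of that contract (a sketch, not part of the original notebook):

```elixir
# Two examples, three classes: targets are class ids, predictions are logits
targets = Nx.tensor([0, 2])
logits = Nx.tensor([[2.0, 0.5, 0.1], [0.2, 0.3, 1.5]])

Axon.Losses.categorical_cross_entropy(targets, logits,
  reduction: :mean,
  from_logits: true,
  sparse: true
)
```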
```elixir
accuracy = &Axon.Metrics.accuracy(&1, &2, from_logits: true, sparse: true)
loop = Axon.Loop.metric(loop, accuracy, "accuracy")
```
```elixir
loop = Axon.Loop.checkpoint(loop, event: :epoch_completed)
```
```elixir
# Rebuild the full pipeline in one cell and run it for three epochs
trained_model_state =
  logits_model
  |> Axon.Loop.trainer(loss, optimizer, log: 1)
  |> Axon.Loop.metric(accuracy, "accuracy")
  |> Axon.Loop.checkpoint(event: :epoch_completed)
  |> Axon.Loop.run(train_data, params, epochs: 3, compiler: EXLA, strict?: false)
:ok
```
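With the fine-tuned parameters in hand, prediction is a tokenize-predict-argmax round trip. A minimal inference sketch, assuming the cells above ran to completion; the merchant string below is made up:

```elixir
# Tokenize a (hypothetical) merchant name, run the logits model with the
# trained parameters, and map the argmax back to a category label
input = Bumblebee.apply_tokenizer(tokenizer, ["Acme Hardware"], length: sequence_length)
logits = Axon.predict(logits_model, trained_model_state, input)

predicted_id =
  logits
  |> Nx.argmax(axis: -1)
  |> Nx.to_flat_list()
  |> hd()

id_to_label[predicted_id]
```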