<!-- livebook:{"file_entries":[{"name":"fraudTest.csv","type":"attachment"},{"name":"fraudTrain.csv","type":"attachment"}]} -->
```elixir
Mix.install(
  [
    {:kino_bumblebee, "~> 0.4.0"},
    {:exla, ">= 0.0.0"},
    {:kino, "~> 0.11.0"},
    {:kino_explorer, "~> 0.1.11"}
  ],
  config: [nx: [default_backend: EXLA.Backend]]
)
```
```elixir
# Load the training CSV, keeping only the merchant name (the text input)
# and the spending category (the label).
training_df =
  Kino.FS.file_path("fraudTrain.csv")
  |> Explorer.DataFrame.from_csv!()
  |> Explorer.DataFrame.select(["merchant", "category"])
```
```elixir
test_df =
  Kino.FS.file_path("fraudTest.csv")
  |> Explorer.DataFrame.from_csv!()
  |> Explorer.DataFrame.select(["merchant", "category"])
```
```elixir
# The distinct spending categories become the classification labels.
labels =
  training_df
  |> Explorer.DataFrame.distinct(["category"])
  |> Explorer.DataFrame.to_series()
  |> Map.get("category")
  |> Explorer.Series.to_list()
```
```elixir
model_name = "facebook/bart-large-mnli"

{:ok, spec} =
  Bumblebee.load_spec({:hf, model_name},
    architecture: :for_sequence_classification
  )

num_labels = Enum.count(labels)

# Map each label index to its category name (index => label).
id_to_label =
  labels
  |> Enum.with_index(fn item, index -> {index, item} end)
  |> Enum.into(%{})

# Configure the spec so the classification head matches our label set.
spec =
  Bumblebee.configure(spec, num_labels: num_labels, id_to_label: id_to_label)

{:ok, model_info} = Bumblebee.load_model({:hf, model_name}, spec: spec)
{:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, model_name})
```
```elixir
defmodule Finance do
  def load(df, tokenizer, opts \\ []) do
    df
    |> stream()
    |> tokenize_and_batch(
      tokenizer,
      opts[:batch_size],
      opts[:sequence_length],
      opts[:id_to_label]
    )
  end

  # Zip merchant names with their categories into a lazy stream of
  # {text, label} pairs.
  def stream(df) do
    xs = df["merchant"]
    ys = df["category"]

    xs
    |> Explorer.Series.to_enum()
    |> Stream.zip(Explorer.Series.to_enum(ys))
  end

  def tokenize_and_batch(stream, tokenizer, batch_size, sequence_length, id_to_label) do
    # Invert id_to_label so labels can be looked up by name instead of
    # relying on the ordering of Map.values/1.
    label_to_id = Map.new(id_to_label, fn {id, label} -> {label, id} end)

    stream
    |> Stream.chunk_every(batch_size)
    |> Stream.map(fn batch ->
      {text, labels} = Enum.unzip(batch)

      label_ids = Enum.map(labels, fn label -> Map.fetch!(label_to_id, label) end)

      tokenized = Bumblebee.apply_tokenizer(tokenizer, text, length: sequence_length)
      {tokenized, Nx.stack(label_ids)}
    end)
  end
end
```
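As a quick sanity check, the pipeline above can be run on a tiny illustrative batch (the `2`/`16` sizes below are arbitrary) to see the format the training loop will consume: each element is a `{tokenized_inputs, label_ids}` tuple.

```elixir
# Pull a single small batch to inspect the {tokenized_inputs, label_ids} format.
training_df
|> Finance.load(tokenizer, batch_size: 2, sequence_length: 16, id_to_label: id_to_label)
|> Enum.take(1)
```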
```elixir
batch_size = 32
sequence_length = 64

train_data =
  training_df
  |> Finance.load(tokenizer,
    batch_size: batch_size,
    sequence_length: sequence_length,
    id_to_label: id_to_label
  )

test_data =
  test_df
  |> Finance.load(tokenizer,
    batch_size: batch_size,
    sequence_length: sequence_length,
    id_to_label: id_to_label
  )
```
```elixir
# Materialize a small subset to keep fine-tuning quick:
# 250 batches of 32 = 8,000 training examples, 50 batches = 1,600 test examples.
train_data = Enum.take(train_data, 250)
test_data = Enum.take(test_data, 50)
:ok
```
```elixir
%{model: model, params: params} = model_info

model
```
```elixir
[{input, _}] = Enum.take(train_data, 1)
Axon.get_output_shape(model, input)
```
```elixir
# Wrap the model so it returns only the logits, which is what the loss expects.
logits_model = Axon.nx(model, & &1.logits)
```
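For comparison with the shape probe above, the same check can be rerun against the wrapped model (reusing the sample `input` batch); it should now report a single logits shape rather than the full output map.

```elixir
# The wrapped model exposes only the logits tensor, so this returns one shape.
Axon.get_output_shape(logits_model, input)
```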
```elixir
loss =
  &Axon.Losses.categorical_cross_entropy(&1, &2,
    reduction: :mean,
    from_logits: true,
    # sparse: true because labels are integer class ids, not one-hot vectors
    sparse: true
  )

optimizer = Polaris.Optimizers.adam(learning_rate: 5.0e-5)

loop = Axon.Loop.trainer(logits_model, loss, optimizer, log: 1)
```
```elixir
accuracy = &Axon.Metrics.accuracy(&1, &2, from_logits: true, sparse: true)

loop = Axon.Loop.metric(loop, accuracy, "accuracy")
```
```elixir
loop = Axon.Loop.checkpoint(loop, event: :epoch_completed)
```
```elixir
# Build the complete training loop in one pipeline and run it for three
# epochs, starting from the pretrained parameters.
trained_model_state =
  logits_model
  |> Axon.Loop.trainer(loss, optimizer, log: 1)
  |> Axon.Loop.metric(accuracy, "accuracy")
  |> Axon.Loop.checkpoint(event: :epoch_completed)
  |> Axon.Loop.run(train_data, params, epochs: 3, compiler: EXLA, strict?: false)

:ok
```
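With training finished, the held-out `test_data` prepared earlier can be used to measure accuracy. A minimal evaluation sketch using `Axon.Loop.evaluator/1`, assuming `trained_model_state` holds the fine-tuned parameters returned by the trainer:

```elixir
# Run the accuracy metric over the test batches with the fine-tuned parameters.
logits_model
|> Axon.Loop.evaluator()
|> Axon.Loop.metric(accuracy, "accuracy")
|> Axon.Loop.run(test_data, trained_model_state, compiler: EXLA)
```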