sberhe committed
Commit cadd4a6
1 Parent(s): 4af8da6

update app for tokenizer

Files changed (1)
  1. app.py +17 -16
app.py CHANGED
@@ -1,29 +1,30 @@
+ import tensorflow as tf
  from datasets import load_dataset
- from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
+ from transformers import AutoTokenizer, TFAutoModel

  # Load the dataset
  dataset = load_dataset("sberhe/2023-1000-software-release-notes")

- # Explore and preprocess the dataset
- # [Add your data preprocessing steps here]
-
- # Load a pre-trained model and tokenizer
+ # Load a pre-trained model and tokenizer (TensorFlow version)
  model_name = "bert-base-uncased"
  tokenizer = AutoTokenizer.from_pretrained(model_name)
- model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
+ model = TFAutoModel.from_pretrained(model_name)

  # Tokenize the data
  def tokenize_function(examples):
-     return tokenizer(examples["text"], padding="max_length", truncation=True)
+     return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)

  tokenized_datasets = dataset.map(tokenize_function, batched=True)

- # Fine-tuning the model
- training_args = TrainingArguments(output_dir="/results", learning_rate=2e-5, per_device_train_batch_size=16, num_train_epochs=3)
-
- trainer = Trainer(model=model, args=training_args, train_dataset=tokenized_datasets["train"], eval_dataset=tokenized_datasets["test"])
-
- trainer.train()
-
- # Evaluate the model
- # [Add evaluation code here]
+ # Function to extract embeddings
+ def extract_embeddings(batch):
+     # Convert batch to TensorFlow tensors and correct the dimensions
+     inputs = {k: tf.convert_to_tensor(v) for k, v in batch.items() if k in tokenizer.model_input_names}
+     # Get output from the model
+     outputs = model(**inputs, output_hidden_states=True, return_dict=True)
+     # Extract the embeddings from the last hidden state
+     embeddings = outputs.last_hidden_state
+     return {"embeddings": embeddings.numpy()}
+
+ # Apply the function to extract embeddings
+ embeddings_dataset = tokenized_datasets.map(extract_embeddings, batched=True)
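For reference, the snippet below is not part of the commit; it is a minimal sketch of what the updated tokenize_function call returns for a single string, assuming the same bert-base-uncased tokenizer used in app.py and an invented example sentence.

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
# "Fixed crash on startup" is a made-up release-note string, used only for illustration
encoding = tokenizer("Fixed crash on startup", padding="max_length", truncation=True, max_length=512)
print(list(encoding.keys()))       # ['input_ids', 'token_type_ids', 'attention_mask']
print(len(encoding["input_ids"]))  # 512, because padding="max_length" pads every example to max_length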
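Likewise, a hedged sketch of how the resulting embeddings_dataset might be inspected after running app.py; it assumes the dataset exposes a "train" split (as the pre-commit version of the script did) and that mean-pooling is an acceptable way to collapse token embeddings into one vector per release note.

import numpy as np

# Each mapped example carries a (max_length, hidden_size) matrix of token embeddings;
# for bert-base-uncased with max_length=512 that is expected to be (512, 768).
first = embeddings_dataset["train"][0]
token_embeddings = np.array(first["embeddings"])
print(token_embeddings.shape)

# One assumed way to get a single vector per note: mean-pool over the token dimension.
note_vector = token_embeddings.mean(axis=0)
print(note_vector.shape)  # (768,)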