thankrandomness committed on
Commit
9f09d69
1 Parent(s): f2ca0de

split data

Browse files
Files changed (2) hide show
  1. app.py +8 -1
  2. requirements.txt +1 -0
app.py CHANGED
@@ -1,6 +1,6 @@
1
  import os
2
  import torch
3
- from datasets import load_dataset
4
  from transformers import AutoTokenizer, AutoModel
5
  import chromadb
6
  import gradio as gr
@@ -15,6 +15,13 @@ def meanpooling(output, mask):
15
  # Load the dataset
16
  dataset = load_dataset("thankrandomness/mimic-iii-sample")
17
 
 
 
 
 
 
 
 
18
  # Load the model and tokenizer
19
  tokenizer = AutoTokenizer.from_pretrained("neuml/pubmedbert-base-embeddings-matryoshka")
20
  model = AutoModel.from_pretrained("neuml/pubmedbert-base-embeddings-matryoshka")
 
1
  import os
2
  import torch
3
+ from datasets import load_dataset, DatasetDict
4
  from transformers import AutoTokenizer, AutoModel
5
  import chromadb
6
  import gradio as gr
 
15
  # Load the dataset
16
  dataset = load_dataset("thankrandomness/mimic-iii-sample")
17
 
18
+ # Split the dataset into train and validation sets
19
+ split_dataset = dataset['train'].train_test_split(test_size=0.2, seed=42)
20
+ dataset = DatasetDict({
21
+ 'train': split_dataset['train'],
22
+ 'validation': split_dataset['test']
23
+ })
24
+
25
  # Load the model and tokenizer
26
  tokenizer = AutoTokenizer.from_pretrained("neuml/pubmedbert-base-embeddings-matryoshka")
27
  model = AutoModel.from_pretrained("neuml/pubmedbert-base-embeddings-matryoshka")
requirements.txt CHANGED
@@ -1,5 +1,6 @@
1
  torch
2
  transformers
 
3
  chromadb
4
  gradio
5
  numpy
 
1
  torch
2
  transformers
3
+ datasets
4
  chromadb
5
  gradio
6
  numpy