Hugging Face Space (status: Runtime error)

Commit 4dd3f33 by rajaatif786: Duplicate from rajaatif786/VirBert
Files changed:
- .gitattributes +34 -0
- BERT/main/berttok/vocab.txt +622 -0
- README.md +13 -0
- Toxonomy/modules/__pycache__/classifier.cpython-39.pyc +0 -0
- Toxonomy/modules/__pycache__/confusionmatrix.cpython-39.pyc +0 -0
- Toxonomy/modules/__pycache__/preprocessor.cpython-39.pyc +0 -0
- Toxonomy/modules/classifier.py +526 -0
- Toxonomy/modules/confusionmatrix.py +36 -0
- Toxonomy/modules/preprocessor.py +44 -0
- app.py +82 -0
- requirements.txt +6 -0
- virBERT.pt +3 -0
.gitattributes
ADDED
@@ -0,0 +1,34 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
BERT/main/berttok/vocab.txt
ADDED
@@ -0,0 +1,622 @@
[CLS]
[SEP]
[mask]
a
b
c
d
g
h
k
m
n
r
s
t
v
w
y
##s
##g
##n
##a
##t
##c
##k
##y
##w
##m
##r
##v
##b
##h
##d
##aa
##ca
##tg
tg
gg
##ag
ag
##ct
##at
##cc
##ac
##tt
##tc
##ta
##cg
cg
caa
aaa
tgg
gga
aga
aca
atg
ggg
gaa
cca
aag
ctg
cag
tca
ttg
aat
agg
cat
gag
tga
aac
tgc
gca
tgt
cct
ccc
gtg
acc
ggc
cac
ctc
ctt
ttt
gct
act
gac
tct
gcc
att
tcc
gat
ttc
agc
atc
ata
ggt
gtt
agt
tac
tat
gtc
cta
taa
tta
cgg
gta
ccg
tag
gcg
cgc
acg
cgt
tcg
cga
nn
nnn
##nn
ca
ar
##gt
aa
##gg
##yt
nng
ann
ga
##ya
##ty
ac
can
tc
ngt
gr
tr
##tr
cr
##ga
cc
gc
##yc
##yg
##gc
ara
ta
##mt
gar
##tn
aar
##na
cnn
##ma
car
ayt
##nt
rat
ct
rgg
ggr
ytt
nna
rga
raa
ytg
rag
aay
crg
##wc
gty
rac
##kg
agr
arg
art
tya
tcr
##ng
rtt
naa
yat
cyt
yac
cay
rca
tty
gt
ttr
tgy
ayc
aya
aty
cya
rtc
yct
gra
tra
##mc
trt
##wt
##mg
##kc
##wg
at
nnt
nnc
acy
as
ctr
tcy
rta
tnn
yag
grt
gnn
yaa
yta
acn
gyt
##nc
##kt
aan
acr
tyt
yca
grc
ntc
ccy
gcn
gya
ggy
gay
trg
ytc
arc
rtg
rct
yga
gcy
gtr
crt
cty
cyg
ngc
tyg
ycc
grg
gs
cra
ccr
tay
cnt
ccn
ygg
ggn
atr
mtt
rcc
rgt
tyc
##wa
ayg
amt
nag
rgc
nca
sgg
cyc
nac
ygc
##sg
acm
ana
cma
ntg
ty
ygt
tgn
tgr
trc
ngg
gyg
cmt
maa
tcm
gcr
ttn
twc
agn
gyc
ncc
##ka
agy
mtc
nat
tt
crc
atn
kgg
ntt
ysg
tcn
tys
mgg
ts
tan
tmt
aam
gan
ctm
ang
mtg
nga
tar
ctn
cna
cgy
wct
wca
twt
ctk
ctw
gtw
gna
mat
nta
ggk
acw
gcm
cmg
kct
tna
ccm
awg
cwg
nct
tma
cas
tam
cmc
gkg
ant
wcc
gsa
gtn
wgc
cs
gng
ktt
mag
wtg
cgr
cak
gam
gtm
kgt
tkg
tkt
aak
ama
anc
kca
mta
sca
ggw
ccw
atm
asc
akg
amc
ckg
cwt
ckc
mca
mcc
tkc
tgs
aaw
gaw
tcw
saa
cam
atk
atw
asa
cwc
gmt
gwc
ktg
rcg
##an
tck
gtk
gnt
gnc
kga
nan
sag
scc
tng
wtt
wgt
ggs
ggm
ack
cck
akc
mct
mac
stg
tnt
waa
wtc
awt
amg
cnc
ckt
cwa
gma
kcc
sta
wta
ycg
kag
mgt
ncg
tmg
caw
tas
akt
cng
gwa
mcg
tmc
wgg
aas
gcw
asg
tsc
awc
gkc
ktc
kgc
tnc
wac
wga
tgk
agm
gas
cts
ast
ttw
ttm
tst
gkt
twa
wag
tgw
tgm
cgs
kaa
kta
mga
##ar
gak
gst
aka
cka
gmg
rr
sct
sac
gcs
gck
ats
ay
gmc
gka
kac
mgc
ng
nc
sat
stt
twg
tka
agk
agw
gsc
ngn
ncn
gwg
sga
cgm
csa
ntn
sr
yg
tcs
gts
gsg
cst
cy
cw
kcg
stc
cgn
cgk
acs
ttk
tsa
cm
rar
sgt
wcg
yyt
cgw
tts
csc
awa
csg
gwt
rt
tm
wr
wat
yt
tak
tyy
ak
gw
ma
rc
sgc
tw
tsg
wg
wa
wc
ya
yma
##st
##sr
##vt
ags
trr
ccs
taw
rra
rrt
ayk
kat
mr
ry
ryt
yc
ywc
ymg
ykt
##ks
cry
tym
ayw
aym
ygk
cyy
rts
tmm
wrg
wgr
am
ad
gy
kg
kya
ksr
mt
mc
mty
mma
mar
rs
ra
ss
sg
scg
syg
tb
tvt
tks
vtg
wkg
wmc
wst
yy
yr
ytr
ywa
##wk
##ms
grr
ctb
asr
gsy
sra
src
ygm
cyw
cwm
cwr
cmw
cmr
tmy
ytw
ytm
aky
gww
mam
rcw
twm
wak
wcr
yay
mrr
adc
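The vocabulary above contains 622 WordPiece entries over lower-case DNA k-mers (including IUPAC ambiguity codes such as n, r, y), plus the special tokens [CLS], [SEP] and [mask]. A minimal sketch of how this tokenizer splits a 3-mer "sentence", assuming the repository is checked out locally so the path below exists:

from transformers import BertTokenizer

# Load the WordPiece tokenizer built from the vocab file above.
tokenizer = BertTokenizer.from_pretrained("./BERT/main/berttok")

# Inputs are space-separated 3-mers (see kmers_sentences in app.py below),
# so each word usually maps directly onto a vocabulary entry.
sentence = "atg tgg gta"
print(tokenizer.tokenize(sentence))              # ['atg', 'tgg', 'gta']
print(tokenizer.convert_tokens_to_ids(['atg']))  # the id of 'atg' in vocab.txt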
README.md
ADDED
@@ -0,0 +1,13 @@
---
title: VirBert
emoji: 👀
colorFrom: yellow
colorTo: purple
sdk: gradio
sdk_version: 3.23.0
app_file: app.py
pinned: false
duplicated_from: rajaatif786/VirBert
---

Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
Toxonomy/modules/__pycache__/classifier.cpython-39.pyc
ADDED
Binary file (8.96 kB)

Toxonomy/modules/__pycache__/confusionmatrix.cpython-39.pyc
ADDED
Binary file (1.41 kB)

Toxonomy/modules/__pycache__/preprocessor.cpython-39.pyc
ADDED
Binary file (1.14 kB)
Toxonomy/modules/classifier.py
ADDED
@@ -0,0 +1,526 @@
# BERT classifiers for viral DNA sequences: a small encoder trained from
# scratch (PretrainedBert, ScratchBert) and a fine-tuning wrapper that
# reuses the pretrained encoder (FinetunningBert).
import random
import time

import numpy as np
import torch
import torch.nn as nn
from transformers import AdamW, get_linear_schedule_with_warmup
from transformers import BertConfig, BertModel

# Run on the GPU when available; fall back to CPU (e.g. on a CPU-only Space).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Specify loss function
loss_fn = nn.CrossEntropyLoss()


class PretrainedBert(nn.Module):
    """BERT model for classification tasks (pre-training stage, 14 labels)."""

    def __init__(self, freeze_bert=False):
        """
        @param freeze_bert (bool): Set `False` to fine-tune the BERT model
        """
        super(PretrainedBert, self).__init__()
        # Hidden size of BERT, hidden size of our classifier, and number of labels
        D_in, H, D_out = 768, 50, 14

        # Instantiate a small BERT encoder from scratch
        config = BertConfig(
            max_position_embeddings=5000,  # longest input the position embeddings can cover
            hidden_size=768,
            num_attention_heads=2,
            num_hidden_layers=2,
            type_vocab_size=1
        )
        self.bert = BertModel(config)

        # Instantiate a one-layer feed-forward classifier
        self.classifier = nn.Sequential(
            nn.Linear(D_in, H),
            nn.ReLU(),
            # nn.Dropout(0.5),
            nn.Linear(H, D_out)
        )

        # Freeze the BERT model
        if freeze_bert:
            for param in self.bert.parameters():
                param.requires_grad = False

    def forward(self, input_ids, attention_mask):
        """
        Feed input to BERT and the classifier to compute logits.
        @param input_ids (torch.Tensor): an input tensor with shape (batch_size, max_length)
        @param attention_mask (torch.Tensor): a tensor that holds attention-mask
            information, with shape (batch_size, max_length)
        @return logits (torch.Tensor): an output tensor with shape (batch_size, num_labels)
        """
        # Feed input to BERT
        outputs = self.bert(input_ids=input_ids,
                            attention_mask=attention_mask)

        # Extract the last hidden state of the `[CLS]` token for the classification task
        last_hidden_state_cls = outputs[0][:, 0, :]

        # Feed it to the classifier to compute logits
        logits = self.classifier(last_hidden_state_cls)

        return logits


def valid_evaluate(model, val_dataloader):
    """After the completion of each training epoch, measure the model's
    performance on our validation set.
    """
    # Put the model into evaluation mode; dropout layers are disabled at test time.
    model.eval()

    # Tracking variables
    val_accuracy = []
    val_loss = []

    # For each batch in our validation set...
    for batch in val_dataloader:
        # Load batch to the device
        b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)

        # Compute logits
        with torch.no_grad():
            logits = model(b_input_ids, b_attn_mask)

        # Compute loss
        loss = loss_fn(logits, b_labels)
        val_loss.append(loss.item())

        # Get the predictions
        preds = torch.argmax(logits, dim=1).flatten()

        # Calculate the accuracy rate
        accuracy = (preds == b_labels).cpu().numpy().mean() * 100
        val_accuracy.append(accuracy)

    # Compute the average accuracy and loss over the validation set.
    val_loss = np.mean(val_loss)
    val_accuracy = np.mean(val_accuracy)

    return val_loss, val_accuracy


class FinetunningBert(nn.Module):
    """BERT model for classification tasks (fine-tuning stage, 2 labels)."""

    def __init__(self, virus_dir, freeze_bert=False):
        """
        @param virus_dir (str): directory containing the pretrained virBERT.pt checkpoint
        @param freeze_bert (bool): Set `False` to fine-tune the BERT model
        """
        super(FinetunningBert, self).__init__()
        # Hidden size of BERT, hidden size of our classifier, and number of labels
        D_in, H, D_out = 768, 50, 2

        # Load the pretrained classifier and reuse its BERT encoder
        bert_classifier = PretrainedBert(freeze_bert=False)
        bert_classifier.load_state_dict(
            torch.load(virus_dir + '/virBERT.pt', map_location=device))
        self.bert = bert_classifier.bert.to(device)

        # Instantiate a one-layer feed-forward classifier
        self.classifier = nn.Sequential(
            nn.Linear(D_in, H),
            nn.ReLU(),
            # nn.Dropout(0.5),
            nn.Linear(H, D_out)
        )

        # Freeze the BERT model
        if freeze_bert:
            for param in self.bert.parameters():
                param.requires_grad = False

    def forward(self, input_ids, attention_mask):
        """
        Feed input to BERT and the classifier to compute logits
        (same contract as PretrainedBert.forward).
        """
        # Feed input to BERT
        outputs = self.bert(input_ids=input_ids,
                            attention_mask=attention_mask)

        # Extract the last hidden state of the `[CLS]` token for the classification task
        last_hidden_state_cls = outputs[0][:, 0, :]

        # Feed it to the classifier to compute logits
        logits = self.classifier(last_hidden_state_cls)

        return logits


def initialize_finetunningBert(train_dataloader, virus_dir, epochs=4):
    """Initialize the Bert Classifier, the optimizer and the learning rate scheduler."""
    # Instantiate Bert Classifier
    bert_classifier = FinetunningBert(virus_dir, freeze_bert=False)

    # Tell PyTorch to run the model on the selected device
    bert_classifier.to(device)

    # Create the optimizer
    optimizer = AdamW(bert_classifier.parameters(),
                      lr=5e-5,  # Default learning rate
                      eps=1e-8  # Default epsilon value
                      )

    # Total number of training steps
    total_steps = len(train_dataloader) * epochs

    # Set up the learning rate scheduler
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=0,  # Default value
                                                num_training_steps=total_steps)
    return bert_classifier, optimizer, scheduler


def finetunningBert_training(model, optimizer, scheduler, train_dataloader,
                             val_dataloader=None, epochs=4, evaluation=False):
    """Train the BertClassifier model."""
    # Start training loop
    print("Start training...\n")
    for epoch_i in range(epochs):
        # =======================================
        #               Training
        # =======================================
        # Print the header of the result table
        print(f"{'Epoch':^7} | {'Batch':^7} | {'Train Loss':^12} | {'Val Loss':^10} | {'Val Acc':^9} | {'Elapsed':^9}")
        print("-" * 70)

        # Measure the elapsed time of each epoch
        t0_epoch, t0_batch = time.time(), time.time()

        # Reset tracking variables at the beginning of each epoch
        total_loss, batch_loss, batch_counts = 0, 0, 0

        # Put the model into training mode
        model.train()

        # For each batch of training data...
        for step, batch in enumerate(train_dataloader):
            batch_counts += 1
            # Load batch to the device
            b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)

            # Zero out any previously calculated gradients
            model.zero_grad()

            # Perform a forward pass. This will return logits.
            logits = model(b_input_ids, b_attn_mask)

            # Compute loss and accumulate the loss values
            loss = loss_fn(logits, b_labels)
            batch_loss += loss.item()
            total_loss += loss.item()

            # Perform a backward pass to calculate gradients
            loss.backward()

            # Clip the norm of the gradients to 1.0 to prevent "exploding gradients"
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            # Update parameters and the learning rate
            optimizer.step()
            scheduler.step()

            # Print the loss values and time elapsed for every 20 batches
            if (step % 20 == 0 and step != 0) or (step == len(train_dataloader) - 1):
                # Calculate time elapsed for 20 batches
                time_elapsed = time.time() - t0_batch

                # Print training results
                print(f"{epoch_i + 1:^7} | {step:^7} | {batch_loss / batch_counts:^12.6f} | {'-':^10} | {'-':^9} | {time_elapsed:^9.2f}")

                # Reset batch tracking variables
                batch_loss, batch_counts = 0, 0
                t0_batch = time.time()

        # Calculate the average loss over the entire training data
        avg_train_loss = total_loss / len(train_dataloader)
        torch.save(model.state_dict(), '{}model.pt'.format("VirDNA"))
        print("-" * 70)
        # =======================================
        #               Evaluation
        # =======================================
        if evaluation:
            # After the completion of each training epoch, measure the model's
            # performance on our validation set.
            val_loss, val_accuracy = valid_evaluate(model, val_dataloader)

            # Print performance over the entire training data
            time_elapsed = time.time() - t0_epoch

            print(f"{epoch_i + 1:^7} | {'-':^7} | {avg_train_loss:^12.6f} | {val_loss:^10.6f} | {val_accuracy:^9.2f} | {time_elapsed:^9.2f}")
            print("-" * 70)
        print("\n")

    print("Training complete!")


def bertPredictions(torch, model, val_dataloader):
    """Run the model over a dataloader and collect the predicted label indices.

    Note: `torch` is passed in explicitly by the caller (see app.py), and the
    batches are kept on the CPU.
    """
    # Put the model into evaluation mode; dropout layers are disabled at test time.
    model.eval()

    # Tracking variable
    pred = []

    # For each batch in the dataloader...
    for batch in val_dataloader:
        # Unpack the batch (kept on CPU; the third slot is a label placeholder)
        b_input_ids, b_attn_mask, b_labels = tuple(t for t in batch)

        # Compute logits
        with torch.no_grad():
            logits = model(b_input_ids, b_attn_mask)

        # Get the predictions
        preds = torch.argmax(logits, dim=1).flatten()
        pred.append(preds.cpu())

    return pred


class ScratchBert(nn.Module):
    """BERT model for classification tasks, trained from scratch (2 labels)."""

    def __init__(self, freeze_bert=False):
        """
        @param freeze_bert (bool): Set `False` to fine-tune the BERT model
        """
        super(ScratchBert, self).__init__()
        # Hidden size of BERT, hidden size of our classifier, and number of labels
        D_in, H, D_out = 768, 50, 2

        # Instantiate a small BERT encoder from scratch
        config = BertConfig(
            max_position_embeddings=5000,  # longest input the position embeddings can cover
            hidden_size=768,
            num_attention_heads=2,
            num_hidden_layers=2,
            type_vocab_size=1
        )
        self.bert = BertModel(config)

        # Instantiate a one-layer feed-forward classifier
        self.classifier = nn.Sequential(
            nn.Linear(D_in, H),
            nn.ReLU(),
            # nn.Dropout(0.5),
            nn.Linear(H, D_out)
        )

        # Freeze the BERT model
        if freeze_bert:
            for param in self.bert.parameters():
                param.requires_grad = False

    def forward(self, input_ids, attention_mask):
        """
        Feed input to BERT and the classifier to compute logits
        (same contract as PretrainedBert.forward).
        """
        # Feed input to BERT
        outputs = self.bert(input_ids=input_ids,
                            attention_mask=attention_mask)

        # Extract the last hidden state of the `[CLS]` token for the classification task
        last_hidden_state_cls = outputs[0][:, 0, :]

        # Feed it to the classifier to compute logits
        logits = self.classifier(last_hidden_state_cls)

        return logits


def initialize_model(train_dataloader, epochs=4):
    """Initialize the Bert Classifier, the optimizer and the learning rate scheduler."""
    # Instantiate Bert Classifier
    bert_classifier = ScratchBert(freeze_bert=False)

    # Tell PyTorch to run the model on the selected device
    bert_classifier.to(device)

    # Create the optimizer
    optimizer = AdamW(bert_classifier.parameters(),
                      lr=5e-5,  # Default learning rate
                      eps=1e-8  # Default epsilon value
                      )

    # Total number of training steps
    total_steps = len(train_dataloader) * epochs

    # Set up the learning rate scheduler
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=0,  # Default value
                                                num_training_steps=total_steps)
    return bert_classifier, optimizer, scheduler


def train(model, optimizer, scheduler, train_dataloader, val_dataloader=None,
          epochs=4, evaluation=False):
    """Train the BertClassifier model.

    The training loop is identical to finetunningBert_training, so delegate to it.
    """
    finetunningBert_training(model, optimizer, scheduler, train_dataloader,
                             val_dataloader=val_dataloader, epochs=epochs,
                             evaluation=evaluation)
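As a quick smoke test of the model above, the following sketch runs a randomly initialized PretrainedBert on dummy inputs; the batch shape and token ids are illustrative assumptions, not values from the repository:

import torch
from Toxonomy.modules.classifier import PretrainedBert

model = PretrainedBert(freeze_bert=True)
model.eval()

# Two dummy sequences of 16 token ids each (arbitrary ids below the vocab size).
input_ids = torch.randint(0, 622, (2, 16))
attention_mask = torch.ones(2, 16, dtype=torch.long)

with torch.no_grad():
    logits = model(input_ids, attention_mask)
print(logits.shape)  # torch.Size([2, 14]): one score per label (D_out = 14)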
Toxonomy/modules/confusionmatrix.py
ADDED
@@ -0,0 +1,36 @@
import itertools

import matplotlib.pyplot as plt
import numpy as np


def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Greens):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Annotate each cell, switching text color against the background
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()
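For illustration, a short usage sketch of the helper above; the matrix values and class names below are made up:

import numpy as np
import matplotlib.pyplot as plt
from Toxonomy.modules.confusionmatrix import plot_confusion_matrix

cm = np.array([[8, 2],
               [1, 9]])
plot_confusion_matrix(cm, classes=["class 0", "class 1"], normalize=True)
plt.show()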
Toxonomy/modules/preprocessor.py
ADDED
@@ -0,0 +1,44 @@
from transformers import BertTokenizer
import torch

# Load the BERT tokenizer trained on DNA 3-mers (see BERT/main/berttok/vocab.txt)
tokenizer = BertTokenizer.from_pretrained('./BERT/main/berttok')


# Create a function to tokenize a set of texts
def preprocessing_for_bert(data):
    """Perform required preprocessing steps for pretrained BERT.
    @param data (np.array): Array of texts to be processed.
    @return input_ids (torch.Tensor): Tensor of token ids to be fed to a model.
    @return attention_masks (torch.Tensor): Tensor of indices specifying which
        tokens should be attended to by the model.
    """
    # Create empty lists to store outputs
    input_ids = []
    attention_masks = []

    # For every sentence...
    for sent in data:
        # `encode_plus` will:
        #    (1) Tokenize the sentence
        #    (2) Add the `[CLS]` and `[SEP]` token to the start and end
        #    (3) Truncate/pad the sentence to max length
        #    (4) Map tokens to their IDs
        #    (5) Create the attention mask
        #    (6) Return a dictionary of outputs
        encoded_sent = tokenizer.encode_plus(
            text=sent,                   # Preprocess sentence
            add_special_tokens=True,     # Add `[CLS]` and `[SEP]`
            max_length=5000,             # Max length to truncate/pad
            padding='max_length',        # Pad sentence to max length
            truncation=True,             # Truncate longer sequences
            # return_tensors='pt',       # Return PyTorch tensor
            return_attention_mask=True   # Return attention mask
        )

        # Add the outputs to the lists
        input_ids.append(encoded_sent.get('input_ids'))
        attention_masks.append(encoded_sent.get('attention_mask'))

    # Convert lists to tensors
    input_ids = torch.tensor(input_ids)
    attention_masks = torch.tensor(attention_masks)

    return input_ids, attention_masks
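A short usage sketch: every input is padded out to 5000 token positions, so even a brief k-mer sentence yields a (1, 5000) tensor. The sentence below is an arbitrary example, and this assumes the tokenizer's special and padding tokens resolve against the small vocabulary above:

from Toxonomy.modules.preprocessor import preprocessing_for_bert

sentences = ["atg tgg gta acc"]  # one space-separated 3-mer sentence
input_ids, attention_masks = preprocessing_for_bert(sentences)
print(input_ids.shape)           # torch.Size([1, 5000])
print(attention_masks[0].sum())  # count of real (non-padding) positions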
app.py
ADDED
@@ -0,0 +1,82 @@
import gradio as gr

# from transformers import pipeline
#
# pipe = pipeline("translation", model="Helsinki-NLP/opus-mt-en-es")
#
# def predict(text):
#     return pipe(text)[0]["translation_text"]
#
# iface = gr.Interface(
#     fn=predict,
#     inputs='text',
#     outputs='text',
#     examples=[["Hello! My name is Omar"]]
# )
#
# iface.launch()

import pandas as pd
import torch
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler

from Toxonomy.modules.preprocessor import preprocessing_for_bert
from Toxonomy.modules.classifier import PretrainedBert, bertPredictions


def Kmers_funct(seq, size):
    """Split a sequence into overlapping, lower-cased k-mers of the given size."""
    return [seq[x:x + size].lower() for x in range(len(seq) - size + 1)]


def kmers_sentences(mySeq):
    """Turn a raw DNA sequence into a space-separated 'sentence' of 3-mers."""
    # Kmers_funct(mySeq, size=7)
    words = Kmers_funct(mySeq, size=3)
    joined_sentence = ' '.join(words)
    return joined_sentence


def predict(text):
    print(text)
    temp_df = pd.DataFrame([text]).astype('str')
    temp_df.columns = ['seq']
    # Keep only sequences of at most 7000 bases
    mask = temp_df['seq'].str.len() <= 7000
    temp_df = temp_df.loc[mask]
    temp_df['Processed'] = temp_df['seq'].apply(kmers_sentences)
    test_inputs, test_masks = preprocessing_for_bert(temp_df['Processed'])
    # There are no labels at inference time, so the masks are passed
    # again as a placeholder third tensor.
    test_data = TensorDataset(test_inputs, test_masks, test_masks)
    test_sampler = SequentialSampler(test_data)
    test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=8)
    bert_classifier = PretrainedBert(freeze_bert=False)
    # Load the checkpoint on the CPU (the Space has no GPU)
    bert_classifier.load_state_dict(
        torch.load("./virBERT.pt", map_location=torch.device('cpu')))
    print(next(bert_classifier.parameters()).is_cuda)  # confirm the model is on CPU
    pred = bertPredictions(torch, bert_classifier, test_dataloader)
    return str(pred)


iface = gr.Interface(
    fn=predict,
    inputs='text',
    outputs='text'
)

iface.launch()
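The k-mer step is easiest to see on a concrete input: Kmers_funct slides a window of size 3 over the sequence, so a 6-base input yields 4 overlapping 3-mers. A standalone sketch with an arbitrary sequence (to_kmer_sentence is a hypothetical name that mirrors kmers_sentences above):

# Mirrors what kmers_sentences in app.py does to a raw sequence.
def to_kmer_sentence(seq, size=3):
    return ' '.join(seq[x:x + size].lower() for x in range(len(seq) - size + 1))

print(to_kmer_sentence("ATGGTA"))  # "atg tgg ggt gta"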
requirements.txt
ADDED
@@ -0,0 +1,6 @@
transformers
tensorflow
sentencepiece
tokenizers
torch
scikit-learn
pandas      # used by app.py
matplotlib  # used by Toxonomy/modules/confusionmatrix.py
virBERT.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:91ba86b1a74084de062f7b9d70af958d8c781e2958b582cc96574177cd9b3b68
size 168411425