tadsatlawa-na commited on
Commit
bccbe1d
1 Parent(s): ea72f6e

Upload nanoBERTExample.ipynb

Browse files
Files changed (1) hide show
  1. nanoBERTExample.ipynb +156 -0
nanoBERTExample.ipynb ADDED
@@ -0,0 +1,156 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "nbformat": 4,
3
+ "nbformat_minor": 0,
4
+ "metadata": {
5
+ "colab": {
6
+ "provenance": [],
7
+ "gpuType": "T4"
8
+ },
9
+ "kernelspec": {
10
+ "name": "python3",
11
+ "display_name": "Python 3"
12
+ },
13
+ "language_info": {
14
+ "name": "python"
15
+ },
16
+ "accelerator": "GPU"
17
+ },
18
+ "cells": [
19
+ {
20
+ "cell_type": "markdown",
21
+ "source": [
22
+ "# nanoBERT Example\n",
23
+ "\n",
24
+ "Here we present nanoBERT, a nanobody-specific transformer. Its primary application is position infilling, predicting which amino acids could be available at a given position according to the nanobody-specific distribution. "
25
+ ],
26
+ "metadata": {
27
+ "id": "JU2dnhr24egK"
28
+ }
29
+ },
30
+ {
31
+ "cell_type": "code",
32
+ "execution_count": 9,
33
+ "metadata": {
34
+ "colab": {
35
+ "base_uri": "https://localhost:8080/"
36
+ },
37
+ "id": "gxL4QKeNqYXI",
38
+ "outputId": "256d9b91-ed93-462a-8d6f-8c257b973f91"
39
+ },
40
+ "outputs": [
41
+ {
42
+ "output_type": "stream",
43
+ "name": "stdout",
44
+ "text": [
45
+ "Requirement already satisfied: transformers in /usr/local/lib/python3.10/dist-packages (4.34.1)\n",
46
+ "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from transformers) (3.12.4)\n",
47
+ "Requirement already satisfied: huggingface-hub<1.0,>=0.16.4 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.17.3)\n",
48
+ "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (1.23.5)\n",
49
+ "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from transformers) (23.2)\n",
50
+ "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (6.0.1)\n",
51
+ "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (2023.6.3)\n",
52
+ "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from transformers) (2.31.0)\n",
53
+ "Requirement already satisfied: tokenizers<0.15,>=0.14 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.14.1)\n",
54
+ "Requirement already satisfied: safetensors>=0.3.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.4.0)\n",
55
+ "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.10/dist-packages (from transformers) (4.66.1)\n",
56
+ "Requirement already satisfied: fsspec in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.16.4->transformers) (2023.6.0)\n",
57
+ "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.16.4->transformers) (4.5.0)\n",
58
+ "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (3.3.0)\n",
59
+ "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (3.4)\n",
60
+ "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2.0.7)\n",
61
+ "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2023.7.22)\n"
62
+ ]
63
+ }
64
+ ],
65
+ "source": [
66
+ "# Install the transformers library\n",
67
+ "! pip install --upgrade transformers"
68
+ ]
69
+ },
70
+ {
71
+ "cell_type": "code",
72
+ "source": [
73
+ "from transformers import pipeline, RobertaTokenizer, AutoModel"
74
+ ],
75
+ "metadata": {
76
+ "id": "vG5ndbr_rYjL"
77
+ },
78
+ "execution_count": 10,
79
+ "outputs": []
80
+ },
81
+ {
82
+ "cell_type": "code",
83
+ "source": [
84
+ "# Initialise the tokenizer\n",
85
+ "tokenizer = RobertaTokenizer.from_pretrained(\"NaturalAntibody/nanoBERT\", return_tensors=\"pt\")"
86
+ ],
87
+ "metadata": {
88
+ "id": "1GNqH8HlrzmF"
89
+ },
90
+ "execution_count": 11,
91
+ "outputs": []
92
+ },
93
+ {
94
+ "cell_type": "code",
95
+ "source": [
96
+ "# Initialise model\n",
97
+ "unmasker = pipeline('fill-mask', model=\"tadsatlawa/nanoBERT\", tokenizer=tokenizer, top_k=20 )"
98
+ ],
99
+ "metadata": {
100
+ "id": "3CYcwIOU3xCY"
101
+ },
102
+ "execution_count": 12,
103
+ "outputs": []
104
+ },
105
+ {
106
+ "cell_type": "code",
107
+ "source": [
108
+ "# Predict the residue probability at a masked position (the loop below assumes a single '<mask>')\n",
109
+ "# mark position to predict with '<mask>'\n",
110
+ "seq = \"QLVSGPEVKKPGASVKVSCKASGYIFNNYGISWVRQAPGQGLEWMGWISTDNGNTNYAQKVQGRVTMTTDTSTSTAYMELRSLRYDDTAVYYC<mask>ATNWGSYFEHWGQGTLVTVSS\"\n",
111
+ "\n",
112
+ "residueProbability = unmasker(seq)\n",
113
+ "\n",
114
+ "# Print residue probabilities\n",
115
+ "for scores in residueProbability:\n",
116
+ " print(f\"Amino Acid : {scores['token_str']}, probability = {scores['score']}\")"
117
+ ],
118
+ "metadata": {
119
+ "colab": {
120
+ "base_uri": "https://localhost:8080/"
121
+ },
122
+ "id": "6rtUxgbYsygY",
123
+ "outputId": "da127f6a-e076-44ba-fce8-ff68c06cf354"
124
+ },
125
+ "execution_count": 13,
126
+ "outputs": [
127
+ {
128
+ "output_type": "stream",
129
+ "name": "stdout",
130
+ "text": [
131
+ "Amino Acid : S, probability = 0.4827525019645691\n",
132
+ "Amino Acid : A, probability = 0.22524100542068481\n",
133
+ "Amino Acid : N, probability = 0.09490441530942917\n",
134
+ "Amino Acid : Y, probability = 0.07571367919445038\n",
135
+ "Amino Acid : K, probability = 0.04161035269498825\n",
136
+ "Amino Acid : T, probability = 0.027568845078349113\n",
137
+ "Amino Acid : H, probability = 0.009884347207844257\n",
138
+ "Amino Acid : C, probability = 0.008951968513429165\n",
139
+ "Amino Acid : V, probability = 0.007528781425207853\n",
140
+ "Amino Acid : R, probability = 0.006156255956739187\n",
141
+ "Amino Acid : G, probability = 0.005135924089699984\n",
142
+ "Amino Acid : I, probability = 0.004699127282947302\n",
143
+ "Amino Acid : W, probability = 0.0030531329102814198\n",
144
+ "Amino Acid : M, probability = 0.0022762243170291185\n",
145
+ "Amino Acid : F, probability = 0.001321254065260291\n",
146
+ "Amino Acid : E, probability = 0.0009838981786742806\n",
147
+ "Amino Acid : L, probability = 0.0006674979231320322\n",
148
+ "Amino Acid : D, probability = 0.000666878477204591\n",
149
+ "Amino Acid : Q, probability = 0.0005539602716453373\n",
150
+ "Amino Acid : P, probability = 0.00032376404851675034\n"
151
+ ]
152
+ }
153
+ ]
154
+ }
155
+ ]
156
+ }