{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Some weights of the model checkpoint at xlm-roberta-large were not used when initializing XLMRobertaModel: ['lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias']\n",
"- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
"- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"torch.Size([4, 768])\n"
]
}
],
"source": [
"from multilingual_clip import pt_multilingual_clip\n",
"import transformers\n",
"\n",
"texts = [\n",
" 'Three blind horses listening to Mozart.',\n",
" 'Älgen är skogens konung!',\n",
" 'Wie leben Eisbären in der Antarktis?',\n",
" 'Вы знали, что все белые медведи левши?'\n",
"]\n",
"model_name = 'M-CLIP/XLM-Roberta-Large-Vit-L-14'\n",
"\n",
"# Load Model & Tokenizer\n",
"model = pt_multilingual_clip.MultilingualCLIP.from_pretrained(model_name)\n",
"tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)\n",
"\n",
"embeddings = model.forward(texts, tokenizer)\n",
"print(embeddings.shape)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"model"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"texts = [\n",
" 'Aku sayang kamu',\n",
" 'Aku benci kamu',\n",
"]\n",
"embeddings = model.forward(texts, tokenizer)"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"embeddings_1, embeddings_2 = embeddings\n",
"embeddings_1 = embeddings_1.cpu().detach().numpy()\n",
"embeddings_2 = embeddings_2.cpu().detach().numpy()"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"from numpy.linalg import norm"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.967305\n"
]
}
],
"source": [
"cosine = np.dot(embeddings_1,embeddings_2)/(norm(embeddings_1)*norm(embeddings_2))\n",
"print(cosine)"
]
},
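{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# A minimal sketch added for illustration (not part of the original run), assuming\n",
"# `model`, `tokenizer` and `norm` from the cells above are still in scope:\n",
"# normalize the embeddings row-wise, then a single matrix product yields every\n",
"# pairwise cosine similarity at once instead of one pair at a time.\n",
"sentences = [\n",
"    'Three blind horses listening to Mozart.',\n",
"    'Aku sayang kamu',\n",
"    'Aku benci kamu',\n",
"]\n",
"emb = model.forward(sentences, tokenizer).cpu().detach().numpy()\n",
"emb = emb / norm(emb, axis=1, keepdims=True)\n",
"print(emb @ emb.T)"
]
},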
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "base",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.13"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "ad2bdc8ecc057115af97d19610ffacc2b4e99fae6737bb82f5d7fb13d2f2c186"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}