{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import torch\n", "import torch.nn as nn\n", "torch.cuda.is_available()" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/matt/hf/sqllama-V0/.venv/lib/python3.7/site-packages/bitsandbytes/cuda_setup/main.py:136: UserWarning: /opt/conda did not contain libcudart.so as expected! Searching further paths...\n", " warn(msg)\n", "The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. \n", "The tokenizer class you load from this checkpoint is 'LLaMATokenizer'. \n", "The class this function is called from is 'LlamaTokenizer'.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "===================================BUG REPORT===================================\n", "Welcome to bitsandbytes. For bug reports, please submit your error trace to: https://github.com/TimDettmers/bitsandbytes/issues\n", "================================================================================\n", "CUDA SETUP: CUDA runtime path found: /usr/local/cuda/lib64/libcudart.so\n", "CUDA SETUP: Highest compute capability among GPUs detected: 7.5\n", "CUDA SETUP: Detected CUDA version 113\n", "CUDA SETUP: Loading binary /home/matt/hf/sqllama-V0/.venv/lib/python3.7/site-packages/bitsandbytes/libbitsandbytes_cuda113.so...\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "f8ad2d1a5de842bcb6b7e3c6972d9074", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Loading checkpoint shards: 0%| | 0/33 [00:00 5\n", "END\n", "\n" ] } ], "source": [ "import random\n", "import json\n", "\n", "# defined by WikiSQL\n", "\n", "agg_ops = ['', 'MAX', 'MIN', 'COUNT', 'SUM', 'AVG']\n", "cond_ops = ['=', '>', '<', 'OP']\n", "syms = ['SELECT', 'WHERE', 'AND', 'COL', 'TABLE', 'CAPTION', 'PAGE', 'SECTION', 'OP', 'COND', 'QUESTION', 'AGG', 'AGGOPS', 'CONDOPS']\n", "\n", "def fix_repr(d,cols,types,tid):\n", " sel_index=d['sel'] \n", " agg_index=d['agg']\n", " conditions=d['conds']\n", " col = cols[sel_index]\n", " rep = 'SELECT {agg} {sel} FROM {tid}'.format(\n", " agg=agg_ops[agg_index],\n", " sel=col,\n", " tid=tid\n", " )\n", " if conditions:\n", " cs = []\n", " for i, o, v in conditions:\n", " #print(i,cols)\n", " nm = cols[i]\n", " op = cond_ops[o]\n", " \n", " if types[i] in ['text']:\n", " val = f\"\\'{v}\\'\"\n", " else:\n", " val = v\n", " cs.append(f'{nm} {op} {val}')\n", " #print(cs)\n", "\n", " rep += ' WHERE ' + ' AND '.join(cs)\n", " \n", " return rep\n", "\n", "tbl_cols = {}\n", "tbl_types = {}\n", "tbl_str = {}\n", "\n", "prefix = 'Below is a question that describes a data request, paired with an input that describes a SQL table. Write a SQL query that retrieves the data.'\n", "\n", "def tbl_def_to_string(id, header, types):\n", " s = f'table: {id}\\ncolumns: ' + ','.join(header)\n", " return s\n", "\n", "with open('data/train.tables.jsonl') as f:\n", " for line in f:\n", " js = json.loads(line)\n", " id = js['id']\n", " hdr = js['header']\n", " ts = js['types']\n", " tbl_str[id] = tbl_def_to_string(id,hdr,ts)\n", " tbl_cols[id] = hdr\n", " tbl_types[id] = ts\n", "\n", "q_s = []\n", "a_s = []\n", "\n", "with open('data/train.jsonl') as f:\n", " for line in f:\n", " js = json.loads(line)\n", " id = js['table_id']\n", " s = tbl_str[id]\n", " qst = js['question']\n", " nl = s + '\\nQ: ' + qst + '\\nA: '\n", " q_s.append(nl)\n", "\n", " sql = js['sql']\n", " a = fix_repr(sql,tbl_cols[id],tbl_types[id],id)\n", " a = a + \"\\nEND\\n\"\n", " a_s.append(a)\n", "\n", "M = len(q_s)\n", "\n", "data_txt = [q_s[i] + a_s[i] for i in range(M)]\n", "\n", "for i in range(5):\n", " j = random.randint(0,M-1)\n", " print()\n", " print(data_txt[j]) \n", " \n", " " ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "toks = [tokenizer(s) for s in data_txt]\n" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "92\n", " 0\n", "count 56355.000000\n", "mean 101.219519\n", "std 21.740325\n", "min 63.000000\n", "25% 87.500000\n", "50% 97.000000\n", "75% 109.000000\n", "max 461.000000\n", "32084\n" ] } ], "source": [ "import numpy as np\n", "import pandas as pd\n", "\n", "print(len(toks[0].input_ids))\n", "lens = np.array([len(tok.input_ids) for tok in toks])\n", "print(pd.DataFrame(lens).describe())\n", "\n", "z = zip(q_s,lens)\n", "q_red = [a for a,b in z if b < 100]\n", "z = zip(a_s,lens)\n", "a_red = [a for a,b in z if b < 100]\n", "\n", "data_red = [q_red[i] + a_red[i] for i in range(len(q_red))]\n", "print(len(data_red))\n", "\n" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "708e075933754c6c940eeae9e3d3abc9", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Map: 0%| | 0/32084 [00:00\n", " \n", " \n", " [ 4/500 01:51 < 7:38:51, 0.02 it/s, Epoch 0.01/2]\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
StepTraining Loss
12.748800
22.725100

" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "ename": "KeyboardInterrupt", "evalue": "", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", "\u001b[0;32m/var/tmp/ipykernel_24178/3667964638.py\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0mdata_collator\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtransformers\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mDataCollatorForLanguageModeling\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtokenizer\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmlm\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6\u001b[0m )\n\u001b[0;32m----> 7\u001b[0;31m \u001b[0mtrainer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtrain\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresume_from_checkpoint\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 8\u001b[0m \u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msave_pretrained\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'sqllama-out3'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m~/hf/sqllama-V0/.venv/lib/python3.7/site-packages/transformers/trainer.py\u001b[0m in \u001b[0;36mtrain\u001b[0;34m(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)\u001b[0m\n\u001b[1;32m 1664\u001b[0m \u001b[0mresume_from_checkpoint\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mresume_from_checkpoint\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1665\u001b[0m \u001b[0mtrial\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtrial\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1666\u001b[0;31m \u001b[0mignore_keys_for_eval\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mignore_keys_for_eval\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1667\u001b[0m )\n\u001b[1;32m 1668\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m~/hf/sqllama-V0/.venv/lib/python3.7/site-packages/transformers/trainer.py\u001b[0m in \u001b[0;36m_inner_training_loop\u001b[0;34m(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)\u001b[0m\n\u001b[1;32m 1927\u001b[0m \u001b[0mtr_loss_step\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtraining_step\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmodel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minputs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1928\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1929\u001b[0;31m \u001b[0mtr_loss_step\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtraining_step\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmodel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minputs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1930\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1931\u001b[0m if (\n", "\u001b[0;32m~/hf/sqllama-V0/.venv/lib/python3.7/site-packages/transformers/trainer.py\u001b[0m in \u001b[0;36mtraining_step\u001b[0;34m(self, model, inputs)\u001b[0m\n\u001b[1;32m 2707\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2708\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdo_grad_scaling\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2709\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mscaler\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mscale\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mloss\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbackward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2710\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0muse_apex\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2711\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mamp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mscale_loss\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mloss\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0moptimizer\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mscaled_loss\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m~/hf/sqllama-V0/.venv/lib/python3.7/site-packages/torch/_tensor.py\u001b[0m in \u001b[0;36mbackward\u001b[0;34m(self, gradient, retain_graph, create_graph, inputs)\u001b[0m\n\u001b[1;32m 487\u001b[0m )\n\u001b[1;32m 488\u001b[0m torch.autograd.backward(\n\u001b[0;32m--> 489\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mgradient\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mretain_graph\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcreate_graph\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minputs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0minputs\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 490\u001b[0m )\n\u001b[1;32m 491\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m~/hf/sqllama-V0/.venv/lib/python3.7/site-packages/torch/autograd/__init__.py\u001b[0m in \u001b[0;36mbackward\u001b[0;34m(tensors, grad_tensors, retain_graph, create_graph, grad_variables, inputs)\u001b[0m\n\u001b[1;32m 197\u001b[0m Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass\n\u001b[1;32m 198\u001b[0m \u001b[0mtensors\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mgrad_tensors_\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mretain_graph\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcreate_graph\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minputs\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 199\u001b[0;31m allow_unreachable=True, accumulate_grad=True) # Calls into the C++ engine to run the backward pass\n\u001b[0m\u001b[1;32m 200\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 201\u001b[0m def grad(\n", "\u001b[0;31mKeyboardInterrupt\u001b[0m: " ] } ], "source": [ "trainer = transformers.Trainer(\n", " model = model,\n", " train_dataset = data,\n", " args = targs,\n", " data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)\n", ")\n", "trainer.train(resume_from_checkpoint=False)\n", "model.save_pretrained('sqllama-out3')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/matt/hf/sqllama-V0/.venv/lib/python3.7/site-packages/torch/utils/checkpoint.py:31: UserWarning: None of the inputs have requires_grad=True. Gradients will be None\n", " warnings.warn(\"None of the inputs have requires_grad=True. Gradients will be None\")\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "from model\n", "table: 1-12028543-3\n", "columns: Season,Cup FinalDate,WinningTeam,Score,LosingTeam,Location,Cup Final Attendance\n", "Q: Who was the winning team in the 1989 season?\n", "A: SELECT WinningTeam FROM 1-12028543-3 WHERE Season = '1989'\n", "END\n", "END\n", "END\n", "END\n", "\n", "expected answer\n", "SELECT WinningTeam FROM 1-12028543-3 WHERE Season = '1989'\n", "END\n", "\n", "from model\n", "table: 2-18096431-5\n", "columns: Place,Player,Country,Score,To par\n", "Q: What is To par, when Country is \"United States\", and when Player is \"Mark Brooks\"?\n", "A: 18-1\n", "END\n", "\n", "\n", "expected answer\n", "SELECT To par FROM 2-18096431-5 WHERE Country = 'united states' AND Player = 'mark brooks'\n", "END\n", "\n", "from model\n", "table: 2-10701914-2\n", "columns: Home team,Home team score,Away team,Away team score,Venue,Crowd,Date\n", "Q: What home team played at the western oval?\n", "A: Western Bulldogs\n", "END\n", "END\n", "END\n", "END\n", "END\n", "END\n", "END\n", "END\n", "END\n", "END\n", "END\n", "END\n", "END\n", "END\n", "END\n", "END\n", "END\n", "\n", "\n", "expected answer\n", "SELECT Home team FROM 2-10701914-2 WHERE Venue = 'western oval'\n", "END\n", "\n", "from model\n", "table: 1-29598261-1\n", "columns: Name,Number,Position,Height,Weight,Year,Hometown,Last School/College\n", "Q: what is the school for chris mcnamara?\n", "A: SELECT Last School/College FROM 1-29598261-1 WHERE Name = 'chris mcnamara'\n", "END\n", "END\n", "END\n", "END\n", "\n", "\n", "expected answer\n", "SELECT Last School/College FROM 1-29598261-1 WHERE Name = 'Chris McNamara'\n", "END\n", "\n", "from model\n", "table: 1-27722408-11\n", "columns: Game,Date,Team,Score,High points,High rebounds,High assists,Location Attendance,Record\n", "Q: Who had the most assists and how many did they have on April 8?\n", "A: SELECT High assists FROM 1-27722408-11 WHERE Date = 'april 8'\n", "END\n", "\n", "\n", "expected answer\n", "SELECT High assists FROM 1-27722408-11 WHERE Date = 'April 8'\n", "END\n", "\n", "from model\n", "table: 1-21378339-5\n", "columns: Draw,Song,Artist,Panel Points,Televotes,Televote Points,Score,Placing\n", "Q: Name the number of artists for panel points being 5\n", "A: SELECT COUNT Artist FROM 1-21378339-5 WHERE Panel Points = 5\n", "END\n", "END\n", "END\n", "END\n", "END\n", "\n", "expected answer\n", "SELECT COUNT Artist FROM 1-21378339-5 WHERE Panel Points = 5\n", "END\n", "\n", "from model\n", "table: 2-11545282-17\n", "columns: Player,Nationality,Position,Years for Jazz,School/Club Team\n", "Q: What position does Michael Ruffin play?\n", "A: SELECT Position FROM 2-11545282-17 WHERE Player = 'michael ruffin'\n", "END\n", "END\n", "END\n", "END\n", "END\n", "END\n", "END\n", "END\n", "\n", "\n", "expected answer\n", "SELECT Position FROM 2-11545282-17 WHERE Player = 'michael ruffin'\n", "END\n", "\n", "from model\n", "table: 1-17801022-1\n", "columns: Year,Date,Driver,Manufacturer,Laps,Miles (km),Race Time,Average Speed (mph)\n", "Q: What manufacturer won the race on November 2?\n", "A: SELECT Manufacturer FROM 1-17801022-1 WHERE Date = 'november 2'\n", "END\n", "END\n", "END\n", "\n", "expected answer\n", "SELECT Manufacturer FROM 1-17801022-1 WHERE Date = 'November 2'\n", "END\n", "\n", "from model\n", "table: 2-10806592-14\n", "columns: Home team,Home team score,Away team,Away team score,Venue,Crowd,Date\n", "Q: What was the away score when the home team was Melbourne?\n", "A: SELECT Away team score FROM 2-10806592-14 WHERE Home team = 'melbourne'\n", "END\n", "END\n", "END\n", "\n", "\n", "expected answer\n", "SELECT Away team score FROM 2-10806592-14 WHERE Home team = 'melbourne'\n", "END\n", "\n", "from model\n", "table: 2-17978030-6\n", "columns: Date,Time,Score,Set 1,Set 2,Set 3,Total\n", "Q: What is the score when the set 3 is 26–28?\n", "A: SELECT Score FROM 2-17978030-6 WHERE Set 3 = '26–28'\n", "END\n", "END\n", "Q: What\n", "\n", "expected answer\n", "SELECT Score FROM 2-17978030-6 WHERE Set 3 = '26–28'\n", "END\n", "\n" ] } ], "source": [ "def get_query(q):\n", " \n", " toks = tokenizer(q , return_tensors='pt')\n", " ctoks = toks.input_ids.to('cuda')\n", " gen = model.generate(ctoks, max_length=100)\n", " return tokenizer.decode(gen[0])\n", "\n", "M = len(q_red)\n", "\n", "for _ in range(10):\n", " j = random.randint(0,M-1)\n", " qs = q_red[j]\n", " a = a_red[j]\n", "\n", " ma = get_query(qs)\n", "\n", " #print(qs)\n", " print('from model')\n", " print(ma)\n", " print()\n", " print('expected answer')\n", " print(a)\n" ] } ], "metadata": { "kernelspec": { "display_name": ".venv", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.3" }, "orig_nbformat": 4, "vscode": { "interpreter": { "hash": "6a381460736e8a0eabfb35eafae436ba15c06439de44e28b965ea473bd8dda90" } } }, "nbformat": 4, "nbformat_minor": 2 }