{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"WARNING:tensorflow:From c:\\Users\\Fadli\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\keras\\src\\losses.py:2976: The name tf.losses.sparse_softmax_cross_entropy is deprecated. Please use tf.compat.v1.losses.sparse_softmax_cross_entropy instead.\n",
"\n",
"WARNING:tensorflow:From c:\\Users\\Fadli\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\tf2onnx\\tf_loader.py:68: The name tf.reset_default_graph is deprecated. Please use tf.compat.v1.reset_default_graph instead.\n",
"\n",
"WARNING:tensorflow:From c:\\Users\\Fadli\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\tf2onnx\\tf_loader.py:72: The name tf.train.import_meta_graph is deprecated. Please use tf.compat.v1.train.import_meta_graph instead.\n",
"\n",
"WARNING:tensorflow:From c:\\Users\\Fadli\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\keras\\src\\backend.py:873: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.\n",
"\n",
"WARNING:tensorflow:From c:\\Users\\Fadli\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\keras\\src\\layers\\normalization\\batch_normalization.py:979: The name tf.nn.fused_batch_norm is deprecated. Please use tf.compat.v1.nn.fused_batch_norm instead.\n",
"\n"
]
}
],
"source": [
"import pandas as pd\n",
"import re\n",
"\n",
"from dotenv import load_dotenv\n",
"from img2table.document import Image\n",
"from img2table.ocr import DocTR\n",
"from itertools import product\n",
"\n",
"load_dotenv()\n",
"pd.set_option('expand_frame_repr', False)\n",
"ocr = DocTR()"
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {},
"outputs": [],
"source": [
"image = Image('../NutriGenMe-Testing/monogenic-1.png')"
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2\n"
]
},
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" 0 | \n",
" 1 | \n",
" 2 | \n",
" 3 | \n",
" 4 | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" None | \n",
" None | \n",
" Monogenic Diabetes or | \n",
" Associated With Common | \n",
" None | \n",
"
\n",
" \n",
" 1 | \n",
" Gene Name | \n",
" Major Function | \n",
" Syndromes | \n",
" T1D and/or T2D | \n",
" Refs. | \n",
"
\n",
" \n",
" 2 | \n",
" KCNJ11 | \n",
" Encodes pore-forming inwardly-rectifying | \n",
" PNDM (most common cause) | \n",
" E23K | \n",
" 42-46 | \n",
"
\n",
" \n",
" 3 | \n",
" None | \n",
" potassium channel subunits (Kir6.2) | \n",
" and TNDM, CHI, MODY | \n",
" None | \n",
" None | \n",
"
\n",
" \n",
" 4 | \n",
" ABCC8 | \n",
" Encodes regulatory SUR1 subunits | \n",
" PNDM and TNDM, CHI, MODY | \n",
" A1369S, 1273AGA, R1420H | \n",
" 46,47,52 | \n",
"
\n",
" \n",
" 5 | \n",
" GCK | \n",
" A key glucose-phosphoryating enzyme; | \n",
" GCK-MODY (MODY2), PNDM, | \n",
" rs1799884 (G/A), rs4607517 (A/G), | \n",
" 75,78,79 | \n",
"
\n",
" \n",
" 6 | \n",
" None | \n",
" a glucose sensor | \n",
" CHI | \n",
" 3'UTR SNP, chr7:44184184-G/A | \n",
" None | \n",
"
\n",
" \n",
" 7 | \n",
" SLC2A2 | \n",
" Encodes GLUT2, a high-capacity facilitative | \n",
" FBS | \n",
" SNPS rs5393 (AA) and rs5394 | \n",
" 93-100 | \n",
"
\n",
" \n",
" 8 | \n",
" None | \n",
" glucose transporter | \n",
" None | \n",
" (CC) in the promoter region | \n",
" None | \n",
"
\n",
" \n",
" 9 | \n",
" None | \n",
" None | \n",
" None | \n",
" and SNPS rs5400 (T1101) and | \n",
" None | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" 0 1 2 3 4\n",
"0 None None Monogenic Diabetes or Associated With Common None\n",
"1 Gene Name Major Function Syndromes T1D and/or T2D Refs.\n",
"2 KCNJ11 Encodes pore-forming inwardly-rectifying PNDM (most common cause) E23K 42-46\n",
"3 None potassium channel subunits (Kir6.2) and TNDM, CHI, MODY None None\n",
"4 ABCC8 Encodes regulatory SUR1 subunits PNDM and TNDM, CHI, MODY A1369S, 1273AGA, R1420H 46,47,52\n",
"5 GCK A key glucose-phosphoryating enzyme; GCK-MODY (MODY2), PNDM, rs1799884 (G/A), rs4607517 (A/G), 75,78,79\n",
"6 None a glucose sensor CHI 3'UTR SNP, chr7:44184184-G/A None\n",
"7 SLC2A2 Encodes GLUT2, a high-capacity facilitative FBS SNPS rs5393 (AA) and rs5394 93-100\n",
"8 None glucose transporter None (CC) in the promoter region None\n",
"9 None None None and SNPS rs5400 (T1101) and None"
]
},
"execution_count": 51,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"extracted_tables = image.extract_tables(ocr=ocr, implicit_rows=True, borderless_tables=True, min_confidence=0)\n",
"\n",
"df = []\n",
"print(len(extracted_tables))\n",
"if len(extracted_tables) > 0:\n",
" df = extracted_tables[0].df\n",
" for et in extracted_tables[1:]:\n",
" df = pd.concat([df, et.df]).reset_index(drop=True)\n",
"\n",
"df.head(10)"
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" 0 1 2 3 4\n",
"0 Monogenic Diabetes or Associated With Common \n",
"1 Gene Name Major Function Syndromes T1D and/or T2D Refs.\n",
"2 KCNJ11 Encodes pore-forming inwardly-rectifying potas... PNDM (most common cause) and TNDM, CHI, MODY E23K 42-46\n",
"3 ABCC8 Encodes regulatory SUR1 subunits PNDM and TNDM, CHI, MODY A1369S, 1273AGA, R1420H 46,47,52\n",
"4 GCK A key glucose-phosphoryating enzyme; a glucose... GCK-MODY (MODY2), PNDM, CHI rs1799884 (G/A), rs4607517 (A/G), 3'UTR SNP, c... 75,78,79\n",
"5 SLC2A2 Encodes GLUT2, a high-capacity facilitative gl... FBS SNPS rs5393 (AA) and rs5394 (CC) in the promot... 93-100\n",
"6 HNF1A/TCF1 TF; regulator of pancreatic B-cell differentia... HNF1A-MODY (MODY3), most common cause of MODY,... G319S, C.1522G>A (p.E508K) 114, 118, 119\n",
"7 HNF4A Key TF for early fetal development HNF4A MODY (MODY1), CHI SNPS rs2144908, rs3818247 and rs884614, rs4810... 121-124, 274\n",
"8 HNF1B/TCF2 TF; required for the generation of pancreatic ... RCAD syndrome, or MODY5; TNDM and PNDM (rare) SNP rs757210 A, TS4430796 A, and TS7501939 C 141, 144\n",
"9 PDX1 TF; required for pancreas development, B-cell ... PNDM, MODY4 C18R, Q59L, D76N, R197H, G212R, P239Q, InsCCG2... 163-165, 167\n",
"10 PAX4 Islet TF that functions mainly as a transcript... MODY9 R121W, R133W, R37W, rs10229583 G 180, 181, 187\n",
"11 NEUROD1/BETA2 TF; required for the development of the endocr... MODY6 and PNDM R111L and 206 + C; A45T variant at rs1801262 (... 204-208\n",
"12 WFS1 A transmembrane protein; a negative regulator ... WFS1, sometimes referred to as DIDMOAD R456 and H611, SNPS at rs10010131, rs6446482; ... 223-225\n",
"13 PPARG TF; master regulator of adipogenesis, energy b... Monogenic diabetes Monogenic Diabetes Genes ... Pro12Ala variant (rs1801282), SNP at rs4684847... 240-243, 250\n",
"14 INS Predominant glucose-lowering hormone PNDM (2nd most common cause), TNDM, MODY10 Class I alleles of INS VNTR associated with T1... 273, 274, 276-281\n",
"15 GLIS3 TF; regulator of islet development, insulin ge... Neonatal diabetes syndrome associated with con... rs7020673 G associated with T1D; rs7034200 A a... 78, 214, 289, 291, 292, 295-308\n"
]
}
],
"source": [
"lst = []\n",
"now = []\n",
"for i in df.index:\n",
" if not df.loc[i].isna().any():\n",
" if len(now) > 0:\n",
" lst.append(now)\n",
" now = []\n",
" now.append(i)\n",
"lst.append(now)\n",
"\n",
"df.loc[0] = df.loc[0].fillna('')\n",
"dfc = pd.DataFrame(columns=df.columns)\n",
"for l in lst:\n",
" rows = df.loc[l[0]]\n",
" for idx in l[1:]:\n",
" rows = rows + ' ' + df.loc[idx].fillna('')\n",
" rows = rows.str.strip()\n",
" dfc.loc[len(dfc)] = rows\n",
"\n",
"print(dfc)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Simple Filtering"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"118 [('rs5393', 'GLUT2'), ('rs5404', 'SNPS'), ('rs757210', 'HNF1B'), ('rs884614', 'SNPS'), ('rs2144908', 'MODY'), ('rs2144908', 'CHI'), ('rs4684847', 'T1D'), ('rs1884613', 'MODY'), ('rs1884613', 'CHI'), ('rs5393', 'SNPS'), ('rs734312', 'SNPS'), ('rs5394', 'GLUT2'), ('rs757210', 'TS4430796'), ('rs7041847', 'T1D'), ('rs6446482', 'SNPS'), ('rs7020673', 'GLIS3'), ('rs4684847', 'TZDS'), ('rs757210', 'PNDM'), ('rs5400', 'GLUT2'), ('rs7020673', 'T2D'), ('rs3818247', 'HNF4A'), ('rs4810424', 'MODY'), ('rs4810424', 'CHI'), ('rs10229583', 'R133W'), ('rs1801262', 'R111L'), ('rs1801262', 'BETA2'), ('rs10010131', 'SNPS'), ('rs10229583', 'MODY9'), ('rs5400', 'SNPS'), ('rs1801282', 'T1D'), ('rs2144908', 'HNF4A'), ('rs5393', 'FBS'), ('rs757210', 'RCAD'), ('rs10229583', 'R121W'), ('rs1801262', 'INS'), ('rs10010131', 'R456'), ('rs4684847', 'SNP'), ('rs7034200', 'T2D'), ('rs5404', 'T1101'), ('rs4607517', 'MODY'), ('rs1799884', 'MODY'), ('rs1799884', 'CHI'), ('rs4607517', 'PNDM'), ('rs6446482', 'WFS1'), ('rs1799884', 'PNDM'), ('rs5404', 'SLC2A2'), ('rs1801282', 'TZDS'), ('rs5404', 'T198T'), ('rs884614', 'MODY1'), ('rs734312', 'DIDMOAD'), ('rs5394', 'FBS'), ('rs4810424', 'HNF4A'), ('rs7020673', 'T1D'), ('rs757210', 'TCF2'), ('rs5393', 'T1101'), ('rs6446482', 'DIDMOAD'), ('rs1801262', 'A45T'), ('rs5394', 'SNPS'), ('rs5393', 'SLC2A2'), ('rs884614', 'CHI'), ('rs884614', 'MODY'), ('rs5393', 'T198T'), ('rs5400', 'FBS'), ('rs3818247', 'SNPS'), ('rs757210', 'SNP'), ('rs10229583', 'R37W'), ('rs10229583', 'PAX4'), ('rs4684847', 'T2D'), ('rs1801282', 'SNP'), ('rs7034200', 'GLIS3'), ('rs1884613', 'HNF4A'), ('rs4607517', 'GCK'), ('rs757210', 'TS7501939'), ('rs1799884', 'GCK'), ('rs10010131', 'DIDMOAD'), ('rs734312', 'WFS1'), ('rs2144908', 'SNPS'), ('rs5394', 'T198T'), ('rs4684847', 'PPARG'), ('rs734312', 'H611'), ('rs1801262', 'MODY6'), ('rs4607517', 'CHI'), ('rs7041847', 'T2D'), ('rs5404', 'GLUT2'), ('rs5400', 'T1101'), ('rs4607517', 'UTR'), ('rs1799884', 'UTR'), ('rs5400', 'SLC2A2'), ('rs6446482', 'H611'), ('rs5400', 'T198T'), ('rs1799884', 'SNP'), ('rs884614', 'HNF4A'), ('rs4810424', 'SNPS'), ('rs10010131', 'WFS1'), ('rs1801282', 'T2D'), ('rs10010131', 'H611'), ('rs1801262', 'PNDM'), ('rs4607517', 'SNP'), ('rs5394', 'T1101'), ('rs757210', 'TNDM'), ('rs4810424', 'MODY1'), ('rs1801282', 'PPARG'), ('rs7034200', 'T1D'), ('rs7041847', 'GLIS3'), ('rs4607517', 'MODY2'), ('rs5394', 'SLC2A2'), ('rs3818247', 'MODY1'), ('rs1799884', 'MODY2'), ('rs1884613', 'SNPS'), ('rs757210', 'MODY5'), ('rs734312', 'R456'), ('rs3818247', 'MODY'), ('rs3818247', 'CHI'), ('rs6446482', 'R456'), ('rs5404', 'FBS'), ('rs1801262', 'NEUROD1'), ('rs2144908', 'MODY1'), ('rs1884613', 'MODY1')]\n"
]
}
],
"source": [
"def filter(row):\n",
" concat = ' '.join(list(row))\n",
" snp = re.findall('rs\\d+', concat)\n",
" gene = re.findall('[A-Z][A-Z0-9]{2,}', concat)\n",
"\n",
" return snp, gene\n",
"\n",
"pairs = []\n",
"for i in dfc.index:\n",
" snp_gene = filter(dfc.loc[i])\n",
" pairs.extend(list(product(*snp_gene)))\n",
"\n",
"pairs = list(set(pairs))\n",
"print(len(pairs), pairs)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Df to JSON to LLM"
]
},
{
"cell_type": "code",
"execution_count": 53,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[\n",
" {\n",
" \"0\": \"\",\n",
" \"1\": \"\",\n",
" \"2\": \"Monogenic Diabetes or\",\n",
" \"3\": \"Associated With Common\",\n",
" \"4\": \"\"\n",
" },\n",
" {\n",
" \"0\": \"Gene Name\",\n",
" \"1\": \"Major Function\",\n",
" \"2\": \"Syndromes\",\n",
" \"3\": \"T1D and/or T2D\",\n",
" \"4\": \"Refs.\"\n",
" },\n",
" {\n",
" \"0\": \"KCNJ11\",\n",
" \"1\": \"Encodes pore-forming inwardly-rectifying potassium channel subunits (Kir6.2)\",\n",
" \"2\": \"PNDM (most common cause) and TNDM, CHI, MODY\",\n",
" \"3\": \"E23K\",\n",
" \"4\": \"42-46\"\n",
" },\n",
" {\n",
" \"0\": \"ABCC8\",\n",
" \"1\": \"Encodes regulatory SUR1 subunits\",\n",
" \"2\": \"PNDM and TNDM, CHI, MODY\",\n",
" \"3\": \"A1369S, 1273AGA, R1420H\",\n",
" \"4\": \"46,47,52\"\n",
" },\n",
" {\n",
" \"0\": \"GCK\",\n",
" \"1\": \"A key glucose-phosphoryating enzyme; a glucose sensor\",\n",
" \"2\": \"GCK-MODY (MODY2), PNDM, CHI\",\n",
" \"3\": \"rs1799884 (G/A), rs4607517 (A/G), 3'UTR SNP, chr7:44184184-G/A\",\n",
" \"4\": \"75,78,79\"\n",
" },\n",
" {\n",
" \"0\": \"SLC2A2\",\n",
" \"1\": \"Encodes GLUT2, a high-capacity facilitative glucose transporter\",\n",
" \"2\": \"FBS\",\n",
" \"3\": \"SNPS rs5393 (AA) and rs5394 (CC) in the promoter region and SNPS rs5400 (T1101) and rs5404 (T198T)\",\n",
" \"4\": \"93-100\"\n",
" },\n",
" {\n",
" \"0\": \"HNF1A/TCF1\",\n",
" \"1\": \"TF; regulator of pancreatic B-cell differentiation\",\n",
" \"2\": \"HNF1A-MODY (MODY3), most common cause of MODY, CHI\",\n",
" \"3\": \"G319S, C.1522G>A (p.E508K)\",\n",
" \"4\": \"114, 118, 119\"\n",
" },\n",
" {\n",
" \"0\": \"HNF4A\",\n",
" \"1\": \"Key TF for early fetal development\",\n",
" \"2\": \"HNF4A MODY (MODY1), CHI\",\n",
" \"3\": \"SNPS rs2144908, rs3818247 and rs884614, rs4810424, rs1884613\",\n",
" \"4\": \"121-124, 274\"\n",
" },\n",
" {\n",
" \"0\": \"HNF1B/TCF2\",\n",
" \"1\": \"TF; required for the generation of pancreatic and endocrine progenitors\",\n",
" \"2\": \"RCAD syndrome, or MODY5; TNDM and PNDM (rare)\",\n",
" \"3\": \"SNP rs757210 A, TS4430796 A, and TS7501939 C\",\n",
" \"4\": \"141, 144\"\n",
" },\n",
" {\n",
" \"0\": \"PDX1\",\n",
" \"1\": \"TF; required for pancreas development, B-cell differentiation and the maintenance of mature B-cell function\",\n",
" \"2\": \"PNDM, MODY4\",\n",
" \"3\": \"C18R, Q59L, D76N, R197H, G212R, P239Q, InsCCG243, p.Gly218Alafs*12\",\n",
" \"4\": \"163-165, 167\"\n",
" },\n",
" {\n",
" \"0\": \"PAX4\",\n",
" \"1\": \"Islet TF that functions mainly as a transcription repressor\",\n",
" \"2\": \"MODY9\",\n",
" \"3\": \"R121W, R133W, R37W, rs10229583 G\",\n",
" \"4\": \"180, 181, 187\"\n",
" },\n",
" {\n",
" \"0\": \"NEUROD1/BETA2\",\n",
" \"1\": \"TF; required for the development of the endocrine pancreas; transactivates the INS gene\",\n",
" \"2\": \"MODY6 and PNDM\",\n",
" \"3\": \"R111L and 206 + C; A45T variant at rs1801262 (inconsistent)\",\n",
" \"4\": \"204-208\"\n",
" },\n",
" {\n",
" \"0\": \"WFS1\",\n",
" \"1\": \"A transmembrane protein; a negative regulator of ER stress\",\n",
" \"2\": \"WFS1, sometimes referred to as DIDMOAD\",\n",
" \"3\": \"R456 and H611, SNPS at rs10010131, rs6446482; variants rs10010131 G, 1801213 G, and rs734312 A\",\n",
" \"4\": \"223-225\"\n",
" },\n",
" {\n",
" \"0\": \"PPARG\",\n",
" \"1\": \"TF; master regulator of adipogenesis, energy balance, lipid biosynthesis, and insulin sensitivity; cellular target of TZDS Monogenic Diabetes Genes Associated With Both Common T1D and T2D\",\n",
" \"2\": \"Monogenic diabetes Monogenic Diabetes Genes Associated With Both Common T1D and T2D\",\n",
" \"3\": \"Pro12Ala variant (rs1801282), SNP at rs4684847 Monogenic Diabetes Genes Associated With Both Common T1D and T2D\",\n",
" \"4\": \"240-243, 250\"\n",
" },\n",
" {\n",
" \"0\": \"INS\",\n",
" \"1\": \"Predominant glucose-lowering hormone\",\n",
" \"2\": \"PNDM (2nd most common cause), TNDM, MODY10\",\n",
" \"3\": \"Class I alleles of INS VNTR associated with T1D; Class IIl alleles of INS VNTR inconsistently associated with T2D\",\n",
" \"4\": \"273, 274, 276-281\"\n",
" },\n",
" {\n",
" \"0\": \"GLIS3\",\n",
" \"1\": \"TF; regulator of islet development, insulin gene transcription, and obesity-induced compensatory B-cell proliferation\",\n",
" \"2\": \"Neonatal diabetes syndrome associated with congenital hypothyroidism, and polycystic kidneys\",\n",
" \"3\": \"rs7020673 G associated with T1D; rs7034200 A and rs7041847 A associated with T2D\",\n",
" \"4\": \"78, 214, 289, 291, 292, 295-308\"\n",
" }\n",
"]\n"
]
}
],
"source": [
"import json\n",
"\n",
"js = dfc.to_json(orient='records')\n",
"\n",
"df_str = json.dumps(json.loads(js), indent=2)\n",
"print(df_str)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"from langchain_openai import ChatOpenAI\n",
"\n",
"llm = ChatOpenAI(model=\"gpt-3.5-turbo\")"
]
},
{
"cell_type": "code",
"execution_count": 76,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[\n",
" {\n",
" \"Genes\": \"KCNJ11\",\n",
" \"SNPs\": [\"E23K\"],\n",
" \"Diseases\": \"PNDM (most common cause) and TNDM, CHI, MODY\"\n",
" },\n",
" {\n",
" \"Genes\": \"ABCC8\",\n",
" \"SNPs\": [\"A1369S\", \"1273AGA\", \"R1420H\"],\n",
" \"Diseases\": \"PNDM and TNDM, CHI, MODY\"\n",
" },\n",
" {\n",
" \"Genes\": \"GCK\",\n",
" \"SNPs\": [\"rs1799884 (G/A)\", \"rs4607517 (A/G)\", \"3'UTR SNP\", \"chr7:44184184-G/A\"],\n",
" \"Diseases\": \"GCK-MODY (MODY2), PNDM, CHI\"\n",
" },\n",
" {\n",
" \"Genes\": \"SLC2A2\",\n",
" \"SNPs\": [\"rs5393 (AA)\", \"rs5394 (CC)\", \"rs5400 (T1101)\", \"rs5404 (T198T)\"],\n",
" \"Diseases\": \"FBS\"\n",
" },\n",
" {\n",
" \"Genes\": \"HNF1A/TCF1\",\n",
" \"SNPs\": [\"G319S\", \"C.1522G>A (p.E508K)\"],\n",
" \"Diseases\": \"HNF1A-MODY (MODY3), most common cause of MODY, CHI\"\n",
" },\n",
" {\n",
" \"Genes\": \"HNF4A\",\n",
" \"SNPs\": [\"rs2144908\", \"rs3818247\", \"rs884614\", \"rs4810424\", \"rs1884613\"],\n",
" \"Diseases\": \"HNF4A MODY (MODY1), CHI\"\n",
" },\n",
" {\n",
" \"Genes\": \"HNF1B/TCF2\",\n",
" \"SNPs\": [\"rs757210 A\", \"TS4430796 A\", \"TS7501939 C\"],\n",
" \"Diseases\": \"RCAD syndrome, or MODY5; TNDM and PNDM (rare)\"\n",
" },\n",
" {\n",
" \"Genes\": \"PDX1\",\n",
" \"SNPs\": [\"C18R\", \"Q59L\", \"D76N\", \"R197H\", \"G212R\", \"P239Q\", \"InsCCG243\", \"p.Gly218Alafs*12\"],\n",
" \"Diseases\": \"PNDM, MODY4\"\n",
" },\n",
" {\n",
" \"Genes\": \"PAX4\",\n",
" \"SNPs\": [\"R121W\", \"R133W\", \"R37W\", \"rs10229583 G\"],\n",
" \"Diseases\": \"MODY9\"\n",
" },\n",
" {\n",
" \"Genes\": \"NEUROD1/BETA2\",\n",
" \"SNPs\": [\"R111L\", \"206 + C\", \"A45T variant at rs1801262 (inconsistent)\"],\n",
" \"Diseases\": \"MODY6 and PNDM\"\n",
" },\n",
" {\n",
" \"Genes\": \"WFS1\",\n",
" \"SNPs\": [\"R456\", \"H611\", \"rs10010131\", \"rs6446482\", \"rs10010131 G\", \"1801213 G\", \"rs734312 A\"],\n",
" \"Diseases\": \"WFS1, sometimes referred to as DIDMOAD\"\n",
" },\n",
" {\n",
" \"Genes\": \"PPARG\",\n",
" \"SNPs\": [\"Pro12Ala variant (rs1801282)\", \"SNP at rs4684847\"],\n",
" \"Diseases\": \"Monogenic diabetes Monogenic Diabetes Genes Associated With Both Common T1D and T2D\"\n",
" },\n",
" {\n",
" \"Genes\": \"INS\",\n",
" \"SNPs\": [\"Class I alleles of INS VNTR associated with T1D\", \"Class IIl alleles of INS VNTR inconsistently associated with T2D\"],\n",
" \"Diseases\": \"PNDM (2nd most common cause), TNDM, MODY10\"\n",
" },\n",
" {\n",
" \"Genes\": \"GLIS3\",\n",
" \"SNPs\": [\"rs7020673 G\", \"rs7034200 A\", \"rs7041847 A\"],\n",
" \"Diseases\": \"Neonatal diabetes syndrome associated with congenital hypothyroidism, and polycystic kidneys\"\n",
" }\n",
"]\n"
]
}
],
"source": [
"prompt = \"\"\"\n",
"# CONTEXT #\n",
"In my capacity as a genomics specialist, I have table data obtained from a published research paper in the field of genomics. The data is provided in a list of JSONs format, with each JSON object representing a single row in a tabular structure. The first JSON element in the list represents the header row of the table, containing the names of each column.\n",
"This is the data:\n",
"{}\n",
"\n",
"# OBJECTIVE #\n",
"Given the provided table data, the following tasks need to be completed:\n",
"\n",
"1. Identify all unique gene names present within the table. Each row can contains more than one gene name.\n",
"2. If present, extract any entries starting with \"rs\" (presumably representing Single Nucleotide Polymorphisms or rsIDs) that correspond to the same row as their associated gene names. Each gene name can correspond with more than one SNPs.\n",
"3. If available, extract any disease information associated with both the gene name and its corresponding SNP/rsID.\n",
"\n",
"It is crucial to maintain the utmost accuracy in this process, as any false or fabricated information (hallucination) can have severe consequences for academic integrity and research credibility.\n",
"If an SNPs or Diseases is absent from the table, leave the corresponding field blank with an empty string ('').\n",
"\n",
"# RESPPOSE #\n",
"The output should be a string containing list of JSON objects, each representing an entry with the following structure:\n",
"[\n",
" {{\n",
" \"Genes\": \"A\",\n",
" \"SNPs\": [\"rs123\", \"rs456\"],\n",
" \"Diseases\": \"A, B, C\"\n",
" }}\n",
"]\n",
"\"\"\"\n",
"\n",
"result = llm.invoke(prompt.format(df_str))\n",
"print(result.content)"
]
},
{
"cell_type": "code",
"execution_count": 77,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[{'Genes': 'KCNJ11',\n",
" 'SNPs': ['E23K'],\n",
" 'Diseases': 'PNDM (most common cause) and TNDM, CHI, MODY'},\n",
" {'Genes': 'ABCC8',\n",
" 'SNPs': ['A1369S', '1273AGA', 'R1420H'],\n",
" 'Diseases': 'PNDM and TNDM, CHI, MODY'},\n",
" {'Genes': 'GCK',\n",
" 'SNPs': ['rs1799884 (G/A)',\n",
" 'rs4607517 (A/G)',\n",
" \"3'UTR SNP\",\n",
" 'chr7:44184184-G/A'],\n",
" 'Diseases': 'GCK-MODY (MODY2), PNDM, CHI'},\n",
" {'Genes': 'SLC2A2',\n",
" 'SNPs': ['rs5393 (AA)', 'rs5394 (CC)', 'rs5400 (T1101)', 'rs5404 (T198T)'],\n",
" 'Diseases': 'FBS'},\n",
" {'Genes': 'HNF1A/TCF1',\n",
" 'SNPs': ['G319S', 'C.1522G>A (p.E508K)'],\n",
" 'Diseases': 'HNF1A-MODY (MODY3), most common cause of MODY, CHI'},\n",
" {'Genes': 'HNF4A',\n",
" 'SNPs': ['rs2144908', 'rs3818247', 'rs884614', 'rs4810424', 'rs1884613'],\n",
" 'Diseases': 'HNF4A MODY (MODY1), CHI'},\n",
" {'Genes': 'HNF1B/TCF2',\n",
" 'SNPs': ['rs757210 A', 'TS4430796 A', 'TS7501939 C'],\n",
" 'Diseases': 'RCAD syndrome, or MODY5; TNDM and PNDM (rare)'},\n",
" {'Genes': 'PDX1',\n",
" 'SNPs': ['C18R',\n",
" 'Q59L',\n",
" 'D76N',\n",
" 'R197H',\n",
" 'G212R',\n",
" 'P239Q',\n",
" 'InsCCG243',\n",
" 'p.Gly218Alafs*12'],\n",
" 'Diseases': 'PNDM, MODY4'},\n",
" {'Genes': 'PAX4',\n",
" 'SNPs': ['R121W', 'R133W', 'R37W', 'rs10229583 G'],\n",
" 'Diseases': 'MODY9'},\n",
" {'Genes': 'NEUROD1/BETA2',\n",
" 'SNPs': ['R111L', '206 + C', 'A45T variant at rs1801262 (inconsistent)'],\n",
" 'Diseases': 'MODY6 and PNDM'},\n",
" {'Genes': 'WFS1',\n",
" 'SNPs': ['R456',\n",
" 'H611',\n",
" 'rs10010131',\n",
" 'rs6446482',\n",
" 'rs10010131 G',\n",
" '1801213 G',\n",
" 'rs734312 A'],\n",
" 'Diseases': 'WFS1, sometimes referred to as DIDMOAD'},\n",
" {'Genes': 'PPARG',\n",
" 'SNPs': ['Pro12Ala variant (rs1801282)', 'SNP at rs4684847'],\n",
" 'Diseases': 'Monogenic diabetes Monogenic Diabetes Genes Associated With Both Common T1D and T2D'},\n",
" {'Genes': 'INS',\n",
" 'SNPs': ['Class I alleles of INS VNTR associated with T1D',\n",
" 'Class IIl alleles of INS VNTR inconsistently associated with T2D'],\n",
" 'Diseases': 'PNDM (2nd most common cause), TNDM, MODY10'},\n",
" {'Genes': 'GLIS3',\n",
" 'SNPs': ['rs7020673 G', 'rs7034200 A', 'rs7041847 A'],\n",
" 'Diseases': 'Neonatal diabetes syndrome associated with congenital hypothyroidism, and polycystic kidneys'}]"
]
},
"execution_count": 77,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"lst_result = eval(result.content)\n",
"lst_result"
]
},
{
"cell_type": "code",
"execution_count": 82,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['PNDM (most common cause) and TNDM, CHI, MODY',\n",
" 'PNDM and TNDM, CHI, MODY',\n",
" 'PNDM and TNDM, CHI, MODY',\n",
" 'PNDM and TNDM, CHI, MODY',\n",
" 'GCK-MODY (MODY2), PNDM, CHI',\n",
" 'GCK-MODY (MODY2), PNDM, CHI',\n",
" 'GCK-MODY (MODY2), PNDM, CHI',\n",
" 'GCK-MODY (MODY2), PNDM, CHI',\n",
" 'FBS',\n",
" 'FBS',\n",
" 'FBS',\n",
" 'FBS',\n",
" 'HNF1A-MODY (MODY3), most common cause of MODY, CHI',\n",
" 'HNF1A-MODY (MODY3), most common cause of MODY, CHI',\n",
" 'HNF4A MODY (MODY1), CHI',\n",
" 'HNF4A MODY (MODY1), CHI',\n",
" 'HNF4A MODY (MODY1), CHI',\n",
" 'HNF4A MODY (MODY1), CHI',\n",
" 'HNF4A MODY (MODY1), CHI',\n",
" 'RCAD syndrome, or MODY5; TNDM and PNDM (rare)',\n",
" 'RCAD syndrome, or MODY5; TNDM and PNDM (rare)',\n",
" 'RCAD syndrome, or MODY5; TNDM and PNDM (rare)',\n",
" 'PNDM, MODY4',\n",
" 'PNDM, MODY4',\n",
" 'PNDM, MODY4',\n",
" 'PNDM, MODY4',\n",
" 'PNDM, MODY4',\n",
" 'PNDM, MODY4',\n",
" 'PNDM, MODY4',\n",
" 'PNDM, MODY4',\n",
" 'MODY9',\n",
" 'MODY9',\n",
" 'MODY9',\n",
" 'MODY9',\n",
" 'MODY6 and PNDM',\n",
" 'MODY6 and PNDM',\n",
" 'MODY6 and PNDM',\n",
" 'WFS1, sometimes referred to as DIDMOAD',\n",
" 'WFS1, sometimes referred to as DIDMOAD',\n",
" 'WFS1, sometimes referred to as DIDMOAD',\n",
" 'WFS1, sometimes referred to as DIDMOAD',\n",
" 'WFS1, sometimes referred to as DIDMOAD',\n",
" 'WFS1, sometimes referred to as DIDMOAD',\n",
" 'WFS1, sometimes referred to as DIDMOAD',\n",
" 'Monogenic diabetes Monogenic Diabetes Genes Associated With Both Common T1D and T2D',\n",
" 'Monogenic diabetes Monogenic Diabetes Genes Associated With Both Common T1D and T2D',\n",
" 'PNDM (2nd most common cause), TNDM, MODY10',\n",
" 'PNDM (2nd most common cause), TNDM, MODY10',\n",
" 'Neonatal diabetes syndrome associated with congenital hypothyroidism, and polycystic kidneys',\n",
" 'Neonatal diabetes syndrome associated with congenital hypothyroidism, and polycystic kidneys',\n",
" 'Neonatal diabetes syndrome associated with congenital hypothyroidism, and polycystic kidneys']"
]
},
"execution_count": 82,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"res_gene = []\n",
"res_snp = []\n",
"res_disease = []\n",
"\n",
"for res in lst_result:\n",
" gene = res['Genes']\n",
" snps = res['SNPs']\n",
" disease = res['Diseases']\n",
"\n",
" for snp in snps:\n",
" res_gene.append(gene)\n",
" res_snp.append(snp)\n",
" res_disease.append(disease)\n",
"\n",
"res_disease\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"st = '```json\\n[\\n {\\n \"Genes\": \"BCLIIA\",\\n \"SNPs\": [\"rs243021\"],\\n \"Diseases\": \"\"\\n },\\n {\\n \"Genes\": \"ZBED3\",\\n \"SNPs\": [\"rs4457053\"],\\n \"Diseases\": \"\"\\n },\\n {\\n \"Genes\": \"KLF14\",\\n \"SNPs\": [\"rs972283\"],\\n \"Diseases\": \"\"\\n },\\n {\\n \"Genes\": \"TP53INP1\",\\n \"SNPs\": [\"rs896854\"],\\n \"Diseases\": \"\"\\n },\\n {\\n \"Genes\": \"CHCHD9\",\\n \"SNPs\": [\"rs13292136\"],\\n \"Diseases\": \"\"\\n },\\n {\\n \"Genes\": \"KCNQI\",\\n \"SNPs\": [\"rs231362\"],\\n \"Diseases\": \"\"\\n },\\n {\\n \"Genes\": \"CENTD2\",\\n \"SNPs\": [\"rs1552224\"],\\n \"Diseases\": \"\"\\n },\\n {\\n \"Genes\": \"HMGA2\",\\n \"SNPs\": [\"rs15313432\"],\\n \"Diseases\": \"\"\\n },\\n {\\n \"Genes\": \"HNFIA\",\\n \"SNPs\": [\"rs7957197\"],\\n \"Diseases\": \"\"\\n },\\n {\\n \"Genes\": \"ZFAND6\",\\n \"SNPs\": [\"rsl1634397\"],\\n \"Diseases\": \"\"\\n },\\n {\\n \"Genes\": \"PRCI\",\\n \"SNPs\": [\"rs8042680\"],\\n \"Diseases\": \"\"\\n },\\n {\\n \"Genes\": \"DUSP9\",\\n \"SNPs\": [\"rs5945326\"],\\n \"Diseases\": \"\"\\n },\\n {\\n \"Genes\": \"IRSI\",\\n \"SNPs\": [\"rs7578326\"],\\n \"Diseases\": \"\"\\n },\\n {\\n \"Genes\": \"MTNRIB\",\\n \"SNPs\": [\"rs1387153\"],\\n \"Diseases\": \"\"\\n }\\n]\\n```'"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'[\\n {\\n \"Genes\": \"BCLIIA\",\\n \"SNPs\": [\"rs243021\"],\\n \"Diseases\": \"\"\\n },\\n {\\n \"Genes\": \"ZBED3\",\\n \"SNPs\": [\"rs4457053\"],\\n \"Diseases\": \"\"\\n },\\n {\\n \"Genes\": \"KLF14\",\\n \"SNPs\": [\"rs972283\"],\\n \"Diseases\": \"\"\\n },\\n {\\n \"Genes\": \"TP53INP1\",\\n \"SNPs\": [\"rs896854\"],\\n \"Diseases\": \"\"\\n },\\n {\\n \"Genes\": \"CHCHD9\",\\n \"SNPs\": [\"rs13292136\"],\\n \"Diseases\": \"\"\\n },\\n {\\n \"Genes\": \"KCNQI\",\\n \"SNPs\": [\"rs231362\"],\\n \"Diseases\": \"\"\\n },\\n {\\n \"Genes\": \"CENTD2\",\\n \"SNPs\": [\"rs1552224\"],\\n \"Diseases\": \"\"\\n },\\n {\\n \"Genes\": \"HMGA2\",\\n \"SNPs\": [\"rs15313432\"],\\n \"Diseases\": \"\"\\n },\\n {\\n \"Genes\": \"HNFIA\",\\n \"SNPs\": [\"rs7957197\"],\\n \"Diseases\": \"\"\\n },\\n {\\n \"Genes\": \"ZFAND6\",\\n \"SNPs\": [\"rsl1634397\"],\\n \"Diseases\": \"\"\\n },\\n {\\n \"Genes\": \"PRCI\",\\n \"SNPs\": [\"rs8042680\"],\\n \"Diseases\": \"\"\\n },\\n {\\n \"Genes\": \"DUSP9\",\\n \"SNPs\": [\"rs5945326\"],\\n \"Diseases\": \"\"\\n },\\n {\\n \"Genes\": \"IRSI\",\\n \"SNPs\": [\"rs7578326\"],\\n \"Diseases\": \"\"\\n },\\n {\\n \"Genes\": \"MTNRIB\",\\n \"SNPs\": [\"rs1387153\"],\\n \"Diseases\": \"\"\\n }\\n'"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"st[st.find('['):st.rfind(']')+1]"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2 1-s2.0-S0002916523016155-main.pdf\n",
"3 1329.pdf\n",
"4 41467_2020_Article_15421.pdf\n",
"4 berndt2013.pdf\n",
"5 BMD.pdf\n",
"3 clock and eat timing.pdf\n",
"2 COMT breast cancer metaanalysis chinese.pdf\n",
"1 COSTAR CHATGPTPrompt _ Towards Data Science.pdf\n",
"3 dubois2010.pdf\n",
"3 EMMM-8-688.pdf\n",
"6 EMS120610.pdf\n",
"6 file.pdf\n",
"3 journal.pbio.3001547.pdf\n",
"3 lipid.pdf\n",
"7 monogenic diabetes.pdf\n",
"3 nihms-1651539.pdf\n",
"5 nihms-1792335.pdf\n",
"6 nihms-668049.pdf\n",
"4 nihms364577.pdf\n",
"4 nihms510594.pdf\n",
"4 pgen.1009952.pdf\n",
"3 PIIS0091674919313661.pdf\n",
"3 s12881-019-0830-y.pdf\n",
"4 s41576-021-00414-z (1).pdf\n",
"3 s41588-018-0047-6.pdf\n",
"8 s41588-022-01024-z (1).pdf\n",
"4 stroke genetic AHA.pdf\n",
"5 surendran2016.pdf\n",
"3 teslovich2010.pdf\n",
"1 ukmss-34421-testing.pdf\n",
"3 ukmss-34421.pdf\n",
"3 wightman2021.pdf\n"
]
}
],
"source": [
"from langchain_community.document_loaders.pdf import PyPDFLoader\n",
"from langchain_core.documents.base import Document\n",
"from langchain_text_splitters import TokenTextSplitter\n",
"import os\n",
"\n",
"for file in os.listdir('../NutriGenMe-Testing/'):\n",
"\n",
" if file[-4:] != '.pdf':\n",
" continue\n",
" loader = PyPDFLoader(f\"../NutriGenMe-Testing/{file}\")\n",
" pages = loader.load()\n",
"\n",
" docs = [Document('\\n'.join([page.page_content for page in pages]))]\n",
" docs[0].metadata = {'source': pages[0].metadata['source']}\n",
"\n",
" text_splitter = TokenTextSplitter.from_tiktoken_encoder(\n",
" chunk_size=8000, chunk_overlap=0\n",
" )\n",
" chunks = text_splitter.split_documents(docs)\n",
" print(len(chunks), file)"
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Unnamed: 0 | \n",
" Genes | \n",
" SNPs | \n",
" Diseases | \n",
" Title | \n",
" Authors | \n",
" Publisher Name | \n",
" Publication Year | \n",
" Population | \n",
" Sample Size | \n",
" Study Methodology | \n",
" Study Level | \n",
" Conclusion | \n",
"
\n",
" \n",
" \n",
" \n",
" 36 | \n",
" 36 | \n",
" PAX4 | \n",
" S1369A | \n",
" hyperinsulinemic hypoglycemia | \n",
" Monogenic Diabetes: What It Teaches Us on the ... | \n",
" Yisheng Yang and Lawrence Chan | \n",
" Endocrine Reviews | \n",
" 2016 | \n",
" Canadian Oji-Cree, Latino, Finnish, Ashkenazi, UK | \n",
" Not Specified | \n",
" whole-exome sequencing, case-control and famil... | \n",
" Not Specified | \n",
" The study delves into the genetic intricacies ... | \n",
"
\n",
" \n",
" 37 | \n",
" 37 | \n",
" NEUROD1 | \n",
" E23K | \n",
" Wolfram syndrome | \n",
" Monogenic Diabetes: What It Teaches Us on the ... | \n",
" Yisheng Yang and Lawrence Chan | \n",
" Endocrine Reviews | \n",
" 2016 | \n",
" Canadian Oji-Cree, Latino, Finnish, Ashkenazi, UK | \n",
" Not Specified | \n",
" whole-exome sequencing, case-control and famil... | \n",
" Not Specified | \n",
" The study delves into the genetic intricacies ... | \n",
"
\n",
" \n",
" 38 | \n",
" 38 | \n",
" WFS1 | \n",
" Pro12Ala | \n",
" NaN | \n",
" Monogenic Diabetes: What It Teaches Us on the ... | \n",
" Yisheng Yang and Lawrence Chan | \n",
" Endocrine Reviews | \n",
" 2016 | \n",
" Canadian Oji-Cree, Latino, Finnish, Ashkenazi, UK | \n",
" Not Specified | \n",
" whole-exome sequencing, case-control and famil... | \n",
" Not Specified | \n",
" The study delves into the genetic intricacies ... | \n",
"
\n",
" \n",
" 39 | \n",
" 39 | \n",
" KIR6.2 | \n",
" NaN | \n",
" NaN | \n",
" Monogenic Diabetes: What It Teaches Us on the ... | \n",
" Yisheng Yang and Lawrence Chan | \n",
" Endocrine Reviews | \n",
" 2016 | \n",
" Canadian Oji-Cree, Latino, Finnish, Ashkenazi, UK | \n",
" Not Specified | \n",
" whole-exome sequencing, case-control and famil... | \n",
" Not Specified | \n",
" The study delves into the genetic intricacies ... | \n",
"
\n",
" \n",
" 40 | \n",
" 40 | \n",
" GLUT2 | \n",
" NaN | \n",
" NaN | \n",
" Monogenic Diabetes: What It Teaches Us on the ... | \n",
" Yisheng Yang and Lawrence Chan | \n",
" Endocrine Reviews | \n",
" 2016 | \n",
" Canadian Oji-Cree, Latino, Finnish, Ashkenazi, UK | \n",
" Not Specified | \n",
" whole-exome sequencing, case-control and famil... | \n",
" Not Specified | \n",
" The study delves into the genetic intricacies ... | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Unnamed: 0 Genes SNPs Diseases \\\n",
"36 36 PAX4 S1369A hyperinsulinemic hypoglycemia \n",
"37 37 NEUROD1 E23K Wolfram syndrome \n",
"38 38 WFS1 Pro12Ala NaN \n",
"39 39 KIR6.2 NaN NaN \n",
"40 40 GLUT2 NaN NaN \n",
"\n",
" Title \\\n",
"36 Monogenic Diabetes: What It Teaches Us on the ... \n",
"37 Monogenic Diabetes: What It Teaches Us on the ... \n",
"38 Monogenic Diabetes: What It Teaches Us on the ... \n",
"39 Monogenic Diabetes: What It Teaches Us on the ... \n",
"40 Monogenic Diabetes: What It Teaches Us on the ... \n",
"\n",
" Authors Publisher Name Publication Year \\\n",
"36 Yisheng Yang and Lawrence Chan Endocrine Reviews 2016 \n",
"37 Yisheng Yang and Lawrence Chan Endocrine Reviews 2016 \n",
"38 Yisheng Yang and Lawrence Chan Endocrine Reviews 2016 \n",
"39 Yisheng Yang and Lawrence Chan Endocrine Reviews 2016 \n",
"40 Yisheng Yang and Lawrence Chan Endocrine Reviews 2016 \n",
"\n",
" Population Sample Size \\\n",
"36 Canadian Oji-Cree, Latino, Finnish, Ashkenazi, UK Not Specified \n",
"37 Canadian Oji-Cree, Latino, Finnish, Ashkenazi, UK Not Specified \n",
"38 Canadian Oji-Cree, Latino, Finnish, Ashkenazi, UK Not Specified \n",
"39 Canadian Oji-Cree, Latino, Finnish, Ashkenazi, UK Not Specified \n",
"40 Canadian Oji-Cree, Latino, Finnish, Ashkenazi, UK Not Specified \n",
"\n",
" Study Methodology Study Level \\\n",
"36 whole-exome sequencing, case-control and famil... Not Specified \n",
"37 whole-exome sequencing, case-control and famil... Not Specified \n",
"38 whole-exome sequencing, case-control and famil... Not Specified \n",
"39 whole-exome sequencing, case-control and famil... Not Specified \n",
"40 whole-exome sequencing, case-control and famil... Not Specified \n",
"\n",
" Conclusion \n",
"36 The study delves into the genetic intricacies ... \n",
"37 The study delves into the genetic intricacies ... \n",
"38 The study delves into the genetic intricacies ... \n",
"39 The study delves into the genetic intricacies ... \n",
"40 The study delves into the genetic intricacies ... "
]
},
"execution_count": 52,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pandas as pd\n",
"df = pd.read_excel('monogenic diabetes_16000.xlsx', sheet_name='Original')\n",
"df.tail()"
]
},
{
"cell_type": "code",
"execution_count": 53,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Unnamed: 0 | \n",
" Genes | \n",
" SNPs | \n",
" Diseases | \n",
" Title | \n",
" Authors | \n",
" Publisher Name | \n",
" Publication Year | \n",
" Population | \n",
" Sample Size | \n",
" Study Methodology | \n",
" Study Level | \n",
" Conclusion | \n",
"
\n",
" \n",
" \n",
" \n",
" 33 | \n",
" 33 | \n",
" HNF4A | \n",
" rs10229583 | \n",
" neonatal diabetes mellitus | \n",
" Monogenic Diabetes: What It Teaches Us on the ... | \n",
" Yisheng Yang and Lawrence Chan | \n",
" Endocrine Reviews | \n",
" 2016 | \n",
" Canadian Oji-Cree, Latino, Finnish, Ashkenazi, UK | \n",
" Not Specified | \n",
" whole-exome sequencing, case-control and famil... | \n",
" Not Specified | \n",
" The study delves into the genetic intricacies ... | \n",
"
\n",
" \n",
" 34 | \n",
" 34 | \n",
" HNF1B | \n",
" rs6467136 | \n",
" maturity-onset diabetes of the young | \n",
" Monogenic Diabetes: What It Teaches Us on the ... | \n",
" Yisheng Yang and Lawrence Chan | \n",
" Endocrine Reviews | \n",
" 2016 | \n",
" Canadian Oji-Cree, Latino, Finnish, Ashkenazi, UK | \n",
" Not Specified | \n",
" whole-exome sequencing, case-control and famil... | \n",
" Not Specified | \n",
" The study delves into the genetic intricacies ... | \n",
"
\n",
" \n",
" 35 | \n",
" 35 | \n",
" PDX1 | \n",
" rs1801262 | \n",
" permanent neonatal diabetes | \n",
" Monogenic Diabetes: What It Teaches Us on the ... | \n",
" Yisheng Yang and Lawrence Chan | \n",
" Endocrine Reviews | \n",
" 2016 | \n",
" Canadian Oji-Cree, Latino, Finnish, Ashkenazi, UK | \n",
" Not Specified | \n",
" whole-exome sequencing, case-control and famil... | \n",
" Not Specified | \n",
" The study delves into the genetic intricacies ... | \n",
"
\n",
" \n",
" 36 | \n",
" 39 | \n",
" KIR6.2 | \n",
" | \n",
" | \n",
" Monogenic Diabetes: What It Teaches Us on the ... | \n",
" Yisheng Yang and Lawrence Chan | \n",
" Endocrine Reviews | \n",
" 2016 | \n",
" Canadian Oji-Cree, Latino, Finnish, Ashkenazi, UK | \n",
" Not Specified | \n",
" whole-exome sequencing, case-control and famil... | \n",
" Not Specified | \n",
" The study delves into the genetic intricacies ... | \n",
"
\n",
" \n",
" 37 | \n",
" 40 | \n",
" GLUT2 | \n",
" | \n",
" | \n",
" Monogenic Diabetes: What It Teaches Us on the ... | \n",
" Yisheng Yang and Lawrence Chan | \n",
" Endocrine Reviews | \n",
" 2016 | \n",
" Canadian Oji-Cree, Latino, Finnish, Ashkenazi, UK | \n",
" Not Specified | \n",
" whole-exome sequencing, case-control and famil... | \n",
" Not Specified | \n",
" The study delves into the genetic intricacies ... | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Unnamed: 0 Genes SNPs Diseases \\\n",
"33 33 HNF4A rs10229583 neonatal diabetes mellitus \n",
"34 34 HNF1B rs6467136 maturity-onset diabetes of the young \n",
"35 35 PDX1 rs1801262 permanent neonatal diabetes \n",
"36 39 KIR6.2 \n",
"37 40 GLUT2 \n",
"\n",
" Title \\\n",
"33 Monogenic Diabetes: What It Teaches Us on the ... \n",
"34 Monogenic Diabetes: What It Teaches Us on the ... \n",
"35 Monogenic Diabetes: What It Teaches Us on the ... \n",
"36 Monogenic Diabetes: What It Teaches Us on the ... \n",
"37 Monogenic Diabetes: What It Teaches Us on the ... \n",
"\n",
" Authors Publisher Name Publication Year \\\n",
"33 Yisheng Yang and Lawrence Chan Endocrine Reviews 2016 \n",
"34 Yisheng Yang and Lawrence Chan Endocrine Reviews 2016 \n",
"35 Yisheng Yang and Lawrence Chan Endocrine Reviews 2016 \n",
"36 Yisheng Yang and Lawrence Chan Endocrine Reviews 2016 \n",
"37 Yisheng Yang and Lawrence Chan Endocrine Reviews 2016 \n",
"\n",
" Population Sample Size \\\n",
"33 Canadian Oji-Cree, Latino, Finnish, Ashkenazi, UK Not Specified \n",
"34 Canadian Oji-Cree, Latino, Finnish, Ashkenazi, UK Not Specified \n",
"35 Canadian Oji-Cree, Latino, Finnish, Ashkenazi, UK Not Specified \n",
"36 Canadian Oji-Cree, Latino, Finnish, Ashkenazi, UK Not Specified \n",
"37 Canadian Oji-Cree, Latino, Finnish, Ashkenazi, UK Not Specified \n",
"\n",
" Study Methodology Study Level \\\n",
"33 whole-exome sequencing, case-control and famil... Not Specified \n",
"34 whole-exome sequencing, case-control and famil... Not Specified \n",
"35 whole-exome sequencing, case-control and famil... Not Specified \n",
"36 whole-exome sequencing, case-control and famil... Not Specified \n",
"37 whole-exome sequencing, case-control and famil... Not Specified \n",
"\n",
" Conclusion \n",
"33 The study delves into the genetic intricacies ... \n",
"34 The study delves into the genetic intricacies ... \n",
"35 The study delves into the genetic intricacies ... \n",
"36 The study delves into the genetic intricacies ... \n",
"37 The study delves into the genetic intricacies ... "
]
},
"execution_count": 53,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import re\n",
"df = df.fillna('')\n",
"for i in df.index:\n",
" snp = df.loc[i, 'SNPs'].lower()\n",
" flag = True\n",
" # print(snp)\n",
" if not re.fullmatch('rs(\\d)+|', snp):\n",
" if not re.fullmatch('s(\\d)+', snp):\n",
" if not re.fullmatch('(\\d)+', snp):\n",
" flag = False\n",
" else:\n",
" snp = 'rs' + snp\n",
" else:\n",
" snp = 'r' + snp\n",
" \n",
" if not flag:\n",
" df = df.drop(i)\n",
" else:\n",
" df.loc[i, 'SNPs'] = snp\n",
"\n",
"df = df.reset_index(drop=True)\n",
"df.tail()"
]
},
{
"cell_type": "code",
"execution_count": 59,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['135457029', '1029', '1432', '3630', '155030', '1576', '5071', '6934', '3039', '5599', '2308', '3643', '5111', '3172', '6927', '116519', '15376', '3767', '18609', '2645']"
]
},
"execution_count": 59,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from Bio import Entrez, Seq\n",
"\n",
"Entrez.email = \"fadliaulawia@gmail.com\"\n",
"\n",
"handle = Entrez.esearch(db=\"gene\", term='GCK [All Fields]')\n",
"record = Entrez.read(handle)\n",
"record['IdList']"
]
},
{
"cell_type": "code",
"execution_count": 54,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0\n",
"YES\n",
"1\n",
"GCK rs4607517 NO\n",
"2\n",
"YES\n",
"3\n",
"YES\n",
"4\n",
"YES\n",
"5\n",
"YES\n",
"6\n",
"YES\n",
"7\n",
"YES\n",
"8\n",
"HNF4A rs884614 NO\n",
"9\n",
"HNF4A rs4810424 NO\n",
"10\n",
"HNF4A rs1884613 NO\n",
"11\n",
"HNF1B/TCF2 rs757210 NO\n",
"12\n",
"HNF1B/TCF2 rs4430796 NO\n",
"13\n",
"HNF1B/TCF2 rs7501939 NO\n",
"14\n",
"NEUROD1IBETA2 rs1801262 NO\n",
"15\n",
"YES\n",
"16\n",
"YES\n",
"17\n",
"YES\n",
"18\n",
"YES\n",
"19\n",
"YES\n",
"20\n",
"YES\n",
"21\n",
"YES\n",
"22\n",
"YES\n",
"23\n",
"YES\n",
"24\n",
"YES\n",
"25\n",
"KCNJ11 rs2650000 NO\n",
"26\n",
"PPARG rs2144908 NO\n",
"27\n",
"INS rs3818247 NO\n",
"28\n",
"GLIS3 rs884614 NO\n",
"29\n",
"ABCC8 rs4810424 NO\n",
"30\n",
"GCK rs1884613 NO\n",
"31\n",
"SLC2A2 rs757210 NO\n",
"32\n",
"HNF1A rs4430796 NO\n",
"33\n",
"HNF4A rs10229583 NO\n",
"34\n",
"HNF1B rs6467136 NO\n",
"35\n",
"PDX1 rs1801262 NO\n",
"36\n",
"KIR6.2 NO\n",
"37\n",
"GLUT2 NO\n"
]
}
],
"source": [
"import requests\n",
"\n",
"data = {}\n",
"\n",
"for i in df.index:\n",
" gene = df.loc[i, 'Genes']\n",
" snp = df.loc[i, 'SNPs']\n",
"\n",
" print(i)\n",
"\n",
" if len(data.get(gene, '')) == 0:\n",
" url = f'https://www.ncbi.nlm.nih.gov/research/litvar2-api/variant/search/gene/{gene}'\n",
" res = requests.get(url).content\n",
" data[gene] = res\n",
" \n",
" val = data[gene]\n",
" if len(val) != 0:\n",
" if val.decode().find(f\"'{snp}'\") != -1:\n",
" print('YES')\n",
" continue\n",
" \n",
" print(gene, snp, \"NO\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"prompt = \"\"\"\n",
"# CONTEXT #\n",
"In my capacity as a genomics specialist, I have table data obtained from a published research paper in the field of genomics. The data is provided in a list of JSONs format, with each JSON object representing a single row in a tabular structure.\n",
"This is the data:\n",
"\n",
"[\n",
" {{\n",
" \"Genes\": \"A\",\n",
" \"SNPs\": \"rs123\",\n",
" \"Diseases\": \"A disease\"\n",
" }}\n",
"]\n",
"\n",
"# OBJECTIVE #\n",
"Given the provided table data, the following tasks need to be completed:\n",
"\n",
"1. Identify all unique gene names present within the table. Each row can contains more than one gene name.\n",
"2. If present, extract any entries starting with \"rs\" (presumably representing Single Nucleotide Polymorphisms or rsIDs) that correspond to the same row as their associated gene names. Each gene name can correspond with more than one SNPs.\n",
"3. If available, extract any disease information associated with both the gene name and its corresponding SNP/rsID.\n",
"\n",
"It is crucial to maintain the utmost accuracy in this process, as any false or fabricated information (hallucination) can have severe consequences for academic integrity and research credibility.\n",
"If an SNPs or Diseases is absent from the table, leave the corresponding field blank with an empty string ('').\n",
"\n",
"# RESPONSE #\n",
"The output should only be a string containing list of JSON objects, each representing an entry with the following structure:\n",
"[\n",
" {{\n",
" \"Genes\": \"A\",\n",
" \"SNPs\": [\"rs123\", \"rs456\"],\n",
" \"Diseases\": \"A disease\"\n",
" }}\n",
"]\n",
"\n",
"If there is no specific extracted entities provided from the table, just leave the response with an empty lists ([]).\n",
"\"\"\""
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from langchain_openai import ChatOpenAI\n",
"\n",
"llm = ChatOpenAI(temperature=0, model_name=\"gpt-4-0125-preview\")\n",
"\n",
"result = llm.invoke(\"DO something\").content\n",
"result"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Unnamed: 0 | \n",
" Genes | \n",
" SNPs | \n",
" Diseases | \n",
" Title | \n",
" Authors | \n",
" Publisher Name | \n",
" Publication Year | \n",
" Population | \n",
" Sample Size | \n",
" Study Methodology | \n",
" Study Level | \n",
" Conclusion | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 0 | \n",
" GCK | \n",
" rs1799884 | \n",
" GCK-MODY (MODY2), PNDM, CHI | \n",
" Monogenic Diabetes: What It Teaches Us on the ... | \n",
" Yisheng Yang and Lawrence Chan | \n",
" Endocrine Reviews | \n",
" 2016 | \n",
" American Indian population, Canadian Oji-Cree ... | \n",
" Not Specified | \n",
" Candidate gene and genome-wide association stu... | \n",
" Not Specified | \n",
" The study conducted by Yisheng Yang and Lawren... | \n",
"
\n",
" \n",
" 1 | \n",
" 1 | \n",
" GCK | \n",
" s4607517 | \n",
" GCK-MODY (MODY2), PNDM, CHI | \n",
" Monogenic Diabetes: What It Teaches Us on the ... | \n",
" Yisheng Yang and Lawrence Chan | \n",
" Endocrine Reviews | \n",
" 2016 | \n",
" American Indian population, Canadian Oji-Cree ... | \n",
" Not Specified | \n",
" Candidate gene and genome-wide association stu... | \n",
" Not Specified | \n",
" The study conducted by Yisheng Yang and Lawren... | \n",
"
\n",
" \n",
" 2 | \n",
" 2 | \n",
" SLC2A2 | \n",
" rs5393 | \n",
" FBS | \n",
" Monogenic Diabetes: What It Teaches Us on the ... | \n",
" Yisheng Yang and Lawrence Chan | \n",
" Endocrine Reviews | \n",
" 2016 | \n",
" American Indian population, Canadian Oji-Cree ... | \n",
" Not Specified | \n",
" Candidate gene and genome-wide association stu... | \n",
" Not Specified | \n",
" The study conducted by Yisheng Yang and Lawren... | \n",
"
\n",
" \n",
" 3 | \n",
" 3 | \n",
" SLC2A2 | \n",
" rs5394 | \n",
" FBS | \n",
" Monogenic Diabetes: What It Teaches Us on the ... | \n",
" Yisheng Yang and Lawrence Chan | \n",
" Endocrine Reviews | \n",
" 2016 | \n",
" American Indian population, Canadian Oji-Cree ... | \n",
" Not Specified | \n",
" Candidate gene and genome-wide association stu... | \n",
" Not Specified | \n",
" The study conducted by Yisheng Yang and Lawren... | \n",
"
\n",
" \n",
" 4 | \n",
" 4 | \n",
" SLC2A2 | \n",
" rs5400 | \n",
" FBS | \n",
" Monogenic Diabetes: What It Teaches Us on the ... | \n",
" Yisheng Yang and Lawrence Chan | \n",
" Endocrine Reviews | \n",
" 2016 | \n",
" American Indian population, Canadian Oji-Cree ... | \n",
" Not Specified | \n",
" Candidate gene and genome-wide association stu... | \n",
" Not Specified | \n",
" The study conducted by Yisheng Yang and Lawren... | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Unnamed: 0 Genes SNPs Diseases \\\n",
"0 0 GCK rs1799884 GCK-MODY (MODY2), PNDM, CHI \n",
"1 1 GCK s4607517 GCK-MODY (MODY2), PNDM, CHI \n",
"2 2 SLC2A2 rs5393 FBS \n",
"3 3 SLC2A2 rs5394 FBS \n",
"4 4 SLC2A2 rs5400 FBS \n",
"\n",
" Title \\\n",
"0 Monogenic Diabetes: What It Teaches Us on the ... \n",
"1 Monogenic Diabetes: What It Teaches Us on the ... \n",
"2 Monogenic Diabetes: What It Teaches Us on the ... \n",
"3 Monogenic Diabetes: What It Teaches Us on the ... \n",
"4 Monogenic Diabetes: What It Teaches Us on the ... \n",
"\n",
" Authors Publisher Name Publication Year \\\n",
"0 Yisheng Yang and Lawrence Chan Endocrine Reviews 2016 \n",
"1 Yisheng Yang and Lawrence Chan Endocrine Reviews 2016 \n",
"2 Yisheng Yang and Lawrence Chan Endocrine Reviews 2016 \n",
"3 Yisheng Yang and Lawrence Chan Endocrine Reviews 2016 \n",
"4 Yisheng Yang and Lawrence Chan Endocrine Reviews 2016 \n",
"\n",
" Population Sample Size \\\n",
"0 American Indian population, Canadian Oji-Cree ... Not Specified \n",
"1 American Indian population, Canadian Oji-Cree ... Not Specified \n",
"2 American Indian population, Canadian Oji-Cree ... Not Specified \n",
"3 American Indian population, Canadian Oji-Cree ... Not Specified \n",
"4 American Indian population, Canadian Oji-Cree ... Not Specified \n",
"\n",
" Study Methodology Study Level \\\n",
"0 Candidate gene and genome-wide association stu... Not Specified \n",
"1 Candidate gene and genome-wide association stu... Not Specified \n",
"2 Candidate gene and genome-wide association stu... Not Specified \n",
"3 Candidate gene and genome-wide association stu... Not Specified \n",
"4 Candidate gene and genome-wide association stu... Not Specified \n",
"\n",
" Conclusion \n",
"0 The study conducted by Yisheng Yang and Lawren... \n",
"1 The study conducted by Yisheng Yang and Lawren... \n",
"2 The study conducted by Yisheng Yang and Lawren... \n",
"3 The study conducted by Yisheng Yang and Lawren... \n",
"4 The study conducted by Yisheng Yang and Lawren... "
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pandas as pd\n",
"\n",
"df = pd.read_excel(\"result/monogenic diabetes_8000.xlsx\", sheet_name='Original')\n",
"\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[\n",
" {\n",
" \"Genes\": \"GCK\",\n",
" \"SNPs\": \"rs1799884\",\n",
" \"Diseases\": \"GCK-MODY (MODY2), PNDM, CHI\"\n",
" },\n",
" {\n",
" \"Genes\": \"GCK\",\n",
" \"SNPs\": \"s4607517\",\n",
" \"Diseases\": \"GCK-MODY (MODY2), PNDM, CHI\"\n",
" },\n",
" {\n",
" \"Genes\": \"SLC2A2\",\n",
" \"SNPs\": \"rs5393\",\n",
" \"Diseases\": \"FBS\"\n",
" },\n",
" {\n",
" \"Genes\": \"SLC2A2\",\n",
" \"SNPs\": \"rs5394\",\n",
" \"Diseases\": \"FBS\"\n",
" },\n",
" {\n",
" \"Genes\": \"SLC2A2\",\n",
" \"SNPs\": \"rs5400\",\n",
" \"Diseases\": \"FBS\"\n",
" },\n",
" {\n",
" \"Genes\": \"SLC2A2\",\n",
" \"SNPs\": \"rs5404\",\n",
" \"Diseases\": \"FBS\"\n",
" },\n",
" {\n",
" \"Genes\": \"HNF4A\",\n",
" \"SNPs\": \"rs2144908\",\n",
" \"Diseases\": \"HNF4A MODY (MODY1), CHI\"\n",
" },\n",
" {\n",
" \"Genes\": \"HNF4A\",\n",
" \"SNPs\": \"S3818247\",\n",
" \"Diseases\": \"HNF4A MODY (MODY1), CHI\"\n",
" },\n",
" {\n",
" \"Genes\": \"HNF4A\",\n",
" \"SNPs\": \"rs884614\",\n",
" \"Diseases\": \"HNF4A MODY (MODY1), CHI\"\n",
" },\n",
" {\n",
" \"Genes\": \"HNF4A\",\n",
" \"SNPs\": \"rs4810424\",\n",
" \"Diseases\": \"HNF4A MODY (MODY1), CHI\"\n",
" },\n",
" {\n",
" \"Genes\": \"HNF4A\",\n",
" \"SNPs\": \"s1884613\",\n",
" \"Diseases\": \"HNF4A MODY (MODY1), CHI\"\n",
" },\n",
" {\n",
" \"Genes\": \"HNF1B/TCF2\",\n",
" \"SNPs\": \"s757210\",\n",
" \"Diseases\": \"RCAD syndrome, or MODY5; TNDM and PNDM (rare)\"\n",
" },\n",
" {\n",
" \"Genes\": \"HNF1B/TCF2\",\n",
" \"SNPs\": \"S4430796\",\n",
" \"Diseases\": \"RCAD syndrome, or MODY5; TNDM and PNDM (rare)\"\n",
" },\n",
" {\n",
" \"Genes\": \"HNF1B/TCF2\",\n",
" \"SNPs\": \"rs7501939\",\n",
" \"Diseases\": \"RCAD syndrome, or MODY5; TNDM and PNDM (rare)\"\n",
" },\n",
" {\n",
" \"Genes\": \"NEUROD1IBETA2\",\n",
" \"SNPs\": \"rs1801262\",\n",
" \"Diseases\": \"MODY6 and PNDM\"\n",
" },\n",
" {\n",
" \"Genes\": \"WFS1\",\n",
" \"SNPs\": \"rs10010131\",\n",
" \"Diseases\": \"WFS1, sometimes referred to as DIDMOAD\"\n",
" },\n",
" {\n",
" \"Genes\": \"WFS1\",\n",
" \"SNPs\": \"rs6446482\",\n",
" \"Diseases\": \"WFS1, sometimes referred to as DIDMOAD\"\n",
" },\n",
" {\n",
" \"Genes\": \"WFS1\",\n",
" \"SNPs\": \"s10010131\",\n",
" \"Diseases\": \"WFS1, sometimes referred to as DIDMOAD\"\n",
" },\n",
" {\n",
" \"Genes\": \"WFS1\",\n",
" \"SNPs\": \"1801213\",\n",
" \"Diseases\": \"WFS1, sometimes referred to as DIDMOAD\"\n",
" },\n",
" {\n",
" \"Genes\": \"WFS1\",\n",
" \"SNPs\": \"rs734312\",\n",
" \"Diseases\": \"WFS1, sometimes referred to as DIDMOAD\"\n",
" },\n",
" {\n",
" \"Genes\": \"PPARG\",\n",
" \"SNPs\": \"rs1801282\",\n",
" \"Diseases\": \"Monogenic diabetes Monogenic Diabetes Genes Associated With Both common T1D and T2D\"\n",
" },\n",
" {\n",
" \"Genes\": \"PPARG\",\n",
" \"SNPs\": \"rs4684847\",\n",
" \"Diseases\": \"Monogenic diabetes Monogenic Diabetes Genes Associated With Both common T1D and T2D\"\n",
" },\n",
" {\n",
" \"Genes\": \"GLIS3\",\n",
" \"SNPs\": \"rs7020673\",\n",
" \"Diseases\": \"Neonatal diabetes syndrome associated with congenital hypothyroidism, and polycystic kidneys\"\n",
" },\n",
" {\n",
" \"Genes\": \"GLIS3\",\n",
" \"SNPs\": \"s7034200\",\n",
" \"Diseases\": \"Neonatal diabetes syndrome associated with congenital hypothyroidism, and polycystic kidneys\"\n",
" },\n",
" {\n",
" \"Genes\": \"GLIS3\",\n",
" \"SNPs\": \"s7041847\",\n",
" \"Diseases\": \"Neonatal diabetes syndrome associated with congenital hypothyroidism, and polycystic kidneys\"\n",
" },\n",
" {\n",
" \"Genes\": \"KCNJ11\",\n",
" \"SNPs\": \"E23K\",\n",
" \"Diseases\": \"Type 2 Diabetes\"\n",
" },\n",
" {\n",
" \"Genes\": \"PPARG\",\n",
" \"SNPs\": \"S1369A\",\n",
" \"Diseases\": \"Type 1 Diabetes\"\n",
" },\n",
" {\n",
" \"Genes\": \"INS\",\n",
" \"SNPs\": \"rs1799884\",\n",
" \"Diseases\": \"Neonatal Diabetes Mellitus\"\n",
" },\n",
" {\n",
" \"Genes\": \"GLIS3\",\n",
" \"SNPs\": \"rs5400\",\n",
" \"Diseases\": \"Maturity-Onset Diabetes of the Young (MODY)\"\n",
" },\n",
" {\n",
" \"Genes\": \"ABCC8\",\n",
" \"SNPs\": \"rs2650000\",\n",
" \"Diseases\": \"Wolfram syndrome 1\"\n",
" },\n",
" {\n",
" \"Genes\": \"GCK\",\n",
" \"SNPs\": \"rs2144908\",\n",
" \"Diseases\": \"Fanconi-Bickel syndrome\"\n",
" },\n",
" {\n",
" \"Genes\": \"SLC2A2\",\n",
" \"SNPs\": \"rs3818247\",\n",
" \"Diseases\": \"young-onset diabetes\"\n",
" },\n",
" {\n",
" \"Genes\": \"HNF1A\",\n",
" \"SNPs\": \"rs884614\",\n",
" \"Diseases\": \"prostate cancer\"\n",
" },\n",
" {\n",
" \"Genes\": \"HNF1B\",\n",
" \"SNPs\": \"rs1884613\",\n",
" \"Diseases\": \"PNDM\"\n",
" },\n",
" {\n",
" \"Genes\": \"PDX1\",\n",
" \"SNPs\": \"rs757210\",\n",
" \"Diseases\": \"KPD\"\n",
" },\n",
" {\n",
" \"Genes\": \"PAX4\",\n",
" \"SNPs\": \"rs4430796\",\n",
" \"Diseases\": \"TNDM\"\n",
" },\n",
" {\n",
" \"Genes\": \"NEUROD1\",\n",
" \"SNPs\": \"rs10229583\",\n",
" \"Diseases\": \"type 1b diabetes\"\n",
" },\n",
" {\n",
" \"Genes\": \"WFS1\",\n",
" \"SNPs\": \"rs6467136\",\n",
" \"Diseases\": \"\"\n",
" },\n",
" {\n",
" \"Genes\": \"MAFA\",\n",
" \"SNPs\": \"rs1801262\",\n",
" \"Diseases\": \"\"\n",
" },\n",
" {\n",
" \"Genes\": \"Ccnd2\",\n",
" \"SNPs\": \"rs10010131\",\n",
" \"Diseases\": \"\"\n",
" },\n",
" {\n",
" \"Genes\": \"NGN3\",\n",
" \"SNPs\": \"rs6446482\",\n",
" \"Diseases\": \"\"\n",
" },\n",
" {\n",
" \"Genes\": \"FOXA2\",\n",
" \"SNPs\": \"rs1801282\",\n",
" \"Diseases\": \"\"\n",
" },\n",
" {\n",
" \"Genes\": \"TCF2\",\n",
" \"SNPs\": \"rs780094\",\n",
" \"Diseases\": \"\"\n",
" }\n",
"]\n"
]
}
],
"source": [
"import json\n",
"df.fillna('', inplace=True)\n",
"json_table = df[['Genes', 'SNPs', 'Diseases']].to_json(orient='records')\n",
"str_json_table = json.dumps(json.loads(json_table), indent=2)\n",
"print(str_json_table)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"WARNING:tensorflow:From c:\\Users\\Fadli\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\keras\\src\\losses.py:2976: The name tf.losses.sparse_softmax_cross_entropy is deprecated. Please use tf.compat.v1.losses.sparse_softmax_cross_entropy instead.\n",
"\n",
"WARNING:tensorflow:From c:\\Users\\Fadli\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\tf2onnx\\tf_loader.py:68: The name tf.reset_default_graph is deprecated. Please use tf.compat.v1.reset_default_graph instead.\n",
"\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"WARNING:tensorflow:From c:\\Users\\Fadli\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\tf2onnx\\tf_loader.py:68: The name tf.reset_default_graph is deprecated. Please use tf.compat.v1.reset_default_graph instead.\n",
"\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"WARNING:tensorflow:From c:\\Users\\Fadli\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\tf2onnx\\tf_loader.py:72: The name tf.train.import_meta_graph is deprecated. Please use tf.compat.v1.train.import_meta_graph instead.\n",
"\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"WARNING:tensorflow:From c:\\Users\\Fadli\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\tf2onnx\\tf_loader.py:72: The name tf.train.import_meta_graph is deprecated. Please use tf.compat.v1.train.import_meta_graph instead.\n",
"\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"WARNING:tensorflow:From c:\\Users\\Fadli\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\keras\\src\\backend.py:873: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.\n",
"\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"WARNING:tensorflow:From c:\\Users\\Fadli\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\keras\\src\\backend.py:873: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.\n",
"\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"WARNING:tensorflow:From c:\\Users\\Fadli\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\keras\\src\\layers\\normalization\\batch_normalization.py:979: The name tf.nn.fused_batch_norm is deprecated. Please use tf.compat.v1.nn.fused_batch_norm instead.\n",
"\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"WARNING:tensorflow:From c:\\Users\\Fadli\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\keras\\src\\layers\\normalization\\batch_normalization.py:979: The name tf.nn.fused_batch_norm is deprecated. Please use tf.compat.v1.nn.fused_batch_norm instead.\n",
"\n"
]
}
],
"source": [
"from process import validate\n",
"\n",
"\n",
"df = validate(df)"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"GCK rs1799884 Match\n",
"GCK rs4607517 Match\n",
"SLC2A2 rs5393 Not Available\n",
"SLC2A2 rs5394 Not Available\n",
"SLC2A2 rs5400 Match\n",
"SLC2A2 rs5404 Not Available\n",
"HNF4A rs2144908 Not Available\n",
"HNF4A rs3818247 Match\n",
"HNF4A rs884614 Not Available\n",
"HNF4A rs4810424 Not Available\n",
"HNF4A rs1884613 Not Available\n",
"HNF1B rs757210 Match\n",
"HNF1B rs4430796 Match\n",
"HNF1B rs7501939 Match\n",
"NEUROD1IBETA2 rs1801262 Not Available\n",
"WFS1 rs10010131 Match\n",
"WFS1 rs6446482 Not Available\n",
"WFS1 rs10010131 Match\n",
"WFS1 rs1801213 Not Available\n",
"WFS1 rs734312 Match\n",
"PPARG rs1801282 Match\n",
"PPARG rs4684847 Match\n",
"GLIS3 rs7020673 Match\n",
"GLIS3 rs7034200 Match\n",
"GLIS3 rs7041847 Match\n",
"INS rs1799884 Not Match\n",
"GLIS3 rs5400 Not Match\n",
"ABCC8 rs2650000 Not Match\n",
"GCK rs2144908 Not Available\n",
"SLC2A2 rs3818247 Not Match\n",
"HNF1A rs884614 Not Available\n",
"HNF1B rs1884613 Not Available\n",
"PDX1 rs757210 Not Match\n",
"PAX4 rs4430796 Not Match\n",
"NEUROD1 rs10229583 Not Match\n",
"WFS1 rs6467136 Not Match\n",
"MAFA rs1801262 Not Available\n",
"CCND2 rs10010131 Not Match\n",
"NGN3 rs6446482 Not Available\n",
"FOXA2 rs1801282 Not Match\n",
"TCF2 rs780094 Not Match\n",
"TCF2 rs757210 Not Match\n",
"TCF2 rs4430796 Not Match\n",
"TCF2 rs7501939 Not Match\n"
]
}
],
"source": [
"import requests\n",
"\n",
"for i in df.index:\n",
" snp = df.loc[i, 'SNPs']\n",
" gene = df.loc[i, 'Genes']\n",
"\n",
" res = requests.get(f'https://www.ebi.ac.uk/gwas/rest/api/singleNucleotidePolymorphisms/{snp}/')\n",
" try:\n",
" res = res.json()\n",
" except:\n",
" print('{:<10} {:<10} Not Available'.format(gene, snp))\n",
" continue\n",
" lst = []\n",
" for r in res['genomicContexts']:\n",
" if r['gene']['geneName'] == gene:\n",
" print('{:<10} {:<10} Match'.format(gene, snp))\n",
" break\n",
" else:\n",
" print('{:<10} {:<10} Not Match'.format(gene, snp))\n"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"GCK rs1799884 Match\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"GCK rs4607517 Not Match\n",
"SLC2A2 rs5393 Match\n",
"SLC2A2 rs5394 Match\n",
"SLC2A2 rs5400 Match\n",
"SLC2A2 rs5404 Match\n",
"HNF4A rs2144908 Match\n",
"HNF4A rs3818247 Match\n",
"HNF4A rs884614 Not Match\n",
"HNF4A rs4810424 Not Match\n",
"HNF4A rs1884613 Not Match\n",
"HNF1B rs757210 Match\n",
"HNF1B rs4430796 Match\n",
"HNF1B rs7501939 Match\n",
"NEUROD1IBETA2 rs1801262 Not Match\n",
"WFS1 rs10010131 Match\n",
"WFS1 rs6446482 Match\n",
"WFS1 rs10010131 Match\n",
"WFS1 rs1801213 Match\n",
"WFS1 rs734312 Match\n",
"PPARG rs1801282 Match\n",
"PPARG rs4684847 Match\n",
"GLIS3 rs7020673 Match\n",
"GLIS3 rs7034200 Match\n",
"GLIS3 rs7041847 Match\n",
"INS rs1799884 Not Match\n",
"GLIS3 rs5400 Not Match\n",
"ABCC8 rs2650000 Not Match\n",
"GCK rs2144908 Not Match\n",
"SLC2A2 rs3818247 Not Match\n",
"HNF1A rs884614 Not Match\n",
"HNF1B rs1884613 Not Match\n",
"PDX1 rs757210 Not Match\n",
"PAX4 rs4430796 Not Match\n",
"NEUROD1 rs10229583 Not Match\n",
"WFS1 rs6467136 Not Match\n",
"MAFA rs1801262 Not Match\n",
"CCND2 rs10010131 Not Match\n",
"NGN3 rs6446482 Not Match\n",
"FOXA2 rs1801282 Not Match\n",
"TCF2 rs780094 Not Match\n",
"TCF2 rs757210 Not Match\n",
"TCF2 rs4430796 Not Match\n",
"TCF2 rs7501939 Not Match\n"
]
}
],
"source": [
"import requests\n",
"\n",
"for i in df.index:\n",
" snp = df.loc[i, 'SNPs']\n",
" gene = df.loc[i, 'Genes']\n",
"\n",
" res = requests.get(f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=snp&retmode=json&id={snp[2:]}').json()['result'][snp[2:]]\n",
" if 'error' in res:\n",
" print('{:<10} {:<10} Not Available'.format(gene, snp))\n",
" continue\n",
" lst = []\n",
" for r in res['genes']:\n",
" if r['name'] == gene:\n",
" print('{:<10} {:<10} Match'.format(gene, snp))\n",
" break\n",
" else:\n",
" print('{:<10} {:<10} Not Match'.format(gene, snp))\n"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"df = pd.read_excel('../result/monogenic diabetes_8000.xlsx', sheet_name=\"Result\")"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['STARDIO', 'STARDI0', 'STARD1O', 'STARD10']"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"mistakes = {'I': '1', 'O': '0'}\n",
"\n",
"def permutate(word):\n",
"\n",
" if len(word) == 0:\n",
" return ['']\n",
"\n",
" change = []\n",
" res = permutate(word[1:])\n",
"\n",
" if word[0] in mistakes:\n",
" change = [mistakes[word[0]] + r for r in res]\n",
"\n",
" return [word[0] + r for r in res] + change\n",
"\n",
"permutate('STARDIO')"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"GCK rs1799884 Match\n",
"GCK rs4607517 Match\n",
"SLC2A2 rs5393 Match\n",
"SLC2A2 rs5394 Match\n",
"SLC2A2 rs5400 Match\n",
"SLC2A2 rs5404 Match\n",
"HNF4A rs2144908 Match\n",
"HNF4A rs3818247 Match\n",
"HNF4A rs884614 Not Match\n",
"HNF4A rs4810424 Not Match\n",
"HNF4A rs1884613 Not Match\n",
"HNF1B rs757210 Match\n",
"TCF2 rs757210 Not Match\n",
"HNF1B rs4430796 Match\n",
"TCF2 rs4430796 Not Match\n",
"HNF1B rs7501939 Match\n",
"TCF2 rs7501939 Not Match\n",
"PAX4 rs10229583 Match\n",
"NEUROD1 rs1801262 Match\n",
"BETA2 rs1801262 Not Match\n",
"WFS1 rs10010131 Match\n",
"WFS1 rs6446482 Match\n",
"WFS1 rs734312 Match\n",
"PPARG rs1801282 Match\n",
"PPARG rs4684847 Match\n",
"GLIS3 rs7020673 Match\n",
"GLIS3 rs7034200 Match\n",
"GLIS3 rs7041847 Match\n",
"HNF1A rs1801262 Not Match\n",
"INS rs1801282 Not Match\n",
"PPARG rs780094 Not Match\n"
]
}
],
"source": [
"import requests\n",
"\n",
"dbsnp = {}\n",
"\n",
"for i in df.index:\n",
" snp = df.loc[i, 'SNPs']\n",
" gene = df.loc[i, 'Genes']\n",
"\n",
" if snp not in dbsnp:\n",
" res = requests.get(f'https://www.ebi.ac.uk/gwas/rest/api/singleNucleotidePolymorphisms/{snp}/')\n",
"\n",
" try:\n",
" res = res.json()\n",
" dbsnp[snp] = [r['gene']['geneName'] for r in res['genomicContexts']]\n",
" except:\n",
" dbsnp[snp] = []\n",
"\n",
" res = requests.get(f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=snp&retmode=json&id={snp[2:]}').json()['result'][snp[2:]]\n",
" if 'error' not in res:\n",
" dbsnp[snp].extend([r['name'] for r in res['genes']])\n",
"\n",
" dbsnp[snp] = list(set(dbsnp[snp]))\n",
"\n",
" if gene in dbsnp[snp]:\n",
" print('{:<10} {:<10} Match'.format(gene, snp))\n",
" else:\n",
" for other in permutate(gene):\n",
" if other in dbsnp[snp]:\n",
" print('{:<10} {:<10} Match (corrected)'.format(other, snp))\n",
" break\n",
" else:\n",
" print('{:<10} {:<10} Not Match'.format(gene, snp))\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0\n",
"1\n",
"2\n",
"3\n",
"4\n",
"5\n",
"6\n",
"7\n",
"8\n",
"9\n",
"10\n",
"11\n",
"12\n",
"13\n",
"14\n",
"15\n",
"16\n",
"17\n",
"18\n",
"19\n",
"20\n",
"21\n",
"22\n",
"23\n",
"24\n",
"25\n",
"26\n",
"27\n",
"28\n",
"29\n",
"30\n",
"31\n",
"32\n",
"33\n",
"34\n",
"35\n",
"36\n",
"37\n",
"38\n",
"39\n",
"40\n",
"41\n",
"42\n",
"43\n",
"44\n",
"45\n",
"46\n",
"47\n",
"48\n",
"49\n",
"50\n",
"51\n",
"52\n",
"53\n",
"54\n",
"55\n",
"56\n",
"57\n",
"58\n",
"59\n",
"60\n",
"61\n",
"62\n",
"63\n",
"64\n",
"65\n",
"66\n",
"67\n",
"68\n",
"69\n",
"70\n",
"71\n",
"72\n",
"73\n",
"74\n",
"75\n",
"76\n",
"77\n",
"78\n",
"79\n",
"80\n",
"81\n",
"82\n",
"83\n",
"84\n",
"85\n",
"86\n",
"87\n",
"88\n",
"89\n",
"90\n",
"91\n",
"92\n",
"93\n",
"94\n",
"95\n",
"96\n",
"97\n",
"98\n",
"99\n"
]
}
],
"source": [
"import requests\n",
"import time\n",
"\n",
"snp = 'rs972283'\n",
"for i in range(100):\n",
" print(i)\n",
" while True:\n",
" try:\n",
" res = requests.get(f'https://www.ebi.ac.uk/gwas/rest/api/singleNucleotidePolymorphisms/{snp}/')\n",
" break\n",
" except Exception as e:\n",
" print('sleep')\n",
" time.sleep(1)\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Here's the list of JSON objects with corrected gene names, SNPs, and diseases based on the given context:\n",
"\n",
"[\n",
" {\n",
" \"Genes\": \"GCK\",\n",
" \"SNPs\": \"rs1799884\",\n",
" \"Diseases\": \"GCK-MODY (MODY2), PNDM, CHI\"\n",
" },\n",
" {\n",
" \"Genes\": \"SLC24A2\",\n",
" \"SNPs\": \"rs5393\",\n",
" \"Diseases\": \"FBS\"\n",
" },\n",
" {\n",
" \"Genes\": \"NEUROD1, INS\",\n",
" \"SNPs\": \"rs1801262\",\n",
" \"Diseases\": \"MODY6 and PNDM\"\n",
" },\n",
" {\n",
" \"Genes\": \"WFS1\",\n",
" \"SNPs\": \"rs6446482\",\n",
" \"Diseases\": \"WFS1, sometimes referred to as DIDMOAD\"\n",
" },\n",
" {\n",
" \"Genes\": \"GLIS3\",\n",
" \"SNPs\": \"rs7020673\",\n",
" \"Diseases\": \"Neonatal diabetes syndrome associated with congenital hypothyroidism, and polycystic kidneys\"\n",
" },\n",
" {\n",
" \"Genes\": \"FTO\",\n",
" \"SNPs\": \"rs9937290\",\n",
" \"Diseases\": \"Obesity\"\n",
" }\n",
"]\n",
"\n",
"Changes made:\n",
"1. Corrected \"SLC242\" to \"SLC24A2\"\n",
"2. Separated \"NEUROD1IBETA2\" into \"NEUROD1, INS\"\n",
"3. Corrected \"GLI53\" to \"GLIS3\"\n",
"4. Corrected \"FT0\" to \"FTO\"\n"
]
}
],
"source": [
"from langchain_openai import ChatOpenAI\n",
"import os\n",
"\n",
"llm = ChatOpenAI(temperature=0, api_key=os.environ['PERPLEXITY_API_KEY'], base_url=\"https://api.perplexity.ai\")\n",
"\n",
"prompt = \"\"\"\n",
"# CONTEXT #\n",
"In my capacity as a genomics specialist, I have table data containing gene names with their corresponding SNPs and diseases. The data is provided in a list of JSON format, with each JSON object representing a single row in a tabular structure. \n",
"The problem is because the data is extracted using OCR, some gene names and SNPs may have a typo.\n",
"\n",
"This is the data:\n",
"[\n",
" {\n",
" \"Genes\": \"GCK\",\n",
" \"SNPs\": \"rs1799884\",\n",
" \"Diseases\": \"GCK-MODY (MODY2), PNDM, CHI\"\n",
" },\n",
" {\n",
" \"Genes\": \"SLC242\",\n",
" \"SNPs\": \"rs5393\",\n",
" \"Diseases\": \"FBS\"\n",
" },\n",
" {\n",
" \"Genes\": \"NEUROD1IBETA2\",\n",
" \"SNPs\": \"rs1801262\",\n",
" \"Diseases\": \"MODY6 and PNDM\"\n",
" },\n",
" {\n",
" \"Genes\": \"WFSI\",\n",
" \"SNPs\": \"rs6446482\",\n",
" \"Diseases\": \"WFS1, sometimes referred to as DIDMOAD\"\n",
" },\n",
" {\n",
" \"Genes\": \"GLI53\",\n",
" \"SNPs\": \"rs7020673\",\n",
" \"Diseases\": \"Neonatal diabetes syndrome associated with congenital hypothyroidism, and polycystic kidneys\"\n",
" },\n",
" {\n",
" \"Genes\": \"FT0\",\n",
" \"SNPs\": \"rs9937290\",\n",
" \"Diseases\": \"Obesity\"\n",
" },\n",
"]\n",
"\n",
"# OBJECTIVE #\n",
"Given the provided table data, the following tasks need to be completed:\n",
"\n",
"1. Check whether the gene name is the correct gene name. If the gene name is suspected of a typo, fix it into the correct form. If the gene name seems like a mistake entirely or invalid, remove the data row. Common errors include:\n",
" - Combined Names: Two gene names erroneously merged into one. Separate these using \"and\": \"A and B\".\n",
" - OCR Errors: Similar characters misread by the system. Correct these to the intended form.\n",
"2. If SNP is not empty, check whether the gene name corresponds with the SNP. Fix it with the correct SNP if the original SNP is wrong.\n",
"3. If diseases are not empty, check whether the gene name corresponds with the diseases. Fix it with the correct diseases if the original disease is wrong.\n",
"\n",
"# RESPONSE #\n",
"The output must be only a string containing a list of JSON objects, adhering to the identical structure present in the original input data. Each object representing a validated entry with the following structure:\n",
"[\n",
" {{\n",
" \"Genes\": \"A\",\n",
" \"SNPs\": \"rs123\",\n",
" \"Diseases\": \"A disease\"\n",
" }}\n",
"]\n",
"\"\"\"\n",
"\n",
"result = llm.invoke(model='mixtral-8x7b-instruct', input=prompt)\n",
"print(result.content)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"To accomplish this task, we'll need a reference dataset of correct gene names, SNPs, and diseases. Let's assume we have a dictionary `gene_ref` that maps gene names to their corresponding SNPs and diseases.\n",
"\n",
"Here's a Python script that should accomplish the tasks:\n",
"```python\n",
"import json\n",
"\n",
"# Reference dataset (example)\n",
"gene_ref = {\n",
" \"GCK\": {\"SNPs\": [\"rs1799884\"], \"Diseases\": [\"GCK-MODY (MODY2)\", \"PNDM\", \"CHI\"]},\n",
" \"SLC2A2\": {\"SNPs\": [\"rs5393\"], \"Diseases\": [\"FBS\"]},\n",
" \"NEUROD1\": {\"SNPs\": [\"rs1801262\"], \"Diseases\": [\"MODY6\", \"PNDM\"]},\n",
" \"WFS1\": {\"SNPs\": [\"rs6446482\"], \"Diseases\": [\"WFS1\", \"DIDMOAD\"]},\n",
" \"GLIS3\": {\"SNPs\": [\"rs7020673\"], \"Diseases\": [\"Neonatal diabetes syndrome associated with congenital hypothyroidism, and polycystic kidneys\"]},\n",
" \"FTO\": {\"SNPs\": [\"rs9937290\"], \"Diseases\": [\"Obesity\"]},\n",
" # Add more gene references as needed\n",
"}\n",
"\n",
"def correct_gene_name(gene_name):\n",
" # Check for combined names\n",
" for ref_gene in gene_ref:\n",
" if ref_gene in gene_name:\n",
" return [ref_gene]\n",
" # Check for OCR errors\n",
" for ref_gene in gene_ref:\n",
" if len(set(gene_name) & set(ref_gene)) > len(ref_gene) / 2:\n",
" return [ref_gene]\n",
" return []\n",
"\n",
"def validate_data(data):\n",
" validated_data = []\n",
" for row in data:\n",
" gene_name = row[\"Genes\"]\n",
" corrected_genes = correct_gene_name(gene_name)\n",
" if not corrected_genes:\n",
" continue # Remove row if gene name is invalid\n",
" for corrected_gene in corrected_genes:\n",
" new_row = row.copy()\n",
" new_row[\"Genes\"] = corrected_gene\n",
" # Check and correct SNP\n",
" if row[\"SNPs\"]:\n",
" if row[\"SNPs\"] not in gene_ref[corrected_gene][\"SNPs\"]:\n",
" new_row[\"SNPs\"] = gene_ref[corrected_gene][\"SNPs\"][0]\n",
" # Check and correct diseases\n",
" if row[\"Diseases\"]:\n",
" diseases = [disease.strip() for disease in row[\"Diseases\"].split(\",\")]\n",
" if not all(disease in gene_ref[corrected_gene][\"Diseases\"] for disease in diseases):\n",
" new_row[\"Diseases\"] = \", \".join(gene_ref[corrected_gene][\"Diseases\"])\n",
" validated_data.append(new_row)\n",
" return json.dumps(validated_data)\n",
"\n",
"data = [\n",
" {\"Genes\": \"GCK\", \"SNPs\": \"rs1799884\", \"Diseases\": \"GCK-MODY (MODY2), PNDM, CHI\"},\n",
" {\"Genes\": \"SLC242\", \"SNPs\": \"rs5393\", \"Diseases\": \"FBS\"},\n",
" {\"Genes\": \"NEUROD1IBETA2\", \"SNPs\": \"rs1801262\", \"Diseases\": \"MODY6 and PNDM\"},\n",
" {\"Genes\": \"WFSI\", \"SNPs\": \"rs6446482\", \"Diseases\": \"WFS1, sometimes referred to as DIDMOAD\"},\n",
" {\"Genes\": \"GLI53\", \"SNPs\": \"rs7020673\", \"Diseases\": \"Neonatal diabetes syndrome associated with congenital hypothyroidism, and polycystic kidneys\"},\n",
" {\"Genes\": \"FT0\", \"SNPs\": \"rs9937290\", \"Diseases\": \"Obesity\"},\n",
"]\n",
"\n",
"print(validate_data(data))\n",
"```\n",
"This script will output a string containing a list of JSON objects with corrected gene names, SNPs, and diseases.\n",
"\n",
"Note that this implementation assumes a simple reference dataset and may not cover all possible OCR errors or combined gene names. You may need to expand the `gene_ref` dictionary and the `correct_gene_name` function to handle more complex cases.\n"
]
}
],
"source": [
"from langchain_openai import ChatOpenAI\n",
"import os\n",
"\n",
"llm = ChatOpenAI(temperature=0, api_key=os.environ['PERPLEXITY_API_KEY'], base_url=\"https://api.perplexity.ai\")\n",
"\n",
"prompt = \"\"\"\n",
"In my capacity as a genomics specialist, I have table data containing gene names with their corresponding SNPs and diseases. The data is provided in a list of JSON format, with each JSON object representing a single row in a tabular structure. \n",
"The problem is because the data is extracted using OCR, some gene names and SNPs may have a typo.\n",
"\n",
"This is the data:\n",
"[\n",
" {\n",
" \"Genes\": \"GCK\",\n",
" \"SNPs\": \"rs1799884\",\n",
" \"Diseases\": \"GCK-MODY (MODY2), PNDM, CHI\"\n",
" },\n",
" {\n",
" \"Genes\": \"SLC242\",\n",
" \"SNPs\": \"rs5393\",\n",
" \"Diseases\": \"FBS\"\n",
" },\n",
" {\n",
" \"Genes\": \"NEUROD1IBETA2\",\n",
" \"SNPs\": \"rs1801262\",\n",
" \"Diseases\": \"MODY6 and PNDM\"\n",
" },\n",
" {\n",
" \"Genes\": \"WFSI\",\n",
" \"SNPs\": \"rs6446482\",\n",
" \"Diseases\": \"WFS1, sometimes referred to as DIDMOAD\"\n",
" },\n",
" {\n",
" \"Genes\": \"GLI53\",\n",
" \"SNPs\": \"rs7020673\",\n",
" \"Diseases\": \"Neonatal diabetes syndrome associated with congenital hypothyroidism, and polycystic kidneys\"\n",
" },\n",
" {\n",
" \"Genes\": \"FT0\",\n",
" \"SNPs\": \"rs9937290\",\n",
" \"Diseases\": \"Obesity\"\n",
" },\n",
"]\n",
"\n",
"Given the provided table data, the following tasks need to be completed:\n",
"\n",
"1. Check whether the gene name is the correct gene name. If the gene name is suspected of a typo, fix it into the correct form. If the gene name seems like a mistake entirely or invalid, remove the data row. Common errors include:\n",
" - Combined Names: Two gene names erroneously merged into one. Duplicate this data row so each gene name has its own data.\n",
" - OCR Errors: Similar characters misread by the system. Correct these to the intended form.\n",
"2. If SNP is not empty, check whether the gene name corresponds with the SNP. Fix it with the correct SNP if the original SNP is wrong.\n",
"3. If diseases are not empty, check whether the gene name corresponds with the diseases. Fix it with the correct diseases if the original disease is wrong.\n",
"\n",
"The output must be STRICTLY ONLY a string containing a list of JSON objects, adhering to the identical structure present in the original input data. Each object representing a validated entry with the following structure:\n",
"[\n",
" {{\n",
" \"Genes\": \"A\",\n",
" \"SNPs\": \"rs123\",\n",
" \"Diseases\": \"A disease\"\n",
" }}\n",
"]\n",
"\"\"\"\n",
"\n",
"result = llm.invoke(model='llama-3-70b-instruct', input=prompt)\n",
"print(result.content)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import nest_asyncio\n",
"\n",
"nest_asyncio.apply()\n",
"\n",
"from llama_parse import LlamaParse"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Started parsing the file under job_id cb5d7891-1366-47b7-98e2-d6cfbd5d3b87\n",
".."
]
}
],
"source": [
"from dotenv import load_dotenv\n",
"\n",
"load_dotenv()\n",
"\n",
"parser = LlamaParse(\n",
" # api_key=os.environ['LLAMA_'], # can also be set in your env as LLAMA_CLOUD_API_KEY\n",
" result_type=\"markdown\", # \"markdown\" and \"text\" are available\n",
" num_workers=4, # if multiple files passed, split in `num_workers` API calls\n",
" verbose=True,\n",
" language=\"en\", # Optionally you can define a language, default=en\n",
")\n",
"\n",
"# sync\n",
"objs = parser.get_json_result(\"papers/ukmss-34421.pdf\")"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['Table 2 Voight et al. Page 22',\n",
" 'Expression QTL results for T2D-associated variants in blood and adipose tissue',\n",
" '',\n",
" '',\n",
" '',\n",
" 'e',\n",
" 'SNP with strongest correlation with trait',\n",
" '',\n",
" '',\n",
" '',\n",
" 'SNPChr.PositionNearbyRiskGene (transcript)TissueP valueP value',\n",
" 'cd2fg',\n",
" 'Effect (s.e.m.)P SNP (r)P',\n",
" 'B36 (bp) Europe PMC Funders Author Manuscriptsadj adj',\n",
" 'ab',\n",
" 'geneallele',\n",
" '',\n",
" '',\n",
" '',\n",
" 'Novel loci reported in this study',\n",
" 'rs4457053576,460,705ZBED3GPDE8B (NM_003719)Adipose0.302 (0.070)−50.80rs6864250−17−13',\n",
" '2.8 × 103.1 × 105.8 × 10',\n",
" '(0.18)',\n",
" 'ZBED3 (NM_032367)Adipose0.429 (0.068)−90.011rs4704389−16−9',\n",
" '1.0 × 103.9 × 106.0 × 10',\n",
" '(0.20)',\n",
" '−11−12',\n",
" 'rs9722837130,117,394KLF14GKLF14 (NM_138693)Adipose−0.387 (0.058)0.058rs7381340.0014',\n",
" '8.1 × 102.2 × 10',\n",
" '(0.30)',\n",
" '',\n",
" '',\n",
" '',\n",
" '−5−7',\n",
" 'rs896854896,029,687TP53INP1TCCNE2 (NM_057749)Blood−0.225 (0.053)0.78rs47353390.0051',\n",
" '3.8 × 105.8 × 10',\n",
" '(0.61)',\n",
" '',\n",
" '',\n",
" '',\n",
" '−7−24−19',\n",
" 'rs15522241172,110,746CENTD2ASTARD10 (NM_006645)Blood0.337 (0.066)0.026rs519790',\n",
" '8.6 × 102.7 × 101.6 × 10',\n",
" '(0.04)',\n",
" 'rs795719712119,945,069HNF1ATACADS (NM_000017)Adipose−0.248 (0.067)−40.29rs9204−53−50',\n",
" '3.7 × 101.3 × 105.9 × 10',\n",
" '(0.02)',\n",
" 'PSMD9 (NM_002813)Blood0.240 (0.065)−40.0088rs3741593−8−6',\n",
" '3.9 × 108.3 × 101.7 × 10',\n",
" '(0.00)',\n",
" '−6−7',\n",
" 'OASL (NM_003733)Adipose0.318 (0.068)0.13rs22598830.0018',\n",
" '6.4 × 101.1 × 10',\n",
" '(0.19)',\n",
" '',\n",
" '',\n",
" '',\n",
" '−6−22−16',\n",
" 'OASL (NM_003733)Blood0.319 (0.064)0.37rs4556628',\n",
" '1.3 × 104.4 × 101.4 × 10',\n",
" '(0.21)',\n",
" '',\n",
" '',\n",
" '',\n",
" '−4−39−35',\n",
" 'COQ5 (NM_032314)Blood0.248 (0.065)0.92rs10774561',\n",
" '2.1 × 108.7 × 104.9 × 10',\n",
" '(0.02)',\n",
" 'UNC119B (NM_032661)Blood−0.254 (0.064)−40.048rs11065202−12−9',\n",
" '1.4 × 107.8 × 102.3 × 10',\n",
" '(0.09)',\n",
" 'CAMKK2 (NM_172215)Adipose−0.497 (0.068)−120.18rs11065504−117−98',\n",
" '1.2 × 102.7 × 103.8 × 10',\n",
" '(0.08)',\n",
" '−8−105−94',\n",
" 'CAMKK2 (NM_172215)Blood−0.360 (0.063)0.68rs11065504',\n",
" '3.4 × 107.0 × 105.7 × 10',\n",
" '(0.08)',\n",
" '',\n",
" '',\n",
" '',\n",
" '−6−6−17−17',\n",
" 'P2RX4 (NM_175568)Blood0.312 (0.065)rs25644',\n",
" '3.4 × 102.0 × 103.4 × 101.9 × 10',\n",
" '(0.03)',\n",
" '−10−21−12Europe PMC Funders Author Manuscripts',\n",
" 'rs80426801589,322,341PRC1AVPS33B (NM_018668)Blood−0.371 (0.057)0.50rs12595616',\n",
" '2.9 × 102.3 × 104.5 × 10',\n",
" '(0.57)',\n",
" 'Previously reported loci',\n",
" '',\n",
" '',\n",
" '',\n",
" '−5−5',\n",
" 'rs75783262226,728,897IRS1AIRS1 (Contig50189_RC)Adipose−0.251 (0.059)0.89rs29436530.69',\n",
" '3.7 × 103.4 × 10',\n",
" '(0.93)',\n",
" '',\n",
" '',\n",
" '',\n",
" '−8−10',\n",
" 'IRS1 (NM_005544)Adipose−0.331 (0.059)0.58rs21760400.0042',\n",
" '5.7 × 107.8 × 10',\n",
" '(0.74)',\n",
" 'rs13081389312,264,800PPARGAIQSEC1 (NM_014869)Adipose−0.630 (0.131)−6−4rs9211−96−94',\n",
" '2.9 × 101.4 × 101.1 × 107.4 × 10',\n",
" '(0.01)',\n",
" 'rs6795735364,680,405ADAMTS9CBC040632 (AK022320)Adipose−0.229 (0.056)−50.28rs4521216−13−10',\n",
" '7.6 × 103.0 × 108.7 × 10',\n",
" '(0.02)',\n",
" '',\n",
" '',\n",
" '',\n",
" ' Nat Genet. Author manuscript; available in PMC 2011 April 21.']"
]
},
"execution_count": 39,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"page = 22\n",
"text_parts = objs[0]['pages'][page - 1]['text'].split('\\n')\n",
"text_parts"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'Gene Names and Diseases': [{'Gene Name': 'ZBED3',\n",
" 'SNP': 'rs6864250',\n",
" 'Diseases': 'T2D-associated variants'},\n",
" {'Gene Name': 'ZBED3',\n",
" 'SNP': 'rs4704389',\n",
" 'Diseases': 'T2D-associated variants'},\n",
" {'Gene Name': 'KLF14',\n",
" 'SNP': 'rs972283',\n",
" 'Diseases': 'T2D-associated variants'},\n",
" {'Gene Name': 'TP53INP1',\n",
" 'SNP': 'rs896854',\n",
" 'Diseases': 'T2D-associated variants'},\n",
" {'Gene Name': 'CENTD2',\n",
" 'SNP': 'rs1552224',\n",
" 'Diseases': 'T2D-associated variants'},\n",
" {'Gene Name': 'HNF1A',\n",
" 'SNP': 'rs7957197',\n",
" 'Diseases': 'T2D-associated variants'},\n",
" {'Gene Name': 'PSMD9',\n",
" 'SNP': 'rs3741593',\n",
" 'Diseases': 'T2D-associated variants'},\n",
" {'Gene Name': 'OASL',\n",
" 'SNP': 'rs2259883',\n",
" 'Diseases': 'T2D-associated variants'},\n",
" {'Gene Name': 'OASL',\n",
" 'SNP': 'rs4556628',\n",
" 'Diseases': 'T2D-associated variants'},\n",
" {'Gene Name': 'COQ5',\n",
" 'SNP': 'rs10774561',\n",
" 'Diseases': 'T2D-associated variants'},\n",
" {'Gene Name': 'UNC119B',\n",
" 'SNP': 'rs11065202',\n",
" 'Diseases': 'T2D-associated variants'},\n",
" {'Gene Name': 'CAMKK2',\n",
" 'SNP': 'rs11065504',\n",
" 'Diseases': 'T2D-associated variants'},\n",
" {'Gene Name': 'P2RX4',\n",
" 'SNP': 'rs25644',\n",
" 'Diseases': 'T2D-associated variants'},\n",
" {'Gene Name': 'PRC1',\n",
" 'SNP': 'rs8042680',\n",
" 'Diseases': 'T2D-associated variants'},\n",
" {'Gene Name': 'IRS1',\n",
" 'SNP': 'rs7578326',\n",
" 'Diseases': 'T2D-associated variants'},\n",
" {'Gene Name': 'IRS1',\n",
" 'SNP': 'rs2943653',\n",
" 'Diseases': 'T2D-associated variants'},\n",
" {'Gene Name': 'IRS1',\n",
" 'SNP': 'rs2176040',\n",
" 'Diseases': 'T2D-associated variants'},\n",
" {'Gene Name': 'PPARG',\n",
" 'SNP': 'rs13081389',\n",
" 'Diseases': 'T2D-associated variants'},\n",
" {'Gene Name': 'ADAMTS9',\n",
" 'SNP': 'rs6795735',\n",
" 'Diseases': 'T2D-associated variants'}]}"
]
},
"execution_count": 41,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from openai import OpenAI\n",
"\n",
"client = OpenAI()\n",
"response = client.chat.completions.create(\n",
" model=\"gpt-4-0125-preview\",\n",
" response_format={\"type\": \"json_object\"},\n",
" messages=[\n",
" {\"role\": \"system\", \"content\": \"You are a helpful assistant designed to output JSON.\"},\n",
" {\"role\": \"user\", \"content\": f\"Given a text like this: {text_parts}, automatically extract, return multiple Gene Names, potential diseases and their corresponding SNPs in the format like this: {{\\\"Gene Name\\\": \\\"FTO\\\", \\\"SNP\\\": \\\"rs9939609\\\", \\\"Diseases\\\": \\\"Obesity\\\"}}, from table format at text (this is just an example, don't return this)\"}\n",
" ]\n",
")\n",
"res = response.choices[0].message.content\n",
"eval(res)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"WARNING:tensorflow:From c:\\Users\\Fadli\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\keras\\src\\losses.py:2976: The name tf.losses.sparse_softmax_cross_entropy is deprecated. Please use tf.compat.v1.losses.sparse_softmax_cross_entropy instead.\n",
"\n",
"WARNING:tensorflow:From c:\\Users\\Fadli\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\tf2onnx\\tf_loader.py:68: The name tf.reset_default_graph is deprecated. Please use tf.compat.v1.reset_default_graph instead.\n",
"\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"WARNING:tensorflow:From c:\\Users\\Fadli\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\tf2onnx\\tf_loader.py:68: The name tf.reset_default_graph is deprecated. Please use tf.compat.v1.reset_default_graph instead.\n",
"\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"WARNING:tensorflow:From c:\\Users\\Fadli\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\tf2onnx\\tf_loader.py:72: The name tf.train.import_meta_graph is deprecated. Please use tf.compat.v1.train.import_meta_graph instead.\n",
"\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"WARNING:tensorflow:From c:\\Users\\Fadli\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\tf2onnx\\tf_loader.py:72: The name tf.train.import_meta_graph is deprecated. Please use tf.compat.v1.train.import_meta_graph instead.\n",
"\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"WARNING:tensorflow:From c:\\Users\\Fadli\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\keras\\src\\backend.py:873: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.\n",
"\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"WARNING:tensorflow:From c:\\Users\\Fadli\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\keras\\src\\backend.py:873: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.\n",
"\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"WARNING:tensorflow:From c:\\Users\\Fadli\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\keras\\src\\layers\\normalization\\batch_normalization.py:979: The name tf.nn.fused_batch_norm is deprecated. Please use tf.compat.v1.nn.fused_batch_norm instead.\n",
"\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"WARNING:tensorflow:From c:\\Users\\Fadli\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\keras\\src\\layers\\normalization\\batch_normalization.py:979: The name tf.nn.fused_batch_norm is deprecated. Please use tf.compat.v1.nn.fused_batch_norm instead.\n",
"\n"
]
}
],
"source": [
"import sys\n",
"sys.path.append('..')\n",
"\n",
"import os\n",
"import torch\n",
"from pdf2image import convert_from_path\n",
"from table_detector import detection_transform, device, model, ocr, outputs_to_objects\n",
"import io\n",
"from img2table.document import Image"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1-s2.0-S0002916523016155-main.pdf\n",
"4\n",
"1329.pdf\n",
"8\n",
"41467_2020_Article_15421.pdf\n",
"11\n",
"berndt2013.pdf\n",
"14\n",
"BMD.pdf\n",
"17\n",
"clock and eat timing.pdf\n",
"23\n",
"COMT breast cancer metaanalysis chinese.pdf\n",
"26\n",
"dubois2010.pdf\n",
"30\n",
"EMMM-8-688.pdf\n",
"40\n",
"EMS120610.pdf\n",
"45\n",
"file.pdf\n",
"51\n",
"journal.pbio.3001547.pdf\n",
"54\n",
"lipid.pdf\n",
"60\n",
"monogenic diabetes.pdf\n",
"62\n",
"nihms-1651539.pdf\n",
"62\n",
"nihms-1792335.pdf\n",
"73\n",
"nihms-668049.pdf\n",
"87\n",
"nihms364577.pdf\n",
"90\n",
"nihms510594.pdf\n",
"110\n",
"pgen.1009952.pdf\n",
"116\n",
"PIIS0091674919313661.pdf\n",
"121\n",
"s12881-019-0830-y.pdf\n",
"128\n",
"s41576-021-00414-z (1).pdf\n",
"132\n",
"s41588-018-0047-6.pdf\n",
"137\n",
"s41588-022-01024-z (1).pdf\n",
"150\n",
"stroke genetic AHA.pdf\n",
"154\n",
"surendran2016.pdf\n",
"158\n",
"teslovich2010.pdf\n",
"161\n",
"testing\n",
"ukmss-34421.pdf\n",
"167\n",
"wightman2021.pdf\n",
"173\n"
]
}
],
"source": [
"tables = []\n",
"\n",
"for path in os.listdir('papers/'):\n",
" print(path)\n",
"\n",
" if path[-3:] != 'pdf':\n",
" continue\n",
"\n",
" images = convert_from_path('papers/' + path)\n",
"\n",
" # Loop pages\n",
" for image in images:\n",
"\n",
" pixel_values = detection_transform(image).unsqueeze(0).to(device)\n",
" with torch.no_grad():\n",
" outputs = model(pixel_values)\n",
"\n",
" id2label = model.config.id2label\n",
" id2label[len(model.config.id2label)] = \"no object\"\n",
" detected_tables = outputs_to_objects(outputs, image.size, id2label)\n",
"\n",
" # Loop table in page (if any)\n",
" for idx in range(len(detected_tables)):\n",
" cropped_table = image.crop(detected_tables[idx][\"bbox\"])\n",
" if detected_tables[idx][\"label\"] == 'table rotated':\n",
" cropped_table = cropped_table.rotate(270, expand=True)\n",
"\n",
" # TODO: what is the perfect threshold?\n",
" if detected_tables[idx]['score'] > 0.9:\n",
" # print(detected_tables[idx])\n",
" tables.append(cropped_table)\n",
" \n",
" print(len(tables))\n"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"tables[0].save(\n",
" 'table.pdf', \"PDF\", resolution=100.0, save_all=True, append_images=tables[1:]\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"import fitz\n",
"\n",
"# Open the PDF file\n",
"doc=fitz.open(\"table.pdf\")\n",
" \n",
"# Say, you like to save the first 6 pages, first page is 0\n",
"pages = [2,3,4,7,8,10,12,13,16,17,28,29,33,34,35,46,47,48,49,56,57,59,60,62,76,77,78,79,80,81,82,84,85,86,87,88,89,90,105,106,107,108,109,110,112,113,118,119,120,123,124,125,130,138,139,154,155,156,159,160,164,166,167,168]\n",
"pages = [(x - 1) for x in pages]\n",
"doc.select(pages)\n",
" \n",
"# Save the selected pages to a new PDF\n",
"doc.save(\"out_file_name.pdf\")\n",
"doc.close()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.9"
}
},
"nbformat": 4,
"nbformat_minor": 2
}