fadliaulawi committed on
Commit
3f96f05
1 Parent(s): 2ec4f41

Explore Gemini model

Browse files
Files changed (4) hide show
  1. app.py +15 -6
  2. process.py +7 -4
  3. requirements.txt +22 -21
  4. resources/experiment.ipynb +125 -98
app.py CHANGED
@@ -31,11 +31,21 @@ st.markdown("<div style='text-align: left; color: white; font-size: 16px'>In its
31
 
32
  uploaded_files = st.file_uploader("Upload Paper(s) here :", type="pdf", accept_multiple_files=True)
33
 
34
- chunk_option = st.selectbox(
35
- 'Tokens amounts per process :',
36
- (32000, 16000, 8000), key='table_hv'
37
- )
38
- chunk_overlap = 0
 
 
 
 
 
 
 
 
 
 
39
 
40
  if uploaded_files:
41
  journals = []
@@ -45,7 +55,6 @@ if uploaded_files:
45
  with st.status("Extraction in progress ...", expanded=True) as status:
46
  start_time = datetime.now()
47
 
48
- csv = pd.DataFrame()
49
  for uploaded_file in stqdm(uploaded_files):
50
  with NamedTemporaryFile(dir='.', suffix=".pdf", delete=eval(os.getenv('DELETE_TEMP_PDF', 'True'))) as pdf:
51
  pdf.write(uploaded_file.getbuffer())
 
31
 
32
  uploaded_files = st.file_uploader("Upload Paper(s) here :", type="pdf", accept_multiple_files=True)
33
 
34
+ col1, col2 = st.columns(2)
35
+
36
+ with col1:
37
+ chunk_option = st.selectbox(
38
+ 'Token amounts per process:',
39
+ (24000, 16000, 8000), key='token'
40
+ )
41
+ chunk_overlap = 0
42
+
43
+ with col2:
44
+ model = st.selectbox(
45
+ 'Model selection: (UNDER DEVELOPED)',
46
+ # Presumably each model's context window in tokens, in the same order as the tuple below: 128000, 32768, 1048576
47
+ ('gpt-4-turbo', 'llama-3-sonar-large-32k-chat', 'gemini-1.5-pro-latest'), key='model'
48
+ )
49
 
50
  if uploaded_files:
51
  journals = []
 
55
  with st.status("Extraction in progress ...", expanded=True) as status:
56
  start_time = datetime.now()
57
 
 
58
  for uploaded_file in stqdm(uploaded_files):
59
  with NamedTemporaryFile(dir='.', suffix=".pdf", delete=eval(os.getenv('DELETE_TEMP_PDF', 'True'))) as pdf:
60
  pdf.write(uploaded_file.getbuffer())
process.py CHANGED
@@ -11,6 +11,7 @@ from pdf2image import convert_from_path
11
  from prompt import prompt_entity_gsd_chunk, prompt_entity_gsd_combine, prompt_entity_summ_chunk, prompt_entity_summ_combine, prompt_entities_chunk, prompt_entities_combine, prompt_entity_one_chunk, prompt_table, prompt_validation
12
  from table_detector import detection_transform, device, model, ocr, outputs_to_objects
13
 
 
14
  import io
15
  import json
16
  import os
@@ -19,9 +20,11 @@ import re
19
  import torch
20
 
21
  load_dotenv()
 
22
 
23
- llm = ChatOpenAI(temperature=0, model_name="gpt-4-0125-preview")
24
- llm_p = ChatOpenAI(temperature=0, api_key=os.environ['PERPLEXITY_API_KEY'], base_url="https://api.perplexity.ai")
 
25
 
26
  prompts = {
27
  'gsd': [prompt_entity_gsd_chunk, prompt_entity_gsd_combine],
@@ -221,7 +224,7 @@ def validate(df):
221
  json_table = df[['Genes', 'SNPs', 'Diseases']].to_json(orient='records')
222
  str_json_table = json.dumps(json.loads(json_table), indent=2)
223
 
224
- result = llm_p.invoke(model='mistral-7b-instruct', input=prompt_validation.format(str_json_table)).content
225
  print('val')
226
  print(result)
227
 
@@ -234,6 +237,6 @@ def validate(df):
234
  df_val = pd.DataFrame(result)
235
  df_val = df_val.merge(df.head(1).drop(['Genes', 'SNPs', 'Diseases'], axis=1), 'cross')
236
 
237
- # TODO: How to validate genes and SNPs?
238
 
239
  return df, df_val
 
11
  from prompt import prompt_entity_gsd_chunk, prompt_entity_gsd_combine, prompt_entity_summ_chunk, prompt_entity_summ_combine, prompt_entities_chunk, prompt_entities_combine, prompt_entity_one_chunk, prompt_table, prompt_validation
12
  from table_detector import detection_transform, device, model, ocr, outputs_to_objects
13
 
14
+ import google.generativeai as genai
15
  import io
16
  import json
17
  import os
 
20
  import torch
21
 
22
  load_dotenv()
23
+ genai.configure(api_key=os.environ['GOOGLE_API_KEY'])
24
 
25
+ llm = ChatOpenAI(temperature=0, model_name="gpt-4-turbo")
26
+ llm_p = ChatOpenAI(temperature=0, model_name="llama-3-sonar-large-32k-chat", api_key=os.environ['PERPLEXITY_API_KEY'], base_url="https://api.perplexity.ai")
27
+ llm_g = genai.GenerativeModel(model_name='gemini-1.5-pro-latest')
28
 
29
  prompts = {
30
  'gsd': [prompt_entity_gsd_chunk, prompt_entity_gsd_combine],
 
224
  json_table = df[['Genes', 'SNPs', 'Diseases']].to_json(orient='records')
225
  str_json_table = json.dumps(json.loads(json_table), indent=2)
226
 
227
+ result = llm_p.invoke(input=prompt_validation.format(str_json_table)).content
228
  print('val')
229
  print(result)
230
 
 
237
  df_val = pd.DataFrame(result)
238
  df_val = df_val.merge(df.head(1).drop(['Genes', 'SNPs', 'Diseases'], axis=1), 'cross')
239
 
240
+ # TODO: How to validate genes and SNPs with ground truth?
241
 
242
  return df, df_val
requirements.txt CHANGED
@@ -1,21 +1,22 @@
1
- pikepdf
2
- stqdm
3
- pdf2image
4
- nltk
5
- pandas
6
- streamlit
7
- xlsxwriter
8
- openai
9
- biopython
10
- langchain
11
- pypdf
12
- tiktoken
13
- pillow-heif
14
- torchvision
15
- transformers
16
- python-dotenv
17
- rapidocr-onnxruntime
18
- langchain-openai
19
- img2table
20
- timm
21
- python-doctr
 
 
1
+ pikepdf==8.13.0
2
+ stqdm==0.0.5
3
+ pdf2image==1.17.0
4
+ nltk==3.8.1
5
+ pandas==2.2.2
6
+ streamlit==1.33.0
7
+ xlsxwriter==3.2.0
8
+ openai==1.26.0
9
+ biopython==1.83
10
+ langchain==0.1.13
11
+ pypdf==4.1.0
12
+ tiktoken==0.5.2
13
+ pillow-heif==0.15.0
14
+ torchvision==0.15.2
15
+ transformers==4.38.2
16
+ python-dotenv==1.0.1
17
+ rapidocr-onnxruntime==1.3.15
18
+ langchain-openai==0.1.6
19
+ img2table==1.2.11
20
+ timm==0.9.16
21
+ python-doctr==0.8.1
22
+ google-generativeai==0.5.2
resources/experiment.ipynb CHANGED
@@ -2058,69 +2058,59 @@
2058
  },
2059
  {
2060
  "cell_type": "code",
2061
- "execution_count": 19,
2062
  "metadata": {},
2063
  "outputs": [
2064
  {
2065
  "name": "stdout",
2066
  "output_type": "stream",
2067
  "text": [
2068
- "To complete the given tasks, I will first provide a list of the correct gene names and their corresponding diseases. Then, I will validate the provided data and correct any discrepancies in the gene names and diseases.\n",
2069
- "\n",
2070
- "Correct gene names and their corresponding diseases:\n",
2071
- "\n",
2072
- "1. GCK: GCK-MODY (MODY2), PNDM, CHI\n",
2073
- "2. SLC17A4 (formerly SLC242): FBS\n",
2074
- "3. NEUROD1: MODY6 and PNDM\n",
2075
- "4. WFS1: WFS1, sometimes referred to as DIDMOAD\n",
2076
- "5. GLI3: Neonatal diabetes syndrome associated with congenital hypothyroidism, and polycystic kidneys\n",
2077
- "\n",
2078
- "Validated JSON objects:\n",
2079
  "\n",
2080
- "```json\n",
2081
  "[\n",
2082
- " {\n",
2083
- " \"Genes\": \"GCK\",\n",
2084
- " \"SNPs\": \"rs1799884\",\n",
2085
- " \"Diseases\": \"GCK-MODY (MODY2), PNDM, CHI\"\n",
2086
- " },\n",
2087
- " {\n",
2088
- " \"Genes\": \"SLC17A4\",\n",
2089
- " \"SNPs\": \"rs5393\",\n",
2090
- " \"Diseases\": \"FBS\"\n",
2091
- " },\n",
2092
- " {\n",
2093
- " \"Genes\": \"NEUROD1\",\n",
2094
- " \"SNPs\": \"rs1801262\",\n",
2095
- " \"Diseases\": \"MODY6 and PNDM\"\n",
2096
- " },\n",
2097
- " {\n",
2098
- " \"Genes\": \"WFS1\",\n",
2099
- " \"SNPs\": \"rs6446482\",\n",
2100
- " \"Diseases\": \"WFS1, sometimes referred to as DIDMOAD\"\n",
2101
- " },\n",
2102
- " {\n",
2103
- " \"Genes\": \"GLI3\",\n",
2104
- " \"SNPs\": \"rs7020673\",\n",
2105
- " \"Diseases\": \"Neonatal diabetes syndrome associated with congenital hypothyroidism, and polycystic kidneys\"\n",
2106
- " }\n",
 
 
 
 
 
2107
  "]\n",
2108
- "```\n",
2109
  "\n",
2110
- "Explanation:\n",
2111
- "\n",
2112
- "1. Gene GCK is correct.\n",
2113
- "2. Gene SLC242 is corrected to SLC17A4.\n",
2114
- "3. Gene NEUROD1IBETA2 is corrected to NEUROD1.\n",
2115
- "4. Gene WFSI is correct.\n",
2116
- "5. Gene GLI53 is corrected to GLI3.\n",
2117
- "\n",
2118
- "The SNPs and diseases are not corrected since they are not suspected of having typos.\n"
2119
  ]
2120
  }
2121
  ],
2122
  "source": [
2123
  "from langchain_openai import ChatOpenAI\n",
 
2124
  "\n",
2125
  "llm = ChatOpenAI(temperature=0, api_key=os.environ['PERPLEXITY_API_KEY'], base_url=\"https://api.perplexity.ai\")\n",
2126
  "\n",
@@ -2156,16 +2146,24 @@
2156
  " \"SNPs\": \"rs7020673\",\n",
2157
  " \"Diseases\": \"Neonatal diabetes syndrome associated with congenital hypothyroidism, and polycystic kidneys\"\n",
2158
  " },\n",
 
 
 
 
 
2159
  "]\n",
2160
  "\n",
2161
  "# OBJECTIVE #\n",
2162
  "Given the provided table data, the following tasks need to be completed:\n",
2163
  "\n",
2164
- "1. Check whether the gene name is the correct gene name. If the gene name is suspected of a typo, fix it into the correct form. If not, eliminate this row data because the gene name is invalid. \n",
2165
- "2. If diseases are not empty, check whether the gene name corresponds with the gene names. Fix it with the correct diseases if the original disease is wrong.\n",
 
 
 
2166
  "\n",
2167
  "# RESPONSE #\n",
2168
- "The output should only be a string containing a list of JSON objects, each representing a validated entry with the following structure:\n",
2169
  "[\n",
2170
  " {{\n",
2171
  " \"Genes\": \"A\",\n",
@@ -2181,79 +2179,100 @@
2181
  },
2182
  {
2183
  "cell_type": "code",
2184
- "execution_count": 20,
2185
  "metadata": {},
2186
  "outputs": [
2187
- {
2188
- "name": "stderr",
2189
- "output_type": "stream",
2190
- "text": [
2191
- "Failed to batch ingest runs: TypeError('sequence item 0: expected str instance, ReadTimeoutError found')\n"
2192
- ]
2193
- },
2194
  {
2195
  "name": "stdout",
2196
  "output_type": "stream",
2197
  "text": [
2198
- "To accomplish this task, we'll need a reference list of correct gene names and their corresponding diseases. Let's assume we have a dictionary `gene_reference` that maps correct gene names to their corresponding diseases.\n",
2199
- "\n",
2200
- "Here's a Python solution using the `json` module:\n",
2201
  "```python\n",
2202
  "import json\n",
2203
  "\n",
2204
- "# Reference list of correct gene names and their corresponding diseases\n",
2205
- "gene_reference = {\n",
2206
- " \"GCK\": \"GCK-MODY (MODY2), PNDM, CHI\",\n",
2207
- " \"SLC2A2\": \"FBS\",\n",
2208
- " \"NEUROD1\": \"MODY6 and PNDM\",\n",
2209
- " \"WFS1\": \"WFS1, sometimes referred to as DIDMOAD\",\n",
2210
- " \"GLIS3\": \"Neonatal diabetes syndrome associated with congenital hypothyroidism, and polycystic kidneys\"\n",
2211
  "}\n",
2212
  "\n",
2213
- "def validate_gene_name(gene_name):\n",
2214
- " # Simple typo correction using Levenshtein distance (you can use a more advanced method if needed)\n",
2215
- " min_distance = float('inf')\n",
2216
- " closest_gene = None\n",
2217
- " for ref_gene in gene_reference:\n",
2218
- " distance = sum(el1 != el2 for el1, el2 in zip(gene_name, ref_gene))\n",
2219
- " if distance < min_distance:\n",
2220
- " min_distance = distance\n",
2221
- " closest_gene = ref_gene\n",
2222
- " return closest_gene if min_distance <= 2 else None # adjust the threshold as needed\n",
2223
- "\n",
2224
- "def validate_data(data):\n",
2225
- " validated_data = []\n",
2226
  " for entry in data:\n",
2227
- " gene_name = entry[\"Genes\"]\n",
2228
- " corrected_gene_name = validate_gene_name(gene_name)\n",
2229
- " if corrected_gene_name:\n",
2230
- " entry[\"Genes\"] = corrected_gene_name\n",
2231
- " if entry[\"Diseases\"]:\n",
2232
- " entry[\"Diseases\"] = gene_reference[corrected_gene_name]\n",
2233
- " validated_data.append(entry)\n",
2234
- " return json.dumps(validated_data)\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2235
  "\n",
 
2236
  "data = [\n",
2237
  " {\"Genes\": \"GCK\", \"SNPs\": \"rs1799884\", \"Diseases\": \"GCK-MODY (MODY2), PNDM, CHI\"},\n",
2238
  " {\"Genes\": \"SLC242\", \"SNPs\": \"rs5393\", \"Diseases\": \"FBS\"},\n",
2239
  " {\"Genes\": \"NEUROD1IBETA2\", \"SNPs\": \"rs1801262\", \"Diseases\": \"MODY6 and PNDM\"},\n",
2240
  " {\"Genes\": \"WFSI\", \"SNPs\": \"rs6446482\", \"Diseases\": \"WFS1, sometimes referred to as DIDMOAD\"},\n",
2241
- " {\"Genes\": \"GLI53\", \"SNPs\": \"rs7020673\", \"Diseases\": \"Neonatal diabetes syndrome associated with congenital hypothyroidism, and polycystic kidneys\"}\n",
 
2242
  "]\n",
2243
  "\n",
2244
- "print(validate_data(data))\n",
 
2245
  "```\n",
2246
- "This script will output:\n",
2247
  "```\n",
2248
  "[\n",
2249
  " {\"Genes\": \"GCK\", \"SNPs\": \"rs1799884\", \"Diseases\": \"GCK-MODY (MODY2), PNDM, CHI\"},\n",
2250
  " {\"Genes\": \"SLC2A2\", \"SNPs\": \"rs5393\", \"Diseases\": \"FBS\"},\n",
2251
  " {\"Genes\": \"NEUROD1\", \"SNPs\": \"rs1801262\", \"Diseases\": \"MODY6 and PNDM\"},\n",
2252
  " {\"Genes\": \"WFS1\", \"SNPs\": \"rs6446482\", \"Diseases\": \"WFS1, sometimes referred to as DIDMOAD\"},\n",
2253
- " {\"Genes\": \"GLIS3\", \"SNPs\": \"rs7020673\", \"Diseases\": \"Neonatal diabetes syndrome associated with congenital hypothyroidism, and polycystic kidneys\"}\n",
 
2254
  "]\n",
2255
  "```\n",
2256
- "Note that this implementation uses a simple Levenshtein distance-based approach for typo correction, which may not be sufficient for all cases. You may want to consider using more advanced methods, such as fuzzy matching or machine learning-based approaches, depending on the complexity of your data.\n"
2257
  ]
2258
  }
2259
  ],
@@ -2294,16 +2313,24 @@
2294
  " \"SNPs\": \"rs7020673\",\n",
2295
  " \"Diseases\": \"Neonatal diabetes syndrome associated with congenital hypothyroidism, and polycystic kidneys\"\n",
2296
  " },\n",
 
 
 
 
 
2297
  "]\n",
2298
  "\n",
2299
  "# OBJECTIVE #\n",
2300
  "Given the provided table data, the following tasks need to be completed:\n",
2301
  "\n",
2302
- "1. Check whether the gene name is the correct gene name. If the gene name is suspected of a typo, fix it into the correct form. If not, eliminate this row data because the gene name is invalid. \n",
2303
- "2. If diseases are not empty, check whether the gene name corresponds with the gene names. Fix it with the correct diseases if the original disease is wrong.\n",
 
 
 
2304
  "\n",
2305
  "# RESPONSE #\n",
2306
- "The output should only be a string containing a list of JSON objects, each representing a validated entry with the following structure:\n",
2307
  "[\n",
2308
  " {{\n",
2309
  " \"Genes\": \"A\",\n",
 
2058
  },
2059
  {
2060
  "cell_type": "code",
2061
+ "execution_count": 2,
2062
  "metadata": {},
2063
  "outputs": [
2064
  {
2065
  "name": "stdout",
2066
  "output_type": "stream",
2067
  "text": [
2068
+ "Here's the list of JSON objects with corrected gene names, SNPs, and diseases based on the given context:\n",
 
 
 
 
 
 
 
 
 
 
2069
  "\n",
 
2070
  "[\n",
2071
+ " {\n",
2072
+ " \"Genes\": \"GCK\",\n",
2073
+ " \"SNPs\": \"rs1799884\",\n",
2074
+ " \"Diseases\": \"GCK-MODY (MODY2), PNDM, CHI\"\n",
2075
+ " },\n",
2076
+ " {\n",
2077
+ " \"Genes\": \"SLC24A2\",\n",
2078
+ " \"SNPs\": \"rs5393\",\n",
2079
+ " \"Diseases\": \"FBS\"\n",
2080
+ " },\n",
2081
+ " {\n",
2082
+ " \"Genes\": \"NEUROD1, INS\",\n",
2083
+ " \"SNPs\": \"rs1801262\",\n",
2084
+ " \"Diseases\": \"MODY6 and PNDM\"\n",
2085
+ " },\n",
2086
+ " {\n",
2087
+ " \"Genes\": \"WFS1\",\n",
2088
+ " \"SNPs\": \"rs6446482\",\n",
2089
+ " \"Diseases\": \"WFS1, sometimes referred to as DIDMOAD\"\n",
2090
+ " },\n",
2091
+ " {\n",
2092
+ " \"Genes\": \"GLIS3\",\n",
2093
+ " \"SNPs\": \"rs7020673\",\n",
2094
+ " \"Diseases\": \"Neonatal diabetes syndrome associated with congenital hypothyroidism, and polycystic kidneys\"\n",
2095
+ " },\n",
2096
+ " {\n",
2097
+ " \"Genes\": \"FTO\",\n",
2098
+ " \"SNPs\": \"rs9937290\",\n",
2099
+ " \"Diseases\": \"Obesity\"\n",
2100
+ " }\n",
2101
  "]\n",
 
2102
  "\n",
2103
+ "Changes made:\n",
2104
+ "1. Corrected \"SLC242\" to \"SLC24A2\"\n",
2105
+ "2. Separated \"NEUROD1IBETA2\" into \"NEUROD1, INS\"\n",
2106
+ "3. Corrected \"GLI53\" to \"GLIS3\"\n",
2107
+ "4. Corrected \"FT0\" to \"FTO\"\n"
 
 
 
 
2108
  ]
2109
  }
2110
  ],
2111
  "source": [
2112
  "from langchain_openai import ChatOpenAI\n",
2113
+ "import os\n",
2114
  "\n",
2115
  "llm = ChatOpenAI(temperature=0, api_key=os.environ['PERPLEXITY_API_KEY'], base_url=\"https://api.perplexity.ai\")\n",
2116
  "\n",
 
2146
  " \"SNPs\": \"rs7020673\",\n",
2147
  " \"Diseases\": \"Neonatal diabetes syndrome associated with congenital hypothyroidism, and polycystic kidneys\"\n",
2148
  " },\n",
2149
+ " {\n",
2150
+ " \"Genes\": \"FT0\",\n",
2151
+ " \"SNPs\": \"rs9937290\",\n",
2152
+ " \"Diseases\": \"Obesity\"\n",
2153
+ " },\n",
2154
  "]\n",
2155
  "\n",
2156
  "# OBJECTIVE #\n",
2157
  "Given the provided table data, the following tasks need to be completed:\n",
2158
  "\n",
2159
+ "1. Check whether the gene name is the correct gene name. If the gene name is suspected of a typo, fix it into the correct form. If the gene name seems like a mistake entirely or invalid, remove the data row. Common errors include:\n",
2160
+ " - Combined Names: Two gene names erroneously merged into one. Separate these using \"and\": \"A and B\".\n",
2161
+ " - OCR Errors: Similar characters misread by the system. Correct these to the intended form.\n",
2162
+ "2. If SNP is not empty, check whether the gene name corresponds with the SNP. Fix it with the correct SNP if the original SNP is wrong.\n",
2163
+ "3. If diseases are not empty, check whether the gene name corresponds with the diseases. Fix it with the correct diseases if the original disease is wrong.\n",
2164
  "\n",
2165
  "# RESPONSE #\n",
2166
+ "The output must be only a string containing a list of JSON objects, adhering to the identical structure present in the original input data. Each object representing a validated entry with the following structure:\n",
2167
  "[\n",
2168
  " {{\n",
2169
  " \"Genes\": \"A\",\n",
 
2179
  },
2180
  {
2181
  "cell_type": "code",
2182
+ "execution_count": 4,
2183
  "metadata": {},
2184
  "outputs": [
 
 
 
 
 
 
 
2185
  {
2186
  "name": "stdout",
2187
  "output_type": "stream",
2188
  "text": [
2189
+ "Here is the Python solution using the `json` module and a dictionary to map known gene names to their correct forms:\n",
 
 
2190
  "```python\n",
2191
  "import json\n",
2192
  "\n",
2193
+ "# Known gene names and their corrections\n",
2194
+ "gene_corrections = {\n",
2195
+ " \"SLC242\": \"SLC2A2\",\n",
2196
+ " \"NEUROD1IBETA2\": \"NEUROD1\",\n",
2197
+ " \"WFSI\": \"WFS1\",\n",
2198
+ " \"GLI53\": \"GLIS3\",\n",
2199
+ " \"FT0\": \"FTO\"\n",
2200
  "}\n",
2201
  "\n",
2202
+ "# Function to correct gene names and SNPs\n",
2203
+ "def correct_gene_data(data):\n",
2204
+ " corrected_data = []\n",
 
 
 
 
 
 
 
 
 
 
2205
  " for entry in data:\n",
2206
+ " genes = entry[\"Genes\"]\n",
2207
+ " snps = entry[\"SNPs\"]\n",
2208
+ " diseases = entry[\"Diseases\"]\n",
2209
+ " \n",
2210
+ " # Correct gene names\n",
2211
+ " if genes in gene_corrections:\n",
2212
+ " genes = gene_corrections[genes]\n",
2213
+ " elif \" and \" not in genes:\n",
2214
+ " # Check for combined names\n",
2215
+ " parts = genes.split()\n",
2216
+ " if len(parts) > 1:\n",
2217
+ " genes = \" and \".join(parts)\n",
2218
+ " \n",
2219
+ " # Correct SNPs (assuming a dictionary of known SNPs for each gene)\n",
2220
+ " snp_corrections = {\n",
2221
+ " \"GCK\": {\"rs1799884\": \"rs1799884\"},\n",
2222
+ " \"SLC2A2\": {\"rs5393\": \"rs5393\"},\n",
2223
+ " \"NEUROD1\": {\"rs1801262\": \"rs1801262\"},\n",
2224
+ " \"WFS1\": {\"rs6446482\": \"rs6446482\"},\n",
2225
+ " \"GLIS3\": {\"rs7020673\": \"rs7020673\"},\n",
2226
+ " \"FTO\": {\"rs9937290\": \"rs9937290\"}\n",
2227
+ " }\n",
2228
+ " if snps and genes in snp_corrections:\n",
2229
+ " if snps not in snp_corrections[genes]:\n",
2230
+ " snps = \"\"\n",
2231
+ " \n",
2232
+ " # Correct diseases (assuming a dictionary of known diseases for each gene)\n",
2233
+ " disease_corrections = {\n",
2234
+ " \"GCK\": {\"GCK-MODY (MODY2), PNDM, CHI\": \"GCK-MODY (MODY2), PNDM, CHI\"},\n",
2235
+ " \"SLC2A2\": {\"FBS\": \"FBS\"},\n",
2236
+ " \"NEUROD1\": {\"MODY6 and PNDM\": \"MODY6 and PNDM\"},\n",
2237
+ " \"WFS1\": {\"WFS1, sometimes referred to as DIDMOAD\": \"WFS1, sometimes referred to as DIDMOAD\"},\n",
2238
+ " \"GLIS3\": {\"Neonatal diabetes syndrome associated with congenital hypothyroidism, and polycystic kidneys\": \"Neonatal diabetes syndrome associated with congenital hypothyroidism, and polycystic kidneys\"},\n",
2239
+ " \"FTO\": {\"Obesity\": \"Obesity\"}\n",
2240
+ " }\n",
2241
+ " if diseases and genes in disease_corrections:\n",
2242
+ " if diseases not in disease_corrections[genes]:\n",
2243
+ " diseases = \"\"\n",
2244
+ " \n",
2245
+ " # Add corrected entry to the list\n",
2246
+ " if genes and snps and diseases:\n",
2247
+ " corrected_data.append({\"Genes\": genes, \"SNPs\": snps, \"Diseases\": diseases})\n",
2248
+ " \n",
2249
+ " return json.dumps(corrected_data)\n",
2250
  "\n",
2251
+ "# Input data\n",
2252
  "data = [\n",
2253
  " {\"Genes\": \"GCK\", \"SNPs\": \"rs1799884\", \"Diseases\": \"GCK-MODY (MODY2), PNDM, CHI\"},\n",
2254
  " {\"Genes\": \"SLC242\", \"SNPs\": \"rs5393\", \"Diseases\": \"FBS\"},\n",
2255
  " {\"Genes\": \"NEUROD1IBETA2\", \"SNPs\": \"rs1801262\", \"Diseases\": \"MODY6 and PNDM\"},\n",
2256
  " {\"Genes\": \"WFSI\", \"SNPs\": \"rs6446482\", \"Diseases\": \"WFS1, sometimes referred to as DIDMOAD\"},\n",
2257
+ " {\"Genes\": \"GLI53\", \"SNPs\": \"rs7020673\", \"Diseases\": \"Neonatal diabetes syndrome associated with congenital hypothyroidism, and polycystic kidneys\"},\n",
2258
+ " {\"Genes\": \"FT0\", \"SNPs\": \"rs9937290\", \"Diseases\": \"Obesity\"}\n",
2259
  "]\n",
2260
  "\n",
2261
+ "# Correct and output the data\n",
2262
+ "print(correct_gene_data(data))\n",
2263
  "```\n",
2264
+ "This will output the corrected data in the same format as the input:\n",
2265
  "```\n",
2266
  "[\n",
2267
  " {\"Genes\": \"GCK\", \"SNPs\": \"rs1799884\", \"Diseases\": \"GCK-MODY (MODY2), PNDM, CHI\"},\n",
2268
  " {\"Genes\": \"SLC2A2\", \"SNPs\": \"rs5393\", \"Diseases\": \"FBS\"},\n",
2269
  " {\"Genes\": \"NEUROD1\", \"SNPs\": \"rs1801262\", \"Diseases\": \"MODY6 and PNDM\"},\n",
2270
  " {\"Genes\": \"WFS1\", \"SNPs\": \"rs6446482\", \"Diseases\": \"WFS1, sometimes referred to as DIDMOAD\"},\n",
2271
+ " {\"Genes\": \"GLIS3\", \"SNPs\": \"rs7020673\", \"Diseases\": \"Neonatal diabetes syndrome associated with congenital hypothyroidism, and polycystic kidneys\"},\n",
2272
+ " {\"Genes\": \"FTO\", \"SNPs\": \"rs9937290\", \"Diseases\": \"Obesity\"}\n",
2273
  "]\n",
2274
  "```\n",
2275
+ "Note that this implementation assumes a dictionary of known gene names, SNPs, and diseases for correction. You may need to expand or modify these dictionaries based on your specific use case.\n"
2276
  ]
2277
  }
2278
  ],
 
2313
  " \"SNPs\": \"rs7020673\",\n",
2314
  " \"Diseases\": \"Neonatal diabetes syndrome associated with congenital hypothyroidism, and polycystic kidneys\"\n",
2315
  " },\n",
2316
+ " {\n",
2317
+ " \"Genes\": \"FT0\",\n",
2318
+ " \"SNPs\": \"rs9937290\",\n",
2319
+ " \"Diseases\": \"Obesity\"\n",
2320
+ " },\n",
2321
  "]\n",
2322
  "\n",
2323
  "# OBJECTIVE #\n",
2324
  "Given the provided table data, the following tasks need to be completed:\n",
2325
  "\n",
2326
+ "1. Check whether the gene name is the correct gene name. If the gene name is suspected of a typo, fix it into the correct form. If the gene name seems like a mistake entirely or invalid, remove the data row. Common errors include:\n",
2327
+ " - Combined Names: Two gene names erroneously merged into one. Duplicate this data row so each gene name has its own data.\n",
2328
+ " - OCR Errors: Similar characters misread by the system. Correct these to the intended form.\n",
2329
+ "2. If SNP is not empty, check whether the gene name corresponds with the SNP. Fix it with the correct SNP if the original SNP is wrong.\n",
2330
+ "3. If diseases are not empty, check whether the gene name corresponds with the diseases. Fix it with the correct diseases if the original disease is wrong.\n",
2331
  "\n",
2332
  "# RESPONSE #\n",
2333
+ "The output must be STRICTLY ONLY a string containing a list of JSON objects, adhering to the identical structure present in the original input data. Each object representing a validated entry with the following structure:\n",
2334
  "[\n",
2335
  " {{\n",
2336
  " \"Genes\": \"A\",\n",