vbzvibin
/

TAPAS_Text2SQL

Model card Files Files and versions Community

vbzvibin commited on Jun 7, 2023

Commit

b072237

•

1 Parent(s): b8d0ad5

Upload 4 files

Browse files

Files changed (4) hide show

data.csv +11 -0
pasta.py +178 -0
qnacsv.csv +0 -0
requirements.txt +13 -0

data.csv ADDED Viewed

	@@ -0,0 +1,11 @@

+Patient_Name,Country,Disease,CUI,Snomed,Oxygen_Rate,Med_Type,Admission_Date
+Rahul,India,Diabetes,CUI00234,SNO34672,90,Medicare,23-09-2022
+Kumar ,Sri Lanka,Severe Fever,CUI00235,SNO34673,91,Medicaid,04-03-2022
+Ricky ,Australia,Edema,CUI00236,SNO34674,92,Commercial,02-09-2022
+Jayasuriya,Sri Lanka,Cardiac Arrest,CUI00237,SNO34675,93,Medicare,13-01-2022
+Mahela,Sri Lanka,Alzheimer,CUI00238,SNO34676,94,Medicaid,07-03-2022
+Kohli,India,Cancer,CUI00239,SNO34677,95,Commercial,05-05-2022
+Inzamam,Pakistan,Pneumonia,CUI00240,SNO34678,96,Medicare,04-03-2022
+Jacques,South Africa,Severe Fever,CUI00241,SNO34679,97,Medicaid,02-09-2022
+Saurav,India,Edema,CUI00242,SNO34680,98,Medicare,13-01-2022
+David,India,Cardiac Arrest,CUI00243,SNO34681,99,Medicaid,07-03-2022

pasta.py ADDED Viewed

	@@ -0,0 +1,178 @@

+# -*- coding: utf-8 -*-
+"""
+Created on Fri May 26 14:07:22 2023
+@author: vibin
+"""
+import streamlit as st
+from pandasql import sqldf
+import pandas as pd
+import re
+from typing import List
+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
+import re
+@st.cache_resource()
+def tapas_model():
+    return(pipeline(task="table-question-answering", model="google/tapas-base-finetuned-wtq"))
+@st.cache_resource()
+def prepare_input(question: str, table: List[str]):
+    table_prefix = "table:"
+    question_prefix = "question:"
+    join_table = ",".join(table)
+    inputs = f"{question_prefix} {question} {table_prefix} {join_table}"
+    input_ids = tokenizer(inputs, max_length=512, return_tensors="pt").input_ids
+    return input_ids
+@st.cache_resource()
+def inference(question: str, table: List[str]) -> str:
+    input_data = prepare_input(question=question, table=table)
+    input_data = input_data.to(model.device)
+    outputs = model.generate(inputs=input_data, num_beams=10, top_k=10, max_length=700)
+    result = tokenizer.decode(token_ids=outputs[0], skip_special_tokens=True)
+    return result
+@st.cache_resource()
+def tokmod(tok_md):
+    tkn = AutoTokenizer.from_pretrained(tok_md)
+    mdl = AutoModelForSeq2SeqLM.from_pretrained(tok_md)
+    return(tkn,mdl)
+### Main
+nav = st.sidebar.radio("Navigation",["TAPAS","Text2SQL"])
+if nav == "TAPAS":
+    col1 , col2, col3 = st.columns(3)
+    col2.title("TAPAS")
+    col3 , col4 = st.columns([3,12])
+    col4.text("Tabular Data Text Extraction using text")
+    table = pd.read_csv("data.csv")
+    table = table.astype(str)
+    st.text("DataSet - ")
+    st.dataframe(table,width=3000,height= 400)
+    st.title("")
+    lst_q = ["Which country has low medicare","Who are the patients from india","Who are the patients from india","Patients who have Edema","CUI code for diabetes patients","Patients having oxygen less than 94 but 91"]
+    v2 = st.selectbox("Choose your text",lst_q,index = 0)
+    st.title("")
+    sql_txt = st.text_area("TAPAS Input",v2)
+    if st.button("Predict"):
+        tqa = tapas_model()
+        txt_sql = tqa(table=table, query=sql_txt)["answer"]
+        st.text("Output - ")
+        st.success(f"{txt_sql}")
+        # st.write(all_students)
+elif nav == "Text2SQL":
+    ### Function
+    col1 , col2, col3 = st.columns(3)
+    col2.title("Text2SQL")
+    col3 , col4 = st.columns([1,20])
+    col4.text("Text will be converted to SQL Query and can extract the data from DataSet")
+    # Import Data
+    df_qna = pd.read_csv("qnacsv.csv", encoding= 'unicode_escape')
+    st.title("")
+    st.text("DataSet - ")
+    st.dataframe(df_qna,width=3000,height= 500)
+    st.title("")
+    lst_q = ["what interface is measure indicator code = 72_HR_ABX and version is 1 and source is TD", "get class code with measure = 72_HR_ABX", "get sum of version for Class_Code is Antibiotic Stewardship", "what interface is measure indicator code = 72_HR_ABX"]
+    v2 = st.selectbox("Choose your text",lst_q,index = 0)
+    st.title("")
+    sql_txt = st.text_area("Text for SQL Conversion",v2)
+    if st.button("Predict"):
+        tok_model = "juierror/flan-t5-text2sql-with-schema"
+        tokenizer,model = tokmod(tok_model)
+        # text = "what interface is measure indicator code = 72_HR_ABX and version is 1 and source is TD"
+        table_name = "df_qna"
+        table_col = ["Type","Class_Code", "Version","Measure_Indicator_Code","Measure_Indicator_Name","Description_Definition", "Source", "Interfaces"]
+        txt_sql = inference(question=sql_txt, table=table_col)
+        ### SQL Modification
+        sql_avg = ["AVG","COUNT","DISTINCT","MAX","MIN","SUM"]
+        txt_sql = txt_sql.replace("table",table_name)
+        sql_quotes = []
+        for match in re.finditer("=",txt_sql):
+            new_txt = txt_sql[match.span()[1]+1:]
+            try:
+                match2 = re.search("AND",new_txt)
+                sql_quotes.append((new_txt[:match2.span()[0]]).strip())
+            except:
+                sql_quotes.append(new_txt.strip())
+        for i in sql_quotes:
+            qts = "'" + i + "'"
+            txt_sql = txt_sql.replace(i, qts)
+        for r in sql_avg:
+            if r in txt_sql:
+                rr = re.search(rf"{r} (\w+)", txt_sql)
+                init = " " + rr[1]
+                qts = "(" + rr[1] + ")"
+                txt_sql = txt_sql.replace(init,qts)
+            else:
+                pass
+        st.success(f"{txt_sql}")
+        all_students = sqldf(txt_sql)
+        st.text("Output - ")
+        st.write(all_students)

qnacsv.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

requirements.txt ADDED Viewed

	@@ -0,0 +1,13 @@

+pip
+Cmake
+wheel
+pandas
+jinja2==3.1.2
+pandasql
+Cython
+datasets
+huggingface-hub
+tapas
+torch
+transformers
+streamlit