File size: 3,720 Bytes
e57fe3b
 
 
 
5a03966
e57fe3b
 
 
 
e820636
e57fe3b
 
 
 
 
 
 
e3701be
e57fe3b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
227b705
 
e57fe3b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6752b05
e57fe3b
 
 
 
 
 
 
2f4dacb
 
e57fe3b
d3d6d52
e57fe3b
d3d6d52
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
import csv
import html
import os
import random
import shutil
import sys
from itertools import islice

import tensorflow.compat.v1 as tf
import pandas as pd
import numpy as np
import IPython
import streamlit as st
#import subprocess
#from transformers import pipeline
from transformers import TapasTokenizer, TapasForQuestionAnswering

# Only let TensorFlow log at ERROR level.
tf.get_logger().setLevel('ERROR')

# Checkpoint name handed to both the model and the tokenizer loaders.
model_name = 'google/tapas-base-finetuned-wtq'

model = TapasForQuestionAnswering.from_pretrained(model_name, local_files_only=False)
tokenizer = TapasTokenizer.from_pretrained(model_name)

st.set_option('deprecation.showfileUploaderEncoding', False)

# --- Page header and CSV upload -------------------------------------------
st.title('Query your Table')
st.header('Upload CSV file')

uploaded_file = st.file_uploader("Choose your CSV file", type='csv')
# Reserved slot: first shows the raw table, later the highlighted one.
placeholder = st.empty()

if uploaded_file is not None:
    # Strip every comma from the table's string cells right after loading.
    data = pd.read_csv(uploaded_file).replace(',', '', regex=True)
    show_data = st.checkbox('Want to see the data?')
    if show_data:
        placeholder.dataframe(data)

# --- Query input -----------------------------------------------------------
st.header('Enter your queries')
input_queries = st.text_input('Type your queries separated by comma(,)', value='').split(',')

# One random "#RRGGBB" colour per query, plus the matching CSS declaration
# used to highlight that query's answer cells.
hex_digits = '0123456789ABCDEF'
colors1 = ['#' + ''.join(random.choice(hex_digits) for _ in range(6)) for _ in input_queries]
colors2 = ['background-color:{}; color: black'.format(color) for color in colors1]

def styling_specific_cell(x, tags, colors):
    """Build a CSS grid for ``DataFrame.style.apply(..., axis=None)``.

    Args:
        x: the DataFrame being styled; only its index and columns are used.
        tags: one iterable of ``(row, column)`` positions per query.
        colors: CSS strings, indexed in step with ``tags``.

    Returns:
        A DataFrame shaped like ``x`` holding ``''`` everywhere except at the
        tagged positions, which hold the CSS string of their query.
    """
    css_grid = pd.DataFrame('', index=x.index, columns=x.columns)
    # Flatten lazily so each colour lookup happens right before its write.
    tagged_cells = (
        (row_pos, col_pos, query_no)
        for query_no, cells in enumerate(tags)
        for row_pos, col_pos in cells
    )
    for row_pos, col_pos, query_no in tagged_cells:
        css_grid.iloc[row_pos, col_pos] = colors[query_no]
    return css_grid
    
def _format_answer(cells, aggregation):
    """Return the text displayed as the answer to one query.

    Args:
        cells: list of the selected cell values (strings) for the query.
        aggregation: one of "NONE", "SUM", "AVERAGE" or "COUNT".

    Returns:
        The number of selected cells for COUNT, the sum or the mean rounded
        to 2 decimals for SUM/AVERAGE, and the comma-joined cells otherwise.
        Non-numeric selections under SUM/AVERAGE fall back to the joined
        cells instead of raising.
    """
    if aggregation == "COUNT":
        return str(len(cells))
    if not cells:
        return 'No answer found'
    if aggregation in ("SUM", "AVERAGE"):
        try:
            numbers = [float(cell) for cell in cells]
        except ValueError:
            # The model picked text cells for a numeric aggregation; show
            # them as-is rather than crashing the app.
            return ", ".join(cells)
        if aggregation == "SUM":
            return str(sum(numbers))
        return str(np.round(np.mean(numbers), 2))
    return ", ".join(cells)


if st.button('Predict Answers'):
    # Normalise the comma-split input here: surrounding whitespace is noise,
    # and an empty text box splits into [''], which is not a real question.
    queries = [q.strip() for q in input_queries if q.strip()]

    if uploaded_file is None:
        # `data` only exists once a CSV has been uploaded.
        st.warning('Please upload a CSV file before predicting answers.')
    elif not queries:
        st.warning('Please enter at least one query.')
    else:
        with st.spinner('It will take approx a minute'):
            # The tokenizer is given the table with every cell cast to str.
            table = data.astype(str)
            inputs = tokenizer(table=table, queries=queries, padding='max_length',
                               truncation=True, return_tensors="pt")
            outputs = model(**inputs)
            predicted_answer_coordinates, predicted_aggregation_indices = tokenizer.convert_logits_to_predictions(
                inputs, outputs.logits.detach(), outputs.logits_aggregation.detach())

            id2aggregation = {0: "NONE", 1: "SUM", 2: "AVERAGE", 3: "COUNT"}
            aggregation_predictions_string = [id2aggregation[x] for x in predicted_aggregation_indices]

            # Keep the selected cells as a list per query so aggregation
            # works on the values directly instead of re-splitting a string.
            answer_cells = [
                [table.iat[coordinate] for coordinate in coordinates]
                for coordinates in predicted_answer_coordinates
            ]

        st.success('Done! Please check below the answers and its cells highlighted in table above')

        placeholder.dataframe(data.style.apply(styling_specific_cell, tags=predicted_answer_coordinates, colors=colors2, axis=None))

        # colors1 was generated per raw query; zip pairs the i-th kept query
        # with the i-th colour, matching the index used for cell highlighting.
        for query, cells, predicted_agg, c in zip(queries, answer_cells, aggregation_predictions_string, colors1):
            st.write('\n')
            # The query is user input rendered with unsafe_allow_html, so
            # escape it to keep characters like '<' from breaking the markup.
            st.markdown('<font color={} size=4>**{}**</font>'.format(c, html.escape(query)), unsafe_allow_html=True)
            st.write('\n')
            st.markdown('**>** ' + _format_answer(cells, predicted_agg))