bglearning committed on
Commit eb4710d
0 Parent(s):

First version

Files changed (3)
  1. run_tapas_viz.py +23 -0
  2. tapas-styles.css +38 -0
  3. tapas_visualizer.py +139 -0
run_tapas_viz.py ADDED
@@ -0,0 +1,23 @@
import pandas as pd

from transformers import TapasTokenizer

from tapas_visualizer import TapasVisualizer


def main():
    tapas_tokenizer = TapasTokenizer.from_pretrained("google/tapas-base")
    viz = TapasVisualizer(tapas_tokenizer)

    data = {
        "Actors": ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"],
        "Age": ["56", "45", "59"],
        "Number of movies": ["87", "53", "69"],
    }

    table = pd.DataFrame.from_dict(data)
    print(viz(table))


if __name__ == '__main__':
    main()
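
The script prints the generated HTML document to stdout. To actually view the visualization, one option (not part of this commit) is to redirect the output into a file and open it in a browser; a minimal sketch, with a hypothetical output path:

    html = viz(table)
    with open("tapas_viz.html", "w") as f:  # hypothetical output path
        f.write(html)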
tapas-styles.css ADDED
@@ -0,0 +1,38 @@
.tokenized-text {
    width: 100%;
    padding: 2rem;
    max-height: 400px;
    overflow-y: auto;
    box-sizing: border-box;
    line-height: 4rem; /* Lots of space between lines */
    font-family: "Roboto Light", "Ubuntu Light", "Ubuntu", monospace;
    box-shadow: 2px 2px 2px rgba(0, 0, 0, 0.2);
    background-color: rgba(0, 0, 0, 0.01);
    letter-spacing: 2px; /* Give some extra separation between chars */
}

.non-token {
    /* White space and other things the tokenizer ignores */
    white-space: pre;
    letter-spacing: 4px;
    border-top: 1px solid #A0A0A0; /* A gentle border on top and bottom makes tabs more obvious */
    border-bottom: 1px solid #A0A0A0;
    line-height: 1rem;
    height: calc(100% - 2px);
}

.token {
    white-space: pre;
    position: relative;
    color: black;
    letter-spacing: 2px;
}

.even-token {
    background: #DCDCDC;
    border: 1px solid #DCDCDC;
}
.odd-token {
    background: #A0A0A0;
    border: 1px solid #A0A0A0;
}
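
These classes are what TapasVisualizer.style_span attaches to each piece of text; an illustrative trace (not output included in this commit) of how a token lands in the markup:

    # viz.style_span("Brad", ["token", "even-token"]) returns:
    # '<span class="token even-token" >Brad</span>'
    # Consecutive tokens alternate between the two grey shades
    # (#DCDCDC for even, #A0A0A0 for odd); untokenized gaps get .non-token.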
tapas_visualizer.py ADDED
@@ -0,0 +1,139 @@
import os
from collections import defaultdict
from typing import List

import pandas as pd

dirname = os.path.dirname(__file__)
css_filename = os.path.join(dirname, "tapas-styles.css")
with open(css_filename) as f:
    css = f.read()


def HTMLBody(table_html: str, css_styles=css) -> str:
    """
    Generates the full html with css from the table html

    Args:
        table_html (:obj:`str`):
            The html of the table, assumed to be a string of html elements

        css_styles (:obj:`str`, `optional`):
            Optional alternative implementation of the css

    Returns:
        :obj:`str`: An HTML string with style markup
    """
    return f"""
    <html>
        <head>
            <style>
                {css_styles}
            </style>
        </head>
        <body>
            <div class="tokenized-text" dir="auto">
                {table_html}
            </div>
        </body>
    </html>
    """


class TapasVisualizer:
    def __init__(self, tokenizer) -> None:
        self.tokenizer = tokenizer

    def normalize_token_str(self, token_str: str) -> str:
        # Strip the WordPiece continuation marker so tokens can be
        # matched against the original text
        return token_str.replace("##", "")

    def style_span(self, span_text: str, css_classes: List[str]) -> str:
        css = f'''class="{' '.join(css_classes)}"'''
        return f"<span {css} >{span_text}</span>"

    def text_to_html(self, org_text: str, tokens: List[str]) -> List[str]:
        """Create html spans based on the original text and its tokens.

        Note: The tokens need to be in the same order as in the original text

        Args:
            org_text (str): Original string before tokenization
            tokens (List[str]): The tokens of org_text

        Returns:
            List[str]: html spans with styling for the tokens
        """
        if len(tokens) == 0:
            print(f'Empty tokens for: {org_text}')
            return []

        cur_token_id = 0
        cur_token = self.normalize_token_str(tokens[cur_token_id])

        # Loop through each character, matching tokens greedily
        # from left to right
        next_start = 0
        last_end = 0
        spans = []

        while next_start < len(org_text):
            candidate = org_text[next_start: next_start + len(cur_token)]

            # The tokenizer performs lowercasing; so check against lowercase
            if candidate.lower() == cur_token:
                if last_end != next_start:
                    # There was token-less text (probably whitespace)
                    # in the middle
                    spans.append(self.style_span(org_text[last_end: next_start], ['non-token']))

                odd_or_even = 'even-token' if cur_token_id % 2 == 0 else 'odd-token'
                spans.append(self.style_span(candidate, ['token', odd_or_even]))
                next_start += len(cur_token)
                last_end = next_start
                cur_token_id += 1
                if cur_token_id >= len(tokens):
                    break
                cur_token = self.normalize_token_str(tokens[cur_token_id])
            else:
                next_start += 1

        if last_end != len(org_text):
            # Trailing text not covered by any token
            spans.append(self.style_span(org_text[last_end:], ['non-token']))

        return spans

    def __call__(self, table: pd.DataFrame) -> str:
        tokenized = self.tokenizer(table)

        # Group tokens by the (row, column) cell they came from;
        # row 0 holds the header tokens
        cell_tokens = defaultdict(list)

        for id_ind, input_id in enumerate(tokenized['input_ids']):
            input_id = int(input_id)
            # 'prev_label', 'column_rank', 'inv_column_rank', 'numeric_relation' not required
            segment_id, col_id, row_id, *_ = tokenized['token_type_ids'][id_ind]
            token_text = self.tokenizer._convert_id_to_token(input_id)
            cell_tokens[(row_id, col_id)].append(token_text)

        header_row_html = ""
        for col_id, col in enumerate(table.columns, start=1):
            span_htmls = self.text_to_html(col, cell_tokens[0, col_id])
            cell_html = "".join(span_htmls)
            header_row_html += f"<th>{cell_html}</th>"
        header_row_html = f'<tr>{header_row_html}</tr>'

        table_vals = table.values

        table_html = header_row_html

        for row_id, row in enumerate(table_vals, start=1):
            row_html = ""
            for col_id, cell in enumerate(row, start=1):
                span_htmls = self.text_to_html(cell, cell_tokens[row_id, col_id])
                cell_html = "".join(span_htmls)
                row_html += f"<td>{cell_html}</td>"
            table_html += f'<tr>{row_html}</tr>'

        table_html = f'<table>{table_html}</table>'
        return HTMLBody(table_html)
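
To see the character-to-token alignment in isolation, text_to_html can be exercised directly with pre-computed pieces; a minimal sketch, assuming WordPiece splits "Di Caprio" as shown (the actual split depends on the vocabulary):

    viz = TapasVisualizer(tapas_tokenizer)
    # Pieces are lowercased; continuation pieces carry the "##" prefix
    spans = viz.text_to_html("Di Caprio", ["di", "cap", "##rio"])
    print("".join(spans))
    # <span class="token even-token" >Di</span><span class="non-token" > </span>
    # <span class="token odd-token" >Cap</span><span class="token even-token" >rio</span>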