Spaces:
Runtime error
bglearning
committed on
Commit • eb4710d
Parent(s):
First version
Browse files
- run_tapas_viz.py +23 -0
- tapas-styles.css +38 -0
- tapas_visualizer.py +139 -0
run_tapas_viz.py
ADDED
@@ -0,0 +1,23 @@
import pandas as pd

from transformers import TapasTokenizer

from tapas_visualizer import TapasVisualizer


def main():
    tapas_tokenizer = TapasTokenizer.from_pretrained("google/tapas-base")
    viz = TapasVisualizer(tapas_tokenizer)

    data = {
        "Actors": ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"],
        "Age": ["56", "45", "59"],
        "Number of movies": ["87", "53", "69"],
    }

    table = pd.DataFrame.from_dict(data)
    print(viz(table))


if __name__ == '__main__':
    main()
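A minimal usage sketch (not included in this commit) of writing the returned HTML to a file for viewing in a browser instead of printing it, assuming only the modules added in this commit:

import pandas as pd
from transformers import TapasTokenizer
from tapas_visualizer import TapasVisualizer

# Hypothetical follow-up: save the visualizer output instead of printing it.
viz = TapasVisualizer(TapasTokenizer.from_pretrained("google/tapas-base"))
table = pd.DataFrame({"Actors": ["Brad Pitt", "George Clooney"], "Age": ["56", "59"]})

html = viz(table)  # a full HTML document string (see HTMLBody in tapas_visualizer.py)
with open("tapas_tokens.html", "w") as f:
    f.write(html)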
tapas-styles.css
ADDED
@@ -0,0 +1,38 @@
.tokenized-text {
    width:100%;
    padding:2rem;
    max-height: 400px;
    overflow-y: auto;
    box-sizing:border-box;
    line-height:4rem; /* Lots of space between lines */
    font-family: "Roboto Light", "Ubuntu Light", "Ubuntu", monospace;
    box-shadow: 2px 2px 2px rgba(0,0,0,0.2);
    background-color: rgba(0,0,0,0.01);
    letter-spacing:2px; /* Give some extra separation between chars */
}

.non-token{
    /* White space and other things the tokenizer ignores */
    white-space: pre;
    letter-spacing:4px;
    border-top:1px solid #A0A0A0; /* A gentle border on top and bottom makes tabs more obvious */
    border-bottom:1px solid #A0A0A0;
    line-height: 1rem;
    height: calc(100% - 2px);
}

.token {
    white-space: pre;
    position:relative;
    color:black;
    letter-spacing:2px;
}

.even-token{
    background:#DCDCDC;
    border: 1px solid #DCDCDC;
}
.odd-token{
    background:#A0A0A0;
    border: 1px solid #A0A0A0;
}
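For reference, a small sketch (not part of this commit) of the markup these classes target. The spans are produced by TapasVisualizer.style_span in tapas_visualizer.py below, which alternates even-token/odd-token shading for consecutive tokens and marks untokenized whitespace with non-token:

from tapas_visualizer import TapasVisualizer

# Illustration only: style_span does not touch the tokenizer, so None is fine here.
viz = TapasVisualizer(tokenizer=None)
print(viz.style_span("Brad", ["token", "even-token"]))  # <span class="token even-token" >Brad</span>
print(viz.style_span(" ", ["non-token"]))               # <span class="non-token" > </span>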
tapas_visualizer.py
ADDED
@@ -0,0 +1,139 @@
import os
from typing import Any, List

from collections import defaultdict

import pandas as pd

dirname = os.path.dirname(__file__)
css_filename = os.path.join(dirname, "tapas-styles.css")
with open(css_filename) as f:
    css = f.read()


def HTMLBody(table_html: str, css_styles=css) -> str:
    """
    Generates the full html with css for the given table markup

    Args:
        table_html (:obj:`str`):
            The html markup of the table contents (rows of styled spans)

        css_styles (:obj:`str`, `optional`):
            Optional alternative implementation of the css

    Returns:
        :obj:`str`: An HTML string with style markup
    """
    return f"""
    <html>
        <head>
            <style>
                {css_styles}
            </style>
        </head>
        <body>
            <div class="tokenized-text" dir=auto>
                {table_html}
            </div>
        </body>
    </html>
    """


class TapasVisualizer:
    def __init__(self, tokenizer) -> None:
        self.tokenizer = tokenizer

    def normalize_token_str(self, token_str: str) -> str:
        return token_str.replace("##", "")

    def style_span(self, span_text: str, css_classes: List[str]) -> str:
        css = f'''class="{' '.join(css_classes)}"'''
        return f"<span {css} >{span_text}</span>"

    def text_to_html(self, org_text: str, tokens: List[str]) -> List[str]:
        """Create html spans based on the original text and its tokens.

        Note: The tokens need to be in the same order as in the original text

        Args:
            org_text (str): Original string before tokenization
            tokens (List[str]): The tokens of org_text

        Returns:
            List[str]: html spans with styling for the tokens
        """
        if len(tokens) == 0:
            print(f'Empty tokens for: {org_text}')
            return []

        cur_token_id = 0
        cur_token = self.normalize_token_str(tokens[cur_token_id])

        # Loop through each character
        next_start = 0
        last_end = 0
        spans = []

        while next_start < len(org_text):
            candidate = org_text[next_start: next_start + len(cur_token)]

            # The tokenizer performs lowercasing; so check against lowercase
            if candidate.lower() == cur_token:
                if last_end != next_start:
                    # There was token-less text (probably whitespace)
                    # in the middle
                    spans.append(self.style_span(org_text[last_end: next_start], ['non-token']))

                odd_or_even = 'even-token' if cur_token_id % 2 == 0 else 'odd-token'
                spans.append(self.style_span(candidate, ['token', odd_or_even]))
                next_start += len(cur_token)
                last_end = next_start
                cur_token_id += 1
                if cur_token_id >= len(tokens):
                    break
                cur_token = self.normalize_token_str(tokens[cur_token_id])
            else:
                next_start += 1

        if last_end != len(org_text):
            # Any trailing text without tokens (e.g. if the tokenizer truncated)
            spans.append(self.style_span(org_text[last_end:], ['non-token']))

        return spans

    def __call__(self, table: pd.DataFrame) -> Any:
        tokenized = self.tokenizer(table)

        cell_tokens = defaultdict(list)

        for id_ind, input_id in enumerate(tokenized['input_ids']):
            input_id = int(input_id)
            # 'prev_label', 'column_rank', 'inv_column_rank', 'numeric_relation' not required
            segment_id, col_id, row_id, *_ = tokenized['token_type_ids'][id_ind]
            token_text = self.tokenizer._convert_id_to_token(input_id)
            cell_tokens[(row_id, col_id)].append(token_text)

        # token_df = pd.DataFrame(token_data, columns=['id', 'token', 'segment_id', 'column_id', 'row_id'])
        header_row_html = ""
        for col_id, col in enumerate(table.columns, start=1):
            span_htmls = self.text_to_html(col, cell_tokens[0, col_id])
            cell_html = "".join(span_htmls)
            header_row_html += f"<th>{cell_html}</th>"
        header_row_html = f'<tr>{header_row_html}</tr>'

        table_vals = table.values

        table_html = header_row_html

        for row_id, row in enumerate(table_vals, start=1):
            row_html = ""
            for col_id, cell in enumerate(row, start=1):
                span_htmls = self.text_to_html(cell, cell_tokens[row_id, col_id])
                cell_html = "".join(span_htmls)
                row_html += f"<td>{cell_html}</td>"
            table_html += f'<tr>{row_html}</tr>'

        table_html = f'<table>{table_html}</table>'
        return HTMLBody(table_html)
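A possible interactive use (not part of this commit): rendering the returned HTML inline in a Jupyter notebook rather than printing the raw string as run_tapas_viz.py does. IPython is an extra dependency here, not one this Space declares:

import pandas as pd
from IPython.display import HTML
from transformers import TapasTokenizer
from tapas_visualizer import TapasVisualizer

# Hypothetical notebook usage: display the token-highlighted table inline.
viz = TapasVisualizer(TapasTokenizer.from_pretrained("google/tapas-base"))
table = pd.DataFrame({"Actors": ["Leonardo Di Caprio"], "Number of movies": ["53"]})
HTML(viz(table))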