Spaces:

ml6team
/

toxic-comment-detection-german

Sleeping

Konstantin committed on Mar 8, 2022

Commit

c80f1e3

•

1 Parent(s): 89b186a

Remove padding in words in the token attribution

Files changed (1) hide show

app.py CHANGED Viewed

@@ -64,16 +64,35 @@ def format_explainer_html(html_string):
     """Extract tokens with attribution-based background color."""
     inside_token_prefix = '##'
     soup = BeautifulSoup(html_string, 'html.parser')
-    p = soup.new_tag('p')
-    p.append(soup.new_tag('font', attrs={'color': 'black'}))
     # Select token elements and remove model specific tokens
     for token in soup.find_all('td')[-1].find_all('mark')[1:-1]:
         text = token.font.text.strip()
         if text.startswith(inside_token_prefix):
             text = text[len(inside_token_prefix):]
         token.string = text
-        p.font.append(token)
-    return p.prettify()
 def classify_comment(comment):

     """Extract tokens with attribution-based background color."""
     inside_token_prefix = '##'
     soup = BeautifulSoup(html_string, 'html.parser')
+    p = soup.new_tag('p',
+        attrs={'style': 'color: black; background-color: white;'})
     # Select token elements and remove model specific tokens
+    current_word = None
     for token in soup.find_all('td')[-1].find_all('mark')[1:-1]:
         text = token.font.text.strip()
         if text.startswith(inside_token_prefix):
             text = text[len(inside_token_prefix):]
+        else:
+            # Create a new span for each word (sequence of sub-tokens)
+            if current_word is not None:
+                p.append(current_word)
+                p.append(' ')
+            current_word = soup.new_tag('span')
+        token.attrs['style'] = f"{token.attrs['style']}; padding: 0.2em 0em;"
         token.string = text
+        current_word.append(token)
+    # Add last word
+    p.append(current_word)
+    # Add left and right-padding to each word
+    for span in p.find_all('span'):
+        span.find_all('mark')[0].attrs['style'] = (
+            f"{span.find_all('mark')[0].attrs['style']} padding-left: 0.2em;")
+        span.find_all('mark')[-1].attrs['style'] = (
+            f"{span.find_all('mark')[-1].attrs['style']} padding-right: 0.2em;")
+    return p
 def classify_comment(comment):