Konstantin committed on
Commit
89b186a
1 Parent(s): bd8327d

Remove subtoken indicators ('##') in token attribution

Browse files
Files changed (1) hide show
  1. app.py +7 -1
app.py CHANGED
@@ -62,11 +62,17 @@ toxicity_pipeline, cls_explainer = load_pipeline()
62
  # Auxiliary functions
63
  def format_explainer_html(html_string):
64
  """Extract tokens with attribution-based background color."""
 
65
  soup = BeautifulSoup(html_string, 'html.parser')
66
  p = soup.new_tag('p')
 
67
  # Select token elements and remove model specific tokens
68
  for token in soup.find_all('td')[-1].find_all('mark')[1:-1]:
69
- p.append(token)
 
 
 
 
70
  return p.prettify()
71
 
72
 
 
62
  # Auxiliary functions
63
  def format_explainer_html(html_string):
64
  """Extract tokens with attribution-based background color."""
65
+ inside_token_prefix = '##'
66
  soup = BeautifulSoup(html_string, 'html.parser')
67
  p = soup.new_tag('p')
68
+ p.append(soup.new_tag('font', attrs={'color': 'black'}))
69
  # Select token elements and remove model specific tokens
70
  for token in soup.find_all('td')[-1].find_all('mark')[1:-1]:
71
+ text = token.font.text.strip()
72
+ if text.startswith(inside_token_prefix):
73
+ text = text[len(inside_token_prefix):]
74
+ token.string = text
75
+ p.font.append(token)
76
  return p.prettify()
77
 
78