Spaces:

ml6team
/

toxic-comment-detection-dutch

Running

App Files Files Community

toxic-comment-detection-dutch / app.py

Konstantin

Add spaces application

8d4c2d8 over 2 years ago

raw history blame

No virus

7.62 kB

	import random

	import streamlit as st
	from bs4 import BeautifulSoup

	from transformers import AutoTokenizer, AutoModelForSequenceClassification
	from transformers import pipeline
	from transformers_interpret import SequenceClassificationExplainer

	# Map model names to URLs
	model_names_to_URLs = {
	'ml6team/distilbert-base-dutch-cased-toxic-comments':
	'https://huggingface.co/ml6team/distilbert-base-dutch-cased-toxic-comments',
	'ml6team/robbert-dutch-base-toxic-comments':
	'https://huggingface.co/ml6team/robbert-dutch-base-toxic-comments',
	}

	about_page_markdown = f"""# 🤬 Dutch Toxic Comment Detection Space

	Made by [ML6](https://ml6.eu/).

	Token attribution is performed using [transformers-interpret](https://github.com/cdpierse/transformers-interpret).
	"""

	regular_emojis = [
	'😐', '🙂', '👶', '😇',
	]
	undecided_emojis = [
	'🤨', '🧐', '🥸', '🥴', '🤷',
	]
	potty_mouth_emojis = [
	'🤐', '👿', '😡', '🤬', '☠️', '☣️', '☢️',
	]

	# Page setup
	st.set_page_config(
	page_title="Toxic Comment Detection Space",
	page_icon="🤬",
	layout="centered",
	initial_sidebar_state="auto",
	menu_items={
	'Get help': None,
	'Report a bug': None,
	'About': about_page_markdown,
	}
	)

	# Model setup
	@st.cache(allow_output_mutation=True,
	suppress_st_warning=True,
	show_spinner=False)
	def load_pipeline(model_name):
	with st.spinner('Loading model (this might take a while)...'):
	toxicity_pipeline = pipeline(
	'text-classification',
	model=model_name,
	tokenizer=model_name)
	cls_explainer = SequenceClassificationExplainer(
	toxicity_pipeline.model,
	toxicity_pipeline.tokenizer)
	return toxicity_pipeline, cls_explainer


	# Auxiliary functions
	def format_explainer_html(html_string):
	"""Extract tokens with attribution-based background color."""
	inside_token_prefix = '##'
	soup = BeautifulSoup(html_string, 'html.parser')
	p = soup.new_tag('p',
	attrs={'style': 'color: black; background-color: white;'})
	# Select token elements and remove model specific tokens
	current_word = None
	for token in soup.find_all('td')[-1].find_all('mark')[1:-1]:
	text = token.font.text.strip()
	if text.startswith(inside_token_prefix):
	text = text[len(inside_token_prefix):]
	else:
	# Create a new span for each word (sequence of sub-tokens)
	if current_word is not None:
	p.append(current_word)
	p.append(' ')
	current_word = soup.new_tag('span')
	token.string = text
	token.attrs['style'] = f"{token.attrs['style']}; padding: 0.2em 0em;"
	current_word.append(token)

	# Add last word
	p.append(current_word)

	# Add left and right-padding to each word
	for span in p.find_all('span'):
	span.find_all('mark')[0].attrs['style'] = (
	f"{span.find_all('mark')[0].attrs['style']}; padding-left: 0.2em;")
	span.find_all('mark')[-1].attrs['style'] = (
	f"{span.find_all('mark')[-1].attrs['style']}; padding-right: 0.2em;")

	return p


	def classify_comment(comment, selected_model):
	"""Classify the given comment and augment with additional information."""
	toxicity_pipeline, cls_explainer = load_pipeline(selected_model)
	result = toxicity_pipeline(comment)[0]
	result['model_name'] = selected_model

	# Add explanation
	result['word_attribution'] = cls_explainer(comment, class_name="non-toxic")
	result['visualitsation_html'] = cls_explainer.visualize()._repr_html_()
	result['tokens_with_background'] = format_explainer_html(
	result['visualitsation_html'])

	# Choose emoji reaction
	label, score = result['label'], result['score']
	if label == 'toxic' and score > 0.1:
	emoji = random.choice(potty_mouth_emojis)
	elif label in ['non_toxic', 'non-toxic'] and score > 0.1:
	emoji = random.choice(regular_emojis)
	else:
	emoji = random.choice(undecided_emojis)
	result.update({'text': comment, 'emoji': emoji})

	# Add result to session
	st.session_state.results.append(result)


	# Start session
	if 'results' not in st.session_state:
	st.session_state.results = []

	# Page
	st.title('🤬 Dutch Toxic Comment Detection')
	st.markdown("""This demo showcases two Dutch toxic comment detection models.""")

	# Introduction
	st.markdown(f"""Both models were trained using a sequence classification task on a translated [Jigsaw Toxicity dataset](https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge) which contains toxic online comments.
	The first model is a fine-tuned multilingual [DistilBERT](https://huggingface.co/distilbert-base-multilingual-cased) model whereas the second is a fine-tuned Dutch RoBERTa-based model called [RobBERT](https://huggingface.co/pdelobelle/robbert-v2-dutch-base).""")
	st.markdown(f"""For a more comprehensive overview of the models check out their model card on 🤗 Model Hub: [distilbert-base-dutch-toxic-comments]({model_names_to_URLs['ml6team/distilbert-base-dutch-cased-toxic-comments']}) and [RobBERT-dutch-base-toxic-comments]({model_names_to_URLs['ml6team/robbert-dutch-base-toxic-comments']}).
	""")
	st.markdown("""Enter a comment that you want to classify below. The model will determine the probability that it is toxic and highlights how much each token contributes to its decision:
	<font color="black">
	<span style="background-color: rgb(250, 219, 219); opacity: 1;">r</span><span style="background-color: rgb(244, 179, 179); opacity: 1;">e</span><span style="background-color: rgb(238, 135, 135); opacity: 1;">d</span>
	</font>
	tokens indicate toxicity whereas
	<font color="black">
	<span style="background-color: rgb(224, 251, 224); opacity: 1;">g</span><span style="background-color: rgb(197, 247, 197); opacity: 1;">re</span><span style="background-color: rgb(121, 236, 121); opacity: 1;">en</span>
	</font> tokens indicate the opposite.

	Try it yourself! 👇""",
	unsafe_allow_html=True)


	# Demo
	with st.form("dutch-toxic-comment-detection-input", clear_on_submit=False):
	selected_model = st.selectbox('Select a model:', model_names_to_URLs.keys(),
	)#index=0, format_func=special_internal_function, key=None, help=None, on_change=None, args=None, kwargs=None, *, disabled=False)
	text = st.text_area(
	label='Enter the comment you want to classify below (in Dutch):')
	_, rightmost_col = st.columns([6,1])
	submitted = rightmost_col.form_submit_button("Classify",
	help="Classify comment")

	# Listener
	if submitted:
	if text:
	with st.spinner('Analysing comment...'):
	classify_comment(text, selected_model)
	else:
	st.error('Error: No comment to classify. Please provide a comment.')

	# Results
	if 'results' in st.session_state and st.session_state.results:
	first = True
	for result in st.session_state.results[::-1]:
	if not first:
	st.markdown("---")
	st.markdown(f"Text:\n> {result['text']}")
	col_1, col_2, col_3 = st.columns([1,2,2])
	col_1.metric(label='', value=f"{result['emoji']}")
	col_2.metric(label='Label', value=f"{result['label']}")
	col_3.metric(label='Score', value=f"{result['score']:.3f}")
	st.markdown(f"Token Attribution:\n{result['tokens_with_background']}",
	unsafe_allow_html=True)
	st.caption(f"Model: {result['model_name']}")
	first = False