# ChillTranslator / learn.py
# %%
import json
import os
import hashlib
from functools import wraps
import pandas as pd
from datasets import load_dataset
from detoxify import Detoxify
# TODO: Compare OpenAI's moderation API to Detoxify
predict_model = Detoxify('original-small')
dataset = load_dataset("tasksource/jigsaw")
train_data = dataset['train']
print('length',len(train_data)) # length 159571
print(train_data[0]) # {'id': '0000997932d777bf', 'comment_text': "Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27", 'toxic': 0, 'severe_toxic': 0, 'obscene': 0, 'threat': 0, 'insult': 0, 'identity_hate': 0}
small_subset = train_data[:2000]
predict_model.predict("You suck, that is not Markdown!") # Also accepts a list of strings, returning a single dict of arrays of predictions.
# Returns:
# {'toxicity': 0.98870254,
#  'severe_toxicity': 0.087154716,
#  'obscene': 0.93440753,
#  'threat': 0.0032278204,
#  'insult': 0.7787105,
#  'identity_attack': 0.007936229}
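# A quick check (not part of the original pipeline) that a list input returns a
# dict of lists, one score per input string, as the comment above notes:
predict_model.predict(["You suck!", "Thanks for the helpful review."])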
_in_memory_cache = {}
def handle_cache(prefix, func, *args, _result=None, **kwargs):
    # Generate a key based on function name and arguments
    key = f"{func.__name__}_{args}_{kwargs}"
    hashed_key = hashlib.sha1(key.encode()).hexdigest()
    cache_filename = f"{prefix}_{hashed_key}.json"
    # Check the in-memory cache first
    if key in _in_memory_cache:
        return _in_memory_cache[key]
    # Check if cache file exists and read data
    if os.path.exists(cache_filename):
        with open(cache_filename, 'r') as file:
            #print("Reading from cache file with prefix", prefix)
            _in_memory_cache[key] = json.load(file)
        return _in_memory_cache[key]
    # If result is not provided (for sync functions), compute it
    if _result is None:
        _result = func(*args, **kwargs)
    # Update the in-memory cache and write it to the file
    _in_memory_cache[key] = _result
    with open(cache_filename, 'w') as file:
        json.dump(_result, file)
    return _result
def cache(prefix):
    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            # Direct call to the shared cache handling function
            return handle_cache(prefix, func, *args, **kwargs)
        return wrapper
    return decorator
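# Minimal usage sketch (not part of the original pipeline) showing how the cache
# decorator behaves; the "demo" prefix and double() below are hypothetical names
# used only for illustration. The first call computes the result and writes a
# demo_<sha1>.json file; repeat calls with the same argument are served from the
# in-memory or file cache via handle_cache.
@cache("demo")
def double(x):
    return {"doubled": x * 2}

double(21)  # computed, cached in memory and on disk
double(21)  # served from the cache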
@cache("toxicity")
def cached_toxicity_prediction(comments):
    data = predict_model.predict(comments)
    return data
def predict_toxicity(comments, batch_size=4):
    """
    Predicts toxicity scores for a list of comments.
    Args:
    - comments: List of comment texts.
    - batch_size: Size of batches for prediction to manage memory usage.
    Returns:
    A DataFrame with the original comments and their predicted toxicity scores.
    """
    results = {'comment_text': [], 'toxicity': [], 'severe_toxicity': [], 'obscene': [], 'threat': [], 'insult': [], 'identity_attack': []}
    for i in range(0, len(comments), batch_size):
        batch_comments = comments[i:i+batch_size]
        predictions = cached_toxicity_prediction(batch_comments)
        # Collect the JSON-serializable predictions into columns for the final DataFrame:
        results['comment_text'].extend(batch_comments)
        for key in predictions.keys():
            results[key].extend(predictions[key])
    return pd.DataFrame(results)
# Predict toxicity scores for the small subset of comments:
#small_subset_predictions = predict_toxicity(small_subset['comment_text'][4])
# Peek at the first comment text before running predictions on a batch:
small_subset['comment_text'][0:1]
# %%
small_subset_predictions=predict_toxicity(small_subset['comment_text'][0:200])
# %%
small_subset_predictions
# %%
def filter_comments(dataframe, toxicity_threshold=0.2, severe_toxicity_threshold=0.4,
                    identity_attack_threshold=0.5, insult_threshold=0.3,
                    obscene_threshold=0.6, threat_threshold=0.3):
    """
    Filters comments based on specified thresholds for toxicity, severe toxicity,
    identity attack, insult, obscenity and threat.
    Args:
    - dataframe: DataFrame containing comments and their toxicity scores.
    - toxicity_threshold: Minimum toxicity score a comment must have to be kept.
    - severe_toxicity_threshold: Maximum allowed severe toxicity score.
    - identity_attack_threshold: Maximum allowed identity attack score.
    - insult_threshold: Maximum allowed insult score.
    - obscene_threshold: Maximum allowed obscenity score.
    - threat_threshold: Maximum allowed threat score.
    Returns:
    DataFrame filtered based on the specified thresholds.
    """
    filtered_df = dataframe[
        (dataframe['toxicity'] >= toxicity_threshold) &
        #(dataframe['toxicity'] < 1.0) & # Ensure comments are spicy but not 100% toxic
        (dataframe['severe_toxicity'] < severe_toxicity_threshold) &
        (dataframe['identity_attack'] < identity_attack_threshold) &
        (dataframe['insult'] < insult_threshold) &
        (dataframe['obscene'] < obscene_threshold) &
        (dataframe['threat'] < threat_threshold)
    ]
    return filtered_df
spicy_comments = filter_comments(small_subset_predictions)
# Let's sort spicy comments by combined toxicity score:
spicy_comments = spicy_comments.sort_values(by=['toxicity', 'severe_toxicity'], ascending=True)
# Print each spicy comment and its combined toxicity score as a formatted string:
for index, row in spicy_comments.iterrows():
    print(f"Comment: `{row['comment_text']}` \n Toxicity: {(row['toxicity'] + row['severe_toxicity']) / 2 * 100:.0f}% \n")
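# %%
# Optional sketch (not part of the original flow): persist the filtered, sorted
# comments so a later step can reuse them without re-running the model. The
# filename "spicy_comments.jsonl" is an arbitrary choice for illustration.
spicy_comments.to_json("spicy_comments.jsonl", orient="records", lines=True)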