# open-gpt-3.5-detector / convert_into_distilbert_dataset.py
# The purpose of this file is to take the given texts,
# put the AI-generated ones into the negative class and the human-written ones into the positive class,
# and split every text word by word
# so that the detector can be queried before a text has finished streaming.
# For example, "The dog walked over the pavement." is turned into:
# The
# The dog
# The dog walked
# The dog walked over
# The dog walked over the
# The dog walked over the pavement
# The dog walked over the pavement.
# Example data row written by this script (label 1 = AI-generated, 0 = human):
# {"text": "lorem ipsum...", "label": 0}
import re
import ujson as json
import random
from tqdm import tqdm
def split_string(text):
    """Split the given text into words (punctuation splitting is currently disabled)."""
    # Split the text by spaces
    words = text.split()
    # For now, further splitting is disabled because of issues
    # # Further split each word by punctuation using regex
    # split_words = []
    # for word in words:
    #     # Find all substrings that match the pattern: either a word or a punctuation mark
    #     split_words.extend(re.findall(r'\w+|[^\w\s]', word))
    return words
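# Example: split_string("The dog walked over the pavement.")
# -> ["The", "dog", "walked", "over", "the", "pavement."]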
reddit_vs_synth_writing_prompts = []
with open("writing_prompts/reddit_vs_synth_writing_prompts.jsonl", "r") as f:
    temp = f.read()
    for line in temp.splitlines():
        loaded_object = json.loads(line)
        if "story_human" not in loaded_object:  # Skip rows where we don't have human data
            continue
        reddit_vs_synth_writing_prompts.append(loaded_object)
dataset_entries = []
SAVE_FILE_NAME = "bert_reddit_vs_synth_writing_prompts.jsonl"
def add_streamed_data(data, label):
    """Expand a text into all of its word-by-word prefixes, each tagged with `label`."""
    entries = []
    data_parts = split_string(data)
    for i in range(len(data_parts)):
        streamed_so_far = " ".join(data_parts[:i + 1])  # Python slicing is exclusive at the end
        entries.append({"text": streamed_so_far, "label": label})
    return entries
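# Example: add_streamed_data("The dog walked", AI_LABEL)
# -> [{"text": "The", "label": 1}, {"text": "The dog", "label": 1}, {"text": "The dog walked", "label": 1}]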
# Truncate the output file; entries are appended to it part by part below
with open(SAVE_FILE_NAME, "w") as f:
    f.write("")
NUM_OF_TURNS_TO_DUMP = 200
# Output row format: {"text": "AI-generated text example 1", "label": 1}
# 1 means AI-generated, 0 means human
HUMAN_LABEL = 0
AI_LABEL = 1
i = 0
for data in tqdm(reddit_vs_synth_writing_prompts):
    i += 1
    # Write the dataset out part by part, every NUM_OF_TURNS_TO_DUMP rows
    if i == NUM_OF_TURNS_TO_DUMP:
        i = 0
        dumped_entries = []
        for entry in dataset_entries:
            dumped_entries.append(json.dumps(entry))
        dumped_string = "\n".join(dumped_entries) + "\n"
        with open(SAVE_FILE_NAME, "a") as f:
            f.write(dumped_string)
        dataset_entries = []
    if False:  # Streaming (prefix expansion) is currently disabled
        # Add streamed data
        human_entries = add_streamed_data(data["story_human"], HUMAN_LABEL)
        dataset_entries.extend(human_entries)
        ai_data = []
        if data.get("story_opus"):
            ai_data.extend(add_streamed_data(data["story_opus"], AI_LABEL))
        if data.get("story_gpt_3_5"):
            ai_data.extend(add_streamed_data(data["story_gpt_3_5"], AI_LABEL))
        dataset_entries.extend(ai_data)
    else:
        # Add the full texts without streaming
        dataset_entries.append({"text": data["story_human"], "label": HUMAN_LABEL})
        if data.get("story_opus"):
            dataset_entries.append({"text": data["story_opus"], "label": AI_LABEL})
        if data.get("story_gpt_3_5"):
            dataset_entries.append({"text": data["story_gpt_3_5"], "label": AI_LABEL})
# Dump whatever entries are left as JSONL
dumped_entries = []
for entry in dataset_entries:
    dumped_entries.append(json.dumps(entry))
dumped_string = "\n".join(dumped_entries) + "\n"
with open(SAVE_FILE_NAME, "a") as f:
    f.write(dumped_string)
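# A minimal sketch (an assumption, not part of the original pipeline) of how the
# resulting JSONL could be loaded for DistilBERT fine-tuning with the
# Hugging Face `datasets` library:
#
#   from datasets import load_dataset
#   ds = load_dataset("json", data_files=SAVE_FILE_NAME, split="train")
#   print(ds[0])  # {"text": "...", "label": 0}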