# The purpose of this file is to take the given texts,
# put AI-generated ones into the negative class and human-written ones into the positive class,
# while splitting every text word by word
# so that detection can run before the text has finished streaming.
# Example: "The dog walked over the pavement." will be turned into:
# The
# The dog
# The dog walked
# The dog walked over
# The dog walked over the
# The dog walked over the pavement
# The dog walked over the pavement.
# Example data row:
# {"query": "Write a story about dogs", "pos": ["lorem ipsum..."], "neg": ["lorem ipsum..."]}

import re

import ujson as json
from tqdm import tqdm


def split_string(text):
    """Split a given text by spaces (and, eventually, punctuation)."""
    # Split the text by spaces
    words = text.split()
    # Further splitting on punctuation is disabled for now because of issues:
    # split_words = []
    # for word in words:
    #     # Find all substrings that match the pattern: either a word or a punctuation mark
    #     split_words.extend(re.findall(r'\w+|[^\w\s]', word))
    return words


# Load the prompt/story rows, keeping only the ones that have a human-written story.
reddit_vs_synth_writing_prompts = []
with open("writing_prompts/reddit_vs_synth_writing_prompts.jsonl", "r") as f:
    temp = f.read()
for line in temp.splitlines():
    loaded_object = json.loads(line)
    if "story_human" not in loaded_object:
        # Skip rows where we don't have human data
        continue
    reddit_vs_synth_writing_prompts.append(loaded_object)

dataset_entries = []
SAVE_FILE_NAME = "bert_reddit_vs_synth_writing_prompts.jsonl"

HUMAN_LABEL = 0  # 0 means human-written
AI_LABEL = 1  # 1 means AI-generated
ENABLE_STREAMING = False  # Set to True to also emit prefix-by-prefix "streamed" entries
NUM_OF_TURNS_TO_DUMP = 200  # Flush the buffered entries to disk every N rows


def add_streamed_data(data, label):
    """Turn one text into a list of entries, one per word-by-word prefix."""
    entries = []
    data_parts = split_string(data)
    for i in range(len(data_parts)):
        # Python slicing excludes the end index, hence i + 1
        streamed_so_far = " ".join(data_parts[:i + 1])
        entries.append({"text": streamed_so_far, "label": label})
    return entries


# Truncate the output file before appending to it below
with open(SAVE_FILE_NAME, "w") as f:
    f.write("")

i = 0
for data in tqdm(reddit_vs_synth_writing_prompts):
    # Each entry looks like {"text": "AI-generated text example 1", "label": 1}
    i += 1
    # Write the dataset out part by part instead of keeping everything in memory
    if i == NUM_OF_TURNS_TO_DUMP:
        i = 0
        dumped_entries = []
        for entry in dataset_entries:
            dumped_entries.append(json.dumps(entry))
        dumped_string = "\n".join(dumped_entries) + "\n"
        with open(SAVE_FILE_NAME, "a") as f:
            f.write(dumped_string)
        dataset_entries = []

    if ENABLE_STREAMING:
        # Add streamed (prefix-by-prefix) data
        dataset_entries.extend(add_streamed_data(data["story_human"], HUMAN_LABEL))
        if data.get("story_opus"):
            dataset_entries.extend(add_streamed_data(data["story_opus"], AI_LABEL))
        if data.get("story_gpt_3_5"):
            dataset_entries.extend(add_streamed_data(data["story_gpt_3_5"], AI_LABEL))
    else:
        # Add the full texts without streaming
        dataset_entries.append({"text": data["story_human"], "label": HUMAN_LABEL})
        if data.get("story_opus"):
            dataset_entries.append({"text": data["story_opus"], "label": AI_LABEL})
        if data.get("story_gpt_3_5"):
            dataset_entries.append({"text": data["story_gpt_3_5"], "label": AI_LABEL})

# Dump whatever is left over as JSONL
if dataset_entries:
    dumped_entries = []
    for entry in dataset_entries:
        dumped_entries.append(json.dumps(entry))
    dumped_string = "\n".join(dumped_entries) + "\n"
    with open(SAVE_FILE_NAME, "a") as f:
        f.write(dumped_string)
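
# --- Optional sanity check (an illustrative sketch, not part of the original pipeline) ---
# This assumes the Hugging Face `datasets` library is installed; it reloads the file
# written above and prints one example so the JSONL format ("text"/"label" columns)
# can be verified quickly. Remove it if the extra dependency is unwanted.
try:
    from datasets import load_dataset

    check_dataset = load_dataset("json", data_files=SAVE_FILE_NAME, split="train")
    print(f"Wrote {len(check_dataset)} examples, e.g.: {check_dataset[0]}")
except ImportError:
    pass  # The check is optional; the JSONL file has already been written above.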