# The purpose of this file is to take the given texts,
# label the human-written ones as 0 and the AI-generated ones as 1,
# and (optionally) split every text into cumulative word-by-word prefixes
# so that a text can be classified before it has finished streaming.

# For example, "The dog walked over the pavement." will be turned into:
# The
# The dog
# The dog walked
# The dog walked over
# The dog walked over the
# The dog walked over the pavement
# The dog walked over the pavement.
# (The last step, splitting off the trailing ".", only applies while punctuation
# splitting is enabled; see split_string below, where it is currently disabled.)

# Example output row written to the JSONL dataset:
# {"text": "lorem ipsum...", "label": 1}   # label 1 = AI-generated, 0 = human-written

import re  # Only used by the (currently disabled) punctuation splitting in split_string
import ujson as json
from tqdm import tqdm

def split_string(text):
    """Split a given text into words by whitespace (punctuation splitting is currently disabled)."""
    # Split the text by whitespace
    words = text.split()
    
    # Further splitting each word on punctuation is disabled for now because of issues:
    # # split_words = []
    # # for word in words:
    # #     # Find all substrings that are either a word or a single punctuation mark
    # #     split_words.extend(re.findall(r'\w+|[^\w\s]', word))
    # # return split_words
    
    return words
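# For example, split_string("The dog walked over the pavement.") currently returns:
# ['The', 'dog', 'walked', 'over', 'the', 'pavement.']
# (the trailing "." stays attached because punctuation splitting is disabled)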

reddit_vs_synth_writing_prompts = []
with open("writing_prompts/reddit_vs_synth_writing_prompts.jsonl", "r") as f:
    temp = f.read()
for line in temp.splitlines():
    loaded_object = json.loads(line)
    if not "story_human" in loaded_object: # Remove ones where we don't have human data
        continue
    
    reddit_vs_synth_writing_prompts.append(loaded_object)

dataset_entries = []

SAVE_FILE_NAME = "bert_reddit_vs_synth_writing_prompts.jsonl"

def add_streamed_data(data, label):
    """Turn a text into cumulative word-by-word prefixes, each tagged with the given label."""
    entries = []
    data_parts = split_string(data)
    
    for i in range(len(data_parts)):
        streamed_so_far = " ".join(data_parts[:i + 1]) # Slicing is exclusive at the end, so word i is included
        entries.append({"text": streamed_so_far, "label": label})
    
    return entries
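# For example, add_streamed_data("The dog walked", 0) would return:
# [{'text': 'The', 'label': 0}, {'text': 'The dog', 'label': 0}, {'text': 'The dog walked', 'label': 0}]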

with open(SAVE_FILE_NAME, "w") as f:
    f.write("")

NUM_OF_TURNS_TO_DUMP = 200  # Flush the accumulated entries to disk every 200 input rows

# Label convention: 0 means human-written, 1 means AI-generated
HUMAN_LABEL = 0
AI_LABEL = 1

i = 0
for data in tqdm(reddit_vs_synth_writing_prompts):
    i += 1
    
    # Periodically dump the collected entries so the whole dataset never has to sit in memory
    if i == NUM_OF_TURNS_TO_DUMP:
        i = 0
        dumped_entries = []
        for entry in dataset_entries:
            dumped_entries.append(json.dumps(entry))
        
        dumped_string = "\n".join(dumped_entries) + "\n"

        with open(SAVE_FILE_NAME, "a") as f:
            f.write(dumped_string)
        
        dataset_entries = []

    if False: # Streaming prefixes are currently disabled; change to True to re-enable
        # Add every cumulative prefix of each text as its own labelled entry
        human_entries = add_streamed_data(data["story_human"], HUMAN_LABEL)
        dataset_entries.extend(human_entries)
        
        ai_data = []
        if data.get("story_opus"):
            ai_data.extend(add_streamed_data(data["story_opus"], AI_LABEL))
        if data.get("story_gpt_3_5"):
            ai_data.extend(add_streamed_data(data["story_gpt_3_5"], AI_LABEL))
        
        dataset_entries.extend(ai_data)
    
    else:
        # Add each full text as a single entry, without streaming prefixes
        dataset_entries.append({"text": data["story_human"], "label": HUMAN_LABEL})
        
        if data.get("story_opus"):
            dataset_entries.append({"text": data["story_opus"], "label": AI_LABEL})
        if data.get("story_gpt_3_5"):
            dataset_entries.append({"text": data["story_gpt_3_5"], "label": AI_LABEL})

# Dump whatever entries remain as JSONL
dumped_entries = []
for entry in dataset_entries:
    dumped_entries.append(json.dumps(entry))

dumped_string = "\n".join(dumped_entries) + "\n"

with open(SAVE_FILE_NAME, "a") as f:
    f.write(dumped_string)