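"""Build a JSONL dataset for a BERT-style human-vs-AI writing classifier.

Reads prompt/story records from writing_prompts/reddit_vs_synth_writing_prompts.jsonl,
labels human-written stories ("story_human") as 0 and model-written stories
("story_opus", "story_gpt_3_5") as 1, and appends the entries in chunks to
bert_reddit_vs_synth_writing_prompts.jsonl.
"""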
import re

import ujson as json
from tqdm import tqdm


def split_string(text):
    """Split a given text by spaces and punctuation."""
    # Word runs and individual punctuation marks become separate tokens.
    words = re.findall(r"\w+|[^\w\s]", text)

    return words
|
|
|
reddit_vs_synth_writing_prompts = []

# Each retained record has at least "story_human"; "story_opus" and
# "story_gpt_3_5" are optional model-written counterparts.
with open("writing_prompts/reddit_vs_synth_writing_prompts.jsonl", "r") as f:
    for line in f:
        loaded_object = json.loads(line)

        # Skip prompts without a human-written story.
        if "story_human" not in loaded_object:
            continue

        reddit_vs_synth_writing_prompts.append(loaded_object)
|
|
|
dataset_entries = []

SAVE_FILE_NAME = "bert_reddit_vs_synth_writing_prompts.jsonl"
HUMAN_LABEL = 0
AI_LABEL = 1

# When True, emit one training entry per growing prefix of each story
# (simulating streamed text) instead of one entry per full story.
USE_STREAMED_PREFIXES = False
|
|
|
def add_streamed_data(data, label):
    """Return one entry per growing prefix of the text, simulating streamed input."""
    entries = []
    data_parts = split_string(data)

    for i in range(len(data_parts)):
        streamed_so_far = " ".join(data_parts[:i + 1])
        entries.append({"text": streamed_so_far, "label": label})

    return entries
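# For example, add_streamed_data("He ran.", HUMAN_LABEL) yields entries for
# "He", "He ran", and "He ran ." (punctuation is kept as its own token).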
|
|
|
def dump_entries(entries):
    """Append the given entries to the save file, one JSON object per line."""
    if not entries:
        return
    dumped_string = "\n".join(json.dumps(entry) for entry in entries) + "\n"
    with open(SAVE_FILE_NAME, "a") as f:
        f.write(dumped_string)


# Start from an empty save file; chunks are appended to it below.
with open(SAVE_FILE_NAME, "w") as f:
    f.write("")
|
|
|
# Flush accumulated entries to disk every NUM_OF_TURNS_TO_DUMP prompts
# to keep memory usage bounded.
NUM_OF_TURNS_TO_DUMP = 200
i = 0
for data in tqdm(reddit_vs_synth_writing_prompts):
    i += 1

    if i == NUM_OF_TURNS_TO_DUMP:
        i = 0
        dump_entries(dataset_entries)
        dataset_entries = []
|
|
|
    if USE_STREAMED_PREFIXES:
        # One entry per growing prefix of each story.
        dataset_entries.extend(add_streamed_data(data["story_human"], HUMAN_LABEL))

        if data.get("story_opus"):
            dataset_entries.extend(add_streamed_data(data["story_opus"], AI_LABEL))
        if data.get("story_gpt_3_5"):
            dataset_entries.extend(add_streamed_data(data["story_gpt_3_5"], AI_LABEL))
    else:
        # One entry per full story.
        dataset_entries.append({"text": data["story_human"], "label": HUMAN_LABEL})

        if data.get("story_opus"):
            dataset_entries.append({"text": data["story_opus"], "label": AI_LABEL})
        if data.get("story_gpt_3_5"):
            dataset_entries.append({"text": data["story_gpt_3_5"], "label": AI_LABEL})
|
|
|
|
|
dumped_string = "" |
|
dumped_entries = [] |
|
for entry in dataset_entries: |
|
dumped_entries.append(json.dumps(entry)) |
|
|
|
dumped_string = "\n".join(dumped_entries) + "\n" |
|
|
|
with open(SAVE_FILE_NAME, "a") as f: |
|
f.write(dumped_string) |
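# The result is standard JSONL; e.g., assuming the Hugging Face `datasets`
# library is available, it can be loaded for fine-tuning with:
#   from datasets import load_dataset
#   dataset = load_dataset("json", data_files=SAVE_FILE_NAME, split="train")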
|
|