{ "cells": [ { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "This is the name_results: .\n", "And this is the dialogue: None\n", "[('???', '“Hey, what are you mumbling about?”'), ('???', '“Okarin? Earth to Okarin!”'), ('???', '“You talking to someone?”'), ('Rintaro', '“...No, I was just talking to someone. Everything’s fine. I’m about to infiltrate the assembly hall.”'), ('Rintaro', '“Yeah, Doctor Nakabachi got the jump on us, but I’ll make sure he tells us everything.”'), ('Rintaro', '“What!? The Organization is already on the move!?”'), ('Rintaro', '“I see... so that’s the choice of Steins Gate. El Psy Kongroo.”'), ('Mayuri', '“Who was that on the phone?”'), ('Rintaro', '“If I told you, I’d have to kill you.”'), ('Mayuri', '“Oh, wow. Thanks, Okarin!”')]\n", "This is the name_results: .\n", "And this is the dialogue: None\n" ] } ], "source": [ "# usage: sp.encode_as_ids(\"This is a test\")\n", "\n", "# We're using sentencepiece because this is llama2\n", "\n", "\n", "import re\n", "\n", "\n", "\n", "\n", "# Step 1: Remove Unwanted Strings\n", "# Regex to match unwanted patterns enclosed in []\n", "unwanted_pattern = re.compile(r\"\\[color index=\\\".*?\\\"\\]|\\[(?!name|line|%p).*?\\]\")\n", "def remove_unwanted_strings(text):\n", " return unwanted_pattern.sub('', text)\n", "\n", "# Step 2: Parsing the text\n", "# I'll update the regular expressions to exclude the delimiters.\n", "name_regex = re.compile(r\"\\[name\\](.*?)\\[line\\]\")\n", "dialogue_regex = re.compile(r\"\\[line\\](.*?)\\[%p\\]\")\n", "\n", "def makecols(str):\n", " name_results = name_regex.search(str)\n", " dialogue_results = dialogue_regex.search(str)\n", " if name_results is None:\n", " return ('MONOLOGUE', dialogue_results.group(1) if dialogue_results else \"\")\n", " try: \n", " return (name_results.group(1).strip(), dialogue_results.group(1).strip())\n", " except:\n", " print(f\"This is the name_results: {name_results}.\\nAnd this is the dialogue: {dialogue_results}\")\n", " return ('ERROR!', '')\n", "\n", "def not_empty_monologue(tup):\n", " if (tup[0] == 'MONOLOGUE') and (tup[1] == ''):\n", " return False\n", " return True\n", "\n", "# Step 3: Final Processing\n", "def process_script(filename):\n", " with open(filename, 'r') as f:\n", " raw_script = f.read()\n", "\n", " # Remove unwanted strings\n", " cleaned_script = remove_unwanted_strings(raw_script)\n", "\n", " # Split the cleaned_script into lines and filter out empty lines\n", " lines = [line.strip() for line in cleaned_script.split('\\n') if line.strip()]\n", "\n", " # Process each line to make a tuple of (speaker, dialogue)\n", " script_tuples = list(map(makecols, lines))\n", " script_tuples = list(filter(not_empty_monologue, script_tuples))\n", "\n", " \n", " return script_tuples\n", "\n", "script_tuples = process_script('combined_script.txt')\n", "print(script_tuples[:10]) # Just printing the first 10 for visualization\n", "\n", "\n", "script = process_script('combined_script.txt')\n", "# script = list(filter(not_monologue,script)) \n", "\n", "# tokenizer = AutoTokenizer.from_pretrained(\"meta-llama/Llama-2-7b-hf\")" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "14353\n" ] }, { "data": { "text/plain": [ "[('Rintaro', '“Farewell! Muhahaha!”'),\n", " ('Rintaro',\n", " '“Damn the Organization! They must be serious if they’re sending in agents like her!”'),\n", " ('Rintaro', '“But I can’t let them capture me yet.”'),\n", " ('Rintaro', '“...Damn. I left Mayuri behind.”'),\n", " ('Rintaro', '“Hm? An email?”'),\n", " ('Rintaro', '“...Hm?”'),\n", " ('Rintaro', '“Dammit, Mayuri. Why won’t you pick up?”'),\n", " ('Rintaro', '“Wait, don’t tell me! Did that femme fatale kidnap Mayuri!?”'),\n", " ('Rintaro', '“Damn you! Is that how the Organization operates!?”'),\n", " ('Rintaro', '“I have to go back for her.”'),\n", " ('Rintaro', '“Heh, looks like I scared her off.”'),\n", " ('Rintaro', '“So be it. I’ll let her go this time.”'),\n", " ('Rintaro', '“Mayuri! Why didn’t you pick up? We’re leaving.”'),\n", " ('Mayuri', '“Okarin! My Metal Upa ran away.”'),\n", " ('Rintaro', '“Ran away? What, it’s alive? That’s a little hard to believe.”'),\n", " ('Mayuri', '“I think I dropped it...”'),\n", " ('Rintaro', '“Forget about it. You can always get another one.”'),\n", " ('Mayuri',\n", " '“No way. Metal Upas sell upwards of 10,000 yen online, you know?”'),\n", " ('Rintaro', '“Wait... what?”'),\n", " ('Rintaro', '“Think, Mayuri! Where did you drop it!?”'),\n", " ('Mayuri',\n", " '“I don’t know! That’s why I’m looking. And even if we find it, you can’t sell it, okay?”'),\n", " ('Rintaro', '“Muhahaha! That 10,000 yen will fund my research!”'),\n", " ('Mayuri', '“I said you can’t sell it! It even has Mayushii’s name on it.”'),\n", " ('Mayuri', '“Tutturu♪ Upa, Upa, come out, come out, wherever you are!”'),\n", " ('Rintaro',\n", " '“What kind of man steals a helpless girl’s toy? Is there nothing in his heart but the lust for money!?”'),\n", " ('Mayuri', '“Sounds like you, Okarin!”'),\n", " ('???', '“AHHHHHHHHHHH!”'),\n", " ('Rintaro', '“!?”'),\n", " ('Mayuri', '“Was that a scream?”'),\n", " ('Rintaro', '“Stay here, Mayuri!”')]" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "print(len(script))\n", "script[100:130]" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [], "source": [ "from transformers import AutoTokenizer\n", "from tqdm import tqdm\n", "\n", "tokenizer = AutoTokenizer.from_pretrained(\"Gryphe/MythoMax-L2-13b\")\n", "\n", "def generate_examples(script, tokenizer, kurisu_count_min=1, window_length=10, rintaro_count_min=1, max_lines_without_kurisu=6):\n", " MAX_TOKENS = 800 # Change this value if you want to use a different token limit\n", "\n", " examples = []\n", " sliding_window = []\n", " example = []\n", " kurisu_counter = 0\n", " rintaro_counter = 0\n", " lines_without_kurisu = 0\n", " making_conversation = False\n", "\n", " for dialogue in tqdm(script):\n", " speaker, line = dialogue\n", "\n", " if len(sliding_window) == window_length:\n", " sliding_window.pop(0) # Remove first element\n", "\n", " sliding_window.append(dialogue)\n", "\n", " # Check if there are more than kurisu_count_min spoken lines from kurisu across sliding_window\n", " kurisu_counter = sum(1 for d in sliding_window if d[0] == 'Kurisu')\n", " rintaro_counter = sum(1 for d in sliding_window if d[0] == 'Rintaro')\n", "\n", " if speaker == 'Kurisu':\n", " lines_without_kurisu = 0 # Reset count\n", " else:\n", " lines_without_kurisu += 1 # Increment count\n", " \n", " can_start_conversation = kurisu_counter >= kurisu_count_min and rintaro_counter >= rintaro_count_min\n", " should_stop_conversation = making_conversation and (len(tokenizer.encode(' '.join([d[1] for d in example]))) > MAX_TOKENS or lines_without_kurisu > max_lines_without_kurisu)\n", " \n", " if making_conversation:\n", " if should_stop_conversation: # making conversation and should stop\n", " examples.append(example)\n", " example = []\n", " sliding_window = []\n", " kurisu_counter = 0\n", " rintaro_counter = 0\n", " lines_without_kurisu = 0\n", " making_conversation = False\n", " else: # making conversation and should not stop\n", " example.append(dialogue)\n", " elif can_start_conversation: # not making conversation and should start, by appending an example to conversation\n", " example.append(dialogue)\n", " making_conversation = True\n", "\n", " if example: # Add last example if it's non-empty\n", " examples.append(example)\n", "\n", " return examples\n", "\n", "\n", "# additional step: remove all non-kurisu examples at the end of each example. They're literally pointless and will not be used in training data anyway.\n", "\n", "# def format_conversations(script, speaker1='kurisu', speaker2='rintaro', speaker1_count=3, speaker2_count=2, window_size=6, max_tokens=600):\n", "# conversations = [] # to hold the conversations\n", "# current_conversation = [] # to hold the current conversation\n", "# current_window = [] # to hold the current window of lines\n", " \n", "# for line in script:\n", "# speaker, dialogue = line\n", "# current_window.append(line)\n", "\n", "# # If window is larger than window_size, remove the oldest line\n", "# if len(current_window) > window_size:\n", "# current_window.pop(0)\n", "\n", "# # Count the dialogues of speaker1 and speaker2 in the current window\n", "# speaker1_dialogues = sum([1 for line in current_window if line[0] == speaker1])\n", "# speaker2_dialogues = sum([1 for line in current_window if line[0] == speaker2])\n", "\n", "# # If conditions are met, add dialogues to the current conversation\n", "# if speaker1_dialogues >= speaker1_count and speaker2_dialogues >= speaker2_count:\n", "# current_conversation.append(line)\n", "\n", "# # If the current conversation reaches the max_tokens limit, add it to the conversations and reset current_conversation\n", "# if tokenizer.encode(' '.join([dialogue for _, dialogue in current_conversation]), return_tensors='pt').shape[1] > max_tokens:\n", "# conversations.append(current_conversation)\n", "# current_conversation = []\n", "# current_window = []\n", " \n", "# # Add the last conversation if it was not added before\n", "# if current_conversation and tokenizer.encode(' '.join([dialogue for _, dialogue in current_conversation]), return_tensors='pt').shape[1] <= max_tokens:\n", "# conversations.append(current_conversation)\n", "\n", "# return conversations" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "('???', '“Hey, what are you mumbling about?”')\n" ] } ], "source": [ "print(script[0])" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 14353/14353 [00:02<00:00, 6517.22it/s]\n" ] } ], "source": [ "created_examples_script = generate_examples(script, tokenizer,)" ] }, { "cell_type": "code", "execution_count": 72, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[('Kurisu', '“Could you come with me for a moment?”'),\n", " ('Rintaro', '“Y-you’re with the Organization!?”'),\n", " ('Kurisu', '“Huh?”'),\n", " ('Rintaro',\n", " '“If their tendrils have gotten this far, then I’ve made a grave mistake.”'),\n", " ('Kurisu', '“Stop fooling around and come with me.”'),\n", " ('Rintaro', '“...”'),\n", " ('Rintaro',\n", " '“Try anything and people are sure to notice. What will your superiors say then?”'),\n", " ('Kurisu', '“What are you talking about?”'),\n", " ('Kurisu', '“I just need to ask you something.”'),\n", " ('Rintaro',\n", " '“What makes you think I’ll answer? I know how the Organization operates.”'),\n", " ('Kurisu', '“What’s with this ’Organization’ stuff?”'),\n", " ('Rintaro',\n", " '“It’s me. I’ve been caught by an Organization agent. ...Yes, it’s Makise Kurisu. She’s a dangerous one. ...No, it’s fine. I’ll find a way to--”'),\n", " ('Kurisu', '“...”'),\n", " ('Rintaro', '“What are you doing!?”'),\n", " ('Kurisu', '“Huh? Your phone’s off.”'),\n", " ('Rintaro', '“...”'),\n", " ('Kurisu', '“...Who were you talking to?”'),\n", " ('Rintaro',\n", " '“Y-your techniques don’t work on me, but I’ll tell you anyway. That’s no ordinary phone. It’s designed to deactivate the moment it leaves my hand. Muhahaha!”'),\n", " ('Kurisu', '“...So you talk to yourself.”'),\n", " ('Rintaro', '“Guh!”'),\n", " ('Kurisu', '“What were you trying to tell me earlier?”'),\n", " ('Rintaro', '“What are you talking about?”'),\n", " ('Kurisu', '“About fifteen minutes ago. Before the conference started.”'),\n", " ('Kurisu',\n", " '“You were trying to tell me something, right? You looked really upset.”'),\n", " ('Kurisu', '“You looked like you were going to start crying any second.”'),\n", " ('Kurisu', '“Why? Have we met before?”'),\n", " ('Kurisu', '“And how do you know my name?”'),\n", " ('Rintaro', '“My knowledge has no limits.”'),\n", " ('Rintaro', '“Genius girl, our next meeting shall be as enemies!”'),\n", " ('Kurisu', '“Huh?”'),\n", " ('Rintaro', '“Farewell! Muhahaha!”'),\n", " ('Rintaro',\n", " '“Damn the Organization! They must be serious if they’re sending in agents like her!”'),\n", " ('Rintaro', '“But I can’t let them capture me yet.”'),\n", " ('Rintaro', '“...Damn. I left Mayuri behind.”'),\n", " ('Rintaro', '“Hm? An email?”'),\n", " ('Rintaro', '“...Hm?”')]" ] }, "execution_count": 72, "metadata": {}, "output_type": "execute_result" } ], "source": [ "created_examples_script[0]" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "224\n", "[('Kurisu', '“Oh?”'), ('Kurisu', '“That looks like an interesting experiment.”'), ('Rintaro', '“Who’s there!?”'), ('Rintaro', '“Impossible! What are you doing here!?”'), ('Rintaro', '“The 18-year-old genius girl! A sadist who humiliates men in public! Also known as The Zombie!”'), ('Rintaro', '“Makise... Kurisu!”'), ('Itaru', '”Nice exposition, bro.”'), ('Kurisu', '“Who are you calling a zombie?”'), ('Rintaro', '“What is the meaning of this!? What is your purpose here?”'), ('Kurisu', '“I’m here to see you, Okabe Rintaro-san. Or is it Hououin Kyouma-san?”'), ('Rintaro', '“I was right! You’re one of the Organization’s top agents, an esper with superhuman powers!”'), ('Rintaro', '“No wonder you rose from the dead!”'), ('Kurisu', '“I’m not dead, alright? Please stop killing me off.”'), ('Kurisu', '“Hashida-san, can you do something about this guy?”'), ('Itaru', '“You came at a bad time, Makise-shi, with Okarin freaking out like this.”'), ('Rintaro', '“Have you betrayed me, Daru!?”'), ('Itaru', '“Calm down, man.”'), ('Rintaro', '“Are you being blackmailed? Or did she seduce you?”'), ('Rintaro', '“YOU’VE CROSSED THE LINE, BIIITTCH!”'), ('Kurisu', '“Get ahold of yourself.”'), ('Kurisu', '“Hashida-san gave me the address after yesterday’s lecture. He also told me your name.”'), ('Rintaro', '“So you’re here to see me, is that it?”'), ('Kurisu', '“Yes. You claimed to have seen me die. I came to see if that was the truth, or just a pathetic excuse to grope me. I came for the answer.”'), ('Kurisu', '“But your current behavior is all the answer I need. It was all an act to grope me. My initial hypothesis was correct.”'), ('Rintaro', '“Not so fast. There’s more to this than you know.”'), ('Kurisu', '“Anyway, let’s put that aside for now.”'), ('Rintaro', '“R-really?”'), ('Kurisu', '“I haven’t properly introduced myself yet, have I? I’m Makise Kurisu. Pleased to meet you.”'), ('Kurisu', '“You can’t even shake hands? Are all Japanese men this difficult?”'), ('Rintaro', '“You’re not Japanese!?”'), ('Kurisu', '“I’ve lived in America for seven years. What about it?”'), ('Rintaro', '“America...”'), ('Kurisu', '“What’s your problem?”'), ('Rintaro', '“I can feel your aura of malice. You must be a powerful kung-fu master.”'), ('Kurisu', '“Don’t be ridiculous.”'), ('Rintaro', '“Then you’re a NINJA--”'), ('Kurisu', '“Give it a rest.”'), ('Rintaro', '“If you grew up in America, shouldn’t you say ’HAHAHA! NICE TO MEET YOU!’ with a smile across your whole face when asking for a handshake?”'), ('Rintaro', '“No, wait, you should be asking for a hug, right?”'), ('Kurisu', '“What kind of stereotype is that?”'), ('Kurisu', '“...Fascinating.”'), ('Kurisu', '“Have any forceps?”'), ('Rintaro', '“No!”'), ('Kurisu', '“Oh.”'), ('Rintaro', '“What are you doing!? That’s precious data!”'), ('Kurisu', '“It’s squishy.”'), ('Kurisu', '“...No taste. Gross.”'), ('Rintaro', '“You have quite the appetite, I see. A side-effect of the resurrection, perhaps? If you’re that hungry, I guess I could give you a banana or two!”'), ('Kurisu', '“No thanks.”'), ('Itaru', '“Either way, those bananas are Mayushii’s.”'), ('Rintaro', '“Come, don’t be shy. This is an offering. Take it!”'), ('Kurisu', '“As if. Who would eat some perv’s banana?”'), ('Itaru', '“A perv’s banana...!”'), ('Itaru', '“Eat... a perv’s banana... squishy... finger in mouth... ’Gross!’ with a sour expression...”')]\n" ] } ], "source": [ "# NOTE: you'll need to do some optimization here, depending on what the script's like. Some scripts have one character monologue for a very, very, long time. For those, shorter windows are needed, to get more \"chatty\" excerpts.\n", "print(len(created_examples_script))\n", "print(created_examples_script[5])" ] }, { "cell_type": "code", "execution_count": 87, "metadata": {}, "outputs": [], "source": [ "def remove_only_ellipsis_lines(conversation):\n", " \"\"\"Remove lines that only contain ellipsis.\"\"\"\n", " return [(speaker, line) for speaker, line in conversation if line.replace('“','').replace('”','') != \"...\"]" ] }, { "cell_type": "code", "execution_count": 88, "metadata": {}, "outputs": [], "source": [ "conversations_de_ellipsised = list(map(remove_only_ellipsis_lines, created_examples_script))" ] }, { "cell_type": "code", "execution_count": 92, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[('Kurisu', '“Could you come with me for a moment?”'),\n", " ('Rintaro', '“Y-you’re with the Organization!?”'),\n", " ('Kurisu', '“Huh?”'),\n", " ('Rintaro',\n", " '“If their tendrils have gotten this far, then I’ve made a grave mistake.”'),\n", " ('Kurisu', '“Stop fooling around and come with me.”'),\n", " ('Rintaro',\n", " '“Try anything and people are sure to notice. What will your superiors say then?”'),\n", " ('Kurisu', '“What are you talking about?”'),\n", " ('Kurisu', '“I just need to ask you something.”'),\n", " ('Rintaro',\n", " '“What makes you think I’ll answer? I know how the Organization operates.”'),\n", " ('Kurisu', '“What’s with this ’Organization’ stuff?”'),\n", " ('Rintaro',\n", " '“It’s me. I’ve been caught by an Organization agent. ...Yes, it’s Makise Kurisu. She’s a dangerous one. ...No, it’s fine. I’ll find a way to--”'),\n", " ('Rintaro', '“What are you doing!?”'),\n", " ('Kurisu', '“Huh? Your phone’s off.”'),\n", " ('Kurisu', '“...Who were you talking to?”'),\n", " ('Rintaro',\n", " '“Y-your techniques don’t work on me, but I’ll tell you anyway. That’s no ordinary phone. It’s designed to deactivate the moment it leaves my hand. Muhahaha!”'),\n", " ('Kurisu', '“...So you talk to yourself.”'),\n", " ('Rintaro', '“Guh!”'),\n", " ('Kurisu', '“What were you trying to tell me earlier?”'),\n", " ('Rintaro', '“What are you talking about?”'),\n", " ('Kurisu', '“About fifteen minutes ago. Before the conference started.”'),\n", " ('Kurisu',\n", " '“You were trying to tell me something, right? You looked really upset.”'),\n", " ('Kurisu', '“You looked like you were going to start crying any second.”'),\n", " ('Kurisu', '“Why? Have we met before?”'),\n", " ('Kurisu', '“And how do you know my name?”'),\n", " ('Rintaro', '“My knowledge has no limits.”'),\n", " ('Rintaro', '“Genius girl, our next meeting shall be as enemies!”'),\n", " ('Kurisu', '“Huh?”'),\n", " ('Rintaro', '“Farewell! Muhahaha!”'),\n", " ('Rintaro',\n", " '“Damn the Organization! They must be serious if they’re sending in agents like her!”'),\n", " ('Rintaro', '“But I can’t let them capture me yet.”'),\n", " ('Rintaro', '“...Damn. I left Mayuri behind.”'),\n", " ('Rintaro', '“Hm? An email?”'),\n", " ('Rintaro', '“...Hm?”')]" ] }, "execution_count": 92, "metadata": {}, "output_type": "execute_result" } ], "source": [ "conversations_de_ellipsised[0]" ] }, { "cell_type": "code", "execution_count": 94, "metadata": {}, "outputs": [], "source": [ "def merge_consecutive_lines(conversation):\n", " merged_conversation = []\n", " last_speaker = None\n", " for speaker, line in conversation:\n", " line_filtered = line.replace(\"“\",'').replace(\"”\",'')\n", " if not merged_conversation or speaker != last_speaker:\n", " # New speaker or first dialogue, just add it to the list\n", " merged_conversation.append((speaker, line.replace(\"“\",'').replace(\"”\",'')))\n", " else:\n", " # Same speaker as before, concatenate the lines\n", " prev_speaker, prev_line = merged_conversation.pop()\n", " merged_conversation.append((prev_speaker, (prev_line + \" \" + line).replace(\"“\",'').replace(\"”\",'')))\n", " last_speaker = speaker\n", " return merged_conversation # why do this step here? Because I don't want to iterate over the dataset twice, and monologues should count when examples are being generated with the sliding window, so I can't remove them in the usual spot.\n" ] }, { "cell_type": "code", "execution_count": 95, "metadata": {}, "outputs": [], "source": [ "conversations_processed = list(map(merge_consecutive_lines, conversations_de_ellipsised))" ] }, { "cell_type": "code", "execution_count": 96, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[('Kurisu', 'Could you come with me for a moment?'),\n", " ('Rintaro', 'Y-you’re with the Organization!?'),\n", " ('Kurisu', 'Huh?'),\n", " ('Rintaro',\n", " 'If their tendrils have gotten this far, then I’ve made a grave mistake.'),\n", " ('Kurisu', 'Stop fooling around and come with me.'),\n", " ('Rintaro',\n", " 'Try anything and people are sure to notice. What will your superiors say then?'),\n", " ('Kurisu', 'What are you talking about? I just need to ask you something.'),\n", " ('Rintaro',\n", " 'What makes you think I’ll answer? I know how the Organization operates.'),\n", " ('Kurisu', 'What’s with this ’Organization’ stuff?'),\n", " ('Rintaro',\n", " 'It’s me. I’ve been caught by an Organization agent. ...Yes, it’s Makise Kurisu. She’s a dangerous one. ...No, it’s fine. I’ll find a way to-- What are you doing!?'),\n", " ('Kurisu', 'Huh? Your phone’s off. ...Who were you talking to?'),\n", " ('Rintaro',\n", " 'Y-your techniques don’t work on me, but I’ll tell you anyway. That’s no ordinary phone. It’s designed to deactivate the moment it leaves my hand. Muhahaha!'),\n", " ('Kurisu', '...So you talk to yourself.'),\n", " ('Rintaro', 'Guh!'),\n", " ('Kurisu', 'What were you trying to tell me earlier?'),\n", " ('Rintaro', 'What are you talking about?'),\n", " ('Kurisu',\n", " 'About fifteen minutes ago. Before the conference started. You were trying to tell me something, right? You looked really upset. You looked like you were going to start crying any second. Why? Have we met before? And how do you know my name?'),\n", " ('Rintaro',\n", " 'My knowledge has no limits. Genius girl, our next meeting shall be as enemies!'),\n", " ('Kurisu', 'Huh?'),\n", " ('Rintaro',\n", " 'Farewell! Muhahaha! Damn the Organization! They must be serious if they’re sending in agents like her! But I can’t let them capture me yet. ...Damn. I left Mayuri behind. Hm? An email? ...Hm?')]" ] }, "execution_count": 96, "metadata": {}, "output_type": "execute_result" } ], "source": [ "conversations_processed[0]" ] }, { "cell_type": "code", "execution_count": 97, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[('Kurisu', 'However, all of these models are purely theoretical. Some of them even contradict each other.'), ('Rintaro', 'Well, what if someone comes up with a 12th model?'), ('Kurisu', 'Hm? Ahh, uhh, right, well... It could be contradicted by the 13th model, now couldn’t it? By the way, time travel to the future is available to us right now, according to Einstein’s special theory of relativity. For example, let’s say someone were to go to Haneda Airport and board a plane headed to Okinawa. Upon arrival, that person would be about one hundred millionth of a second farther into the future than I am. According to the special theory of relativity, time moves slower for objects as they approach the speed of light. For example, if you could run at near the speed of light, you could reach a point where time only moves half as fast for you. If you were to keep running at that speed for 24 hours, 48 hours would elapse in the rest of the world, meaning you would ’jump’ one day into the future. Understand, Hououin Kyouma?'), ('Rintaro', 'Gah!'), ('Professor', 'But that’s not really time travel, is it?'), ('Kurisu', 'Yes, you’re right.'), ('Professor', 'Then what about going to the past?'), ('Kurisu', 'Going to the past is possible right now. Take a look at the sky at night. You can see light from tens of thousands of years ago, can’t you?'), ('Student', 'That’s not time travel either!'), ('Kurisu', 'Well, I was just getting started. Let’s say we wanted to make a machine that could physically transport people through time. What would we need? The best candidates for this are cosmic strings and wormholes. A cosmic string is a string-shaped ’crevice’ with extreme mass. The crevice is about as wide as an elementary particle, and at least as long as the diameter of a galaxy. It has immense mass, so it distorts space-time. If you were to travel through that distortion, you could make a full circle around the string in less than 360 degrees. In short, you can do something resembling a warp. This is called a space-time angular deficit. When you pass through an area of angular deficit, transit time becomes zero. Now we apply this to a cosmic string moving at near-light speed. According to the special theory of relativity, time will flow slower for the cosmic string in relation to its surroundings. Therefore, passing through the area of angular deficit would cause the transit time to become negative instead of zero. In other words, you will arrive in the past after transit. If you use two cosmic strings, you can do a space deficit jump. If you loop back to your original location, you can return to the same time you started revolving. And that, roughly speaking, is time travel by means of cosmic strings. By the way, just so nobody misunderstands, cosmic strings are not the same as superstrings. Now then, you need three things in order to travel to the past with cosmic strings. First. The cosmic strings themselves. Two strings, to be exact. By the way, they are hypothesized to exist only where the universe was first formed, so they might be a little hard to find. Second. You would need the energy required to make them move them at near-light speed. How much energy do you think you’d need to accelerate something as long as the Milky Way to near the speed of light? I’m pretty sure it’s a little more than 1.21 jigowatts.')]\n" ] } ], "source": [ "print(conversations_processed[2])" ] }, { "cell_type": "code", "execution_count": 98, "metadata": {}, "outputs": [], "source": [ "def add_space_after_punctuation(conversations):\n", " corrected_conversations = []\n", " for conversation in conversations:\n", " corrected_conversation = []\n", " for speaker, line in conversation:\n", " # Add a space wherever there is a punctuation mark followed by a letter, excluding ellipsis\n", " corrected_line = re.sub(r'([.,!?])(?= 1, training_data_conversations))\n", "# len(processed_conversations)\n", "print(training_data_conversations[99][-1])\n", "print(len(training_data_conversations))" ] }, { "cell_type": "code", "execution_count": 107, "metadata": {}, "outputs": [], "source": [ "# A FUNCTION THAT LETS YOU CALL OPENAI ON ALL THE EXAMPLES\n", "import openai\n", "import os\n", "\n", "def write_context_to_file(training_data_example, destination_directory, example_index): # for easier inspection\n", " full_conversation = training_data_example[-1]\n", " context = '\\n'.join([f'{speaker}: {line}' for speaker, line in full_conversation])\n", " \n", " filename = os.path.join(destination_directory, f'{example_index:03d}_conversation.txt') # I'm paying for the tokens, I damn well want to see them\n", "\n", " # Write the scenario to the file\n", " with open(filename, 'w') as f_1:\n", " f_1.write(context)\n", "\n", "for idx, content in enumerate(training_data_conversations):\n", " write_context_to_file(content, 'conversations', idx)\n", " \n", "\n", "def create_scenario(training_data_example, destination_directory, example_index):\n", " full_conversation = training_data_example[-1]\n", " context = '\\n'.join([f'{speaker}: {line}' for speaker, line in full_conversation])\n", "\n", " if not os.path.exists(os.path.join(destination_directory, f'{example_index:03d}_cot_debug.txt')):\n", " response = openai.ChatCompletion.create(\n", " model=\"gpt-4\",\n", " temperature=0.7,\n", " messages=[\n", " {\"role\": \"system\", \"content\": \"\"\"You are an expert scenario-writing and prompt-engineering AI. Your task is to, given a series of statements made by characters from the visual novel Steins;Gate, determine which part of the story the scene is taking place in, and write a 3-sentence summary about what's happened until the point the conversation STARTS at (writing under the assumption that the reader knows who Kurisu is, and what some of her general traits are). \n", "\n", "Remember to keep the scenario at most three sentences long. Your goal is to describe the conversation's SETTING, at the START of the conversation (as well as what Kurisu is feeling and trying to do) instead of being to summarize it. This context should make sense if the AI to be trained on this data only had access to the first thing said by Kurisu.\n", "\n", "Think step-by-step, and explain your plan to write a good scenario for the provided context, before you actually write the scenario.\n", "\n", "Here are two roleplay prompt engineering things you should incorporate into your scenario:\n", "1. Your first sentence should explain the context of the scene: where it takes place, what exactly that place is (in general terms) and what each of the characters are doing there. Focus on Rintaro and Kurisu when it comes to motivations.\n", "2. End with a statement that describes where the scene is going, specifically, what Kurisu is trying to do, in the future tense. So if it's the scene where Kurisu first arrives in the lab, you might end your scenario with \"[short context behind the scene and what Kurisu is feeling]. Kurisu will interrogate Rintaro in her usual sarcastic, blunt manner, first about his earlier interaction with her, and then about his experiments.\" Note the mention of what kind of interaction this will be, as well as what Kurisu is actively trying to do in the conversation.\n", "\n", "Note that we're using the naming conventions of the visual novel here, so it's 'PhoneWave (name subject to change)', in case that ever comes up.\n", "\n", "----\n", "\n", "To help orient you as you determine which part of the plot a conversation is taking place in, here is the full plot summary of Steins;Gate, in point form.\n", "\n", "Rintaro, a self-proclaimed \"mad scientist,\" attends a lecture on time travel where he encounters Kurisu Makise, a genius neuroscientist/physicist. They have a brief argument about the feasibility of time travel.\n", "\n", "Shortly after, Rintaro discovers Kurisu lying in a pool of blood and sends a text to his friend, Daru. This inadvertently activates the \"Phone Microwave,\" a prototype time machine, sending the text to the past.\n", "\n", "Rintaro later finds out that Kurisu is alive and well. She confronts him about their earlier meeting, but he has no memory of it.\n", "\n", "Rintaro and his friends (including Mayuri, his childhood friend) discover that the Phone Microwave can send text messages to the past. They name these texts \"D-mails\" and experiment with them extensively, using them to grant the deepest-held wishes of three of their friends (Kiryu Moeka, Faris, Urashibara Luka, and Amane Suzuha, in that order)\n", "\n", "Kurisu joins Rintaro's lab and collaborates with him on perfecting the Phone Microwave. Their interactions start off tense but they slowly begin to trust and rely on each other.\n", "\n", "Darus, Kurisu, and Rintaro send various D-mails, leading to alterations in the timeline. Each change results in Rintaro being the only one who remembers the original timeline due to his \"Reading Steiner\" ability.\n", "\n", "Together, they develop a method to send memories to the past, effectively allowing the consciousness of the sender to time travel. They call this process \"Time Leap.\"\n", "\n", "Eventually the time travel experiments draw the attention of SERN, and the organization raids the lab and kills Mayuri. Mayuri begins to die in multiple timelines, prompting Rintaro to time leap repeatedly to save her, but he fails each time.\n", "\n", "Rintaro, with the help of Kurisu, undoes all of the D-mails he had sent earlier, causing himself great pain as he has to undo the very-personal wishes of Suzuha, Luka, Faris, and Moeka, in that order.\n", " \n", "Kurisu and Rintaro grow closer with each time leap, as Kurisu resolutely and coolheadedly supports Rintaro no matter the circumstances and no matter the timeline. Finally, Rintaro realizes that the original D-mail he sent about Kurisu's \"death\" is the cause of the altered timelines. Undoing this D-Mail means killing Kurisu. He swears to save both Kurisu and Mayuri, and tries hundreds of times, but fails every single time to save Mayuri.\n", "\n", "In one timeline, Rintaro confesses his feelings to Kurisu and they share a passionate kiss. However, he realizes that to save Mayuri, he must revert the timeline to one where Kurisu dies.\n", "\n", "Rintaro struggles with the decision but eventually, with Kurisu's encouragement, sends the D-mail to revert events to the original timeline.\n", " \n", "Shortly after reverting to the original timeline, Rintaro is contacted by a time-travelling Amane Suzuha, who implores him to help her stop World War 3 by saving Kurisu.\n", " \n", "Rintaro fails and ends up killing Kurisu with his own two hands this time, while trying to save her from her murderous father.\n", "\n", "Rintaro tries to move on but receives a message from his future self, providing a way to reach \"Steins Gate,\" a timeline where both Mayuri and Kurisu are saved.\n", "\n", "With help from his friends and guided by the message, Rintaro stages Kurisu's death without her actually dying, thus deceiving his past self and reaching the Steins Gate timeline.\n", "\n", "In the Steins Gate timeline, Rintaro and Kurisu meet by chance in the streets of Akihabara. They both feel a sense of déjà vu, hinting at their shared memories from other timelines.\n", "\n", "----\n", "\n", "One last pointer: keep the language simple. Which characters are where, under what circumstances, and what Kurisu is feeling and will do. The scene itself will do most of the talking. Keep the scenario 3 sentences long at most. Instead of mentioning events in the far future, you will concentrate on the event at hand and the things that led up to it.\n", " \"\"\"},\n", " {\n", " \"role\": \"user\",\n", " \"content\": \"\"\"Context:\n", "\n", "Kurisu: What pervy act are you thinking about this time? Can’t be too careful around the perv duo.\n", "\n", "Itaru: We’re not a perv duo. We are a pervy gentlemen duo!\n", "\n", "Kurisu: Whatever. Seriously, whatever.\n", "\"\"\"\n", " },\n", " {\n", " \"role\" : \"assistant\",\n", " \"content\": \"\"\"Plan: This conversation seems to be a casual one, in a non-crisis moment. Given the characters involved (Kurisu and Itaru), this is likely taking place in the Future Gadget Laboratory. The playful banter suggests a relatively peaceful time when they are working on the Phone Microwave, somewhere in the middle of the series after Kurisu has joined Rintaro's lab. \n", "\n", "Scenario: Inside the Future Gadget Laboratory, Kurisu, Rintaro, and Itaru are engaged in another day of research and experimentation on the PhoneWave. Having grown closer to the group, Kurisu is now familiar with their antics, enjoys their company somewhat, and is comfortable enough to tease them. Kurisu, feeling witty, will fire off sarcastic remarks towards Itaru and Rintaro, in her own unique way of bonding with the team.\"\"\"\n", " },\n", " {\n", " \"role\" : \"user\",\n", " \"content\" : context\n", " }\n", " ]\n", " )\n", " \n", " scenario = response['choices'][0]['message']['content']\n", " \n", " filename_cot_debug = os.path.join(destination_directory, f'{example_index:03d}_cot_debug.txt') # I'm paying for the tokens, I damn well want to see them\n", "\n", " # Write the scenario to the file\n", " with open(filename_cot_debug, 'w') as f_1:\n", " f_1.write(scenario)\n", "\n", " # # Assume \"Scenario:\\n\" is followed by the actual scenario\n", " scenario = re.search('Scenario:(.*)', scenario, re.DOTALL).group(1)\n", "\n", " # Create a filename based on the example index\n", " filename = os.path.join(destination_directory, f'{example_index:03d}.txt')\n", "\n", " # Write the scenario to the file\n", " with open(filename, 'w') as f_2:\n", " f_2.write(scenario)\n", " else:\n", " print(f\"Skipping {example_index:03d} because it already exists.\")" ] }, { "cell_type": "code", "execution_count": 108, "metadata": {}, "outputs": [], "source": [ "\n", "openai.api_key = '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'\n", "# create_scenario(training_data_conversations[2], 'scenarios', 2)\n", "# print(training_data_conversations[70][-1])\n", "# create_scenario(training_data_conversations[70], 'scenarios', 70)" ] }, { "cell_type": "code", "execution_count": 111, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ " 0%| | 0/186 [00:00