{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "provenance": [], "include_colab_link": true }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" } }, "cells": [ { "cell_type": "markdown", "metadata": { "id": "view-in-github", "colab_type": "text" }, "source": [ "\"Open" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "id": "nAJ2Ubu1-MUb", "outputId": "d4b88502-60dc-49cf-efb3-4151a994e79c", "colab": { "base_uri": "https://localhost:8080/" } }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "--2023-11-05 02:16:18-- https://github.com/LC1332/Needy-Haruhi/raw/main/data/Jines.csv\n", "Resolving github.com (github.com)... 140.82.112.4\n", "Connecting to github.com (github.com)|140.82.112.4|:443... connected.\n", "HTTP request sent, awaiting response... 302 Found\n", "Location: https://raw.githubusercontent.com/LC1332/Needy-Haruhi/main/data/Jines.csv [following]\n", "--2023-11-05 02:16:18-- https://raw.githubusercontent.com/LC1332/Needy-Haruhi/main/data/Jines.csv\n", "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 249489 (244K) [text/plain]\n", "Saving to: ‘Jines.csv’\n", "\n", "Jines.csv 100%[===================>] 243.64K --.-KB/s in 0.03s \n", "\n", "2023-11-05 02:16:19 (9.50 MB/s) - ‘Jines.csv’ saved [249489/249489]\n", "\n" ] } ], "source": [ "# 下载文件。\n", "!wget https://github.com/LC1332/Needy-Haruhi/raw/main/data/Jines.csv" ] }, { "cell_type": "code", "source": [ "import os\n", "import re\n", "import shutil\n", "\n", "import pandas as pd\n", "\n", "Jines_file = r\"/content/Jines.csv\" # 更改文件路径和扩展名\n", "\n", "# # 读取CSV文件\n", "Jines = pd.read_csv(Jines_file, lineterminator='\\n')" ], "metadata": { "id": "1-kV6t_ARAGN" }, "execution_count": 2, "outputs": [] }, { "cell_type": "code", "source": [ "import os\n", "import re\n", "import shutil\n", "\n", "import pandas as pd\n", "\n", "Jines_file = r\"/content/Jines.csv\" # Change the file path and extension as needed\n", "\n", "# Read the CSV file\n", "Jines = pd.read_csv(Jines_file, lineterminator='\\n')\n", "\n", "\n" ], "metadata": { "id": "SRQcHQgHRTDz" }, "execution_count": 4, "outputs": [] }, { "cell_type": "markdown", "source": [ "实现一个python函数,输入是字符串,输出也是一个字符串,找到第一个\"(\",取括号之前的字符串并strip后输出\n", "\n", "例子输入\n", "Day0_JINE (First Part)\n", "例子输出\n", "Day0_JINE" ], "metadata": { "id": "j0UF0Oh7R9ou" } }, { "cell_type": "code", "source": [ "def extract_string_before_parentheses(input_value):\n", " # Check if the input is not a string\n", " if pd.isnull(input_value):\n", " return \"\"\n", " elif not isinstance(input_value, str):\n", " print(\"Warning: Input is not a string. Converting to string.\")\n", " # Convert to string if possible\n", " input_value = str(input_value)\n", " print(input_value)\n", "\n", " # Split the string at the first occurrence of '('\n", " parts = input_value.split('(', 1)\n", " # Take the first part and strip it of whitespace\n", " return parts[0].strip()\n", "\n", "# Example usage with a string:\n", "input_example_str = \"Day0_JINE (First Part)\"\n", "print(extract_string_before_parentheses(input_example_str)) # Expected output: Day0_JINE\n", "\n", "# Example usage with a non-string:\n", "input_example_non_str = 12345\n", "print(extract_string_before_parentheses(input_example_non_str)) # Expected output: Warning + \"12345\"\n" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "mBWfbACoSPLr", "outputId": "6d9d7e52-a87e-47d2-c959-ca4fd722d5c4" }, "execution_count": 15, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Day0_JINE\n", "Warning: Input is not a string. Converting to string.\n", "12345\n", "12345\n" ] } ] }, { "cell_type": "code", "source": [ "count = 0\n", "\n", "lines = []\n", "\n", "last_parent = \"\"\n", "last_category = \"\"\n", "\n", "all_events = []\n", "\n", "# Loop through each row and print the 'Category' and 'Parent'\n", "for index, row in Jines.iterrows():\n", "\n", " if pd.isnull(row['ParentId (more info)']):\n", " continue\n", "\n", " parent = extract_string_before_parentheses(row['ParentId (more info)'])\n", " category = row['Category']\n", "\n", " if parent.startswith(\"Ending\"):\n", " break\n", "\n", "\n", " # print(f\"Category: {category}, Parent: {parent}\")\n", "\n", " if category == last_category and last_parent == parent:\n", " lines.append(row)\n", " else:\n", " data = {\n", " \"parent\": last_parent,\n", " \"category\": last_category,\n", " \"lines\": lines\n", " }\n", " all_events.append(data)\n", "\n", " last_parent = parent\n", " last_category = category\n", " lines = [row]\n", "\n", "if len(lines) > 0:\n", " data = {\n", " \"parent\": last_parent,\n", " \"category\": last_category,\n", " \"lines\": lines\n", " }\n", " all_events.append(data)\n", "\n", "all_events = all_events[1:]" ], "metadata": { "id": "MNZSH1qRRcdF" }, "execution_count": 28, "outputs": [] }, { "cell_type": "code", "source": [ "print(len(all_events))" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "vCcRJXR2TiE7", "outputId": "82be5beb-2a83-4cff-c78a-66b4f3bfb6c0" }, "execution_count": 29, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "196\n" ] } ] }, { "cell_type": "code", "source": [ "for i, event in enumerate(all_events):\n", " if i % 3 == 0:\n", " print(i, event['parent'])\n", " else:\n", " print(i, event['parent'], end = ' ')" ], "metadata": { "id": "Qe3Cb7kVTlqd" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "for i, event in enumerate(all_events):\n", " only_ame_flag = True\n", " for line in event['lines']:\n", " speaker = line[\"Speaker/Action (in blue)\"]\n", " if speaker != \"ame\":\n", " only_ame_flag = False\n", " break\n", " if only_ame_flag:\n", " print(event['parent'])" ], "metadata": { "id": "kpHE_EI_YeD5" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "def check_only_ame(event):\n", " for line in event['lines']:\n", " speaker = line[\"Speaker/Action (in blue)\"]\n", " if speaker == \"pi\":\n", " return False\n", " return True\n", "\n", "def check_2nd(event):\n", " for line in event['lines']:\n", " parent = line['ParentId (more info)']\n", " if \"2ndOption\" in parent:\n", " return True\n", " return False" ], "metadata": { "id": "Ab_yJKGcZyy6" }, "execution_count": 79, "outputs": [] }, { "cell_type": "code", "source": [ "def check_open(event):\n", " for line in event['lines']:\n", " speaker = line[\"Speaker/Action (in blue)\"]\n", " if speaker == \"Any Open-Text Answer\":\n", " return True\n", " return False" ], "metadata": { "id": "tbnALRMTxW13" }, "execution_count": 80, "outputs": [] }, { "cell_type": "code", "source": [ "def transfer_dialogue(event):\n", " dialogues = []\n", "\n", " last_speaker = \"\"\n", " last_text = \"\"\n", " last_title = \"\"\n", " for i, line in enumerate(event['lines']):\n", " speaker = line[\"Speaker/Action (in blue)\"]\n", " title = line[\"Id\"]\n", " text = line[\"BodyCn\"]\n", " if speaker != last_speaker or i + 1 == len(event['lines']):\n", " if last_speaker != \"\":\n", " dialogues.append({\n", " \"speaker\": last_speaker,\n", " \"text\": last_text,\n", " \"title\": last_title\n", " })\n", " last_speaker = speaker\n", " last_text = text\n", " last_title = title\n", " else:\n", " last_text += text + \" \"\n", "\n", " return dialogues" ], "metadata": { "id": "KeCarNmmfur9" }, "execution_count": 81, "outputs": [] }, { "cell_type": "markdown", "source": [ "如果是一开始的ame的部分,直接作为prefix\n", "\n", "然后pi的时候要看相同的parent\n", "\n", "然后后面如果有不一样的就作为post\n" ], "metadata": { "id": "e0nGAhy4pw-j" } }, { "cell_type": "code", "source": [ "count = 0\n", "for i, event in enumerate(all_events):\n", " if check_only_ame(event):\n", " continue\n", "\n", " if check_2nd(event):\n", " continue\n", "\n", " if check_open(event):\n", " continue\n", "\n", " parent = event['parent']\n", " # print(parent, ' ' , event['category'],' ', len(event['lines']))\n", "\n", " # dealing with one event\n", " state = \"count_prefix\"\n", "\n", " prefix = \"\"\n", " options = []\n", "\n", " # if parent in [\"Day0_JINE\"]:\n", " # verbose = True\n", " # else:\n", " # verbose = False\n", "\n", " verbose = False\n", "\n", " record_flag = True\n", "\n", " for i, line in enumerate(event['lines']):\n", "\n", " speaker = line[\"Speaker/Action (in blue)\"]\n", " line_parent = line['ParentId (more info)']\n", " content = line[\"BodyCn\"]\n", "\n", " if verbose:\n", " print( speaker ,\" \", line_parent, \" \", content, \" \", len(options))\n", "\n", "\n", " if i != len(event['lines']) - 1:\n", " next_parent = event['lines'][i + 1]['ParentId (more info)']\n", " next_parent = next_parent.replace(\";end\", \"\")\n", " next_speaker = event['lines'][i + 1]['Speaker/Action (in blue)']\n", " else:\n", " next_parent = \"\"\n", " next_speaker = \"\"\n", "\n", " if state == \"count_prefix\":\n", " if verbose:\n", " print(\"state = \", state )\n", "\n", " if speaker == \"ame\":\n", " prefix += line[\"BodyCn\"] + \"\\n\"\n", " continue\n", " elif speaker == \"pi\":\n", " state = \"count_option_start\"\n", "\n", " if state == \"count_option_start\":\n", " if verbose:\n", " print(\"prefix = \")\n", " print(prefix)\n", " print(\"state = \", state )\n", "\n", " if speaker == \"pi\":\n", " option_text = line[\"BodyCn\"]\n", " state = \"collect_reply\"\n", " option_parent = line_parent\n", " # print(\"opt_parent=\", option_parent)\n", " # print(\"next_parent=\", next_parent)\n", " reply = \"\"\n", "\n", " if next_speaker == \"pi\":\n", " # 说明下一个也是选项,要直接终结掉这个选项\n", " state = \"count_post\"\n", " else:\n", " continue\n", " else:\n", " print(\"warning! not pi's reply in count_option_start, Event = \", parent)\n", "\n", " if state == \"collect_reply\":\n", " if verbose:\n", " print(\"state = \", state )\n", "\n", " # if speaker != \"ame\" and speaker != \"pi\":\n", " # # print(\"skip speaker \", speaker)\n", " # continue\n", "\n", " if speaker == \"ame\":\n", " option_text += line[\"BodyCn\"]\n", " elif speaker == \"pi\":\n", " # a new reply\n", " print(\"warning! not ame's reply in collect_reply, Event = \", parent)\n", "\n", " if i == len(event['lines']) - 1:\n", " state = \"count_post\"\n", " else:\n", " if next_parent != option_parent:\n", " state = \"count_post\"\n", "\n", " if state == \"count_post\":\n", " if verbose:\n", " print(\"state = \", state )\n", " option_data = {\n", " \"text\": option_text,\n", " \"reply\": reply\n", " }\n", "\n", " options.append(option_data)\n", "\n", " if i == len(event['lines']) - 1:\n", " break\n", "\n", " next_speaker = event['lines'][i + 1]['Speaker/Action (in blue)']\n", "\n", " if \"Option\" in next_parent and next_speaker == \"pi\":\n", " state = \"count_option_start\"\n", " elif \"Option\" not in next_parent and next_speaker == \"ame\":\n", " state = \"collect_post\"\n", " else:\n", " # print(\"strange\", next_parent, ' ', next_speaker, ' ', content)\n", " record_flag = False\n", "\n", "\n", " continue\n", "\n", " if record_flag:\n", " event_name = extract_string_before_parentheses(parent)\n", " print(event_name, \" len_prefix = \" , len(prefix), \" #opt = \", len(options))\n", " print(prefix)\n", "\n", " count += 1\n", "\n", "\n", " # break\n", "\n", " # if state == \"count_option_end\":\n", " # if speaker == \"ame\":\n", " # option_text += line[\"BodyCn\"]\n", " # else:\n", " # print(\"warning! not ame's reply in count_option_end\")\n", "\n", " # print(\"prefix:\\n\",prefix)\n", " # print(\"first option:\\n\",option_text)\n", " # print()\n", " # count += 1\n", " # if count > 5:\n", " # break\n", "\n", "\n", "print(count)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "t5pn9UfSZOWo", "outputId": "c7d257f9-0fa3-4800-aef3-642566d92139" }, "execution_count": 111, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Event_UberEats len_prefix = 25 #opt = 3\n", "我们点外卖吧我一步也不想动了可是又超想吃饭!!!\n", "\n", "Event_Sea len_prefix = 27 #opt = 2\n", "我已经彻底疲倦了\n", "不如我们结束这一切 现在就去海边吧\n", "\n", "Event_Pudding len_prefix = 26 #opt = 3\n", "我没打招呼就把冰箱里的布丁吃了 会被判死刑吗???\n", "\n", "Event_Hairstyle len_prefix = 20 #opt = 4\n", "想换个发型了,阿P喜欢什么样子的糖糖?\n", "\n", "Event_Money len_prefix = 15 #opt = 3\n", "我要出去玩!给我零花钱!!!\n", "\n", "Event_Seikei len_prefix = 18 #opt = 3\n", "如果我要整容,你觉得整哪里比较好?\n", "\n", "Event_AmePiercerd len_prefix = 70 #opt = 2\n", "嗳,你来帮我打耳洞嘛 让喜欢的人给自己打耳洞很棒不是吗 有一种被支配着的感觉 鸡皮疙瘩都要起来了\n", "我好怕我好怕我好怕\n", "我好怕!\n", "但是来吧!\n", "\n", "Event_Charahen len_prefix = 14 #opt = 3\n", "哎,你喜欢什么样的糖糖啊?\n", "\n", "Event_AmeFuture len_prefix = 18 #opt = 3\n", "哎,你会希望看到糖糖将来的样子吗?\n", "\n", "Event_Sumabura len_prefix = 41 #opt = 2\n", "我也想被做进那个大乱斗游戏……\n", "哎,如果那个游戏里面有超天酱的话,阿P会用我吗?\n", "\n", "Event_Negativ len_prefix = 37 #opt = 3\n", "光是活着就好累啊……\n", "现在无论是谁对我说什么,我肯定都会往负面方向去理解\n", "\n", "Event_DrugHolic len_prefix = 46 #opt = 2\n", "啊~不行了 不行不行不行不行\n", "无论思考什么,满脑袋都只有一个“死”字\n", "阿P,我该怎么办啊?\n", "\n", "Event_Jisatumisui len_prefix = 30 #opt = 3\n", "不行了 我现在就想立刻马上消失\n", "阿P 我们一起去买炭吧……\n", "\n", "Event_Flower len_prefix = 12 #opt = 3\n", "阿P,看!我买了小发发\n", "\n", "Event_Advice len_prefix = 24 #opt = 3\n", "我正在想下次搞什么企划呢~阿P帮帮我 出出主意\n", "\n", "Event_Cheerup len_prefix = 32 #opt = 2\n", "我今后也会努力加油的,你要支持我哦 还有阿P你自己也要加油哦!\n", "\n", "Event_LoveJINE len_prefix = 10 #opt = 3\n", "阿P 我最喜欢你了\n", "\n", "Event_Manicure len_prefix = 19 #opt = 4\n", "哎,你觉得我下次美甲做什么颜色好呢?\n", "\n", "Event_Okusan len_prefix = 54 #opt = 3\n", "说到笨蛋情侣,就不得不提那个段子了\n", "“欢迎回家,你要先吃饭?”“还是先洗澡?”“还是……先,吃,我,呢?”\n", "\n", "Event_Copyceleb len_prefix = 33 #opt = 3\n", "机会这么难得,要不整点富婆快乐活吧\n", "说不定还能用作下次的企划哦!\n", "\n", "Event_Menherafriend len_prefix = 133 #opt = 3\n", "有个女孩发私信找我谈人生,我该怎么办呐\n", "「超天酱你好,我是一名高中生。之前因为精神疾病而住院了一段时间,现在跟不上学习进度,班上还没决定好志愿的人也只剩我一个了。平时看着同学们为了各自的前程努力奋斗的样子,心里总是非常地焦虑。请你告诉我,我到底应该怎么办才好呢?」\n", "\n", "Event_Okiru_Afternoon len_prefix = 40 #opt = 2\n", "醒过来一看太阳都下山了 笑死\n", "睡太久了浑身无力~~……我可以就酱紫睡一辈子吗?\n", "\n", "Event_Okiru_Night len_prefix = 45 #opt = 1\n", "要命 一个回笼觉睡到了这个点\n", "浪费一整天啥都没干的罪恶感好难顶啊!\n", "你为什么不叫醒我啦!\n", "\n", "Event_Newthings len_prefix = 15 #opt = 3\n", "今天有点想试试平时不会做的事\n", "\n", "Event_Watchword len_prefix = 29 #opt = 3\n", "小天使请安!这个开场白也说厌了啊~\n", "帮我想个别的开场白!\n", "\n", "Day0_JINE len_prefix = 180 #opt = 2\n", "啊~紧张死了……\n", "我们两个一起想出来的“超天酱”\n", "终于,降临在这个世界上了\n", "粉丝……涨了一千啊\n", "这样都得不到什么被捧的感觉\n", "毕竟现在才刚开始呢\n", "想满足我黑洞似的认可欲求\n", "最少也得有一百万个宅宅围着我转呀\n", "大概一个月的时间,胜负就能见分晓吧\n", "因为凭我的干劲也只能坚持那么久……\n", "所以接下来的这一个月,咱们要努力奋斗咯!!\n", "我和你的话,一定能够打造厉害的主播吧?\n", "\n", "Day1_JINE len_prefix = 208 #opt = 1\n", "早啊!\n", "这是我们当上主播后的第一个早晨呢\n", "……然而时间已经到中午了\n", "早起无能~~~\n", "算了,就这样吧!\n", "距离百万粉丝的目标只剩区区999000人了\n", "现在先朝着一万粉进发吧!\n", "就让我们潇洒登顶吧♪\n", "你也知道,仅凭我一个人是什么都做不到的\n", "阿P你每天都要给我下很多很多的指示呀\n", "我相信你哦\n", "只要是你说的\n", "我什么都会听\n", "我相信阿P\n", "我一定乖乖听话\n", "所以你一定要把我打造成最棒的主播呀……\n", "不然的话\n", "不然的话,我可是会坏掉的\n", "\n", "Event_Wishlist len_prefix = 30 #opt = 4\n", "我要搞自己的心愿单了\n", "然后本糖允许阿P来想要往里加什么东西\n", "\n", "Event_Song len_prefix = 53 #opt = 1\n", "你快看私信!\n", "有位作曲家联系我,说要给我写角色歌诶!\n", "哎呀~终于也走到这一步了~\n", "宅宅们的耳朵要怀孕啦~\n", "\n", "Scenario_topstreamer_trakenjoikeike len_prefix = 100 #opt = 1\n", "快看,快看啊阿P!\n", "锵锵~粉丝破百万的纪念金盾哦!\n", "像黄金骑士一样,金光闪闪!\n", "哈,哈,哈!\n", "全世界的阿宅们都彻底被我的颜值俘虏啦\n", "而阿P,你就是这个可爱过头的女孩子最最在乎的人!\n", "你要以此为豪哦!\n", "\n", "30\n" ] } ] }, { "cell_type": "code", "source": [ "\n", "for event in all_events:\n", " flag = False\n", " for line in event['lines']:\n", " parent = line['ParentId (more info)']\n", " if \"2ndOption\" in parent:\n", " flag = True\n", " break\n", "\n", " if flag:\n", " print(event['parent'])" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "myJ3x9xGVCir", "outputId": "96594bd6-0178-4048-fbf9-f9fe1e3a394f" }, "execution_count": 33, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Event_NextDate\n", "Event_Yandeiru\n", "Event_Yutabon\n", "Event_MailInterview\n", "Event_Dialog\n" ] } ] }, { "cell_type": "code", "source": [ "count = 0\n", "\n", "for event in all_events:\n", " if len(event['lines']) == 1:\n", " continue\n", "\n", " flag = False\n", " for line in event['lines']:\n", " parent = line['ParentId (more info)']\n", " if \"2ndOption\" in parent:\n", " flag = True\n", " break\n", "\n", " if flag:\n", " continue\n", "\n", " count += 1\n", "\n", "print(count)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "6apNMqgXV5aH", "outputId": "8eb21455-999e-46b3-d575-05841afb5adb" }, "execution_count": 35, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "176\n" ] } ] }, { "cell_type": "code", "source": [ "\n", "\n", "\n", "# # 查询有选项的内容\n", "pattern = r'(?<=\\().*?(?=\\))'\n", "\n", "# # 匹配事件\n", "Title = Jines.loc[(Jines['ParentId (more info)'].str.contains(pattern, regex=True, na=False))]\n", "\n", "Attribute_temp = {\"Affection\": 0, \"Stress\": 0, \"Darkness\": 0}\n" ], "metadata": { "id": "vmh37VqKCiw-" }, "execution_count": 61, "outputs": [] }, { "cell_type": "markdown", "source": [ "Jines_file = r\"/content/Jines.csv\" # 更改文件路径和扩展名\n", "\n", "使用utf-8读取这个文件后,为我split成多行" ], "metadata": { "id": "5ZS0MWuxJoIN" } }, { "cell_type": "code", "source": [ "def sanitize_filename(filename):\n", " invalid_chars = '<>:\"/\\\\|?*\\n'\n", " for char in invalid_chars:\n", " filename = filename.replace(char, '_')\n", " return filename" ], "metadata": { "id": "aOBxpvtlK8B5" }, "execution_count": 68, "outputs": [] }, { "cell_type": "code", "source": [ "!rm -rf /content/events\n", "!mkdir events\n" ], "metadata": { "id": "PLoXThkLLC7e" }, "execution_count": 84, "outputs": [] }, { "cell_type": "code", "source": [ "def set_json_value( a, key, value, verbose = True):\n", " if key in a and a[key] != value:\n", " event_name = a[\"Name_while_read_csv\"]\n", " if verbose:\n", " print(f\"Warning! Key {key} already exists in event {event_name}\")\n", " print(f\"try overwrite {a[key]} to {value}\")\n", " else:\n", " a[key] = value\n", " return a" ], "metadata": { "id": "AydtmWYmLm6z" }, "execution_count": 111, "outputs": [] }, { "cell_type": "markdown", "source": [ "请为我实现一个python函数,输入和输出都是字符串\n", "去掉字符串末尾3位可能的数字。" ], "metadata": { "id": "yE5sAn59ODk4" } }, { "cell_type": "code", "source": [ "def remove_trailing_digits(s: str) -> str:\n", " \"\"\"\n", " Remove up to three trailing digits from a string.\n", "\n", " :param s: Input string that may end with up to three digits.\n", " :return: String with the trailing digits removed.\n", " \"\"\"\n", " # 初始化一个变量,用于计数尾部连续数字的数量\n", " trailing_digit_count = 0\n", "\n", " # 从字符串末尾开始,向前检查每个字符\n", " for char in reversed(s[-3:]): # 查看最后三个字符\n", " if char.isdigit(): # 如果字符是数字\n", " trailing_digit_count += 1 # 增加计数\n", " else:\n", " break # 如果遇到非数字字符,跳出循环\n", "\n", " # 如果尾部有数字,去除相应数量的字符\n", " if trailing_digit_count > 0:\n", " return s[:-trailing_digit_count]\n", " else:\n", " return s\n", "\n", "\n", "# 示例用法\n", "input_str = \"example123\"\n", "output_str = remove_trailing_digits(input_str)\n", "print(output_str) # 应该输出 \"example\"\n" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "dAGwJKDoOOam", "outputId": "c217ad74-e14d-4c17-92d5-917c1b5ef1d5" }, "execution_count": 112, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "example\n" ] } ] }, { "cell_type": "code", "source": [ "event_name_to_data = {}\n", "\n", "# 把文件转换为txt\n", "def format_output(row):\n", "\n", " # ID\n", " ParentId = f'{row[\"ParentId (more info)\"]}'\n", " Category_temp = f'{row[\"Category\"]}'\n", " Category = sanitize_filename(Category_temp)\n", " ID = f'{row[\"Id\"]}'\n", "\n", " # 匹配标题\n", " regex1 = r\"\\w+(?= \\()\"\n", " title = re.search(regex1, ParentId)\n", " title_str = title.group()\n", "\n", " current_data = {}\n", "\n", " # 通过title_str 索引出数据\n", " if title_str in event_name_to_data:\n", " current_data = event_name_to_data[title_str]\n", " else:\n", " current_data = {\"Name_while_read_csv\":title_str,\"options\":[] }\n", " event_name_to_data[title_str] = current_data\n", "\n", "\n", " # 事件\n", " event_list = []\n", "\n", " # 匹配提问\n", " match = re.search(r\"\\(First Part\\)\", ParentId)\n", " match2 = re.search(r\"\\(First Part; end\\)\", ParentId)\n", " match3 = re.search(r\"\\(Third Part\\)\", ParentId)\n", " match4 = re.search(r\"\\(Second Part\\)\", ParentId)\n", " match5 = re.search(r\"\\(Fourth Part\\)\", ParentId)\n", "\n", " # 数值\n", " aff = f\"Affection: {row['Affection']}\"\n", " str = f\"Stress: {row['Stress']}\"\n", " dar = f\"Darkness: {row['Darkness']}\"\n", "\n", " # 匹配选项以及回复\n", " choose_time = re.search(r\"\\d+\", ParentId)\n", " reply_ = re.search(r'(\\(.*Option[0-9]+;end\\))', ParentId)\n", " reply_2 = re.search(r'(\\(.*Option[0-9]\\))', ParentId)\n", "\n", " # 处理提问\n", " # if match or match2 or match4 or match5 or match3:\n", " if match or match2 or match3 or match4 or match5:\n", "\n", " Prefix = f'\\n## 对话\\n### Prefix Category_temp:{Category} ID:{ID}'\n", "\n", " current_data = set_json_value(current_data, \"category\", Category,False)\n", "\n", " if ID and len(ID) > 0 and ID != \"nan\":\n", " ID = remove_trailing_digits(ID)\n", " current_data = set_json_value(current_data, \"id\", ID, False)\n", "\n", "\n", "\n", " Ame = f\"糖糖: {row['BodyCn']}\"\n", " with open(f'events/{title_str}.txt', 'a+', encoding='utf-8') as f:\n", " # 使用 join 方法将 Ame, Title_ame, Category 连接成一个字符串,并在每个字段之间添加一个制表符\n", " line = '\\n'.join([Prefix, Ame])\n", "\n", " line_bytes = line.encode('utf-8')\n", " # 将字节对象写入到文件中\n", " line_str = line_bytes.decode('utf-8')\n", " # 将字符串对象写入到文件中\n", " f.write(line_str)\n", "\n", " return \"\\n\".join([Prefix, Ame])\n", "\n", " # 处理选项\n", " elif row['Speaker/Action (in blue)'] == 'pi':\n", " # 跳过数值为空的回复\n", " try:\n", " key = f'\\n### Option-{choose_time.group()}'\n", " user = f\"User: {row['BodyCn']}\"\n", "\n", " if aff == 'Affection: nan':\n", " aff = ''\n", " if str == 'Stress: nan':\n", " str = ''\n", " if dar == 'Darkness: nan':\n", " dar = ''\n", " value = f\"Attribute Change: {aff} {str} {dar}\"\n", "\n", " if value == 'Attribute Change: ':\n", " value = ''\n", "\n", " with open(f'events/{title_str}.txt', 'a+', encoding='utf-8') as f:\n", " # 使用 join 方法将 Ame, Title_ame, Category 连接成一个字符串,并在每个字段之间添加一个制表符\n", " line = '\\n'.join([key, user, value])\n", "\n", " line_bytes = line.encode('utf-8')\n", " # 将字节对象写入到文件中\n", " line_str = line_bytes.decode('utf-8')\n", " # 将字符串对象写入到文件中\n", " f.write(line_str)\n", " return \"\\n\".join([key, user, value])\n", " except:\n", " pass\n", "\n", " # 处理选项回复\n", " elif reply_ or (reply_2 and row['Speaker/Action (in blue)'] == 'ame'):\n", " try:\n", " key = f'\\nReply:\\n糖糖:{row[\"BodyCn\"]}'\n", "\n", " if aff == 'Affection: nan':\n", " aff = ''\n", " if str == 'Stress: nan':\n", " str = ''\n", " if dar == 'Darkness: nan':\n", " dar = ''\n", " value = f\"Attribute Change: {aff} {str} {dar}\"\n", "\n", " if value == 'Attribute Change: ':\n", " value = 'Attribute Change: None'\n", "\n", " if key == '\\nReply:\\n糖糖:nan':\n", " with open(f'events/{title_str}.txt', 'a+', encoding='utf-8') as f:\n", " # 使用 join 方法将 Ame, Title_ame, Category 连接成一个字符串,并在每个字段之间添加一个制表符\n", " line = '\\n'.join([value])\n", " line_bytes = line.encode('utf-8')\n", " # 将字节对象写入到文件中\n", " line_str = line_bytes.decode('utf-8')\n", " # 将字符串对象写入到文件中\n", " f.write(line_str)\n", "\n", " return \"\\n\".join([value])\n", "\n", " with open(f'events/{title_str}.txt', 'a+', encoding='utf-8') as f:\n", " # 使用 join 方法将 Ame, Title_ame, Category 连接成一个字符串,并在每个字段之间添加一个制表符\n", " line = '\\n'.join([key, value])\n", "\n", " line_bytes = line.encode('utf-8')\n", " # 将字节对象写入到文件中\n", " line_str = line_bytes.decode('utf-8')\n", " # 将字符串对象写入到文件中\n", " f.write(line_str)\n", "\n", " return \"\\n\".join([key, value])\n", " except:\n", " pass\n" ], "metadata": { "id": "DgCgTdvC-eEe" }, "execution_count": 116, "outputs": [] }, { "cell_type": "code", "source": [ "# 转换为txt\n", "re_Title = Title.apply(format_output, axis=1)\n", "# output_str = re_Title.str.cat(sep=\"\\n \\n\") # 用空格分隔每个元素" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "ZHdGWB-qF9v0", "outputId": "80271ca6-995a-416f-a9dc-f5b36e0a2938" }, "execution_count": 115, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Warning! Key category already exists in event StHi_FollowHi\n", "try overwrite Random Noon Event_ Followers 100k+_and Stress Cap Raised to 120 to Random Noon Event_ Followers 250k+_and Stress Cap Raised to 120\n", "Warning! Key category already exists in event StHi_FollowHi\n", "try overwrite Random Noon Event_ Followers 100k+_and Stress Cap Raised to 120 to Random Noon Event_ Followers 250k+_and Stress Cap Raised to 120\n", "Warning! Key category already exists in event StHi_FollowHi\n", "try overwrite Random Noon Event_ Followers 100k+_and Stress Cap Raised to 120 to Random Noon Event_ Followers 250k+_and Stress Cap Raised to 120\n", "Warning! Key category already exists in event StHi_FollowHi\n", "try overwrite Random Noon Event_ Followers 100k+_and Stress Cap Raised to 120 to Random Noon Event_ Followers 250k+_and Stress Cap Raised to 120\n", "Warning! Key category already exists in event StHi_FollowHi\n", "try overwrite Random Noon Event_ Followers 100k+_and Stress Cap Raised to 120 to Random Noon Event_ Followers 500k+_and Stress Cap Raised to 120\n", "Warning! Key category already exists in event StHi_FollowHi\n", "try overwrite Random Noon Event_ Followers 100k+_and Stress Cap Raised to 120 to Random Noon Event_ Followers 500k+_and Stress Cap Raised to 120\n", "Warning! Key category already exists in event StHi_FollowHi\n", "try overwrite Random Noon Event_ Followers 100k+_and Stress Cap Raised to 120 to Random Noon Event_ Followers 500k+_and Stress Cap Raised to 120\n", "Warning! Key category already exists in event StHi_FollowHi\n", "try overwrite Random Noon Event_ Followers 100k+_and Stress Cap Raised to 120 to Random Noon Event_ Followers 1m+_and Stress Cap Raised to 120\n", "Warning! Key category already exists in event StHi_FollowHi\n", "try overwrite Random Noon Event_ Followers 100k+_and Stress Cap Raised to 120 to Random Noon Event_ Followers 1m+_and Stress Cap Raised to 120\n", "Warning! Key category already exists in event StHi_FollowHi\n", "try overwrite Random Noon Event_ Followers 100k+_and Stress Cap Raised to 120 to Random Noon Event_ Followers 1m+_and Stress Cap Raised to 120\n", "Warning! Key category already exists in event StHi_FollowHi\n", "try overwrite Random Noon Event_ Followers 100k+_and Stress Cap Raised to 120 to Random Noon Event_ Followers 1m+_and Stress Cap Raised to 120\n", "Warning! Key category already exists in event YamiHi_FollowHi\n", "try overwrite Random Noon Event_ Followers 250k+_and Darkness 61+ to Random Noon Event_ Followers 500k+_and Darkness 61+\n", "Warning! Key category already exists in event YamiHi_FollowHi\n", "try overwrite Random Noon Event_ Followers 250k+_and Darkness 61+ to Random Noon Event_ Followers 500k+_and Darkness 61+\n", "Warning! Key category already exists in event YamiHi_FollowHi\n", "try overwrite Random Noon Event_ Followers 250k+_and Darkness 61+ to Random Noon Event_ Followers 1m+_and Darkness 61+\n", "Warning! Key category already exists in event YamiHi_FollowHi\n", "try overwrite Random Noon Event_ Followers 250k+_and Darkness 61+ to Random Noon Event_ Followers 1m+_and Darkness 61+\n", "Warning! Key category already exists in event YamiHi_FollowHi\n", "try overwrite Random Noon Event_ Followers 250k+_and Darkness 61+ to Random Noon Event_ Followers 1m+_and Darkness 61+\n", "Warning! Key category already exists in event KenjoHi\n", "try overwrite Random Noon Event_ Darkness 0-19 to Random Noon Event_ Darkness 0-14\n", "Warning! Key category already exists in event KenjoHi\n", "try overwrite Random Noon Event_ Darkness 0-19 to Random Noon Event_ Darkness 0-9\n", "Warning! Key category already exists in event KenjoHi\n", "try overwrite Random Noon Event_ Darkness 0-19 to Random Noon Event_ Darkness 0-9\n", "Warning! Key category already exists in event KenjoHi\n", "try overwrite Random Noon Event_ Darkness 0-19 to Random Noon Event_ Darkness 0-4\n", "Warning! Key category already exists in event KenjoHi\n", "try overwrite Random Noon Event_ Darkness 0-19 to Random Noon Event_ Darkness 0-4\n", "Warning! Key category already exists in event YamiHi_SukiHi\n", "try overwrite Random Noon Event_ Affection _and Darkness at 41+ to Random Noon Event_ Affection _and Darkness at 61+\n", "Warning! Key category already exists in event YamiHi_SukiHi\n", "try overwrite Random Noon Event_ Affection _and Darkness at 41+ to Random Noon Event_ Affection _and Darkness at 61+\n", "Warning! Key category already exists in event YamiHi_SukiHi\n", "try overwrite Random Noon Event_ Affection _and Darkness at 41+ to Random Noon Event_ Affection _and Darkness at 81+\n", "Warning! Key category already exists in event Day0_JINE\n", "try overwrite Day 1_ Logged In (Before Stream) to Day 1_ Logged In (After Stream)\n", "Warning! Key category already exists in event Day0_JINE\n", "try overwrite Day 1_ Logged In (Before Stream) to Day 1_ Logged In (After Stream)\n", "Warning! Key category already exists in event Day0_JINE\n", "try overwrite Day 1_ Logged In (Before Stream) to Day 1_ Logged In (After Stream)\n", "Warning! Key category already exists in event Day0_JINE\n", "try overwrite Day 1_ Logged In (Before Stream) to Day 1_ Logged In (After Stream)\n", "Warning! Key category already exists in event Day0_JINE\n", "try overwrite Day 1_ Logged In (Before Stream) to Day 1_ Logged In (After Stream)\n", "Warning! Key category already exists in event Day0_JINE\n", "try overwrite Day 1_ Logged In (Before Stream) to Day 1_ Logged In (After Stream)\n", "Warning! Key category already exists in event Day0_JINE\n", "try overwrite Day 1_ Logged In (Before Stream) to Day 1_ Logged In (After Stream)\n", "Warning! Key category already exists in event Day0_JINE\n", "try overwrite Day 1_ Logged In (Before Stream) to Day 1_ Logged In (After Stream)\n", "Warning! Key category already exists in event Day0_JINE\n", "try overwrite Day 1_ Logged In (Before Stream) to Day 1_ Logged In (After Stream)\n", "Warning! Key category already exists in event Day0_JINE\n", "try overwrite Day 1_ Logged In (Before Stream) to Day 1_ Logged In (After Stream)\n", "Warning! Key category already exists in event Day0_JINE\n", "try overwrite Day 1_ Logged In (Before Stream) to Day 1_ Logged In (After Stream)\n", "Warning! Key category already exists in event Day0_JINE\n", "try overwrite Day 1_ Logged In (Before Stream) to Day 1_ Logged In (After Stream)\n", "Warning! Key category already exists in event Ending_Normal\n", "try overwrite Ending_ Utopian Parody to Ending_ Utopian Parody _(With Trauma Event)\n", "Warning! Key category already exists in event Ending_Normal\n", "try overwrite Ending_ Utopian Parody to Ending_ Utopian Parody _(With Trauma Event)\n", "Warning! Key category already exists in event Ending_Normal\n", "try overwrite Ending_ Utopian Parody to Ending_ Utopian Parody _(With Trauma Event; answered correctly)\n", "Warning! Key category already exists in event Ending_Normal\n", "try overwrite Ending_ Utopian Parody to Ending_ Utopian Parody _(With Trauma Event; answered correctly)\n", "Warning! Key category already exists in event Ending_Normal\n", "try overwrite Ending_ Utopian Parody to Ending_ Utopian Parody _(With Trauma Event; answered correctly)\n", "Warning! Key category already exists in event Ending_Normal\n", "try overwrite Ending_ Utopian Parody to Ending_ Utopian Parody _(With Trauma Event; answered correctly)\n" ] } ] }, { "cell_type": "code", "source": [ "print(re_Title.head)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "VSuD3iv_GGjt", "outputId": "2f9a34cf-6302-4caf-b29b-bbfaeaa5bb09" }, "execution_count": 30, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Empty DataFrame\n", "Columns: [{\"payload\":{\"allShortcutsEnabled\":false, fileTree:{\"data\":{\"items\":[{\"name\":\"Jines.csv\", path:\"data/Jines.csv\", contentType:\"file\"}, {\"name\":\"Jines.xlsx\", path:\"data/Jines.xlsx\", contentType:\"file\"}.1, {\"name\":\"emoji_story_23.jsonl\", path:\"data/emoji_story_23.jsonl\", contentType:\"file\"}.2, {\"name\":\"original_story_23.jsonl\", path:\"data/original_story_23.jsonl\", contentType:\"file\"}], totalCount:4}, :{\"items\":[{\"name\":\"data\", path:\"data\", contentType:\"directory\"}, {\"name\":\"notebook\", path:\"notebook\", contentType:\"directory\"}.1, {\"name\":\"src\", path:\"src\", contentType:\"directory\"}.2, {\"name\":\".DS_Store\", path:\".DS_Store\", contentType:\"file\"}.3, {\"name\":\".gitignore\", path:\".gitignore\", contentType:\"file\"}.4, {\"name\":\"LICENSE\", path:\"LICENSE\", contentType:\"file\"}.5, {\"name\":\"README.md\", path:\"README.md\", contentType:\"file\"}].1, totalCount:7}}, fileTreeProcessingTime:5.894192, foldersToFetch:[], reducedMotionEnabled:null, repo:{\"id\":713164097, defaultBranch:\"main\", name:\"Needy-Haruhi\", ownerLogin:\"LC1332\", currentUserCanPush:false, isFork:false, isEmpty:false, createdAt:\"2023-11-02T01:04:46.000Z\", ownerAvatar:\"https://avatars.githubusercontent.com/u/5266090?v=4\", public:true, private:false, isOrgOwned:false}, symbolsExpanded:false, treeExpanded:true, refInfo:{\"name\":\"main\", listCacheKey:\"v0:1699146562.0\", canEdit:false, refType:\"branch\", currentOid:\"7804db2cd6540b664df0e89e5d6c2ee0e62248ac\"}, path:\"data/Jines.csv\".1, currentUser:null, blob:{\"rawLines\":null, stylingDirectives:null, csv:[[\"Category\", Id, ParentId (more info), Speaker/Action (in blue), Stress, Affection, Darkness, BodyCn, null, null.1, null.2, null.3, null.4, null.5, null.6, null.7, null.8, null.9, null.10, null.11, null.12, null.13, null.14, null.15, null.16, null.17, null.18, null.19, null.20, null], [\"Random Dusk/Night Texts\", LineWeekDay003, weekday, ame, null.21, null.22, null.23, 早上起不来!好想死~~~, ...]\n", "Index: []\n", "\n", "[0 rows x 46399 columns]\n" ] } ] }, { "cell_type": "code", "source": [ "# txt文件转换为jsonl\n", "def parse_to_jsonl(file_path):\n", " with open(file_path, 'r', encoding='utf-8') as f:\n", " lines = iter(f.readlines())\n", " dialogs = []\n", " dialog = {}\n", " option = {}\n", " for line in lines:\n", " line = line.strip()\n", " if line.startswith(\"## 对话\") or line.startswith(\"## 对话组\"):\n", " if dialog and option:\n", " dialog[\"options\"].append(option)\n", " option = {}\n", " if dialog:\n", " dialogs.append(dialog)\n", " dialog = {\"prefix\": \"\", \"options\": []}\n", " elif line.startswith(\"### Prefix\") or line.startswith('**Prefix'):\n", " prefix = next(lines).strip()\n", " ids, categories = search_in_excel(prefix)\n", " # print(ids, categories)\n", " if ids and categories:\n", " dialog[\"id\"] = ids[0]\n", " dialog[\"category\"] = categories[0]\n", " dialog[\"prefix\"] = prefix\n", " elif line.startswith(\"### Option\") or line.startswith('**Option'):\n", " if option:\n", " dialog[\"options\"].append(option)\n", " option = {\"user\": \"\", \"reply\": \"\", \"attribute_change\": \"\"}\n", "\n", " elif line.startswith(\"User\") or line.startswith(\"User:\"):\n", " option[\"user\"] = line[5:].strip()\n", " elif line.startswith(\"Reply\") or line.startswith('**Reply:**'):\n", " option[\"reply\"] = next(lines).strip()\n", " elif line.startswith(\"Attribute Change\") or line.startswith('**Attribute Change:**'):\n", " option[\"attribute_change\"] = line[17:].strip()\n", "\n", " if option:\n", " dialog[\"options\"].append(option)\n", " if dialog:\n", " dialogs.append(dialog)\n", "\n", " with open('emoji_story_23.jsonl', 'a+', encoding=\"utf-8\") as outfile:\n", " for entry in dialogs:\n", " json.dump(entry, outfile, ensure_ascii=False)\n", " outfile.write('\\n')" ], "metadata": { "id": "sghUu4Or-uC4" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "# 转换为jsonl\n", "for filename in os.listdir('events'):\n", " if filename.endswith(\".txt\"):\n", " try:\n", " parse_to_jsonl(f'events/{filename}')\n", " except:\n", " shutil.move(f'move/{filename}', f'error/{filename}')\n", " print(filename)" ], "metadata": { "id": "nBLuoQov_C_5" }, "execution_count": null, "outputs": [] } ] }