{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "provenance": [] }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" } }, "cells": [ { "cell_type": "markdown", "source": [ "# Data" ], "metadata": { "id": "fifTbKR0OCp2" } }, { "cell_type": "code", "source": [ "import io\n", "import os\n", "import re\n", "import requests\n", "import string\n", "from tqdm import tqdm\n", "from bs4 import BeautifulSoup\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "import pandas as pd\n", "import numpy as np" ], "metadata": { "id": "XyUdDEkmMvR0" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "main_characters = ['CHANDLER', 'JOEY', 'MONICA', 'PHOEBE', 'ROSS', 'RACHEL']\n", "url_base = 'https://www.drodd.com/friends'\n", "SEASON_COUNT = 10\n", "NEWLINE_REPLACEMENT = ' 5923 '" ], "metadata": { "id": "dSD_x10tNGvO" }, "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "source": [ "## Parse main url for getting links for all episodes" ], "metadata": { "id": "bLjNXOfHN_tT" } }, { "cell_type": "code", "source": [ "r = requests.get(url_base)\n", "soup = BeautifulSoup(r.content)\n", "links_by_episode = [(i['href'][14:-4], url_base + \"/\" + i['href']) for i in soup.find_all('a') if str(i).find('name') == -1][1:-3]\n", "links = []\n", "for i, j in links_by_episode:\n", " if i == '212_213':\n", " links.append((2, 12, j))\n", " elif i == '615616':\n", " links.append((6, 15, j))\n", " elif i == '723.':\n", " links.append((7, 23, j))\n", " else:\n", " links.append((int(i) // 100, int(i) % 100, j))\n", "\n", "# Links to every episode\n", "links_by_episode = links # link to every episode\n", "\n", "all_scripts = {i + 1: {} for i in range(SEASON_COUNT)}\n", "scripts_with_context = {i + 1: {} for i in range(SEASON_COUNT)}" ], "metadata": { "id": "i9laydWzNPO1" }, "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "source": [ "Transcript links with broken structure" ], "metadata": { "id": "NT4In9ZHPpf6" } }, { "cell_type": "code", "source": [ "exception_episodes = [(1, 5),(2, 18),(6, 11),(7, 11),(7, 23), (9, 1), (9, 2), (9, 3), (9, 4), (9, 8)] + [(8, i) for i in range(7, 24)]" ], "metadata": { "id": "jUsU2vUFNuTA" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "# Iterate through each link by episode\n", "for i in links_by_episode:\n", " # Initialize an empty dictionary for each episode within its respective season\n", " scripts_with_context[i[0]][i[1]] = {}\n", "# Iterate through seasons, excluding the last one\n", "for season in tqdm(range(1, SEASON_COUNT)):\n", " # Iterate through episode numbers within the current season\n", " for num in scripts_with_context[season].keys():\n", " # Skip episodes listed in the exception_episodes list\n", " if (season, num) in exception_episodes:\n", " continue\n", "\n", " # Iterate through each link by episode\n", " for i, j, k in links_by_episode:\n", " # Assign variables for link season, episode, and the link itself\n", " link_season = i\n", " link_episode = j\n", " lnk = k\n", "\n", " # Check if the current link corresponds to the current season and episode\n", " if season == link_season and num == link_episode:\n", " # Initialize an empty string to store the script\n", " current_script_with_context = ''\n", "\n", " # Make a GET request to fetch the script content from the link\n", " r = requests.get(lnk)\n", "\n", " # Parse the HTML content of the page using BeautifulSoup\n", " soup = BeautifulSoup(r.content)\n", "\n", " # Extract all
elements which usually contain the script content\n", " script_raw = soup.find_all('p')\n", "\n", " # Initialize the maximum script length\n", " max_script = script_raw[0]\n", "\n", " # Find the longest script element\n", " for i in script_raw:\n", " if len(i) > len(max_script):\n", " max_script = i\n", "\n", " # Clean and preprocess the script text\n", " # Convert the BeautifulSoup object max_script to a string and replace '
' tags with an empty string\n", " script = str(max_script).replace(\"
\", \"\")\n", " # Replace '
' tags with an empty string in the script string\n", " script = script.replace(\"\", \"\")\n", " # Replace ' elements which usually contain the script content\n",
" script_raw = soup.find_all('p')\n",
"\n",
" # Remove unwanted header and footer lines from the script content\n",
" script_raw = script_raw[3:-12]\n",
"\n",
" # Initialize variables to store the processed script\n",
" script_res = ''\n",
" current_script_with_context = ''\n",
"\n",
" # Iterate through each line of the script content\n",
" for line in script_raw:\n",
" # Skip lines with multiple tags, which usually indicate headers\n",
" if len(line.find_all('strong')) > 1:\n",
" continue\n",
"\n",
" # Convert the line to a string and perform various replacements to clean it\n",
" script = str(line)\n",
" script = script.replace(\" \", \"\").replace(\" \", \"\").replace(\" \", \"\").replace(\"
\", \"\\n\").replace(\"
\", \" \")\n",
" script = script.replace(\"\", \"\").replace(\"\", \"\")\n",
" script = script.replace(\"\", \"\").replace(\"\", \"\").replace(\"{\", \"(\")\n",
" script = re.sub('\\r\\n', '', script)\n",
" script = re.sub(r'\\[(.*?)\\]', '', script)\n",
" script = re.sub(r'\\(.*?\\)', '', script)\n",
"\n",
" # Find the index of ':' in the line to identify the character speaking\n",
" tag = script.find(\":\")\n",
" if tag > -1:\n",
" # Extract the character name and the spoken dialogue\n",
" cur_char = script[:tag].upper()\n",
" script = script[tag+2:]\n",
" script = re.sub(' +', ' ', script)\n",
" script = re.sub('\\n', '', script)\n",
" script = script.strip()\n",
"\n",
" # Check if the character is one of the main_characters\n",
" if script != '':\n",
" if script.find(')') > -1:\n",
" continue\n",
" if cur_char in main_characters:\n",
" # Append the character's Russian name and the dialogue to the current_script_with_context\n",
" current_script_with_context += cur_char + \": \" + script + NEWLINE_REPLACEMENT\n",
" else:\n",
" # Append \"НЕДРУГ\" (meaning \"OTHER\" in Russian) and the dialogue to the current_script_with_context\n",
" current_script_with_context += \"OTHER: \" + script + NEWLINE_REPLACEMENT\n",
"\n",
" # Assign the processed script to the corresponding episode in scripts_with_context\n",
" scripts_with_context[season][episode] = current_script_with_context\n"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "_witCiOXXyi1",
"outputId": "3cea1b8d-40e0-4cf7-a1ff-8224a60348df"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": [
"100%|██████████| 43/43 [00:30<00:00, 1.42it/s]\n"
]
}
]
},
{
"cell_type": "markdown",
"source": [
"# 2-18"
],
"metadata": {
"id": "zhvnaWPIa6q2"
}
},
{
"cell_type": "code",
"source": [
"r = requests.get('https://www.drodd.com/friends/friend-episode218.htm')\n",
"soup = BeautifulSoup(r.content)\n",
"script_raw = soup.find_all('pre')[0]\n",
"script = str(script_raw).replace(\"
\", \"\\n\").replace(\"
\", \" \").replace(\"\", \"\").replace(\"
\", \"\")\n",
"script = script.replace(\"\", \"\").replace(\"\", \"\")\n",
"script = script.replace(\"\", \"\").replace(\"\", \"\").replace(\"{\", \"(\")\n",
"script = re.sub('\\r\\n', '\\n', script)\n",
"script = re.sub('\\n +', '\\n', script)\n",
"script = re.sub(r'\\[(.*?)\\]', '', script)\n",
"script = re.sub(r'\\(.*?\\)', '', script)\n",
"script = re.sub('\\n\\n', '\\n', script)\n",
"script = re.sub('DELIVERY GUY', '\\nDELIVERY GUY', script)\n",
"\n",
"current_script_with_context = ''\n",
"\n",
"for line in script.split('\\n'):\n",
" if line in ('', '\\n') or line.find(']') > -1:\n",
" continue\n",
" if line.replace(\" \", \"\").isupper():\n",
" continue\n",
" tag = line.find('[')\n",
" if tag > - 1:\n",
" current_script_with_context += line[:tag] + '\\n'\n",
" elif not(line[0].isupper() and line[1].isupper()):\n",
" current_script_with_context = current_script_with_context[:-1] + \" \" + line\n",
" else:\n",
" current_script_with_context += line + '\\n'\n",
"current_script_with_context = re.sub('\\n\\n', '\\n', current_script_with_context)\n",
"current_script_with_context = re.sub('\\n \\n', '\\n', current_script_with_context)\n",
"current_script_with_context = current_script_with_context.strip()\n",
"\n",
"\n",
"script = current_script_with_context.split('\\n')\n",
"current_script_with_context = ''\n",
"for line in script:\n",
" tag = line.find(':')\n",
" cur_char = line[:tag]\n",
" if cur_char in main_characters:\n",
" current_script_with_context += cur_char + \": \" + line[tag+2:] + NEWLINE_REPLACEMENT\n",
" else:\n",
" current_script_with_context += \"OTHER: \" + line[tag+2:] + NEWLINE_REPLACEMENT\n",
"scripts_with_context[2][18] = current_script_with_context"
],
"metadata": {
"id": "dH56VvvQavPn"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"# 10-15"
],
"metadata": {
"id": "kFyUI5kMbPaf"
}
},
{
"cell_type": "code",
"source": [
"exception = {'BOTH',\n",
" 'JANICE',\n",
" 'JENNIFER',\n",
" 'LADY',\n",
" 'MR ZELNER',\n",
" 'PHOEBE-ESTELLE',\n",
" 'REALTOR'}\n",
"\n",
"lnk = 'https://www.drodd.com/friends/friend-episode1015.htm'\n",
"r = requests.get(lnk)\n",
"soup = BeautifulSoup(r.content)\n",
"script_raw = soup.find_all('p')\n",
"script_raw = script_raw[12:]\n",
"script = str(script_raw).replace(\"
\", \"\").replace(\"
\", \"\").replace(\"\", \"\").replace(\"\", \"\")\n", "script = script.replace(\"\", \"\").replace(\"\", \"\")\n", "script = script.replace(\"\", \"\").replace(\"\", \"\").replace(\"{\", \"(\")\n", "script = re.sub('\\r\\n', '\\n', script)\n", "script = re.sub('\\n +', '\\n', script)\n", "script = re.sub(r'\\[(.*?)\\]', '', script)\n", "script = re.sub(r'\\(.*?\\)', '', script)\n", "script = re.sub('\\n\\n', '\\n', script)\n", "script = re.sub('DELIVERY GUY', '\\nDELIVERY GUY', script)\n", "\n", "current_script_with_context = ''\n", "\n", "for line in script.split('\\n'):\n", " if line in ('', '\\n') or line.find(']') > -1:\n", " continue\n", " tag = line.find('[')\n", " if len(line) == 1:\n", " print(line)\n", " if tag > - 1:\n", " current_script_with_context += line[:tag] + '\\n'\n", " elif (line[0].isalpha() and line[0].islower()) and (line[1].islower() or line[1].isdigit() or line[1] == ' '):\n", " current_script_with_context = current_script_with_context[:-1] + line\n", " else:\n", " current_script_with_context += line + '\\n'\n", "current_script_with_context = re.sub('\\n\\n', '\\n', current_script_with_context)\n", "current_script_with_context = re.sub('\\n \\n', '\\n', current_script_with_context)\n", "current_script_with_context = current_script_with_context.strip()" ], "metadata": { "id": "nrTlpRJKb7Tq" }, "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "source": [ "# Check results" ], "metadata": { "id": "qy3ZyNv5cT4U" } }, { "cell_type": "code", "source": [ "!mkdir \"english\"" ], "metadata": { "id": "lKVQ7DcPiml2" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "try:\n", " os.mkdir('english', exist=True)\n", "except:\n", " pass\n", "\n", "for season, script in scripts_with_context.items():\n", " for episode, lines in script.items():\n", " try:\n", " os.mkdir(f'english/{season}')\n", " except:\n", " pass\n", " with open(f'english/{season}/{episode}.txt', \"w\") as text_file:\n", " text_file.write(lines.replace(NEWLINE_REPLACEMENT, '\\n'))" ], "metadata": { "id": "HeRrVqYHfzdg" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "!tar -czvf scripts.tar.gz english/" ], "metadata": { "id": "1ml0Q_SqiqVx" }, "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "source": [ "Check the length of the seasons and episodes" ], "metadata": { "id": "NF9fmXwIPi_n" } }, { "cell_type": "code", "source": [ "episode_length = [] # to histplot\n", "season_length = {} # to barplot\n", "chars_sentence = {name : 0 for name in main_characters}\n", "chars = []\n", "texts = []\n", "\n", "for season in scripts_with_context:\n", " season_length[season] = 0\n", " for episode in tqdm(scripts_with_context[season], desc=f\"Season {season}: \"):\n", " e_length = 0\n", " with open(f'english/{season}/{episode}.txt', \"r\") as text_file:\n", " for line in text_file:\n", " char, text = line.split(\":\", 1)\n", " chars.append(char)\n", " texts.append(text.strip())\n", "\n", " # chars_sentence[char] += len(texts.split(\".\"))\n", "\n", " season_length[season] += 1\n", " e_length += 1\n", "\n", " episode_length.append(e_length)\n", "\n", "df = pd.DataFrame({'Characters': chars, 'Texts': texts})\n", "df.to_csv(\"transcription.csv\", index=False)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "ILGND9ovPTK2", "outputId": "35715939-a8ac-4599-a6e7-44d0d7693106" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "Season 1: 100%|██████████| 24/24 [00:00<00:00, 917.00it/s]\n", "Season 2: 100%|██████████| 23/23 [00:00<00:00, 934.87it/s]\n", "Season 3: 100%|██████████| 25/25 [00:00<00:00, 824.33it/s]\n", "Season 4: 100%|██████████| 23/23 [00:00<00:00, 841.29it/s]\n", "Season 5: 100%|██████████| 23/23 [00:00<00:00, 476.16it/s]\n", "Season 6: 100%|██████████| 23/23 [00:00<00:00, 606.84it/s]\n", "Season 7: 100%|██████████| 23/23 [00:00<00:00, 466.46it/s]\n", "Season 8: 100%|██████████| 23/23 [00:00<00:00, 389.72it/s]\n", "Season 9: 100%|██████████| 23/23 [00:00<00:00, 613.41it/s]\n", "Season 10: 100%|██████████| 17/17 [00:00<00:00, 782.33it/s]\n" ] } ] }, { "cell_type": "markdown", "source": [ "Length of the transcripts during season" ], "metadata": { "id": "ZmUqHuXETWmn" } }, { "cell_type": "code", "source": [ "sns.histplot(episode_length)\n", "plt.xlabel(\"Length of episodes\");\n", "plt.ylabel(\"Frequency\");\n", "plt.title(\"Number of replicas in episode\");\n", "plt.yticks(np.arange(0,60,5));\n", "plt.grid(True)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 472 }, "id": "IXnffER4Tll5", "outputId": "ce2d2f8a-8dc2-4bdf-be2b-e5254845dae8" }, "execution_count": null, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [ "