{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "from selenium import webdriver\n", "from selenium.webdriver.common.by import By\n", "import time\n", "import spacy" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def tokenize(text):\n", " nlp = spacy.load('en_core_web_sm')\n", " tokens = [token.text for token in nlp(text) if not token.is_punct and not token.is_space]\n", " \n", " return tokens" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def add_target_columns(data, category, target):\n", " # Column to store category of questions\n", " data[\"Category\"] = category\n", " # Target column to store whether human answer or chatgpt answer\n", " data[\"Human vs ChatGPT\"] = target\n", " \n", " return data" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def store_excel(data, prev_data = None):\n", " if prev_data:\n", " # Loading old data into dataframe\n", " old_data = pd.read_excel(prev_data)\n", " # Concatenating the two dataframes vertically\n", " complete_data = pd.concat([old_data, data], ignore_index=True)\n", " # Storing the combined data to the excel file\n", " complete_data.to_excel('scraped_data.xlsx', index=False)\n", " else:\n", " data.to_excel(\"scraped_data.xlsx\", index=False)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def getData(page_urls, driver, min_ans_len = 15, limit = 10, scroll_num = 10):\n", " \n", " # Empty dataframe to store the scraped content\n", " scraped_data = pd.DataFrame()\n", " \n", " # Initializing variable to track the number of data samples collected \n", " len_data = 0\n", " \n", " # Initializing lists to store the scraped content\n", " questions = []\n", " answers = []\n", " \n", " count = 1\n", " count1 = 0\n", " for page_url in page_urls:\n", " print(f\"Page {count} of {len(page_urls)}\")\n", " # Sending a get request to the web page (Navigating to the webpage)\n", " driver.get(page_url)\n", " # Wait\n", " driver.implicitly_wait(10)\n", "\n", " # Initializing variables to iterate through the try except block\n", " max_tries = 10\n", " retry = 0 \n", "\n", " # Initializing variable to check if we've reached the end of the page \n", " old_content = None\n", " new_content = None\n", "\n", " # Scrolling to get enough answers\n", " for i in range(scroll_num):\n", " # Scrolling to access the next page of questions\n", " driver.execute_script(\"window.scrollTo(0, document.body.scrollHeight);\")\n", " # Wait for the content to be loaded \n", " time.sleep(5) \n", " # Checking if page is same before and after scrolling\n", " new_content = driver.page_source\n", " if new_content == old_content:\n", " break\n", " old_content = new_content\n", "\n", " # Used while loop to avoid \"StaleElementReferenceException\" error\n", " while retry