{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import time\n",
    "\n",
    "import pandas as pd\n",
    "import spacy\n",
    "from selenium import webdriver\n",
    "from selenium.common.exceptions import (\n",
    "    NoSuchElementException,\n",
    "    StaleElementReferenceException,\n",
    "    WebDriverException,\n",
    ")\n",
    "from selenium.webdriver.common.by import By"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Loaded spaCy pipelines keyed by model name, so each model is read from disk only once\n",
    "_NLP_CACHE = {}\n",
    "\n",
    "\n",
    "def _load_nlp(model_name = 'en_core_web_sm'):\n",
    "    \"\"\"Return the spaCy pipeline for `model_name`, loading it on first use only.\"\"\"\n",
    "    if model_name not in _NLP_CACHE:\n",
    "        _NLP_CACHE[model_name] = spacy.load(model_name)\n",
    "    return _NLP_CACHE[model_name]\n",
    "\n",
    "\n",
    "def tokenize(text):\n",
    "    \"\"\"Split `text` into token strings, dropping punctuation and whitespace tokens.\n",
    "\n",
    "    The 'en_core_web_sm' pipeline is loaded once and reused across calls,\n",
    "    instead of being reloaded every time this function runs.\n",
    "\n",
    "    Args:\n",
    "        text: The string to tokenize.\n",
    "\n",
    "    Returns:\n",
    "        A list with the text of every token that is neither punctuation nor whitespace.\n",
    "    \"\"\"\n",
    "    nlp = _load_nlp()\n",
    "    tokens = [token.text for token in nlp(text) if not (token.is_punct or token.is_space)]\n",
    "\n",
    "    return tokens"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def add_target_columns(data, category, target):\n",
    "    \"\"\"Attach the question category and the human/ChatGPT label to `data`.\n",
    "\n",
    "    Both columns are written onto the frame that was passed in, and that same\n",
    "    frame is returned.\n",
    "    \"\"\"\n",
    "    labelled_columns = (\n",
    "        (\"Category\", category),         # category of the questions\n",
    "        (\"Human vs ChatGPT\", target),   # whether the answer is human or ChatGPT\n",
    "    )\n",
    "    for column_name, column_value in labelled_columns:\n",
    "        data[column_name] = column_value\n",
    "\n",
    "    return data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def store_excel(data, prev_data = None, output_path = \"scraped_data.xlsx\"):\n",
    "    \"\"\"Write `data` to an Excel file, optionally appending it to earlier data.\n",
    "\n",
    "    Args:\n",
    "        data: DataFrame holding the newly scraped rows.\n",
    "        prev_data: Optional path of an existing Excel file (anything\n",
    "            `pd.read_excel` accepts). When given, its rows are placed before\n",
    "            the rows of `data` and the index is renumbered.\n",
    "        output_path: Destination file; defaults to \"scraped_data.xlsx\".\n",
    "\n",
    "    Returns:\n",
    "        None. The result is only written to `output_path`.\n",
    "    \"\"\"\n",
    "    if prev_data:\n",
    "        # Loading old data and concatenating the two dataframes vertically\n",
    "        old_data = pd.read_excel(prev_data)\n",
    "        data = pd.concat([old_data, data], ignore_index=True)\n",
    "\n",
    "    # Single write path for both cases\n",
    "    data.to_excel(output_path, index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Pause after each scroll so newly requested content has time to load\n",
    "_SCROLL_PAUSE_SECONDS = 5\n",
    "# How long Selenium keeps polling for an element before a lookup fails\n",
    "_IMPLICIT_WAIT_SECONDS = 10\n",
    "# How many times the answer blocks of one page are re-located after going stale\n",
    "_MAX_TRIES = 10\n",
    "\n",
    "\n",
    "def _scroll_to_load(driver, scroll_num):\n",
    "    \"\"\"Scroll to the page bottom up to `scroll_num` times, stopping once the page source stops changing.\"\"\"\n",
    "    old_content = None\n",
    "    for _ in range(scroll_num):\n",
    "        driver.execute_script(\"window.scrollTo(0, document.body.scrollHeight);\")\n",
    "        time.sleep(_SCROLL_PAUSE_SECONDS)\n",
    "        # Identical source before and after a scroll means the end of the page was reached\n",
    "        new_content = driver.page_source\n",
    "        if new_content == old_content:\n",
    "            break\n",
    "        old_content = new_content\n",
    "\n",
    "\n",
    "def _extract_qa(block):\n",
    "    \"\"\"Return the (question, answer) text of one answer block, or None when the block is unusable.\n",
    "\n",
    "    StaleElementReferenceException is deliberately not handled here, so the\n",
    "    caller can re-locate the blocks and try again.\n",
    "    \"\"\"\n",
    "    try:\n",
    "        ques = block.find_element(By.CSS_SELECTOR, \"div.q-text.puppeteer_test_question_title span\")\n",
    "    except NoSuchElementException:\n",
    "        return None\n",
    "\n",
    "    # Long answers are collapsed behind a \"more\" button; expand them when it is present\n",
    "    try:\n",
    "        read_more = block.find_element(By.CSS_SELECTOR, \"div.q-absolute div.qt_read_more\")\n",
    "    except NoSuchElementException:\n",
    "        read_more = None\n",
    "    if read_more is not None:\n",
    "        try:\n",
    "            read_more.click()\n",
    "        except StaleElementReferenceException:\n",
    "            raise\n",
    "        except WebDriverException:\n",
    "            # Discarding data where the complete answer cannot be obtained\n",
    "            return None\n",
    "\n",
    "    try:\n",
    "        ans = block.find_element(By.CSS_SELECTOR, \"div.q-box.spacing_log_answer_content.puppeteer_test_answer_content span.q-box\")\n",
    "    except NoSuchElementException:\n",
    "        return None\n",
    "\n",
    "    return ques.text, ans.text\n",
    "\n",
    "\n",
    "def getData(page_urls, driver, min_ans_len = 15, limit = 10, scroll_num = 10):\n",
    "    \"\"\"Scrape unique question/answer pairs from the given pages.\n",
    "\n",
    "    Each page is opened, scrolled to load more answers, and its answer blocks\n",
    "    are read until `limit` pairs have been collected. A block that lacks a\n",
    "    question or answer element is skipped on its own instead of aborting the\n",
    "    whole page. Errors that are not Selenium errors (for example a failure\n",
    "    inside `tokenize`) are no longer swallowed and propagate to the caller.\n",
    "\n",
    "    Args:\n",
    "        page_urls: Sequence of page URLs, visited in order.\n",
    "        driver: Selenium WebDriver used for navigation and element lookup.\n",
    "        min_ans_len: Minimum token count (as counted by `tokenize`) for an answer to be kept.\n",
    "        limit: Number of pairs to collect before stopping.\n",
    "        scroll_num: Maximum number of scroll-to-bottom steps per page.\n",
    "\n",
    "    Returns:\n",
    "        A DataFrame with the columns \"Questions\" and \"Answers\".\n",
    "    \"\"\"\n",
    "    # Lists to store the scraped content\n",
    "    questions = []\n",
    "    answers = []\n",
    "    # Same content as `questions`, kept as a set for fast duplicate checks\n",
    "    seen_questions = set()\n",
    "\n",
    "    for page_num, page_url in enumerate(page_urls, start=1):\n",
    "        if len(questions) >= limit:\n",
    "            break\n",
    "        print(f\"Page {page_num} of {len(page_urls)}\")\n",
    "        # Navigating to the webpage\n",
    "        driver.get(page_url)\n",
    "        # This call does not pause; it sets how long element lookups poll before failing\n",
    "        driver.implicitly_wait(_IMPLICIT_WAIT_SECONDS)\n",
    "\n",
    "        # Scrolling to get enough answers\n",
    "        _scroll_to_load(driver, scroll_num)\n",
    "\n",
    "        for _ in range(_MAX_TRIES):\n",
    "            try:\n",
    "                # Re-locate the blocks on every attempt: references from an earlier attempt may be stale\n",
    "                data_elements = driver.find_elements(By.CSS_SELECTOR, \"div.dom_annotate_multifeed_bundle_AnswersBundle\")\n",
    "                for block in data_elements:\n",
    "                    # Collecting data until limit is reached\n",
    "                    if len(questions) >= limit:\n",
    "                        break\n",
    "                    pair = _extract_qa(block)\n",
    "                    if pair is None:\n",
    "                        continue\n",
    "                    question, answer = pair\n",
    "                    if not (question and answer):\n",
    "                        continue\n",
    "                    # Skipping questions that are already present (also covers blocks seen before a retry)\n",
    "                    if question in seen_questions:\n",
    "                        continue\n",
    "                    # Skipping answers shorter than the given threshold\n",
    "                    if len(tokenize(answer)) < min_ans_len:\n",
    "                        continue\n",
    "                    seen_questions.add(question)\n",
    "                    questions.append(question)\n",
    "                    answers.append(answer)\n",
    "                    print(f\"{len(questions)} of {limit}\")\n",
    "                break\n",
    "            except StaleElementReferenceException:\n",
    "                # The page re-rendered under us; look the blocks up again\n",
    "                continue\n",
    "            except WebDriverException as error:\n",
    "                # Best effort: give up on this page but keep what was collected so far\n",
    "                print(f\"Warning: skipping the rest of {page_url}: {error}\")\n",
    "                break\n",
    "\n",
    "    # Warning to give more urls if desired amount of data is not scraped\n",
    "    if len(questions) < limit:\n",
    "        print(\"Warning: Need to provide more webpages to get desired amount of data!\")\n",
    "\n",
    "    # Storing the scraped information in a dataframe\n",
    "    return pd.DataFrame({\"Questions\": questions, \"Answers\": answers})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import requests\n",
    "from bs4 import BeautifulSoup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "url = \"https://boards.greenhouse.io/enveritas/jobs/4001717008\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Send a GET request to the URL. Without a timeout, requests may wait indefinitely\n",
    "# on an unresponsive server.\n",
    "response = requests.get(url, timeout=30)\n",
    "# Stop here on an HTTP error status instead of parsing an error page as if it were the posting\n",
    "response.raise_for_status()\n",
    "\n",
    "# Parse the HTML content with BeautifulSoup\n",
    "soup = BeautifulSoup(response.content, \"html.parser\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Job Title: Data Scientist\n",
      "Company: Enveritas\n",
      "Job Location: Global / Remote\n",
      "Job Description: \n",
      "Data Scientist, Engineering & Data Group\n",
      "Do you want to work for a mission-driven non-profit, analyzing data and writing software that will contribute to helping millions of coffee farmers out of poverty? Enveritas is a 501(c)3 non-profit and Y Combinator-backed startup looking to hire a Data Scientist for our Data Team. \n",
      "We are looking for a Data Scientist with extensive professional experience to join our Engineering and Data Group on a remote, full-time basis. This position is open globally, based on locations supported by our EOR partner, Deel. You can learn more about this role at https://www.enveritas.org/jobs/data-scientist.\n",
      "Our Engineering and Data Group is a quirky, talented, and humble group of about twenty with diverse backgrounds ranging from journalism to academia to international industry.\n",
      "About Our Data Team\n",
      "The Data Team's mission is to leverage data analytics to drive Enveritas' efforts in improving the livelihoods of smallholder farmers around the world. We are responsible for ensuring that we collect high quality data, we leverage it to generate actionable insights that support smallholder farmers worldwide, and we make the data and insights accessible to other teams, such as the Operations and Partnerships Team.\n",
      "To improve the quality of our data collection process, we create tools to support our Country Operations teams. These tools enable the detection of outliers and automating quality control processes while collecting our survey data. \n",
      "We are also responsible for the transformation of our raw data into insights. This includes writing the code that scores our standards, and creating models that give new insights about smallholder farming. For example, we estimate the cost farmers face when growing coffee to know if they earn a livable income.\n",
      "Our programming languages of choice are Python and SQL (we use PostgreSQL), although some of our analysis is also done in R. We use git for version control, Github for hosting our repositories, and pytest for automated testing. Our internal BI tool is Looker.\n",
      "What You’ll Be Doing\n",
      "Providing technical oversight and mentorship. You will provide technical oversight to the models the Data Team produces. Additionally, you will offer guidance on and document statistical best practices and methodologies, helping to elevate the overall analytical capabilities within the Data Team and the organization.\n",
      "Build models to create new insights from our data. You will identify opportunities for creating models using the data we collect. Engage with the Operations and Partnerships teams to generate ideas for models that would be beneficial for them, and the coffee and cocoa community. This includes using Python to craft these models, incorporating them into our data pipeline, and effectively synthesizing, visualizing, and conveying the results.\n",
      "Serve as internal statistics consultant. Undertake the role of an internal statistical advisor, providing expertise across the organization for all statistics-related matters. This includes aiding teams in developing innovative sampling methods, calculating appropriate sample sizes, interpreting data analysis results, and ensuring the statistical integrity of projects and data collection. \n",
      "Who You Are\n",
      "Our team is fully distributed, so you should be comfortable with remote work. This role is a full-time individual contributor role. While you can be located anywhere, our core hours are 10am to 2pm Eastern Time Monday through Friday, with team members choosing either an early start or later stop as suits them.\n",
      "Our work is often ambiguous, so you should have a love for environments with uncertainty. You should have a deep empathy for users of our tools and understand the importance of supporting the work of other teams. You should also be willing to engage in a broad range of tasks beyond standard data science functions, and you are not afraid of using a simple model that gets the job done.\n",
      " \n",
      "Qualifications\n",
      "Read this first: Research shows that people of different backgrounds read job postings differently. If you don’t think you meet all of the qualifications but do think you’d be a great match for us, please consider applying and sharing more in your cover letter. We’d love to talk with you to see what skills you can bring to our team. This said, we are most likely to be interested in your candidacy if you can demonstrate the majority of the qualifications listed below:\n",
      "\n",
      "Extensive professional experience as a data scientist, a statistician, or equivalent.\n",
      "Hands-on experience building and deploying statistical models. Strong theoretical knowledge of probability and statistics.\n",
      "Intermediate Python. Experience with good software engineering practices such as reproducible environments, testing, and code quality.\n",
      "A desire to apply your skills at a non-profit working to improve the livelihoods of smallholder farmers.\n",
      "Have experience or expertise in one or more of the following:\n",
      "\n",
      "Building sustainability or agricultural models.\n",
      "Advanced statistical modeling.\n",
      "Survey sampling.\n",
      "Survey methodology.\n",
      "Bayesian statistical modeling.\n",
      "Data Management.\n",
      "Causal inference.\n",
      "Managing teams.\n",
      "\n",
      "\n",
      "\n",
      " \n",
      "About Working With Us & Compensation\n",
      "Enveritas has teams around the world: we are about 90 people spread over almost two dozen countries, and of all backgrounds, faiths, and identities. To learn more about working at Enveritas, see https://www.enveritas.org/jobs/\n",
      "For a US-Based hire, base salary for this position will be between $110,000-$130,000 annually (paid semi-monthly). This is a full-time exempt position. Full benefits include 401k with matching contributions, Medical/Dental/Vision, and Flexible Spending Account (FSA), 4 weeks vacation in addition to 12 standard holidays, and personal/sick time.\n",
      "For a hire outside the US, our offer will be competitive; the specific benefits and compensation details will vary as required to account for your region’s laws and requirements. Salary for this position will be paid in relevant local currency.\n",
      "For all staff, we are able to offer:\n",
      "\n",
      "Annual education budget for conferences, books, and other professional development opportunities.\n",
      "Annual all-company retreat and annual group retreat.\n",
      "Field visits to our Country Ops teams in coffee-growing countries such as Colombia, Costa Rica, Ethiopia, and Indonesia.\n",
      "\n",
      "Interview Process\n",
      "We are committed to fair and equitable hiring. To honor this commitment, we are being transparent about our interview process. We are interested in learning what working with you would be like and believe the below is the fairest method for us to see you at your best — and for you to learn about us! If you feel that a different method would be better for us to learn what working together would be like, please tell us in your application. \n",
      "After your introductory interview, we expect your interview process to take four to six weeks (but will depend on scheduling), and consist of four conversations that total about five hours of time. You should plan to also spend about four hours in total preparing for interviews.  See the hiring page at https://www.enveritas.org/jobs/data-scientist/ for details about each of these interviews.\n",
      "\n",
      "Introductory Interview (30 minutes; Google Meet; audio-only)\n",
      "First Technical Interview (60-90 minutes; Google Meet video)\n",
      "Second Technical Interview (60-90 minutes; Google Meet video)\n",
      "Manager Interview (45 minutes; Google Meet video)\n",
      "\n",
      "How to Apply\n",
      "Please apply here. Feel free to contact us at jobs@enveritas.org should you have any questions about the position, the interview process, or if you require any adjustments to ensure a fair and equitable application process. Questions about this opportunity or process will not reflect negatively on your application.\n",
      "A few notes about our communications: We are not able to reply to messages sent to staff outside of either our application process or our jobs email address, as this is unfair to other candidates. Also, Enveritas has been made aware of fake job postings by individuals pretending to hire persons seeking employment. These individuals are looking to collect personal information about you for fraudulent purposes. All legitimate Enveritas job openings are posted on our Greenhouse board at https://boards.greenhouse.io/enveritas or Lever board at https://jobs.lever.co/Enveritas. All recruiting emails from Enveritas team members will come through Lever or from @enveritas.org. If you have any concerns about employment opportunities or contact from someone supposedly representing Enveritas, please reach out to us at jobs@enveritas.org\n"
     ]
    }
   ],
   "source": [
    "# Extract the job description content\n",
    "job_title = soup.find(\"h1\", {\"class\": \"app-title\"})\n",
    "\n",
    "company_name = soup.find(\"span\",{\"class\": \"company-name\"})\n",
    "\n",
    "job_location = soup.find(\"div\", {\"class\": \"location\"})\n",
    "\n",
    "job_description = soup.find(\"div\", {\"id\": \"content\"})\n",
    "\n",
    "# Print the job description\n",
    "if job_description:\n",
    "    print(\"Job Title: \"+job_title.get_text()+\"\\n\"+\"Company: \"+company_name.get_text().strip().split(\"at \")[1]+\"\\n\"+\"Job Location: \"+job_location.get_text().strip()+\"\\n\"+\"Job Description: \\n\"+job_description.get_text().strip())\n",
    "else:\n",
    "    print(\"Job description not found.\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "my_env",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}