{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "import csv\n",
    "from getpass import getpass\n",
    "from time import sleep\n",
    "from selenium.webdriver.common.keys import Keys\n",
    "from selenium.common.exceptions import NoSuchElementException\n",
    "from msedge.selenium_tools import Edge, EdgeOptions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_tweet_data(card):\n",
    "    \"\"\"Extract the fields of interest from a single tweet card.\n",
    "\n",
    "    Args:\n",
    "        card: Selenium element for one tweet card (an ``<article>`` node).\n",
    "\n",
    "    Returns:\n",
    "        A tuple ``(username, handle, postdate, text, reply_cnt, retweet_cnt,\n",
    "        like_cnt)``, or ``None`` when the card has no ``@handle`` span or no\n",
    "        ``<time>`` element.\n",
    "\n",
    "    Raises:\n",
    "        NoSuchElementException: if any of the other elements is missing.\n",
    "    \"\"\"\n",
    "    username = card.find_element_by_xpath('.//span').text\n",
    "    try:\n",
    "        handle = card.find_element_by_xpath('.//span[contains(text(), \"@\")]').text\n",
    "        postdate = card.find_element_by_xpath('.//time').get_attribute('datetime')\n",
    "    except NoSuchElementException:\n",
    "        # no handle or timestamp: presumably not a regular tweet, so skip it\n",
    "        return None\n",
    "\n",
    "    # positional XPaths: these depend on the current layout of the card\n",
    "    comment = card.find_element_by_xpath('.//div[2]/div[2]/div[1]').text\n",
    "    responding = card.find_element_by_xpath('.//div[2]/div[2]/div[2]').text\n",
    "    text = comment + responding\n",
    "    reply_cnt = card.find_element_by_xpath('.//div[@data-testid=\"reply\"]').text\n",
    "    retweet_cnt = card.find_element_by_xpath('.//div[@data-testid=\"retweet\"]').text\n",
    "    like_cnt = card.find_element_by_xpath('.//div[@data-testid=\"like\"]').text\n",
    "\n",
    "    # NOTE: the CSV header in the saving cell must list columns in this order\n",
    "    return (username, handle, postdate, text, reply_cnt, retweet_cnt, like_cnt)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "search_term = input('search term: ')\n",
    "\n",
    "# create instance of web driver\n",
    "options = EdgeOptions()\n",
    "options.use_chromium = True\n",
    "driver = Edge(options=options)\n",
    "\n",
    "# navigate to the search page\n",
    "driver.get('https://twitter.com/search')\n",
    "driver.maximize_window()\n",
    "sleep(5)  # give the page time to load before looking up elements\n",
    "\n",
    "# find search input and search for term\n",
    "search_input = driver.find_element_by_xpath('//input[@aria-label=\"Search query\"]')\n",
    "search_input.send_keys(search_term)\n",
    "search_input.send_keys(Keys.RETURN)\n",
    "sleep(1)\n",
    "\n",
    "# navigate to historical 'latest' tab\n",
    "driver.find_element_by_link_text('Latest').click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# scroll through the results, collecting tweet cards and skipping duplicates\n",
    "MAX_SCROLL_ATTEMPTS = 3  # stalled scrolls tolerated before giving up\n",
    "SCROLL_PAUSE = 2  # seconds to wait after each scroll\n",
    "\n",
    "data = []\n",
    "tweet_ids = set()\n",
    "last_position = driver.execute_script(\"return window.pageYOffset;\")\n",
    "scrolling = True\n",
    "\n",
    "try:\n",
    "    while scrolling:\n",
    "        page_cards = driver.find_elements_by_xpath('//article[@data-testid=\"tweet\"]')\n",
    "        for card in page_cards[-15:]:  # only look at the last 15 cards found\n",
    "            tweet = get_tweet_data(card)\n",
    "            if tweet:\n",
    "                # the concatenated fields act as a de-duplication key\n",
    "                tweet_id = ''.join(tweet)\n",
    "                if tweet_id not in tweet_ids:\n",
    "                    tweet_ids.add(tweet_id)\n",
    "                    data.append(tweet)\n",
    "\n",
    "        scroll_attempt = 0\n",
    "        while True:\n",
    "            # scroll to the bottom and check whether the position changed\n",
    "            driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')\n",
    "            sleep(SCROLL_PAUSE)\n",
    "            curr_position = driver.execute_script(\"return window.pageYOffset;\")\n",
    "            if last_position == curr_position:\n",
    "                scroll_attempt += 1\n",
    "\n",
    "                # end of scroll region\n",
    "                if scroll_attempt >= MAX_SCROLL_ATTEMPTS:\n",
    "                    scrolling = False\n",
    "                    break\n",
    "                else:\n",
    "                    sleep(SCROLL_PAUSE)  # attempt another scroll\n",
    "            else:\n",
    "                last_position = curr_position\n",
    "                break\n",
    "finally:\n",
    "    # always shut the browser down, even if scraping fails part-way;\n",
    "    # whatever was collected so far stays available in `data`\n",
    "    driver.quit()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Saving the tweet data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('turkcell_tweets.csv', 'w', newline='', encoding='utf-8') as f:\n",
    "    # column order matches the tuple built by get_tweet_data:\n",
    "    # (username, handle, postdate, text, reply_cnt, retweet_cnt, like_cnt)\n",
    "    header = ['UserName', 'Handle', 'Timestamp', 'Text', 'Comments', 'Retweets', 'Likes']\n",
    "    writer = csv.writer(f)\n",
    "    writer.writerow(header)\n",
    "    writer.writerows(data)"
   ]
  }
 ],
 "metadata": {
  "interpreter": {
   "hash": "306b4709344c791e982a258cf5494139869959872aa39c2c4102a54cca0d2138"
  },
  "kernelspec": {
   "display_name": "Python 3.7.0 64-bit",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.0"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}