{ "cells": [
{ "cell_type": "code", "execution_count": null, "id": "730ba509", "metadata": {}, "outputs": [], "source": [ "from IPython.core.interactiveshell import InteractiveShell\n", "\n", "# Echo the value of every top-level expression in a cell, not only the last one.\n", "InteractiveShell.ast_node_interactivity = \"all\"" ] },
{ "cell_type": "code", "execution_count": null, "id": "d9acd4b6", "metadata": {}, "outputs": [], "source": [ "from pathlib import Path\n", "import sys\n", "\n", "# Put the parent of the working directory on sys.path; the `utilities` import in the\n", "# next cell presumably resolves from there (verify against the project layout).\n", "proj_dir = Path.cwd().parent\n", "\n", "# Guard the append so re-running this cell does not stack duplicate entries.\n", "if str(proj_dir) not in sys.path:\n", "    sys.path.append(str(proj_dir))" ] },
{ "cell_type": "code", "execution_count": null, "id": "62452860", "metadata": {}, "outputs": [], "source": [ "from utilities.pushshift_data import scrape_submissions_by_day, submissions_to_dataframe, get_post_count_for_day" ] },
{ "cell_type": "code", "execution_count": 4, "id": "a956a623", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "17df3f2812084d3591e914ffcfd948b0", "version_major": 2, "version_minor": 0 }, "text/plain": [ "0it [00:00, ?it/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stderr", "output_type": "stream", "text": [ "2023-04-12 16:23:59,392 - INFO - Fetching data between timestamps 2013-02-28 20:00:00 and 2013-03-01 20:00:00\n", "2023-04-12 16:24:03,524 - INFO - Fetching data between timestamps 2013-02-28 20:00:00 and 2013-03-01 14:37:16\n", "2023-04-12 16:24:08,443 - INFO - Fetching data between timestamps 2013-02-28 20:00:00 and 2013-03-01 05:02:52\n", "2023-04-12 16:24:13,409 - INFO - Fetching data between timestamps 2013-02-28 20:00:00 and 2013-03-01 00:43:35\n", "2023-04-12 16:24:17,548 - INFO - Fetching data between timestamps 2013-02-28 20:00:00 and 2013-02-28 20:28:35\n", "2023-04-12 16:24:21,490 - INFO - Fetching data between timestamps 2013-02-28 20:00:00 and 2013-02-28 20:00:48\n", "2023-04-12 16:24:23,658 - INFO - Finished scraping 4106 submissions in 28.86 seconds\n" ] } ], "source": [ "subreddit_to_scrape = \"askreddit\"\n", "day_to_scrape = \"2013-03-01\"\n", "submissions = scrape_submissions_by_day(subreddit_to_scrape, day_to_scrape)" ] },
{ "cell_type": "code", "execution_count": 5, "id": "b1cc845b", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
permalinkselftexturlcreated_utcauthornum_commentsscoretitleiddownsups
0/r/AskReddit/comments/19hbm0/in_the_way_that_p...Basically, do other parts of the world have th...http://www.reddit.com/r/AskReddit/comments/19h...2013-03-01 19:58:55sjr6311In the way that popular English and American m...19hbm001
1/r/AskReddit/comments/19hblp/could_i_buy_an_an...http://www.reddit.com/r/AskReddit/comments/19h...2013-03-01 19:58:50WeirdPlane131Could I buy an Android phone without a plan an...19hblp01
2/r/AskReddit/comments/19hblj/how_do_i_reddit/Yeah.\n", "\n", "How do I reddit? I don't use or read re...http://www.reddit.com/r/AskReddit/comments/19h...2013-03-01 19:58:47xxnovaroxgg140How do I reddit19hblj00
3/r/AskReddit/comments/19hbjx/xpost_rsurvival_h...My brothers, dad and I have always been huge L...http://www.reddit.com/r/AskReddit/comments/19h...2013-03-01 19:58:07tuffstough01(x-post r/survival) Have any redditors seen Le...19hbjx01
4/r/AskReddit/comments/19hbjk/female_redditors_...I'm curious, guys tend to get asked the usual ...http://www.reddit.com/r/AskReddit/comments/19h...2013-03-01 19:57:58redditredditx3132Female Redditors, which part of the male physi...19hbjk02
\n", "
" ], "text/plain": [ " permalink \\\n", "0 /r/AskReddit/comments/19hbm0/in_the_way_that_p... \n", "1 /r/AskReddit/comments/19hblp/could_i_buy_an_an... \n", "2 /r/AskReddit/comments/19hblj/how_do_i_reddit/ \n", "3 /r/AskReddit/comments/19hbjx/xpost_rsurvival_h... \n", "4 /r/AskReddit/comments/19hbjk/female_redditors_... \n", "\n", " selftext \\\n", "0 Basically, do other parts of the world have th... \n", "1 \n", "2 Yeah.\n", "\n", "How do I reddit? I don't use or read re... \n", "3 My brothers, dad and I have always been huge L... \n", "4 I'm curious, guys tend to get asked the usual ... \n", "\n", " url created_utc \\\n", "0 http://www.reddit.com/r/AskReddit/comments/19h... 2013-03-01 19:58:55 \n", "1 http://www.reddit.com/r/AskReddit/comments/19h... 2013-03-01 19:58:50 \n", "2 http://www.reddit.com/r/AskReddit/comments/19h... 2013-03-01 19:58:47 \n", "3 http://www.reddit.com/r/AskReddit/comments/19h... 2013-03-01 19:58:07 \n", "4 http://www.reddit.com/r/AskReddit/comments/19h... 2013-03-01 19:57:58 \n", "\n", " author num_comments score \\\n", "0 sjr63 1 1 \n", "1 WeirdPlane 13 1 \n", "2 xxnovaroxgg 14 0 \n", "3 tuffstough 0 1 \n", "4 redditredditx3 13 2 \n", "\n", " title id downs ups \n", "0 In the way that popular English and American m... 19hbm0 0 1 \n", "1 Could I buy an Android phone without a plan an... 19hblp 0 1 \n", "2 How do I reddit 19hblj 0 0 \n", "3 (x-post r/survival) Have any redditors seen Le... 19hbjx 0 1 \n", "4 Female Redditors, which part of the male physi... 
19hbjk 0 2 " ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = submissions_to_dataframe(submissions)\n", "df.head()" ] },
{ "cell_type": "code", "execution_count": null, "id": "518addff", "metadata": {}, "outputs": [], "source": [ "# Imports needed by the cells below; no earlier cell in this notebook imports them.\n", "from datetime import datetime\n", "\n", "import pandas as pd" ] },
{ "cell_type": "code", "execution_count": null, "id": "6e5490dc", "metadata": {}, "outputs": [], "source": [ "start_date = datetime.strptime(\"2013-01-01\", \"%Y-%m-%d\")\n", "start_date" ] },
{ "cell_type": "code", "execution_count": null, "id": "bf13555a", "metadata": {}, "outputs": [], "source": [ "# Build a new frame rather than overwriting df[\"created_utc\"] in place, so the cell can be\n", "# re-run without feeding already-formatted strings back into pd.to_datetime(..., unit=\"s\").\n", "# NOTE(review): unit=\"s\" assumes created_utc holds epoch seconds, but the saved df.head()\n", "# output above already shows formatted timestamps -- confirm against\n", "# submissions_to_dataframe whether this conversion is still needed.\n", "df_formatted = df.assign(\n", "    created_utc=pd.to_datetime(df[\"created_utc\"], unit=\"s\")\n", "    .dt.tz_localize(\"UTC\")\n", "    .dt.strftime('%Y-%m-%d %H:%M:%S')\n", ")" ] },
{ "cell_type": "code", "execution_count": null, "id": "48e413f3", "metadata": {}, "outputs": [], "source": [ "df_formatted.head()" ] },
{ "cell_type": "code", "execution_count": null, "id": "9e83befa", "metadata": {}, "outputs": [], "source": [ "df_formatted.dtypes" ] },
{ "cell_type": "code", "execution_count": null, "id": "ba84be68", "metadata": {}, "outputs": [], "source": [] } ],
"metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.16" } }, "nbformat": 4, "nbformat_minor": 5 }