Mosa committed on
Commit
a2b888f
1 Parent(s): 2981ede

here are my changes

Browse files
twitter-scraper/twint-master/twitter_scraper.ipynb CHANGED
@@ -10,7 +10,7 @@
10
  },
11
  {
12
  "cell_type": "code",
13
- "execution_count": 25,
14
  "id": "c9021300",
15
  "metadata": {
16
  "scrolled": true
@@ -18,9 +18,7 @@
18
  "outputs": [],
19
  "source": [
20
  "%%capture \n",
21
- "!pip3 install Twint \n",
22
- "#!pip install asyncio\n",
23
- "\n"
24
  ]
25
  },
26
  {
@@ -33,123 +31,209 @@
33
  },
34
  {
35
  "cell_type": "code",
36
- "execution_count": 26,
37
  "id": "1413ab2b",
38
  "metadata": {},
39
  "outputs": [],
40
  "source": [
41
- "import asyncio\n",
42
- "import os\n",
43
- "loop = asyncio.get_event_loop()\n",
44
- "loop.is_running()\n",
45
- "import twint\n",
46
- "import nest_asyncio\n",
47
- "nest_asyncio.apply()"
48
  ]
49
  },
50
  {
51
  "cell_type": "code",
52
- "execution_count": 27,
53
- "id": "193ee41e",
54
  "metadata": {},
55
- "outputs": [
56
- {
57
- "ename": "NameError",
58
- "evalue": "name 'get_tweets' is not defined",
59
- "output_type": "error",
60
- "traceback": [
61
- "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
62
- "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
63
- "\u001b[0;32m/tmp/ipykernel_17223/2414687227.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0mto_date\u001b[0m\u001b[0;34m=\u001b[0m \u001b[0;34m\"2022-6-30\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mnum_tweets\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m20\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m \u001b[0m_data\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mget_tweets\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"jimmieakesson\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mfrom_date\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mto_date\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mnum_tweets\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mu_or_s\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"u\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 5\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
64
- "\u001b[0;31mNameError\u001b[0m: name 'get_tweets' is not defined"
65
- ]
66
- }
67
- ],
68
  "source": [
69
  "from_date=\"2022-6-10 10:30:22\"\n",
70
  "to_date= \"2022-6-30\"\n",
71
  "num_tweets = 20\n",
72
- "_data=get_tweets(\"jimmieakesson\",from_date, to_date,num_tweets, u_or_s=\"u\")\n",
73
- "\n"
74
  ]
75
  },
76
  {
77
  "cell_type": "code",
78
  "execution_count": null,
79
- "id": "1276f8a4",
80
  "metadata": {},
81
- "outputs": [
82
- {
83
- "data": {
84
- "text/plain": [
85
- "9"
86
- ]
87
- },
88
- "execution_count": 17,
89
- "metadata": {},
90
- "output_type": "execute_result"
91
- }
92
- ],
93
  "source": [
94
- "tweets= list(_data.keys())\n",
95
- "len(list(_data.keys()))"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96
  ]
97
  },
98
  {
99
  "cell_type": "code",
100
  "execution_count": null,
101
- "id": "c68d6f75",
 
 
 
 
 
 
 
 
 
 
 
 
102
  "metadata": {},
103
  "outputs": [
104
  {
105
- "data": {
106
- "text/plain": [
107
- "'/home/oxygen/politweet/twitter-scraper/twint-master'"
108
- ]
109
- },
110
- "execution_count": 18,
111
- "metadata": {},
112
- "output_type": "execute_result"
 
 
 
113
  }
114
  ],
115
  "source": [
116
- "pwd"
 
 
117
  ]
118
  },
119
  {
120
  "cell_type": "code",
121
- "execution_count": null,
122
- "id": "d38514f3",
123
  "metadata": {},
124
- "outputs": [],
 
 
 
 
 
 
 
 
125
  "source": [
126
- "import scrape\n"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
127
  ]
128
  },
129
  {
130
  "cell_type": "code",
131
- "execution_count": 29,
132
- "id": "a7912a91",
133
  "metadata": {},
134
- "outputs": [],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
135
  "source": [
136
- "from_date=\"2022-6-10 10:30:22\"\n",
137
- "to_date= \"2022-6-30\"\n",
138
- "num_tweets = 20\n",
139
- "_data=scrape.scraper.get_tweets(\"jimmieakesson\",u_or_s=\"u\")\n"
140
  ]
141
  },
142
  {
143
  "cell_type": "code",
144
  "execution_count": null,
145
- "id": "e3fe4402",
146
  "metadata": {},
147
  "outputs": [],
148
- "source": [
149
- "tweets= df[\"tweet\"]\n",
150
- "for i in tweets:\n",
151
- " print(i, \"\\n\", \"__________________________________________________________\")"
152
- ]
153
  }
154
  ],
155
  "metadata": {
 
10
  },
11
  {
12
  "cell_type": "code",
13
+ "execution_count": null,
14
  "id": "c9021300",
15
  "metadata": {
16
  "scrolled": true
 
18
  "outputs": [],
19
  "source": [
20
  "%%capture \n",
21
+ "!pip3 install Twint \n"
 
 
22
  ]
23
  },
24
  {
 
31
  },
32
  {
33
  "cell_type": "code",
34
+ "execution_count": null,
35
  "id": "1413ab2b",
36
  "metadata": {},
37
  "outputs": [],
38
  "source": [
39
+ "# import asyncio\n",
40
+ "# import os\n",
41
+ "# loop = asyncio.get_event_loop()\n",
42
+ "# loop.is_running()\n",
43
+ "# import twint\n",
44
+ "# import nest_asyncio\n",
45
+ "# nest_asyncio.apply()"
46
  ]
47
  },
48
  {
49
  "cell_type": "code",
50
+ "execution_count": null,
51
+ "id": "d38514f3",
52
  "metadata": {},
53
+ "outputs": [],
54
+ "source": [
55
+ "import scrape\n"
56
+ ]
57
+ },
58
+ {
59
+ "cell_type": "code",
60
+ "execution_count": null,
61
+ "id": "a7912a91",
62
+ "metadata": {},
63
+ "outputs": [],
 
 
64
  "source": [
65
  "from_date=\"2022-6-10 10:30:22\"\n",
66
  "to_date= \"2022-6-30\"\n",
67
  "num_tweets = 20\n",
68
+ "_data=scrape.scraper.get_tweets(\"jimmieakesson\",u_or_s=\"u\",from_date=221232,to_date=2313)\n"
 
69
  ]
70
  },
71
  {
72
  "cell_type": "code",
73
  "execution_count": null,
74
+ "id": "48d50b46",
75
  "metadata": {},
76
+ "outputs": [],
 
 
 
 
 
 
 
 
 
 
 
77
  "source": [
78
+ "tweets= _data.keys()\n",
79
+ "for i in tweets:\n",
80
+ " _data[i][\"tweet\"]\n",
81
+ " print(_data[i][\"tweet\"], \"\\n\", \"__________________________________________________________\")"
82
+ ]
83
+ },
84
+ {
85
+ "cell_type": "code",
86
+ "execution_count": null,
87
+ "id": "72cabcb5",
88
+ "metadata": {},
89
+ "outputs": [],
90
+ "source": [
91
+ "from_date=\"2022-6-10 10:30:22\"\n",
92
+ "to_date= \"2022-6-30\"\n",
93
+ "num_tweets = 20\n",
94
+ "_data=scrape.scraper.string_search_user_tweets(\"jimmieakesson\",\"invandring\")\n"
95
  ]
96
  },
97
  {
98
  "cell_type": "code",
99
  "execution_count": null,
100
+ "id": "549e4fb3",
101
+ "metadata": {},
102
+ "outputs": [],
103
+ "source": [
104
+ "tweets= _data[\"tweet\"]\n",
105
+ "for i in tweets:\n",
106
+ " print(i, \"\\n\", \"__________________________________________________________\")"
107
+ ]
108
+ },
109
+ {
110
+ "cell_type": "code",
111
+ "execution_count": 3,
112
+ "id": "733dd44a",
113
  "metadata": {},
114
  "outputs": [
115
  {
116
+ "name": "stdout",
117
+ "output_type": "stream",
118
+ "text": [
119
+ "Defaulting to user installation because normal site-packages is not writeable\n",
120
+ "Requirement already satisfied: snscrape in /home/oxygen/.local/lib/python3.10/site-packages (0.3.4)\n",
121
+ "Requirement already satisfied: beautifulsoup4 in /home/oxygen/.local/lib/python3.10/site-packages (from snscrape) (4.11.1)\n",
122
+ "Requirement already satisfied: requests[socks] in /usr/lib/python3/dist-packages (from snscrape) (2.25.1)\n",
123
+ "Requirement already satisfied: lxml in /usr/lib/python3/dist-packages (from snscrape) (4.8.0)\n",
124
+ "Requirement already satisfied: soupsieve>1.2 in /home/oxygen/.local/lib/python3.10/site-packages (from beautifulsoup4->snscrape) (2.3.2.post1)\n",
125
+ "Requirement already satisfied: PySocks!=1.5.7,>=1.5.6 in /home/oxygen/.local/lib/python3.10/site-packages (from requests[socks]->snscrape) (1.7.1)\n"
126
+ ]
127
  }
128
  ],
129
  "source": [
130
+ "#%pip install -q snscrape==0.3.4\n",
131
+ "!pip3 install snscrape\n",
132
+ "#!pip3 install git+https://github.com/JustAnotherArchivist/snscrape.git"
133
  ]
134
  },
135
  {
136
  "cell_type": "code",
137
+ "execution_count": 14,
138
+ "id": "0d16422c",
139
  "metadata": {},
140
+ "outputs": [
141
+ {
142
+ "name": "stdout",
143
+ "output_type": "stream",
144
+ "text": [
145
+ "Note: you may need to restart the kernel to use updated packages.\n"
146
+ ]
147
+ }
148
+ ],
149
  "source": [
150
+ "%pip install -q snscrape==0.3.4\n",
151
+ "from datetime import date\n",
152
+ "import os\n",
153
+ "import pandas as pd\n",
154
+ "\n",
155
+ "\n",
156
+ "def get_tweets(search_term, from_date, to_date=date.today(), num_tweets=100,u_or_s='s'):\n",
157
+ " if u_or_s.lower() =='u':\n",
158
+ " extracted_tweets = \"snscrape --format '{content!r}'\"+ f\" --max-results {num_tweets} --since {from_date} twitter-user '{search_term} until:{to_date}' > extracted-tweets.txt\" \n",
159
+ " else:\n",
160
+ " extracted_tweets = \"snscrape --format '{content!r}'\"+ f\" --max-results {num_tweets} --since {from_date} twitter-search '{search_term} until:{to_date}' > extracted-tweets.txt\"\n",
161
+ " \n",
162
+ " os.system(extracted_tweets)\n",
163
+ " if os.stat(\"extracted-tweets.txt\").st_size == 0:\n",
164
+ " print('No Tweets found')\n",
165
+ " else:\n",
166
+ " df = pd.read_csv('extracted-tweets.txt', names=['content'])\n",
167
+ " data_list=[]\n",
168
+ " for row in df['content'].iteritems():\n",
169
+ " temp= str(row[0])+str(row[1])\n",
170
+ " temp= temp.replace(\"\\'\",\"\")\n",
171
+ " data_list.append(temp)\n",
172
+ " return data_list\n",
173
+ "\n"
174
  ]
175
  },
176
  {
177
  "cell_type": "code",
178
+ "execution_count": 12,
179
+ "id": "8e2adb35",
180
  "metadata": {},
181
+ "outputs": [
182
+ {
183
+ "name": "stdout",
184
+ "output_type": "stream",
185
+ "text": [
186
+ "No Tweets found\n"
187
+ ]
188
+ },
189
+ {
190
+ "name": "stderr",
191
+ "output_type": "stream",
192
+ "text": [
193
+ "Traceback (most recent call last):\n",
194
+ " File \"/home/oxygen/.local/bin/snscrape\", line 8, in <module>\n",
195
+ " sys.exit(main())\n",
196
+ " File \"/home/oxygen/.local/lib/python3.10/site-packages/snscrape/cli.py\", line 224, in main\n",
197
+ " args = parse_args()\n",
198
+ " File \"/home/oxygen/.local/lib/python3.10/site-packages/snscrape/cli.py\", line 159, in parse_args\n",
199
+ " import snscrape.modules\n",
200
+ " File \"/home/oxygen/.local/lib/python3.10/site-packages/snscrape/modules/__init__.py\", line 15, in <module>\n",
201
+ " _import_modules()\n",
202
+ " File \"/home/oxygen/.local/lib/python3.10/site-packages/snscrape/modules/__init__.py\", line 12, in _import_modules\n",
203
+ " module = importlib.import_module(moduleName)\n",
204
+ " File \"/usr/lib/python3.10/importlib/__init__.py\", line 126, in import_module\n",
205
+ " return _bootstrap._gcd_import(name[level:], package, level)\n",
206
+ " File \"/home/oxygen/.local/lib/python3.10/site-packages/snscrape/modules/instagram.py\", line 12, in <module>\n",
207
+ " class InstagramPost(typing.NamedTuple, snscrape.base.Item):\n",
208
+ " File \"/usr/lib/python3.10/typing.py\", line 2329, in _namedtuple_mro_entries\n",
209
+ " raise TypeError(\"Multiple inheritance with NamedTuple is not supported\")\n",
210
+ "TypeError: Multiple inheritance with NamedTuple is not supported\n"
211
+ ]
212
+ },
213
+ {
214
+ "ename": "UnboundLocalError",
215
+ "evalue": "local variable 'df' referenced before assignment",
216
+ "output_type": "error",
217
+ "traceback": [
218
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
219
+ "\u001b[0;31mUnboundLocalError\u001b[0m Traceback (most recent call last)",
220
+ "\u001b[0;32m/tmp/ipykernel_26511/1892081786.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0md\u001b[0m\u001b[0;34m=\u001b[0m \u001b[0mget_tweets\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"jimmieakesson\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mfrom_date\u001b[0m\u001b[0;34m=\u001b[0m \u001b[0;34m\"2022-06-01\"\u001b[0m \u001b[0;34m,\u001b[0m\u001b[0mnum_tweets\u001b[0m \u001b[0;34m=\u001b[0m\u001b[0;36m5\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mu_or_s\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"u\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
221
+ "\u001b[0;32m/tmp/ipykernel_26511/275462205.py\u001b[0m in \u001b[0;36mget_tweets\u001b[0;34m(search_term, from_date, to_date, num_tweets, u_or_s)\u001b[0m\n\u001b[1;32m 17\u001b[0m \u001b[0mdf\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread_csv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'extracted-tweets.txt'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnames\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'content'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 18\u001b[0m \u001b[0mdata_list\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 19\u001b[0;31m \u001b[0;32mfor\u001b[0m \u001b[0mrow\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'content'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0miteritems\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 20\u001b[0m \u001b[0mtemp\u001b[0m\u001b[0;34m=\u001b[0m \u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrow\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m+\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrow\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 21\u001b[0m \u001b[0mtemp\u001b[0m\u001b[0;34m=\u001b[0m \u001b[0mtemp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreplace\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"\\'\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\"\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
222
+ "\u001b[0;31mUnboundLocalError\u001b[0m: local variable 'df' referenced before assignment"
223
+ ]
224
+ }
225
+ ],
226
  "source": [
227
+ "d= get_tweets(\"jimmieakesson\",from_date= \"2022-06-01\" ,num_tweets =5, u_or_s=\"u\")"
 
 
 
228
  ]
229
  },
230
  {
231
  "cell_type": "code",
232
  "execution_count": null,
233
+ "id": "a2c837f4",
234
  "metadata": {},
235
  "outputs": [],
236
+ "source": []
 
 
 
 
237
  }
238
  ],
239
  "metadata": {