kimic committed on
Commit
bfe6b6c
·
1 Parent(s): 6275168

Initial commit for GPT

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ *.csv filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # poetry
98
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102
+ #poetry.lock
103
+
104
+ # pdm
105
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106
+ #pdm.lock
107
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108
+ # in version control.
109
+ # https://pdm.fming.dev/#use-with-ide
110
+ .pdm.toml
111
+
112
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113
+ __pypackages__/
114
+
115
+ # Celery stuff
116
+ celerybeat-schedule
117
+ celerybeat.pid
118
+
119
+ # SageMath parsed files
120
+ *.sage.py
121
+
122
+ # Environments
123
+ .env
124
+ .venv
125
+ env/
126
+ venv/
127
+ ENV/
128
+ env.bak/
129
+ venv.bak/
130
+
131
+ # Spyder project settings
132
+ .spyderproject
133
+ .spyproject
134
+
135
+ # Rope project settings
136
+ .ropeproject
137
+
138
+ # mkdocs documentation
139
+ /site
140
+
141
+ # mypy
142
+ .mypy_cache/
143
+ .dmypy.json
144
+ dmypy.json
145
+
146
+ # Pyre type checker
147
+ .pyre/
148
+
149
+ # pytype static type analyzer
150
+ .pytype/
151
+
152
+ # Cython debug symbols
153
+ cython_debug/
154
+
155
+ # PyCharm
156
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
159
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
160
+ #.idea/
analysis.ipynb ADDED
@@ -0,0 +1,324 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 58,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import pandas as pd\n",
10
+ "df_input = pd.read_csv('sampled_data.csv')\n",
11
+ "df_inferenced = pd.read_csv('inference_output.csv')"
12
+ ]
13
+ },
14
+ {
15
+ "cell_type": "code",
16
+ "execution_count": 59,
17
+ "metadata": {},
18
+ "outputs": [
19
+ {
20
+ "name": "stdout",
21
+ "output_type": "stream",
22
+ "text": [
23
+ "1000\n",
24
+ "1000\n"
25
+ ]
26
+ }
27
+ ],
28
+ "source": [
29
+ "print(len(df_input))\n",
30
+ "print(len(df_inferenced))"
31
+ ]
32
+ },
33
+ {
34
+ "cell_type": "code",
35
+ "execution_count": 60,
36
+ "metadata": {},
37
+ "outputs": [],
38
+ "source": [
39
+ "df_combined = pd.concat([df_input, df_inferenced], axis=1)"
40
+ ]
41
+ },
42
+ {
43
+ "cell_type": "code",
44
+ "execution_count": 61,
45
+ "metadata": {},
46
+ "outputs": [
47
+ {
48
+ "data": {
49
+ "text/html": [
50
+ "<div>\n",
51
+ "<style scoped>\n",
52
+ " .dataframe tbody tr th:only-of-type {\n",
53
+ " vertical-align: middle;\n",
54
+ " }\n",
55
+ "\n",
56
+ " .dataframe tbody tr th {\n",
57
+ " vertical-align: top;\n",
58
+ " }\n",
59
+ "\n",
60
+ " .dataframe thead th {\n",
61
+ " text-align: right;\n",
62
+ " }\n",
63
+ "</style>\n",
64
+ "<table border=\"1\" class=\"dataframe\">\n",
65
+ " <thead>\n",
66
+ " <tr style=\"text-align: right;\">\n",
67
+ " <th></th>\n",
68
+ " <th>title</th>\n",
69
+ " <th>text</th>\n",
70
+ " <th>label</th>\n",
71
+ " <th>Output</th>\n",
72
+ " <th>Tokens Used</th>\n",
73
+ " <th>Finish Reason</th>\n",
74
+ " </tr>\n",
75
+ " </thead>\n",
76
+ " <tbody>\n",
77
+ " <tr>\n",
78
+ " <th>0</th>\n",
79
+ " <td>Live at Truthdig: Robert Scheer and Thomas Fra...</td>\n",
80
+ " <td>Live at Truthdig: Robert Scheer and Thomas Fra...</td>\n",
81
+ " <td>0</td>\n",
82
+ " <td>Real</td>\n",
83
+ " <td>265</td>\n",
84
+ " <td>stop</td>\n",
85
+ " </tr>\n",
86
+ " <tr>\n",
87
+ " <th>1</th>\n",
88
+ " <td>The Mirage of a Return to Manufacturing Greatn...</td>\n",
89
+ " <td>Half a century ago, harvesting California’s 2....</td>\n",
90
+ " <td>1</td>\n",
91
+ " <td>Real</td>\n",
92
+ " <td>1627</td>\n",
93
+ " <td>stop</td>\n",
94
+ " </tr>\n",
95
+ " <tr>\n",
96
+ " <th>2</th>\n",
97
+ " <td>British PM expected to offer to fill post-Brex...</td>\n",
98
+ " <td>(Reuters) - The British government has told Ge...</td>\n",
99
+ " <td>1</td>\n",
100
+ " <td>fake</td>\n",
101
+ " <td>200</td>\n",
102
+ " <td>stop</td>\n",
103
+ " </tr>\n",
104
+ " <tr>\n",
105
+ " <th>3</th>\n",
106
+ " <td>Checkmating Obama</td>\n",
107
+ " <td>Originally published by the Jerusalem Post . \\...</td>\n",
108
+ " <td>0</td>\n",
109
+ " <td>fake</td>\n",
110
+ " <td>2166</td>\n",
111
+ " <td>stop</td>\n",
112
+ " </tr>\n",
113
+ " <tr>\n",
114
+ " <th>4</th>\n",
115
+ " <td>Thirty-eight injured in police charges in Cata...</td>\n",
116
+ " <td>MADRID (Reuters) - Emergency services have att...</td>\n",
117
+ " <td>1</td>\n",
118
+ " <td>Real</td>\n",
119
+ " <td>176</td>\n",
120
+ " <td>stop</td>\n",
121
+ " </tr>\n",
122
+ " </tbody>\n",
123
+ "</table>\n",
124
+ "</div>"
125
+ ],
126
+ "text/plain": [
127
+ " title \\\n",
128
+ "0 Live at Truthdig: Robert Scheer and Thomas Fra... \n",
129
+ "1 The Mirage of a Return to Manufacturing Greatn... \n",
130
+ "2 British PM expected to offer to fill post-Brex... \n",
131
+ "3 Checkmating Obama \n",
132
+ "4 Thirty-eight injured in police charges in Cata... \n",
133
+ "\n",
134
+ " text label Output \\\n",
135
+ "0 Live at Truthdig: Robert Scheer and Thomas Fra... 0 Real \n",
136
+ "1 Half a century ago, harvesting California’s 2.... 1 Real \n",
137
+ "2 (Reuters) - The British government has told Ge... 1 fake \n",
138
+ "3 Originally published by the Jerusalem Post . \\... 0 fake \n",
139
+ "4 MADRID (Reuters) - Emergency services have att... 1 Real \n",
140
+ "\n",
141
+ " Tokens Used Finish Reason \n",
142
+ "0 265 stop \n",
143
+ "1 1627 stop \n",
144
+ "2 200 stop \n",
145
+ "3 2166 stop \n",
146
+ "4 176 stop "
147
+ ]
148
+ },
149
+ "execution_count": 61,
150
+ "metadata": {},
151
+ "output_type": "execute_result"
152
+ }
153
+ ],
154
+ "source": [
155
+ "df_combined.head()"
156
+ ]
157
+ },
158
+ {
159
+ "cell_type": "code",
160
+ "execution_count": 62,
161
+ "metadata": {},
162
+ "outputs": [
163
+ {
164
+ "data": {
165
+ "text/plain": [
166
+ "array(['stop', 'length'], dtype=object)"
167
+ ]
168
+ },
169
+ "execution_count": 62,
170
+ "metadata": {},
171
+ "output_type": "execute_result"
172
+ }
173
+ ],
174
+ "source": [
175
+ "df_combined[\"Finish Reason\"].unique()"
176
+ ]
177
+ },
178
+ {
179
+ "cell_type": "code",
180
+ "execution_count": 63,
181
+ "metadata": {},
182
+ "outputs": [
183
+ {
184
+ "data": {
185
+ "text/plain": [
186
+ "994"
187
+ ]
188
+ },
189
+ "execution_count": 63,
190
+ "metadata": {},
191
+ "output_type": "execute_result"
192
+ }
193
+ ],
194
+ "source": [
195
+ "df_combined = df_combined[df_combined[\"Finish Reason\"] != \"length\"]\n",
196
+ "len(df_combined)"
197
+ ]
198
+ },
199
+ {
200
+ "cell_type": "code",
201
+ "execution_count": 64,
202
+ "metadata": {},
203
+ "outputs": [],
204
+ "source": [
205
+ "df_combined.drop(columns=[\"title\", \"text\", \"Tokens Used\", \"Finish Reason\"], inplace=True)"
206
+ ]
207
+ },
208
+ {
209
+ "cell_type": "code",
210
+ "execution_count": 66,
211
+ "metadata": {},
212
+ "outputs": [
213
+ {
214
+ "name": "stderr",
215
+ "output_type": "stream",
216
+ "text": [
217
+ "C:\\Users\\kimi\\AppData\\Local\\Temp\\ipykernel_31372\\3169472720.py:2: DeprecationWarning: In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newvals)`\n",
218
+ " df_combined.loc[:, \"Output\"] = df_combined[\"Output\"].str.strip().str.lower().map({\"real\": 1, \"fake\": 0})\n"
219
+ ]
220
+ }
221
+ ],
222
+ "source": [
223
+ "df_combined = df_combined.copy()\n",
224
+ "df_combined.loc[:, \"Output\"] = df_combined[\"Output\"].str.strip().str.lower().map({\"real\": 1, \"fake\": 0})"
225
+ ]
226
+ },
227
+ {
228
+ "cell_type": "code",
229
+ "execution_count": 68,
230
+ "metadata": {},
231
+ "outputs": [
232
+ {
233
+ "data": {
234
+ "text/plain": [
235
+ "994"
236
+ ]
237
+ },
238
+ "execution_count": 68,
239
+ "metadata": {},
240
+ "output_type": "execute_result"
241
+ }
242
+ ],
243
+ "source": [
244
+ "len(df_combined)"
245
+ ]
246
+ },
247
+ {
248
+ "cell_type": "code",
249
+ "execution_count": 70,
250
+ "metadata": {},
251
+ "outputs": [
252
+ {
253
+ "name": "stdout",
254
+ "output_type": "stream",
255
+ "text": [
256
+ "Accuracy: 0.7323943661971831\n",
257
+ "F1 Score: 0.5969696969696969\n"
258
+ ]
259
+ },
260
+ {
261
+ "name": "stderr",
262
+ "output_type": "stream",
263
+ "text": [
264
+ "C:\\Users\\kimi\\AppData\\Local\\Temp\\ipykernel_31372\\2541391757.py:14: MatplotlibDeprecationWarning: The seaborn styles shipped by Matplotlib are deprecated since 3.6, as they no longer correspond to the styles shipped by seaborn. However, they will remain available as 'seaborn-v0_8-<style>'. Alternatively, directly use the seaborn API instead.\n",
265
+ " plt.style.use(\"seaborn-whitegrid\")\n"
266
+ ]
267
+ },
268
+ {
269
+ "data": {
270
+ "image/png": "",
271
+ "text/plain": [
272
+ "<Figure size 800x600 with 2 Axes>"
273
+ ]
274
+ },
275
+ "metadata": {},
276
+ "output_type": "display_data"
277
+ }
278
+ ],
279
+ "source": [
280
+ "import pandas as pd\n",
281
+ "from sklearn.metrics import accuracy_score, f1_score, confusion_matrix\n",
282
+ "import matplotlib.pyplot as plt\n",
283
+ "import seaborn as sns\n",
284
+ "\n",
285
+ "accuracy = accuracy_score(df_combined[\"label\"], df_combined[\"Output\"])\n",
286
+ "f1 = f1_score(df_combined[\"label\"], df_combined[\"Output\"])\n",
287
+ "\n",
288
+ "print(f\"Accuracy: {accuracy}\")\n",
289
+ "print(f\"F1 Score: {f1}\")\n",
290
+ "\n",
291
+ "conf_matrix = confusion_matrix(df_combined[\"label\"], df_combined[\"Output\"])\n",
292
+ "\n",
293
+ "plt.style.use(\"seaborn-whitegrid\")\n",
294
+ "plt.figure(figsize=(8, 6))\n",
295
+ "sns.heatmap(conf_matrix, annot=True, fmt=\"d\", cmap=\"Blues\")\n",
296
+ "plt.title(\"Confusion Matrix (GPT-4 Turbo)\")\n",
297
+ "plt.ylabel(\"True Label\")\n",
298
+ "plt.xlabel(\"Predicted Label\")\n",
299
+ "plt.show()"
300
+ ]
301
+ }
302
+ ],
303
+ "metadata": {
304
+ "kernelspec": {
305
+ "display_name": "torch",
306
+ "language": "python",
307
+ "name": "python3"
308
+ },
309
+ "language_info": {
310
+ "codemirror_mode": {
311
+ "name": "ipython",
312
+ "version": 3
313
+ },
314
+ "file_extension": ".py",
315
+ "mimetype": "text/x-python",
316
+ "name": "python",
317
+ "nbconvert_exporter": "python",
318
+ "pygments_lexer": "ipython3",
319
+ "version": "3.10.11"
320
+ }
321
+ },
322
+ "nbformat": 4,
323
+ "nbformat_minor": 2
324
+ }
data_2/WELFake_Dataset.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:665331424230fc452e9482c3547a6a199a2c29745ade8d236950d1d105223773
3
+ size 245086152
data_3/news_articles.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:53855240e9036a7d6c204e72bd0fa9d37a10f8e1bd2b2fdf34b962569ef271c6
3
+ size 10969548
inference.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from openai import OpenAI
2
+ import os
3
+ import csv
4
+ import pandas as pd
5
+
def _count_completed_rows(path):
    """Return the number of result rows already stored in the CSV at *path*.

    The header row is not counted. A missing file counts as zero rows, so a
    fresh run starts from the first input row.
    """
    try:
        with open(path, newline="", encoding="utf-8") as file:
            # csv.reader (rather than counting raw lines) keeps the count
            # correct even if a model output contains embedded newlines.
            row_count = sum(1 for row in csv.reader(file) if row)
    except FileNotFoundError:
        return 0
    return max(row_count - 1, 0)


df = pd.read_csv("sampled_data.csv")

# pandas reads empty CSV fields back as NaN. Without this, title + " " + text
# evaluates to NaN for such rows and str() would send the literal string
# "nan" to the model instead of the article.
df["title"] = df["title"].fillna("")
df["text"] = df["text"].fillna("")

# Strip the "(Reuters)" marker, including an optional all-caps dateline in
# front of it, from the article body.
df["text"] = df["text"].str.replace(
    r"(\b[A-Z]{2,}(?:\s[A-Z]{2,})*\s\(Reuters\)\s-|\(Reuters\))", "", regex=True
)

# Strip "Featured image via ..." credit sentences.
df["text"] = df["text"].str.replace(r"Featured image via .+?\.($|\s)", "", regex=True)

# The model is shown the headline followed by the body.
df["text"] = df["title"] + " " + df["text"]

df = df[["text", "label"]]

client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

# NOTE(review): the prompt asks for step-by-step thinking but also for a
# one-word answer, and max_tokens below is 10. The wording is kept unchanged
# so that new rows stay comparable with rows produced by earlier runs.
system_prompt = """You are an expert in identifying fake news and disinformation. Please identify whether the piece of news is real or fake.
Please think step-by-step as you answer the question. However, please only respond with 'real' if the news is real or 'fake' if the news is fake.
Do not respond with any other words or phrases.
If you are unsure if the news is real or fake, please still make an educational guess."""

# Resume after the last row that was written by a previous (possibly
# interrupted) run instead of relying on a hard-coded offset. Output row k
# must correspond to input row k.
start_index = _count_completed_rows("inference_output.csv")

for i in range(start_index, len(df)):
    response = client.chat.completions.create(
        model="gpt-4-1106-preview",
        max_tokens=10,
        messages=[
            {
                "role": "system",
                "content": system_prompt,
            },
            {
                "role": "user",
                "content": str(df.iloc[i]["text"]),
            },
        ],
    )
    # Extract the response message
    output = response.choices[0].message.content
    tokens_used = response.usage.total_tokens
    finish_reason = response.choices[0].finish_reason

    # The file is reopened for every row so that each result is on disk
    # before the next API call is made.
    with open("inference_output.csv", "a", newline="", encoding="utf-8") as file:
        writer = csv.writer(file)

        # If the file is empty, write a header
        if file.tell() == 0:
            writer.writerow(["Output", "Tokens Used", "Finish Reason"])

        # Write the data
        writer.writerow([output, tokens_used, finish_reason])

    if i % 50 == 0:
        print(f"Batch: {i} / {len(df)}")
inference_output.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:04b523a4c2768bd211660ff526f8eccffefe0fa685985cee3a3ebc4ef0d833fe
3
+ size 15566
preprocessing.ipynb ADDED
@@ -0,0 +1,181 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 2,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import pandas as pd\n",
10
+ "import numpy as np"
11
+ ]
12
+ },
13
+ {
14
+ "cell_type": "code",
15
+ "execution_count": 6,
16
+ "metadata": {},
17
+ "outputs": [],
18
+ "source": [
19
+ "# Load the datasets\n",
20
+ "df_1 = pd.read_csv(\"data_2/WELFake_Dataset.csv\")\n",
21
+ "df_2 = pd.read_csv(\"data_3/news_articles.csv\")\n",
22
+ "\n",
23
+ "# Drop index\n",
24
+ "df_1.drop(df_1.columns[0], axis=1, inplace=True)\n",
25
+ "df_1.dropna(inplace=True)\n",
26
+ "\n",
27
+ "# Flip the labels, since the original encoding is the opposite of the one used here\n",
28
+ "df_1[\"label\"] = df_1[\"label\"].map({0: 1, 1: 0})\n",
29
+ "\n",
30
+ "# Drop the columns that are not needed\n",
31
+ "df_2.drop(\n",
32
+ " columns=[\n",
33
+ " \"author\",\n",
34
+ " \"published\",\n",
35
+ " \"site_url\",\n",
36
+ " \"main_img_url\",\n",
37
+ " \"type\",\n",
38
+ " \"text_without_stopwords\",\n",
39
+ " \"title_without_stopwords\",\n",
40
+ " \"hasImage\",\n",
41
+ " ],\n",
42
+ " inplace=True,\n",
43
+ ")\n",
44
+ "# Map Real to 1 and Fake to 0\n",
45
+ "df_2[\"label\"] = df_2[\"label\"].map({\"Real\": 1, \"Fake\": 0})\n",
46
+ "df_2 = df_2[df_2[\"label\"].isin([1, 0])]\n",
47
+ "\n",
48
+ "# Drop rows where the language is not 'english'\n",
49
+ "df_2 = df_2[df_2[\"language\"] == \"english\"]\n",
50
+ "df_2.drop(columns=[\"language\"], inplace=True)\n",
51
+ "\n",
52
+ "# Convert \"no title\" to empty string\n",
53
+ "df_2[\"title\"] = df_2[\"title\"].apply(lambda x: \"\" if x == \"no title\" else x)\n",
54
+ "\n",
55
+ "df_2.dropna(inplace=True)\n",
56
+ "\n",
57
+ "random_1 = df_1.sample(n=500, random_state=42)\n",
58
+ "random_2 = df_2.sample(n=500, random_state=42)\n",
59
+ "\n",
60
+ "# Combine the datasets\n",
61
+ "df = pd.concat([random_1, random_2], ignore_index=True)\n",
62
+ "\n",
63
+ "df[\"label\"] = df[\"label\"].astype(int)\n",
64
+ "\n",
65
+ "df.to_csv(\"sampled_data.csv\", index=False)"
66
+ ]
67
+ },
68
+ {
69
+ "cell_type": "code",
70
+ "execution_count": 7,
71
+ "metadata": {},
72
+ "outputs": [
73
+ {
74
+ "data": {
75
+ "text/html": [
76
+ "<div>\n",
77
+ "<style scoped>\n",
78
+ " .dataframe tbody tr th:only-of-type {\n",
79
+ " vertical-align: middle;\n",
80
+ " }\n",
81
+ "\n",
82
+ " .dataframe tbody tr th {\n",
83
+ " vertical-align: top;\n",
84
+ " }\n",
85
+ "\n",
86
+ " .dataframe thead th {\n",
87
+ " text-align: right;\n",
88
+ " }\n",
89
+ "</style>\n",
90
+ "<table border=\"1\" class=\"dataframe\">\n",
91
+ " <thead>\n",
92
+ " <tr style=\"text-align: right;\">\n",
93
+ " <th></th>\n",
94
+ " <th>title</th>\n",
95
+ " <th>text</th>\n",
96
+ " <th>label</th>\n",
97
+ " </tr>\n",
98
+ " </thead>\n",
99
+ " <tbody>\n",
100
+ " <tr>\n",
101
+ " <th>0</th>\n",
102
+ " <td>Live at Truthdig: Robert Scheer and Thomas Fra...</td>\n",
103
+ " <td>Live at Truthdig: Robert Scheer and Thomas Fra...</td>\n",
104
+ " <td>0</td>\n",
105
+ " </tr>\n",
106
+ " <tr>\n",
107
+ " <th>1</th>\n",
108
+ " <td>The Mirage of a Return to Manufacturing Greatn...</td>\n",
109
+ " <td>Half a century ago, harvesting California’s 2....</td>\n",
110
+ " <td>1</td>\n",
111
+ " </tr>\n",
112
+ " <tr>\n",
113
+ " <th>2</th>\n",
114
+ " <td>British PM expected to offer to fill post-Brex...</td>\n",
115
+ " <td>(Reuters) - The British government has told Ge...</td>\n",
116
+ " <td>1</td>\n",
117
+ " </tr>\n",
118
+ " <tr>\n",
119
+ " <th>3</th>\n",
120
+ " <td>Checkmating Obama</td>\n",
121
+ " <td>Originally published by the Jerusalem Post . \\...</td>\n",
122
+ " <td>0</td>\n",
123
+ " </tr>\n",
124
+ " <tr>\n",
125
+ " <th>4</th>\n",
126
+ " <td>Thirty-eight injured in police charges in Cata...</td>\n",
127
+ " <td>MADRID (Reuters) - Emergency services have att...</td>\n",
128
+ " <td>1</td>\n",
129
+ " </tr>\n",
130
+ " </tbody>\n",
131
+ "</table>\n",
132
+ "</div>"
133
+ ],
134
+ "text/plain": [
135
+ " title \\\n",
136
+ "0 Live at Truthdig: Robert Scheer and Thomas Fra... \n",
137
+ "1 The Mirage of a Return to Manufacturing Greatn... \n",
138
+ "2 British PM expected to offer to fill post-Brex... \n",
139
+ "3 Checkmating Obama \n",
140
+ "4 Thirty-eight injured in police charges in Cata... \n",
141
+ "\n",
142
+ " text label \n",
143
+ "0 Live at Truthdig: Robert Scheer and Thomas Fra... 0 \n",
144
+ "1 Half a century ago, harvesting California’s 2.... 1 \n",
145
+ "2 (Reuters) - The British government has told Ge... 1 \n",
146
+ "3 Originally published by the Jerusalem Post . \\... 0 \n",
147
+ "4 MADRID (Reuters) - Emergency services have att... 1 "
148
+ ]
149
+ },
150
+ "execution_count": 7,
151
+ "metadata": {},
152
+ "output_type": "execute_result"
153
+ }
154
+ ],
155
+ "source": [
156
+ "df.head()"
157
+ ]
158
+ }
159
+ ],
160
+ "metadata": {
161
+ "kernelspec": {
162
+ "display_name": "torch",
163
+ "language": "python",
164
+ "name": "python3"
165
+ },
166
+ "language_info": {
167
+ "codemirror_mode": {
168
+ "name": "ipython",
169
+ "version": 3
170
+ },
171
+ "file_extension": ".py",
172
+ "mimetype": "text/x-python",
173
+ "name": "python",
174
+ "nbconvert_exporter": "python",
175
+ "pygments_lexer": "ipython3",
176
+ "version": "3.10.11"
177
+ }
178
+ },
179
+ "nbformat": 4,
180
+ "nbformat_minor": 2
181
+ }
sampled_data.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:406b12a0d8e60d0c558d12a131f2013319b9eb910af92590a007511fb8904017
3
+ size 3510245