EddyGiusepe committed on
Commit
c9db2c1
1 Parent(s): c0fffe0
Files changed (2)
  1. .gitignore +4 -0
  2. Disaster_GPT4-turbo.ipynb +349 -0
.gitignore ADDED
@@ -0,0 +1,4 @@
+ # EddyGiusepe
+ venv_GPT4_Disaster/
+ .env
+
Disaster_GPT4-turbo.ipynb ADDED
@@ -0,0 +1,349 @@
+ {
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "<h1 align=\"center\"><font color=\"yellow\">Researching a Humanitarian Disaster Situation Report Chatbot — Using GPT-4-Turbo and full-context prompting</font></h1>"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "<font color=\"yellow\">Data Scientist: Dr. Eddy Giusepe Chirinos Isidro</font>"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Study link:\n",
+ "\n",
+ "* [Tutorial Disaster Situation](https://towardsdatascience.com/researching-a-humanitarian-disaster-situation-report-chatbot-using-gpt-4-turbo-and-full-context-f742203d495a)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Following the tutorial above, we will analyze [Humanitarian Disaster Situation Reports on the excellent ReliefWeb platform](https://reliefweb.int/disasters).\n",
+ "\n",
+ "These reports (known as “Sitreps”) are vital for monitoring and reacting to humanitarian disasters around the world.\n",
+ "\n",
+ "We will use the [ReliefWeb API](https://reliefweb.int/help/api)."
+ ]
+ },
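+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Before wrapping everything in helper functions, here is a minimal sketch of a raw ReliefWeb API request (the keyword, `appname` value, and field list are illustrative assumptions, not part of the tutorial):"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import requests\n",
+ "\n",
+ "# Minimal sketch of a direct ReliefWeb API call; parameters are illustrative.\n",
+ "# API docs: https://apidoc.rwlabs.org/\n",
+ "api_url = \"https://api.reliefweb.int/v1/reports?appname=myapp\" # placeholder appname\n",
+ "query = {\n",
+ "    \"query\": {\"value\": \"Sudan floods\"}, # example free-text keyword\n",
+ "    \"limit\": 3,\n",
+ "    \"fields\": {\"include\": [\"title\", \"url\", \"date.created\"]},\n",
+ "}\n",
+ "response = requests.post(api_url, json=query)\n",
+ "response.raise_for_status()\n",
+ "for item in response.json()[\"data\"]:\n",
+ "    print(item[\"fields\"][\"title\"])\n"
+ ]
+ },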
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "import openai\n",
+ "from dotenv import load_dotenv, find_dotenv\n",
+ "\n",
+ "# Set your OpenAI API key (read from the local .env file):\n",
+ "_ = load_dotenv(find_dotenv())\n",
+ "openai.api_key = os.environ['OPENAI_API_KEY']\n",
+ "\n",
+ "model = \"gpt-3.5-turbo-1106\"\n",
+ "\n",
+ "def run_llm(query, system_prompt, reference_content):\n",
+ "    \"\"\"\n",
+ "    Send a system prompt and a user query to the chat model and return the answer.\n",
+ "\n",
+ "    Note: reference_content is not used directly here; with full-context\n",
+ "    prompting the caller embeds it in the query.\n",
+ "    \"\"\"\n",
+ "    llm_query = {\n",
+ "        \"temperature\": 0.0,\n",
+ "        \"max_tokens\": 1000,\n",
+ "        \"top_p\": 0.95,\n",
+ "        \"frequency_penalty\": 0,\n",
+ "        \"presence_penalty\": 0,\n",
+ "    }\n",
+ "\n",
+ "    response = openai.ChatCompletion.create(\n",
+ "        model=model,\n",
+ "        messages=[\n",
+ "            {\"role\": \"system\", \"content\": system_prompt},\n",
+ "            {\"role\": \"user\", \"content\": query}\n",
+ "        ],\n",
+ "        temperature=llm_query['temperature'],\n",
+ "        max_tokens=llm_query['max_tokens'],\n",
+ "        top_p=llm_query['top_p'],\n",
+ "        frequency_penalty=llm_query['frequency_penalty'],\n",
+ "        presence_penalty=llm_query['presence_penalty'],\n",
+ "        stop=None\n",
+ "    )\n",
+ "\n",
+ "    answer = response['choices'][0]['message']['content']\n",
+ "    return answer\n"
+ ]
+ },
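+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "A quick usage sketch of `run_llm` (the prompt and the placeholder report text are illustrative assumptions): with full-context prompting, the report content is simply embedded in the user query."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Illustrative sketch: embed the report text directly in the query (full-context prompting).\n",
+ "system_prompt = \"You are an assistant answering questions about humanitarian disaster situation reports.\" # example prompt\n",
+ "report_text = \"...\" # placeholder: a Sitrep body fetched with the functions below\n",
+ "question = f\"Using only the report below, summarize the situation.\\n\\nREPORT:\\n{report_text}\"\n",
+ "print(run_llm(question, system_prompt, report_text))\n"
+ ]
+ },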
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import requests\n",
+ "import os\n",
+ "from bs4 import BeautifulSoup\n",
+ "import re\n",
+ "import pandas as pd\n",
+ "import PyPDF2\n",
+ "import traceback\n",
+ "import json\n",
+ "import ast\n",
+ "from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
+ "import tiktoken\n",
+ "\n",
+ "from googletrans import Translator\n",
+ "translator = Translator()\n",
+ "\n",
+ "docs_folder = \"docs\" # local folder for saved reports (was previously undefined)\n",
+ "os.makedirs(docs_folder, exist_ok=True)\n",
+ "reliefweb_pdf_url = \"https://reliefweb.int\" # base URL for /attachments PDF links (was previously undefined)\n",
+ "\n",
+ "\n",
+ "def auto_translate(text):\n",
+ "    \"\"\"\n",
+ "    This function automatically detects the language and translates to Portuguese.\n",
+ "\n",
+ "    Parameters:\n",
+ "        text (str): The text to be translated\n",
+ "\n",
+ "    Returns:\n",
+ "        text (str): Translated text if in another language, otherwise the\n",
+ "        input text\n",
+ "    \"\"\"\n",
+ "    try:\n",
+ "        lang = translator.detect(text)\n",
+ "        lang = lang.lang\n",
+ "        print(f\"Detected language: {lang}\")\n",
+ "        q = translator.translate(text, dest='pt')\n",
+ "        text = q.text\n",
+ "    except Exception as e:\n",
+ "        print(f\"An exception occurred trying to translate: {e}\")\n",
+ "    return text\n",
+ "\n",
+ "def get_safe_name(name):\n",
+ "    \"\"\"\n",
+ "    This function takes a string and returns a version of it that is\n",
+ "    safe to use as a filename.\n",
+ "\n",
+ "    Parameters:\n",
+ "        name (str): The string to be converted to a safe filename.\n",
+ "\n",
+ "    Returns:\n",
+ "        name (str): The safe filename.\n",
+ "    \"\"\"\n",
+ "    name = str(name)\n",
+ "    name = re.sub(\"[^0-9a-zA-Z]+\", \"_\", name)\n",
+ "    name = re.sub(r\"_$\", \"\", name)\n",
+ "    if len(name) == 0:\n",
+ "        name = 'Unknown'\n",
+ "    return name\n",
+ "\n",
+ "def download_pdf(url, download_path):\n",
+ "    \"\"\"\n",
+ "    Function to download a PDF from a URL and save it locally.\n",
+ "\n",
+ "    Parameters:\n",
+ "        url (str): Location of the online PDF file\n",
+ "        download_path (str): Path where to save the PDF\n",
+ "    \"\"\"\n",
+ "    response = requests.get(url)\n",
+ "    with open(download_path, 'wb') as f:\n",
+ "        f.write(response.content)\n",
+ "\n",
+ "def save_text(content, file_path):\n",
+ "    \"\"\"\n",
+ "    Function to save text to a local file.\n",
+ "\n",
+ "    Parameters:\n",
+ "        content (str): Text to save\n",
+ "        file_path (str): Path where to save\n",
+ "    \"\"\"\n",
+ "    with open(file_path, 'w') as file:\n",
+ "        print(f'Saving {file_path}')\n",
+ "        file.write(content)\n",
+ "\n",
+ "def extract_text_from_pdf(pdf_path):\n",
+ "    \"\"\"\n",
+ "    Function to extract text from a PDF file.\n",
+ "\n",
+ "    Parameters:\n",
+ "        pdf_path (str): Path to the PDF file\n",
+ "\n",
+ "    Returns:\n",
+ "        text (str): Text extracted from the PDF file\n",
+ "    \"\"\"\n",
+ "    print(pdf_path)\n",
+ "    pdf_reader = PyPDF2.PdfReader(pdf_path)\n",
+ "    text = ''\n",
+ "    for page_num in range(len(pdf_reader.pages)):\n",
+ "        page_obj = pdf_reader.pages[page_num]\n",
+ "        text += page_obj.extract_text()\n",
+ "    return text\n",
+ "\n",
+ "def get_rw_data(keyword, filter, sort, fields, endpoint, limit=10,\n",
+ "                save_body_to_text=False):\n",
+ "    \"\"\"\n",
+ "    Function to extract data from the ReliefWeb API. For API details see:\n",
+ "\n",
+ "    https://apidoc.rwlabs.org/\n",
+ "\n",
+ "    Parameters:\n",
+ "        keyword (str): Search string\n",
+ "        filter (dict): ReliefWeb filter json\n",
+ "        sort (list): ReliefWeb sort fields, eg [\"date.created:desc\"]\n",
+ "        fields (dict): Fields to return, eg {\"include\": [\"title\", \"url\"]}\n",
+ "        endpoint (str): API endpoint, eg reports, disasters\n",
+ "        limit (int): Maximum records to return\n",
+ "        save_body_to_text (bool): Flag to save the body to a text file, including any PDFs on the page\n",
+ "\n",
+ "    Returns:\n",
+ "        all_data (pandas dataframe): Dataframe of data from the API\n",
+ "    \"\"\"\n",
+ "    query = {\n",
+ "        \"appname\": \"myapp\",\n",
+ "        \"query\": {\n",
+ "            \"value\": keyword\n",
+ "        },\n",
+ "        \"filter\": filter,\n",
+ "        \"sort\": sort,\n",
+ "        \"limit\": limit,\n",
+ "        \"fields\": fields\n",
+ "    }\n",
+ "\n",
+ "    reliefweb_api_url = \"https://api.reliefweb.int/v1\" # base URL; the endpoint (eg reports) is appended below\n",
+ "\n",
+ "    url = f\"{reliefweb_api_url}/{endpoint}\"\n",
+ "    print(f\"Getting {url} ...\")\n",
+ "\n",
+ "    all_data = []\n",
+ "    response = requests.post(url, json=query)\n",
+ "    if response.status_code == 200:\n",
+ "        data = response.json()\n",
+ "        for article in data[\"data\"]:\n",
+ "            article_url = article['fields']['url']\n",
+ "            try:\n",
+ "                r = article['fields']\n",
+ "                print(article_url)\n",
+ "                article_response = requests.get(article_url)\n",
+ "                if save_body_to_text:\n",
+ "                    soup = BeautifulSoup(article_response.text, 'html.parser')\n",
+ "                    main_content = [p.text for p in soup.find_all('p')]\n",
+ "                    article_text = ' '.join(main_content)\n",
+ "                    save_text(article_text, docs_folder + '/{}.txt'.format(get_safe_name(article['fields']['title'])))\n",
+ "                    for link in soup.find_all('a'):\n",
+ "                        href = link.get('href')\n",
+ "                        if href and href.endswith('.pdf'):\n",
+ "                            download_path = os.path.join(docs_folder, href.split('/')[-1])\n",
+ "                            if href.startswith('/attachments'):\n",
+ "                                pdf_url = f'{reliefweb_pdf_url}{href}'\n",
+ "                            else:\n",
+ "                                pdf_url = href\n",
+ "                            download_pdf(pdf_url, download_path)\n",
+ "                            print(f\"Downloaded PDF {download_path} from {pdf_url}\")\n",
+ "                            article_text = extract_text_from_pdf(download_path)\n",
+ "                    r['article_text'] = article_text # only set when the body was scraped\n",
+ "                r['reliefweb_query'] = keyword\n",
+ "                all_data.append(r)\n",
+ "            except Exception as e:\n",
+ "                print(f\"An exception occurred trying to extract {article_url}\")\n",
+ "                tb_str = ''.join(traceback.format_exception(None, e, e.__traceback__))\n",
+ "                print(tb_str)\n",
+ "\n",
+ "        all_data = pd.DataFrame(all_data)\n",
+ "        for f in ['disaster', 'theme']:\n",
+ "            if f in list(all_data.columns):\n",
+ "                all_data[f] = all_data[f].astype(str)\n",
+ "        return all_data\n",
+ "    else:\n",
+ "        print(f\"Request failed with status {response.status_code} {response.text}\")\n",
+ "        return None\n"
+ ]
+ },
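+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "A usage sketch of `get_rw_data` (the keyword, filter, sort, and field values are illustrative assumptions; see the [API documentation](https://apidoc.rwlabs.org/) for the full syntax):"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Illustrative parameters for a ReliefWeb query; adjust to your disaster of interest.\n",
+ "keyword = \"Sudan floods\" # example search string\n",
+ "filter = {\"conditions\": [{\"field\": \"format.name\", \"value\": \"Situation Report\"}]} # example filter\n",
+ "sort = [\"date.created:desc\"] # newest first\n",
+ "fields = {\"include\": [\"title\", \"url\", \"date.created\", \"disaster\", \"theme\"]}\n",
+ "reports = get_rw_data(keyword, filter, sort, fields, \"reports\", limit=5,\n",
+ "                      save_body_to_text=True)\n",
+ "reports.head()\n"
+ ]
+ },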
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "venv_GPT4_Disaster",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.12"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+ }