{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<h1 align=\"center\"><font color=\"yellow\">Researching a Humanitarian Disaster Situation Report Chatbot — Using GPT-4-Turbo and full-context prompting</font></h1>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<font color=\"yellow\">Data Scientist.: Dr.Eddy Giusepe Chirinos Isidro</font>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Link de estudo:\n",
    "\n",
    "* [Tutorial Disaster Situation](https://towardsdatascience.com/researching-a-humanitarian-disaster-situation-report-chatbot-using-gpt-4-turbo-and-full-context-f742203d495a)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Aqui vamos analisar (seguindo o tutorial, acima) [Relatórios de Situações de Desastres Humanitários na incrível plataforma ReliefWeb](https://reliefweb.int/disasters). \n",
    "\n",
    "Estes relatórios (conhecidos como “Sitreps”) são vitais para monitorizar e reagir a catástrofes humanitárias em todo o mundo.\n",
    "\n",
    "\n",
    "Usaremos a [ReliefWeb API](https://reliefweb.int/help/api)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import openai\n",
    "\n",
    "# Substitua sua chave de API OpenAI:\n",
    "import openai\n",
    "import os\n",
    "from dotenv import load_dotenv, find_dotenv\n",
    "_ = load_dotenv(find_dotenv()) # read local .env file\n",
    "openai.api_key  = os.environ['OPENAI_API_KEY']\n",
    "\n",
    "model = \"gpt-3.5-turbo-1106\"\n",
    "\n",
    "def run_llm(query, system_prompt, reference_content):\n",
    "\n",
    "    llm_query = {\n",
    "        \"temperature\": 0.0,\n",
    "        \"max_tokens\": 1000,\n",
    "        \"top_p\": 0.95,\n",
    "        \"frequency_penalty\": 0,\n",
    "        \"presence_penalty\": 0,\n",
    "    }\n",
    "\n",
    "    response = openai.ChatCompletion.create(\n",
    "        model=model,\n",
    "        messages=[  {\n",
    "                \"role\":\"system\",\n",
    "                \"content\": system_prompt\n",
    "            },\n",
    "            {\n",
    "                \"role\":\"user\",\n",
    "                \"content\": query\n",
    "            }\n",
    "        ],\n",
    "        temperature=llm_query['temperature'],\n",
    "        max_tokens=llm_query['max_tokens'],\n",
    "        top_p=llm_query['top_p'],\n",
    "        frequency_penalty=llm_query['frequency_penalty'],\n",
    "        presence_penalty=llm_query['presence_penalty'],\n",
    "        stop=None\n",
    "    )  \n",
    "\n",
    "    answer = response['choices'][0]['message']['content']\n",
    "    return answer\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import requests  \n",
    "import os  \n",
    "from bs4 import BeautifulSoup \n",
    "import re\n",
    "import pandas as pd\n",
    "import PyPDF2 \n",
    "import traceback\n",
    "import json\n",
    "import ast\n",
    "from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
    "import tiktoken\n",
    "\n",
    "from googletrans import Translator\n",
    "translator = Translator()\n",
    "\n",
    "\n",
    "def auto_translate(text):\n",
    "    \"\"\"\n",
    "    This function automatically detects language and translates to english \n",
    "\n",
    "    Parameters:\n",
    "        text(str): The text to be translated\n",
    "\n",
    "    Returns:\n",
    "        text (str): Translated text if in another language, otherwise \n",
    "                    input text\n",
    "    \"\"\"\n",
    "    try:\n",
    "        lang = translator.detect(text)\n",
    "        lang = lang.lang\n",
    "        print(f\"Linguagem detectado: {lang}\")\n",
    "        q = translator.translate(text, dest='pt')\n",
    "        text = q.text\n",
    "    except Exception as e:\n",
    "        print(\"An exception occurred trying to translate\")\n",
    "    return text\n",
    "\n",
    "def get_safe_name(name):\n",
    "    \"\"\"\n",
    "    This function takes a string and returns a version of it that is \n",
    "    safe to use as a filename.\n",
    "\n",
    "    Parameters:\n",
    "        name (str): The string to be converted to a safe filename.\n",
    "\n",
    "    Returns:\n",
    "        name (str): The safe filename.\n",
    "    \"\"\"\n",
    "    name = str(name)\n",
    "    name = re.sub(\"[^0-9a-zA-Z]+\", \"_\", name)\n",
    "    name = re.sub(r\"_$\",\"\", name)\n",
    "    if len(name) == 0:\n",
    "        name = 'Unknown' \n",
    "    return name\n",
    "\n",
    "def download_pdf(url, download_path):  \n",
    "    \"\"\"\n",
    "    Function to download a PDF from a URL and save locally\n",
    "\n",
    "    Parameters:\n",
    "        url (str): Location of online PDF file\n",
    "        download_path (str): Folder where to save PDF\n",
    "\n",
    "    \"\"\"\n",
    "    response = requests.get(url)  \n",
    "    with open(download_path, 'wb') as f:  \n",
    "        f.write(response.content)  \n",
    "  \n",
    "def save_text(content, file_path):  \n",
    "    \"\"\"\n",
    "    Function to save text to local file\n",
    "\n",
    "    Parameters:\n",
    "        content (str): Text to save\n",
    "        file_path (str): Folder where to save \n",
    "    \"\"\"\n",
    "    with open(file_path, 'w') as file:  \n",
    "        print(f'Saving {file_path}')\n",
    "        file.write(content)  \n",
    "  \n",
    "def extract_text_from_pdf(pdf_path):  \n",
    "    \"\"\"\n",
    "    Function to extract text from PDF file\n",
    "\n",
    "    Parameters:\n",
    "        pdf_path (str): Path to PDF file\n",
    "\n",
    "    Returns:\n",
    "        text (str): Text extracted from PDF file\n",
    "    \"\"\"\n",
    "    print(pdf_path)\n",
    "    pdf_reader = PyPDF2.PdfReader(pdf_path)  \n",
    "    text = ''  \n",
    "    for page_num in range(len(pdf_reader.pages)):  \n",
    "        page_obj = pdf_reader.pages[page_num]\n",
    "        text += page_obj.extract_text()  \n",
    "    return text  \n",
    "\n",
    "def get_rw_data(keyword, filter, sort, fields, endpoint, limit=10, \\\n",
    "                save_body_to_text=False):  \n",
    "    \"\"\"\n",
    "    Function to extract data from ReliefWeb API. For API details see:\n",
    "\n",
    "    https://apidoc.rwlabs.org/?utm_medium=blog&utm_source=reliefweb+website&utm_campaign=api+doc+launching+2016_06\n",
    "\n",
    "    Parameters:\n",
    "        keyword (str): Search string\n",
    "        filter (dict): ReliefWeb filter json\n",
    "        sort (dict): ReliefWeb sort json\n",
    "        fields (list): List of fields to return\n",
    "        endpoint (str): API Endpoint, eg reports, disasters\n",
    "        limit (int): Maximum records to return\n",
    "        save_body_to_text (bool) : Flag to save body to text file, including any PDFs on page\n",
    "\n",
    "    Returns:\n",
    "        all_data (pandas dataframe): Dataframe of data from API\n",
    "    \"\"\"\n",
    "    query = {  \n",
    "        \"appname\": \"myapp\",  \n",
    "        \"query\": {  \n",
    "            \"value\": keyword\n",
    "        },  \n",
    "        \"filter\":filter,\n",
    "        \"sort\": sort,\n",
    "        \"limit\": limit,  \n",
    "        \"fields\": fields\n",
    "    }  \n",
    "\n",
    "    reliefweb_api_url =\"https://api.reliefweb.int/v1/reports\" # Eddy Adicionou\n",
    "\n",
    "    endpoint = f\"{reliefweb_api_url}/{endpoint}?appname=apidoc&query[value]=\"\n",
    "    print(f\"Getting {endpoint} ...\")\n",
    "  \n",
    "    all_data =[]\n",
    "    response = requests.post(endpoint, json=query)  \n",
    "    if response.status_code == 200:  \n",
    "        data = response.json()  \n",
    "        for article in data[\"data\"]: \n",
    "            article_url = article['fields']['url']   \n",
    "            try:\n",
    "                r = article['fields']\n",
    "                print(article_url)\n",
    "                article_response = requests.get(article_url)  \n",
    "                if save_body_to_text:\n",
    "                    soup = BeautifulSoup(article_response.text, 'html.parser')  \n",
    "                    main_content = [p.text for p in soup.find_all('p')]  \n",
    "                    article_text = ' '.join(main_content)\n",
    "                    save_text(article_text, docs_folder + '/{}.txt'.format(get_safe_name(article['fields']['title'])))  \n",
    "                    for link in soup.find_all('a'):  \n",
    "                        href = link.get('href')  \n",
    "                        if href.endswith('.pdf'):  \n",
    "                            download_path = os.path.join(docs_folder, href.split('/')[-1])  \n",
    "                            if href.startswith('/attachments'):\n",
    "                                pdf_url = f'{reliefweb_pdf_url}{href}'\n",
    "                            else:\n",
    "                                pdf_url = href\n",
    "                            download_pdf(pdf_url, download_path)  \n",
    "                            print(f\".    Downloaded PDF {download_path} from {pdf_url}\")\n",
    "                            article_text = extract_text_from_pdf(download_path)\n",
    "                    r['article_text'] = article_text\n",
    "                    r['reliefweb_query'] = keyword\n",
    "                all_data.append(r)\n",
    "            except Exception as e:\n",
    "                print(f\"An exception occurred trying to extract {article_url}\")\n",
    "                tb_str = ''.join(traceback.format_exception(None, e, e.__traceback__))\n",
    "                print(tb_str)\n",
    "\n",
    "        all_data = pd.DataFrame(all_data)\n",
    "        for f in ['disaster','theme']:\n",
    "            if f in list(all_data.columns):\n",
    "                all_data[f] = all_data[f].astype(str)\n",
    "        return all_data  \n",
    "    else:  \n",
    "        print(f\"Request failed with status {response.status_code} {response.text}\")  \n",
    "        return None "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "venv_GPT4_Disaster",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}