<h1 align="center"><font color="yellow">Researching a Humanitarian Disaster Situation Report Chatbot — Using GPT-4-Turbo and full-context prompting</font></h1>

<font color="yellow">Data Scientist: Dr. Eddy Giusepe Chirinos Isidro</font>

Link de estudo:

* [Tutorial Disaster Situation](https://towardsdatascience.com/researching-a-humanitarian-disaster-situation-report-chatbot-using-gpt-4-turbo-and-full-context-f742203d495a)

Aqui vamos analisar (seguindo o tutorial, acima) [Relatórios de Situações de Desastres Humanitários na incrível plataforma ReliefWeb](https://reliefweb.int/disasters). 

Estes relatórios (conhecidos como “Sitreps”) são vitais para monitorizar e reagir a catástrofes humanitárias em todo o mundo.


Usaremos a [ReliefWeb API](https://reliefweb.int/help/api).

In [1]:
import openai

# Set your OpenAI API key (it is read below from the OPENAI_API_KEY environment variable):
import openai
import os
from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv()) # read local .env file
# Indexing os.environ raises KeyError when OPENAI_API_KEY is not set.
openai.api_key  = os.environ['OPENAI_API_KEY']

# Model name that run_llm() passes to the ChatCompletion call.
model = "gpt-3.5-turbo-1106"

def run_llm(query, system_prompt, reference_content):
    """
    Send a single-turn chat request through openai.ChatCompletion and
    return the text of the first choice.

    Parameters:
        query (str): Text sent as the user message
        system_prompt (str): Text sent as the system message
        reference_content: Accepted for interface compatibility; it is not
                           read anywhere in this function

    Returns:
        answer: The 'content' field of the first choice's message
    """
    # Sampling settings, forwarded unchanged as keyword arguments.
    sampling = dict(
        temperature=0.0,
        max_tokens=1000,
        top_p=0.95,
        frequency_penalty=0,
        presence_penalty=0,
    )

    system_message = {"role": "system", "content": system_prompt}
    user_message = {"role": "user", "content": query}

    # `model` is the module-level model name.
    completion = openai.ChatCompletion.create(
        model=model,
        messages=[system_message, user_message],
        stop=None,
        **sampling,
    )

    return completion['choices'][0]['message']['content']


In [None]:
import requests  
import os  
from bs4 import BeautifulSoup 
import re
import pandas as pd
import PyPDF2 
import traceback
import json
import ast
from langchain.text_splitter import RecursiveCharacterTextSplitter
import tiktoken

from googletrans import Translator
# Shared googletrans client used by auto_translate() for detection and translation.
translator = Translator()


def auto_translate(text, dest='pt'):
    """
    Detect the language of a text and translate it with googletrans.

    This is best-effort: any error raised while detecting or translating is
    reported on stdout and the input text is returned unchanged.

    Parameters:
        text (str): The text to be translated
        dest (str): Target language code handed to the translator. Defaults
                    to 'pt', the target this function has always used.

    Returns:
        text (str): Translated text, or the input text unchanged when it is
                    empty or the translation failed
    """
    # Nothing to translate: skip the two remote calls entirely.
    if not text:
        return text
    try:
        detected = translator.detect(text)
        print(f"Linguagem detectado: {detected.lang}")
        text = translator.translate(text, dest=dest).text
    except Exception as e:
        # Deliberately broad: translation must never abort the caller, but
        # the cause is reported instead of being silently discarded.
        print(f"An exception occurred trying to translate: {e}")
    return text

def get_safe_name(name):
    """
    Build a filename-safe version of an arbitrary value.

    The value is converted to a string, every run of characters other than
    ASCII letters and digits is collapsed into a single underscore, and one
    trailing underscore (if present) is removed. A leading underscore is kept.

    Parameters:
        name (str): The value to be converted to a safe filename.

    Returns:
        name (str): The safe filename, or 'Unknown' if nothing is left.
    """
    collapsed = re.sub("[^0-9a-zA-Z]+", "_", str(name))
    # Runs were collapsed above, so at most one underscore can trail.
    if collapsed.endswith("_"):
        collapsed = collapsed[:-1]
    return collapsed if collapsed else 'Unknown'

def download_pdf(url, download_path, timeout=30):
    """
    Function to download a PDF from a URL and save locally

    Parameters:
        url (str): Location of online PDF file
        download_path (str): Path of the local file to write
        timeout (float): Seconds to wait for the server before giving up

    Raises:
        requests.exceptions.RequestException: If the request fails, times
            out, or the server answers with an HTTP error status
    """
    response = requests.get(url, timeout=timeout)
    # Fail before writing anything, so an HTTP error page is never stored
    # on disk under a .pdf name.
    response.raise_for_status()
    with open(download_path, 'wb') as f:
        f.write(response.content)
  
def save_text(content, file_path):
    """
    Function to save text to local file

    Parameters:
        content (str): Text to save
        file_path (str): Path of the file to write (overwritten if it exists)
    """
    # Explicit UTF-8: the platform default encoding may be unable to encode
    # non-ASCII characters present in the text.
    with open(file_path, 'w', encoding='utf-8') as file:
        print(f'Saving {file_path}')
        file.write(content)
  
def extract_text_from_pdf(pdf_path):
    """
    Read a PDF with PyPDF2 and return the text of all its pages.

    The path is printed first; page texts are concatenated in page order
    with no separator between pages.

    Parameters:
        pdf_path (str): Path to PDF file

    Returns:
        text (str): Text extracted from PDF file
    """
    print(pdf_path)
    reader = PyPDF2.PdfReader(pdf_path)
    return ''.join(page.extract_text() for page in reader.pages)

def get_rw_data(keyword, filter, sort, fields, endpoint, limit=10,
                save_body_to_text=False):
    """
    Function to extract data from ReliefWeb API. For API details see:

    https://apidoc.rwlabs.org/?utm_medium=blog&utm_source=reliefweb+website&utm_campaign=api+doc+launching+2016_06

    When save_body_to_text is True, each article page is also fetched, the
    text of its <p> elements is saved to a .txt file, and every linked PDF
    is downloaded and its text extracted. If a page links one or more PDFs,
    'article_text' holds the text of the last PDF processed; otherwise it
    holds the page text.

    Relies on names defined elsewhere in this file: save_text, get_safe_name,
    download_pdf, extract_text_from_pdf, and the module-level values
    docs_folder and reliefweb_pdf_url (not defined in this function;
    presumably set before it is called - verify).

    Parameters:
        keyword (str): Search string
        filter (dict): ReliefWeb filter json
        sort (dict): ReliefWeb sort json
        fields (list): List of fields to return; the code reads 'url' from
                       every record, and 'title' when save_body_to_text is set
        endpoint (str): API Endpoint, eg reports, disasters
        limit (int): Maximum records to return
        save_body_to_text (bool) : Flag to save body to text file, including any PDFs on page

    Returns:
        all_data (pandas dataframe): Dataframe of data from API, or None
                                     if the API did not answer with status 200
    """
    query = {
        "appname": "myapp",
        "query": {
            "value": keyword
        },
        "filter": filter,
        "sort": sort,
        "limit": limit,
        "fields": fields
    }

    # Base URL only. The endpoint name is appended below, so it must not be
    # part of the base (otherwise "reports" would yield /v1/reports/reports).
    reliefweb_api_url = "https://api.reliefweb.int/v1"

    request_url = f"{reliefweb_api_url}/{endpoint}?appname=apidoc&query[value]="
    print(f"Getting {request_url} ...")

    response = requests.post(request_url, json=query)
    if response.status_code != 200:
        print(f"Request failed with status {response.status_code} {response.text}")
        return None

    records = []
    for article in response.json()["data"]:
        article_url = article['fields']['url']
        try:
            r = article['fields']
            print(article_url)
            if save_body_to_text:
                # The page is only needed for text/PDF extraction, so it is
                # fetched only when that was requested.
                article_response = requests.get(article_url)
                soup = BeautifulSoup(article_response.text, 'html.parser')
                main_content = [p.text for p in soup.find_all('p')]
                article_text = ' '.join(main_content)
                save_text(article_text, docs_folder + '/{}.txt'.format(get_safe_name(article['fields']['title'])))
                for link in soup.find_all('a'):
                    href = link.get('href')
                    # Anchors without an href return None; skip them instead
                    # of failing and losing the whole article.
                    if not href or not href.endswith('.pdf'):
                        continue
                    download_path = os.path.join(docs_folder, href.split('/')[-1])
                    if href.startswith('/attachments'):
                        # Site-relative attachment link: prefix the site URL.
                        pdf_url = f'{reliefweb_pdf_url}{href}'
                    else:
                        pdf_url = href
                    download_pdf(pdf_url, download_path)
                    print(f".    Downloaded PDF {download_path} from {pdf_url}")
                    # PDF text replaces the page text (last PDF wins).
                    article_text = extract_text_from_pdf(download_path)
                r['article_text'] = article_text
                r['reliefweb_query'] = keyword
            records.append(r)
        except Exception as e:
            # Best-effort per article: report the failure and continue with
            # the next one; the failed article is not included in the result.
            print(f"An exception occurred trying to extract {article_url}")
            tb_str = ''.join(traceback.format_exception(None, e, e.__traceback__))
            print(tb_str)

    all_data = pd.DataFrame(records)
    # These two columns are converted to plain strings when present.
    for column in ['disaster', 'theme']:
        if column in all_data.columns:
            all_data[column] = all_data[column].astype(str)
    return all_data