# Nt3awnou-rescue-map / src/data_analysis.py
# feat: a python script with functions used for processing and analysis
# of requests and interventions data (commit dc5bb62)
"""
This file contains some functions used to analyze the data from requests and interventions.
"""
import re
import datetime as dt
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from torch import Tensor
from transformers import AutoModel, AutoTokenizer
import torch.nn.functional as F
# Maps lowercase keywords (French/English, as they appear in free-text
# supply requests) to a canonical trilingual category label
# (French / English / Arabic). Consumed by classify_supplies_rule_based.
SUPPLIES_TAGS = {
    'alimentation': 'ALIMENTATION ET EAU / FOOD AND WATER / الغذاء والماء',
    'eau': 'ALIMENTATION ET EAU / FOOD AND WATER / الغذاء والماء',
    'food': 'ALIMENTATION ET EAU / FOOD AND WATER / الغذاء والماء',
    'water': 'ALIMENTATION ET EAU / FOOD AND WATER / الغذاء والماء',
    'nourriture': 'ALIMENTATION ET EAU / FOOD AND WATER / الغذاء والماء',
    'medical': 'ASSISTANCE MÉDICALE / MEDICAL ASSISTANCE / المساعدة الطبية',
    'médical': 'ASSISTANCE MÉDICALE / MEDICAL ASSISTANCE / المساعدة الطبية',
    'doctor': 'ASSISTANCE MÉDICALE / MEDICAL ASSISTANCE / المساعدة الطبية',
    'vêtements': 'VÊTEMENTS / CLOTHES / الملابس',
    'clothes': 'VÊTEMENTS / CLOTHES / الملابس',
    'secours': 'SECOURS / RESCUE / الإنقاذ',
    'rescue': 'SECOURS / RESCUE / الإنقاذ',
    'refuge': 'REFUGE / SHELTER / المأوى',
    'shelter': 'REFUGE / SHELTER / المأوى',
    'couvertures': 'COUVERTURES / COVERS / البطانيات',
    'covers': 'COUVERTURES / COVERS / البطانيات',
    'pharmaceuticals': 'PHARMACEUTICALS / MEDICAMENTS / الأدوية',
    'medicaments': 'PHARMACEUTICALS / MEDICAMENTS / الأدوية',
    'pharmacy': 'PHARMACEUTICALS / MEDICAMENTS / الأدوية',
    'medicine': 'PHARMACEUTICALS / MEDICAMENTS / الأدوية',
    'blankets': 'COUVERTURES / COVERS / البطانيات',
    'tents': 'REFUGE / SHELTER / المأوى',
    # NOTE(review): 'couches' (French for diapers) is mapped to the
    # pharmaceuticals category — confirm this is intended.
    'couches': 'PHARMACEUTICALS / MEDICAMENTS / الأدوية'
}
# Canonical list of supply/need categories: the distinct value set of
# SUPPLIES_TAGS plus the catch-all 'OTHER'. Used as the candidate label
# set for the embedding-based classifier below.
SUPPLIES_NEEDS_CATEGORIES = ['ALIMENTATION ET EAU / FOOD AND WATER / الغذاء والماء',
                             'ASSISTANCE MÉDICALE / MEDICAL ASSISTANCE / المساعدة الطبية',
                             'VÊTEMENTS / CLOTHES / الملابس',
                             'SECOURS / RESCUE / الإنقاذ',
                             'REFUGE / SHELTER / المأوى',
                             'COUVERTURES / COVERS / البطانيات',
                             # 'KITCHEN TOOLS / USTENSILES DE CUISINE / أدوات المطبخ',
                             'PHARMACEUTICALS / MEDICAMENTS / الأدوية',
                             'OTHER']
# Arabic phrase -> English keyword translations, presumably used to
# normalize Arabic free-text answers before keyword matching
# (no caller visible in this chunk — verify usage elsewhere).
TRANSLATION_DICT = {
    'أغطية': 'covers',
    'أسرة': 'beds',
    'وسادات': 'pillows',
    'مصابح': 'lamps',
    'خيام': 'tents',
    'ألعاب أطفال': 'toys',
    'قليل من المواد الغذائية': 'food',
    'افرشة': 'covers',
    'جلباب': 'clothes',
    'ملابس': 'clothes',
    'لديهم كل شيء': 'unknown'
}
def clean_text(text):
    """Strip invisible/unwanted characters from *text*.

    Removes LTR/RTL direction marks (U+200E, U+200F), non-breaking
    spaces (U+00A0) and parentheses, returning the cleaned string.
    """
    unwanted = re.compile(r'[\u200e\xa0()\u200f]')
    return unwanted.sub('', text)
def contains_arabic(text):
    """Return True if *text* is a string containing at least one
    character in the Arabic Unicode block (U+0600–U+06FF).

    Non-string inputs (e.g. NaN/None coming from pandas columns)
    return False.
    """
    # Guard clause first: isinstance is the idiomatic type check
    # (the original used `type(text) != str`).
    if not isinstance(text, str):
        return False
    arabic_pattern = re.compile(r'[\u0600-\u06FF]+')
    return arabic_pattern.search(text) is not None
def arabic_to_latin_punctuation(text):
    """Replace Arabic punctuation marks in *text* with their Latin
    equivalents (comma, semicolon, underscore, question mark, percent
    sign, decimal point) and return the converted string.
    """
    # All mappings are single characters, so one translate() pass is
    # equivalent to the chained replace() calls.
    conversion_table = str.maketrans({
        '،': ',',
        '؛': ';',
        'ـ': '_',
        '؟': '?',
        '٪': '%',
        '٫': '.',
    })
    return text.translate(conversion_table)
def plot_timeline(df: pd.DataFrame, today: dt.datetime, date_col: str):
    """Plot a daily timeline of requests and interventions.

    Splits `df` at `today` into past and future rows, counts rows per
    day over a complete daily range (missing days filled with 0), and
    draws the two segments as separate line traces with a dashed
    vertical marker at `today`.

    Parameters
    ----------
    df : DataFrame whose `date_col` column holds date values.
    today : reference datetime separating past from future.
    date_col : name of the date column to aggregate on.

    Returns
    -------
    plotly.graph_objects.Figure
    """
    df_past = df[df[date_col] <= today.date()]
    df_future = df[df[date_col] > today.date()]
    # Daily counts for the past, reindexed over a full date range so
    # days with no rows appear as 0 instead of being skipped; after
    # reset_index the date column is named 'index'.
    count_past = (df_past
                  .groupby(date_col)
                  .size()
                  .rename('count')
                  .reset_index())
    past_date_range = pd.date_range(start=min(count_past[date_col]),
                                    end=today.date(),
                                    freq='D')
    count_past = (count_past
                  .set_index(date_col)
                  .reindex(past_date_range, fill_value=0)
                  .reset_index())
    if len(df_future) > 0:
        count_future = df_future.groupby(date_col).size().rename('count').reset_index()
        future_date_range = pd.date_range(start=today.date() + dt.timedelta(days=1),
                                          end=max(count_future[date_col]),
                                          freq='D')
        count_future = (count_future
                        .set_index(date_col)
                        .reindex(future_date_range, fill_value=0)
                        .reset_index())
    else:
        count_future = pd.DataFrame()
    # Bridge point at `today` so the future trace visually connects to
    # the last past point. BUG FIX: the count column must be named
    # 'count' (the original used 'form_date', leaving the bridge row's
    # 'count' value NaN so the traces never joined).
    bridge_date = today.date()
    bridge_data = pd.DataFrame(
        {'index': bridge_date, 'count': count_past.iloc[-1]['count']}, index=[0])
    count_future = pd.concat([bridge_data, count_future], ignore_index=True)
    # Plot
    fig = go.Figure()
    # past
    fig.add_trace(go.Scatter(x=count_past['index'],
                             y=count_past['count'],
                             mode='lines',
                             name='Past Interventions',
                             line=dict(color='blue')))
    # future
    fig.add_trace(go.Scatter(x=count_future['index'],
                             y=count_future['count'],
                             mode='lines',
                             name='Future Interventions',
                             line=dict(color='orange')))
    fig.add_vline(x=today.date(), line_dash="dash", line_color="black")
    fig.update_layout(yaxis_title="#", xaxis_title='date')
    return fig
def classify_supplies_rule_based(text: str, keep_raw: bool = False) -> list:
    """Classify free-form supplies text into SUPPLIES_TAGS categories
    using a rule-based approach.

    Matching is a case-insensitive substring search: every SUPPLIES_TAGS
    keyword found in `text` contributes its category.

    Parameters
    ----------
    text : raw supplies description (possibly mixed languages).
        BUG FIX: the original annotation said pd.DataFrame, but the
        body lowercases and substring-searches a single string.
    keep_raw : if True, also append the lowercased raw text to the
        result instead of falling back to 'OTHER'.

    Returns
    -------
    list of unique category labels; ['OTHER'] when nothing matched and
    keep_raw is False. Order is unspecified (set-based dedup).
    """
    classes = []
    lowercase_text = text.lower()  # case-insensitive matching
    for keyword, category in SUPPLIES_TAGS.items():
        if keyword in lowercase_text:
            classes.append(category)
    if keep_raw:
        classes.append(lowercase_text)
    elif not classes:
        classes.append('OTHER')
    return list(set(classes))
def classify_multilingual_field_e5(df: pd.DataFrame,
                                   field_to_tag: str = 'supplies',
                                   categories: list = SUPPLIES_NEEDS_CATEGORIES):
    """
    Tag supplies/requests into categories using the
    'intfloat/multilingual-e5-large' embedding model.

    Each value of ``df[field_to_tag]`` is split into sub-phrases (on
    '.', ',' and the Arabic conjunction ' و'); sub-phrases and candidate
    `categories` are embedded in one batch, and each sub-phrase is
    assigned the category with the highest similarity score.

    Returns a copy of `df` with a new column
    ``'{field_to_tag}_category'`` holding the list of predicted
    categories per row.

    Requires CUDA (model and tokenized inputs are moved to the GPU).
    """
    def average_pool(last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor:
        # Mean-pool token embeddings, zeroing out padding positions
        # before summing so padding doesn't skew the average.
        last_hidden = last_hidden_states.masked_fill(
            ~attention_mask[..., None].bool(), 0.0)
        return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]
    tokenizer = AutoTokenizer.from_pretrained('intfloat/multilingual-e5-large')
    model = AutoModel.from_pretrained('intfloat/multilingual-e5-large')
    model.cuda()
    # classify ar supplies
    processed_df = df.copy()
    values_to_classify = processed_df[field_to_tag]
    mapped_inputs = dict()
    # NOTE(review): duplicate texts collapse into a single dict key, so
    # len(mapped_inputs) can be < len(values_to_classify); the .loc
    # assignment at the end then receives a shorter list than the index
    # it targets — confirm input values are unique.
    for text in values_to_classify:
        gt = [f"{s}" for s in categories]
        # NOTE(review): non-raw regex string — works here, but should be
        # r"\.|,| و" to avoid the invalid-escape deprecation warning.
        qr = [f"{v}" for v in re.split("\.|,| و", text)]
        input_texts = qr + gt
        # Tokenize the input texts
        batch_dict = tokenizer(
            input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt')
        batch_dict = {k: v.cuda() for k, v in batch_dict.items()}
        outputs = model(**batch_dict)
        embeddings = average_pool(outputs.last_hidden_state, batch_dict['attention_mask'])
        # normalize embeddings
        embeddings = F.normalize(embeddings, p=2, dim=1)
        # Similarity (scaled by 100) between each sub-phrase embedding
        # (first len(qr) rows) and each category embedding (the rest);
        # embeddings are L2-normalized, so the dot product is cosine
        # similarity.
        scores = (embeddings[:len(qr)] @ embeddings[len(qr):].T) * 100
        # Best-scoring category per sub-phrase, deduplicated.
        mapped_inputs[text] = list(
            set([categories[int(scores[i,:].argmax())] for i in range(len(qr))]))
    processed_df.loc[values_to_classify.index, f'{field_to_tag}_category'] = list(
        mapped_inputs.values())
    return processed_df
def plot_categories_share(raw_df: pd.DataFrame,
                          today: dt.datetime,
                          field: str = 'supplies'):
    """
    Plot the share of each category of requests/supplies as a pie chart.

    Parameters
    ----------
    raw_df : DataFrame containing `field` and f'{field}_category'
        columns, where the category column holds a list of labels per
        row (as produced by the classifiers above).
    today : reference datetime, used only in the chart title.
    field : base column name, e.g. 'supplies'.

    Returns
    -------
    plotly.express pie Figure.
    """
    # One row per (entry, category) pair so a request counts once for
    # every category it was tagged with.
    df = raw_df[[field, f'{field}_category']].explode(f'{field}_category')
    # BUG FIX: the original chained `groupby(..., as_index=False).size()
    # .rename('n')`, which applies a string mapper to a DataFrame and
    # raises on modern pandas. groupby(...).size() yields a Series;
    # naming it 'n' then reset_index() gives the intended
    # [{field}_category, 'n'] frame.
    pie_data = (df.groupby(f'{field}_category')
                  .size()
                  .rename('n')
                  .reset_index())
    fig = px.pie(pie_data,
                 names=f'{field}_category',
                 values='n',
                 title=f'# per {field} category up till {today.date()}',
                 labels={f'{field}_category': f'{field}', 'n': '%'})
    return fig