""" This file contains some functions used to analyze the data from requests and interventions. """ import re import datetime as dt import pandas as pd import plotly.express as px import plotly.graph_objects as go from torch import Tensor from transformers import AutoModel, AutoTokenizer import torch.nn.functional as F SUPPLIES_TAGS = { 'alimentation': 'ALIMENTATION ET EAU / FOOD AND WATER / الغذاء والماء', 'eau': 'ALIMENTATION ET EAU / FOOD AND WATER / الغذاء والماء', 'food': 'ALIMENTATION ET EAU / FOOD AND WATER / الغذاء والماء', 'water': 'ALIMENTATION ET EAU / FOOD AND WATER / الغذاء والماء', 'nourriture': 'ALIMENTATION ET EAU / FOOD AND WATER / الغذاء والماء', 'medical': 'ASSISTANCE MÉDICALE / MEDICAL ASSISTANCE / المساعدة الطبية', 'médical': 'ASSISTANCE MÉDICALE / MEDICAL ASSISTANCE / المساعدة الطبية', 'doctor': 'ASSISTANCE MÉDICALE / MEDICAL ASSISTANCE / المساعدة الطبية', 'vêtements': 'VÊTEMENTS / CLOTHES / الملابس', 'clothes': 'VÊTEMENTS / CLOTHES / الملابس', 'secours': 'SECOURS / RESCUE / الإنقاذ', 'rescue': 'SECOURS / RESCUE / الإنقاذ', 'refuge': 'REFUGE / SHELTER / المأوى', 'shelter': 'REFUGE / SHELTER / المأوى', 'couvertures': 'COUVERTURES / COVERS / البطانيات', 'covers': 'COUVERTURES / COVERS / البطانيات', 'pharmaceuticals': 'PHARMACEUTICALS / MEDICAMENTS / الأدوية', 'medicaments': 'PHARMACEUTICALS / MEDICAMENTS / الأدوية', 'pharmacy': 'PHARMACEUTICALS / MEDICAMENTS / الأدوية', 'medicine': 'PHARMACEUTICALS / MEDICAMENTS / الأدوية', 'blankets': 'COUVERTURES / COVERS / البطانيات', 'tents': 'REFUGE / SHELTER / المأوى', 'couches': 'PHARMACEUTICALS / MEDICAMENTS / الأدوية' } SUPPLIES_NEEDS_CATEGORIES = ['ALIMENTATION ET EAU / FOOD AND WATER / الغذاء والماء', 'ASSISTANCE MÉDICALE / MEDICAL ASSISTANCE / المساعدة الطبية', 'VÊTEMENTS / CLOTHES / الملابس', 'SECOURS / RESCUE / الإنقاذ', 'REFUGE / SHELTER / المأوى', 'COUVERTURES / COVERS / البطانيات', # 'KITCHEN TOOLS / USTENSILES DE CUISINE / أدوات المطبخ', 'PHARMACEUTICALS / MEDICAMENTS / الأدوية', 'OTHER'] TRANSLATION_DICT = { 'أغطية': 'covers', 'أسرة': 'beds', 'وسادات': 'pillows', 'مصابح': 'lamps', 'خيام': 'tents', 'ألعاب أطفال': 'toys', 'قليل من المواد الغذائية': 'food', 'افرشة': 'covers', 'جلباب': 'clothes', 'ملابس': 'clothes', 'لديهم كل شيء': 'unknown' } def clean_text(text): """ remove special characters from text """ pattern = re.compile(r'[\u200e\xa0()\u200f]') cleaned_text = pattern.sub('', text) return cleaned_text def contains_arabic(text): """ check if the text contains arabic characters """ arabic_pattern = re.compile(r'[\u0600-\u06FF]+') if type(text)!=str: return False return arabic_pattern.search(text) is not None def arabic_to_latin_punctuation(text): """ replace arabic punctuation with latin punctuation """ punctuation_mapping = { '،': ',', '؛': ';', 'ـ': '_', '؟': '?', '٪': '%', '٫': '.', } for arabic_punct, latin_punct in punctuation_mapping.items(): text = text.replace(arabic_punct, latin_punct) return text def plot_timeline(df: pd.DataFrame, today: dt.datetime, date_col: str): """Plot the timeline of requests and interventions. """ df_past = df[df[date_col]<=today.date()] df_future = df[df[date_col]>today.date()] count_past = (df_past .groupby(date_col) .size() .rename('count') .reset_index()) past_date_range = pd.date_range(start=min(count_past[date_col]), end=today.date(), freq='D') count_past = (count_past .set_index(date_col) .reindex(past_date_range, fill_value=0) .reset_index()) if len(df_future)>0: count_future = df_future.groupby(date_col).size().rename('count').reset_index() future_date_range = pd.date_range(start=today.date()+dt.timedelta(days=1), end=max(count_future[date_col]), freq='D') count_future = (count_future .set_index(date_col) .reindex(future_date_range, fill_value=0) .reset_index()) else: count_future = pd.DataFrame() bridge_date = today.date() bridge_data = pd.DataFrame( {'index': bridge_date, 'form_date':count_past.iloc[-1]['count']}, index=[0]) count_future = pd.concat([bridge_data, count_future], ignore_index=True) # Plot fig = go.Figure() # past fig.add_trace(go.Scatter(x=count_past['index'], y=count_past['count'], mode='lines', name='Past Interventions', line=dict(color='blue'))) # future fig.add_trace(go.Scatter(x=count_future['index'], y=count_future['count'], mode='lines', name='Future Interventions', line=dict(color='orange'))) fig.add_vline(x=today.date(), line_dash="dash", line_color="black") fig.update_layout(yaxis_title="#", xaxis_title='date') return fig def classify_supplies_rule_based(text: pd.DataFrame, keep_raw: bool = False): """ Classifies text into supplies categories from SUPPLIES_TAGS using a rule-based approach.""" classes = [] lowercase_text = text.lower() # case-insensitive matching for keyword, category in SUPPLIES_TAGS.items(): if keyword in lowercase_text: classes.append(category) if keep_raw: classes.append(lowercase_text) elif not classes: classes.append('OTHER') return list(set(classes)) def classify_multilingual_field_e5(df: pd.DataFrame, field_to_tag: str = 'supplies', categories: list = SUPPLIES_NEEDS_CATEGORIES): """ Tag supplies/requests into categories using multilingual-e5-large model. Returns a dataframe with a new column containing the list of predicted categories. Requires CUDA """ def average_pool(last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor: last_hidden = last_hidden_states.masked_fill( ~attention_mask[..., None].bool(), 0.0) return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None] tokenizer = AutoTokenizer.from_pretrained('intfloat/multilingual-e5-large') model = AutoModel.from_pretrained('intfloat/multilingual-e5-large') model.cuda() # classify ar supplies processed_df = df.copy() values_to_classify = processed_df[field_to_tag] mapped_inputs = dict() for text in values_to_classify: gt = [f"{s}" for s in categories] qr = [f"{v}" for v in re.split("\.|,| و", text)] input_texts = qr + gt # Tokenize the input texts batch_dict = tokenizer( input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt') batch_dict = {k: v.cuda() for k, v in batch_dict.items()} outputs = model(**batch_dict) embeddings = average_pool(outputs.last_hidden_state, batch_dict['attention_mask']) # normalize embeddings embeddings = F.normalize(embeddings, p=2, dim=1) scores = (embeddings[:len(qr)] @ embeddings[len(qr):].T) * 100 mapped_inputs[text] = list( set([categories[int(scores[i,:].argmax())] for i in range(len(qr))])) processed_df.loc[values_to_classify.index, f'{field_to_tag}_category'] = list( mapped_inputs.values()) return processed_df def plot_categories_share(raw_df: pd.DataFrame, today: dt.datetime, field: str = 'supplies'): """ Plot the share of each category of requests/supplies. """ df = raw_df[[field, f'{field}_category']].explode(f'{field}_category') pie_data = df.groupby(f'{field}_category', as_index=False).size().rename('n') fig = px.pie(pie_data, names=f'{field}_category', values='n', title=f'# per {field} category up till {today.date()}', labels={f'{field}_category': f'{field}', 'n': '%'}) return fig