"""
This file contains some functions used to analyze the data from requests and interventions.
"""
import re
import datetime as dt

import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import torch.nn.functional as F
from torch import Tensor
from transformers import AutoModel, AutoTokenizer
# Keyword -> trilingual category label (FR / EN / AR). Keys are matched as
# case-insensitive substrings of the raw text (see classify_supplies_rule_based).
SUPPLIES_TAGS = {
    'alimentation': 'ALIMENTATION ET EAU / FOOD AND WATER / الغذاء والماء',
    'eau': 'ALIMENTATION ET EAU / FOOD AND WATER / الغذاء والماء',
    'food': 'ALIMENTATION ET EAU / FOOD AND WATER / الغذاء والماء',
    'water': 'ALIMENTATION ET EAU / FOOD AND WATER / الغذاء والماء',
    'nourriture': 'ALIMENTATION ET EAU / FOOD AND WATER / الغذاء والماء',
    'medical': 'ASSISTANCE MÉDICALE / MEDICAL ASSISTANCE / المساعدة الطبية',
    'médical': 'ASSISTANCE MÉDICALE / MEDICAL ASSISTANCE / المساعدة الطبية',
    'doctor': 'ASSISTANCE MÉDICALE / MEDICAL ASSISTANCE / المساعدة الطبية',
    'vêtements': 'VÊTEMENTS / CLOTHES / الملابس',
    'clothes': 'VÊTEMENTS / CLOTHES / الملابس',
    'secours': 'SECOURS / RESCUE / الإنقاذ',
    'rescue': 'SECOURS / RESCUE / الإنقاذ',
    'refuge': 'REFUGE / SHELTER / المأوى',
    'shelter': 'REFUGE / SHELTER / المأوى',
    'couvertures': 'COUVERTURES / COVERS / البطانيات',
    'covers': 'COUVERTURES / COVERS / البطانيات',
    'pharmaceuticals': 'PHARMACEUTICALS / MEDICAMENTS / الأدوية',
    'medicaments': 'PHARMACEUTICALS / MEDICAMENTS / الأدوية',
    'pharmacy': 'PHARMACEUTICALS / MEDICAMENTS / الأدوية',
    'medicine': 'PHARMACEUTICALS / MEDICAMENTS / الأدوية',
    'blankets': 'COUVERTURES / COVERS / البطانيات',
    'tents': 'REFUGE / SHELTER / المأوى',
    # NOTE(review): 'couches' (FR: diapers) mapped to PHARMACEUTICALS — confirm intended.
    'couches': 'PHARMACEUTICALS / MEDICAMENTS / الأدوية'
}
# Closed set of category labels used by the embedding-based classifier;
# 'OTHER' is the fallback bucket.
SUPPLIES_NEEDS_CATEGORIES = ['ALIMENTATION ET EAU / FOOD AND WATER / الغذاء والماء',
                             'ASSISTANCE MÉDICALE / MEDICAL ASSISTANCE / المساعدة الطبية',
                             'VÊTEMENTS / CLOTHES / الملابس',
                             'SECOURS / RESCUE / الإنقاذ',
                             'REFUGE / SHELTER / المأوى',
                             'COUVERTURES / COVERS / البطانيات',
                             # 'KITCHEN TOOLS / USTENSILES DE CUISINE / أدوات المطبخ',
                             'PHARMACEUTICALS / MEDICAMENTS / الأدوية',
                             'OTHER']
# Ad-hoc Arabic -> English glossary for common supply terms seen in the data.
TRANSLATION_DICT = {
    'أغطية': 'covers',
    'أسرة': 'beds',
    'وسادات': 'pillows',
    'مصابح': 'lamps',
    'خيام': 'tents',
    'ألعاب أطفال': 'toys',
    'قليل من المواد الغذائية': 'food',
    'افرشة': 'covers',
    'جلباب': 'clothes',
    'ملابس': 'clothes',
    'لديهم كل شيء': 'unknown'
}
def clean_text(text):
    """Strip invisible direction marks, non-breaking spaces and parentheses.

    Removes U+200E (LRM), U+200F (RLM), U+00A0 (NBSP) and the characters
    '(' / ')' from ``text`` and returns the cleaned string.
    """
    return re.sub(r'[\u200e\xa0()\u200f]', '', text)
def contains_arabic(text):
    """Return True if ``text`` is a string containing at least one Arabic character.

    Non-string inputs (e.g. NaN/None coming from a pandas column) return False
    instead of raising.
    """
    # Guard first: regex search would raise on non-string input.
    if not isinstance(text, str):
        return False
    arabic_pattern = re.compile(r'[\u0600-\u06FF]+')  # Arabic Unicode block
    return arabic_pattern.search(text) is not None
def arabic_to_latin_punctuation(text):
    """Replace Arabic punctuation marks in ``text`` with their Latin equivalents.

    Handles the Arabic comma, semicolon, tatweel, question mark, percent sign
    and decimal separator; all other characters pass through unchanged.
    """
    # Single-pass translation table: '،'->',', '؛'->';', 'ـ'->'_',
    # '؟'->'?', '٪'->'%', '٫'->'.'
    table = str.maketrans({
        '،': ',',
        '؛': ';',
        'ـ': '_',
        '؟': '?',
        '٪': '%',
        '٫': '.',
    })
    return text.translate(table)
def plot_timeline(df: pd.DataFrame, today: dt.datetime, date_col: str):
    """Plot the daily timeline of past vs. future requests/interventions.

    Parameters
    ----------
    df : pd.DataFrame
        Data with one row per event; ``date_col`` must hold ``datetime.date``
        values (compared against ``today.date()``).
    today : dt.datetime
        Reference date splitting past from future.
    date_col : str
        Name of the date column to count by.

    Returns
    -------
    go.Figure
        Figure with a blue "past" line, an orange "future" line and a dashed
        vertical marker at ``today``.
    """
    df_past = df[df[date_col] <= today.date()]
    df_future = df[df[date_col] > today.date()]
    # Daily counts for the past, reindexed to a full daily range so days with
    # no events show as 0 (column becomes 'index' after reset_index).
    count_past = (df_past
                  .groupby(date_col)
                  .size()
                  .rename('count')
                  .reset_index())
    past_date_range = pd.date_range(start=min(count_past[date_col]),
                                    end=today.date(),
                                    freq='D')
    count_past = (count_past
                  .set_index(date_col)
                  .reindex(past_date_range, fill_value=0)
                  .reset_index())
    if len(df_future) > 0:
        # Same treatment for future dates, starting the day after `today`.
        count_future = df_future.groupby(date_col).size().rename('count').reset_index()
        future_date_range = pd.date_range(start=today.date() + dt.timedelta(days=1),
                                          end=max(count_future[date_col]),
                                          freq='D')
        count_future = (count_future
                        .set_index(date_col)
                        .reindex(future_date_range, fill_value=0)
                        .reset_index())
    else:
        count_future = pd.DataFrame()
    # Bridge point at `today` so the future trace visually connects to the
    # last past point. BUGFIX: the value must live in the 'count' column
    # (it was stored under a stray 'form_date' key, leaving 'count' NaN).
    bridge_data = pd.DataFrame(
        {'index': today.date(), 'count': count_past.iloc[-1]['count']}, index=[0])
    count_future = pd.concat([bridge_data, count_future], ignore_index=True)
    # Plot
    fig = go.Figure()
    # past
    fig.add_trace(go.Scatter(x=count_past['index'],
                             y=count_past['count'],
                             mode='lines',
                             name='Past Interventions',
                             line=dict(color='blue')))
    # future
    fig.add_trace(go.Scatter(x=count_future['index'],
                             y=count_future['count'],
                             mode='lines',
                             name='Future Interventions',
                             line=dict(color='orange')))
    fig.add_vline(x=today.date(), line_dash="dash", line_color="black")
    fig.update_layout(yaxis_title="#", xaxis_title='date')
    return fig
def classify_supplies_rule_based(text: str, keep_raw: bool = False):
    """Classify a free-text supplies description into SUPPLIES_TAGS categories
    using case-insensitive keyword substring matching.

    Parameters
    ----------
    text : str
        Raw supplies description (the original annotation `pd.DataFrame` was
        wrong — the function operates on a single string).
    keep_raw : bool, optional
        When True, the lowercased raw text is appended to the result; in that
        case no 'OTHER' fallback is ever added, even with zero matches.

    Returns
    -------
    list
        Deduplicated list of matched category labels (order not guaranteed);
        ['OTHER'] when nothing matched and ``keep_raw`` is False.
    """
    classes = []
    lowercase_text = text.lower()  # case-insensitive matching
    for keyword, category in SUPPLIES_TAGS.items():
        if keyword in lowercase_text:
            classes.append(category)
    if keep_raw:
        classes.append(lowercase_text)
    elif not classes:
        # Fallback bucket when no keyword matched.
        classes.append('OTHER')
    return list(set(classes))
def classify_multilingual_field_e5(df: pd.DataFrame,
                                   field_to_tag: str = 'supplies',
                                   categories: list = SUPPLIES_NEEDS_CATEGORIES):
    """
    Tag supplies/requests into categories with the multilingual-e5-large model.

    Each text is split into fragments, both fragments and category labels are
    embedded (mean-pooled, L2-normalized), and every fragment is assigned to
    the category with the highest cosine similarity.

    Parameters
    ----------
    df : pd.DataFrame
        Input data; must contain the ``field_to_tag`` column of strings.
    field_to_tag : str
        Name of the text column to classify.
    categories : list
        Candidate category labels.

    Returns
    -------
    pd.DataFrame
        Copy of ``df`` with a new ``'{field_to_tag}_category'`` column holding
        the list of predicted categories per row.

    Notes
    -----
    Requires CUDA and downloads the model from the Hugging Face hub.
    """
    def average_pool(last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor:
        # Mean-pool token embeddings, zeroing out padding positions first.
        last_hidden = last_hidden_states.masked_fill(
            ~attention_mask[..., None].bool(), 0.0)
        return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]

    tokenizer = AutoTokenizer.from_pretrained('intfloat/multilingual-e5-large')
    model = AutoModel.from_pretrained('intfloat/multilingual-e5-large')
    model.cuda()
    processed_df = df.copy()
    values_to_classify = processed_df[field_to_tag]
    # Split on '.', ',' or the Arabic conjunction ' و' (raw string: the
    # original "\.|,| و" triggers an invalid-escape warning).
    splitter = re.compile(r"\.|,| و")
    # Cache keyed by raw text so duplicated rows are embedded only once.
    mapped_inputs = dict()
    for text in values_to_classify:
        if text in mapped_inputs:
            continue
        qr = [f"{v}" for v in splitter.split(text)]
        gt = [f"{s}" for s in categories]
        input_texts = qr + gt
        # Tokenize the input texts
        batch_dict = tokenizer(
            input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt')
        batch_dict = {k: v.cuda() for k, v in batch_dict.items()}
        outputs = model(**batch_dict)
        embeddings = average_pool(outputs.last_hidden_state, batch_dict['attention_mask'])
        # normalize embeddings
        embeddings = F.normalize(embeddings, p=2, dim=1)
        # Cosine similarity (scaled x100): fragments vs. category labels.
        scores = (embeddings[:len(qr)] @ embeddings[len(qr):].T) * 100
        mapped_inputs[text] = list(
            set([categories[int(scores[i, :].argmax())] for i in range(len(qr))]))
    # BUGFIX: align predictions to rows through the cache. Assigning
    # list(mapped_inputs.values()) positionally breaks (length mismatch)
    # whenever the column contains duplicate texts.
    processed_df[f'{field_to_tag}_category'] = values_to_classify.map(mapped_inputs)
    return processed_df
def plot_categories_share(raw_df: pd.DataFrame,
                          today: dt.datetime,
                          field: str = 'supplies'):
    """
    Plot the share of each category of requests/supplies as a pie chart.

    Parameters
    ----------
    raw_df : pd.DataFrame
        Must contain columns ``field`` and ``'{field}_category'``; the latter
        holds lists of categories (exploded into one row per category).
    today : dt.datetime
        Used only in the chart title.
    field : str
        Base name of the classified column pair.

    Returns
    -------
    plotly figure (px.pie).
    """
    df = raw_df[[field, f'{field}_category']].explode(f'{field}_category')
    # BUGFIX: groupby(as_index=False).size() returns a DataFrame, and
    # DataFrame.rename('n') with a scalar mapper raises a TypeError at
    # runtime. Count as a Series, name it 'n', then reset the index.
    pie_data = (raw_df[[field, f'{field}_category']]
                .explode(f'{field}_category')
                .groupby(f'{field}_category')
                .size()
                .rename('n')
                .reset_index())
    fig = px.pie(pie_data,
                 names=f'{field}_category',
                 values='n',
                 title=f'# per {field} category up till {today.date()}',
                 labels={f'{field}_category': f'{field}', 'n': '%'})
    return fig