| """ | |
| This file contains some functions used to analyze the data from requests and interventions. | |
| """ | |
| import re | |
| import datetime as dt | |
| import pandas as pd | |
| import plotly.express as px | |
| import plotly.graph_objects as go | |
| from torch import Tensor | |
| from transformers import AutoModel, AutoTokenizer | |
| import torch.nn.functional as F | |
# Keyword (FR/EN) -> trilingual category label, used by the rule-based classifier.
SUPPLIES_TAGS = {
    'alimentation': 'ALIMENTATION ET EAU / FOOD AND WATER / الغذاء والماء',
    'eau': 'ALIMENTATION ET EAU / FOOD AND WATER / الغذاء والماء',
    'food': 'ALIMENTATION ET EAU / FOOD AND WATER / الغذاء والماء',
    'water': 'ALIMENTATION ET EAU / FOOD AND WATER / الغذاء والماء',
    'nourriture': 'ALIMENTATION ET EAU / FOOD AND WATER / الغذاء والماء',
    'medical': 'ASSISTANCE MÉDICALE / MEDICAL ASSISTANCE / المساعدة الطبية',
    'médical': 'ASSISTANCE MÉDICALE / MEDICAL ASSISTANCE / المساعدة الطبية',
    'doctor': 'ASSISTANCE MÉDICALE / MEDICAL ASSISTANCE / المساعدة الطبية',
    'vêtements': 'VÊTEMENTS / CLOTHES / الملابس',
    'clothes': 'VÊTEMENTS / CLOTHES / الملابس',
    'secours': 'SECOURS / RESCUE / الإنقاذ',
    'rescue': 'SECOURS / RESCUE / الإنقاذ',
    'refuge': 'REFUGE / SHELTER / المأوى',
    'shelter': 'REFUGE / SHELTER / المأوى',
    'couvertures': 'COUVERTURES / COVERS / البطانيات',
    'covers': 'COUVERTURES / COVERS / البطانيات',
    'pharmaceuticals': 'PHARMACEUTICALS / MEDICAMENTS / الأدوية',
    'medicaments': 'PHARMACEUTICALS / MEDICAMENTS / الأدوية',
    'pharmacy': 'PHARMACEUTICALS / MEDICAMENTS / الأدوية',
    'medicine': 'PHARMACEUTICALS / MEDICAMENTS / الأدوية',
    'blankets': 'COUVERTURES / COVERS / البطانيات',
    'tents': 'REFUGE / SHELTER / المأوى',
    'couches': 'PHARMACEUTICALS / MEDICAMENTS / الأدوية'
}

# Canonical category labels used by the embedding-based classifier.
SUPPLIES_NEEDS_CATEGORIES = ['ALIMENTATION ET EAU / FOOD AND WATER / الغذاء والماء',
                             'ASSISTANCE MÉDICALE / MEDICAL ASSISTANCE / المساعدة الطبية',
                             'VÊTEMENTS / CLOTHES / الملابس',
                             'SECOURS / RESCUE / الإنقاذ',
                             'REFUGE / SHELTER / المأوى',
                             'COUVERTURES / COVERS / البطانيات',
                             # 'KITCHEN TOOLS / USTENSILES DE CUISINE / أدوات المطبخ',
                             'PHARMACEUTICALS / MEDICAMENTS / الأدوية',
                             'OTHER']

# Ad-hoc Arabic -> English translations for common free-text supply descriptions.
TRANSLATION_DICT = {
    'أغطية': 'covers',
    'أسرة': 'beds',
    'وسادات': 'pillows',
    'مصابح': 'lamps',
    'خيام': 'tents',
    'ألعاب أطفال': 'toys',
    'قليل من المواد الغذائية': 'food',
    'افرشة': 'covers',
    'جلباب': 'clothes',
    'ملابس': 'clothes',
    'لديهم كل شيء': 'unknown'
}

def clean_text(text):
    """
    Remove special characters (LRM/RLM directionality marks, non-breaking spaces,
    parentheses) from text.
    """
    pattern = re.compile(r'[\u200e\xa0()\u200f]')
    cleaned_text = pattern.sub('', text)
    return cleaned_text

def contains_arabic(text):
    """
    Check whether the text contains Arabic characters.
    """
    if not isinstance(text, str):
        return False
    arabic_pattern = re.compile(r'[\u0600-\u06FF]+')
    return arabic_pattern.search(text) is not None

def arabic_to_latin_punctuation(text):
    """
    Replace Arabic punctuation with Latin punctuation.
    """
    punctuation_mapping = {
        '،': ',',
        '؛': ';',
        'ـ': '_',
        '؟': '?',
        '٪': '%',
        '٫': '.',
    }
    for arabic_punct, latin_punct in punctuation_mapping.items():
        text = text.replace(arabic_punct, latin_punct)
    return text
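
# Illustrative usage of the text helpers (a sketch; the sample strings are hypothetical):
#   clean_text('water\u200e (bottled)')        -> 'water bottled'
#   contains_arabic('نحتاج ماء')               -> True
#   contains_arabic(None)                      -> False
#   arabic_to_latin_punctuation('ماء، خيام؟')  -> 'ماء, خيام?'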

def plot_timeline(df: pd.DataFrame, today: dt.datetime, date_col: str):
    """Plot the daily timeline of requests/interventions, split into past and future dates.
    """
    df_past = df[df[date_col] <= today.date()]
    df_future = df[df[date_col] > today.date()]
    count_past = (df_past
                  .groupby(date_col)
                  .size()
                  .rename('count')
                  .reset_index())
    past_date_range = pd.date_range(start=min(count_past[date_col]),
                                    end=today.date(),
                                    freq='D')
    count_past = (count_past
                  .set_index(date_col)
                  .reindex(past_date_range, fill_value=0)
                  .reset_index())
    if len(df_future) > 0:
        count_future = df_future.groupby(date_col).size().rename('count').reset_index()
        future_date_range = pd.date_range(start=today.date() + dt.timedelta(days=1),
                                          end=max(count_future[date_col]),
                                          freq='D')
        count_future = (count_future
                        .set_index(date_col)
                        .reindex(future_date_range, fill_value=0)
                        .reset_index())
    else:
        count_future = pd.DataFrame()
    # Bridge row at today's date so the future line connects to the last past value.
    bridge_date = today.date()
    bridge_data = pd.DataFrame(
        {'index': bridge_date, 'count': count_past.iloc[-1]['count']}, index=[0])
    count_future = pd.concat([bridge_data, count_future], ignore_index=True)
    # Plot
    fig = go.Figure()
    # past
    fig.add_trace(go.Scatter(x=count_past['index'],
                             y=count_past['count'],
                             mode='lines',
                             name='Past Interventions',
                             line=dict(color='blue')))
    # future
    fig.add_trace(go.Scatter(x=count_future['index'],
                             y=count_future['count'],
                             mode='lines',
                             name='Future Interventions',
                             line=dict(color='orange')))
    fig.add_vline(x=today.date(), line_dash="dash", line_color="black")
    fig.update_layout(yaxis_title="#", xaxis_title='date')
    return fig
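
# Illustrative usage (a sketch; `requests_df` and its 'date' column are hypothetical names,
# and the column is assumed to hold datetime.date values, which the comparisons against
# today.date() expect):
#   fig = plot_timeline(requests_df, today=dt.datetime.now(), date_col='date')
#   fig.show()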

def classify_supplies_rule_based(text: str, keep_raw: bool = False):
    """Classify free text into supplies categories from SUPPLIES_TAGS
    using a rule-based keyword search."""
    classes = []
    lowercase_text = text.lower()  # case-insensitive matching
    for keyword, category in SUPPLIES_TAGS.items():
        if keyword in lowercase_text:
            classes.append(category)
    if keep_raw:
        classes.append(lowercase_text)
    elif not classes:
        classes.append('OTHER')
    return list(set(classes))
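
# Illustrative usage (a sketch; the sample sentence is hypothetical):
#   classify_supplies_rule_based('We need water and blankets')
#   -> ['ALIMENTATION ET EAU / FOOD AND WATER / الغذاء والماء',
#       'COUVERTURES / COVERS / البطانيات']
#   (order not guaranteed, since the result goes through a set)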

def classify_multilingual_field_e5(df: pd.DataFrame,
                                   field_to_tag: str = 'supplies',
                                   categories: list = SUPPLIES_NEEDS_CATEGORIES):
    """
    Tag supplies/requests into categories using the intfloat/multilingual-e5-large model.
    Returns a dataframe with a new column containing the list of predicted categories.
    Requires CUDA.
    """
    def average_pool(last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor:
        last_hidden = last_hidden_states.masked_fill(
            ~attention_mask[..., None].bool(), 0.0)
        return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]

    tokenizer = AutoTokenizer.from_pretrained('intfloat/multilingual-e5-large')
    model = AutoModel.from_pretrained('intfloat/multilingual-e5-large')
    model.cuda()
    processed_df = df.copy()
    values_to_classify = processed_df[field_to_tag]
    mapped_inputs = dict()
    for text in values_to_classify:
        gt = [f"{s}" for s in categories]
        # Split the free text on '.', ',' and the Arabic conjunction ' و' ("and").
        qr = [f"{v}" for v in re.split(r"\.|,| و", text)]
        input_texts = qr + gt
        # Tokenize the input texts
        batch_dict = tokenizer(
            input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt')
        batch_dict = {k: v.cuda() for k, v in batch_dict.items()}
        with torch.no_grad():
            outputs = model(**batch_dict)
        embeddings = average_pool(outputs.last_hidden_state, batch_dict['attention_mask'])
        # normalize embeddings
        embeddings = F.normalize(embeddings, p=2, dim=1)
        # Similarity between each text fragment and each category label.
        scores = (embeddings[:len(qr)] @ embeddings[len(qr):].T) * 100
        mapped_inputs[text] = list(
            set([categories[int(scores[i, :].argmax())] for i in range(len(qr))]))
    # Map through the dict so duplicate texts are assigned consistently.
    processed_df[f'{field_to_tag}_category'] = values_to_classify.map(mapped_inputs)
    return processed_df
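
# Illustrative usage (a sketch; `requests_df` is a hypothetical dataframe with a free-text
# 'supplies' column). Note that this downloads intfloat/multilingual-e5-large on first use
# and needs a CUDA-capable GPU:
#   tagged_df = classify_multilingual_field_e5(requests_df, field_to_tag='supplies')
#   tagged_df['supplies_category'].head()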

def plot_categories_share(raw_df: pd.DataFrame,
                          today: dt.datetime,
                          field: str = 'supplies'):
    """
    Plot the share of each category of requests/supplies as a pie chart.
    """
    df = raw_df[[field, f'{field}_category']].explode(f'{field}_category')
    # Count rows per category; name the count column 'n' for the pie chart.
    pie_data = df.groupby(f'{field}_category').size().rename('n').reset_index()
    fig = px.pie(pie_data,
                 names=f'{field}_category',
                 values='n',
                 title=f'# per {field} category up to {today.date()}',
                 labels={f'{field}_category': f'{field}', 'n': '%'})
    return fig
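
# Illustrative usage (a sketch; builds on the hypothetical `tagged_df` from the example above):
#   fig = plot_categories_share(tagged_df, today=dt.datetime.now(), field='supplies')
#   fig.show()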