Spaces:

mtyrrell
/

maf_prefilter_app

Sleeping

App Files Files Community

mtyrrell committed on May 30

Commit

747caaf

•

1 Parent(s): c30cd54

sector incl. in filter logic

Browse files

Files changed (5) hide show

app.py +2 -2
images/pipeline.png +0 -0
modules/__pycache__/utils.cpython-38.pyc +0 -0
modules/utils.py +37 -15
processed_applications.csv +0 -0

app.py CHANGED Viewed

@@ -102,8 +102,8 @@ def main():
                 df = st.session_state['df']
                 # Get the current date
-                date = datetime.now().strftime('%d-%m-%Y')
-                output_filename = 'processed_applications_'+date+'.csv'
                 output_file = 'processed_applications.csv'
                 df.to_csv(output_file, index=False)

                 df = st.session_state['df']
                 # Get the current date
+                current_datetime = datetime.now().strftime('%d-%m-%Y_%H-%M-%S')
+                output_filename = 'processed_applications_'+current_datetime+'.csv'
                 output_file = 'processed_applications.csv'
                 df.to_csv(output_file, index=False)

images/pipeline.png CHANGED Viewed

modules/__pycache__/utils.cpython-38.pyc CHANGED Viewed

Binary files a/modules/__pycache__/utils.cpython-38.pyc and b/modules/__pycache__/utils.cpython-38.pyc differ

modules/utils.py CHANGED Viewed

@@ -38,19 +38,38 @@ def clean_text(input_text):
     cleaned_text = re.sub(r"\s+", " ", cleaned_text)
     cleaned_text = re.sub(r"\n+", "\n", cleaned_text)
     return cleaned_text
-# Function for extracting classifications for each SECTOR label
-def extract_predicted_labels(outputs, threshold=0.5):
-    predicted_labels = []
-    for item in outputs:
-        # check if the score is above the threshold
-        if item.get('score', 0) > threshold:
-            # if yes, append the label to the predicted_labels list
-            predicted_labels.append(item.get('label'))
-    return predicted_labels
 # Function to call model and run inference for varying classification tasks/models
 def predict_category(df, model_name, progress_bar, repo, profile, multilabel=False):
@@ -66,10 +85,9 @@ def predict_category(df, model_name, progress_bar, repo, profile, multilabel=Fal
     predictions = []
     total = len(df)
     for i, text in enumerate(df[col_name]):
-        # Simulate prediction
-        prediction = model(text)  # This is a placeholder for your actual prediction call
         if model_name in model_names_sf:
-            predictions.append(0 if prediction == 'NEGATIVE' else 1)  # Modify according to actual model output
         elif model_name == 'ADAPMIT':
             predictions.append(re.sub('Label$', '', prediction[0]['label']))
         elif model_name == 'SECTOR':
@@ -79,6 +97,7 @@ def predict_category(df, model_name, progress_bar, repo, profile, multilabel=Fal
         # Update progress bar with each iteration
         progress = (i + 1) / total
         progress_bar.progress(progress)
     return predictions
@@ -128,7 +147,9 @@ def process_data(uploaded_file, sens_level):
         elif model_name == 'ADAPMIT':
             df[model_name] = predict_category(df, model_name, progress_bar, repo='ADAPMIT-multilabel-bge_f', profile='GIZ')
         elif model_name == 'SECTOR':
-            df[model_name] = predict_category(df, model_name, progress_bar, repo='SECTOR-multilabel-bge_f', profile='GIZ', multilabel=True)
         elif model_name == 'LANG':
             df[model_name] = predict_category(df, model_name, progress_bar, repo='51-languages-classifier', profile='qanastek')
@@ -155,7 +176,8 @@ def process_data(uploaded_file, sens_level):
     st.write(f'Processing complete. Total time: {elapsed_time:.1f} seconds')
     # Further data processing and actions
-    df['pred_score'] = df.apply(lambda x: round((x['fin_lab2']*2 + x['scope_lab1']*2 + x['scope_lab2']*2 + x['tech_lab1'] + x['tech_lab3'] + x['bar_lab2'])/9*10, 0), axis=1)
-    df['pred_action'] = df.apply(lambda x: 'REJECT' if (x['pred_score'] < sens_level or x['LANG'] != 'en-US' or x['ADAPMIT'] == 'Adaptation') else 'REVIEW', axis=1)
     return df

     cleaned_text = re.sub(r"\s+", " ", cleaned_text)
     cleaned_text = re.sub(r"\n+", "\n", cleaned_text)
     return cleaned_text
+# # Function for extracting classifications for each SECTOR label
+def extract_predicted_labels(output, ordinal_selection=1, threshold=0.5):
+    # verify output is a list of dictionaries
+    if isinstance(output, list) and all(isinstance(item, dict) for item in output):
+        # filter items with scores above the threshold
+        filtered_items = [item for item in output if item.get('score', 0) > threshold]
+        # sort the filtered items by score in descending order
+        sorted_items = sorted(filtered_items, key=lambda x: x.get('score', 0), reverse=True)
+        # extract the highest and second-highest labels
+        if len(sorted_items) >= 2:
+            highest_label = sorted_items[0].get('label')
+            second_highest_label = sorted_items[1].get('label')
+        elif len(sorted_items) == 1:
+            highest_label = sorted_items[0].get('label')
+            second_highest_label = None
+        else:
+            print("Warning: Less than two items above the threshold in the current list.")
+            highest_label = None
+            second_highest_label = None
+    else:
+        print("Error: Inner data is not formatted correctly. Each item must be a dictionary.")
+        highest_label = None
+        second_highest_label = None
+    # Output dictionary of highest and second-highest labels to the all_predicted_labels list
+    predicted_labels = {"SECTOR1": highest_label, "SECTOR2": second_highest_label}
+    return predicted_labels
 # Function to call model and run inference for varying classification tasks/models
 def predict_category(df, model_name, progress_bar, repo, profile, multilabel=False):
     predictions = []
     total = len(df)
     for i, text in enumerate(df[col_name]):
+        prediction = model(text)
         if model_name in model_names_sf:
+            predictions.append(0 if prediction == 'NEGATIVE' else 1)
         elif model_name == 'ADAPMIT':
             predictions.append(re.sub('Label$', '', prediction[0]['label']))
         elif model_name == 'SECTOR':
         # Update progress bar with each iteration
         progress = (i + 1) / total
         progress_bar.progress(progress)
+    # st.write(predictions)
     return predictions
         elif model_name == 'ADAPMIT':
             df[model_name] = predict_category(df, model_name, progress_bar, repo='ADAPMIT-multilabel-bge_f', profile='GIZ')
         elif model_name == 'SECTOR':
+            sectors_dict = predict_category(df, model_name, progress_bar, repo='SECTOR-multilabel-bge_f', profile='GIZ', multilabel=True)
+            df['SECTOR1'] = [item['SECTOR1'] for item in sectors_dict]
+            df['SECTOR2'] = [item['SECTOR2'] for item in sectors_dict]
         elif model_name == 'LANG':
             df[model_name] = predict_category(df, model_name, progress_bar, repo='51-languages-classifier', profile='qanastek')
     st.write(f'Processing complete. Total time: {elapsed_time:.1f} seconds')
     # Further data processing and actions
+    sector_classes = ['Energy','Transport','Industries']
+    df['pred_score'] = df.apply(lambda x: round((x['fin_lab2']*2 + x['scope_lab1']*2 + x['scope_lab2']*2 + x['tech_lab1'] + x['tech_lab3'] + x['bar_lab2'])/9*10,0), axis=1)
+    df['pred_action'] = df.apply(lambda x: 'REJECT' if (x['pred_score'] <4 or x['LANG'] != 'en-US' or x['ADAPMIT'] == 'Adaptation' or not ((x['SECTOR1'] in sector_classes) or (x['SECTOR2'] in sector_classes))) else 'REVIEW', axis=1)
     return df

processed_applications.csv CHANGED Viewed

The diff for this file is too large to render. See raw diff