mtyrrell committed on
Commit
747caaf
1 Parent(s): c30cd54

sector incl. in filter logic

Browse files
app.py CHANGED
@@ -102,8 +102,8 @@ def main():
102
  df = st.session_state['df']
103
 
104
  # Get the current date
105
- date = datetime.now().strftime('%d-%m-%Y')
106
- output_filename = 'processed_applications_'+date+'.csv'
107
 
108
  output_file = 'processed_applications.csv'
109
  df.to_csv(output_file, index=False)
 
102
  df = st.session_state['df']
103
 
104
  # Get the current date
105
+ current_datetime = datetime.now().strftime('%d-%m-%Y_%H-%M-%S')
106
+ output_filename = 'processed_applications_'+current_datetime+'.csv'
107
 
108
  output_file = 'processed_applications.csv'
109
  df.to_csv(output_file, index=False)
images/pipeline.png CHANGED
modules/__pycache__/utils.cpython-38.pyc CHANGED
Binary files a/modules/__pycache__/utils.cpython-38.pyc and b/modules/__pycache__/utils.cpython-38.pyc differ
 
modules/utils.py CHANGED
@@ -38,19 +38,38 @@ def clean_text(input_text):
38
  cleaned_text = re.sub(r"\s+", " ", cleaned_text)
39
  cleaned_text = re.sub(r"\n+", "\n", cleaned_text)
40
  return cleaned_text
 
41
 
 
 
42
 
43
- # Function for extracting classifications for each SECTOR label
44
- def extract_predicted_labels(outputs, threshold=0.5):
45
- predicted_labels = []
46
- for item in outputs:
47
- # check if the score is above the threshold
48
- if item.get('score', 0) > threshold:
49
- # if yes, append the label to the predicted_labels list
50
- predicted_labels.append(item.get('label'))
51
 
52
- return predicted_labels
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
 
 
 
 
54
 
55
  # Function to call model and run inference for varying classification tasks/models
56
  def predict_category(df, model_name, progress_bar, repo, profile, multilabel=False):
@@ -66,10 +85,9 @@ def predict_category(df, model_name, progress_bar, repo, profile, multilabel=Fal
66
  predictions = []
67
  total = len(df)
68
  for i, text in enumerate(df[col_name]):
69
- # Simulate prediction
70
- prediction = model(text) # This is a placeholder for your actual prediction call
71
  if model_name in model_names_sf:
72
- predictions.append(0 if prediction == 'NEGATIVE' else 1) # Modify according to actual model output
73
  elif model_name == 'ADAPMIT':
74
  predictions.append(re.sub('Label$', '', prediction[0]['label']))
75
  elif model_name == 'SECTOR':
@@ -79,6 +97,7 @@ def predict_category(df, model_name, progress_bar, repo, profile, multilabel=Fal
79
  # Update progress bar with each iteration
80
  progress = (i + 1) / total
81
  progress_bar.progress(progress)
 
82
  return predictions
83
 
84
 
@@ -128,7 +147,9 @@ def process_data(uploaded_file, sens_level):
128
  elif model_name == 'ADAPMIT':
129
  df[model_name] = predict_category(df, model_name, progress_bar, repo='ADAPMIT-multilabel-bge_f', profile='GIZ')
130
  elif model_name == 'SECTOR':
131
- df[model_name] = predict_category(df, model_name, progress_bar, repo='SECTOR-multilabel-bge_f', profile='GIZ', multilabel=True)
 
 
132
  elif model_name == 'LANG':
133
  df[model_name] = predict_category(df, model_name, progress_bar, repo='51-languages-classifier', profile='qanastek')
134
 
@@ -155,7 +176,8 @@ def process_data(uploaded_file, sens_level):
155
  st.write(f'Processing complete. Total time: {elapsed_time:.1f} seconds')
156
 
157
  # Further data processing and actions
158
- df['pred_score'] = df.apply(lambda x: round((x['fin_lab2']*2 + x['scope_lab1']*2 + x['scope_lab2']*2 + x['tech_lab1'] + x['tech_lab3'] + x['bar_lab2'])/9*10, 0), axis=1)
159
- df['pred_action'] = df.apply(lambda x: 'REJECT' if (x['pred_score'] < sens_level or x['LANG'] != 'en-US' or x['ADAPMIT'] == 'Adaptation') else 'REVIEW', axis=1)
 
160
 
161
  return df
 
38
  cleaned_text = re.sub(r"\s+", " ", cleaned_text)
39
  cleaned_text = re.sub(r"\n+", "\n", cleaned_text)
40
  return cleaned_text
41
+
42
 
43
+ # # Function for extracting classifications for each SECTOR label
44
+ def extract_predicted_labels(output, ordinal_selection=1, threshold=0.5):
45
 
46
+ # verify output is a list of dictionaries
47
+ if isinstance(output, list) and all(isinstance(item, dict) for item in output):
48
+ # filter items with scores above the threshold
49
+ filtered_items = [item for item in output if item.get('score', 0) > threshold]
 
 
 
 
50
 
51
+ # sort the filtered items by score in descending order
52
+ sorted_items = sorted(filtered_items, key=lambda x: x.get('score', 0), reverse=True)
53
+
54
+ # extract the highest and second-highest labels
55
+ if len(sorted_items) >= 2:
56
+ highest_label = sorted_items[0].get('label')
57
+ second_highest_label = sorted_items[1].get('label')
58
+ elif len(sorted_items) == 1:
59
+ highest_label = sorted_items[0].get('label')
60
+ second_highest_label = None
61
+ else:
62
+ print("Warning: Less than two items above the threshold in the current list.")
63
+ highest_label = None
64
+ second_highest_label = None
65
+ else:
66
+ print("Error: Inner data is not formatted correctly. Each item must be a dictionary.")
67
+ highest_label = None
68
+ second_highest_label = None
69
 
70
+ # Output dictionary of highest and second-highest labels to the all_predicted_labels list
71
+ predicted_labels = {"SECTOR1": highest_label, "SECTOR2": second_highest_label}
72
+ return predicted_labels
73
 
74
  # Function to call model and run inference for varying classification tasks/models
75
  def predict_category(df, model_name, progress_bar, repo, profile, multilabel=False):
 
85
  predictions = []
86
  total = len(df)
87
  for i, text in enumerate(df[col_name]):
88
+ prediction = model(text)
 
89
  if model_name in model_names_sf:
90
+ predictions.append(0 if prediction == 'NEGATIVE' else 1)
91
  elif model_name == 'ADAPMIT':
92
  predictions.append(re.sub('Label$', '', prediction[0]['label']))
93
  elif model_name == 'SECTOR':
 
97
  # Update progress bar with each iteration
98
  progress = (i + 1) / total
99
  progress_bar.progress(progress)
100
+ # st.write(predictions)
101
  return predictions
102
 
103
 
 
147
  elif model_name == 'ADAPMIT':
148
  df[model_name] = predict_category(df, model_name, progress_bar, repo='ADAPMIT-multilabel-bge_f', profile='GIZ')
149
  elif model_name == 'SECTOR':
150
+ sectors_dict = predict_category(df, model_name, progress_bar, repo='SECTOR-multilabel-bge_f', profile='GIZ', multilabel=True)
151
+ df['SECTOR1'] = [item['SECTOR1'] for item in sectors_dict]
152
+ df['SECTOR2'] = [item['SECTOR2'] for item in sectors_dict]
153
  elif model_name == 'LANG':
154
  df[model_name] = predict_category(df, model_name, progress_bar, repo='51-languages-classifier', profile='qanastek')
155
 
 
176
  st.write(f'Processing complete. Total time: {elapsed_time:.1f} seconds')
177
 
178
  # Further data processing and actions
179
+ sector_classes = ['Energy','Transport','Industries']
180
+ df['pred_score'] = df.apply(lambda x: round((x['fin_lab2']*2 + x['scope_lab1']*2 + x['scope_lab2']*2 + x['tech_lab1'] + x['tech_lab3'] + x['bar_lab2'])/9*10,0), axis=1)
181
+ df['pred_action'] = df.apply(lambda x: 'REJECT' if (x['pred_score'] <4 or x['LANG'] != 'en-US' or x['ADAPMIT'] == 'Adaptation' or not ((x['SECTOR1'] in sector_classes) or (x['SECTOR2'] in sector_classes))) else 'REVIEW', axis=1)
182
 
183
  return df
processed_applications.csv CHANGED
The diff for this file is too large to render. See raw diff