Spaces:
Sleeping
Sleeping
Include sector in filter logic
Browse files
- app.py +2 -2
- images/pipeline.png +0 -0
- modules/__pycache__/utils.cpython-38.pyc +0 -0
- modules/utils.py +37 -15
- processed_applications.csv +0 -0
app.py
CHANGED
@@ -102,8 +102,8 @@ def main():
|
|
102 |
df = st.session_state['df']
|
103 |
|
104 |
# Get the current date
|
105 |
-
|
106 |
-
output_filename = 'processed_applications_'+
|
107 |
|
108 |
output_file = 'processed_applications.csv'
|
109 |
df.to_csv(output_file, index=False)
|
|
|
102 |
df = st.session_state['df']
|
103 |
|
104 |
# Get the current date
|
105 |
+
current_datetime = datetime.now().strftime('%d-%m-%Y_%H-%M-%S')
|
106 |
+
output_filename = 'processed_applications_'+current_datetime+'.csv'
|
107 |
|
108 |
output_file = 'processed_applications.csv'
|
109 |
df.to_csv(output_file, index=False)
|
images/pipeline.png
CHANGED
modules/__pycache__/utils.cpython-38.pyc
CHANGED
Binary files a/modules/__pycache__/utils.cpython-38.pyc and b/modules/__pycache__/utils.cpython-38.pyc differ
|
|
modules/utils.py
CHANGED
@@ -38,19 +38,38 @@ def clean_text(input_text):
|
|
38 |
cleaned_text = re.sub(r"\s+", " ", cleaned_text)
|
39 |
cleaned_text = re.sub(r"\n+", "\n", cleaned_text)
|
40 |
return cleaned_text
|
|
|
41 |
|
|
|
|
|
42 |
|
43 |
-
#
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
# check if the score is above the threshold
|
48 |
-
if item.get('score', 0) > threshold:
|
49 |
-
# if yes, append the label to the predicted_labels list
|
50 |
-
predicted_labels.append(item.get('label'))
|
51 |
|
52 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
53 |
|
|
|
|
|
|
|
54 |
|
55 |
# Function to call model and run inference for varying classification tasks/models
|
56 |
def predict_category(df, model_name, progress_bar, repo, profile, multilabel=False):
|
@@ -66,10 +85,9 @@ def predict_category(df, model_name, progress_bar, repo, profile, multilabel=Fal
|
|
66 |
predictions = []
|
67 |
total = len(df)
|
68 |
for i, text in enumerate(df[col_name]):
|
69 |
-
|
70 |
-
prediction = model(text) # This is a placeholder for your actual prediction call
|
71 |
if model_name in model_names_sf:
|
72 |
-
predictions.append(0 if prediction == 'NEGATIVE' else 1)
|
73 |
elif model_name == 'ADAPMIT':
|
74 |
predictions.append(re.sub('Label$', '', prediction[0]['label']))
|
75 |
elif model_name == 'SECTOR':
|
@@ -79,6 +97,7 @@ def predict_category(df, model_name, progress_bar, repo, profile, multilabel=Fal
|
|
79 |
# Update progress bar with each iteration
|
80 |
progress = (i + 1) / total
|
81 |
progress_bar.progress(progress)
|
|
|
82 |
return predictions
|
83 |
|
84 |
|
@@ -128,7 +147,9 @@ def process_data(uploaded_file, sens_level):
|
|
128 |
elif model_name == 'ADAPMIT':
|
129 |
df[model_name] = predict_category(df, model_name, progress_bar, repo='ADAPMIT-multilabel-bge_f', profile='GIZ')
|
130 |
elif model_name == 'SECTOR':
|
131 |
-
|
|
|
|
|
132 |
elif model_name == 'LANG':
|
133 |
df[model_name] = predict_category(df, model_name, progress_bar, repo='51-languages-classifier', profile='qanastek')
|
134 |
|
@@ -155,7 +176,8 @@ def process_data(uploaded_file, sens_level):
|
|
155 |
st.write(f'Processing complete. Total time: {elapsed_time:.1f} seconds')
|
156 |
|
157 |
# Further data processing and actions
|
158 |
-
|
159 |
-
df['
|
|
|
160 |
|
161 |
return df
|
|
|
38 |
cleaned_text = re.sub(r"\s+", " ", cleaned_text)
|
39 |
cleaned_text = re.sub(r"\n+", "\n", cleaned_text)
|
40 |
return cleaned_text
|
41 |
+
|
42 |
|
43 |
+
# Function for extracting the highest and second-highest SECTOR labels above a score threshold
|
44 |
+
def extract_predicted_labels(output, ordinal_selection=1, threshold=0.5):
    """Return the two highest-scoring labels whose score exceeds ``threshold``.

    Args:
        output: Expected to be a list of dictionaries. Each dictionary is
            read with ``item.get('label')`` and ``item.get('score')``; a
            missing score is treated as 0.
        ordinal_selection: Not used by this function. It is kept in the
            signature so existing callers that pass it keep working.
        threshold: Only items whose score is strictly greater than this
            value are considered.

    Returns:
        A dictionary ``{"SECTOR1": ..., "SECTOR2": ...}`` holding the label
        with the highest score and the label with the second-highest score.
        A slot is ``None`` when there is no qualifying item for it, and both
        are ``None`` when ``output`` is not a list of dictionaries.
    """
    # Default both slots to None; they are only filled when qualifying
    # items exist.
    highest_label = None
    second_highest_label = None

    # verify output is a list of dictionaries
    if isinstance(output, list) and all(isinstance(item, dict) for item in output):
        # filter items with scores above the threshold
        filtered_items = [item for item in output if item.get('score', 0) > threshold]

        # sort the filtered items by score in descending order
        sorted_items = sorted(filtered_items, key=lambda x: x.get('score', 0), reverse=True)

        if sorted_items:
            highest_label = sorted_items[0].get('label')
            if len(sorted_items) >= 2:
                second_highest_label = sorted_items[1].get('label')
        else:
            # This branch is reached only when nothing passed the threshold,
            # so the message says exactly that (a single qualifying item is
            # a valid result and produces no warning).
            print("Warning: No items above the threshold in the current list.")
    else:
        print("Error: Inner data is not formatted correctly. Each item must be a dictionary.")

    # Output dictionary of highest and second-highest labels
    predicted_labels = {"SECTOR1": highest_label, "SECTOR2": second_highest_label}
    return predicted_labels
|
73 |
|
74 |
# Function to call model and run inference for varying classification tasks/models
|
75 |
def predict_category(df, model_name, progress_bar, repo, profile, multilabel=False):
|
|
|
85 |
predictions = []
|
86 |
total = len(df)
|
87 |
for i, text in enumerate(df[col_name]):
|
88 |
+
prediction = model(text)
|
|
|
89 |
if model_name in model_names_sf:
|
90 |
+
predictions.append(0 if prediction == 'NEGATIVE' else 1)
|
91 |
elif model_name == 'ADAPMIT':
|
92 |
predictions.append(re.sub('Label$', '', prediction[0]['label']))
|
93 |
elif model_name == 'SECTOR':
|
|
|
97 |
# Update progress bar with each iteration
|
98 |
progress = (i + 1) / total
|
99 |
progress_bar.progress(progress)
|
100 |
+
# st.write(predictions)
|
101 |
return predictions
|
102 |
|
103 |
|
|
|
147 |
elif model_name == 'ADAPMIT':
|
148 |
df[model_name] = predict_category(df, model_name, progress_bar, repo='ADAPMIT-multilabel-bge_f', profile='GIZ')
|
149 |
elif model_name == 'SECTOR':
|
150 |
+
sectors_dict = predict_category(df, model_name, progress_bar, repo='SECTOR-multilabel-bge_f', profile='GIZ', multilabel=True)
|
151 |
+
df['SECTOR1'] = [item['SECTOR1'] for item in sectors_dict]
|
152 |
+
df['SECTOR2'] = [item['SECTOR2'] for item in sectors_dict]
|
153 |
elif model_name == 'LANG':
|
154 |
df[model_name] = predict_category(df, model_name, progress_bar, repo='51-languages-classifier', profile='qanastek')
|
155 |
|
|
|
176 |
st.write(f'Processing complete. Total time: {elapsed_time:.1f} seconds')
|
177 |
|
178 |
# Further data processing and actions
|
179 |
+
sector_classes = ['Energy','Transport','Industries']
|
180 |
+
df['pred_score'] = df.apply(lambda x: round((x['fin_lab2']*2 + x['scope_lab1']*2 + x['scope_lab2']*2 + x['tech_lab1'] + x['tech_lab3'] + x['bar_lab2'])/9*10,0), axis=1)
|
181 |
+
df['pred_action'] = df.apply(lambda x: 'REJECT' if (x['pred_score'] <4 or x['LANG'] != 'en-US' or x['ADAPMIT'] == 'Adaptation' or not ((x['SECTOR1'] in sector_classes) or (x['SECTOR2'] in sector_classes))) else 'REVIEW', axis=1)
|
182 |
|
183 |
return df
|
processed_applications.csv
CHANGED
The diff for this file is too large to render.
See raw diff
|
|