import glob, os, sys

sys.path.append('../udfPreprocess')

import udfPreprocess.docPreprocessing as pre
import udfPreprocess.cleaning as clean

import seaborn as sns
from pandas import DataFrame
from keybert import KeyBERT
from transformers import pipeline
import matplotlib.pyplot as plt
import numpy as np
import streamlit as st
import pandas as pd
import docx
from docx.shared import Inches
from docx.shared import Pt
from docx.enum.style import WD_STYLE_TYPE

import tempfile
import sqlite3
import logging
import configparser

logger = logging.getLogger(__name__)

@st.cache(allow_output_mutation=True)
def load_sdgClassifier():
    """Load and cache the SDG text-classification pipeline."""
    logger.info("Loading SDG classifier")
    classifier = pipeline("text-classification",
                          model="jonas/sdg_classifier_osdg")
    return classifier


def sdg_classification(par_list):
    """Run SDG classification on a list of paragraphs.

    Returns a DataFrame of paragraphs whose relevancy score exceeds the
    configured threshold, plus the per-SDG label counts.
    """
    logger.info("Running SDG classification")
    config = configparser.ConfigParser()
    with open('udfPreprocess/paramconfig.cfg') as configfile:
        config.read_file(configfile)
    threshold = float(config.get('sdg', 'THRESHOLD'))

    classifier = load_sdgClassifier()
    labels = classifier(par_list)

    labels_ = [(label['label'], label['score']) for label in labels]
    df2 = DataFrame(labels_, columns=["SDG", "Relevancy"])
    df2['text'] = par_list
    df2 = df2.sort_values(by="Relevancy", ascending=False).reset_index(drop=True)
    df2.index += 1

    # Keep only paragraphs above the relevancy threshold and count SDG labels.
    df2 = df2[df2['Relevancy'] > threshold]
    x = df2['SDG'].value_counts()

    # Drop the score column; callers only need the SDG label and the text.
    df3 = df2.copy()
    df3 = df3.drop(['Relevancy'], axis=1)

    return df3, x
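

# A minimal usage sketch (not part of the Streamlit app flow): classify a couple
# of hypothetical paragraphs and print the results. It assumes that
# udfPreprocess/paramconfig.cfg exists with an [sdg] section containing a
# THRESHOLD value, and that the Hugging Face model can be downloaded.
if __name__ == "__main__":
    sample_paragraphs = [
        "Access to clean water and sanitation remains limited in rural areas.",
        "The programme expands renewable energy capacity in coastal regions.",
    ]
    df_sdg, sdg_counts = sdg_classification(sample_paragraphs)
    print(df_sdg)
    print(sdg_counts)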