import matplotlib.pyplot as plt
import numpy as np
import pdfplumber
import seaborn as sns
import streamlit as st
from keybert import KeyBERT
from pandas import DataFrame

# Must be the first Streamlit command executed in the script.
st.set_page_config('SDSN x GIZ Policy Tracing', layout="wide")


@st.cache(allow_output_mutation=True)
def load_model():
    """Load (and cache across reruns) the KeyBERT keyword-extraction model."""
    return KeyBERT()


def read_(file):
    """Extract the full text of an uploaded PDF.

    Parameters
    ----------
    file : UploadedFile or None
        Object returned by ``st.file_uploader``; ``None`` when nothing
        has been uploaded yet.

    Returns
    -------
    str or None
        All page texts joined with spaces, or ``None`` when no file was
        given (implicit, matching the original behaviour).
    """
    if file is not None:
        text = []
        with pdfplumber.open(file) as pdf:
            for page in pdf.pages:
                text.append(page.extract_text())
            text_str = ' '.join([page for page in text])
            # Show basic document info in the main panel.
            st.write('Document:', pdf.metadata)
            st.write('Number of pages:', len(pdf.pages))
        # NOTE: the context manager closes the PDF; no explicit close needed.
        return text_str


# ---------------------------------------------------------------- sidebar ----
st.sidebar.image(
    "https://github.com/gizdatalab/policy_tracing/blob/main/img/sdsn.png?raw=true",
    use_column_width=True,
)
st.sidebar.markdown("## 📌 Step One: Upload document ")
with st.sidebar:
    file = st.file_uploader('Upload PDF File', type=['pdf'])

st.sidebar.title("Options:")
st.sidebar.markdown(
    "You can freely browse the different chapters - ie example prompts from "
    "different people - and see the results."
)
selected_date = st.sidebar.selectbox(
    "Please select the chapter you want to read:", ['c1', 'c2']
)

# ------------------------------------------------------------- main panel ----
with st.container():
    # NOTE(review): the original header HTML was garbled in this copy of the
    # file; reconstructed as a centered title — confirm against the repo.
    st.markdown(
        "<h1 style='text-align: center;'>SDSN X GIZ - Policy Action Tracking</h1>",
        unsafe_allow_html=True,
    )

st.write(' ')
st.write(' ')

with st.expander("ℹī¸ - About this app", expanded=True):
    st.write(
        """
The *Policy Action Tracker* app is an easy-to-use interface built with Streamlit for analyzing policy documents - developed by GIZ Data and the Sustainable Development Solution Network.

It uses a minimal keyword extraction technique that leverages multiple NLP embeddings and relies on [Transformers] (https://huggingface.co/transformers/) 🤗 to create keywords/keyphrases that are most similar to a document.
"""
    )

st.markdown("")
st.markdown("")

with st.container():
    st.markdown("## 📌 Step One: Upload document ")
    text_str = read_(file)

# ----------------------------------------------------- keyword extraction ----
kw_model = load_model()

# MMR (maximal marginal relevance) with high diversity spreads the top-10
# keyphrases across different topics instead of near-duplicates.
keywords = kw_model.extract_keywords(
    text_str,
    keyphrase_ngram_range=(1, 2),
    use_mmr=True,
    stop_words="english",
    top_n=10,
    diversity=0.7,
)

st.markdown("## 🎈 What is my document about?")

df = (
    DataFrame(keywords, columns=["Keyword/Keyphrase", "Relevancy"])
    .sort_values(by="Relevancy", ascending=False)
    .reset_index(drop=True)
)
df.index += 1  # 1-based row numbers for display

# Styling: green gradient on the relevancy column.
cmGreen = sns.light_palette("green", as_cmap=True)
cmRed = sns.light_palette("red", as_cmap=True)
df = df.style.background_gradient(
    cmap=cmGreen,
    subset=["Relevancy"],
)

c1, c2, c3 = st.columns([1, 3, 1])

format_dictionary = {"Relevancy": "{:.1%}"}
df = df.format(format_dictionary)

with c2:
    st.table(df)
from transformers import pipeline

# ------------------------------------------------------ SDG classification ---
# Fine-tuned OSDG classifier from the Hugging Face hub.
finetuned_checkpoint = "jonas/sdg_classifier_osdg"
classifier = pipeline("text-classification", model=finetuned_checkpoint)

# Split the document into ~130-word paragraphs for the classifier.
word_list = text_str.split()
len_word_list = len(word_list)
par_list = []
par_len = 130
for i in range(0, len_word_list // par_len):
    string_part = ' '.join(word_list[i * par_len:(i + 1) * par_len])
    par_list.append(string_part)
# BUG FIX: keep the trailing words that don't fill a whole 130-word chunk.
# Previously they were silently dropped, and a document shorter than
# `par_len` words produced an empty `par_list` (empty table and pie chart).
remainder = word_list[(len_word_list // par_len) * par_len:]
if remainder:
    par_list.append(' '.join(remainder))

labels = classifier(par_list)
labels_ = [(l['label'], l['score']) for l in labels]

df = DataFrame(labels_, columns=["SDG", "Relevancy"])
df['text'] = ['... ' + par + ' ...' for par in par_list]
df = df.sort_values(by="Relevancy", ascending=False).reset_index(drop=True)
df.index += 1  # 1-based row numbers for display
# Keep only confidently classified paragraphs.
df = df[df['Relevancy'] > .9]

# Frequency of each SDG among the high-confidence paragraphs.
x = df['SDG'].value_counts()

plt.rcParams['font.size'] = 25
colors = plt.get_cmap('Blues')(np.linspace(0.2, 0.7, len(x)))

# Pie chart of SDG frequencies.
fig, ax = plt.subplots()
ax.pie(x, colors=colors, radius=2, center=(4, 4),
       wedgeprops={"linewidth": 1, "edgecolor": "white"},
       frame=False, labels=list(x.index))

st.markdown("## 🎈 Anything related to SDGs?")
c4, c5, c6 = st.columns([5, 7, 1])

# Styling: green gradient + percentage formatting on the relevancy column.
cmGreen = sns.light_palette("green", as_cmap=True)
cmRed = sns.light_palette("red", as_cmap=True)
df = df.style.background_gradient(
    cmap=cmGreen,
    subset=["Relevancy"],
)
format_dictionary = {"Relevancy": "{:.1%}"}
df = df.format(format_dictionary)

with c4:
    st.pyplot(fig)
with c5:
    st.table(df)