jonas committed
Commit f51b958
1 Parent(s): 28998a0

add app.py

.DS_Store ADDED
Binary file (6.15 kB).
analyse_site.py ADDED
@@ -0,0 +1,43 @@
import streamlit as st

import glob, os, sys; sys.path.append('/src')
#import helper
import preprocessing as pre
import cleaning as clean

def app():
    # Sidebar
    st.sidebar.title('Analyse Policy Document')

    # Container
    with st.container():
        st.markdown("<h1 style='text-align: center; color: black;'>SDSN X GIZ Policy Tracing</h1>",
                    unsafe_allow_html=True)

        file = st.file_uploader('Upload PDF File', type=['pdf', 'docx', 'txt'])

        if file is not None:
            st.write("Filename: ", file.name)
            # text = []
            # with pdfplumber.open(file) as pdf:
            #     for page in pdf.pages:
            #         text.append(page.extract_text())
            # text_str = ' '.join([page for page in text])

            # st.write('Number of pages:', len(pdf.pages))

            # load document (load_document expects the file plus an object exposing .name,
            # see src/preprocessing.py)
            docs = pre.load_document(file, file)

            # preprocess document (preprocessing returns four values, see src/cleaning.py)
            docs_processed, df, all_text, par_list = clean.preprocessing(docs)

            st.write('... ')

        else:
            st.write(' ')
            st.write(' ')
            st.markdown("<h3 style='text-align: center; color: black;'>no PDF uploaded ...</h3>",
                        unsafe_allow_html=True)
app.py ADDED
@@ -0,0 +1,15 @@
import analyse_site
import main_site
import check_site
from multiapp import MultiApp
import streamlit as st

st.set_page_config('SDSN x GIZ Policy Tracing', layout="wide")

app = MultiApp()

app.add_app("SDSN X GIZ Policy Tracing", main_site.app)
app.add_app("Analyse Policy Document", analyse_site.app)
app.add_app("Check Coherence", check_site.app)

app.run()
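app.py is the entry point that registers the three pages with MultiApp; assuming a standard Streamlit installation, the app would typically be launched locally with "streamlit run app.py".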
check_site.py ADDED
@@ -0,0 +1,16 @@
import streamlit as st
from PIL import Image


def app():
    # Sidebar
    st.sidebar.title('Check Coherence')
    st.sidebar.write(' ')
    st.sidebar.selectbox('Select NDC', ('South Africa', 'Ethiopia'))

    # Container
    c1, c2, c3 = st.columns([1, 7, 1])
    c2.markdown("<h1 style='text-align: center; color: black;'>SDSN X GIZ Policy Tracing</h1>", unsafe_allow_html=True)
    c1, c2, c3 = st.columns([1.8, 7, 1])
    image = Image.open('pic1.PNG')
    c2.image(image, width=1000)
giz_sdsn.jpg ADDED
img/150723_Kenya_First NDC0 ADDED
Binary file (525 kB).
img/ndc_policy.png ADDED
img/sdsn.png ADDED
img/semantic_search.png ADDED
img/topics.png ADDED
main_site.py ADDED
@@ -0,0 +1,179 @@
# set path
import glob, os, sys; sys.path.append('/src')

#import helper
import preprocessing as pre
import cleaning as clean

#import needed libraries
import seaborn as sns
from pandas import DataFrame
from keybert import KeyBERT
from transformers import pipeline
import matplotlib.pyplot as plt
import numpy as np
import streamlit as st
import pandas as pd

import tempfile

def app():

    with st.container():
        st.markdown("<h1 style='text-align: center; color: black;'> Policy Action Tracking</h1>", unsafe_allow_html=True)
        st.write(' ')
        st.write(' ')

    with st.expander("ℹ️ - About this app", expanded=True):

        st.write(
            """
            The *Policy Action Tracker* app is an easy-to-use interface built in Streamlit for analysing policy documents - developed by GIZ Data and the Sustainable Development Solutions Network.

            It uses a minimal keyword extraction technique that leverages multiple NLP embeddings and relies on [Transformers](https://huggingface.co/transformers/) 🤗 to create keywords/keyphrases that are most similar to a document.
            """
        )

        st.markdown("")

    st.markdown("")
    st.markdown("## 📌 Step One: Upload document ")

    with st.container():

        file = st.file_uploader('Upload PDF File', type=['pdf', 'docx', 'txt'])

        if file is not None:

            with tempfile.NamedTemporaryFile(mode="wb") as temp:
                bytes_data = file.getvalue()
                temp.write(bytes_data)

                st.write("Filename: ", file.name)

                # load document
                docs = pre.load_document(temp.name, file)

                # preprocess document
                docs_processed, df, all_text, par_list = clean.preprocessing(docs)

                # testing
                # st.write(len(all_text))
                # for i in par_list:
                #     st.write(i)

                @st.cache(allow_output_mutation=True)
                def load_keyBert():
                    return KeyBERT()

                kw_model = load_keyBert()

                keywords = kw_model.extract_keywords(
                    all_text,
                    keyphrase_ngram_range=(1, 2),
                    use_mmr=True,
                    stop_words="english",
                    top_n=15,
                    diversity=0.7,
                )

                st.markdown("## 🎈 What is my document about?")

                df = (
                    DataFrame(keywords, columns=["Keyword/Keyphrase", "Relevancy"])
                    .sort_values(by="Relevancy", ascending=False)
                    .reset_index(drop=True)
                )

                df.index += 1

                # Add styling
                cmGreen = sns.light_palette("green", as_cmap=True)
                cmRed = sns.light_palette("red", as_cmap=True)
                df = df.style.background_gradient(
                    cmap=cmGreen,
                    subset=[
                        "Relevancy",
                    ],
                )
                c1, c2, c3 = st.columns([1, 3, 1])

                format_dictionary = {
                    "Relevancy": "{:.1%}",
                }

                df = df.format(format_dictionary)

                with c2:
                    st.table(df)

                ######## SDG classification
                # @st.cache(allow_output_mutation=True)
                # def load_sdgClassifier():
                #     classifier = pipeline("text-classification", model="../models/osdg_sdg/")
                #     return classifier

                # load from disc (github repo) for performance boost
                @st.cache(allow_output_mutation=True)
                def load_sdgClassifier():
                    classifier = pipeline("text-classification", model="../models/osdg_sdg/")
                    return classifier

                classifier = load_sdgClassifier()

                # not needed, par_list comes from the preprocessing function already
                # word_list = all_text.split()
                # len_word_list = len(word_list)
                # par_list = []
                # par_len = 130
                # for i in range(0, len_word_list // par_len):
                #     string_part = ' '.join(word_list[i*par_len:(i+1)*par_len])
                #     par_list.append(string_part)

                labels = classifier(par_list)
                labels_ = [(l['label'], l['score']) for l in labels]
                df = DataFrame(labels_, columns=["SDG", "Relevancy"])
                df['text'] = par_list
                df = df.sort_values(by="Relevancy", ascending=False).reset_index(drop=True)
                df.index += 1
                df = df[df['Relevancy'] > .85]
                x = df['SDG'].value_counts()

                plt.rcParams['font.size'] = 25
                colors = plt.get_cmap('Blues')(np.linspace(0.2, 0.7, len(x)))
                # plot
                fig, ax = plt.subplots()
                ax.pie(x, colors=colors, radius=2, center=(4, 4),
                       wedgeprops={"linewidth": 1, "edgecolor": "white"}, frame=False, labels=list(x.index))

                st.markdown("## 🎈 Anything related to SDGs?")

                c4, c5, c6 = st.columns([5, 7, 1])

                # Add styling
                cmGreen = sns.light_palette("green", as_cmap=True)
                cmRed = sns.light_palette("red", as_cmap=True)
                df = df.style.background_gradient(
                    cmap=cmGreen,
                    subset=[
                        "Relevancy",
                    ],
                )

                format_dictionary = {
                    "Relevancy": "{:.1%}",
                }

                df = df.format(format_dictionary)

                with c4:
                    st.pyplot(fig)
                with c5:
                    st.table(df)
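The keyword step above relies on KeyBERT with Maximal Marginal Relevance; below is a minimal standalone sketch of that call on a made-up sentence (the exact keyphrases and scores depend on the underlying embedding model):

from keybert import KeyBERT

kw_model = KeyBERT()

# made-up example text, not taken from any policy document
text = "The national strategy strengthens renewable energy investment and climate adaptation measures."

keywords = kw_model.extract_keywords(
    text,
    keyphrase_ngram_range=(1, 2),   # unigrams and bigrams
    use_mmr=True,                   # Maximal Marginal Relevance for more diverse keyphrases
    stop_words="english",
    top_n=5,
    diversity=0.7,
)

print(keywords)  # list of (keyphrase, similarity score) tuples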
multiapp.py ADDED
@@ -0,0 +1,51 @@
"""Frameworks for running multiple Streamlit applications as a single app.
"""
import streamlit as st
from PIL import Image

class MultiApp:
    """Framework for combining multiple streamlit applications.
    Usage:
        def foo():
            st.title("Hello Foo")
        def bar():
            st.title("Hello Bar")
        app = MultiApp()
        app.add_app("Foo", foo)
        app.add_app("Bar", bar)
        app.run()
    It is also possible to keep each application in a separate file.
        import foo
        import bar
        app = MultiApp()
        app.add_app("Foo", foo.app)
        app.add_app("Bar", bar.app)
        app.run()
    """
    def __init__(self):
        self.apps = []

    def add_app(self, title, func):
        """Adds a new application.
        Parameters
        ----------
        func:
            the python function to render this app.
        title:
            title of the app. Appears in the dropdown in the sidebar.
        """
        self.apps.append({
            "title": title,
            "function": func
        })

    def run(self):
        image = Image.open('giz_sdsn.jpg')
        st.sidebar.image(image)
        app = st.sidebar.radio(
            'Go To',
            self.apps,
            format_func=lambda app: app['title'])

        app['function']()
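run() dispatches pages by letting st.sidebar.radio return the selected app dict and then calling the stored function. A minimal self-contained sketch of the same pattern (the page names and functions below are illustrative, not from this repository):

import streamlit as st

def home():
    st.title("Home")

def about():
    st.title("About")

# each entry pairs a sidebar label with the function that renders that page
pages = [
    {"title": "Home", "function": home},
    {"title": "About", "function": about},
]

# radio returns the selected dict; format_func controls the label shown in the sidebar
choice = st.sidebar.radio("Go To", pages, format_func=lambda p: p["title"])
choice["function"]()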
paris.png ADDED
pic1.PNG ADDED
requirements.txt ADDED
@@ -0,0 +1,12 @@
django_haystack==3.2.1
spacy==3.2.0
https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0.tar.gz#egg=en_core_web_sm
keybert==0.5.1
matplotlib==3.5.1
nltk==3.7
numpy==1.22.1
pandas==1.4.0
pdfplumber==0.6.2
Pillow==9.1.1
seaborn==0.11.2
transformers==4.13.0
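Assuming a standard Python environment, these pinned dependencies (including the en_core_web_sm spaCy model referenced by URL) would typically be installed with "pip install -r requirements.txt" before launching the Streamlit app.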
src/__init__.py ADDED
@@ -0,0 +1,8 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Oct 5 2020

@author: jonas
"""
src/__pycache__/cleaning.cpython-39.pyc ADDED
Binary file (2.94 kB).
src/__pycache__/preprocessing.cpython-39.pyc ADDED
Binary file (2.11 kB).
src/cleaning.py ADDED
@@ -0,0 +1,124 @@
import pandas as pd
import numpy as np
import string
import nltk
import spacy
import en_core_web_sm
import re
import streamlit as st

from haystack.nodes import PreProcessor

'''basic cleaning - suitable for transformer models'''
def basic(s):
    """
    :param s: string to be processed
    :return: processed string: see comments in the source code for more info
    """
    # Text lowercase
    s = s.lower()
    # Remove punctuation
    translator = str.maketrans(' ', ' ', string.punctuation)
    s = s.translate(translator)
    # Remove URLs
    s = re.sub(r'^https?:\/\/.*[\r\n]*', ' ', s, flags=re.MULTILINE)
    s = re.sub(r"http\S+", " ", s)
    # Remove new line characters
    s = re.sub('\n', ' ', s)

    # Remove distracting single quotes
    s = re.sub("\'", " ", s)
    # Remove all remaining numbers and non-alphanumeric characters
    s = re.sub(r'\d+', ' ', s)
    s = re.sub(r'\W+', ' ', s)

    # define custom words to replace:
    #s = re.sub(r'strengthenedstakeholder', 'strengthened stakeholder', s)

    return s.strip()


def preprocessing(document):
    """
    Takes in a list of haystack Document objects, splits them into paragraphs and applies simple cleaning.

    Returns the cleaned list of haystack Document objects (one paragraph per object), a pandas DataFrame,
    a string containing all text joined together, and the list of paragraph strings.
    """

    preprocessor = PreProcessor(
        clean_empty_lines=True,
        clean_whitespace=True,
        clean_header_footer=True,
        split_by="word",
        split_length=120,
        split_respect_sentence_boundary=True,
        #split_overlap=5
    )
    for i in document:
        docs_processed = preprocessor.process([i])
        for item in docs_processed:
            item.content = basic(item.content)

    st.write("your document has been split into", len(docs_processed), "paragraphs")

    # create dataframe of text and list of all text
    df = pd.DataFrame(docs_processed)
    all_text = " ".join(df.content.to_list())
    par_list = df.content.to_list()

    return docs_processed, df, all_text, par_list

'''processing with spacy - suitable for models such as tf-idf, word2vec'''
def spacy_clean(alpha: str, use_nlp: bool = True) -> str:
    """
    Clean and tokenise a string using spaCy. Keeps only alphabetic characters, removes stopwords and
    filters out all but proper nouns, nouns, verbs and adjectives.

    Parameters
    ----------
    alpha : str
        The input string.
    use_nlp : bool, default True
        Indicates whether spaCy needs to run its NLP pipeline. Enable this when using this function on its own.
        Should be set to False if used inside nlp.pipeline.

    Returns
    -------
    ' '.join(beta) : a concatenated list of lemmatised tokens, i.e. a processed string

    Notes
    -----
    Fails if alpha is an NA value. Performance decreases as len(alpha) gets large.
    Use together with nlp.pipeline for batch processing.
    """

    nlp = spacy.load("en_core_web_sm", disable=["parser", "ner", "textcat"])

    if use_nlp:
        alpha = nlp(alpha)

    beta = []
    for tok in alpha:
        if all([tok.is_alpha, not tok.is_stop, tok.pos_ in ['PROPN', 'NOUN', 'VERB', 'ADJ']]):
            beta.append(tok.lemma_)

    text = ' '.join(beta)
    text = text.lower()
    return text
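As a quick illustration of what basic() produces, a sketch on a made-up sentence (the printed output is approximate):

import sys; sys.path.append('src')  # assuming the repository root is the working directory

from cleaning import basic

sample = "In 2021, the Ministry launched https://example.org and 3 new programmes.\nIt's ambitious."
print(basic(sample))
# prints something like: in the ministry launched and new programmes its ambitious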
src/preprocessing.py ADDED
@@ -0,0 +1,63 @@
from typing import Callable, Dict, List, Optional

from pathlib import Path
import re
import logging
import string
import streamlit as st
logger = logging.getLogger(__name__)

import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

from haystack.utils import convert_files_to_docs, fetch_archive_from_http
from haystack.nodes.file_converter import BaseConverter, DocxToTextConverter, PDFToTextConverter, TextConverter
from haystack.schema import Document
import pdfplumber

import pandas as pd

def load_document(
    file: str,
    file_name,
    encoding: Optional[str] = None,
    id_hash_keys: Optional[List[str]] = None,
) -> List[Document]:

    """
    Takes docx, txt and pdf files as input and extracts the text as well as the filename as metadata. Since haystack
    does not handle every pdf file, pdfplumber is attached to the pipeline in case the pdf extraction via haystack
    fails.

    Returns a list of type haystack.schema.Document.
    """

    if file_name.name.endswith('.pdf'):
        converter = PDFToTextConverter(remove_numeric_tables=True)
    if file_name.name.endswith('.txt'):
        converter = TextConverter()
    if file_name.name.endswith('.docx'):
        converter = DocxToTextConverter()


    documents = []
    logger.info("Converting {}".format(file_name))
    # PDFToTextConverter, TextConverter, and DocxToTextConverter return a list containing a single Document
    document = converter.convert(
        file_path=file, meta=None, encoding=encoding, id_hash_keys=id_hash_keys
    )[0]
    text = document.content
    documents.append(Document(content=text, meta={"name": file_name}, id_hash_keys=id_hash_keys))

    '''check if text is empty and apply a different pdf processor. This can happen with certain pdf types.'''
    for i in documents:
        if i.content == "":
            st.write("using pdfplumber")
            text = []
            with pdfplumber.open(file) as pdf:
                for page in pdf.pages:
                    text.append(page.extract_text())
            i.content = ' '.join([page for page in text])

    return documents
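For orientation, a minimal sketch of how load_document and preprocessing fit together outside Streamlit; the file path is illustrative, and the haystack PDF converter's system dependencies are assumed to be installed:

import sys; sys.path.append('src')

import preprocessing as pre
import cleaning as clean

# load_document only reads .name from its second argument (Streamlit's UploadedFile in the app),
# so a small stand-in object is enough here
class UploadedStub:
    name = "example_policy.pdf"  # illustrative file name, not part of this repository

docs = pre.load_document("example_policy.pdf", UploadedStub())
docs_processed, df, all_text, par_list = clean.preprocessing(docs)
print(len(par_list), "paragraphs extracted")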