jonas committed
Commit f51b958
1 Parent(s): 28998a0

add app.py

.DS_Store ADDED
Binary file (6.15 kB).
analyse_site.py ADDED
@@ -0,0 +1,43 @@
import streamlit as st

import glob, os, sys; sys.path.append('/src')
#import helper
import preprocessing as pre
import cleaning as clean

def app():
    # Sidebar
    st.sidebar.title('Analyse Policy Document')

    # Container
    with st.container():
        st.markdown("<h1 style='text-align: center; color: black;'>SDSN X GIZ Policy Tracing</h1>",
                    unsafe_allow_html=True)

        file = st.file_uploader('Upload PDF File', type=['pdf', 'docx', 'txt'])

        if file is not None:
            st.write("Filename: ", file.name)
            # text = []
            # with pdfplumber.open(file) as pdf:
            #     for page in pdf.pages:
            #         text.append(page.extract_text())
            # text_str = ' '.join([page for page in text])

            # st.write('Number of pages:', len(pdf.pages))

            # load document (load_document expects the file plus an object exposing .name,
            # see src/preprocessing.py)
            docs = pre.load_document(file, file)

            # preprocess document (preprocessing returns four values, see src/cleaning.py)
            docs_processed, df, all_text, par_list = clean.preprocessing(docs)

            st.write('... ')

        else:
            st.write(' ')
            st.write(' ')
            st.markdown("<h3 style='text-align: center; color: black;'>no PDF uploaded ...</h3>",
                        unsafe_allow_html=True)
app.py ADDED
@@ -0,0 +1,15 @@
import analyse_site
import main_site
import check_site
from multiapp import MultiApp
import streamlit as st

st.set_page_config('SDSN x GIZ Policy Tracing', layout="wide")

app = MultiApp()

app.add_app("SDSN X GIZ Policy Tracing", main_site.app)
app.add_app("Analyse Policy Document", analyse_site.app)
app.add_app("Check Coherence", check_site.app)

app.run()
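app.py is the entry point that registers the three pages with MultiApp; assuming a standard Streamlit installation, the app would typically be launched locally with "streamlit run app.py".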
check_site.py ADDED
@@ -0,0 +1,16 @@
import streamlit as st
from PIL import Image


def app():
    # Sidebar
    st.sidebar.title('Check Coherence')
    st.sidebar.write(' ')
    st.sidebar.selectbox('Select NDC', ('South Africa', 'Ethiopia'))

    # Container
    c1, c2, c3 = st.columns([1, 7, 1])
    c2.markdown("<h1 style='text-align: center; color: black;'>SDSN X GIZ Policy Tracing</h1>", unsafe_allow_html=True)
    c1, c2, c3 = st.columns([1.8, 7, 1])
    image = Image.open('pic1.PNG')
    c2.image(image, width=1000)
giz_sdsn.jpg ADDED
img/150723_Kenya_First NDC0 ADDED
Binary file (525 kB).
img/ndc_policy.png ADDED
img/sdsn.png ADDED
img/semantic_search.png ADDED
img/topics.png ADDED
main_site.py ADDED
@@ -0,0 +1,179 @@
# set path
import glob, os, sys; sys.path.append('/src')

#import helper
import preprocessing as pre
import cleaning as clean

#import needed libraries
import seaborn as sns
from pandas import DataFrame
from keybert import KeyBERT
from transformers import pipeline
import matplotlib.pyplot as plt
import numpy as np
import streamlit as st
import pandas as pd

import tempfile

def app():

    with st.container():
        st.markdown("<h1 style='text-align: center; color: black;'> Policy Action Tracking</h1>", unsafe_allow_html=True)
        st.write(' ')
        st.write(' ')

    with st.expander("ℹ️ - About this app", expanded=True):

        st.write(
            """
            The *Policy Action Tracker* app is an easy-to-use interface built in Streamlit for analysing policy documents - developed by GIZ Data and the Sustainable Development Solutions Network.

            It uses a minimal keyword extraction technique that leverages multiple NLP embeddings and relies on [Transformers](https://huggingface.co/transformers/) 🤗 to create keywords/keyphrases that are most similar to a document.
            """
        )

        st.markdown("")

    st.markdown("")
    st.markdown("## 📌 Step One: Upload document ")

    with st.container():

        file = st.file_uploader('Upload PDF File', type=['pdf', 'docx', 'txt'])

        if file is not None:

            with tempfile.NamedTemporaryFile(mode="wb") as temp:
                bytes_data = file.getvalue()
                temp.write(bytes_data)

                st.write("Filename: ", file.name)

                # load document
                docs = pre.load_document(temp.name, file)

                # preprocess document
                docs_processed, df, all_text, par_list = clean.preprocessing(docs)

                # testing
                # st.write(len(all_text))
                # for i in par_list:
                #     st.write(i)

                @st.cache(allow_output_mutation=True)
                def load_keyBert():
                    return KeyBERT()

                kw_model = load_keyBert()

                keywords = kw_model.extract_keywords(
                    all_text,
                    keyphrase_ngram_range=(1, 2),
                    use_mmr=True,
                    stop_words="english",
                    top_n=15,
                    diversity=0.7,
                )

                st.markdown("## 🎈 What is my document about?")

                df = (
                    DataFrame(keywords, columns=["Keyword/Keyphrase", "Relevancy"])
                    .sort_values(by="Relevancy", ascending=False)
                    .reset_index(drop=True)
                )

                df.index += 1

                # Add styling
                cmGreen = sns.light_palette("green", as_cmap=True)
                cmRed = sns.light_palette("red", as_cmap=True)
                df = df.style.background_gradient(
                    cmap=cmGreen,
                    subset=[
                        "Relevancy",
                    ],
                )
                c1, c2, c3 = st.columns([1, 3, 1])

                format_dictionary = {
                    "Relevancy": "{:.1%}",
                }

                df = df.format(format_dictionary)

                with c2:
                    st.table(df)

                ######## SDG classification
                # @st.cache(allow_output_mutation=True)
                # def load_sdgClassifier():
                #     classifier = pipeline("text-classification", model="../models/osdg_sdg/")
                #     return classifier

                # load from disc (github repo) for performance boost
                @st.cache(allow_output_mutation=True)
                def load_sdgClassifier():
                    classifier = pipeline("text-classification", model="../models/osdg_sdg/")
                    return classifier

                classifier = load_sdgClassifier()

                # not needed, par_list comes from the preprocessing function already
                # word_list = all_text.split()
                # len_word_list = len(word_list)
                # par_list = []
                # par_len = 130
                # for i in range(0, len_word_list // par_len):
                #     string_part = ' '.join(word_list[i*par_len:(i+1)*par_len])
                #     par_list.append(string_part)

                labels = classifier(par_list)
                labels_ = [(l['label'], l['score']) for l in labels]
                df = DataFrame(labels_, columns=["SDG", "Relevancy"])
                df['text'] = par_list
                df = df.sort_values(by="Relevancy", ascending=False).reset_index(drop=True)
                df.index += 1
                df = df[df['Relevancy'] > .85]
                x = df['SDG'].value_counts()

                plt.rcParams['font.size'] = 25
                colors = plt.get_cmap('Blues')(np.linspace(0.2, 0.7, len(x)))
                # plot
                fig, ax = plt.subplots()
                ax.pie(x, colors=colors, radius=2, center=(4, 4),
                       wedgeprops={"linewidth": 1, "edgecolor": "white"}, frame=False, labels=list(x.index))

                st.markdown("## 🎈 Anything related to SDGs?")

                c4, c5, c6 = st.columns([5, 7, 1])

                # Add styling
                cmGreen = sns.light_palette("green", as_cmap=True)
                cmRed = sns.light_palette("red", as_cmap=True)
                df = df.style.background_gradient(
                    cmap=cmGreen,
                    subset=[
                        "Relevancy",
                    ],
                )

                format_dictionary = {
                    "Relevancy": "{:.1%}",
                }

                df = df.format(format_dictionary)

                with c4:
                    st.pyplot(fig)
                with c5:
                    st.table(df)
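The keyword step above relies on KeyBERT with Maximal Marginal Relevance; below is a minimal standalone sketch of that call on a made-up sentence (the exact keyphrases and scores depend on the underlying embedding model):

from keybert import KeyBERT

kw_model = KeyBERT()

# made-up example text, not taken from any policy document
text = "The national strategy strengthens renewable energy investment and climate adaptation measures."

keywords = kw_model.extract_keywords(
    text,
    keyphrase_ngram_range=(1, 2),   # unigrams and bigrams
    use_mmr=True,                   # Maximal Marginal Relevance for more diverse keyphrases
    stop_words="english",
    top_n=5,
    diversity=0.7,
)

print(keywords)  # list of (keyphrase, similarity score) tuples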
multiapp.py ADDED
@@ -0,0 +1,51 @@
"""Frameworks for running multiple Streamlit applications as a single app.
"""
import streamlit as st
from PIL import Image

class MultiApp:
    """Framework for combining multiple streamlit applications.
    Usage:
        def foo():
            st.title("Hello Foo")
        def bar():
            st.title("Hello Bar")
        app = MultiApp()
        app.add_app("Foo", foo)
        app.add_app("Bar", bar)
        app.run()
    It is also possible to keep each application in a separate file.
        import foo
        import bar
        app = MultiApp()
        app.add_app("Foo", foo.app)
        app.add_app("Bar", bar.app)
        app.run()
    """
    def __init__(self):
        self.apps = []

    def add_app(self, title, func):
        """Adds a new application.
        Parameters
        ----------
        func:
            the python function to render this app.
        title:
            title of the app. Appears in the dropdown in the sidebar.
        """
        self.apps.append({
            "title": title,
            "function": func
        })

    def run(self):
        image = Image.open('giz_sdsn.jpg')
        st.sidebar.image(image)
        app = st.sidebar.radio(
            'Go To',
            self.apps,
            format_func=lambda app: app['title'])

        app['function']()
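run() dispatches pages by letting st.sidebar.radio return the selected app dict and then calling the stored function. A minimal self-contained sketch of the same pattern (the page names and functions below are illustrative, not from this repository):

import streamlit as st

def home():
    st.title("Home")

def about():
    st.title("About")

# each entry pairs a sidebar label with the function that renders that page
pages = [
    {"title": "Home", "function": home},
    {"title": "About", "function": about},
]

# radio returns the selected dict; format_func controls the label shown in the sidebar
choice = st.sidebar.radio("Go To", pages, format_func=lambda p: p["title"])
choice["function"]()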
paris.png ADDED
pic1.PNG ADDED
requirements.txt ADDED
@@ -0,0 +1,12 @@
django_haystack==3.2.1
spacy==3.2.0
https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0.tar.gz#egg=en_core_web_sm
keybert==0.5.1
matplotlib==3.5.1
nltk==3.7
numpy==1.22.1
pandas==1.4.0
pdfplumber==0.6.2
Pillow==9.1.1
seaborn==0.11.2
transformers==4.13.0
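Assuming a standard Python environment, these pinned dependencies (including the en_core_web_sm spaCy model referenced by URL) would typically be installed with "pip install -r requirements.txt" before launching the Streamlit app.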
src/__init__.py ADDED
@@ -0,0 +1,8 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Oct 5 2020

@author: jonas
"""
src/__pycache__/cleaning.cpython-39.pyc ADDED
Binary file (2.94 kB).
src/__pycache__/preprocessing.cpython-39.pyc ADDED
Binary file (2.11 kB).
src/cleaning.py ADDED
@@ -0,0 +1,124 @@
import pandas as pd
import numpy as np
import string
import nltk
import spacy
import en_core_web_sm
import re
import streamlit as st

from haystack.nodes import PreProcessor

'''basic cleaning - suitable for transformer models'''
def basic(s):
    """
    :param s: string to be processed
    :return: processed string: see comments in the source code for more info
    """
    # Text lowercase
    s = s.lower()
    # Remove punctuation
    translator = str.maketrans(' ', ' ', string.punctuation)
    s = s.translate(translator)
    # Remove URLs
    s = re.sub(r'^https?:\/\/.*[\r\n]*', ' ', s, flags=re.MULTILINE)
    s = re.sub(r"http\S+", " ", s)
    # Remove new line characters
    s = re.sub('\n', ' ', s)

    # Remove distracting single quotes
    s = re.sub("\'", " ", s)
    # Remove all remaining numbers and non-alphanumeric characters
    s = re.sub(r'\d+', ' ', s)
    s = re.sub(r'\W+', ' ', s)

    # define custom words to replace:
    #s = re.sub(r'strengthenedstakeholder', 'strengthened stakeholder', s)

    return s.strip()


def preprocessing(document):
    """
    Takes in a list of haystack Document objects, splits them into paragraphs and applies simple cleaning.

    Returns the cleaned list of haystack Document objects (one paragraph per object), a pandas DataFrame,
    a string containing all text joined together, and the list of paragraph strings.
    """

    preprocessor = PreProcessor(
        clean_empty_lines=True,
        clean_whitespace=True,
        clean_header_footer=True,
        split_by="word",
        split_length=120,
        split_respect_sentence_boundary=True,
        #split_overlap=5
    )
    for i in document:
        docs_processed = preprocessor.process([i])
        for item in docs_processed:
            item.content = basic(item.content)

    st.write("your document has been split into", len(docs_processed), "paragraphs")

    # create dataframe of text and list of all text
    df = pd.DataFrame(docs_processed)
    all_text = " ".join(df.content.to_list())
    par_list = df.content.to_list()

    return docs_processed, df, all_text, par_list

'''processing with spacy - suitable for models such as tf-idf, word2vec'''
def spacy_clean(alpha: str, use_nlp: bool = True) -> str:
    """
    Clean and tokenise a string using spaCy. Keeps only alphabetic characters, removes stopwords and
    filters out all but proper nouns, nouns, verbs and adjectives.

    Parameters
    ----------
    alpha : str
        The input string.
    use_nlp : bool, default True
        Indicates whether spaCy needs to run its NLP pipeline. Enable this when using this function on its own.
        Should be set to False if used inside nlp.pipeline.

    Returns
    -------
    ' '.join(beta) : a concatenated list of lemmatised tokens, i.e. a processed string

    Notes
    -----
    Fails if alpha is an NA value. Performance decreases as len(alpha) gets large.
    Use together with nlp.pipeline for batch processing.
    """

    nlp = spacy.load("en_core_web_sm", disable=["parser", "ner", "textcat"])

    if use_nlp:
        alpha = nlp(alpha)

    beta = []
    for tok in alpha:
        if all([tok.is_alpha, not tok.is_stop, tok.pos_ in ['PROPN', 'NOUN', 'VERB', 'ADJ']]):
            beta.append(tok.lemma_)

    text = ' '.join(beta)
    text = text.lower()
    return text
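As a quick illustration of what basic() produces, a sketch on a made-up sentence (the printed output is approximate):

import sys; sys.path.append('src')  # assuming the repository root is the working directory

from cleaning import basic

sample = "In 2021, the Ministry launched https://example.org and 3 new programmes.\nIt's ambitious."
print(basic(sample))
# prints something like: in the ministry launched and new programmes its ambitious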
src/preprocessing.py ADDED
@@ -0,0 +1,63 @@
from typing import Callable, Dict, List, Optional

from pathlib import Path
import re
import logging
import string
import streamlit as st
logger = logging.getLogger(__name__)

import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

from haystack.utils import convert_files_to_docs, fetch_archive_from_http
from haystack.nodes.file_converter import BaseConverter, DocxToTextConverter, PDFToTextConverter, TextConverter
from haystack.schema import Document
import pdfplumber

import pandas as pd

def load_document(
    file: str,
    file_name,
    encoding: Optional[str] = None,
    id_hash_keys: Optional[List[str]] = None,
) -> List[Document]:

    """
    Takes docx, txt and pdf files as input and extracts the text as well as the filename as metadata. Since haystack
    does not handle every pdf file, pdfplumber is attached to the pipeline in case the pdf extraction via haystack
    fails.

    Returns a list of type haystack.schema.Document.
    """

    if file_name.name.endswith('.pdf'):
        converter = PDFToTextConverter(remove_numeric_tables=True)
    if file_name.name.endswith('.txt'):
        converter = TextConverter()
    if file_name.name.endswith('.docx'):
        converter = DocxToTextConverter()


    documents = []
    logger.info("Converting {}".format(file_name))
    # PDFToTextConverter, TextConverter, and DocxToTextConverter return a list containing a single Document
    document = converter.convert(
        file_path=file, meta=None, encoding=encoding, id_hash_keys=id_hash_keys
    )[0]
    text = document.content
    documents.append(Document(content=text, meta={"name": file_name}, id_hash_keys=id_hash_keys))

    '''check if text is empty and apply a different pdf processor. This can happen with certain pdf types.'''
    for i in documents:
        if i.content == "":
            st.write("using pdfplumber")
            text = []
            with pdfplumber.open(file) as pdf:
                for page in pdf.pages:
                    text.append(page.extract_text())
            i.content = ' '.join([page for page in text])

    return documents
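For orientation, a minimal sketch of how load_document and preprocessing fit together outside Streamlit; the file path is illustrative, and the haystack PDF converter's system dependencies are assumed to be installed:

import sys; sys.path.append('src')

import preprocessing as pre
import cleaning as clean

# load_document only reads .name from its second argument (Streamlit's UploadedFile in the app),
# so a small stand-in object is enough here
class UploadedStub:
    name = "example_policy.pdf"  # illustrative file name, not part of this repository

docs = pre.load_document("example_policy.pdf", UploadedStub())
docs_processed, df, all_text, par_list = clean.preprocessing(docs)
print(len(par_list), "paragraphs extracted")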