Spaces: GIZ
Commit 22b8e0b (1 parent: fdf8499), committed by ppsingh

new_version
.gitattributes CHANGED
@@ -29,3 +29,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ appStore/img/giz_sdsn.jpg filter=lfs diff=lfs merge=lfs -text
+ appStore/img/paris.png filter=lfs diff=lfs merge=lfs -text
+ appStore/img/pic1.png filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,17 @@
+ import appStore.keyword_search as keyword_search
+ import appStore.sdg_analysis as sdg_analysis
+ import appStore.coherence as coherence
+ import appStore.info as info
+ from appStore.multiapp import MultiApp
+ import streamlit as st
+
+ st.set_page_config('SDSN x GIZ Policy Action Tracking v0.1', layout="wide")
+
+ app = MultiApp()
+
+ app.add_app("Analyse Policy Document", sdg_analysis.app)
+ app.add_app("KeyWord Search", keyword_search.app)
+ app.add_app("Check Coherence", coherence.app)
+ app.add_app("Info", info.app)
+
+ app.run()
appStore/__init__.py ADDED
@@ -0,0 +1 @@
+ # creating appStore package
appStore/coherence.py ADDED
@@ -0,0 +1,231 @@
+ # set path
+ import glob, os, sys; sys.path.append('../udfPreprocess')
+
+ # import helpers
+ import udfPreprocess.docPreprocessing as pre
+ import udfPreprocess.cleaning as clean
+
+ # import needed libraries
+ import seaborn as sns
+ from pandas import DataFrame
+ from sentence_transformers import SentenceTransformer, CrossEncoder, util
+ from sklearn.metrics.pairwise import cosine_similarity
+ # from keybert import KeyBERT
+ from transformers import pipeline
+ import matplotlib.pyplot as plt
+ import numpy as np
+ import streamlit as st
+ import pandas as pd
+ from rank_bm25 import BM25Okapi
+ from sklearn.feature_extraction import _stop_words
+ import string
+ from tqdm.autonotebook import tqdm
+ import urllib.request
+ import ast
+ import tempfile
+ import sqlite3
+ import json
+
+
+ def app():
+     # Sidebar
+     st.sidebar.title('Check Coherence')
+     st.sidebar.write(' ')
+     with open('ndcs/countryList.txt') as dfile:
+         countryList = dfile.read()
+
+     countryList = ast.literal_eval(countryList)
+     countrynames = list(countryList.keys())
+
+     option = st.sidebar.selectbox('Select Country', countrynames)
+     countryCode = countryList[option]
+
+     with st.container():
+         st.markdown("<h1 style='text-align: center; color: black;'> Check Coherence of Policy Document with NDCs</h1>", unsafe_allow_html=True)
+         st.write(' ')
+         st.write(' ')
+
+     with st.expander("ℹ️ - About this app", expanded=True):
+         st.write(
+             """
+             The *Check Coherence* app is an easy-to-use interface built in Streamlit for analysing a policy document and checking its coherence with the country's NDCs/updated NDCs - developed by GIZ Data and the Sustainable Development Solutions Network.
+             """
+         )
+         st.markdown("")
+
+     st.markdown("")
+     st.markdown("## 📌 Step One: Upload the document of the selected country")
+
+     with st.container():
+         docs = None
+         # ask the user to either upload a document or try an example document
+         choice = st.radio(label='Select the Document',
+                           help='You can upload the document or else you can try an example document.',
+                           options=('Upload Document', 'Try Example'),
+                           horizontal=True)
+
+         if choice == 'Upload Document':
+             uploaded_file = st.file_uploader('Upload the File', type=['pdf', 'docx', 'txt'])
+             if uploaded_file is not None:
+                 with tempfile.NamedTemporaryFile(mode="wb") as temp:
+                     bytes_data = uploaded_file.getvalue()
+                     temp.write(bytes_data)
+
+                     st.write("Uploaded Filename: ", uploaded_file.name)
+                     file_name = uploaded_file.name
+                     file_path = temp.name
+                     docs = pre.load_document(file_path, file_name)
+                     haystackDoc, dataframeDoc, textData, paraList = clean.preprocessing(docs)
+
+         else:
+             # list the example documents
+             option = st.selectbox('Select the example document',
+                                   ('South Africa:Low Emission strategy',
+                                    'Ethiopia: 10 Year Development Plan'))
+             if option == 'South Africa:Low Emission strategy':
+                 file_name = file_path = 'sample/South Africa_s Low Emission Development Strategy.txt'
+                 countryCode = countryList['South Africa']
+                 st.write("Selected document:", file_name.split('/')[1])
+             else:
+                 file_name = file_path = 'sample/Ethiopia_s_2021_10 Year Development Plan.txt'
+                 countryCode = countryList['Ethiopia']
+                 st.write("Selected document:", file_name.split('/')[1])
+
+             if option is not None:
+                 docs = pre.load_document(file_path, file_name)
+                 haystackDoc, dataframeDoc, textData, paraList = clean.preprocessing(docs)
+
+     with open('ndcs/cca.txt', encoding='utf-8', errors='ignore') as dfile:
+         cca_sent = dfile.read()
+     cca_sent = ast.literal_eval(cca_sent)
+
+     with open('ndcs/ccm.txt', encoding='utf-8', errors='ignore') as dfile:
+         ccm_sent = dfile.read()
+     ccm_sent = ast.literal_eval(ccm_sent)
+
+     with open('ndcs/countryList.txt') as dfile:
+         countryList = dfile.read()
+     countryList = ast.literal_eval(countryList)
+
+     def get_document(countryCode: str):
+         # fetch the NDC classification for one country from the Klimalog open-data set
+         link = "https://klimalog.die-gdi.de/ndc/open-data/dataset.json"
+         with urllib.request.urlopen(link) as urlfile:
+             data = json.loads(urlfile.read())
+         categoriesData = {}
+         categoriesData['categories'] = data['categories']
+         categoriesData['subcategories'] = data['subcategories']
+         keys_sub = categoriesData['subcategories'].keys()
+         documentType = 'NDCs'
+         if documentType in data.keys():
+             if countryCode in data[documentType].keys():
+                 get_dict = {}
+                 for key, value in data[documentType][countryCode].items():
+                     if key not in ['country_name', 'region_id', 'region_name']:
+                         get_dict[key] = value['classification']
+                     else:
+                         get_dict[key] = value
+             else:
+                 return None
+         else:
+             return None
+
+         country = {}
+         for key in categoriesData['categories']:
+             country[key] = {}
+         for key, value in categoriesData['subcategories'].items():
+             country[value['category']][key] = get_dict[key]
+
+         return country
+
+     # country_ndc = get_document('NDCs', countryList[option])
+
+     def countrySpecificCCA(cca_sent, threshold, countryCode):
+         # keep only the adaptation labels whose classification id exceeds the threshold
+         temp = {}
+         doc = get_document(countryCode)
+         for key, value in cca_sent.items():
+             id_ = doc['climate change adaptation'][key]['id']
+             if id_ > threshold:
+                 temp[key] = value['id'][id_]
+         return temp
+
+     def countrySpecificCCM(ccm_sent, threshold, countryCode):
+         # keep only the mitigation labels whose classification id exceeds the threshold
+         temp = {}
+         doc = get_document(countryCode)
+         for key, value in ccm_sent.items():
+             id_ = doc['climate change mitigation'][key]['id']
+             if id_ > threshold:
+                 temp[key] = value['id'][id_]
+         return temp
+
+     if docs is not None:
+         sent_cca = countrySpecificCCA(cca_sent, 1, countryCode)
+         sent_ccm = countrySpecificCCM(ccm_sent, 1, countryCode)
+         # st.write(sent_ccm)
+
+         @st.cache(allow_output_mutation=True)
+         def load_sentenceTransformer(name):
+             return SentenceTransformer(name)
+
+         model = load_sentenceTransformer('all-MiniLM-L6-v2')
+
+         document_embeddings = model.encode(paraList, show_progress_bar=True)
+
+         genre = st.radio("Select Category", ('Climate Change Adaptation', 'Climate Change Mitigation'))
+         if genre == 'Climate Change Adaptation':
+             sent_dict = sent_cca
+         else:
+             sent_dict = sent_ccm
+
+         sent_labels = []
+         for key, sent in sent_dict.items():
+             sent_labels.append(sent)
+
+         label_embeddings = model.encode(sent_labels, show_progress_bar=True)
+         similarity_high_threshold = 0.55
+         similarity_matrix = cosine_similarity(label_embeddings, document_embeddings)
+         label_indices, paragraph_indices = np.where(similarity_matrix > similarity_high_threshold)
+
+         positive_indices = list(zip(label_indices.tolist(), paragraph_indices.tolist()))
+
+         for _label_idx, _paragraph_idx in positive_indices:
+             st.write("This paragraph: \n")
+             st.write(paraList[_paragraph_idx])
+             st.write(f"Is relevant to: \n {list(sent_dict.keys())[_label_idx]}")
+             st.write('-' * 10)
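The core matching step above encodes the country-specific NDC label sentences and the document paragraphs with the same sentence-transformer, then keeps every label/paragraph pair whose cosine similarity clears 0.55. A minimal sketch of that step in isolation, assuming sentence-transformers, scikit-learn and numpy are installed; the toy paragraphs and the single label are made up for illustration:

    # Minimal sketch of the label-vs-paragraph matching used in coherence.py (illustrative data).
    from sentence_transformers import SentenceTransformer
    from sklearn.metrics.pairwise import cosine_similarity
    import numpy as np

    model = SentenceTransformer('all-MiniLM-L6-v2')

    paragraphs = ["The plan expands irrigation to protect farmers from drought.",
                  "A new highway will be built between the two provinces."]
    labels = ["Droughts are among the five climate risks concerns"]

    label_emb = model.encode(labels)
    para_emb = model.encode(paragraphs)

    sim = cosine_similarity(label_emb, para_emb)      # shape: (len(labels), len(paragraphs))
    label_idx, para_idx = np.where(sim > 0.55)        # same 0.55 threshold as the app
    for l, p in zip(label_idx.tolist(), para_idx.tolist()):
        print(f"'{paragraphs[p]}' is relevant to: {labels[l]} (score={sim[l, p]:.2f})")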
appStore/img/giz_sdsn.jpg ADDED

Git LFS Details

  • SHA256: 137194ec5008c5f634befdada64a9e093528d02db2b87be67441a18d1d7e54f4
  • Pointer size: 130 Bytes
  • Size of remote file: 10 kB
appStore/img/paris.png ADDED

Git LFS Details

  • SHA256: 1b8a00527d472d140b40037ff786a87b167fc93d6fccc5e339cb269800b64e24
  • Pointer size: 131 Bytes
  • Size of remote file: 128 kB
appStore/img/pic1.png ADDED

Git LFS Details

  • SHA256: 1b8a00527d472d140b40037ff786a87b167fc93d6fccc5e339cb269800b64e24
  • Pointer size: 131 Bytes
  • Size of remote file: 128 kB
appStore/info.py ADDED
@@ -0,0 +1,47 @@
+ import streamlit as st
+
+
+ def app():
+     with open('style.css') as f:
+         st.markdown(f"<style>{f.read()}</style>", unsafe_allow_html=True)
+     footer = """
+        <div class="footer-custom">
+            Developer - <a href="https://www.linkedin.com/in/erik-lehmann-giz/" target="_blank">Erik Lehmann</a> |
+            <a href="https://www.linkedin.com/in/jonas-nothnagel-bb42b114b/" target="_blank">Jonas Nothnagel</a> |
+            <a href="https://www.linkedin.com/in/prashantpsingh/" target="_blank">Prashant Singh</a> |
+            Guidance & Feedback - Maren Bernlöhr | Manuel Kuhn
+        </div>
+     """
+     st.markdown(footer, unsafe_allow_html=True)
+
+     st.subheader("Policy Action Tracker Manual")
+     intro = """
+     <div class="text">
+         The manual extraction of relevant information from text documents is a time-consuming task for any policy analyst.
+         As the amount and length of public policy documents in relation to sustainable development (such as National Development Plans and
+         Nationally Determined Contributions) continuously increases, a major challenge for policy action tracking – the evaluation of stated
+         goals and targets and their actual implementation on the ground – arises. Luckily, Artificial Intelligence (AI) and Natural Language Processing (NLP)
+         methods can help in shortening and easing this task for policy analysts.
+         For this purpose, the United Nations Sustainable Development Solutions Network (SDSN) and the Deutsche Gesellschaft für Internationale Zusammenarbeit (GIZ) GmbH
+         have been collaborating since 2021 on the development of an AI-powered open-source web application that helps find and extract relevant information from public policy
+         documents faster, to facilitate evidence-based decision-making processes in sustainable development and beyond. The app currently supports:
+         <ul>
+             <li>Analysing the policy document</li>
+             <li>Finding SDG-related content</li>
+             <li>Making it searchable</li>
+             <li>Comparing it to the national NDC</li>
+         </ul>
+     </div>
+     <br>
+     """
+     st.markdown(intro, unsafe_allow_html=True)
+     st.image("appStore/img/pic1.png", caption="NDC Coherence")
+     st.subheader("Methodology")
+     # st.write("Each sentence in the generated answer ends with a coloured tooltip; the colour ranges from red to green. "
+     #          "The tooltip contains a value representing answer sentence similarity to a specific sentence in the "
+     #          "Wikipedia context passages retrieved. Mouseover on the tooltip will show the sentence from the "
+     #          "Wikipedia context passage. If a sentence similarity is 1.0, the seq2seq model extracted and "
+     #          "copied the sentence verbatim from Wikipedia context passages. Lower values of sentence "
+     #          "similarity indicate the seq2seq model is struggling to generate a relevant sentence for the question "
+     #          "asked.")
+     # st.image("wikipedia_answer.png", caption="Answer with similarity tooltips")
appStore/keyword_search.py ADDED
@@ -0,0 +1,504 @@
+ # set path
+ import glob, os, sys; sys.path.append('../udfPreprocess')
+
+ # import helpers
+ import udfPreprocess.docPreprocessing as pre
+ import udfPreprocess.cleaning as clean
+
+ # import needed libraries
+ import seaborn as sns
+ from pandas import DataFrame
+ from sentence_transformers import SentenceTransformer, CrossEncoder, util
+ # from keybert import KeyBERT
+ from transformers import pipeline
+ import matplotlib.pyplot as plt
+ import numpy as np
+ import streamlit as st
+ import pandas as pd
+ from rank_bm25 import BM25Okapi
+ from sklearn.feature_extraction import _stop_words
+ import string
+ from tqdm.autonotebook import tqdm
+
+ import tempfile
+ import sqlite3
+
+
+ def app():
+
+     with st.container():
+         st.markdown("<h1 style='text-align: center; color: black;'> Keyword Search</h1>",
+                     unsafe_allow_html=True)
+         st.write(' ')
+         st.write(' ')
+
+     with st.expander("ℹ️ - About this app", expanded=True):
+         st.write(
+             """
+             The *Keyword Search* app is an easy-to-use interface built in Streamlit for doing keyword search in a policy document - developed by GIZ Data and the Sustainable Development Solutions Network.
+             """
+         )
+         st.markdown("")
+
+     st.markdown("")
+     st.markdown("### 📌 Step One: Upload document")
+
+     with st.container():
+         def bm25_tokenizer(text):
+             # lower-case, strip punctuation and drop English stop words
+             tokenized_doc = []
+             for token in text.lower().split():
+                 token = token.strip(string.punctuation)
+                 if len(token) > 0 and token not in _stop_words.ENGLISH_STOP_WORDS:
+                     tokenized_doc.append(token)
+             return tokenized_doc
+
+         def bm25TokenizeDoc(paraList):
+             # split passages longer than 256 words into two chunks before tokenising
+             tokenized_corpus = []
+             for passage in tqdm(paraList):
+                 if len(passage.split()) > 256:
+                     temp = " ".join(passage.split()[:256])
+                     tokenized_corpus.append(bm25_tokenizer(temp))
+                     temp = " ".join(passage.split()[256:])
+                     tokenized_corpus.append(bm25_tokenizer(temp))
+                 else:
+                     tokenized_corpus.append(bm25_tokenizer(passage))
+             return tokenized_corpus
+
+         def search(keyword):
+             ##### BM25 search (lexical search) #####
+             bm25_scores = document_bm25.get_scores(bm25_tokenizer(keyword))
+             top_n = np.argpartition(bm25_scores, -10)[-10:]
+             bm25_hits = [{'corpus_id': idx, 'score': bm25_scores[idx]} for idx in top_n]
+             bm25_hits = sorted(bm25_hits, key=lambda x: x['score'], reverse=True)
+
+             ##### Semantic search #####
+             # Encode the query using the bi-encoder and find potentially relevant passages
+             # query = "Does document contain {} issues ?".format(keyword)
+             question_embedding = bi_encoder.encode(keyword, convert_to_tensor=True)
+             hits = util.semantic_search(question_embedding, document_embeddings, top_k=top_k)
+             hits = hits[0]  # Get the hits for the first query
+
+             ##### Re-Ranking #####
+             # Now, score all retrieved passages with the cross-encoder
+             # cross_inp = [[query, paraList[hit['corpus_id']]] for hit in hits]
+             # cross_scores = cross_encoder.predict(cross_inp)
+
+             # Sort results by the cross-encoder scores
+             # for idx in range(len(cross_scores)):
+             #     hits[idx]['cross-score'] = cross_scores[idx]
+
+             return bm25_hits, hits
+
+         def show_results(keywordList):
+             for keyword in keywordList:
+                 st.write("Results for Query: {}".format(keyword))
+                 bm25_hits, hits = search(keyword)
+
+                 st.markdown("""
+                             We provide two kinds of results: lexical search (BM25) and semantic search.
+                             """)
+                 # In the semantic search part we could provide two kinds of results: Retriever (Bi-Encoder) only and ReRanker (Cross-Encoder)
+                 st.markdown("Top few lexical search (BM25) hits")
+                 for hit in bm25_hits[0:5]:
+                     if hit['score'] > 0.00:
+                         st.write("\t Score: {:.3f}: \t{}".format(hit['score'], paraList[hit['corpus_id']].replace("\n", " ")))
+
+                 # st.table(bm25_hits[0:3])
+
+                 st.markdown("\n-------------------------\n")
+                 st.markdown("Top few Bi-Encoder Retrieval hits")
+
+                 hits = sorted(hits, key=lambda x: x['score'], reverse=True)
+                 for hit in hits[0:5]:
+                     # if hit['score'] > 0.45:
+                     st.write("\t Score: {:.3f}: \t{}".format(hit['score'], paraList[hit['corpus_id']].replace("\n", " ")))
+                 # st.table(hits[0:3])
+
+         @st.cache(allow_output_mutation=True)
+         def load_sentenceTransformer(name):
+             return SentenceTransformer(name)
+
+         docs = None
+         # ask the user to either upload a document or try an example document
+         choice = st.radio(label='Select the Document',
+                           help='You can upload the document or else you can try an example document.',
+                           options=('Upload Document', 'Try Example'),
+                           horizontal=True)
+
+         if choice == 'Upload Document':
+             uploaded_file = st.file_uploader('Upload the File', type=['pdf', 'docx', 'txt'])
+             if uploaded_file is not None:
+                 with tempfile.NamedTemporaryFile(mode="wb") as temp:
+                     bytes_data = uploaded_file.getvalue()
+                     temp.write(bytes_data)
+
+                     st.write("Uploaded Filename: ", uploaded_file.name)
+                     file_name = uploaded_file.name
+                     file_path = temp.name
+                     docs = pre.load_document(file_path, file_name)
+                     haystackDoc, dataframeDoc, textData, paraList = clean.preprocessing(docs)
+
+         else:
+             # list the example documents
+             option = st.selectbox('Select the example document',
+                                   ('South Africa:Low Emission strategy',
+                                    'Ethiopia: 10 Year Development Plan'))
+             if option == 'South Africa:Low Emission strategy':
+                 file_name = file_path = 'sample/South Africa_s Low Emission Development Strategy.txt'
+                 st.write("Selected document:", file_name.split('/')[1])
+             else:
+                 file_name = file_path = 'sample/Ethiopia_s_2021_10 Year Development Plan.txt'
+                 st.write("Selected document:", file_name.split('/')[1])
+
+             if option is not None:
+                 docs = pre.load_document(file_path, file_name)
+                 haystackDoc, dataframeDoc, textData, paraList = clean.preprocessing(docs)
+
+         if docs is not None:
+             bi_encoder = load_sentenceTransformer('msmarco-distilbert-cos-v5')  # alternative: multi-qa-MiniLM-L6-cos-v1
+             bi_encoder.max_seq_length = 64  # truncate long passages to 64 tokens
+             top_k = 32
+
+             document_embeddings = bi_encoder.encode(paraList, convert_to_tensor=True, show_progress_bar=False)
+             tokenized_corpus = bm25TokenizeDoc(paraList)
+             document_bm25 = BM25Okapi(tokenized_corpus)
+             keywordList = None
+
+             col1, col2 = st.columns(2)
+             with col1:
+                 if st.button('Climate Change Keyword Search'):
+                     keywordList = ['extreme weather', 'floods', 'droughts']
+             with col2:
+                 if st.button('Gender Keywords Search'):
+                     keywordList = ['Gender', 'Women empowerment']
+
+             keyword = st.text_input("Please enter here what you want to search; we will look for similar context in the document.",
+                                     value="")
+             if st.button("Find them."):
+                 keywordList = [keyword]
+
+             if keywordList is not None:
+                 show_results(keywordList)
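The search() helper above runs the same query through two retrievers: BM25 over a tokenised corpus (lexical) and a bi-encoder with semantic_search (semantic). A minimal sketch of that hybrid pattern on a toy corpus, assuming rank_bm25 and sentence-transformers are installed; the corpus and query are made up for illustration:

    # Minimal sketch of hybrid BM25 + bi-encoder retrieval (illustrative corpus).
    from rank_bm25 import BM25Okapi
    from sentence_transformers import SentenceTransformer, util

    corpus = ["Floods damaged roads in the coastal region.",
              "The budget funds new schools and teacher training.",
              "Drought response includes water storage and irrigation."]

    # Lexical side: BM25 over whitespace-tokenised, lower-cased passages.
    bm25 = BM25Okapi([p.lower().split() for p in corpus])
    bm25_scores = bm25.get_scores("drought".split())

    # Semantic side: bi-encoder embeddings plus cosine-similarity search.
    bi_encoder = SentenceTransformer('msmarco-distilbert-cos-v5')
    doc_emb = bi_encoder.encode(corpus, convert_to_tensor=True)
    query_emb = bi_encoder.encode("drought", convert_to_tensor=True)
    hits = util.semantic_search(query_emb, doc_emb, top_k=2)[0]

    print("BM25 scores:", list(zip(corpus, bm25_scores.round(2))))
    print("Semantic hits:", [(corpus[h['corpus_id']], round(h['score'], 2)) for h in hits])

Note how BM25 only rewards exact token overlap ("drought"), while the bi-encoder can also surface the flood passage as related context; the app shows both result lists for the same reason.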
appStore/multiapp.py ADDED
@@ -0,0 +1,51 @@
+ """Framework for running multiple Streamlit applications as a single app."""
+ import streamlit as st
+ from PIL import Image
+
+
+ class MultiApp:
+     """Framework for combining multiple Streamlit applications.
+
+     Usage:
+         def foo():
+             st.title("Hello Foo")
+         def bar():
+             st.title("Hello Bar")
+         app = MultiApp()
+         app.add_app("Foo", foo)
+         app.add_app("Bar", bar)
+         app.run()
+
+     It is also possible to keep each application in a separate file:
+         import foo
+         import bar
+         app = MultiApp()
+         app.add_app("Foo", foo.app)
+         app.add_app("Bar", bar.app)
+         app.run()
+     """
+     def __init__(self):
+         self.apps = []
+
+     def add_app(self, title, func):
+         """Adds a new application.
+
+         Parameters
+         ----------
+         func:
+             the python function to render this app.
+         title:
+             title of the app. Appears in the radio selector in the sidebar.
+         """
+         self.apps.append({
+             "title": title,
+             "function": func
+         })
+
+     def run(self):
+         image = Image.open('appStore/img/giz_sdsn.jpg')
+         st.sidebar.image(image)
+         app = st.sidebar.radio(
+             'Go To',
+             self.apps,
+             format_func=lambda app: app['title'])
+
+         app['function']()
appStore/sdg_analysis.py ADDED
@@ -0,0 +1,204 @@
+ # set path
+ import glob, os, sys; sys.path.append('../udfPreprocess')
+
+ # import helpers
+ import udfPreprocess.docPreprocessing as pre
+ import udfPreprocess.cleaning as clean
+
+ # import needed libraries
+ import seaborn as sns
+ from pandas import DataFrame
+ from keybert import KeyBERT
+ from transformers import pipeline
+ import matplotlib.pyplot as plt
+ import numpy as np
+ import streamlit as st
+ import pandas as pd
+
+ import tempfile
+ import sqlite3
+
+
+ def app():
+
+     with st.container():
+         st.markdown("<h1 style='text-align: center; color: black;'> SDSN x GIZ Policy Action Tracking v0.1</h1>", unsafe_allow_html=True)
+         st.write(' ')
+         st.write(' ')
+
+     with st.expander("ℹ️ - About this app", expanded=True):
+         st.write(
+             """
+             The *Analyse Policy Document* app is an easy-to-use interface built in Streamlit for analysing policy documents - developed by GIZ Data and the Sustainable Development Solutions Network. \n
+             1. Keyword heatmap \n
+             2. SDG Classification for the paragraphs/texts in the document
+             """
+         )
+         st.markdown("")
+
+     st.markdown("")
+     st.markdown("## 📌 Step One: Upload document")
+
+     with st.container():
+         docs = None
+         # ask the user to either upload a document or try an example document
+         choice = st.radio(label='Select the Document',
+                           help='You can upload the document or else you can try an example document.',
+                           options=('Upload Document', 'Try Example'),
+                           horizontal=True)
+
+         if choice == 'Upload Document':
+             uploaded_file = st.file_uploader('Upload the File', type=['pdf', 'docx', 'txt'])
+             if uploaded_file is not None:
+                 with tempfile.NamedTemporaryFile(mode="wb") as temp:
+                     bytes_data = uploaded_file.getvalue()
+                     temp.write(bytes_data)
+
+                     st.write("Uploaded Filename: ", uploaded_file.name)
+                     file_name = uploaded_file.name
+                     file_path = temp.name
+                     docs = pre.load_document(file_path, file_name)
+                     docs_processed, df, all_text, par_list = clean.preprocessingForSDG(docs)
+                     # haystackDoc, dataframeDoc, textData, paraList = clean.preprocessing(docs)
+
+         else:
+             # list the example documents
+             option = st.selectbox('Select the example document',
+                                   ('Ethiopia: 10 Year Development Plan',
+                                    'South Africa:Low Emission strategy'))
+             if option == 'South Africa:Low Emission strategy':
+                 file_name = file_path = 'sample/South Africa_s Low Emission Development Strategy.txt'
+                 st.write("Selected document:", file_name.split('/')[1])
+             else:
+                 file_name = file_path = 'sample/Ethiopia_s_2021_10 Year Development Plan.txt'
+                 st.write("Selected document:", file_name.split('/')[1])
+
+             if option is not None:
+                 docs = pre.load_document(file_path, file_name)
+                 # haystackDoc, dataframeDoc, textData, paraList = clean.preprocessing(docs)
+                 docs_processed, df, all_text, par_list = clean.preprocessingForSDG(docs)
+
+         if docs is not None:
+
+             @st.cache(allow_output_mutation=True)
+             def load_keyBert():
+                 return KeyBERT()
+
+             kw_model = load_keyBert()
+
+             keywords = kw_model.extract_keywords(
+                 all_text,
+                 keyphrase_ngram_range=(1, 2),
+                 use_mmr=True,
+                 stop_words="english",
+                 top_n=15,
+                 diversity=0.7,
+             )
+
+             st.markdown("## 🎈 What is my document about?")
+
+             df = (
+                 DataFrame(keywords, columns=["Keyword/Keyphrase", "Relevancy"])
+                 .sort_values(by="Relevancy", ascending=False)
+                 .reset_index(drop=True)
+             )
+             df.index += 1
+
+             # Add styling
+             cmGreen = sns.light_palette("green", as_cmap=True)
+             cmRed = sns.light_palette("red", as_cmap=True)
+             df = df.style.background_gradient(
+                 cmap=cmGreen,
+                 subset=[
+                     "Relevancy",
+                 ],
+             )
+             c1, c2, c3 = st.columns([1, 3, 1])
+
+             format_dictionary = {
+                 "Relevancy": "{:.1%}",
+             }
+             df = df.format(format_dictionary)
+
+             with c2:
+                 st.table(df)
+
+             ######## SDG classification
+             # @st.cache(allow_output_mutation=True)
+             # def load_sdgClassifier():
+             #     classifier = pipeline("text-classification", model="../models/osdg_sdg/")
+             #     return classifier
+
+             # load from disc (github repo) for performance boost
+             @st.cache(allow_output_mutation=True)
+             def load_sdgClassifier():
+                 classifier = pipeline("text-classification", model="jonas/sdg_classifier_osdg")
+                 return classifier
+
+             classifier = load_sdgClassifier()
+
+             # not needed, par_list already comes from the preprocessing function
+             # word_list = all_text.split()
+             # len_word_list = len(word_list)
+             # par_list = []
+             # par_len = 130
+             # for i in range(0, len_word_list // par_len):
+             #     string_part = ' '.join(word_list[i*par_len:(i+1)*par_len])
+             #     par_list.append(string_part)
+
+             labels = classifier(par_list)
+             labels_ = [(l['label'], l['score']) for l in labels]
+             df = DataFrame(labels_, columns=["SDG", "Relevancy"])
+             df['text'] = par_list
+             df = df.sort_values(by="Relevancy", ascending=False).reset_index(drop=True)
+             df.index += 1
+             df = df[df['Relevancy'] > .85]
+             x = df['SDG'].value_counts()
+
+             plt.rcParams['font.size'] = 25
+             colors = plt.get_cmap('Blues')(np.linspace(0.2, 0.7, len(x)))
+             # plot
+             fig, ax = plt.subplots()
+             ax.pie(x, colors=colors, radius=2, center=(4, 4),
+                    wedgeprops={"linewidth": 1, "edgecolor": "white"}, frame=False, labels=list(x.index))
+
+             st.markdown("## 🎈 Anything related to SDGs?")
+
+             c4, c5, c6 = st.columns([5, 7, 1])
+
+             # Add styling
+             cmGreen = sns.light_palette("green", as_cmap=True)
+             cmRed = sns.light_palette("red", as_cmap=True)
+             df = df.style.background_gradient(
+                 cmap=cmGreen,
+                 subset=[
+                     "Relevancy",
+                 ],
+             )
+
+             format_dictionary = {
+                 "Relevancy": "{:.1%}",
+             }
+             df = df.format(format_dictionary)
+
+             with c4:
+                 st.pyplot(fig)
+             with c5:
+                 st.table(df)
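The classification step above feeds each preprocessed paragraph to a Hugging Face text-classification pipeline and keeps predictions above 0.85 relevancy. A minimal sketch of that per-paragraph call, assuming the transformers library and the jonas/sdg_classifier_osdg checkpoint referenced above are available; the paragraphs are made up for illustration:

    # Minimal sketch of the per-paragraph SDG classification (illustrative paragraphs).
    from transformers import pipeline

    classifier = pipeline("text-classification", model="jonas/sdg_classifier_osdg")

    paragraphs = ["Expand access to affordable and clean energy in rural areas.",
                  "Improve primary school enrolment for girls."]

    for text, pred in zip(paragraphs, classifier(paragraphs)):
        if pred["score"] > 0.85:          # same relevancy cut-off as the app
            print(pred["label"], round(pred["score"], 3), "-", text)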
ndcs/cca.txt ADDED
@@ -0,0 +1,81 @@
+ {"climate_risks_droughts": {"category": "climate change adaptation","id": {0:"(I)NDC not submitted or not yet included in analysis",
+ 1: "Droughts are not climate risks concerns",
+ 2: "Droughts are among the five climate risks concerns"}},
+ "climate_risks_extreme_weather": {"category": "climate change adaptation", "id": {0:"(I)NDC not submitted or not yet included in analysis",
+ 1: "Extreme Weathers are not climate risks concerns",
+ 2: "Extreme Weathers are among the five climate risks concerns"}},
+ "climate_risks_floods": {"category": "climate change adaptation","id": {0: "(I)NDC not submitted or not yet included in analysis",
+ 1: "Floods are not climate risks concerns",
+ 2: "Floods are among the five climate risks concerns"}},
+ "climate_risks_temp_increase": {"category": "climate change adaptation","id": {0: "(I)NDC not submitted or not yet included in analysis",
+ 1: "Temperature increase are not climate risks concerns",
+ 2: "Temperature increase are among the five climate risks concerns"}},
+ "climate_risks_sea_level_rise": {"category": "climate change adaptation","id": {0: "(I)NDC not submitted or not yet included in analysis",
+ 1: "Sea level rise is not a climate risks concerns",
+ 2: "Sea level rise is among the five climate risks concerns"}},
+
+ "priority_sectors_agriculture": {"category": "climate change adaptation","id": {0: "(I)NDC not submitted or not yet included in analysis",
+ 1: "Agricultural sector is not that important in the context of adaptation ambitions",
+ 2: "In the context of adaptation ambitions Agricultural sector is very important for the country",
+ 3: "Agriculture sector plays an importance for the country, and therefore in the adaptation ambitions Agriculture sector has special actions and aims"}},
+
+ "priority_sectors_ecosystems": {"category": "climate change adaptation","id": {0 :"(I)NDC not submitted or not yet included in analysis",
+ 1 :"Biodiversity and preservation of Ecosystems is not that important in the context of adaptation ambitions",
+ 2: "In the context of adaptation ambitions Biodiversity and preservation of Ecosystems is very important for the country",
+ 3: "Biodiversity and Ecosystems plays an importance for the country, and therefore in the adaptation ambitions Biodiversity and Ecosystems has special actions and aims"}},
+ "priority_sectors_forestry": {"category": "climate change adaptation", "id": {0: "(I)NDC not submitted or not yet included in analysis",
+ 1: "Forestry sector is not that important in the context of adaptation ambitions",
+ 2: "In the context of adaptation ambitions Forestry sector is very important for the country",
+ 3: "Forestry sector plays an importance for the country, and therefore in the adaptation ambitions Agriculture sector has special actions and aims"}},
+ "priority_sectors_health": {"category": "climate change adaptation","id": { 0: "(I)NDC not submitted or not yet included in analysis",
+ 1: "Health sector is not that important in the context of adaptation ambitions",
+ 2: "In the context of adaptation ambitions Health sector is very important for the country",
+ 3: "Health sector plays an importance for the country, and therefore in the adaptation ambitions Agriculture sector has special actions and aims"}},
+
+ "priority_sectors_water": {"category": "climate change adaptation","id": { 0 : "(I)NDC not submitted or not yet included in analysis",
+ 1: "Water sector is not that important in the context of adaptation ambitions",
+ 2: "In the context of adaptation ambitions Water sector is very important for the country",
+ 3: "Water sector plays an importance for the country, and therefore in the adaptation ambitions Agriculture sector has special actions and aims"}},
+
+ "vulnerability_agriculture": {"category": "climate change adaptation","id":{ 0: "(I)NDC not submitted or not yet included in analysis",
+ 1: "Agriculture is a not a vulnerable sector",
+ 2: "Agriculture is a vulnerable sector"}},
+ "vulnerability_coastal_zones": {"category": "climate change adaptation","id":{ 0: "(I)NDC not submitted or not yet included in analysis",
+ 1: "Coastal Zone is a not a vulnerable sector",
+ 2: "Coastal Zone is a vulnerable sector"}},
+ "vulnerability_ecosystems": {"category": "climate change adaptation", "id":{ 0: "(I)NDC not submitted or not yet included in analysis",
+ 1: "Biodiversity and Ecosystems is a not a vulnerable sector",
+ 2: "Biodiversity and Ecosystems is a vulnerable sector"}},
+ "vulnerability_health": {"category": "climate change adaptation","id": {0:"(I)NDC not submitted or not yet included in analysis",
+ 1: "Health is a not a vulnerable sector",
+ 2: "Health is a vulnerable sector"}},
+ "vulnerability_water": {"category": "climate change adaptation","id": {0: "(I)NDC not submitted or not yet included in analysis",
+ 1: "Water is a not a vulnerable sector",
+ 2: "Water is a vulnerable sector"}},
+
+ "costs_of_adaptation": {"category": "climate change adaptation","id":{ 0: "(I)NDC not submitted or not yet included in analysis",
+ 1: "The partial cost of adaptation is tentatively around few million dollars",
+ 2: " The cost of adaptation will be 0-1 billion US$ until 2030",
+ 3: " The cost of adaptation will be 1-5 billion US$ until 2030",
+ 4: " The cost of adaptation will be 5-10 billion US$ until 2030",
+ 5: " The cost of adaptation will be 10-20 billion US$ until 2030",
+ 6: "The cost of adaptation will be more than 20 billion US$ until 2030"}},
+ "costs_of_future_climate_related_hazards": {"category": "climate change adaptation","id":{ 0:"(I)NDC not submitted or not yet included in analysis",
+ 1: "The future losses from climate change will be huge",
+ 2: "The climate hazards cause significant loss to economy and life, and the cost of Future losses could go around few million dollars"}},
+
+ "costs_of_recent_climate_related_hazards": {"category": "climate change adaptation","id":{ 0:"(I)NDC not submitted or not yet included in analysis",
+ 1: "No losses indicated",
+ 2: "In the recent climate hazards there has been significant Economic losses.",
+ 3: "In the recent climate hazards the impact on human life has been significant",
+ 4: "In the recent climate hazards the impact on human life has been significant and the economic loss amounts to 5.3"}},
+ "quantified_adaptation_targets": {"category": "climate change adaptation","id": {0: "(I)NDC not submitted or not yet included in analysis",
+ 1:"No quantitative adaptation target",
+ 2: "In this sector we aim to achieve the adaptation targets of 5.6 in coming few years",
+ 3: "In this sector we aim to achieve the adaptation targets of 5.6 in coming few years",
+ 4: "In this sector we aim to achieve the adaptation targets of 5.6 in coming few years"}},
+
+ "slow_onset_others": {"category": "climate change adaptation","id": {0: "(I)NDC not submitted or not yet included in analysis",
+ 1:"Apart from sea level rise and temperature increase, no other specific slow onset process",
+ 2: "There are other slow onset processes additional to sea level rise and temperature increase like loss of biodiversity, desertification, glacier retreat, salinisation or ocean acidification"}},
+ }
@@ -0,0 +1,86 @@
 
1
+ {"agriculture": {"category": "climate change mitigation","id":{ 0: "(I)NDC not submitted or not yet included in analysis",
2
+ 1: "Agriculture sector is not considered for climate change mitigation",
3
+ 2: "Agriculture sector contribution in greenhouse gases emission is significant and therefore is part of climate change mitigation",
4
+ 3: "Agriculture sector contribution in greenhouse gases emission is significant. Given the importance of agriculture sector for economy and and its adverse contribution in greenhouse gas emissions it is a Focus area for climate change mitigation and needs to be prioritised"}},
5
+
6
+ "energy_efficiency": {"category": "climate change mitigation","id":{ 0: "(I)NDC not submitted or not yet included in analysis",
7
+ 1: "Energy Efficiency is not considered for climate change mitigation",
8
+ 2: "Energy sector contribution in greenhouse gases emission is significant and therefore Energy Efficiency is part of climate change mitigation",
9
+ 3: "Energy sector contribution in greenhouse gases emission is significant. Given the importance of the energy sector for economy and its adverse contribution to greenhouse gas emissions, energy efficiency is a Focus area for climate change mitigation and needs to be prioritised. The quantified renewable energy targets like for example in solar, geothermal, wind power are provided."}},
10
+
11
+ "fossil_fuel_production": {"category": "climate change mitigation","id":{ 0:"(I)NDC not submitted or not yet included in analysis",
12
+ 1:"There is no recorded FFP (2016)",
13
+ 2: "Fossil fuel Production is important for economy",
14
+ 3:"Fossil fuel Production is important to provide for the basic requirements of the people in the country",
15
+ 4:"The country's dependence on Fossil fuel production to meet energy and other requirements cannot be ignored, however the climate change impact due to the same cannot be ignored. The plans and actions as part of climate change mitigation includes measures to address production (e.g. subsidy removal; taxes); cleaner production",
16
+ 5: "Fossil fuel Production is important to provide for the basic requirements of the people in the country.The country's dependence on Fossil fuel production to meet energy and other requirements cannot be ignored, however the climate change impact due to same cannot be ignored. The plans and actions as part of climate change mitigation includes measures to address production (e.g. subsidy removal; taxes); cleaner production"}},
17
+ "fossil_fuel_subsidiaries": {"category": "climate change mitigation","id":{0: "(I)NDC not submitted or not yet included in analysis",
18
+ 1:"fossil Fuel subsidiaries are not considered",
19
+ 2:"the alternates/subsidiaries to fossil Fuel need to be considered to meet the mitigations ambitions",
20
+ 3:"The fossil fuel contribution towards greenhouse gas emissions is very high and therefore there is a need to find the alternatives/substitutes for the same. The replacement of fossil fuels with alternates is a priority focus area in the mitigation actions to meet mitigation ambitions."}},
21
+
22
+ "land_use_and_forestry": {"category": "climate change mitigation", "id":{0:"(I)NDC not submitted or not yet included in analysis",
23
+ 1:"land use and forestry are not considered",
24
+ 2:"the land use and forestry contribute to greenhouse gas emissions and need to be considered to meet the mitigations ambitions",
25
+ 3:"The land use and forestry contribution towards greenhouse gas emissions is significant and therefore there is need to quantify the mitigation potential land use and forestry."}},
26
+ "land_use_change": {"category": "climate change mitigation", "id": {0:"(I)NDC not submitted or not yet included in analysis",
27
+ 1: "land use change Not mentioned",
28
+ 2: "land use change is being considered, but there are no mitigation targets",
29
+ 3: "land use change is being considered as part of mitigation targets",
30
+ 4: "land use change can play an important role in mitigation efforts. As part of mitigation plan there are quantified targets for land use change.",
31
+ 5: "land use change can play an important role in mitigation efforts. As part of mitigation plan there are quantified targets for land use change."}},
32
+
33
+ "renewable_energy": {"category": "climate change mitigation","id":{ 0:"(I)NDC not submitted or not yet included in analysis",
34
+ 1:"renewable energy is not considered",
35
+ 2:"Renewable energy are direct measure to reduce the greenhouse gas emissions and need to be considered to meet the mitigations ambitions",
36
+ 3:"Renewable energy are direct measure to reduce the greenhouse gas emissions and therefore there is need to quantify the mitigation potential in terms of renewable energy targets and specific sub-sectors of action (e.g. solar, geothermal, wind power)"}},
37
+
38
+ "temp_target": {"category": "climate change mitigation", "id": { 0: "(I)NDC not submitted or not yet included in analysis",
39
+ 1:"Not mentioning global effort to limit global temperature increase to 2 degree celsius or 1.5 degree C",
40
+ 2:"there is urgent need to limit global temperature increase to 2 degree celsius",
41
+ 3:"there is urgent need to limit global temperature increase to 1.5 degree C",
42
+ 4:"there is urgent need to limit global temperature increase to 2 degree celsius",
43
+ 5:"there is urgent need to limit global temperature increase to 1.5 degree C"}},
44
+ "waste": {"category": "climate change mitigation","id":{ 0:"(I)NDC not submitted or not yet included in analysis",
45
+ 1:"Waste as a topic is not mentioned",
46
+ 2:"Waste reduction or management can play important role in mitigation plan and ambitions",
47
+ 3:"Waste reduction or management can play an important role in sustainable development and hence is a focus area in mitigation plan and ambitions"}},
48
+ "transport": {"category": "climate change mitigation","id": {0:"(I)NDC not submitted or not yet included in analysis",
49
+ 1:"Transport is not considered",
50
+ 2:"Transport contribute to greenhouse gas emissions and need to be considered to meet the mitigations ambitions",
51
+ 3:"transport sector contribution towards greenhouse gas emissions is significant and therefore there is need to focus/prioritise the transport sector to meet the mitigation potential"}},
52
+
53
+ "reducing_non_co2_gases": {"category": "climate change mitigation","id": {0:"(I)NDC not submitted or not yet included in analysis",
54
+ 1:"Reduction of non CO2 gases not indicated",
55
+ 2:"Efforts should be made in the reduction of non-CO2 gases too."}},
56
+
57
+
58
+ "base_year": {"category": "climate change mitigation","id":{ 0:"(I)NDC not submitted or not yet included in analysis",
59
+ 1: "No base year",
60
+ 2: "the base year or reference point for measurement of emissions is year 19XX"}},
61
+
62
+ "carbon_capture_and_storage": {"category": "climate change mitigation","id":{ 0:"(I)NDC not submitted or not yet included in analysis",
63
+ 1: "carbon capture and storage not indicated",
64
+ 2:"With technological advancement the mitigation efforts can also be in the form of carbon capture and storage.",
65
+ 3: "With technological advancement the mitigation efforts can also be in the form of carbon capture and storage. This should be a focus area and more options need to be explored to do carbon capture and storage."}},
66
+
67
+ "costs_of_ccm": {"category": "climate change mitigation","id":{ 0: "(I)NDC not submitted or not yet included in analysis",
68
+ 1: "(partial) costs not indicated",
69
+ 2: " the mitigation actions and efforts will cost 0-1 billion US$ until 2030",
70
+ 3:"the mitigation actions and efforts will cost 1-5 billion US$ until 2030",
71
+ 4:"the mitigation actions and efforts will cost 5-10 billion US$ until 2030",
72
+ 5: "the mitigation actions and efforts will cost 10-20 billion US$ until 2030",
73
+ 6:"the mitigation actions and efforts will cost more than 20 billion US$ until 2030"}},
74
+
75
+ "market_mechanisms": {"category": "climate change mitigation","id":{ 0:"(I)NDC not submitted or not yet included in analysis",
76
+ 1: "International market mechanisms not mentioned",
77
+ 2:"One good mechanism to deal with greenhouse gas emissions is to explore International market mechanisms",
78
+ 3: "International market mechanisms are not a good way of dealing with mitigation ambitions and therefore should not be considered. Greenhouse gas emissions cannot be part of tradable commodity.",
79
+ 4: "Carbon emissions of greenhouse gases are now a tradable commodity and these can provide a good source of funds for achieving mitigation ambitions. Therefore it is important to explore International market mechanisms. It is important that such means should be explored and there will be a plan of action soon to include these in meeting mitigation targets",
80
+ 5: "Carbon emissions of greenhouse gases are now a tradable commodity and these can provide a good source of funds for achieving mitigation ambitions. Therefore it is important to explore International market mechanisms. It is important that such means should be explored and there will be a plan of action soon to include these in meeting mitigation targets"}},
81
+
82
+ "redd": {"category": "climate change mitigation","id":{ 0:"(I)NDC not submitted or not yet included in analysis",
83
+ 1: "REDD+ not mentioned",
84
+ 2: "Reducing Emissions from Deforestation and Forest Degradation/REDD+",
85
+ 3: "Reducing Emissions from Deforestation and Forest Degradation/REDD+"}},
86
+ }
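The mapping above pairs each (I)NDC indicator key with a thematic category and an id-to-label dictionary, where the numeric id encodes the level of ambition and the string is the descriptive sentence for that level. A minimal lookup sketch (illustrative only; the variable name ndc_labels and the helper are assumptions, not part of this commit):

def lookup_label(ndc_labels: dict, indicator: str, level: int) -> str:
    # indicator is a key such as "renewable_energy"; level is the numeric id
    entry = ndc_labels[indicator]
    text = entry["id"].get(level, "level not defined")
    return f'[{entry["category"]}] {indicator} ({level}): {text}'

# e.g. lookup_label(ndc_labels, "renewable_energy", 3)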
ndcs/countryList.txt ADDED
@@ -0,0 +1,170 @@
1
+ {'Afghanistan': 'AFG',
2
+ 'Albania': 'ALB',
3
+ 'Algeria': 'DZA',
4
+ 'Andorra': 'AND',
5
+ 'Angola': 'AGO',
6
+ 'Antigua and Barbuda': 'ATG',
7
+ 'Argentina': 'ARG',
8
+ 'Armenia': 'ARM',
9
+ 'Australia': 'AUS',
10
+ 'Azerbaijan': 'AZE',
11
+ 'Bahamas': 'BHS',
12
+ 'Bahrain': 'BHR',
13
+ 'Bangladesh': 'BGD',
14
+ 'Barbados': 'BRB',
15
+ 'Belarus': 'BLR',
16
+ 'Belize': 'BLZ',
17
+ 'Benin': 'BEN',
18
+ 'Bhutan': 'BTN',
19
+ 'Bolivia': 'BOL',
20
+ 'Bosnia and Herzegovina': 'BIH',
21
+ 'Botswana': 'BWA',
22
+ 'Brazil ': 'BRA',
23
+ 'Brunei Darussalam': 'BRN',
24
+ 'Burkina Faso': 'BFA',
25
+ 'Burundi ': 'BDI',
26
+ 'Cabo Verde': 'CPV',
27
+ 'Cambodia': 'KHM',
28
+ 'Cameroon': 'CMR',
29
+ 'Canada': 'CAN',
30
+ 'Central African Republic': 'CAF',
31
+ 'Chad': 'TCD',
32
+ 'Chile': 'CHL',
33
+ 'China': 'CHN',
34
+ 'Colombia': 'COL',
35
+ 'Comoros': 'COM',
36
+ 'Congo': 'COG',
37
+ 'Cook Islands': 'COK',
38
+ 'Costa Rica': 'CRI',
39
+ 'Cote dIvoire': 'CIV',
40
+ 'Cuba': 'CUB',
41
+ "Democratic People's Republic of Korea": 'PRK',
42
+ 'Democratic Republic of Congo': 'COD',
43
+ 'Djibouti': 'DJI',
44
+ 'Dominica': 'DMA',
45
+ 'Dominican Republic': 'DOM',
46
+ 'Ecuador': 'ECU',
47
+ 'Egypt': 'EGY',
48
+ 'El Salvador': 'SLV',
49
+ 'Equatorial Guinea': 'GNQ',
50
+ 'Eritrea': 'ERI',
51
+ 'Ethiopia': 'ETH',
52
+ 'European Union': 'EU',
53
+ 'Fiji': 'FJI',
54
+ 'Gabon': 'GAB',
55
+ 'Gambia': 'GMB',
56
+ 'Georgia': 'GEO',
57
+ 'Ghana': 'GHA',
58
+ 'Grenada': 'GRD',
59
+ 'Guatemala': 'GTM',
60
+ 'Guinea': 'GIN',
61
+ 'Guinea Bissau': 'GNB',
62
+ 'Guyana': 'GUY',
63
+ 'Haiti': 'HTI',
64
+ 'Honduras': 'HND',
65
+ 'Iceland': 'ISL',
66
+ 'India': 'IND',
67
+ 'Indonesia': 'IDN',
68
+ 'Iran': 'IRN',
69
+ 'Iraq': 'IRQ',
70
+ 'Israel': 'ISR',
71
+ 'Jamaica': 'JAM',
72
+ 'Japan': 'JPN',
73
+ 'Jordan': 'JOR',
74
+ 'Kazakhstan': 'KAZ',
75
+ 'Kenya': 'KEN',
76
+ 'Kingdom of Eswatini': 'SWZ',
77
+ 'Kiribati': 'KIR',
78
+ 'Kuwait': 'KWT',
79
+ 'Kyrgyzstan': 'KGZ',
80
+ 'Lao Peoples Democratic Republic': 'LAO',
81
+ 'Lebanon': 'LBN',
82
+ 'Lesotho': 'LSO',
83
+ 'Liberia': 'LBR',
84
+ 'Libya': 'LBY',
85
+ 'Liechtenstein': 'LIE',
86
+ 'Madagascar': 'MDG',
87
+ 'Malawi': 'MWI',
88
+ 'Malaysia': 'MYS',
89
+ 'Maldives': 'MDV',
90
+ 'Mali': 'MLI',
91
+ 'Marshall Islands': 'MHL',
92
+ 'Mauritania': 'MRT',
93
+ 'Mauritius': 'MUS',
94
+ 'Mexico': 'MEX',
95
+ 'Micronesia': 'FSM',
96
+ 'Monaco': 'MCO',
97
+ 'Mongolia': 'MNG',
98
+ 'Montenegro': 'MNE',
99
+ 'Morocco': 'MAR',
100
+ 'Mozambique': 'MOZ',
101
+ 'Myanmar': 'MMR',
102
+ 'Namibia': 'NAM',
103
+ 'Nauru': 'NRU',
104
+ 'Nepal': 'NPL',
105
+ 'New Zealand': 'NZL',
106
+ 'Nicaragua': 'NIC',
107
+ 'Niger': 'NER',
108
+ 'Nigeria': 'NGA',
109
+ 'Niue': 'NIU',
110
+ 'Norway': 'NOR',
111
+ 'Oman': 'OMN',
112
+ 'Pakistan': 'PAK',
113
+ 'Palau ': 'PLW',
114
+ 'Palestine': 'PSE',
115
+ 'Panama': 'PAN',
116
+ 'Papua New Guinea': 'PNG',
117
+ 'Paraguay': 'PRY',
118
+ 'Peru': 'PER',
119
+ 'Philippines': 'PHL',
120
+ 'Qatar': 'QAT',
121
+ 'Republic of Moldova': 'MDA',
122
+ 'Republic of North Macedonia': 'MKD',
123
+ 'Russian Federation': 'RUS',
124
+ 'Rwanda': 'RWA',
125
+ 'Saint Kitts and Nevis': 'KNA',
126
+ 'Saint Lucia': 'LCA',
127
+ 'Saint Vincent and the Grenadines': 'VCT',
128
+ 'Samoa': 'WSM',
129
+ 'San Marino': 'SMR',
130
+ 'Sao Tome and Principe': 'STP',
131
+ 'Saudi Arabia': 'SAU',
132
+ 'Senegal': 'SEN',
133
+ 'Serbia': 'SRB',
134
+ 'Seychelles': 'SYC',
135
+ 'Sierra Leone': 'SLE',
136
+ 'Singapore': 'SGP',
137
+ 'Solomon Islands': 'SLB',
138
+ 'Somalia': 'SOM',
139
+ 'South Africa': 'ZAF',
140
+ 'South Korea': 'KOR',
141
+ 'South Sudan': 'SSD',
142
+ 'Sri Lanka': 'LKA',
143
+ 'Sudan': 'SDN',
144
+ 'Suriname': 'SUR',
145
+ 'Switzerland': 'CHE',
146
+ 'Syria': 'SYR',
147
+ 'Tajikistan': 'TJK',
148
+ 'Thailand': 'THA',
149
+ 'Timor Leste': 'TLS',
150
+ 'Togo': 'TGO',
151
+ 'Tonga': 'TON',
152
+ 'Trinidad and Tobago': 'TTO',
153
+ 'Tunisia': 'TUN',
154
+ 'Turkey': 'TUR',
155
+ 'Turkmenistan': 'TKM',
156
+ 'Tuvalu': 'TUV',
157
+ 'Uganda': 'UGA',
158
+ 'Ukraine': 'UKR',
159
+ 'United Arab Emirates': 'ARE',
160
+ 'United Kingdom': 'GBR',
161
+ 'United Republic of Tanzania': 'TZA',
162
+ 'United States of America': 'USA',
163
+ 'Uruguay': 'URY',
164
+ 'Uzbekistan': 'UZB',
165
+ 'Vanuatu': 'VUT',
166
+ 'Venezuela ': 'VEN',
167
+ 'Vietnam': 'VNM',
168
+ 'Yemen': 'YEM',
169
+ 'Zambia': 'ZMB',
170
+ 'Zimbabwe': 'ZWE'}
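countryList.txt holds a single Python dict literal mapping country display names to three-letter codes (ISO 3166-1 alpha-3, plus the non-standard 'EU' entry), so it can be parsed safely with ast.literal_eval rather than eval. A minimal loading sketch (illustrative only, not part of the committed code):

import ast

with open('ndcs/countryList.txt') as f:
    country_list = ast.literal_eval(f.read())

country_names = list(country_list.keys())   # e.g. for a country selectbox
country_code = country_list['Zimbabwe']     # -> 'ZWE'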
packages.txt ADDED
@@ -0,0 +1,2 @@
1
+ poppler-utils
2
+ xpdf
requirements.txt ADDED
@@ -0,0 +1,14 @@
1
+ farm-haystack
2
+ farm-haystack[ocr]
3
+ spacy==3.2.0
4
+ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0.tar.gz#egg=en_core_web_sm
5
+ keybert==0.5.1
6
+ matplotlib==3.5.1
7
+ nltk==3.7
8
+ numpy==1.22.1
9
+ pandas==1.4.0
10
+ pdfplumber==0.6.2
11
+ Pillow==9.1.1
12
+ seaborn==0.11.2
13
+ transformers==4.13.0
14
+ rank_bm25
sample/Ethiopia_s_2021_10 Year Development Plan.txt ADDED
@@ -0,0 +1,737 @@
1
+ Ethiopia 2030: The Pathway to Prosperity
2
+ Ten Years Perspective Development Plan (2021 – 2030)
3
+ 1. Baselines and Assumptions
4
+ 2. Strategic pillars
5
+ 3. Departures
6
+ 4. Macroeconomic goals
7
+ 5. Implications of the COVID-19 pandemic and necessary mitigation measures
8
+ 6. Potentials/capabilities
9
+ 7. Focus areas
10
+ 7.1. Productive sectors
11
+ 7.2. Services sector
12
+ 7.3. Enabling sectors
13
+ 8. Balanced and competitive development (nationally, regionally and locally)
14
+ 9. Monitoring and Evaluation
15
+ Content
16
+ 1. Baselines and Assumptions
17
+ Poverty Reduction (%)
18
+ Key performances of previous years
19
+ 45.5 44.2
20
+ 38.7
21
+ 29.6
22
+ 23.5
23
+ 19
24
+ 0
25
+ 5
26
+ 10
27
+ 15
28
+ 20
29
+ 25
30
+ 30
31
+ 35
32
+ 40
33
+ 45
34
+ 50
35
+ 1994 2000 2005 2011 2016 2020
36
+ Percent
37
+ Year
38
+ Proportion of people living below poverty line
39
+ 10.5
40
+ 8.8
41
+ 10.1
42
+ 7.7
43
+ 9
44
+ 5.19-6.20
45
+ 0 2 4 6 8 10 12
46
+ GTP I: 2011-2015
47
+ GTP II: 2015/16
48
+ GTP II: 2016/17
49
+ GTP II: 2017/18
50
+ GTP II: 2018/19
51
+ GTP II: 2019/20 (projection, with
52
+ COVID-19)
53
+ GDP growth rate (%)
54
+ 1. Baselines and Assumptions
55
+ Share of economic sectors in GDP (%) Merchandise export as % of GDP
56
+ 8.66
57
+ 7.33
58
+ 6.57
59
+ 5.93
60
+ 4.91
61
+ 3.86 3.56 3.37
62
+ 2.77
63
+ 0
64
+ 1
65
+ 2
66
+ 3
67
+ 4
68
+ 5
69
+ 6
70
+ 7
71
+ 8
72
+ 9
73
+ 10
74
+ Percent
75
+ Year
76
+ 46.9
77
+ 45
78
+ 43.5
79
+ 41.4
80
+ 39.5
81
+ 37.1 35.9
82
+ 34.5
83
+ 32.8
84
+ 13.4
85
+ 15
86
+ 17.3
87
+ 18.8
88
+ 21
89
+ 23.5
90
+ 25.7 26.9 27.8
91
+ 4.7 4.8 5 5.3 5.6 6.1 6.9 6.8 6.8
92
+ 7.1
93
+ 8.6
94
+ 10.7 12
95
+ 14.2
96
+ 16.2
97
+ 17.8 19.1 20.1
98
+ 39.8 40.1 39.2 39.8 39.4 38.4 38.6 39.4
99
+ 0
100
+ 5
101
+ 10
102
+ 15
103
+ 20
104
+ 25
105
+ 30
106
+ 35
107
+ 40
108
+ 45
109
+ 50
110
+ 2010/11 2011/12 2012/13 2013/14 2014/15 2015/16 2016/17 2017/18 2018/19
111
+ Percent
112
+ Agriculture Industry Manufacturing Construction Services
113
+ 1. Baselines and Assumptions
114
+ Labour force participation (2013)
115
+ 73%
116
+ 7%
117
+ 20%
118
+ Agriculture
119
+ Industry
120
+ Services
121
+ 7%
122
+ 22%
123
+ 71%
124
+ Agriculture
125
+ Industry
126
+ Services
127
+ Urban labour force participation (2013)
128
+ 1. Baselines and Assumptions
129
+ High and increasing Unemployment Rate
130
+ � Urban unemployment rate = 19.1% in 2018
131
+ � Youth unemployment rate = 25.3 %
132
+ ? Male = 18.6%
133
+ ? Female 30.9 %
134
+ � Rural unemployment rate = 2% in 2013
135
+ � Declining per capita rural land creating
136
+ disguised unemployment
137
+ 402,869
138
+ 471,535
139
+ Male Female Total Male Female Total
140
+ 2014 2018
141
+ 15-19 yr. 20-24 yr. 25-29 yr. Linear (20-24 yr.)
142
+ Number of unemployed people in urban areas
143
+ 1. Baselines and Assumptions
144
+ Challenges
145
+ 1. Macroeconomic imbalances
146
+ ?Sustained high inflation
147
+ ?High and rising unemployment especially
148
+ in urban areas
149
+ ?High and rising debt burden
150
+ ?Chronic foreign currency shortage
151
+ ?Sluggish (though encouraging) rate of
152
+ structural change
153
+ 2. Vulnerability to shocks (COVID-19, Climate
154
+ changes, Desert Locust infestation, etc)
155
+ 3. Poor quality and high inequity in
156
+ infrastructure projects
157
+ 4. Poor quality services in health and
158
+ education
159
+ � High repetition and dropout rates from school
160
+ 1. Baselines and Assumptions
161
+ � Poor quality of growth and slow
162
+ structural change
163
+ � Excessive aid and loan
164
+ dependence for financing
165
+ infrastructural and construction
166
+ investments
167
+ � Limited success in expanding
168
+ manufacturing and modern
169
+ agriculture which have high job
170
+ creation potentials
171
+ � Weak institutional capacity as
172
+ the main culprit of all failures
173
+ ? Provision of quality services
174
+ (electricity, water, telephone,
175
+ internet)
176
+ ? Creation of enough jobs and
177
+ improved living standards
178
+ ? Generation of reliable foreign
179
+ exchange revenue and debt-sustainable
180
+ national economic
181
+ capacity
182
+ ? Completion of development
183
+ projects and investment plans
184
+ under public-private
185
+ partnerships
186
+ � Low reward for merit, productivity and effort
187
+ while low disincentive for laziness, wastefulness
188
+ and corruption
189
+ � Slow institutional change and transformation in:
190
+ ? Government policies
191
+ ? Investor attitude
192
+ ? Youth behaviour
193
+ ? Role of the intellectuals
194
+ � The need for sustained increase in production
195
+ and productivity
196
+ � The need to set a common national vision to
197
+ achieve major successes with consensus and
198
+ popular legitimacy
199
+ Major areas of failure in the economy
200
+ 1. Baselines and Assumptions
201
+ � Poor quality of growth and slow
202
+ structural change
203
+ � Excessive aid and loan
204
+ dependence for financing
205
+ infrastructural and construction
206
+ investments
207
+ � Limited success in expanding
208
+ manufacturing and modern
209
+ agriculture which have high job
210
+ creation potentials
211
+ � Weak institutional capacity as
212
+ the main culprit of all failures
213
+ ? Provision of quality services
214
+ (electricity, water, telephone,
215
+ internet)
216
+ ? Creation of enough jobs and
217
+ improved living standards
218
+ ? Generation of reliable foreign
219
+ exchange revenue and debt-sustainable
220
+ national economic
221
+ capacity
222
+ ? Completion of development
223
+ projects and investment plans
224
+ under public-private
225
+ partnerships
226
+ � Low reward for merit, productivity and effort
227
+ while low disincentive for laziness, wastefulness
228
+ and corruption
229
+ � Slow institutional change and transformation in:
230
+ ? Government policies
231
+ ? Investor attitude
232
+ ? Youth behaviour
233
+ ? Role of the intellectuals
234
+ � The need for sustained increase in production
235
+ and productivity
236
+ � The need to set a common national vision to
237
+ achieve major successes with consensus and
238
+ popular legitimacy
239
+ Major areas of failure in the economy
240
+ 2. Departures
241
+ 1. Emphasis on quality of economic growth
242
+ 2. Participation and coordination of sectors in the planning process
243
+ 3. Sectoral linkages and multi-sectoral development focus
244
+ 4. Preparation of national development corridors based on development potentials
245
+ 5. Focus on solving institutional bottlenecks
246
+ 6. The ongoing home grown economic reform programme as a sprinting board
247
+ 7. Emphasis on resilience building, innovation and entrepreneurship
248
+ 3. Strategic pillars
249
+ 1. Ensure quality growth
250
+ 2. Improve productivity and competitiveness
251
+ 3. Undertake institutional transformation
252
+ 4. Ensure private sector's leadership in the economy
253
+ 5. Ensure equitable participation of women and children
254
+ 6. Build climate resilient green economy
255
+ 3. Strategic pillars
256
+ � Increasing export revenues and substituting imports by
257
+ reducing production costs
258
+ � Availing quality and massive infrastructure
259
+ ? Linking infrastructural development with development corridors
260
+ � Producing required human resources with quality
261
+ � Producing enough and quality human resources
262
+ � Prioritizing innovative production systems
263
+ � Linking incentives with export revenue and job creation
264
+ performances
265
+ � Modernizing and enhancing the logistic system
266
+ � Creating technological competences needed for longterm
267
+ growth
268
+ � The economic growth should ensure:
269
+ ? Participation of all citizens and equitable utilization of the
270
+ growth proceeds
271
+ ? Improved standard of living of every citizen
272
+ ? Reduced poverty in all indicators
273
+ ? Reduced inflation and unemployment
274
+ � The economic growth should lead to increased
275
+ aggregate supply
276
+ � Focus on modern agriculture, manufacturing and
277
+ mining
278
+ � Emphasis on exploiting the sources of growth through
279
+ structural change
280
+ 1.Ensuring quality economic growth 2. Raising production and productivity
281
+ 3. Strategic pillars
282
+ � Build democratic and judicial institutions that ensure elite bargain,
283
+ national consensus, common vision and government legitimacy
284
+ � Build private sector and competition friendly bureaucracy
285
+ � Coordinate with parents, the society and teachers to make
286
+ educational institutions centers of excellence and virtuous citizens
287
+ � Coordinate with parents as well as social and religious leaders to
288
+ encourage religious institutions and their teachings contribute
289
+ towards poverty reduction efforts
290
+ � Prepare policies, strategies and legal frameworks for achieving
291
+ prosperity
292
+ � Increased focus on innovation and research
293
+ � Creating strong social security system
294
+ 3. Institutional Transformation 4. Private sector's leadership in the economy
295
+ � Create conducive investment climate and incentivize
296
+ domestic investors in key sectors
297
+ � Build strong and market-led public-private partnerships in
298
+ order to ensure the establishment of inclusive and
299
+ pragmatic market economy
300
+ � Enhance access and quality of infrastructure to attract
301
+ quality foreign direct investment
302
+ � Identify new sources of growth, empower and stimulate
303
+ the private sector, and supplement the private sector in
304
+ strategic areas
305
+ � Emphasis for public-private partnership on problem
306
+ solving innovations and research activities
307
+ 3. Strategic pillars
308
+ � Ensure gender equity in economic and social
309
+ sectors
310
+ ? Participation of women at all levels of education
311
+ ? Asset ownership of women
312
+ � Ensure fair participation of women and youth in
313
+ leadership and decision making positions
314
+ � Create awareness among citizens about the role of
315
+ women and youth in the country's overall
316
+ development
317
+ � Increase basin development efforts to fight land
318
+ degradation and to reduce pollutions
319
+ � Improve productivity and reduce GHG emissions
320
+ � Increase forest protection and development
321
+ � Increase production of electricity from renewable
322
+ sources for domestic use and for export
323
+ � Focus on modern and energy saving technologies
324
+ 5. Equitable participation of women and children 6. Climate resilient green economy
325
+ 4. Macroeconomic Goals
326
+ Assumptions
327
+ ? Requirement to significantly reduce
328
+ poverty
329
+ ? Available national potentials
330
+ ? Potential for investment in the economy
331
+ ? Existing potentials in each sector
332
+ ? Low productivity that needs to be
333
+ improved
334
+ � Make Ethiopia a middle income
335
+ economy by 2022
336
+ � Raise per capita income to USD 1,115
337
+ in 2022
338
+ ? Threshold for middle-income is USD 1,026
339
+ ? Plus human development index and
340
+ economic vulnerability index
341
+ � Raise per capita income to USD 2,220
342
+ by 2030
343
+ Sectoral growth Targets (2021-2030)
344
+ Assured middle- income potential
345
+ 10.2%
346
+ Average
347
+ Growth
348
+ Target
349
+ Percentage of population below poverty line
350
+ 4. Macroeconomic Goals
351
+ Structural change
352
+ Financing Gaps
353
+ Reduce urban unemployment to less than 9%
354
+ ?1.36 million new jobs need to be
355
+ created per annum
356
+ Sectoral composition of GDP Labour force participation
357
+ Economic
358
+ Sectors
359
+ Performance Target
360
+ 2011 2015 2018/19 2030
361
+ Agriculture 45 39.7 32.8 22.0
362
+ Industry 15.1 21.2 27.6 35.9
363
+ Manufacturing 4.7 5.5 6.8 17.2
364
+ Services 39.9 39 39.4 42.1
365
+ 5. Implications of the COVID-19 pandemic and necessary mitigation measures
366
+ � GDP growth for 2019/20 fiscal year is projected to be lower than its target of 9.0% by between 2.81
367
+ and 3.80 percentage points (equivalent to 58.3 - 78.8 billion birr) due to COVID-19 pandemic
368
+ � If the current scenario continues, next year's GDP growth could decline by 2.8 percentage points
369
+ � Returning the economy to its high growth trajectory requires focusing on sectors with high
370
+ productivity and job creation potentials
371
+ � Public investment should focus on empowering the private sector
372
+ � Promoting both domestic and foreign investments with the right set of incentives (merit based)
373
+ � Modernizing production systems and improving uptake of technology
374
+ � Conducting demand analysis for export commodities to remedy for the declining trend in exports
375
+ and foreign exchange earnings.
376
+ 6. Potentials
377
+ � Endowment of various natural resources contributing to the growth potential
378
+ � Huge unutilized arable land creates great potential for the success of the plan
379
+ � Endowment of gemstones, ornamental, energy, metals, and metallic minerals
380
+ � Gold, coal, iron ore, potash, tantalum, marble, petroleum and other natural resources
381
+ Natural
382
+ Resources
383
+ � Large youth population and potential for demographic dividend
384
+ � Cumulative capacity in education and health
385
+ � Positive attitude and noble culture of reaching agreement among citizens
386
+ Human
387
+ capital
388
+ 6. Potentials
389
+ Built physical and material capitals
390
+ ?Transport and communication
391
+ ? Irrigation infrastructures for modern agriculture
392
+ ?Industrial Parks
393
+ ?Mega energy infrastructures
394
+ Physical
395
+ capital
396
+ Unexploited
397
+ growth
398
+ potentials
399
+ � Utilizing the tourism potential through modernization
400
+ � Using the mining subsector as a source of input as well as a competitive industry in its
401
+ own right
402
+ 6. Potentials
403
+ � Solving supply side bottlenecks to satisfy the existing demand
404
+ � Improving international acceptance and reliable partnerships
405
+ ? The "medemer"/synergy philosophy
406
+ ? The ongoing political reform measures
407
+ ? The Homegrown Economic Reform programme
408
+ � Increased finance from partners and multilateral institutions
409
+ ? Increased availability of foreign exchange
410
+ ? Reduced debt stress for the short to medium term
411
+ ? Increased potential for development
412
+ Increased
413
+ demand as
414
+ potential
415
+ Political Capital
416
+ Continental
417
+ and regional
418
+ integrations
419
+ � Regional and continental economic integration agreements
420
+ � International and continental free trade agreements
421
+ 6. Potentials
422
+ Low
423
+ technology as
424
+ a potential
425
+ � Undeniably low status of technological development
426
+ � International mobility and spillover effect of technology
427
+ � Potential for development and catching up by filling the technological gaps
428
+ � Doubling crop productivity from the current 24-36 quintals per hectare will result
429
+ in 7% increase in crop production
430
+ � Raise the production efficiency of manufacturing from the current 50% to 80%
431
+ 7. Focus Areas
432
+ 7.1. Productive sectors: agriculture, manufacturing, mining
433
+ 7.2. Service sector: tourism
434
+ 7.3. Enabling sectors: energy, transport, sustainable finance,
435
+ innovation and technology, urban development, irrigation,
436
+ human capital development
437
+ 7.1. Productive sectors
438
+ Agriculture Objectives
439
+ 1. Free agriculture from rain dependence
440
+ 2. Agricultural mechanization services
441
+ 3. Contract farming, cluster approach and
442
+ land consolidation
443
+ 4. Livestock, animal feed and animal health
444
+ 5. Horticulture (irrigation and urban farming)
445
+ 6. Private sector participation
446
+ 7. Institutional implementation capacity
447
+ 8. Climate resilient sustainable agricultural
448
+ development
449
+ 1. Improve income and livelihood options for farming and pastoral
450
+ communities through increased productivity and competitiveness
451
+ 2. Modernize agriculture and ensure national food and nutrition security
452
+ 3. Raise export of agricultural output and substitute imports
453
+ 4. Make agriculture a viable and profitable enterprise through value addition
454
+ 5. Create rural employment opportunities
455
+ 6. Enhance livestock health access and quality
456
+ 7. Preserve animal genetic resources and increase pastoral research
457
+ 8. Improve the development of animal feed and access to markets
458
+ 9. Develop livestock specific extension package for each livestock type
459
+ Focus Areas
460
+ 7.1. Productive sector
461
+ Manufacturing Industry
462
+ Objectives
463
+ 1. Production of quality and competitive food, textile, housing and
464
+ pharmaceutical products for export and domestic markets
465
+ 2. Production and productivity of existing manufacturing industries
466
+ 3. Utilization of locally available inputs
467
+ 4. Value chains, linkages and interdependencies
468
+ 5. Linkages between large scale metallurgical and engineering,
469
+ chemical and pharmaceutical industries with other industries
470
+ 6. Job creation, cluster approaches and expanding small and medium
471
+ scale manufacturing
472
+ 7. Private sector participation and partnership
473
+ 1. Establish basis for domestic industrialization
474
+ 2. Value addition through enhanced inter-sectoral
475
+ linkages
476
+ 3. Enhance productivity through private sector
477
+ leadership and supportive role of the
478
+ government
479
+ ? Create job opportunities for the youth leaving
480
+ agriculture and concentrating in urban areas
481
+ ? Make exportable commodities internationally
482
+ competitive
483
+ ? Ensure structural change
484
+ Focus areas
485
+ 7.1. Productive sectors
486
+ Mining
487
+ Objectives
488
+ � Foreign exchange earning and
489
+ domestic revenues
490
+ � Increased investment in mining
491
+ � Participation of manufacturing
492
+ industries that add value
493
+ � Job creation
494
+ � Add value for improved contribution of the subsector
495
+ � Increase inter-sectoral linkages to raise raw material inputs to other
496
+ sectors
497
+ � Make mining a competent subsector and induce structural change
498
+ � Increase human resource and technological capabilities through
499
+ research and trainings
500
+ � Raise foreign exchange revenue from mining through increased
501
+ exploration and production
502
+ � Improve traditional mining production and marketing systems
503
+ � Improve the country�s geological information
504
+ Focus areas
505
+ 7.2. Service sector
506
+ Tourism
507
+ Objectives
508
+ � Identification and developing destinations
509
+ � Infrastructure
510
+ � Competitiveness
511
+ ?improve existing destinations
512
+ ?develop new destinations
513
+ ? diversify service and raise quality
514
+ � Market linkages, branding, and promotion
515
+ � Technology, research and development
516
+ � Preservation, maintenance and proper
517
+ utilization of heritage resources
518
+ � Expand job opportunities
519
+ � Raise incomes
520
+ � Build information management
521
+ systems
522
+ � Increase implementation capacity
523
+ Focus areas
524
+ 7.3. Enabling sectors
525
+ Urban development
526
+ Objectives
527
+ ? Prioritize productive sectors in job creation and enterprise
528
+ development plans
529
+ ? Rapid development and equity goals in land provision system
530
+ ? Participation of indigenous people in land redevelopment and
531
+ expansion
532
+ ? Urban land registration and cadaster system, modern
533
+ property valuation
534
+ ? Greenery and public spaces as well as waste disposal and
535
+ management in urban planning and implementation
536
+ ? Housing development and financing options to reduce
537
+ housing shortages
538
+ ? Integrated infrastructure and services provision
539
+ ? Role of private sector in infrastructure development and
540
+ service provision
541
+ � Expand micro and small-scale
542
+ enterprises to reduce urban
543
+ unemployment
544
+ � Develop and avail urban land based on
545
+ demand, equity and cost effectiveness
546
+ � Make quality housing accessible both in
547
+ rural and urban areas
548
+ � Develop quality and integrated
549
+ infrastructure as well as service
550
+ provision in towns
551
+ � Improve financial management and
552
+ resource utilization in urban areas
553
+ Focus areas
554
+ 7.3. Enabling sectors
555
+ Innovation and Technology
556
+ Objectives
557
+ ? Access to innovation and
558
+ technological information
559
+ ? Developing a digital economy
560
+ ? Productivity enhancement and
561
+ competitiveness
562
+ ? Build a digital economy
563
+ ? Develop national scientific research and technological
564
+ capabilities
565
+ ? Support problem solving research and development of
566
+ technologies necessary for raising production,
567
+ productivity and service provision
568
+ ? Create jobs and capital that are based on technology
569
+ ? Develop technological and data security protection
570
+ systems
571
+ Focus areas
572
+ 7.3. Enabling sectors
573
+ Sustainable finance
574
+ Objectives
575
+ � Access to modern finance and saving culture in rural
576
+ areas
577
+ � Support to the private sector and corporations to
578
+ reinvest profits in productive sectors
579
+ � Role of private financial institutions in manufacturing
580
+ and agriculture
581
+ � Digital revenue collection system
582
+ � Tax equity (contraband, tax evasion, and bringing the
583
+ underground economy to the tax system)
584
+ � Domestic and foreign strategic partnerships
585
+ � Transform financing from short term to long-term,
586
+ sustainable and quality sources
587
+ � Ensure financing quality based on sectoral prioritization
588
+ and reduction of wastage
589
+ � Increase the number of domestic saving institutions both
590
+ in rural and urban areas
591
+ � Support domestic finance with foreign exchange capacity
592
+ and foreign direct investment
593
+ � Modernize domestic revenue collection system
594
+ � Raise voluntary tax payment attitude
595
+ � Bring the informal sector to the formal tax system
596
+ Focus areas
597
+ 7.3. Enabling sectors
598
+ Transport
599
+ Objectives
600
+ � Access to infrastructure
601
+ � Implementation capacity
602
+ � Participation of the private sector and the general
603
+ public
604
+ � Financing capacity
605
+ � Ensure equitable access to transport infrastructure and
606
+ services
607
+ � Improve transport safety
608
+ � Make logistics services fast and reliable
609
+ � Build transport infrastructure and service that is
610
+ resilient to climate change
611
+ Focus areas
612
+ 7.3. Enabling sectors
613
+ Energy
614
+ Objectives
615
+ ? Equity in access to electricity services
616
+ ? Energy access and quality
617
+ ? Alternative sources of energy
618
+ ? Reliability of electricity infrastructure
619
+ ? Investment and income in energy subsector
620
+ � Ensure equitable access to transport
621
+ infrastructure and services
622
+ � Improve transport safety
623
+ � Make logistics services fast and reliable
624
+ � Build transport infrastructure and service that is
625
+ resilient to climate change
626
+ Focus areas
627
+ 7.3. Enabling sectors
628
+ Irrigation
629
+ Objectives
630
+ ? Medium and large scale irrigation infrastructure
631
+ ? Job creation
632
+ ? Share of government expenditure and alternative
633
+ financing options
634
+ ? Institutional capacity and human resource
635
+ development
636
+ ? Improve agricultural output and productivity
637
+ ? Reduce government spending and enhance
638
+ institutional capacity and human resources
639
+ development
640
+ ? Ensure the inclusion of all genders and
641
+ disabled citizens
642
+ ? Develop alternative financing options for
643
+ irrigation development
644
+ Focus areas
645
+ 7.3. Enabling sectors
646
+ Human capital development
647
+ Objectives
648
+ � Make education and training inclusive and equitable by
649
+ harmonizing the system with ability, need and capacity
650
+ � Develop capacity of educational institutions (teacher capacity,
651
+ inputs and technology)
652
+ � Establish education and training quality assurance system
653
+ � Avail free and compulsory education for pre-primary to junior
654
+ secondary levels and free education at the senior secondary levels
655
+ equitably
656
+ � Ensure the relevance of education and training system and
657
+ synchronize education policy with economic and social
658
+ development needs
659
+ � Make the education and training policy compatible with the
660
+ nation's contemporary capacities as well as global and regional
661
+ market opportunities
662
+ � Enhance commitment, capability and responsibility of citizens
663
+ ? Ensure equitable and quality health services
664
+ ? Raise average life expectancy
665
+ ? Achieve universal health coverage through
666
+ proactive and prevention health system
667
+ ? Curtail preventable maternal and child deaths
668
+ ? Reduce incidences of contagious and noncontagious
669
+ related diseases and deaths
670
+ ? Build capacity for health tourism through
671
+ increased treatment capabilities
672
+ ? Create a healthy society that is free from
673
+ addictions and use technology for supporting
674
+ knowledge led economic development
675
+ Focus areas
676
+ 8 Nationally, regionally and locally balanced and competitive development
677
+ 1. Lack of synchronization of investment with
678
+ resource potentials and development needs
679
+ 2. Poor alignment of federal, regional and
680
+ district level investment plans with the
681
+ national development goals and envisioned
682
+ settlement patterns
683
+ 3. Poor regional coordination due to low
684
+ consideration for trans-regional and
685
+ spatial issues in development plans of
686
+ regional states
687
+ 4. Inter-regional and intra-regional
688
+ disparities in infrastructural development
689
+ and access to services
690
+ Challenges
691
+ 8. Nationally, regionally and locally balanced and competitive development
692
+ 1. Ensure that the investment flow and
693
+ infrastructural development plans fairly go hand in
694
+ hand with resource potential and development
695
+ needs
696
+ ?Developing underutilized natural resources
697
+ ?Equitable distribution and access to
698
+ infrastructure
699
+ ?Sustainable environmental protection
700
+ 2. Ensure the inclusion of pastoral and agro-pastoral
701
+ areas in the development
702
+ ?Focused infrastructural development in pastoral
703
+ areas such as education and health sector input
704
+ provision as well as governance
705
+ ?Market linkages with other areas and the central
706
+ markets
707
+ ?Improve rural finance (credit and insurance) to
708
+ encourage fattening, milk processing, leather
709
+ production and irrigation agriculture
710
+ Focus areas
711
+ 9. Monitoring and Evaluation
712
+ 10 Years Perspective
713
+ Plan KPIs
714
+ Federal Implementing
715
+ Institutions
716
+ Planning and
717
+ Development Commission
718
+ Generate Data (Census,
719
+ Sample and administrative
720
+ data)
721
+ Annual Reports
722
+ Dialogue forums
723
+ (Civic Organizations, professional
724
+ associations, development partners,
725
+ intellectuals)
726
+ Central Statistical Agency
727
+ Database
728
+ National
729
+ Information Portal
730
+ National Statistics
731
+ Development Strategic
732
+ plan
733
+ Evaluation Reports
734
+ Prime Minister's Office
735
+ House of People's
736
+ Representatives
737
+ Thank you!
sample/South Africa_s Low Emission Development Strategy.txt ADDED
The diff for this file is too large to render. See raw diff
 
style.css ADDED
@@ -0,0 +1,179 @@
1
+
2
+ .row-widget.stTextInput > div:first-of-type {
3
+ background: #fff;
4
+ display: flex;
5
+ border: 1px solid #dfe1e5;
6
+ box-shadow: none;
7
+ border-radius: 24px;
8
+ height: 50px;
9
+ width: auto;
10
+ margin: 10px auto 30px;
11
+ }
12
+
13
+ .row-widget.stTextInput > div:first-of-type:hover,
14
+ .row-widget.stTextInput > div:first-of-type:focus {
15
+ box-shadow: 1px 1px 2px 1px rgba(0, 0, 0, 0.2);
16
+ }
17
+
18
+ .row-widget.stTextInput .st-bq {
19
+ background-color: #fff;
20
+ }
21
+
22
+ .row-widget.stTextInput > label {
23
+ color: #b3b3b3;
24
+ }
25
+
26
+ .row-widget.stButton > button {
27
+ border-radius: 24px;
28
+ background-color: #B6C9B1;
29
+ color: #fff;
30
+ border: none;
31
+ padding: 6px 20px;
32
+ float: right;
33
+ background-image: none;
34
+ }
35
+
36
+ .row-widget.stButton > button:hover {
37
+ box-shadow: 1px 1px 2px 1px rgba(0, 0, 0, 0.2);
38
+ }
39
+
40
+ .row-widget.stButton > button:focus {
41
+ border: none;
42
+ color: #fff;
43
+ }
44
+
45
+ .footer-custom {
46
+ position: fixed;
47
+ bottom: 0;
48
+ width: 100%;
49
+ color: var(--text-color);
50
+ max-width: 698px;
51
+ font-size: 14px;
52
+ height: 50px;
53
+ padding: 10px 0;
54
+ z-index: 50;
55
+ }
56
+
57
+ .main {
58
+ padding: 20px;
59
+ }
60
+
61
+ footer {
62
+ display: none !important;
63
+ }
64
+
65
+ .footer-custom a {
66
+ color: var(--text-color);
67
+ }
68
+
69
+ #wikipedia-assistant {
70
+ font-size: 36px;
71
+ }
72
+
73
+ .generated-answer p {
74
+ font-size: 16px;
75
+ font-weight: bold;
76
+ }
77
+
78
+ .react-json-view {
79
+ margin: 40px 0 80px;
80
+ }
81
+
82
+ .tooltip {
83
+ text-align: center;
84
+ line-height: 20px;
85
+ display: table-caption;
86
+ font-size: 10px;
87
+ border-radius: 50%;
88
+ height: 20px;
89
+ width: 20px;
90
+ position: relative;
91
+ cursor: pointer;
92
+ color:#000;
93
+ }
94
+
95
+ .tooltip .tooltiptext {
96
+ visibility: hidden;
97
+ width: 280px;
98
+ text-align: center;
99
+ border-radius: 6px;
100
+ padding: 10px;
101
+ position: absolute;
102
+ z-index: 1;
103
+ top: 25px;
104
+ left: 50%;
105
+ margin-left: -140px;
106
+ font-size: 14px;
107
+ background-color: #fff;
108
+ border: 1px solid #ccc;
109
+ box-shadow: 0px 0px 3px 1px rgba(0, 0, 0, 0.16);
110
+ color: #000;
111
+ }
112
+
113
+ .tooltip:hover .tooltiptext {
114
+ visibility: visible;
115
+ }
116
+
117
+ .sentence-wrapper {
118
+ border-left: 4px solid #ffc423;
119
+ padding-left: 20px;
120
+ margin-bottom: 40px;
121
+ }
122
+
123
+ #context {
124
+ padding: 2rem 0 1rem;
125
+ }
126
+
127
+ hr {
128
+ margin: 2em 0 1em;
129
+ }
130
+
131
+ .technical-details-info {
132
+ margin-bottom: 100px;
133
+ }
134
+
135
+ .loader-wrapper {
136
+ display: flex;
137
+ align-items: center;
138
+ background-color: rgba(250, 202, 43, 0.2);
139
+ padding: 15px 20px;
140
+ border-radius: 6px;
141
+ }
142
+
143
+ .loader-wrapper p {
144
+ margin-bottom: 0;
145
+ margin-left: 20px;
146
+ }
147
+
148
+ .loader {
149
+ width: 30px;
150
+ height: 30px;
151
+ border: dotted 5px #868686;
152
+ border-radius: 100%;
153
+ animation: spin 1s linear infinite;
154
+ }
155
+
156
+ .loader-note {
157
+ font-size: 14px;
158
+ color: #b3b3b3;
159
+ margin-left: 5px;
160
+ }
161
+
162
+ @keyframes spin {
163
+ 0% {
164
+ transform: rotate(0deg) scale(0.8);
165
+ border-top-color: transparent;
166
+ border-right-color: transparent;
167
+ }
168
+ 50% { transform: rotate(180deg) scale(1.2);
169
+ border-color: #949494;
170
+ border-top-color: transparent;
171
+ border-right-color: transparent;
172
+ }
173
+ 100% { transform: rotate(360deg) scale(0.8);
174
+ border-color: #bbbbbb;
175
+ border-top-color: transparent;
176
+ border-right-color: transparent;
177
+ }
178
+ }
179
+
udfPreprocess/__init__.py ADDED
@@ -0,0 +1 @@
1
+ # adding for package implementation
udfPreprocess/cleaning.py ADDED
@@ -0,0 +1,156 @@
1
+ import pandas as pd
2
+ import numpy as np
3
+ import string
4
+ import nltk
5
+ import spacy
6
+ import en_core_web_sm
7
+ import re
8
+ import streamlit as st
9
+
10
+ from haystack.nodes import PreProcessor
11
+
12
+ '''basic cleaning - suitable for transformer models'''
13
+ def basic(s):
14
+ """
15
+ :param s: string to be processed
16
+ :return: processed string: see comments in the source code for more info
17
+ """
18
+ # Text Lowercase
19
+ #s = s.lower()
20
+ # Remove punctuation
21
+ #translator = str.maketrans(' ', ' ', string.punctuation)
22
+ #s = s.translate(translator)
23
+ # Remove URLs
24
+ s = re.sub(r'^https?:\/\/.*[\r\n]*', ' ', s, flags=re.MULTILINE)
25
+ s = re.sub(r"http\S+", " ", s)
26
+ # Remove new line characters
27
+ #s = re.sub('\n', ' ', s)
28
+
29
+ # Remove distracting single quotes
30
+ #s = re.sub("\'", " ", s)
31
+ # Remove all remaining numbers and non alphanumeric characters
32
+ #s = re.sub(r'\d+', ' ', s)
33
+ #s = re.sub(r'\W+', ' ', s)
34
+
35
+ # define custom words to replace:
36
+ #s = re.sub(r'strengthenedstakeholder', 'strengthened stakeholder', s)
37
+
38
+ return s.strip()
39
+
40
+
41
+ def preprocessingForSDG(document):
42
+
43
+ """
44
+ takes in haystack document object and splits it into paragraphs and applies simple cleaning.
45
+
46
+ Returns cleaned list of haystack document objects. One paragraph per object. Also returns pandas df and
47
+ list that contains all text joined together.
48
+ """
49
+
50
+ preprocessor = PreProcessor(
51
+ clean_empty_lines=True,
52
+ clean_whitespace=True,
53
+ clean_header_footer=True,
54
+ split_by="word",
55
+ split_length=120,
56
+ split_respect_sentence_boundary=False,
57
+ #split_overlap=1
58
+ )
59
+ for i in document:
60
+ docs_processed = preprocessor.process([i])
61
+ for item in docs_processed:
62
+ item.content = basic(item.content)
63
+
64
+ st.write("your document has been split into", len(docs_processed), "paragraphs")
65
+
66
+ # create dataframe of text and list of all text
67
+ df = pd.DataFrame(docs_processed)
68
+ all_text = " ".join(df.content.to_list())
69
+ par_list = df.content.to_list()
70
+
71
+ return docs_processed, df, all_text, par_list
72
+
73
+ def preprocessing(document):
74
+
75
+ """
76
+ takes in haystack document object and splits it into paragraphs and applies simple cleaning.
77
+
78
+ Returns cleaned list of haystack document objects. One paragraph per object. Also returns pandas df and
79
+ list that contains all text joined together.
80
+ """
81
+
82
+ preprocessor = PreProcessor(
83
+ clean_empty_lines=True,
84
+ clean_whitespace=True,
85
+ clean_header_footer=True,
86
+ split_by="sentence",
87
+ split_length=3,
88
+ split_respect_sentence_boundary=False,
89
+ split_overlap=1
90
+ )
91
+ for i in document:
92
+ docs_processed = preprocessor.process([i])
93
+ for item in docs_processed:
94
+ item.content = basic(item.content)
95
+
96
+ st.write("your document has been split into", len(docs_processed), "paragraphs")
97
+
98
+ # create dataframe of text and list of all text
99
+ df = pd.DataFrame(docs_processed)
100
+ all_text = " ".join(df.content.to_list())
101
+ par_list = df.content.to_list()
102
+
103
+ return docs_processed, df, all_text, par_list
104
+
105
+ '''processing with spacy - suitable for models such as tf-idf, word2vec'''
106
+ def spacy_clean(alpha:str, use_nlp:bool = True) -> str:
107
+
108
+ """
109
+
110
+ Clean and tokenise a string using Spacy. Keeps only alphabetic characters, removes stopwords and
111
+
112
+ filters out all but proper nouns, nouns, verbs and adjectives.
113
+
114
+ Parameters
115
+ ----------
116
+ alpha : str
117
+
118
+ The input string.
119
+
120
+ use_nlp : bool, default True
121
+
122
+ Indicates whether Spacy needs to use NLP. Enable this when using this function on its own.
123
+
124
+ Should be set to False if used inside nlp.pipeline
125
+
126
+ Returns
127
+ -------
128
+ ' '.join(beta) : a concatenated list of lemmatised tokens, i.e. a processed string
129
+
130
+ Notes
131
+ -----
132
+ Fails if alpha is an NA value. Performance decreases as len(alpha) gets large.
133
+ Use together with nlp.pipeline for batch processing.
134
+
135
+ """
136
+
137
+ nlp = spacy.load("en_core_web_sm", disable=["parser", "ner", "textcat"])
138
+
139
+ if use_nlp:
140
+
141
+ alpha = nlp(alpha)
142
+
143
+
144
+
145
+ beta = []
146
+
147
+ for tok in alpha:
148
+
149
+ if all([tok.is_alpha, not tok.is_stop, tok.pos_ in ['PROPN', 'NOUN', 'VERB', 'ADJ']]):
150
+
151
+ beta.append(tok.lemma_)
152
+
153
+
154
+ text = ' '.join(beta)
155
+ text = text.lower()
156
+ return text
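A rough usage sketch for the three helpers above (illustrative only; docs is assumed to be the list of haystack Documents returned by load_document in docPreprocessing.py below):

import udfPreprocess.cleaning as clean

# word-based split (~120 words per paragraph), e.g. ahead of classification
paragraphs, par_df, all_text, par_list = clean.preprocessingForSDG(docs)

# sentence-window split (3 sentences, overlap 1), e.g. ahead of search
windows, win_df, win_text, win_list = clean.preprocessing(docs)

# lemmatised, stopword-free text for count-based models such as tf-idf or word2vec
tokenised = [clean.spacy_clean(p) for p in par_list]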
udfPreprocess/docPreprocessing.py ADDED
@@ -0,0 +1,75 @@
1
+ from typing import Callable, Dict, List, Optional
2
+
3
+ from pathlib import Path
4
+ import re
5
+ import logging
6
+ import string
7
+ import streamlit as st
8
+ logger = logging.getLogger(__name__)
9
+
10
+ import os
11
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
12
+
13
+ from haystack.utils import convert_files_to_docs, fetch_archive_from_http
14
+ from haystack.nodes.file_converter import BaseConverter, DocxToTextConverter
15
+ from haystack.nodes.file_converter import PDFToTextConverter, TextConverter
16
+ from haystack.schema import Document
17
+ import pdfplumber
18
+
19
+ import pandas as pd
20
+
21
+ import tempfile
22
+ import sqlite3
23
+
24
+
25
+
26
+ def load_document(
27
+ file_path: str,
28
+ file_name,
29
+ encoding: Optional[str] = None,
30
+ id_hash_keys: Optional[List[str]] = None,
31
+ ) -> List[Document]:
32
+
33
+ """
34
+ takes docx, txt and pdf files as input and \
35
+ extracts text as well as the filename as metadata. \
36
+ Since haystack does not take care of all pdf files, \
37
+ pdfplumber is attached to the pipeline in case the pdf \
38
+ extraction fails via Haystack.
39
+
40
+ Returns a list of type haystack.schema.Document
41
+ """
42
+
43
+ if file_name.endswith('.pdf'):
44
+ converter = PDFToTextConverter(remove_numeric_tables=True)
45
+ if file_name.endswith('.txt'):
46
+ converter = TextConverter()
47
+ if file_name.endswith('.docx'):
48
+ converter = DocxToTextConverter()
49
+
50
+
51
+ documents = []
52
+ logger.info("Converting {}".format(file_name))
53
+ # PDFToTextConverter, TextConverter, and DocxToTextConverter
54
+ # return a list containing a single Document
55
+ document = converter.convert(
56
+ file_path=file_path, meta=None,
57
+ encoding=encoding, id_hash_keys=id_hash_keys
58
+ )[0]
59
+ text = document.content
60
+ documents.append(Document(content=text,
61
+ meta={"name": file_name},
62
+ id_hash_keys=id_hash_keys))
63
+
64
+ '''check if text is empty and apply different pdf processor. \
65
+ This can happen with certain pdf types.'''
66
+ for i in documents:
67
+ if i.content == "":
68
+ st.write("using pdfplumber")
69
+ text = []
70
+ with pdfplumber.open(file_path) as pdf:
71
+ for page in pdf.pages:
72
+ text.append(page.extract_text())
73
+ i.content = ' '.join([page for page in text])
74
+
75
+ return documents
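For completeness, a hypothetical call site (an assumption, not part of this module): a Streamlit upload is written to a temporary file and handed to load_document; the file_name argument drives the converter choice (.pdf, .txt or .docx), with the pdfplumber fallback covering PDFs that Haystack cannot extract.

import tempfile
import streamlit as st
import udfPreprocess.docPreprocessing as pre

uploaded = st.file_uploader('Upload a policy document', type=['pdf', 'txt', 'docx'])
if uploaded is not None:
    with tempfile.NamedTemporaryFile(mode='wb', delete=False) as tmp:
        tmp.write(uploaded.read())   # persist the upload so the converters can read a file path
    docs = pre.load_document(file_path=tmp.name, file_name=uploaded.name)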