prashant committed on
Commit
49a314a
1 Parent(s): f47e7d4

ver0.2 udfpreprocess update

Browse files
udfPreprocess/cleaning.py CHANGED
@@ -1,3 +1,4 @@
 
1
  import pandas as pd
2
  import numpy as np
3
  import string
@@ -10,7 +11,7 @@ import streamlit as st
10
  from haystack.nodes import PreProcessor
11
 
12
  '''basic cleaning - suitable for transformer models'''
13
- def basic(s):
14
  """
15
  :param s: string to be processed
16
  :return: processed string: see comments in the source code for more info
@@ -23,6 +24,15 @@ def basic(s):
23
  # Remove URLs
24
  s = re.sub(r'^https?:\/\/.*[\r\n]*', ' ', s, flags=re.MULTILINE)
25
  s = re.sub(r"http\S+", " ", s)
 
 
 
 
 
 
 
 
 
26
  # Remove new line characters
27
  #s = re.sub('\n', ' ', s)
28
 
@@ -59,9 +69,10 @@ def preprocessingForSDG(document):
59
  for i in document:
60
  docs_processed = preprocessor.process([i])
61
  for item in docs_processed:
62
- item.content = basic(item.content)
63
 
64
- st.write("your document has been splitted to", len(docs_processed), "paragraphs")
 
65
 
66
  # create dataframe of text and list of all text
67
  df = pd.DataFrame(docs_processed)
@@ -93,7 +104,8 @@ def preprocessing(document):
93
  for item in docs_processed:
94
  item.content = basic(item.content)
95
 
96
- st.write("your document has been splitted to", len(docs_processed), "paragraphs")
 
97
 
98
  # create dataframe of text and list of all text
99
  df = pd.DataFrame(docs_processed)
 
1
+ import logging
2
  import pandas as pd
3
  import numpy as np
4
  import string
 
11
  from haystack.nodes import PreProcessor
12
 
13
  '''basic cleaning - suitable for transformer models'''
14
+ def basic(s,SDG = False):
15
  """
16
  :param s: string to be processed
17
  :return: processed string: see comments in the source code for more info
 
24
  # Remove URLs
25
  s = re.sub(r'^https?:\/\/.*[\r\n]*', ' ', s, flags=re.MULTILINE)
26
  s = re.sub(r"http\S+", " ", s)
27
+ if SDG == True:
28
+ s = s.lower()
29
+ translator = str.maketrans(' ', ' ', string.punctuation)
30
+ s = s.translate(translator)
31
+ s = re.sub('\n', ' ', s)
32
+ s = re.sub("\'", " ", s)
33
+ s = re.sub(r'\d+', ' ', s)
34
+ s = re.sub(r'\W+', ' ', s)
35
+
36
  # Remove new line characters
37
  #s = re.sub('\n', ' ', s)
38
 
 
69
  for i in document:
70
  docs_processed = preprocessor.process([i])
71
  for item in docs_processed:
72
+ item.content = basic(item.content, SDG = True)
73
 
74
+ with st.spinner("👑 document being splitted into paragraphs"):
75
+ logging.info("document has been splitted to {} paragraphs".format(len(docs_processed)))
76
 
77
  # create dataframe of text and list of all text
78
  df = pd.DataFrame(docs_processed)
 
104
  for item in docs_processed:
105
  item.content = basic(item.content)
106
 
107
+ with st.spinner("👑 document being splitted into paragraphs"):
108
+ logging.info("document has been splitted to {} paragraphs".format(len(docs_processed)))
109
 
110
  # create dataframe of text and list of all text
111
  df = pd.DataFrame(docs_processed)
udfPreprocess/docPreprocessing.py CHANGED
@@ -65,11 +65,11 @@ def load_document(
65
  This can happen whith certain pdf types.'''
66
  for i in documents:
67
  if i.content == "":
68
- st.write("using pdfplumber")
69
- text = []
70
- with pdfplumber.open(file_path) as pdf:
71
- for page in pdf.pages:
72
- text.append(page.extract_text())
73
- i.content = ' '.join([page for page in text])
74
 
75
  return documents
 
65
  This can happen whith certain pdf types.'''
66
  for i in documents:
67
  if i.content == "":
68
+ with st.spinner("using pdfplumber"):
69
+ text = []
70
+ with pdfplumber.open(file_path) as pdf:
71
+ for page in pdf.pages:
72
+ text.append(page.extract_text())
73
+ i.content = ' '.join([page for page in text])
74
 
75
  return documents
udfPreprocess/paramconfig.cfg ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [lexical_search]
2
+ TOP_K = 10
3
+ THRESHOLD = 0.1
4
+
5
+ [semantic_search]
6
+ TOP_K = 10
7
+ MAX_SEQ_LENGTH = 64
8
+ MODEL_NAME = msmarco-distilbert-cos-v5
9
+ THRESHOLD = 0.1
10
+
11
+ [sdg]
12
+ THRESHOLD = 0.85
udfPreprocess/sdg.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import glob, os, sys;
2
+ sys.path.append('../udfPreprocess')
3
+
4
+ #import helper
5
+ import udfPreprocess.docPreprocessing as pre
6
+ import udfPreprocess.cleaning as clean
7
+
8
+ #import needed libraries
9
+ import seaborn as sns
10
+ from pandas import DataFrame
11
+ from keybert import KeyBERT
12
+ from transformers import pipeline
13
+ import matplotlib.pyplot as plt
14
+ import numpy as np
15
+ import streamlit as st
16
+ import pandas as pd
17
+ import docx
18
+ from docx.shared import Inches
19
+ from docx.shared import Pt
20
+ from docx.enum.style import WD_STYLE_TYPE
21
+
22
+ import tempfile
23
+ import sqlite3
24
+ import logging
25
+ logger = logging.getLogger(__name__)
26
+ import configparser
27
+
28
@st.cache(allow_output_mutation=True)
def load_sdgClassifier():
    """Create the SDG text-classification pipeline.

    The function is wrapped in ``st.cache(allow_output_mutation=True)`` so
    Streamlit can hand back the same pipeline object on later calls.

    :return: ``transformers`` text-classification pipeline built from the
        ``jonas/sdg_classifier_osdg`` model
    """
    sdg_pipeline = pipeline(
        "text-classification",
        model="jonas/sdg_classifier_osdg",
    )
    logging.info("Loading classifier")
    return sdg_pipeline
33
+
34
def sdg_classification(par_list):
    """Classify paragraphs by SDG and keep only the confident predictions.

    The score threshold is read from the ``[sdg]`` section (``THRESHOLD``) of
    ``udfPreprocess/paramconfig.cfg``; the path is relative to the current
    working directory.

    :param par_list: sequence of paragraph strings to classify
    :return: tuple ``(df, counts)``. ``df`` is a DataFrame with the columns
        ``SDG`` and ``text`` containing the predictions whose score is
        strictly greater than the threshold, ordered by descending score and
        indexed from 1. ``counts`` is ``df['SDG'].value_counts()``.
    """
    logging.info("running SDG classifiication")

    # Nothing to classify: return empty results with the usual column layout
    # instead of loading the model and calling it on an empty batch.
    if len(par_list) == 0:
        empty = DataFrame(columns=["SDG", "text"])
        return empty, empty["SDG"].value_counts()

    config = configparser.ConfigParser()
    # Context manager so the config file handle is closed deterministically.
    with open('udfPreprocess/paramconfig.cfg') as cfg_file:
        config.read_file(cfg_file)
    threshold = config.getfloat('sdg', 'THRESHOLD')

    classifier = load_sdgClassifier()
    predictions = classifier(par_list)

    scored = [(pred['label'], pred['score']) for pred in predictions]
    df = DataFrame(scored, columns=["SDG", "Relevancy"])
    df['text'] = par_list

    # Rank by score first so the 1-based index reflects the ranking, then
    # drop everything at or below the threshold.
    df = df.sort_values(by="Relevancy", ascending=False).reset_index(drop=True)
    df.index += 1
    df = df[df['Relevancy'] > threshold]

    counts = df['SDG'].value_counts()
    # drop() returns a new frame, so the caller never sees the score column.
    return df.drop(columns=['Relevancy']), counts
udfPreprocess/search.py ADDED
@@ -0,0 +1,145 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import glob, os, sys; sys.path.append('../udfPreprocess')
2
+
3
+ #import helper
4
+ import udfPreprocess.docPreprocessing as pre
5
+ import udfPreprocess.cleaning as clean
6
+
7
+ #import needed libraries
8
+ import seaborn as sns
9
+ from pandas import DataFrame
10
+ from sentence_transformers import SentenceTransformer, CrossEncoder, util
11
+ # from keybert import KeyBERT
12
+ from transformers import pipeline
13
+ import matplotlib.pyplot as plt
14
+ import numpy as np
15
+ import streamlit as st
16
+ import pandas as pd
17
+ from rank_bm25 import BM25Okapi
18
+ from sklearn.feature_extraction import _stop_words
19
+ import string
20
+ from tqdm.autonotebook import tqdm
21
+ import numpy as np
22
+ import docx
23
+ from docx.shared import Inches
24
+ from docx.shared import Pt
25
+ from docx.enum.style import WD_STYLE_TYPE
26
+ import logging
27
+ logger = logging.getLogger(__name__)
28
+ import tempfile
29
+ import sqlite3
30
+ import configparser
31
+
32
+ ### These are lexical search related functions/methods #####
33
+
34
def bm25_tokenizer(text):
    """Split ``text`` into lower-cased tokens for BM25 indexing/querying.

    Each whitespace-separated word has leading and trailing punctuation
    stripped; words that end up empty or that appear in scikit-learn's
    English stop-word list are discarded.

    :param text: string to tokenize
    :return: list of token strings, in their original order
    """
    stripped_words = (
        word.strip(string.punctuation) for word in text.lower().split()
    )
    return [
        word
        for word in stripped_words
        if word and word not in _stop_words.ENGLISH_STOP_WORDS
    ]
42
+
43
def bm25TokenizeDoc(paraList):
    """Tokenize every passage of a document for BM25.

    NOTE: splitting passages longer than 256 words into two chunks was
    drafted earlier but is disabled for now; each passage is tokenized as a
    whole.

    :param paraList: iterable of passage strings
    :return: list with one token list (see ``bm25_tokenizer``) per passage
    """
    # tqdm only wraps the iterable to show progress.
    return [bm25_tokenizer(passage) for passage in tqdm(paraList)]
60
+
61
def lexical_search(keyword, document_bm25):
    """Return the best BM25 matches for ``keyword``.

    The number of hits is read from the ``[lexical_search]`` section
    (``TOP_K``) of ``udfPreprocess/paramconfig.cfg``; the path is relative to
    the current working directory.

    :param keyword: query string; tokenized with ``bm25_tokenizer``
    :param document_bm25: BM25 index object exposing ``get_scores(tokens)``
    :return: list of ``{'corpus_id': index, 'score': value}`` dicts sorted by
        descending score; at most ``TOP_K`` entries, fewer when the corpus is
        smaller, and an empty list when there is nothing to rank
    """
    config = configparser.ConfigParser()
    # Context manager so the config file handle is closed deterministically.
    with open('udfPreprocess/paramconfig.cfg') as cfg_file:
        config.read_file(cfg_file)
    top_k = config.getint('lexical_search', 'TOP_K')

    bm25_scores = document_bm25.get_scores(bm25_tokenizer(keyword))

    # np.argpartition raises ValueError when kth is out of bounds, which
    # happens for documents with fewer than TOP_K paragraphs.
    top_k = min(top_k, len(bm25_scores))
    if top_k <= 0:
        return []

    # argpartition only guarantees that the last top_k positions hold the
    # largest scores, not their order, hence the explicit sort below.
    top_n = np.argpartition(bm25_scores, -top_k)[-top_k:]
    bm25_hits = [{'corpus_id': idx, 'score': bm25_scores[idx]} for idx in top_n]
    bm25_hits.sort(key=lambda hit: hit['score'], reverse=True)
    return bm25_hits
70
+
71
@st.cache(allow_output_mutation=True)
def load_sentenceTransformer(name):
    """Instantiate a ``SentenceTransformer`` model.

    Wrapped in ``st.cache(allow_output_mutation=True)`` so Streamlit can
    reuse the model object for repeated calls with the same ``name``.

    :param name: model name passed straight to ``SentenceTransformer``
    :return: the constructed ``SentenceTransformer`` instance
    """
    model = SentenceTransformer(name)
    return model
74
+
75
+
76
def semantic_search(keywordlist, paraList):
    """Rank paragraphs against each query with a bi-encoder.

    Model name, maximum sequence length and number of hits are read from the
    ``[semantic_search]`` section of ``udfPreprocess/paramconfig.cfg``
    (``MODEL_NAME``, ``MAX_SEQ_LENGTH``, ``TOP_K``); the path is relative to
    the current working directory.

    :param keywordlist: query strings to embed
    :param paraList: paragraph strings forming the search corpus
    :return: result of ``sentence_transformers.util.semantic_search`` for the
        query and corpus embeddings, limited to ``TOP_K`` hits per query
    """
    config = configparser.ConfigParser()
    # Context manager so the config file handle is closed deterministically.
    with open('udfPreprocess/paramconfig.cfg') as cfg_file:
        config.read_file(cfg_file)
    model_name = config.get('semantic_search', 'MODEL_NAME')
    max_seq_length = config.getint('semantic_search', 'MAX_SEQ_LENGTH')
    top_k = config.getint('semantic_search', 'TOP_K')

    bi_encoder = load_sentenceTransformer(model_name)
    # The limit comes from the config file (not a fixed 256); it is set on
    # the cached model object returned by load_sentenceTransformer.
    bi_encoder.max_seq_length = max_seq_length

    document_embeddings = bi_encoder.encode(
        paraList, convert_to_tensor=True, show_progress_bar=False
    )
    question_embedding = bi_encoder.encode(keywordlist, convert_to_tensor=True)

    return util.semantic_search(question_embedding, document_embeddings, top_k=top_k)
93
+
94
def _report_hit(document, paraList, hit):
    """Write one search hit to the Streamlit page and to the docx report."""
    line = "\t Score: {:.3f}: \t{}".format(
        hit['score'], paraList[hit['corpus_id']].replace("\n", " ")
    )
    st.write(line)
    document.add_paragraph(line)


def show_results(keywordList):
    """Run lexical and semantic search for every keyword and display the hits.

    For each keyword the top five BM25 hits with a positive score and the top
    five bi-encoder hits are written to the Streamlit page and appended to an
    in-memory docx report.

    :param keywordList: list of query strings
    :return: None
    """
    # NOTE(review): the previous body read ``paraList`` and called
    # ``search(keyword)`` although neither name is defined in this module.
    # The paragraphs are now taken from ``st.session_state['paraList']`` and
    # the hits come from the lexical/semantic helpers of this module --
    # confirm this matches the intended wiring.
    paraList = st.session_state.get('paraList')
    if paraList is None or len(paraList) == 0:
        st.warning("No processed document found; please upload or select a document first.")
        return

    document = docx.Document()
    section = document.sections[0]

    # Footer: reuse the paragraph that already exists in the footer section.
    footer_para = section.footer.paragraphs[0]

    font_charstyle = document.styles.add_style('CommentsStyle', WD_STYLE_TYPE.CHARACTER)
    font_charstyle.font.size = Pt(7)
    footer_para.add_run('''\tPowered by GIZ Data and the Sustainable Development Solution Network hosted at Hugging-Face spaces: https://huggingface.co/spaces/ppsingh/streamlit_dev''', style='CommentsStyle')
    document.add_heading('Your Seacrhed for {}'.format(keywordList), level=1)

    # Build the BM25 index and the embeddings once for all keywords instead
    # of once per keyword.
    document_bm25 = BM25Okapi(bm25TokenizeDoc(paraList))
    queries = list(keywordList)
    semantic_hits = semantic_search(queries, paraList) if queries else []

    for keyword, hits in zip(queries, semantic_hits):
        st.write("Results for Query: {}".format(keyword))
        para = document.add_paragraph().add_run("Results for Query: {}".format(keyword))
        para.font.size = Pt(12)
        bm25_hits = lexical_search(keyword, document_bm25)

        st.markdown("\nWe will provide with 2 kind of results. The 'lexical search' and the semantic search.\n")
        st.markdown("Top few lexical search (BM25) hits")
        document.add_paragraph("Top few lexical search (BM25) hits")

        for hit in bm25_hits[0:5]:
            # A BM25 score of zero means no query token matched the paragraph.
            if hit['score'] > 0.00:
                _report_hit(document, paraList, hit)

        st.markdown("\n-------------------------\n")
        st.markdown("Top few Bi-Encoder Retrieval hits")
        document.add_paragraph("\n-------------------------\n")
        document.add_paragraph("Top few Bi-Encoder Retrieval hits")

        # No score threshold is applied to the bi-encoder hits.
        for hit in sorted(hits, key=lambda x: x['score'], reverse=True)[0:5]:
            _report_hit(document, paraList, hit)

    # NOTE(review): ``document`` is built but neither saved nor returned
    # here, matching the previous behaviour -- confirm whether the report
    # should be exported.
udfPreprocess/uploadAndExample.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import tempfile
3
+ import udfPreprocess.docPreprocessing as pre
4
+ import udfPreprocess.cleaning as clean
5
+
6
def _load_and_store(file_path, file_name):
    """Load and preprocess a document, then publish it via the session state."""
    docs = pre.load_document(file_path, file_name)
    _haystack_doc, _dataframe_doc, _text_data, paraList = clean.preprocessing(docs)
    st.session_state['docs'] = docs
    st.session_state['paraList'] = paraList


def add_upload(choice):
    """Let the user provide a document and store its processed form.

    On success the session state receives ``filename``, ``docs`` (result of
    ``pre.load_document``) and ``paraList`` (fourth element returned by
    ``clean.preprocessing``).

    :param choice: ``'Upload Document'`` shows a sidebar file uploader; any
        other value shows a sidebar select box with the bundled examples
    :return: None
    """
    if choice == 'Upload Document':
        uploaded_file = st.sidebar.file_uploader('Upload the File', type=['pdf', 'docx', 'txt'])
        if uploaded_file is None:
            return

        file_name = uploaded_file.name
        with tempfile.NamedTemporaryFile(mode="wb") as temp:
            temp.write(uploaded_file.getvalue())
            # The loader re-opens the file by path, so the buffered bytes
            # must be on disk before it runs.
            temp.flush()
            st.session_state['filename'] = file_name
            # Must stay inside the ``with``: the temp file is deleted on exit.
            _load_and_store(temp.name, file_name)
        return

    # Listing the example documents.
    option = st.sidebar.selectbox('Select the example document',
                                  ('South Africa:Low Emission strategy',
                                   'Ethiopia: 10 Year Development Plan'))
    # Compare by value: ``is`` tests object identity, which only matches
    # equal strings by accident of interning and triggers a SyntaxWarning.
    if option == 'South Africa:Low Emission strategy':
        file_name = file_path = 'sample/South Africa_s Low Emission Development Strategy.txt'
    else:
        file_name = file_path = 'sample/Ethiopia_s_2021_10 Year Development Plan.txt'
    st.session_state['filename'] = file_name

    if option is not None:
        _load_and_store(file_path, file_name)
47
+
48
+