RMWeerasinghe committed on
Commit
99e744f
1 Parent(s): 0528be1

Initial Commit

Browse files
Files changed (10) hide show
  1. .gitignore +6 -1
  2. app.py +255 -68
  3. config.py +5 -0
  4. mapReduceSummarizer.py +50 -0
  5. model.py +43 -0
  6. preprocess.py +33 -0
  7. refineSummarizer.py +41 -0
  8. requirements.txt +0 -0
  9. summarizer.py +72 -0
  10. utils.py +6 -7
.gitignore CHANGED
@@ -25,6 +25,7 @@ share/python-wheels/
25
  .installed.cfg
26
  *.egg
27
  MANIFEST
 
28
 
29
  # PyInstaller
30
  # Usually these files are written by a python script from a template
@@ -142,4 +143,8 @@ Docs/
142
  .DS_Store
143
  .vscode/
144
  test.ipynb
145
- test.py
 
 
 
 
 
25
  .installed.cfg
26
  *.egg
27
  MANIFEST
28
+ .conda
29
 
30
  # PyInstaller
31
  # Usually these files are written by a python script from a template
 
143
  .DS_Store
144
  .vscode/
145
  test.ipynb
146
+ test.py
147
+ requirements1.txt
148
+
149
+ #logs
150
+ logs/
app.py CHANGED
@@ -1,10 +1,13 @@
 
 
1
  import nltk
2
  import validators
3
  import streamlit as st
4
- from transformers import AutoTokenizer, pipeline
 
 
5
 
6
- # local modules
7
- from extractive_summarizer.model_processors import Summarizer
8
  from utils import (
9
  clean_text,
10
  fetch_article_text,
@@ -12,20 +15,69 @@ from utils import (
12
  read_text_from_file,
13
  )
14
 
 
15
  from rouge import Rouge
16
 
17
- if __name__ == "__main__":
18
- # ---------------------------------
19
- # Main Application
20
- # ---------------------------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
  st.title("Text Summarizer 📝")
22
 
23
- st.markdown("Creator: [Atharva Ingle](https://github.com/Gladiator07)")
24
- st.markdown(
25
- "Source code: [GitHub Repository](https://github.com/Gladiator07/Text-Summarizer)"
 
 
 
26
  )
27
- summarize_type = st.sidebar.selectbox(
28
- "Summarization type", options=["Extractive", "Abstractive"]
 
 
 
29
  )
30
 
31
  st.markdown(
@@ -44,15 +96,8 @@ if __name__ == "__main__":
44
  )
45
  st.markdown("---")
46
  # ---------------------------
47
- # SETUP & Constants
48
- nltk.download("punkt")
49
- abs_tokenizer_name = "facebook/bart-large-cnn"
50
- abs_model_name = "facebook/bart-large-cnn"
51
- abs_tokenizer = AutoTokenizer.from_pretrained(abs_tokenizer_name)
52
- abs_max_length = 90
53
- abs_min_length = 30
54
- # ---------------------------
55
 
 
56
  inp_text = st.text_input("Enter text or a url here")
57
  st.markdown(
58
  "<h3 style='text-align: center; color: green;'>OR</h3>",
@@ -65,11 +110,14 @@ if __name__ == "__main__":
65
  is_url = validators.url(inp_text)
66
  if is_url:
67
  # complete text, chunks to summarize (list of sentences for long docs)
 
68
  text, cleaned_txt = fetch_article_text(url=inp_text)
69
  elif uploaded_file:
 
70
  cleaned_txt = read_text_from_file(uploaded_file)
71
  cleaned_txt = clean_text(cleaned_txt)
72
  else:
 
73
  cleaned_txt = clean_text(inp_text)
74
 
75
  # view summarized text (expander)
@@ -80,51 +128,190 @@ if __name__ == "__main__":
80
  st.write(cleaned_txt)
81
  summarize = st.button("Summarize")
82
 
83
- # called on toggle button [summarize]
84
- if summarize:
85
- if summarize_type == "Extractive":
86
- if is_url:
87
- text_to_summarize = " ".join([txt for txt in cleaned_txt])
88
- else:
89
- text_to_summarize = cleaned_txt
90
- # extractive summarizer
91
-
92
- with st.spinner(
93
- text="Creating extractive summary. This might take a few seconds ..."
94
- ):
95
- ext_model = Summarizer()
96
- summarized_text = ext_model(text_to_summarize, num_sentences=5)
97
-
98
- elif summarize_type == "Abstractive":
99
- with st.spinner(
100
- text="Creating abstractive summary. This might take a few seconds ..."
101
- ):
102
- text_to_summarize = cleaned_txt
103
- abs_summarizer = pipeline(
104
- "summarization", model=abs_model_name, tokenizer=abs_tokenizer_name
105
- )
106
-
107
- if is_url is False:
108
- # list of chunks
109
- text_to_summarize = preprocess_text_for_abstractive_summarization(
110
- tokenizer=abs_tokenizer, text=cleaned_txt
111
- )
112
-
113
- tmp_sum = abs_summarizer(
114
- text_to_summarize,
115
- max_length=abs_max_length,
116
- min_length=abs_min_length,
117
- do_sample=False,
118
- )
119
-
120
- summarized_text = " ".join([summ["summary_text"] for summ in tmp_sum])
121
-
122
- # final summarized output
123
- st.subheader("Summarized text")
124
- st.info(summarized_text)
125
-
126
- st.subheader("Rogue Scores")
127
- rouge_sc = Rouge()
128
- ground_truth = cleaned_txt[0] if is_url else cleaned_txt
129
- score = rouge_sc.get_scores(summarized_text, ground_truth, avg=True)
130
- st.code(score)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import datetime
2
+ import logging
3
  import nltk
4
  import validators
5
  import streamlit as st
6
+ from summarizer import Summarizer
7
+ from config import MODELS
8
+ from warnings import filterwarnings
9
 
10
+ filterwarnings("ignore")
 
11
  from utils import (
12
  clean_text,
13
  fetch_article_text,
 
15
  read_text_from_file,
16
  )
17
 
18
+
19
  from rouge import Rouge
20
 
21
def filer():
    """Return the path of today's log file, creating the ``logs`` directory if needed.

    Returns:
        A relative path of the form ``logs/YYYY-MM-DD.log`` built from the
        current local date.
    """
    import os

    # logging.FileHandler does not create missing parent directories, and
    # logs/ is listed in .gitignore, so a fresh checkout has no such folder.
    os.makedirs("logs", exist_ok=True)

    today = datetime.datetime.today()
    return f"logs/{today.year}-{today.month:02d}-{today.day:02d}.log"
26
+
27
# Route log records to the per-day file named by filer(). The handler keeps
# INFO and above even though the root logger itself is configured at DEBUG.
# NOTE: logging.handlers.TimedRotatingFileHandler(filer(), when="D") was left
# commented out in the original as an alternative handler.
file_handler = logging.FileHandler(filer())
file_handler.setLevel(logging.INFO)

logging.basicConfig(
    handlers=[file_handler],
    force=True,  # replace any handlers already attached to the root logger
    level=logging.DEBUG,
    datefmt="%Y-%m-%d %H:%M:%S",
    format="%(asctime)s %(levelname)s (%(name)s) : %(message)s",
)

logger = logging.getLogger(__name__)

# Seed the session-state slot that the sidebar API-key input is bound to
# (key="api_key"), so it can be read even before the user types anything.
if "api_key" not in st.session_state:
    st.session_state.api_key = " "
44
+
45
+
46
@st.cache_resource
def initialize_app():
    """Download the NLTK ``punkt`` resource.

    The function is decorated with ``st.cache_resource``; it takes no
    arguments and returns nothing.
    """
    nltk.download("punkt")
49
+
50
@st.cache_resource
def init_summarizer(model_name, api_key=None):
    """Build the ``Summarizer`` for the model chosen in the sidebar.

    Args:
        model_name: Key into ``MODELS``; the value "OpenAI" selects the
            "openai" model type, every other key selects "local".
        api_key: Forwarded to ``Summarizer`` only for the "OpenAI" choice.

    Returns:
        A ``Summarizer`` built from the configured model path.

    Raises:
        KeyError: If ``model_name`` is not a key of ``MODELS``.
    """
    checkpoint = MODELS[model_name]

    if model_name == "OpenAI":
        # The key is passed through as-is; the original only carried a
        # "validation logic" placeholder comment here.
        return Summarizer(checkpoint, "openai", api_key)

    logger.info(f"Model for summarization : {checkpoint}")
    return Summarizer(checkpoint, "local")
65
+
66
+ def load_app():
67
  st.title("Text Summarizer 📝")
68
 
69
+ # st.markdown("Creator: [Atharva Ingle](https://github.com/Gladiator07)")
70
+ # st.markdown(
71
+ # "Source code: [GitHub Repository](https://github.com/Gladiator07/Text-Summarizer)"
72
+ # )
73
+ model_name = st.sidebar.selectbox(
74
+ "Model Name", options=["Version 0", "Version 1","OpenAI"]
75
  )
76
+ if model_name == "OpenAI":
77
+ st.sidebar.text_input("Enter a valid OpenAI API Key",key = "api_key" ,type="password")
78
+
79
+ summarizer_type = st.sidebar.selectbox(
80
+ "Summarizer Type for Long Text", options=["Map Reduce", "Refine"]
81
  )
82
 
83
  st.markdown(
 
96
  )
97
  st.markdown("---")
98
  # ---------------------------
 
 
 
 
 
 
 
 
99
 
100
+ # ---------------------------
101
  inp_text = st.text_input("Enter text or a url here")
102
  st.markdown(
103
  "<h3 style='text-align: center; color: green;'>OR</h3>",
 
110
  is_url = validators.url(inp_text)
111
  if is_url:
112
  # complete text, chunks to summarize (list of sentences for long docs)
113
+ logger.info("Text Input Type: URL")
114
  text, cleaned_txt = fetch_article_text(url=inp_text)
115
  elif uploaded_file:
116
+ logger.info("Text Input Type: FILE")
117
  cleaned_txt = read_text_from_file(uploaded_file)
118
  cleaned_txt = clean_text(cleaned_txt)
119
  else:
120
+ logger.info("Text Input Type: INPUT TEXT")
121
  cleaned_txt = clean_text(inp_text)
122
 
123
  # view summarized text (expander)
 
128
  st.write(cleaned_txt)
129
  summarize = st.button("Summarize")
130
 
131
+ if is_url:
132
+ text_to_summarize = " ".join([txt for txt in cleaned_txt])
133
+ else:
134
+ text_to_summarize = cleaned_txt
135
+
136
+ return text_to_summarize, model_name, summarizer_type, summarize
137
+
138
+
139
+
140
+
141
def get_summary(text_to_summarize, model_name, summarizer_type, summarize):
    """Summarize the input once the "Summarize" button has been pressed.

    Args:
        text_to_summarize: Text handed to ``summarizer.summarize``.
        model_name: Key passed to ``init_summarizer`` to select the model.
        summarizer_type: UI label of the long-text strategy; "Refine" selects
            the "refine" chain, any other value selects "map_reduce".
        summarize: Truthy when the button was pressed on this script run.

    Returns:
        The ``(summarized_text, time)`` pair returned by
        ``summarizer.summarize``. When ``summarize`` is falsy the function
        does not return: the current script run is ended with ``st.stop()``.
    """
    if not summarize:
        # The original busy-waited with `while not summarize: continue`. The
        # flag is a plain local that can never change inside the loop, so it
        # spun forever on every run where the button had not been pressed.
        # Ending the script run keeps the "nothing further happens" intent.
        st.stop()

    logger.info(f"Model Name: {model_name}")
    logger.info(f"Summarization Type for Long Text: {summarizer_type}")

    api_key = st.session_state.api_key
    summarizer = init_summarizer(model_name, api_key)

    chain_type = "refine" if summarizer_type == "Refine" else "map_reduce"
    with st.spinner(text="Creating summary. This might take a few seconds ..."):
        summarized_text, elapsed = summarizer.summarize(text_to_summarize, chain_type)

    return summarized_text, elapsed
167
+
168
+
169
+
170
+
171
def display_output(summarized_text, time):
    """Log the summary and its duration, then show both on the page.

    Args:
        summarized_text: Summary to log and display.
        time: Duration value, logged and displayed with an "s" suffix.
    """
    summary_record = f"SUMMARY: {summarized_text}"
    duration_record = f"Summary took {time}s"
    logger.info(summary_record)
    logger.info(duration_record)

    st.subheader("Summarized text")
    summary_box = f"{summarized_text}"
    duration_box = f"Time: {time}s"
    st.info(summary_box)
    st.info(duration_box)
179
+
180
+
181
+ # def summarizer_app():
182
+ # # ---------------------------------
183
+ # # Main Application
184
+ # # ---------------------------------
185
+ # st.title("Text Summarizer 📝")
186
+
187
+ # # st.markdown("Creator: [Atharva Ingle](https://github.com/Gladiator07)")
188
+ # # st.markdown(
189
+ # # "Source code: [GitHub Repository](https://github.com/Gladiator07/Text-Summarizer)"
190
+ # # )
191
+ # model_name = st.sidebar.selectbox(
192
+ # "Model Name", options=["Version 0", "Version 1","OpenAI"]
193
+ # )
194
+ # if model_name == "OpenAI":
195
+ # st.sidebar.text_input("Enter a valid OpenAI API Key",key = "api_key" ,type="password")
196
+
197
+ # summarizer_type = st.sidebar.selectbox(
198
+ # "Summarizer Type for Long Text", options=["Map Reduce", "Refine"]
199
+ # )
200
+
201
+ # st.markdown(
202
+ # "Enter a text or a url to get a concise summary of the article while conserving the overall meaning. This app supports text in the following formats:"
203
+ # )
204
+ # st.markdown(
205
+ # """- Raw text in text box
206
+ # - URL of article/news to be summarized
207
+ # - .txt, .pdf, .docx file formats"""
208
+ # )
209
+ # st.markdown(
210
+ # """This app supports two type of summarization:
211
+
212
+ # 1. **Extractive Summarization**: The extractive approach involves picking up the most important phrases and lines from the documents. It then combines all the important lines to create the summary. So, in this case, every line and word of the summary actually belongs to the original document which is summarized.
213
+ # 2. **Abstractive Summarization**: The abstractive approach involves rephrasing the complete document while capturing the complete meaning of the document. This type of summarization provides more human-like summary"""
214
+ # )
215
+ # st.markdown("---")
216
+ # # ---------------------------
217
+ # # SETUP & Constants
218
+ # # nltk.download("punkt")
219
+ # # abs_tokenizer_name = "facebook/bart-large-cnn"
220
+ # # abs_model_name = "facebook/bart-large-cnn"
221
+ # # abs_tokenizer = AutoTokenizer.from_pretrained(abs_tokenizer_name)
222
+ # # abs_max_length = 90
223
+ # # abs_min_length = 30
224
+
225
+ # # model_name_v0 = "IronOne-AI-Labs/long-t5-tglobal-16k-annual-reports-v0"
226
+ # # model_name_v1 = "IronOne-AI-Labs/long-t5-tglobal-16k-annual-reports-v1"
227
+ # # ---------------------------
228
+ # inp_text = st.text_input("Enter text or a url here")
229
+ # st.markdown(
230
+ # "<h3 style='text-align: center; color: green;'>OR</h3>",
231
+ # unsafe_allow_html=True,
232
+ # )
233
+ # uploaded_file = st.file_uploader(
234
+ # "Upload a .txt, .pdf, .docx file for summarization"
235
+ # )
236
+
237
+ # is_url = validators.url(inp_text)
238
+ # if is_url:
239
+ # # complete text, chunks to summarize (list of sentences for long docs)
240
+ # logger.info("Text Input Type: URL")
241
+ # text, cleaned_txt = fetch_article_text(url=inp_text)
242
+ # elif uploaded_file:
243
+ # logger.info("Text Input Type: FILE")
244
+ # cleaned_txt = read_text_from_file(uploaded_file)
245
+ # cleaned_txt = clean_text(cleaned_txt)
246
+ # else:
247
+ # logger.info("Text Input Type: INPUT TEXT")
248
+ # cleaned_txt = clean_text(inp_text)
249
+
250
+ # # view summarized text (expander)
251
+ # with st.expander("View input text"):
252
+ # if is_url:
253
+ # st.write(cleaned_txt[0])
254
+ # else:
255
+ # st.write(cleaned_txt)
256
+ # summarize = st.button("Summarize")
257
+
258
+ # # called on toggle button [summarize]
259
+ # if summarize:
260
+ # if is_url:
261
+ # text_to_summarize = " ".join([txt for txt in cleaned_txt])
262
+ # else:
263
+ # text_to_summarize = cleaned_txt
264
+
265
+ # logger.info(f"Model Name: {model_name}")
266
+ # logger.info(f"Summarization Type for Long Text: {summarizer_type}")
267
+
268
+ # api_key = st.session_state.api_key
269
+
270
+ # print(api_key)
271
+
272
+ # summarizer = init_summarizer(model_name,api_key)
273
+
274
+ # with st.spinner(
275
+ # text="Creating summary. This might take a few seconds ..."
276
+ # ):
277
+ # #ext_model = Summarizer()
278
+ # #summarized_text = ext_model(text_to_summarize, num_sentences=5)
279
+
280
+ # if summarizer_type == "Refine":
281
+ # summarized_text, time = summarizer.summarize(text_to_summarize,"refine")
282
+ # else :
283
+ # summarized_text, time = summarizer.summarize(text_to_summarize,"map_reduce")
284
+
285
+
286
+ # # elif model_name == "Version 1":
287
+ # # with st.spinner(
288
+ # # text="Creating summary. This might take a few seconds ..."
289
+ # # ):
290
+ # # if summarizer_type == "Refine":
291
+ # # summarized_text, time = summarizer_v1.summarize(text_to_summarize,"refine")
292
+ # # else :
293
+ # # summarized_text, time = summarizer_v1.summarize(text_to_summarize,"map_reduce")
294
+
295
+ # # final summarized output
296
+
297
+ # logger.info(f"SUMMARY: {summarized_text}")
298
+ # logger.info(f"Summary took {time}s")
299
+ # st.subheader("Summarized text")
300
+ # st.info(f"{summarized_text}")
301
+ # st.info(f"Time: {time}s")
302
+
303
+ # # st.subheader("Rogue Scores")
304
+ # # rouge_sc = Rouge()
305
+ # # ground_truth = cleaned_txt[0] if is_url else cleaned_txt
306
+ # # score = rouge_sc.get_scores(summarized_text, ground_truth, avg=True)
307
+ # # st.code(score)
308
+
309
+
310
if __name__ == "__main__":
    # One top-to-bottom pass: set up resources, draw the UI and collect the
    # inputs, produce the summary, then render it.
    initialize_app()
    (
        text_to_summarize,
        model_name,
        summarizer_type,
        summarize,
    ) = load_app()
    summarized_text, time = get_summary(
        text_to_summarize,
        model_name,
        summarizer_type,
        summarize,
    )
    display_output(summarized_text, time)
315
+
316
+
317
+
config.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
# Model identifiers, keyed by the model names offered in the UI.
_ANNUAL_REPORTS_V0 = "IronOne-AI-Labs/long-t5-tglobal-16k-annual-reports-v0"
_ANNUAL_REPORTS_V1 = "IronOne-AI-Labs/long-t5-tglobal-16k-annual-reports-v1"

MODELS = {
    "Version 0": _ANNUAL_REPORTS_V0,
    "Version 1": _ANNUAL_REPORTS_V1,
    # The OpenAI entry reuses the v1 identifier; per the original comment it
    # is there "for tokenizer".
    "OpenAI": _ANNUAL_REPORTS_V1,
}
mapReduceSummarizer.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
2
+ from langchain.chains import MapReduceDocumentsChain, ReduceDocumentsChain, LLMChain, StuffDocumentsChain
3
+ from langchain.prompts import PromptTemplate
4
+
5
def get_map_reduce_chain(pipeline_or_llm, model_type) -> MapReduceDocumentsChain:
    """Assemble a map-reduce summarization chain.

    Args:
        pipeline_or_llm: For ``model_type == "openai"`` this object is used
            directly as the LLM; otherwise it is wrapped in
            ``HuggingFacePipeline``.
        model_type: "openai" selects instruction-style prompts; any other
            value uses bare ``{docs}`` pass-through prompts.

    Returns:
        The configured ``MapReduceDocumentsChain`` (the original annotation
        named ``LLMChain``, which is not what is returned).
    """
    if model_type == "openai":
        llm = pipeline_or_llm
        map_template = """The following is a set of documents
{docs}
Based on this list of docs, please identify the main themes.
Helpful Answer:"""
        reduce_template = """The following is set of summaries:
{docs}
Take these and distill into a final, consolidated summary of the main themes.
Helpful Answer:"""
        map_prompt = PromptTemplate.from_template(map_template)
        reduce_prompt = PromptTemplate.from_template(reduce_template)
    else:
        # Non-OpenAI models receive the documents with no instructions added.
        map_prompt = PromptTemplate.from_template(template="{docs}")
        reduce_prompt = PromptTemplate.from_template(template="{docs}")
        llm = HuggingFacePipeline(pipeline=pipeline_or_llm)

    map_chain = LLMChain(llm=llm, prompt=map_prompt)
    reduce_chain = LLMChain(llm=llm, prompt=reduce_prompt, verbose=True)

    # The same stuffing chain is used both to combine and to collapse.
    combine_documents_chain = StuffDocumentsChain(
        llm_chain=reduce_chain,
        document_variable_name="docs",
    )
    reduce_documents_chain = ReduceDocumentsChain(
        combine_documents_chain=combine_documents_chain,
        collapse_documents_chain=combine_documents_chain,
        token_max=16384,
        verbose=True,
    )

    return MapReduceDocumentsChain(
        llm_chain=map_chain,
        reduce_documents_chain=reduce_documents_chain,
        document_variable_name="docs",
        return_intermediate_steps=False,
        verbose=True,
    )
44
+
45
+
46
+
47
+
48
+
49
+
50
+
model.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline
3
+ from langchain_openai import OpenAI
4
+ from huggingface_hub import login
5
+ from dotenv import load_dotenv
6
+ from logging import getLogger
7
+ import streamlit as st
8
+ import torch
9
+
10
+ # load_dotenv()
11
+ # hf_token = os.environ.get("HF_TOKEN")
12
logger = getLogger(__name__)

# Log in to the Hugging Face Hub at import time with the token stored in
# Streamlit secrets under "HF_TOKEN".
hf_token = st.secrets["HF_TOKEN"]
login(token=hf_token)

# Device string handed to the transformers pipeline: CUDA when available.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
16
+
17
def get_local_model(model_name_or_path:str)->pipeline:
    """Create a transformers summarization pipeline for a seq2seq model.

    Args:
        model_name_or_path: Identifier passed to ``from_pretrained`` for both
            the tokenizer and the model.

    Returns:
        The pipeline object, placed on the module-level ``device``.
    """
    summarization_pipe = pipeline(
        task='summarization',
        tokenizer=AutoTokenizer.from_pretrained(model_name_or_path),
        model=AutoModelForSeq2SeqLM.from_pretrained(model_name_or_path),
        device=device,
    )
    logger.info(f"Summarization pipeline created and loaded to {device}")
    return summarization_pipe
33
+
34
def get_endpoint(api_key:str):
    """Return an ``OpenAI`` LLM object constructed with the given API key."""
    return OpenAI(openai_api_key=api_key)
38
+
39
def get_model(model_type, model_name_or_path, api_key=None):
    """Return the backend that matches ``model_type``.

    "openai" yields the result of ``get_endpoint(api_key)``; every other
    value yields ``get_local_model(model_name_or_path)``.
    """
    if model_type != "openai":
        return get_local_model(model_name_or_path)
    return get_endpoint(api_key)
preprocess.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain.docstore.document import Document
2
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
3
+ from logging import getLogger
4
+
5
+ logger = getLogger(__name__)
6
+
7
def get_input_token_count(text:str,tokenizer)->int:
    """Return how many tokens ``tokenizer.tokenize(text)`` produces."""
    return len(tokenizer.tokenize(text))
10
+
11
def get_document_splits_from_text(
    text: str,
    chunk_size: int = 15000,
    chunk_overlap: int = 50,
) -> "list[Document]":
    """Split ``text`` into chunked ``Document`` objects.

    Args:
        text: Full text; wrapped in a single ``Document`` before splitting.
        chunk_size: Passed to ``RecursiveCharacterTextSplitter``; defaults to
            the previously hard-coded 15000.
        chunk_overlap: Passed to ``RecursiveCharacterTextSplitter``; defaults
            to the previously hard-coded 50.

    Returns:
        The list produced by ``split_documents`` (the original annotation
        named a single ``Document``).
    """
    document = Document(page_content=text)
    text_splitter = RecursiveCharacterTextSplitter(
        separators=["\n\n", "\n", ".", "?", " "],
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    split_documents = text_splitter.split_documents([document])
    logger.info(f"Splitting Document: Total Chunks: {len(split_documents)} ")
    return split_documents
21
+
22
+
23
def prepare_for_summarize(text:str,tokenizer):
    """Classify ``text`` as short or long and shape it for summarization.

    Returns:
        ``(text, "short")`` when the token count is below 12000; otherwise
        ``(splits, "long")`` where ``splits`` comes from
        ``get_document_splits_from_text(text)``.
    """
    if get_input_token_count(text, tokenizer) >= 12000:
        return get_document_splits_from_text(text), "long"
    return text, "short"
refineSummarizer.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
2
+ from langchain.chains.summarize import load_summarize_chain
3
+ from langchain.prompts import PromptTemplate
4
+
5
+
6
def get_refine_chain(pipeline_or_llm, model_type):
    """Assemble a "refine" summarization chain.

    Args:
        pipeline_or_llm: Used directly as the LLM when ``model_type`` is
            "openai"; otherwise wrapped in ``HuggingFacePipeline``.
        model_type: "openai" selects instruction-style prompts; any other
            value uses bare pass-through prompts.

    Returns:
        The chain built by ``load_summarize_chain`` with
        ``chain_type="refine"``, reading "input_documents" and writing
        "output_text".
    """
    if model_type == "openai":
        llm = pipeline_or_llm
        initial_template = """Write a concise summary of the following:
{text}
CONCISE SUMMARY:"""
        followup_template = """Your job is to produce a final summary
We have provided an existing summary up to a certain point: {existing_answer}
We have the opportunity to refine the existing summary (only if needed) with some more context below.
------------
{text}
------------
Given the new context, refine the original summary in bullets. If the context isn't useful return the original summary."""
    else:
        # Non-OpenAI models get the raw text with no instructions added.
        llm = HuggingFacePipeline(pipeline=pipeline_or_llm)
        initial_template = "{text}"
        followup_template = "{existing_answer}\n{text}"

    question_prompt = PromptTemplate.from_template(template=initial_template)
    refine_prompt = PromptTemplate.from_template(template=followup_template)

    return load_summarize_chain(
        llm=llm,
        chain_type="refine",
        question_prompt=question_prompt,
        refine_prompt=refine_prompt,
        return_intermediate_steps=False,
        input_key="input_documents",
        output_key="output_text",
        verbose=True,
    )
39
+
40
+
41
+
requirements.txt CHANGED
Binary files a/requirements.txt and b/requirements.txt differ
 
summarizer.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from model import get_model
2
+ from mapReduceSummarizer import get_map_reduce_chain
3
+ from refineSummarizer import get_refine_chain
4
+ from preprocess import prepare_for_summarize
5
+ from transformers import AutoTokenizer
6
+ from langchain.prompts import PromptTemplate
7
+ from logging import getLogger
8
+ import time
9
+
10
+ logger = getLogger(__name__)
11
class Summarizer:
    """Summarize text with a local pipeline or an OpenAI LLM.

    Input classified as "short" by ``prepare_for_summarize`` is summarized in
    one call; "long" input is run through a refine or map-reduce chain.
    """

    # Prompt used for short inputs with the "openai" model type.
    _SHORT_TEXT_TEMPLATE = """Write a concise and complete summary in bullet points of the given annual report.
Important:
* Note that the summary should contain all important information and it should not contain any unwanted information.
* Make sure to keep the summary as short as possible. And Summary should be in bullet points. Seperate each point with a new line.
TEXT: {text}
SUMMARY:"""

    def __init__(self, model_name, model_type, api_key=None) -> None:
        """Load the tokenizer for ``model_name`` and the backend from ``get_model``."""
        self.model_type = model_type
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.base_summarizer = get_model(model_type, model_name, api_key)

    @staticmethod
    def _timed(produce_summary):
        """Call ``produce_summary`` and return ``(result, seconds rounded to 2 places)``."""
        start = time.time()
        summary = produce_summary()
        elapsed = round(time.time() - start, 2)
        print(f"Summary generation took {elapsed}s.")
        return summary, elapsed

    def summarize(self, text: str, summarizer_type="map_reduce") -> "tuple[str, float]":
        """Summarize ``text`` and report how long generation took.

        Args:
            text: Text to summarize.
            summarizer_type: Strategy for long input; "refine" selects the
                refine chain, any other value the map-reduce chain.

        Returns:
            ``(summary, elapsed_seconds)``. The original annotation said
            ``str`` although a tuple was always returned.

        Raises:
            ValueError: For short input when ``self.model_type`` is neither
                "openai" nor "local". The original fell through and returned
                ``None`` in that case.
        """
        text_to_summarize, length_type = prepare_for_summarize(text, self.tokenizer)

        if length_type == "short":
            logger.info("Processing Input Text less than 12000 Tokens")

            if self.model_type == "openai":
                prompt = PromptTemplate.from_template(template=self._SHORT_TEXT_TEMPLATE)
                llm_chain = prompt | self.base_summarizer
                return self._timed(
                    lambda: llm_chain.invoke({"text": text_to_summarize})
                )

            if self.model_type == "local":
                pipe = self.base_summarizer
                return self._timed(
                    lambda: pipe(text_to_summarize)[0]['summary_text']
                )

            raise ValueError(f"Unsupported model_type: {self.model_type!r}")

        if summarizer_type == "refine":
            print("The text is too long, Running Refine Summarizer")
            llm_chain = get_refine_chain(self.base_summarizer, self.model_type)
            logger.info("Running Refine Chain for Summarization")
        else:
            print("The text is too long, Running Map Reduce Summarizer")
            llm_chain = get_map_reduce_chain(self.base_summarizer, model_type=self.model_type)
            logger.info("Running Map Reduce Chain for Summarization")

        return self._timed(
            lambda: llm_chain.invoke(
                {"input_documents": text_to_summarize},
                return_only_outputs=True,
            )['output_text']
        )
72
+
utils.py CHANGED
@@ -2,7 +2,7 @@ import re
2
  import requests
3
  import docx2txt
4
  from io import StringIO
5
- from PyPDF2 import PdfFileReader
6
 
7
  from bs4 import BeautifulSoup
8
  from nltk.tokenize import sent_tokenize
@@ -31,7 +31,8 @@ def clean_text(x):
31
  # x = re.sub(r"\w*\d+\w*", "", x) # numbers
32
  x = re.sub(r"\s{2,}", " ", x) # over spaces
33
  x = emoji_pattern.sub(r"", x) # emojis
34
- x = re.sub("[^.,!?A-Za-z0-9]+", " ", x) # special charachters except .,!?
 
35
 
36
  return x
37
 
@@ -103,12 +104,10 @@ def preprocess_text_for_abstractive_summarization(tokenizer, text):
103
 
104
 
105
  def read_pdf(file):
106
- pdfReader = PdfFileReader(file)
107
- count = pdfReader.numPages
108
  all_page_text = ""
109
- for i in range(count):
110
- page = pdfReader.getPage(i)
111
- all_page_text += page.extractText()
112
 
113
  return all_page_text
114
 
 
2
  import requests
3
  import docx2txt
4
  from io import StringIO
5
+ from PyPDF2 import PdfReader
6
 
7
  from bs4 import BeautifulSoup
8
  from nltk.tokenize import sent_tokenize
 
31
  # x = re.sub(r"\w*\d+\w*", "", x) # numbers
32
  x = re.sub(r"\s{2,}", " ", x) # over spaces
33
  x = emoji_pattern.sub(r"", x) # emojis
34
+ x = x.replace("$","Dollars ")
35
+ x = re.sub("[^.,!?%A-Za-z0-9]+", " ", x) # special charachters except .,!?
36
 
37
  return x
38
 
 
104
 
105
 
106
  def read_pdf(file):
107
+ pdfReader = PdfReader(file)
 
108
  all_page_text = ""
109
+ for page in pdfReader.pages:
110
+ all_page_text += page.extract_text()
 
111
 
112
  return all_page_text
113