wjjessen commited on
Commit
6fe569d
β€’
1 Parent(s): b760865

init commit

Browse files
Files changed (3) hide show
  1. README.md +54 -1
  2. app.py +192 -0
  3. requirements.txt +16 -0
README.md CHANGED
@@ -1,5 +1,5 @@
1
  ---
2
- title: Rasa
3
  emoji: πŸ“ˆ
4
  colorFrom: purple
5
  colorTo: purple
@@ -10,3 +10,56 @@ pinned: false
10
  ---
11
 
12
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: RASA
3
  emoji: πŸ“ˆ
4
  colorFrom: purple
5
  colorTo: purple
 
10
  ---
11
 
12
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
13
+
14
+ # RASA: Research Article Summarization App
15
+
16
+ ## Description
17
+
18
+ This application summarizes an uploaded research article PDF using the large language models "LaMini-Flan-T5-77M" or "ccdv/lsg-bart-base-16384-pubmed". LaMini-Flan-T5-77M is a fine-tuned version of google/flan-t5-small on the LaMini-instruction dataset, which contains 2.58M samples for instruction fine-tuning. ccdv/lsg-bart-base-16384-pubmed is a BART-based model adapted for long inputs (LSG attention) and tuned for summarizing PubMed articles.
19
+
20
+ https://huggingface.co/MBZUAI/LaMini-Flan-T5-77M
21
+
22
+ https://huggingface.co/ccdv/lsg-bart-base-16384-pubmed
23
+
24
+ ## Table of Contents
25
+
26
+ - [Installation](#installation)
27
+ - [Usage](#usage)
28
+ - [Credits](#credits)
29
+ - [License](#license)
30
+
31
+ ## Installation
32
+
33
+ Create a virtual python environment. To install the required python application packages, type "pip install -r requirements.txt" in a terminal window within the virtual python environment.
34
+
35
+ ## Usage
36
+
37
+ To run locally, navigate to the project folder and in a terminal window type "streamlit run app.py".
38
+
39
+ ## Credits
40
+
41
+ Written by Walter Jessen
42
+
43
+ Based on https://www.youtube.com/watch?v=GIbar_kZzwk
44
+
45
+ ## MIT License
46
+
47
+ Copyright (c) 2023 Walter Jessen
48
+
49
+ Permission is hereby granted, free of charge, to any person obtaining a copy
50
+ of this software and associated documentation files (the "Software"), to deal
51
+ in the Software without restriction, including without limitation the rights
52
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
53
+ copies of the Software, and to permit persons to whom the Software is
54
+ furnished to do so, subject to the following conditions:
55
+
56
+ The above copyright notice and this permission notice shall be included in all
57
+ copies or substantial portions of the Software.
58
+
59
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
60
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
61
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
62
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
63
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
64
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
65
+ SOFTWARE.
app.py ADDED
@@ -0,0 +1,192 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import pipeline
2
+ import base64
3
+ from langchain.chains.summarize import load_summarize_chain
4
+ from langchain.docstore.document import Document
5
+ from langchain.document_loaders.pdf import PyMuPDFLoader
6
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
7
+ from PyPDF2 import PdfReader
8
+ import streamlit as st
9
+ import torch
10
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForCausalLM
11
+
12
+
13
+ # notes
14
+ # https://huggingface.co/docs/transformers/pad_truncation
15
+
16
+
17
# file loader and preprocessor
def file_preprocessing(file, skipfirst, skiplast):
    """Load a PDF, optionally drop the first/last page, and return its text.

    Args:
        file: Path to the PDF file on disk.
        skipfirst: Truthy to drop the first page (e.g. a cover page).
        skiplast: Truthy to drop the last page (e.g. references).

    Returns:
        A single string: the text of every retained chunk, concatenated.
    """
    loader = PyMuPDFLoader(file)
    pages = loader.load_and_split()
    print("")
    print("# pages[0] ##########")
    print("")
    print(pages[0])
    print("")
    print("# pages ##########")
    print("")
    print(pages)
    # skip page(s): the two flags are independent, so two plain ifs replace
    # the original four-way bitwise-& branch table (which also had a no-op
    # `pages = pages` else arm)
    if skipfirst:
        del pages[0]
    if skiplast:
        del pages[-1]
    print("")
    print("# pages after loop ##########")
    print("")
    print(pages)
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,  # number of characters
        chunk_overlap=100,
        length_function=len,
        separators=["\n\n", "\n", " ", ""],  # default list
    )
    # https://dev.to/eteimz/understanding-langchains-recursivecharactertextsplitter-2846
    texts = text_splitter.split_documents(pages)
    # single-pass join instead of quadratic string += concatenation
    final_texts = "".join(text.page_content for text in texts)
    return final_texts
55
+
56
+
57
def preproc_count(filepath, skipfirst, skiplast):
    """Preprocess the PDF and return (extracted_text, character_count)."""
    extracted = file_preprocessing(filepath, skipfirst, skiplast)
    return extracted, len(extracted)
61
+
62
+
63
def postproc_count(summary):
    """Return the length (character count) of the generated summary."""
    return len(summary)
66
+
67
+
68
# llm pipeline
def llm_pipeline(tokenizer, base_model, input_text):
    """Run a HF summarization pipeline over input_text and return the summary string.

    max_length / min_length bound the generated summary (in tokens);
    truncation=True clips over-long inputs to the model's limit.
    """
    summarizer = pipeline(
        "summarization",
        model=base_model,
        tokenizer=tokenizer,
        max_length=600,
        min_length=300,
        truncation=True,
    )
    output = summarizer(input_text)
    return output[0]["summary_text"]
81
+
82
+
83
@st.cache_data
def displayPDF(file):
    """Render the PDF at *file* inline via a base64 data-URI iframe."""
    with open(file, "rb") as fh:
        encoded = base64.b64encode(fh.read()).decode("utf-8")
    # embed pdf in html, then hand it to streamlit to display
    iframe = (
        f'<iframe src="data:application/pdf;base64,{encoded}" '
        'width="100%" height="600" type="application/pdf"></iframe>'
    )
    st.markdown(iframe, unsafe_allow_html=True)
92
+
93
+
94
# streamlit code
# Wide layout gives the side-by-side PDF / summary columns room to render.
st.set_page_config(layout="wide")
96
+
97
+
98
def main():
    """Streamlit entry point: upload a PDF, pick a model, display and summarize it."""
    import os  # local import: only needed to create/join the upload path

    st.title("RASA: Research Article Summarization App")
    uploaded_file = st.file_uploader("Upload your PDF file", type=["pdf"])
    if uploaded_file is not None:
        st.subheader("Options")
        col1, col2, col3 = st.columns([1, 1, 2])
        with col1:
            model_names = [
                "T5-Small",
                "BART",
            ]
            selected_model = st.radio("Select a model to use:", model_names)
            if selected_model == "BART":
                checkpoint = "ccdv/lsg-bart-base-16384-pubmed"
                # LSG models ship custom modeling code, hence trust_remote_code=True
                tokenizer = AutoTokenizer.from_pretrained(
                    checkpoint,
                    truncation=True,
                    legacy=False,
                    model_max_length=1000,
                    trust_remote_code=True,
                )
                base_model = AutoModelForSeq2SeqLM.from_pretrained(
                    checkpoint, torch_dtype=torch.float32, trust_remote_code=True
                )
            else:  # default: LaMini-Flan-T5-77M (T5-Small class)
                checkpoint = "MBZUAI/LaMini-Flan-T5-77M"
                tokenizer = AutoTokenizer.from_pretrained(
                    checkpoint,
                    truncation=True,
                    legacy=False,
                    model_max_length=1000,
                )
                base_model = AutoModelForSeq2SeqLM.from_pretrained(
                    checkpoint, torch_dtype=torch.float32
                )
        with col2:
            st.write("Skip any pages?")
            skipfirst = st.checkbox("Skip first page")
            skiplast = st.checkbox("Skip last page")
        with col3:
            st.write("Background information (links open in a new window)")
            # BUGFIX: the model-class labels were swapped — LaMini-Flan-T5-77M is
            # a T5 model, and ccdv/lsg-bart-base-16384-pubmed is a BART model.
            st.write(
                "Model class: [T5-Small](https://huggingface.co/docs/transformers/main/en/model_doc/t5)"
                "&nbsp;&nbsp;|&nbsp;&nbsp;Specific model: [MBZUAI/LaMini-Flan-T5-77M](https://huggingface.co/MBZUAI/LaMini-Flan-T5-77M)"
            )
            st.write(
                "Model class: [BART](https://huggingface.co/docs/transformers/main/en/model_doc/bart)"
                "&nbsp;&nbsp;|&nbsp;&nbsp;Specific model: [ccdv/lsg-bart-base-16384-pubmed](https://huggingface.co/ccdv/lsg-bart-base-16384-pubmed)"
            )
        if st.button("Summarize"):
            col1, col2 = st.columns(2)
            # BUGFIX: data/ may not exist on a fresh checkout — create it first
            os.makedirs("data", exist_ok=True)
            filepath = os.path.join("data", uploaded_file.name)
            with open(filepath, "wb") as temp_file:
                temp_file.write(uploaded_file.read())
            with col1:
                input_text, text_length = preproc_count(filepath, skipfirst, skiplast)
                # BUGFIX: len() counts characters, not words — label accordingly
                st.info(
                    "Uploaded PDF&nbsp;&nbsp;|&nbsp;&nbsp;Number of characters: "
                    f"{text_length:,}"
                )
                displayPDF(filepath)  # renders in place; returns None
            with col2:
                with st.spinner("Please wait..."):
                    summary = llm_pipeline(tokenizer, base_model, input_text)
                    text_length = postproc_count(summary)
                    st.info(
                        "PDF Summary&nbsp;&nbsp;|&nbsp;&nbsp;Number of characters: "
                        f"{text_length:,}"
                    )
                    st.success(summary)
168
+
169
+
170
# Page-level CSS tweaks (injected as raw HTML):
# - normalize radio-button label size and weight
# - tighten vertical spacing under markdown paragraphs and checkbox labels
# - underline links in the body
st.markdown(
    """<style>
    div[class*="stRadio"] > label > div[data-testid="stMarkdownContainer"] > p {
    font-size: 1rem;
    font-weight: 400;
    }
    div[class*="stMarkdown"] > div[data-testid="stMarkdownContainer"] > p {
    margin-bottom: -15px;
    }
    div[class*="stCheckbox"] > label {
    margin-bottom: -15px;
    }
    body > a {
    text-decoration: underline;
    }
    </style>
    """,
    unsafe_allow_html=True,
)
189
+
190
+
191
# Script entry point (run with `streamlit run app.py`).
if __name__ == "__main__":
    main()
requirements.txt ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ langchain
2
+ sentence_transformers
3
+ torch
4
+ sentencepiece
5
+ transformers==4.34.0
6
+ accelerate
7
+ chromadb
8
+ pypdf
9
+ tiktoken
10
+ streamlit
11
+ fastapi
12
+ uvicorn
13
+ python-multipart
14
+ aiofiles
15
+ PyPDF2
16
+ PyMuPDF