Spaces:

wjjessen
/

rasa

Running

App Files Files Community

wjjessen committed on Nov 27, 2023

Commit

d124ecd

1 Parent(s): 621da38

update code

Browse files

Files changed (1) hide show

app.py +28 -17

app.py CHANGED Viewed

@@ -63,7 +63,7 @@ def preproc_count(filepath, skipfirst, skiplast):
 # llm pipeline
-def llm_pipeline(tokenizer, base_model, input_text):
     pipe_sum = pipeline(
         "summarization",
         model=base_model,
@@ -72,6 +72,7 @@ def llm_pipeline(tokenizer, base_model, input_text):
         min_length=300,
         truncation=True
     )
     print("Summarizing...")
     result = pipe_sum(input_text)
     summary = result[0]["summary_text"]
@@ -105,8 +106,14 @@ def main():
     uploaded_file = st.file_uploader("Upload your PDF file", type=["pdf"])
     if uploaded_file is not None:
         st.subheader("Options")
-        col1, col2, col3 = st.columns([1, 1, 2])
         with col1:
             model_names = [
                 "T5-Small",
                 "BART",
@@ -121,13 +128,15 @@ def main():
                     model_max_length=1000,
                     trust_remote_code=True,
                 )
-                #base_model = AutoModelForSeq2SeqLM.from_pretrained(
-                #    checkpoint,
-                #    torch_dtype=torch.float32,
-                #    trust_remote_code=True,
-                #)
-                base_model = "model_cache/models--ccdv--lsg-bart-base-16384-pubmed/snapshots/4072bc1a7a94e2b4fd860a5fdf1b71d0487dcf15"
-            else:  # default Flan T5 small
                 checkpoint = "MBZUAI/LaMini-Flan-T5-77M"
                 tokenizer = AutoTokenizer.from_pretrained(
                     checkpoint,
@@ -136,16 +145,18 @@ def main():
                     model_max_length=1000,
                     #cache_dir="model_cache"
                 )
-                #base_model = AutoModelForSeq2SeqLM.from_pretrained(
-                #    checkpoint,
-                #    torch_dtype=torch.float32,
-                #)
-                base_model = "model_cache/models--MBZUAI--LaMini-Flan-T5-77M/snapshots/c5b12d50a2616b9670a57189be20055d1357b474"
-        with col2:
             st.write("Skip any pages?")
             skipfirst = st.checkbox("Skip first page")
             skiplast = st.checkbox("Skip last page")
-        with col3:
             st.write("Background information (links open in a new window)")
             st.write(
                 "Model class: [T5-Small](https://huggingface.co/docs/transformers/main/en/model_doc/t5)"
@@ -170,7 +181,7 @@ def main():
             with col2:
                 start = time.time()
                 with st.spinner("Summarizing..."):
-                    summary = llm_pipeline(tokenizer, base_model, input_text)
                     postproc_text_length = postproc_count(summary)
                 end = time.time()
                 duration = end - start

 # llm pipeline
+def llm_pipeline(tokenizer, base_model, input_text, model_source):
     pipe_sum = pipeline(
         "summarization",
         model=base_model,
         min_length=300,
         truncation=True
     )
+    print("Model source: %s" %(model_source))
     print("Summarizing...")
     result = pipe_sum(input_text)
     summary = result[0]["summary_text"]
     uploaded_file = st.file_uploader("Upload your PDF file", type=["pdf"])
     if uploaded_file is not None:
         st.subheader("Options")
+        col1, col2, col3, col4 = st.columns([1, 1, 1, 2])
         with col1:
+            model_source_names = [
+                "Cached model",
+                "Download model"
+            ]
+            model_source = st.radio("For development:", model_source_names)
+        with col2:
             model_names = [
                 "T5-Small",
                 "BART",
                     model_max_length=1000,
                     trust_remote_code=True,
                 )
+                if model_source == "Download":
+                    base_model = AutoModelForSeq2SeqLM.from_pretrained(
+                    checkpoint,
+                    torch_dtype=torch.float32,
+                    trust_remote_code=True,
+                    )
+                else:
+                    base_model = "model_cache/models--ccdv--lsg-bart-base-16384-pubmed/snapshots/4072bc1a7a94e2b4fd860a5fdf1b71d0487dcf15"
+            else:
                 checkpoint = "MBZUAI/LaMini-Flan-T5-77M"
                 tokenizer = AutoTokenizer.from_pretrained(
                     checkpoint,
                     model_max_length=1000,
                     #cache_dir="model_cache"
                 )
+                if model_source == "Download":
+                    base_model = AutoModelForSeq2SeqLM.from_pretrained(
+                    checkpoint,
+                    torch_dtype=torch.float32,
+                    )
+                else:
+                    base_model = "model_cache/models--MBZUAI--LaMini-Flan-T5-77M/snapshots/c5b12d50a2616b9670a57189be20055d1357b474"
+        with col3:
             st.write("Skip any pages?")
             skipfirst = st.checkbox("Skip first page")
             skiplast = st.checkbox("Skip last page")
+        with col4:
             st.write("Background information (links open in a new window)")
             st.write(
                 "Model class: [T5-Small](https://huggingface.co/docs/transformers/main/en/model_doc/t5)"
             with col2:
                 start = time.time()
                 with st.spinner("Summarizing..."):
+                    summary = llm_pipeline(tokenizer, base_model, input_text, model_source)
                     postproc_text_length = postproc_count(summary)
                 end = time.time()
                 duration = end - start