File size: 2,432 Bytes
63337f5
7d0a6ff
11ef280
63337f5
 
 
 
c312545
6f9cc9b
 
7de3632
078b2c2
7de3632
 
078b2c2
6f9cc9b
 
63337f5
 
6f9cc9b
 
7de3632
63337f5
 
7de3632
d4376fd
63337f5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30bc38f
63337f5
9018ff8
63337f5
 
c312545
63337f5
 
 
 
 
e89f971
db38720
 
63337f5
db38720
 
e89f971
0c332ef
 
 
 
 
 
db38720
 
 
63337f5
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import base64
import os
import tempfile

import streamlit as st
import torch
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers import pipeline
#from dotenv import load_dotenv
#from huggingface_hub import HfApi

#api = HfApi()
#token = api.retrieve_token("secret_token")  # Replace with your secret name

#load_dotenv()
#token = os.environ.get("HF_TOKEN")




# Hugging Face model identifier shared by the tokenizer and the model below.
checkpoint = "MBZUAI/LaMini-Flan-T5-248M"

# Tokenizer and model are created once, at import time, and reused by
# llm_pipeline().
# NOTE(review): Streamlit presumably re-executes this script on each
# interaction, which would repeat this load; confirm and consider caching.
tokenizer = T5Tokenizer.from_pretrained(checkpoint)
base_model = T5ForConditionalGeneration.from_pretrained(
    checkpoint,
    device_map='auto',
    torch_dtype=torch.float32,
)

#file loader and preprocessing
def file_preprocessing(file):
    """Load a PDF and return its text as a single string.

    The document is read with ``PyPDFLoader``, re-split by a
    ``RecursiveCharacterTextSplitter`` configured with ``chunk_size=200`` and
    ``chunk_overlap=50``, and the ``page_content`` of every resulting chunk is
    concatenated in order with no separator.

    Args:
        file: Location of the PDF, passed unchanged to ``PyPDFLoader``.

    Returns:
        The concatenated chunk text; an empty string when the splitter
        yields no chunks.
    """
    loader = PyPDFLoader(file)
    pages = loader.load_and_split()
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=50)
    texts = text_splitter.split_documents(pages)
    # NOTE(review): with a non-zero chunk_overlap, neighbouring chunks
    # presumably share text, so the joined string may repeat those spans --
    # confirm whether that is intended for the summarizer input.
    # str.join builds the result in one pass instead of repeated "+" copies.
    return "".join(text.page_content for text in texts)

#LLM pipeline
def llm_pipeline(filepath):
    """Summarize the PDF at ``filepath`` and return the summary text.

    A ``summarization`` pipeline is built from the module-level ``base_model``
    and ``tokenizer`` (``max_length=500``, ``min_length=50``), the PDF text is
    obtained from ``file_preprocessing``, and the ``summary_text`` field of the
    first pipeline result is returned.

    Args:
        filepath: Location of the PDF, forwarded to ``file_preprocessing``.

    Returns:
        The ``summary_text`` value of the first result produced by the
        pipeline.
    """
    summarizer = pipeline(
        'summarization',
        model=base_model,
        tokenizer=tokenizer,
        max_length=500,
        min_length=50,
    )
    document_text = file_preprocessing(filepath)
    outputs = summarizer(document_text)
    return outputs[0]['summary_text']




def main():
    """Render the Streamlit page and summarize an uploaded PDF on request.

    Shows the title and a PDF uploader. Once a file has been uploaded a
    "Summarize" button is shown; when it is clicked the upload is written to
    a temporary file, summarized with ``llm_pipeline``, and the summary is
    displayed with ``st.success``. Any exception raised while summarizing is
    reported with ``st.error``. The temporary file is always removed
    afterwards.
    """
    st.title("Document Summarization App")

    uploaded_file = st.file_uploader("Upload your PDF file", type=['pdf'])

    # Nothing to do until a file is present and the button has been clicked.
    if uploaded_file is None:
        return
    if not st.button("Summarize"):
        return

    # llm_pipeline takes a path, so persist the upload to disk first.
    # delete=False keeps the file after the handle is closed; leaving the
    # ``with`` block closes (and therefore flushes) it before it is read back.
    with tempfile.NamedTemporaryFile(delete=False) as temp_file:
        temp_file.write(uploaded_file.read())
        filepath = temp_file.name

    try:
        summary = llm_pipeline(filepath)
        st.success(summary)  # Display only the summary
    except Exception as e:
        # Top-level UI boundary: surface the failure to the user.
        st.error(f"An error occurred during summarization: {e}")
    finally:
        # Clean up the temporary file whether or not summarization succeeded.
        os.remove(filepath)




# Call main() only when this file is executed as the main module,
# not when it is imported.
if __name__ == "__main__":
    main()