# Core Pkgs
import streamlit as st
from transformers import pipeline
from PyPDF2 import PdfFileReader
import docx2txt
import base64
import re
import sqlite3
import time
from io import StringIO
import warnings
warnings.filterwarnings("ignore")

# Timestamp used to name downloaded summary files
time_str = time.strftime("%Y%m%d-%H%M%S")

# Model-loading helpers: each wraps a Hugging Face summarization pipeline
# and is cached by Streamlit so the model is only loaded once per session.
@st.cache(allow_output_mutation=True)
def bart():
    '''Load the BART (facebook/bart-large-cnn) summarization pipeline.'''
    summarizer = pipeline('summarization', model='facebook/bart-large-cnn')
    return summarizer


@st.cache(allow_output_mutation=True)
def t5():
    '''Load the T5 (t5-base) summarization pipeline.'''
    summarizer = pipeline("summarization", model="t5-base", tokenizer="t5-base")
    return summarizer


@st.cache(allow_output_mutation=True)
def ssr():
    '''Load the SSR model fine-tuned on SAMSum (santiviquez/ssr-base-finetuned-samsum-en).'''
    summarizer = pipeline("summarization", model="santiviquez/ssr-base-finetuned-samsum-en")
    return summarizer
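
# Any of the cached loaders above returns a transformers pipeline. An
# illustrative call (the generation parameters are the caller's choice,
# not fixed by this module):
#     summary = bart()(text, max_length=150, min_length=30, do_sample=False)[0]["summary_text"]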


def preprocess_plain_text(x):
    '''Light cleanup of raw text before it is fed to a summarizer.'''
    x = x.encode("ascii", "ignore").decode()  # drop non-ASCII characters
    x = re.sub(r"https*\S+", " ", x)  # remove URLs
    x = re.sub(r"@\S+", " ", x)  # remove @mentions
    x = re.sub(r"#\S+", " ", x)  # remove hashtags
    x = re.sub(r"\s{2,}", " ", x)  # collapse repeated whitespace
    x = re.sub(r"[^.,!?A-Za-z0-9]+", " ", x)  # strip special characters except .,!?

    return x

def extract_pdf(file):
    '''Extract text from a PDF file (legacy PyPDF2 PdfFileReader API).'''
    pdfReader = PdfFileReader(file)
    count = pdfReader.numPages
    all_text = ""
    for i in range(count):
        page = pdfReader.getPage(i)
        all_text += page.extractText()

    return all_text


def extract_text_from_file(file):

    '''Extract text from uploaded file'''

    # read text file
    if file.type == "text/plain":
        # To convert to a string based IO:
        stringio = StringIO(file.getvalue().decode("utf-8"))

        # To read file as string:
        file_text = stringio.read()

    # read pdf file
    elif file.type == "application/pdf":
        file_text = extract_pdf(file)

    # read docx file
    elif (
        file.type
        == "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    ):
        file_text = docx2txt.process(file)

    # unsupported type: fall back to an empty string so the return below
    # does not raise an UnboundLocalError
    else:
        file_text = ""

    return file_text

def summary_downloader(raw_text):
    '''Render a download link for the summary as a base64-encoded text file.'''
    b64 = base64.b64encode(raw_text.encode()).decode()
    new_filename = "new_text_file_{}_.txt".format(time_str)
    st.markdown("#### Download Summary as a File ###")
    href = f'<a href="data:file/txt;base64,{b64}" download="{new_filename}">Click to Download!!</a>'
    st.markdown(href, unsafe_allow_html=True)


# Persistence: store each summarization run in a local SQLite database
conn = sqlite3.connect('summarizer_database.db', check_same_thread=False)
c = conn.cursor()


def create_table():
    '''Create the results table if it does not already exist.'''
    c.execute('CREATE TABLE IF NOT EXISTS TextTable(text_to_summarize TEXT,summarized_text TEXT,postdate DATE)')


def add_data(text_to_summarize, summarized_text, postdate):
    '''Insert one (original text, summary, timestamp) row.'''
    c.execute('INSERT INTO TextTable(text_to_summarize,summarized_text,postdate) VALUES (?,?,?)',
              (text_to_summarize, summarized_text, postdate))
    conn.commit()


def view_all_data():
    '''Return every stored row from the results table.'''
    c.execute("SELECT * FROM TextTable")
    data = c.fetchall()
    return data
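

# ---------------------------------------------------------------------------
# Illustrative end-to-end sketch (assumption: the app's real Streamlit UI
# lives in its main script; the widget labels and variable names below are
# hypothetical). Running `streamlit run` on this file directly would exercise
# the helpers above with the BART pipeline.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    create_table()
    st.title("Text Summarizer")

    uploaded_file = st.file_uploader("Upload a document", type=["txt", "pdf", "docx"])
    raw_text = st.text_area("...or paste text to summarize", height=200)

    if uploaded_file is not None:
        raw_text = extract_text_from_file(uploaded_file)

    if st.button("Summarize") and raw_text:
        clean_text = preprocess_plain_text(raw_text)
        summarizer = bart()  # t5() or ssr() could be swapped in here
        # Note: BART accepts roughly 1024 tokens; very long inputs may need
        # chunking or truncation before this call.
        result = summarizer(clean_text, max_length=150, min_length=30, do_sample=False)
        summary = result[0]["summary_text"]

        st.subheader("Summary")
        st.write(summary)
        summary_downloader(summary)
        add_data(clean_text, summary, time_str)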