vibey committed on
Commit
9147c45
1 Parent(s): a22e0cd

Upload function.py

Browse files
Files changed (1) hide show
  1. function.py +111 -0
function.py ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Core Pkgs
2
+ import streamlit as st
3
+ from transformers import pipeline
4
+ from PyPDF2 import PdfFileReader
5
+ import docx2txt
6
+ import base64
7
+ import re
8
+ import sqlite3
9
+ import time
10
+ from io import StringIO
11
+ import warnings
12
+ warnings.filterwarnings("ignore")
13
+
14
+ time_str = time.strftime("%Y%m%d-%H%M%S")  # timestamp string (YYYYMMDD-HHMMSS) captured once, when this module-level statement runs
15
+ # Functions that load the summarization pipelines from Hugging Face models
16
@st.cache(allow_output_mutation=True)
def bart():
    """Build the BART summarization pipeline (``facebook/bart-large-cnn``).

    The function is wrapped in Streamlit's cache decorator, so the pipeline
    object is created on the first call and reused afterwards.

    Returns:
        The ``transformers`` summarization pipeline.
    """
    return pipeline('summarization', model='facebook/bart-large-cnn')
21
+
22
@st.cache(allow_output_mutation=True)
def t5():
    """Build the T5 summarization pipeline (``t5-base`` model and tokenizer).

    Cached with the same Streamlit decorator that ``bart`` uses; without it
    the model would be re-instantiated on every call, which is expensive for
    a Streamlit script that reruns on each interaction.

    Returns:
        The ``transformers`` summarization pipeline.
    """
    return pipeline("summarization", model="t5-base", tokenizer="t5-base")
26
+
27
@st.cache(allow_output_mutation=True)
def bart_t5():
    """Build the summarization pipeline for ``tuner007/pegasus_summarizer``.

    NOTE: despite the function name, the checkpoint loaded here is a Pegasus
    summarizer, not a BART/T5 model. The name is kept so existing callers
    keep working.

    Cached with the same Streamlit decorator that ``bart`` uses so the model
    is not re-instantiated on every call.

    Returns:
        The ``transformers`` summarization pipeline.
    """
    return pipeline("summarization", model="tuner007/pegasus_summarizer")
31
+
32
+ # def pegasus():
33
+ # ''' Loading pegasus model using pipeline api '''
34
+ # summarizer = pipeline('summarization',model='google/pegasus-xsum')
35
+ # return summarizer
36
+
37
def preprocess_plain_text(x):
    """Clean raw text before it is passed to a summarizer.

    Steps, in order:
      1. drop every non-ASCII character;
      2. remove ``http://`` / ``https://`` URLs;
      3. remove ``@mentions`` and ``#hashtags``;
      4. replace every run of characters other than letters, digits and
         ``. , ! ?`` (whitespace included) with a single space.

    Args:
        x: The text to clean.

    Returns:
        The cleaned text. Leading/trailing spaces are not stripped.
    """
    x = x.encode("ascii", "ignore").decode()  # drop non-ASCII characters
    # Require the scheme separator so ordinary words that merely start with
    # "http" (e.g. "httpd") are not deleted.
    x = re.sub(r"https?://\S+", " ", x)  # URLs
    x = re.sub(r"@\S+", " ", x)  # mentions
    x = re.sub(r"#\S+", " ", x)  # hashtags
    # Whitespace is outside the allowed set, so this single pass also
    # collapses repeated spaces; no separate whitespace step is needed.
    x = re.sub(r"[^.,!?A-Za-z0-9]+", " ", x)  # special characters except .,!?

    return x
47
+
48
def extract_pdf(file):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        file: The object handed to ``PdfFileReader``.

    Returns:
        A single string made of each page's ``extractText()`` output, joined
        with no separator.
    """
    reader = PdfFileReader(file)
    page_texts = (
        reader.getPage(index).extractText()
        for index in range(reader.numPages)
    )
    return "".join(page_texts)
60
+
61
+
62
def extract_text_from_file(file):
    """Extract the text content of an uploaded file based on its MIME type.

    Supported types (read from ``file.type``):
      * ``text/plain`` - bytes from ``file.getvalue()`` decoded as UTF-8;
      * ``application/pdf`` - delegated to ``extract_pdf``;
      * the ``.docx`` MIME type - delegated to ``docx2txt.process``.

    Args:
        file: An uploaded-file object exposing ``type`` and, for plain text,
            ``getvalue()``.

    Returns:
        The extracted text.

    Raises:
        ValueError: If ``file.type`` is not one of the supported types.
            (Previously this case surfaced as an ``UnboundLocalError``.)
    """
    mime_type = file.type

    # Plain text: decode the raw bytes directly.
    if mime_type == "text/plain":
        return file.getvalue().decode("utf-8")

    # PDF document.
    if mime_type == "application/pdf":
        return extract_pdf(file)

    # Word (.docx) document.
    if (
        mime_type
        == "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    ):
        return docx2txt.process(file)

    raise ValueError(f"Unsupported file type: {mime_type!r}")
86
+
87
def summary_downloader(raw_text):
    """Render a Streamlit download link for the given summary text.

    The text is base64-encoded into a ``data:`` URI and emitted as an HTML
    anchor through ``st.markdown``; nothing is written to disk.

    Args:
        raw_text: The summary text to offer for download.
    """
    b64 = base64.b64encode(raw_text.encode()).decode()
    # Stamp the filename when the link is built. A timestamp captured at
    # import time would be identical for every download served by the same
    # process.
    download_stamp = time.strftime("%Y%m%d-%H%M%S")
    new_filename = "new_text_file_{}_.txt".format(download_stamp)
    st.markdown("#### Download Summary as a File ###")
    href = f'<a href="data:file/txt;base64,{b64}" download="{new_filename}">Click to Download!!</a>'
    # unsafe_allow_html is required for the raw <a> tag to be rendered.
    st.markdown(href, unsafe_allow_html=True)
94
+
95
+
96
+ # Storage in A Database
97
+ conn = sqlite3.connect('summarizer_database.db',check_same_thread=False)  # opened when the module loads; check_same_thread=False turns off sqlite3's same-thread check
98
+ c = conn.cursor()  # single module-level cursor used by the database functions below
99
+ # Helper functions wrapping the SQL statements
100
def create_table():
    """Create ``TextTable`` if it does not exist yet.

    Columns: ``text_to_summarize`` (TEXT), ``summarized_text`` (TEXT) and
    ``postdate`` (DATE). Runs on the module-level cursor ``c``.
    """
    ddl = 'CREATE TABLE IF NOT EXISTS TextTable(text_to_summarize TEXT,summarized_text TEXT,postdate DATE)'
    c.execute(ddl)
102
+
103
+
104
def add_data(text_to_summarize,summarized_text,postdate):
    """Insert one row into ``TextTable`` and commit it.

    Args:
        text_to_summarize: The original input text.
        summarized_text: The generated summary.
        postdate: Value stored in the ``postdate`` column.
    """
    insert_sql = 'INSERT INTO TextTable(text_to_summarize,summarized_text,postdate) VALUES (?,?,?)'
    row = (text_to_summarize, summarized_text, postdate)
    # Values are bound through placeholders rather than formatted into the SQL.
    c.execute(insert_sql, row)
    conn.commit()
107
+
108
def view_all_data():
    """Fetch every row currently stored in ``TextTable``.

    Returns:
        The list produced by ``fetchall()`` on the module-level cursor ``c``.
    """
    return c.execute("SELECT * FROM TextTable").fetchall()