File size: 9,111 Bytes
9c37e72
dba2773
 
9c37e72
1a16a58
9c37e72
0c5b55b
9c37e72
0c5b55b
9c37e72
 
 
1a16a58
9c37e72
 
 
0c5b55b
9c37e72
 
6e58c44
bd18577
 
 
 
 
 
 
c75cc74
9c37e72
36603f5
09d4214
 
 
06dd768
09d4214
f4332f9
baf370a
 
06dd768
9c37e72
 
 
 
 
 
 
 
3e4f1f9
419e04c
9c37e72
baf370a
fb120e2
d0ba2f9
f780d66
 
 
fb120e2
f780d66
25ae3be
 
f780d66
fb120e2
 
 
 
 
 
2406036
fb120e2
 
 
 
 
 
 
 
cadb958
 
 
2406036
10ef8bd
 
 
54ee49c
f6a6e42
9c37e72
 
 
 
 
 
9531d63
 
 
 
 
9c37e72
f6a6e42
9c37e72
 
 
 
 
 
 
 
e113d20
3f6c2be
9c37e72
1a16a58
 
 
 
 
f1ae271
 
 
0a7287e
 
 
4b6d85d
63c4e55
 
 
2181afa
 
5aeb295
 
d0ba2f9
 
63c4e55
f61fc0e
2181afa
8cc1e8b
 
 
3e4f1f9
 
 
 
 
 
 
 
 
 
 
 
c08e6a6
b9c9fb8
cd370f7
 
 
 
 
 
 
 
 
fda8d0d
b9b4937
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c95ac40
9c37e72
c95ac40
9c37e72
b9b4937
 
 
c95ac40
b9b4937
 
c95ac40
b9b4937
9c37e72
 
 
b9b4937
 
 
b4eff56
 
fca844f
b4eff56
b9b4937
 
9c37e72
b9b4937
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
"""
#App: NLP App with Streamlit
Credits: Streamlit Team, Marc Skov Madsen(For Awesome-streamlit gallery)
Description
This is a Natural Language Processing (NLP) based application that is useful for basic NLP tasks such as the following:

+ Tokenization(POS tagging) & Lemmatization(root mean) using Spacy

+ Named Entity Recognition(NER)/Trigger word detection using SpaCy

+ Sentiment Analysis using TextBlob

+ Document/Text Summarization using Gensim/T5, both for Bangla Extractive and English Abstractive.

This is built with Streamlit Framework, an awesome framework for building ML and NLP tools.
Purpose
To perform basic and useful NLP tasks with Streamlit, Spacy, Textblob, and Gensim
"""
# Core Pkgs
import os
#os.system('sudo apt-get install tesseract-ocr-eng')
#os.system('sudo apt-get install tesseract-ocr-ben')

#os.system('wget https://github.com/tesseract-ocr/tessdata/raw/main/ben.traineddata')
#os.system('gunzip ben.traineddata.gz ')
#os.system('sudo mv -v ben.traineddata /usr/local/share/tessdata/')
#os.system('pip install -q pytesseract')
import streamlit as st
import torch
from transformers import AutoTokenizer, AutoModelWithLMHead, GPT2LMHeadModel
import docx2txt
from PIL import Image 
from PyPDF2 import PdfFileReader
from pdf2image import convert_from_bytes
import pdfplumber
#from line_cor import mark_region
import pdf2image


# NLP Pkgs
from textblob import TextBlob 
import spacy
from gensim.summarization import summarize
import requests
import cv2
import numpy as np
import pytesseract

#pytesseract.pytesseract.tesseract_cmd = r"./Tesseract-OCR/tesseract.exe"
from PIL import Image
def _pdf_to_bytes(file):
    """Return the raw bytes of a PDF given as bytes, a path or a file object."""
    if isinstance(file, (bytes, bytearray)):
        return bytes(file)
    if isinstance(file, (str, os.PathLike)):
        with open(file, "rb") as handle:
            return handle.read()
    if hasattr(file, "getvalue"):
        # In-memory buffers (io.BytesIO and objects that mimic it).
        return file.getvalue()
    name = getattr(file, "name", None)
    if name is not None:
        # An open on-disk handle, possibly opened for writing: push pending
        # data to disk first, then re-read the file through its path.
        if not getattr(file, "closed", False) and hasattr(file, "flush"):
            file.flush()
        with open(name, "rb") as handle:
            return handle.read()
    raise TypeError(f"Unsupported PDF input type: {type(file).__name__}")


@st.experimental_memo
def _ocr_pdf_bytes(pdf_bytes, lang):
    """Rasterise every page of the PDF and OCR it; cached per (content, lang)."""
    ocr_kwargs = {"lang": lang} if lang else {}
    # pdf2image already yields PIL images, which pytesseract accepts directly,
    # so no temporary "img.png" round trip is needed.
    pages = pdf2image.convert_from_bytes(pdf_bytes)
    return "".join(
        pytesseract.image_to_string(page, **ocr_kwargs) + " " for page in pages
    )


def read_pdf(file):
    """Extract the text of a (scanned) PDF with Tesseract OCR.

    Args:
        file: The PDF as a filesystem path, raw bytes, an in-memory buffer
            exposing ``getvalue()``, or an open file object with a ``name``.

    Returns:
        The OCR text of all pages, each page followed by a single space.

    Raises:
        TypeError: If ``file`` is none of the supported input kinds.
    """
    # The language checkbox is rendered exactly once, outside the page loop and
    # outside the cached helper: a widget per page would register duplicate
    # widgets, and a widget inside cached code vanishes on a cache hit.
    use_bangla = st.checkbox("Mark to see Bangla Image's Text")
    lang = "ben" if use_bangla else None
    return _ocr_pdf_bytes(_pdf_to_bytes(file), lang)
def read_pdf_with_pdfplumber(file):
    """Extract the text of a PDF by rendering pages with pdfplumber and OCR-ing them.

    Args:
        file: A path or binary file-like object that ``pdfplumber.open`` accepts.

    Returns:
        The OCR text of all pages, each page followed by a single space.
    """
    # A dedicated key keeps this checkbox from colliding with other checkboxes
    # in the app that use the same label.
    use_bangla = st.checkbox(
        "Mark to see Bangla Image's Text",
        key="read_pdf_with_pdfplumber_bangla",
    )
    ocr_kwargs = {"lang": "ben"} if use_bangla else {}

    page_texts = []
    with pdfplumber.open(file) as pdf:
        for page in pdf.pages:
            # PageImage.original is the rendered page as a PIL image, which
            # pytesseract can read without saving it to disk first.
            page_image = page.to_image(resolution=300).original
            text = pytesseract.image_to_string(page_image, **ocr_kwargs)
            page_texts.append(text + " ")
    return "".join(page_texts)
# Page heading; emitted at module level, i.e. before main() builds the rest of the UI.
st.title("Streamlit NLP APP")
@st.experimental_singleton
def text_analyzer(my_text):
    """Return one '"Token":..,\\n"Lemma":..' string per spaCy token of my_text."""
    pipeline = spacy.load('en_core_web_sm')
    rows = []
    for tok in pipeline(my_text):
        # Each row pairs the surface form with its lemma.
        rows.append(f'"Token":{tok.text},\n"Lemma":{tok.lemma_}')
    return rows
@st.experimental_singleton
def load_models():
    """Load the GPT-2 Large tokenizer and language model.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    checkpoint = 'gpt2-large'
    gpt2_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    gpt2_model = GPT2LMHeadModel.from_pretrained(checkpoint)
    return gpt2_tokenizer, gpt2_model
# Function For Extracting Entities
@st.experimental_singleton
def entity_analyzer(my_text):
    """Return a one-element list describing the tokens and named entities of my_text."""
    pipeline = spacy.load('en_core_web_sm')
    parsed = pipeline(my_text)
    words = []
    for tok in parsed:
        words.append(tok.text)
    # (text, label) pairs for every entity spaCy recognised.
    found = []
    for ent in parsed.ents:
        found.append((ent.text, ent.label_))
    return [f'"Token":{words},\n"Entities":{found}']
def _read_uploaded_pdf(uploaded_photo):
    """Spool an uploaded PDF to a temporary file and OCR it with read_pdf."""
    import tempfile  # local import: only this helper needs it

    # A private temp file avoids writing under a user-supplied file name in the
    # working directory, and the handle is closed (data flushed) before the
    # path is handed on.
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
        tmp.write(uploaded_photo.getvalue())
        pdf_path = tmp.name
    try:
        return read_pdf(pdf_path)
    finally:
        os.remove(pdf_path)


def _ocr_uploaded_image(uploaded_photo):
    """OCR an uploaded image after binarising it, in English or Bangla."""
    # Decode in memory rather than round-tripping through "img.png" on disk.
    rgb = np.array(Image.open(uploaded_photo).convert("RGB"))
    gray = cv2.cvtColor(rgb, cv2.COLOR_RGB2GRAY)
    # convert the image to black and white for better OCR
    _, thresh1 = cv2.threshold(gray, 120, 255, cv2.THRESH_BINARY)
    # NOTE: the former mark_region()/plt cropping step was dropped because
    # neither name is imported in this file, so the branch always crashed.
    if st.checkbox("Bangla"):
        return str(pytesseract.image_to_string(thresh1, config='--psm 6', lang="ben"))
    return str(pytesseract.image_to_string(thresh1, config='--psm 6'))


def _ocr_camera_photo(camera_photo):
    """OCR a camera snapshot, in English or Bangla."""
    img = Image.open(camera_photo)
    if st.checkbox("Mark to see Bangla Image's Text"):
        return pytesseract.image_to_string(img, lang="ben")
    return pytesseract.image_to_string(img)


def _extract_input_text(uploaded_photo, camera_photo, message):
    """Pick the text to analyse: uploaded file first, then camera, then the text box."""
    if uploaded_photo is not None:
        if uploaded_photo.type == 'application/pdf':
            return _read_uploaded_pdf(uploaded_photo)
        return _ocr_uploaded_image(uploaded_photo)
    if camera_photo is not None:
        return _ocr_camera_photo(camera_photo)
    return message


def main():
    """ NLP Based Application with Streamlit

    Collects text from a text box, a camera photo or an uploaded image/PDF
    (the latter two via OCR) and offers entity extraction, sentiment analysis,
    spell correction, text generation and summarization on it.
    """
    st.markdown("""
    	#### Description
    	##This is a Natural Language Processing(NLP) base Application that is useful for basic NLP tasks such as follows:
+ Tokenization(POS tagging) & Lemmatization(root mean) using Spacy
+ Named Entity Recognition(NER)/Trigger word detection using SpaCy
+ Sentiment Analysis using TextBlob
+ Document/Text Summarization using Gensim/T5 both for Bangla Extractive and English Abstractive.
    	""")

    def change_photo_state():
        # Widget callback: remember that an image/PDF has been supplied.
        st.session_state["photo"]="done"

    if "photo" not in st.session_state:
        st.session_state["photo"]="not done"

    st.subheader("Please, feed your image/text, features/services will appear automatically!")
    message = st.text_input("Type your text here!")
    camera_photo = st.camera_input("Take a photo, Containing English or Bangla texts", on_change=change_photo_state)
    uploaded_photo = st.file_uploader("Upload Image/PDF, Containing English or Bangla texts",type=['jpg','png','jpeg','pdf'], on_change=change_photo_state)

    if st.session_state["photo"]=="done" or message:
        # Each input source may be None, so resolve them with explicit checks
        # instead of dereferencing uploaded_photo unconditionally.
        text = _extract_input_text(uploaded_photo, camera_photo, message)
        if uploaded_photo is not None or camera_photo is not None:
            # Show the OCR result so the user can verify what was recognised.
            st.success(text)

        if not text or not text.strip():
            # Nothing to analyse (e.g. the photo was removed or OCR found nothing).
            st.warning("No text found. Please type some text or provide a clearer image/PDF.")
        else:
            if st.checkbox("Show Named Entities English/Bangla"):
                entity_result = entity_analyzer(text)
                st.json(entity_result)
            if st.checkbox("Show Sentiment Analysis for English"):
                blob = TextBlob(text)
                result_sentiment = blob.sentiment
                st.success(result_sentiment)
            if st.checkbox("Spell Corrections for English"):
                st.success(TextBlob(text).correct())
            if st.checkbox("Text Generation"):
                ok = st.button("Generate")
                if ok:
                    tokenizer, model = load_models()
                    input_ids = tokenizer(text, return_tensors='pt').input_ids
                    st.text("Using Hugging Face Transformer, Contrastive Search ..")
                    output = model.generate(input_ids, max_length=128)
                    st.success(tokenizer.decode(output[0], skip_special_tokens=True))
            if st.checkbox("Mark here, Text Summarization for English or Bangla!"):
                try:
                    summary_result = summarize(text)
                except ValueError as err:
                    # gensim refuses inputs that contain only one sentence.
                    st.warning(f"Could not summarize this text: {err}")
                else:
                    st.success(summary_result)
            if st.checkbox("Mark to better English Text Summarization!"):
                tokenizer = AutoTokenizer.from_pretrained('t5-base')
                model = AutoModelWithLMHead.from_pretrained('t5-base', return_dict=True)
                inputs = tokenizer.encode(
                    "summarize: " + text,
                    return_tensors='pt',
                    max_length=512,
                    truncation=True,
                )
                summary_ids = model.generate(inputs, max_length=150, min_length=80, length_penalty=5., num_beams=2)
                # Drop <pad>/</s> markers so only the summary text is shown.
                summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
                st.success(summary)

        if st.button("REFRESH"):
            st.experimental_rerun()

    st.sidebar.subheader("About App")
    st.sidebar.markdown("By [Soumen Sarker](https://soumen-sarker-personal-website.streamlitapp.com/)")
# Build the app UI only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()