File size: 6,591 Bytes
9c37e72 dba2773 9c37e72 68f40bc d82d18a 9c37e72 6e58c44 4834995 bd18577 2f51bd6 c75cc74 3d7adba 9c37e72 09d4214 06dd768 09d4214 0842639 baf370a 9c37e72 1ecea99 fa73ddc 419e04c 9c37e72 a07988a b446f5c a07988a b446f5c dd55b25 8c11fa3 f176b84 dd55b25 dec4937 6fe4e2e dd55b25 8c11fa3 dd55b25 10ef8bd d82d18a 1a9fa41 d82d18a 1a9fa41 c6ec27d c9a18bc 9c37e72 a07988a e113d20 3f6c2be 9c37e72 1a16a58 3d7adba f1ae271 3d7adba 0a7287e 2c22d61 6b10636 63c4e55 cb6a8b6 2bfe916 7b01ac0 d82d18a ea8c799 82d1709 6d5b302 d82d18a e8b7e7a 8cc1e8b 6e25163 dd55b25 3e4f1f9 dd55b25 ed0375d dd55b25 3e4f1f9 8c11fa3 f5aabdb 3e4f1f9 dd55b25 3e4f1f9 886ee3f 7c894b1 6c1c515 1e17e2b cd370f7 7c894b1 6c1c515 1e17e2b cd370f7 3d7adba 7c894b1 7af9178 7c894b1 a07988a 9c37e72 b9b4937 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 |
"""
#App: NLP App with Streamlit
Description
This is a Natural Language Processing (NLP) based application that is useful for
document/text summarization from Bangla images and English images/PDF files.
"""
# Core Pkgs
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
#os.system('sudo apt-get install tesseract-ocr-eng')
#os.system('sudo apt-get install tesseract-ocr-ben')
#os.system('wget https://github.com/tesseract-ocr/tessdata/raw/main/ben.traineddata')
#os.system('gunzip ben.traineddata.gz ')
#os.system('sudo mv -v ben.traineddata /usr/local/share/tessdata/')
#os.system('pip install -q pytesseract')
#os.system('conda install -c conda-forge poppler')
import streamlit as st
st.set_page_config(page_title="Summarization Tool", layout="wide", initial_sidebar_state="expanded")
import torch
import docx2txt
from PIL import Image
from PyPDF2 import PdfFileReader
from pdf2image import convert_from_bytes
import pdfplumber
#from line_cor import mark_region
import pdf2image
import requests
import cv2
import numpy as np
import pytesseract
import line_cor
import altair as alt
#pytesseract.pytesseract.tesseract_cmd = r"./Tesseract-OCR/tesseract.exe"
from PIL import Image
def _auth_headers(*env_vars):
    """Build the Authorization header from the first non-empty variable in *env_vars*.

    Returns an empty dict when none of the variables is set, so requests are
    then sent without an Authorization header instead of with a bogus one.
    """
    for env_var in env_vars:
        token = os.environ.get(env_var, "").strip()
        if token:
            return {"Authorization": f"Bearer {token}"}
    return {}


# SECURITY: access tokens must not be committed to source control. Configure
# them as environment variables instead (for example through the hosting
# platform's secret settings). Tokens that were previously hard-coded here
# should be considered leaked and revoked.

# Text-generation model endpoint and its request headers.
API_URL = "https://api-inference.huggingface.co/models/gpt2-large"
headers = _auth_headers("HF_TEXT_GENERATION_TOKEN", "HF_TOKEN")
# Multilingual summarization model endpoint and its request headers.
API_URL0 = "https://api-inference.huggingface.co/models/csebuetnlp/mT5_multilingual_XLSum"
headers0 = _auth_headers("HF_SUMMARIZATION_TOKEN", "HF_TOKEN")
def read_pdf(file):
    """Extract the text of every page of a PDF and return it as one string.

    Args:
        file: The PDF source, handed unchanged to ``PdfFileReader`` (the app
            passes the object returned by the Streamlit file uploader).

    Returns:
        A string that starts with a single space, followed by each page's
        extracted text with one space appended after every page.
    """
    reader = PdfFileReader(file)
    page_texts = [
        reader.getPage(page_index).extractText() + " "
        for page_index in range(reader.numPages)
    ]
    return " " + "".join(page_texts)
# def read_pdf_with_pdfplumber(file):
# # Open the uploaded PDF file with pdfplumber
# with pdfplumber.open(file) as pdf:
# extracted_text = ''
# for page in pdf.pages:
# extracted_text += page.extract_text()
# # Display the extracted text
# #st.text(extracted_text)
# return extracted_text
# Page heading; as a module-level call it is issued every time the script body runs.
st.title("NLP APPLICATION")
#@st.cache_resource(experimental_allow_widgets=True)
def _ocr_image(image_file, use_bangla):
    """Run Tesseract OCR on an uploaded or captured image and return its text.

    Args:
        image_file: Image source accepted by ``PIL.Image.open`` (the app passes
            the Streamlit upload or camera object).
        use_bangla: When true, OCR is run with ``lang="ben"``; otherwise
            ``image_to_string`` is called without a language argument.

    Returns:
        The string produced by ``pytesseract.image_to_string``.
    """
    # Round-trip through a PNG on disk so OpenCV can decode the picture.
    # NOTE: the fixed file name "img.png" is shared by every run of the app.
    Image.open(image_file).save("img.png")
    img = cv2.imread("img.png")
    if use_bangla:
        return pytesseract.image_to_string(img, lang="ben")
    return pytesseract.image_to_string(img)


def _query_inference_api(api_url, auth_headers, payload):
    """POST *payload* as JSON to *api_url* and return the decoded JSON reply."""
    response = requests.post(api_url, headers=auth_headers, json=payload)
    return response.json()


def main():
    """Render the Streamlit page.

    Collects typed text, an uploaded PDF/image or a camera photo, extracts the
    text from whichever input is present, and then offers text generation and
    summarization of that text through the configured inference endpoints.
    """
    st.markdown(
        "\n"
        "#### Description\n"
        "##This is a Natural Language Processing(NLP) base Application that is useful for basic NLP tasks such as follows:\n"
        "PDF Document/Image's Text Summarization.\n"
    )

    def change_photo_state():
        # Widget callback: record that the upload or camera input has changed.
        st.session_state["photo"] = "done"

    st.subheader("Please, feed your pdf/images/text, features/services will appear automatically!")
    message = st.text_input("Type your text here!")
    uploaded_photo = st.file_uploader(
        "Upload your PDF",
        type=['jpg', 'png', 'jpeg', 'pdf'],
        on_change=change_photo_state,
    )
    camera_photo = st.camera_input(
        "Take a photo, Containing English texts",
        on_change=change_photo_state,
    )

    if "photo" not in st.session_state:
        st.session_state["photo"] = "not done"

    # Nothing to process until a file/photo was provided or text was typed.
    if not (st.session_state["photo"] == "done" or message):
        return

    text = " "
    if uploaded_photo and uploaded_photo.type == 'application/pdf':
        tet = read_pdf(uploaded_photo)
        # A "line" is approximated as 70 characters; the slider's initial
        # upper handle is len(tet) // 700.
        chars_per_line = 7 * 10
        default_end = len(tet) // (chars_per_line * 10)
        values = st.slider(
            'Select a approximate number of lines to see and summarize',
            value=[0, default_end],
        )
        if values[0] != len(tet) // chars_per_line:
            text = tet[values[0] * chars_per_line:values[1] * chars_per_line]
        else:
            # Start handle sits at the last approximate line: show the text
            # from the default end position onwards instead of an empty slice.
            text = tet[default_end:]
        st.success(text)
    elif uploaded_photo:
        text = _ocr_image(uploaded_photo, st.checkbox("Bangla"))
    elif camera_photo:
        # Bug fix: the checkbox meaning used to be inverted here (ticking
        # "Bangla" ran the default-language OCR and vice versa). It now
        # behaves exactly like the upload branch above.
        text = _ocr_image(camera_photo, st.checkbox("Bangla"))
    elif uploaded_photo is None and camera_photo is None:
        # No file or photo at all: fall back to the typed message.
        text = message

    if st.checkbox("English Text Generation"):
        output = _query_inference_api(API_URL, headers, {"inputs": text})
        st.success(output)

    if st.checkbox("Mark for ENG Text Summarization, ENSUER MARKING ABOVE BANGLA FOR BANGLA!"):
        output = _query_inference_api(API_URL0, headers0, {"inputs": text})
        st.success(output)


# Build the page only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
|