Spaces:

ANASDAVOODTK
/

prjt

Sleeping

App Files Files Community

ANASDAVOODTK committed on Apr 1, 2023

Commit

ee165bf

•

1 Parent(s): 23f5ed9

new p

Browse files

Files changed (4) hide show

app.py +165 -11
bg.png +0 -0
enlarged.pdf +0 -0
requirements.txt +11 -1

app.py CHANGED Viewed

@@ -1,15 +1,169 @@
-import streamlit as st
 import os
-import pyperclip
-os.system('sudo apt-get install -y xclip')
-st.title("Copy text button code1")
-user_input = st.text_input("Enter your text here:")
-if st.button("Display Text"):
-    st.write(user_input)
-if st.button("Copy to Clipboard"):
-    pyperclip.copy(user_input)
-    os.system("echo -n $'%s' | xsel -ib" % pyperclip.paste())
-    st.write("Text copied to clipboard!")

+import numpy as np
 import os
+import cv2
+from PIL import Image
+import pandas as pd
+from io import BytesIO
+import streamlit as st
+import openai
+import PyPDF2
+import base64
+import pypdfium2 as pdfium
+from pdf2image import convert_from_path
+import docx
+from docx import Document
+import fitz
+import pytesseract
+COMPLETIONS_MODEL = "gpt-4"
+openai.api_key = "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"
+COMPLETIONS_API_PARAMS = {
+    "temperature": 0.0,
+    "max_tokens": 300,
+    "model": COMPLETIONS_MODEL,
+}
+def run_on_chunks(data):
+    response = []
+    chunk = data_chunk(data , chunk_size = 10000)
+    print(chunk)
+    for i in chunk:
+        response.append(GPT_4_API(i))
+    return response
+def data_chunk(lst , chunk_size):
+    return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)]
+def check_file_format(filename):
+    return filename.rsplit('.', 1)[1].lower()
+def pdf_to_images(pdf_file):
+    images = []
+    with fitz.open(pdf_file) as doc:
+        for page in doc:
+            pix = page.get_pixmap(alpha=False)
+            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
+            images.append(img)
+    return images
+def OCR(pdf_file):
+    pdf_reader = PyPDF2.PdfReader(pdf_file)
+    pdf_writer = PyPDF2.PdfWriter()
+    for page_num in range(len(pdf_reader.pages)):
+        page = pdf_reader.pages[page_num]
+        page.scale_by(2)
+        pdf_writer.add_page(page)
+    with open('enlarged.pdf', 'wb') as f:
+        pdf_writer.write(f)
+    images = pdf_to_images('enlarged.pdf')
+    text = ''
+    for image in images:
+        size = (image.width * 2, image.height * 2)
+        image = image.resize(size, Image.ANTIALIAS)
+        text += pytesseract.image_to_string(image)
+    print(text)
+    pdf_file.close()
+    return text
+def txt_extraction(file_path):
+    file_contents = file_path.read().decode("utf-8")
+    return file_contents
+def docx_extraction(path):
+    doc = docx.Document(path)
+    full_text = []
+    for para in doc.paragraphs:
+        full_text.append(para.text)
+    return '\n'.join(full_text)
+def download_docx(text):
+    document = Document()
+    document.add_paragraph(text)
+    output = BytesIO()
+    document.save(output)
+    output.seek(0)
+    st.download_button(
+        label="Download as .docx",
+        data=output,
+        file_name="document.docx",
+        mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document"
+    )
+@st.cache_data()
+def GPT_4_API(data):
+    print("request_send")
+    header =  """ create 20 question and answeres from this paragraph, Answer should strictly be exact lines from this paragraph"."\n\nContext:\n"""
+    QA = header + "".join(str(list(data)))
+    response = openai.ChatCompletion.create(messages = [{"role": "user", "content": f"{QA}"},],**COMPLETIONS_API_PARAMS)
+    return response["choices"][0]["message"]["content"]
+def my_text_editor(default_text, key, height=800):
+    textarea = st.text_area(key, height=height, value=default_text)
+    return textarea
+def get_base64_of_bin_file(bin_file):
+    with open(bin_file, 'rb') as f:
+        data = f.read()
+    return base64.b64encode(data).decode()
+def set_png_as_page_bg(png_file):
+    bin_str = get_base64_of_bin_file(png_file)
+    page_bg_img = '''
+    <style>
+    .stApp {
+        background-image: url("data:image/png;base64,%s");
+        background-size: cover;
+    }
+    </style>
+    ''' % bin_str
+    st.markdown(page_bg_img, unsafe_allow_html=True)
+    return
+def Extract_pdf_content(pdf_name):
+    page_text = []
+    pdf_reader = PyPDF2.PdfReader(pdf_name)
+    num_pages = len(pdf_reader.pages)
+    for page in range(num_pages):
+        pdf_page = pdf_reader.pages[page]
+        page_text.append(pdf_page.extract_text())
+    return page_text[0]
+def process(uploaded_file):
+    st.write("Filename:", uploaded_file.name)
+    data = Extract_pdf_content(uploaded_file)
+    return data
+if __name__=="__main__":
+    pytesseract.pytesseract.tesseract_cmd = r'C:\\Program Files\\Tesseract-OCR\\tesseract.exe'
+    PAGE_CONFIG = {"page_title":"StColab.io","page_icon":":smiley:","layout":"centered"}
+    st.set_page_config(**PAGE_CONFIG)
+    main_bg = 'bg.png'
+    set_png_as_page_bg(main_bg)
+    st.title("pdf data extraction web application")
+    uploaded_file = st.file_uploader("Upload a PDF file", type = ["pdf","docx","txt"])
+    text = ""
+    if uploaded_file is not None:
+        if check_file_format(uploaded_file.name) == "pdf":
+            data = process(uploaded_file)
+            if data==" ":
+                data = OCR(uploaded_file)
+        elif check_file_format(uploaded_file.name) == "docx":
+            data = docx_extraction(uploaded_file)
+        else:
+            data = txt_extraction(uploaded_file)
+        response = run_on_chunks(data)
+        text = my_text_editor(response[0],"text-editor", height=800)
+        download_docx(text)

bg.png ADDED Viewed

enlarged.pdf ADDED Viewed

Binary file (13.6 kB). View file

requirements.txt CHANGED Viewed

	@@ -1 +1,11 @@
1	- ~~pyperclip~~

+numpy
+opencv-python
+Pillow
+pandas
+streamlit
+# app.py calls openai.ChatCompletion, which was removed in openai 1.0.
+openai<1.0
+PyPDF2
+# app.py imports pypdfium2.
+pypdfium2
+pdf2image
+python-docx
+PyMuPDF
+pytesseract