# docsques / app.py
# Uploaded by ravithejads -- commit 6844495 ("Create app.py"), 6.88 kB.
from gpt_index import Document, GPTListIndex
import gradio as gr
import openai
import os
import PyPDF2
import docx
import pytesseract
from PIL import Image
def pdftotext(file_name):
    """
    Function to extract text from .pdf format files.

    Opens `file_name` in binary mode, reads it with PyPDF2.PdfReader and
    returns the text of all pages joined by newlines.
    """
    with open(file_name, 'rb') as pdf_file:
        reader = PyPDF2.PdfReader(pdf_file)
        # Extraction must happen while the file is still open.
        page_texts = [pdf_page.extract_text() for pdf_page in reader.pages]
    return "\n".join(page_texts)
def docxtotext(file_name):
    """
    Function to read .docx format files.

    Returns the text of every paragraph in the document, one paragraph
    per line.
    """
    word_document = docx.Document(file_name)
    paragraph_texts = (para.text for para in word_document.paragraphs)
    return '\n'.join(paragraph_texts)
def readtextfile(file_name):
    """
    Function to read .txt format files.

    The file is decoded as UTF-8 regardless of the platform's default
    encoding; bytes that are not valid UTF-8 are replaced with U+FFFD
    instead of raising, so one stray byte does not abort the whole upload.

    Returns the full contents of the file as a string.
    """
    with open(file_name, 'r', encoding='utf-8', errors='replace') as file:
        return file.read()
def imagetotext(file_name):
    """
    Function to extract text from images.

    Opens the image at `file_name` with PIL and returns the string that
    pytesseract.image_to_string produces for it.
    """
    # Use the image as a context manager so the underlying file is closed
    # after OCR; the OCR call stays inside the block because the image
    # must still be open when it is read.
    with Image.open(file_name) as image:
        text = pytesseract.image_to_string(image)
    return text
def preprocesstext(text):
    """
    Function to preprocess text.

    Drops every line that is empty or contains only whitespace and joins
    the remaining lines (left untouched) with a single newline.
    """
    kept_lines = (row for row in text.splitlines() if row.strip())
    return '\n'.join(kept_lines)
def processfiles(files):
    """
    Function to extract text from documents.

    Each item in `files` must expose its path as `.name`. Returns a list
    with one preprocessed text per file, in the same order; files with an
    unrecognised extension contribute an empty string.
    """
    def extract(path):
        # Route on the lower-cased text after the last "." in the path.
        suffix = path.rpartition(".")[2].lower()
        if suffix == "pdf":
            return pdftotext(path)
        if suffix == "docx":
            return docxtotext(path)
        if suffix == "txt":
            return readtextfile(path)
        if suffix in ("png", "jpg", "jpeg"):
            return imagetotext(path)
        return ""

    return [preprocesstext(extract(uploaded.name)) for uploaded in files]
def createdocuments(textlist):
    """
    Function to create documents as needed for indexing.

    Wraps every text in `textlist` in a Document and returns them as a
    list in the same order.
    """
    return [Document(raw_text) for raw_text in textlist]
def fileformatvaliditycheck(files):
    """
    Function to check validity of file formats.

    Returns True when every file's extension (the lower-cased text after
    the last "." of its `.name`) is one of the supported formats, and
    False as soon as one is not. An empty collection is considered valid.
    """
    supported = ("pdf", "txt", "docx", "png", "jpg", "jpeg")
    return all(
        uploaded.name.rpartition(".")[2].lower() in supported
        for uploaded in files
    )
def openaiapikeyvaliditycheck(openaikey):
    """
    Function to check validity of openai key.

    Sets `openai.api_key` to `openaikey` (a module-wide side effect) and
    probes the key by listing the available models.

    Returns the exact string "Valid OpenAI API key" on success; callers
    compare against it. On any openai.OpenAIError an explanatory message
    is returned instead.
    """
    # Set the API key
    openai.api_key = openaikey
    # Test the API key by making a request to the OpenAI API; only the
    # request itself is guarded and its result is not needed.
    try:
        openai.Model.list()
    except openai.OpenAIError:
        apikeylink = "https://beta.openai.com/account/api-keys"
        # NOTE(review): this message echoes the key verbatim into the UI.
        return f"Incorrect OpenAI API key provided: {openaikey}. You can find your OpenAI API key here - {apikeylink}"
    else:
        return "Valid OpenAI API key"
def createindex(files, openaikey):
    """
    Function to create index.

    Validates the uploaded files and the OpenAI key, then builds a
    GPTListIndex from the extracted text and saves it to 'index.json'.
    Always returns a status message: either the first failed check or the
    success text.
    """
    # Guard clauses: the first failing check decides the returned message.
    if not files:
        return "Upload file before proceeding further."
    if not fileformatvaliditycheck(files):
        return "Please upload documents in pdf/txt/docx/png/jpg/jpeg format only."
    if not openaikey:
        return "Please enter your openai key."
    key_status = openaiapikeyvaliditycheck(openaikey)
    if key_status != "Valid OpenAI API key":
        return key_status

    # Store the key in the environment before the index is built.
    os.environ['OPENAI_API_KEY'] = openaikey

    # Extract text, wrap it in documents, index it and persist the index.
    documents = createdocuments(processfiles(files))
    GPTListIndex(documents, chunk_size_limit=3500).save_to_disk('index.json')
    return "Uploading documents successfully. OpenAI API Key provided is Valid."
def docques(query, openaikey):
    """
    Function for querying the index created by the upload step.

    Returns a short message string when the key is missing, the query is
    blank, or no 'index.json' has been saved yet; otherwise returns the
    result of querying the loaded GPTListIndex in "tree_summarize" mode.
    """
    # Fail with a readable message instead of an exception when the user
    # submits before completing the upload step. The key and upload
    # messages match the ones createindex shows for the same conditions.
    if not openaikey:
        return "Please enter your openai key."
    if not query or not query.strip():
        return "Please enter your query."
    if not os.path.isfile('index.json'):
        return "Upload file before proceeding further."
    # Store openai key in environment
    os.environ['OPENAI_API_KEY'] = openaikey
    # Load index
    index = GPTListIndex.load_from_disk('index.json')
    # Query based on index
    return index.query(query, response_mode="tree_summarize")
def cleartext(query, output):
    """
    Function to clear text.

    Ignores both arguments and returns a list of two empty strings, one
    for each component wired to the Clear button's outputs.
    """
    blank = ""
    return [blank, blank]
# Build the Gradio interface: title and instructions first, then a row with
# an input column and an output column, then the button callbacks.
with gr.Blocks() as demo:
    gr.Markdown(
        """
<h1><center><b>DocQues</b></center></h1>
""")
    gr.Markdown(
        """
This app answers your queries on longer and multiple documents (pdf/docx/txt/png/jpeg/jpg) you upload. It uses <a href = "https://github.com/jerryjliu/gpt_index">GPT-Index</a> and OpenAI GPT3 in the backend, get your
<a href = "https://beta.openai.com/account/api-keys">Openai key here</a> before proceeding further.\n
""")
    gr.Markdown(
        """
<br>**Use this space effectively by following below 2 step process.**</br>
*Step-1*
<br>- Upload pdf/docx/txt/png/jpeg/jpg format documents.
<br>- Enter your openai key.
<br>- Click upload and wait to see if upload is successful or not. </br>
*Step-2*
<br>- Enter your query.
<br>- Click submit.
<br>- Check Answer </br>
Please refer to the GitHub repo this Space is based on, here - <a href = "https://github.com/ravi03071991/DocQues">DocQues</a> .
"""
    )
    with gr.Row():
        # First column: everything the user provides.
        with gr.Column():
            # The label lists every format createindex accepts.
            files = gr.File(
                label="Upload pdf/docx/txt/png/jpeg/jpg format documents.",
                file_count="multiple",
            )
            # type="password" keeps the API key from being shown in clear
            # text while it is typed.
            openaikey = gr.Textbox(
                lines=1, label="Enter your OpenAI Key.", type="password"
            )
            upload_button = gr.Button("Upload")
            query = gr.Textbox(lines=2, label="Enter Your Question.")
            submit_button = gr.Button("Submit")
        # Second column: status of the upload and the answer.
        with gr.Column():
            upload_output = gr.Textbox(label="Upload/ Error.")
            ans_output = gr.Textbox(label="Answer.")
            clear_button = gr.Button("Clear")
    # Upload button for uploading files and openai key.
    upload_button.click(createindex, inputs=[files, openaikey], outputs=[upload_output])
    # Submit button for submitting query.
    submit_button.click(docques, inputs=[query, openaikey], outputs=[ans_output])
    # Clear button for clearing query and answer.
    clear_button.click(cleartext, inputs=[query, ans_output], outputs=[query, ans_output])
demo.launch()