Spaces:

alphayomega
/

PDF15-English

Runtime error

App Files Files Community

PDF15-English / app.py

alphayomega

Update app.py

cc412f9 over 2 years ago

raw

history blame contribute delete

7.81 kB

	import subprocess
	import sys

	# Install the runtime dependencies before the third-party imports below run.
	# pip is invoked through the interpreter executing this script so that the
	# packages are installed for this interpreter rather than for whichever
	# `pip` executable happens to be first on PATH.
	# The return code is deliberately ignored (best effort): if installation
	# fails, the imports that follow will surface the problem.
	subprocess.call([
	    sys.executable, '-m', 'pip', 'install',
	    'PyMuPDF', 'gradio', 'numpy==1.23.3', 'scikit-learn', 'tensorflow',
	    'tensorflow-hub', 'openai==0.10.2', '--user',
	])

	import urllib.request
	import fitz
	import re
	import numpy as np
	import tensorflow_hub as hub
	import openai
	import gradio as gr
	import os
	from sklearn.neighbors import NearestNeighbors

	def download_pdf(url, output_path):
	    """Retrieve the resource at ``url`` and write it to ``output_path``.

	    Args:
	        url: Location of the file to fetch.
	        output_path: Local path the fetched bytes are written to.
	    """
	    urllib.request.urlretrieve(url, filename=output_path)


	def preprocess(text):
	    """Collapse every run of whitespace in ``text`` into a single space.

	    Newlines are whitespace too, so they are replaced along with spaces and
	    tabs. Leading and trailing whitespace is reduced to one space, not
	    removed.

	    Args:
	        text: The string to normalise.

	    Returns:
	        The normalised string.
	    """
	    # Raw string: '\s' in a plain literal is an invalid escape sequence and
	    # triggers a warning on current Python versions.
	    return re.sub(r'\s+', ' ', text)


	def pdf_to_text(path, start_page=1, end_page=None):
	    """Extract the normalised text of a range of pages from a PDF.

	    Args:
	        path: Path of the PDF to open.
	        start_page: First page to read, 1-based. Values below 1 are treated
	            as 1.
	        end_page: Last page to read, 1-based and inclusive. ``None`` means
	            the last page of the document; values beyond the document are
	            clamped to its page count.

	    Returns:
	        A list with one ``preprocess``-ed string per page read, in page order.
	    """
	    doc = fitz.open(path)
	    try:
	        total_pages = doc.page_count
	        last = total_pages if end_page is None else min(end_page, total_pages)
	        # Clamp so that start_page <= 0 cannot turn into a negative page index.
	        first = max(start_page, 1) - 1
	        return [
	            preprocess(doc.load_page(page_index).get_text("text"))
	            for page_index in range(first, last)
	        ]
	    finally:
	        # Release the document even when a page fails to load or extract.
	        doc.close()


	def text_to_chunks(texts, word_length=150, start_page=1):
	    """Split page texts into labelled chunks of at most ``word_length`` words.

	    Each chunk is formatted as ``[<page>] "<words>"`` where ``<page>`` is the
	    page the chunk was emitted from (``start_page`` for the first entry of
	    ``texts``). A trailing chunk shorter than ``word_length`` is not emitted
	    on its own page; it is prepended to the following page's words, so it is
	    labelled with that later page. Only the final page may produce a short
	    chunk.

	    Args:
	        texts: Sequence of strings, one per page.
	        word_length: Maximum number of words per chunk.
	        start_page: Page number assigned to ``texts[0]``.

	    Returns:
	        The list of formatted chunk strings. Pages without any words
	        contribute nothing.
	    """
	    # split() without an argument drops empty tokens, so stray leading,
	    # trailing or repeated whitespace neither counts towards the chunk size
	    # nor yields empty '[n] ""' chunks for blank pages.
	    page_words = [page.split() for page in texts]
	    last_idx = len(page_words) - 1
	    chunks = []

	    for idx, words in enumerate(page_words):
	        for offset in range(0, len(words), word_length):
	            chunk = words[offset:offset + word_length]
	            if len(chunk) < word_length and idx != last_idx:
	                # Short tail: carry it over to the next page. The entry is
	                # replaced before the outer loop reaches it, so the carried
	                # words are seen on the next iteration.
	                page_words[idx + 1] = chunk + page_words[idx + 1]
	                continue
	            joined = ' '.join(chunk)
	            chunks.append(f'[{idx + start_page}] "{joined}"')
	    return chunks


	class SemanticSearch:
	    """Nearest-neighbour text search over sentence-encoder embeddings.

	    ``fit`` embeds a corpus of strings and indexes the embeddings; calling
	    the instance with a query string returns the closest corpus entries.
	    """

	    def __init__(self):
	        """Load the encoder model from TF Hub and start in the unfitted state."""
	        self.use = hub.load('https://tfhub.dev/google/universal-sentence-encoder/4')
	        self.fitted = False

	    def fit(self, data, batch=1000, n_neighbors=5):
	        """Embed ``data`` and build the nearest-neighbour index.

	        Args:
	            data: Sequence of strings forming the corpus.
	            batch: Number of strings passed to the encoder per call.
	            n_neighbors: Number of results returned per query; capped at the
	                corpus size.

	        Raises:
	            ValueError: If ``data`` is empty.
	        """
	        if len(data) == 0:
	            # Fail with a clear message instead of an error from deep inside
	            # the array stacking / neighbour index code.
	            raise ValueError('Cannot fit SemanticSearch on an empty corpus.')
	        self.data = data
	        self.embeddings = self.get_text_embedding(data, batch=batch)
	        n_neighbors = min(n_neighbors, len(self.embeddings))
	        self.nn = NearestNeighbors(n_neighbors=n_neighbors)
	        self.nn.fit(self.embeddings)
	        self.fitted = True

	    def __call__(self, text, return_data=True):
	        """Return the corpus entries closest to ``text``.

	        Args:
	            text: Query string.
	            return_data: When true, return the matching corpus items;
	                otherwise return their indices into the corpus.

	        Raises:
	            RuntimeError: If ``fit`` has not been called yet.
	        """
	        if not self.fitted:
	            raise RuntimeError('SemanticSearch.fit() must be called before querying.')
	        inp_emb = self.use([text])
	        neighbors = self.nn.kneighbors(inp_emb, return_distance=False)[0]

	        if return_data:
	            return [self.data[i] for i in neighbors]
	        return neighbors

	    def get_text_embedding(self, texts, batch=1000):
	        """Embed ``texts`` in slices of ``batch`` items.

	        Returns:
	            A single array made by vertically stacking the per-batch encoder
	            outputs, in the order of ``texts``.
	        """
	        embeddings = [
	            self.use(texts[start:start + batch])
	            for start in range(0, len(texts), batch)
	        ]
	        return np.vstack(embeddings)



	def load_recommender(path, start_page=1):
	    """Index the PDF at ``path`` in the module-level ``recommender``.

	    The PDF is read from ``start_page`` onwards, split into chunks, and the
	    chunks are handed to ``recommender.fit``.

	    Returns:
	        The status string ``'Corpus Loaded.'``.
	    """
	    page_texts = pdf_to_text(path, start_page=start_page)
	    recommender.fit(text_to_chunks(page_texts, start_page=start_page))
	    return 'Corpus Loaded.'


	def generate_text(openAI_key,prompt, engine="text-davinci-003"):
	    """Request one completion for ``prompt`` and return its text.

	    Args:
	        openAI_key: API key; stored on the ``openai`` module before the call.
	        prompt: Prompt text sent to the completion endpoint.
	        engine: Name of the engine to query.

	    Returns:
	        The ``text`` of the first choice in the response.
	    """
	    openai.api_key = openAI_key
	    request = dict(
	        engine=engine,
	        prompt=prompt,
	        max_tokens=1024,
	        n=1,
	        stop=None,
	        temperature=0.7,
	    )
	    response = openai.Completion.create(**request)
	    return response.choices[0].text


	def generate_answer(question,openAI_key, option):
	    """Build a prompt from the best-matching chunks and ask the model.

	    The prompt consists of a ``search results`` section containing the chunks
	    returned by ``recommender(question)``, an instruction paragraph selected
	    by ``option`` (nothing is added for an unrecognised option), and finally
	    the query itself.

	    Args:
	        question: The user's query.
	        openAI_key: API key forwarded to ``generate_text``.
	        option: One of ``"FODA"``, ``"Igle"`` or ``"Option 3"``.

	    Returns:
	        The text produced by ``generate_text`` for the assembled prompt.
	    """
	    # (option name, instruction text) pairs, scanned in order with ``==``.
	    instruction_table = (
	        (
	            "FODA",
	            "Instructions: Provide recommendations for improving organizational productivity based on industry best practices and research. "
	            "You are a management specialist. "
	            "You are required to conduct a SWOT analysis of the company. "
	            "You are to analyze relevant data related to the company, including financial information, market research, management reports, etc., and prepare a detailed SWOT analysis report. "
	            "You are required to conduct a SWOT analysis of the company. "
	            "I hope you will be able to complete this task efficiently and effectively. "
	            "All answers must be provided in Spanish.\n",
	        ),
	        (
	            "Igle",
	            "Instructions: Provide recommendations for improving organizational productivity based on industry best practices and research. "
	            "Cite each recommendation using [number] notation (every recommendation has this number at the beginning). "
	            "If multiple recommendations are provided for a single topic, separate them into individual answers. "
	            "Only include information from trusted sources and do not provide false information. "
	            "If no information is available for a topic, state 'No information found'. "
	            "Ensure that each recommendation is concise and actionable. "
	            "Useful topics to consider include digital tools for time management and project tracking [3], "
	            "employee training and professional development programs to foster continuous improvement and growth in the company [5], "
	            "and optimizing organizational structure and management practices to maximize efficiency and output.\n",
	        ),
	        (
	            "Option 3",
	            "Cuales son los minsiterios?\n",
	        ),
	    )

	    pieces = ['search results:\n\n']
	    for found in recommender(question):
	        pieces.append(found + '\n\n')

	    instructions = next(
	        (text for name, text in instruction_table if option == name),
	        "",
	    )
	    pieces.append(instructions)
	    pieces.append(f"Query: {question}\nAnswer:")

	    return generate_text(openAI_key, "".join(pieces), "text-davinci-003")


	def question_answer(url, file, question, openAI_key, option):
	    """Validate the form input, index the chosen PDF and answer the question.

	    Exactly one of ``url`` and ``file`` must be supplied. All inputs are
	    validated before any PDF is downloaded or indexed, so an invalid request
	    never triggers that work.

	    Args:
	        url: URL of a PDF to download, or an empty/blank string.
	        file: Uploaded file object exposing a ``name`` path, or ``None``.
	        question: The user's query.
	        openAI_key: API key forwarded to ``generate_answer``.
	        option: Prompt variant forwarded to ``generate_answer``.

	    Returns:
	        The generated answer, or a string starting with ``[ERROR]:`` that
	        describes the first validation failure.
	    """
	    if openAI_key.strip() == '':
	        return '[ERROR]: Please enter you Open AI Key. Get your key here : https://platform.openai.com/account/api-keys'

	    has_url = url.strip() != ''
	    has_file = file is not None

	    if not has_url and not has_file:
	        return '[ERROR]: Both URL and PDF is empty. Provide atleast one.'

	    if has_url and has_file:
	        return '[ERROR]: Both URL and PDF is provided. Please provide only one (eiter URL or PDF).'

	    # Checked before loading the corpus: downloading and embedding a PDF is
	    # wasted work when there is no question to answer.
	    if question.strip() == '':
	        return '[ERROR]: Question field is empty'

	    if has_url:
	        download_pdf(url, 'corpus.pdf')
	        load_recommender('corpus.pdf')
	    else:
	        old_file_name = file.name
	        # Drop the 8 characters that sit right before the final 4 characters
	        # of the path. NOTE(review): presumably a random suffix inserted
	        # before the ".pdf" extension by the upload widget - confirm.
	        file_name = old_file_name[:-12] + old_file_name[-4:]
	        os.rename(old_file_name, file_name)
	        load_recommender(file_name)

	    return generate_answer(question, openAI_key, option)


	# Single shared search index; load_recommender() refits it for each request.
	recommender = SemanticSearch()

	title = 'Deep Learning'
	description = """ <p style="text-align:center">Sistema automatizado de productividad organizacional para incrementar la eficiencia, la efectividad y la eficacia en los procesos internos de la organización.</p>
	<p style="text-align:center"><strong>By Manget Impact LLC - Miguel Angel Gil.</strong></p>
	"""

	# Build the form: inputs in the first group, the answer box in the second.
	with gr.Blocks() as demo:

	    gr.Markdown(f'<center><h1>{title}</h1></center>')
	    gr.Markdown(description)

	    with gr.Row():

	        with gr.Group():
	            gr.Markdown('<p style="text-align:center">Consiga su clave de OpenAI aquí <a href="https://platform.openai.com/account/api-keys">here</a></p>')
	            openAI_key = gr.Textbox(label='INSERTE SU OPENAI:_APIKEY AQUI')
	            url = gr.Textbox(label='URL del PDF a Procesar')
	            # The heading's closing tag was written as a second opening tag.
	            gr.Markdown("<center><h4>OR</h4></center>")
	            file = gr.File(label='Subir PDF a Procesar', file_types=['.pdf'])
	            question = gr.Textbox(label='Escriba su consulta aqui:')
	            option = gr.Dropdown(label='Seleccione una opción', choices=['FODA', 'Igle', 'Option 3'])
	            btn = gr.Button(value='Consultar')
	            # Component.style() is not available in newer Gradio releases
	            # and gradio is installed unpinned above, so only call it when
	            # present instead of crashing at start-up.
	            if hasattr(btn, 'style'):
	                btn.style(full_width=True)

	        with gr.Group():
	            answer = gr.Textbox(label='Respuesta de la consulta:')

	        # Argument order must match question_answer's parameter order.
	        btn.click(question_answer, inputs=[url, file, question, openAI_key, option], outputs=[answer])

	demo.launch()