import gradio as gr
import fitz  # PyMuPDF
import re
import shutil
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
from langdetect import detect
from translate import Translator
import openai
import os
import urllib.request
from tqdm import tqdm
# Designed by: Jad Oubala, Will Kaminski & Sam Bradley

# Module-level state shared across functions: the list of text chunks and
# the FAISS index built over their embeddings (populated in question_answer).
chunks = []
index = None

# Download a PDF from a user-supplied URL.
# Uses urllib.request.urlretrieve to fetch the PDF from the URL and
# save it as a file at output_path.
def download_pdf(url, output_path):
    urllib.request.urlretrieve(url, output_path)
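# Usage sketch (the URL below is a placeholder, not a real document):
# download_pdf('https://example.com/sample.pdf', 'sample.pdf')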
# Preprocessing function to clean text:
# collapses newlines and runs of whitespace into single spaces.
def preprocess(text):
    text = text.replace('\n', ' ')
    text = re.sub(r'\s+', ' ', text)
    return text
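# Example: preprocess('Hello\n  world') returns 'Hello world'.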
# Convert a PDF document to text:
# - Opens the PDF using fitz.open (PyMuPDF).
# - Iterates through the specified page range, extracting text from each page.
# - Applies the preprocess function to clean up each page's text.
# - Collects and returns a list of the cleaned text strings, one per page.
def pdf_to_text(path, start_page=1, end_page=None):
    doc = fitz.open(path)
    total_pages = doc.page_count
    if end_page is None or end_page > total_pages:
        end_page = total_pages
    text_list = []
    for i in tqdm(range(start_page - 1, end_page), desc="Extracting text from PDF"):
        text = doc.load_page(i).get_text("text")
        text = preprocess(text)
        text_list.append(text)
    doc.close()
    return text_list
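# Usage sketch ('sample.pdf' is a placeholder path):
# pages = pdf_to_text('sample.pdf', start_page=1)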
# Convert the list of page texts into smaller chunks:
# - Iterates through the list of preprocessed text strings (texts).
# - Splits each text into words and groups them into chunks of word_length words.
# - If a chunk falls short of word_length and it's not the last one, the
#   remaining words carry over into the next text, to avoid a short chunk
#   in the middle of the document.
# - Each chunk is prefixed with its page number and enclosed in quotes.
def text_to_chunks(texts, word_length=150, start_page=1):
    chunks = []
    buffer = []
    for idx, text in enumerate(texts):
        words = text.split(' ')
        for word in words:
            buffer.append(word)
            if len(buffer) >= word_length:
                chunk = ' '.join(buffer).strip()
                chunks.append(f'Page {idx + start_page}: "{chunk}"')
                buffer = []
    # Emit whatever remains in the buffer as the final (possibly short) chunk
    if buffer:
        chunk = ' '.join(buffer).strip()
        chunks.append(f'Page {idx + start_page}: "{chunk}"')
    return chunks
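# Usage sketch; each chunk looks like: Page 3: "...roughly 150 words..."
# doc_chunks = text_to_chunks(pages, word_length=150)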
# Optionally, print or process the chunks
# for chunk in chunks[:5]: # Print first 5 chunks as a sample
# print(chunk)
# Chunk Embedding:
model = SentenceTransformer('all-MiniLM-L6-v2')
# Assuming `chunks` is your list of preprocessed text chunks
# embeddings = model.encode(chunks, show_progress_bar=True)
# dimension = embeddings.shape[1] # Dimension of embeddings
# index = faiss.IndexFlatL2(dimension) # L2 distance for similarity
# index.add(embeddings.astype(np.float32)) # Add embeddings to index
# Querying the index for relevant chunks:
# Embeds the user's question (translated to English if needed), searches the
# FAISS index for the k most similar chunks, and returns them along with the
# detected query language so the final answer can be translated back.
def search(query, k=5):
    original_language = detect_lang(query)
    query_in_english = translate_to_english(query) if original_language != 'en' else query
    query_embedding = model.encode([query_in_english])[0].astype(np.float32)
    distances, indices = index.search(np.array([query_embedding]), k)
    relevant_chunks = [chunks[idx] for idx in indices[0]]
    return relevant_chunks, original_language
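# Usage sketch (only valid after the index has been built in question_answer):
# top_chunks, lang = search('What is the main finding?', k=5)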
# TRANSLATION-ADJACENT FUNCTIONS

# Translates text safely by splitting it into segments (to stay under the
# translation provider's request-length limit), translating each segment,
# and then concatenating the results.
def safe_translate(text, from_lang, to_lang, max_length=500):
    translator = Translator(to_lang=to_lang, from_lang=from_lang)
    # Split text into segments of at most max_length characters without
    # breaking words
    words = text.split()
    segments = []
    current_segment = []
    current_length = 0
    for word in words:
        if current_length + len(word) + 1 > max_length:  # +1 for space
            segments.append(" ".join(current_segment))
            current_segment = [word]
            current_length = len(word)
        else:
            current_segment.append(word)
            current_length += len(word) + 1  # +1 for space
    # Add the last segment if it's not empty
    if current_segment:
        segments.append(" ".join(current_segment))
    # Translate each segment and combine the results
    translated_segments = [translator.translate(segment) for segment in segments]
    translated_text = " ".join(translated_segments)
    return translated_text
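# Illustrative example (actual output depends on the translation provider):
# safe_translate('Bonjour le monde', from_lang='fr', to_lang='en')
# -> 'Hello world'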
def detect_lang(text):
    return detect(text)

def translate_to_english(text, max_length=500):
    detected_language = detect_lang(text)
    if detected_language != 'en':
        return safe_translate(text, from_lang=detected_language, to_lang='en', max_length=max_length)
    return text

def translate_from_english(text, target_lang, max_length=500):
    if target_lang != 'en':
        return safe_translate(text, from_lang='en', to_lang=target_lang, max_length=max_length)
    return text
# GPT Integration:
# Read the API key from the environment rather than hardcoding a secret.
client = openai.OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
# Takes the semantically searched chunks as input and generates a response
# using the OpenAI chat completions API.
def generate_response_from_chunks(user_query, max_tokens=325):
    relevant_chunks, original_language = search(user_query)
    if original_language != 'en':
        translated_query_to_english = translate_to_english(user_query)
    else:
        translated_query_to_english = user_query  # Already in English
    # Construct the prompt with structured guidance
    prompt = "Search results:\n\n" + "".join([f"{i+1}. {chunk}\n\n" for i, chunk in enumerate(relevant_chunks)])
    prompt += "Instructions: Compose a comprehensive and succinct reply to the query using the search results given. " \
              "Cite each reference using [Page #number] notation (every result has a number at the beginning). " \
              "Citation should be done at the end of each sentence. If the search results mention multiple subjects " \
              "with the same name, create separate answers for each. Only include information found in the results and " \
              "don't add any additional information. Make sure the answer is correct and don't output false content. " \
              "You should also mention where a given answer might be found in the text if appropriate. " \
              "Keep answers under around seven sentences. " \
              "If the text does not relate to the query, simply state 'Sorry, Lil' Dewey found nothing relevant in the text.' " \
              "Don't write 'Answer:'; directly state the answer.\n"
    prompt += f"Query: {translated_query_to_english}\n\n"
    # Send the prompt to the chat/completions endpoint
    response = client.chat.completions.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": prompt},
            {"role": "user", "content": "Please provide a response based on the above instructions."},
        ],
        temperature=0.7,
        max_tokens=max_tokens,
        top_p=1.0,
        frequency_penalty=0.0,
        presence_penalty=0.0,
    )
    # Extract the text from the response and translate it back to the
    # user's language if needed
    generated_text = response.choices[0].message.content.strip()
    translated_response = translate_from_english(generated_text, original_language) if original_language != 'en' else generated_text
    return translated_response
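# Usage sketch (requires OPENAI_API_KEY to be set and the index to be built):
# answer = generate_response_from_chunks('Summarize the key results')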
def question_answer(url, file, question):
    # The search and generation functions read the module-level chunks and
    # index, so rebuild both here for the newly supplied document.
    global chunks, index
    if url.strip() == '' and file is None:
        return '[ERROR]: Both URL and PDF are empty. Provide at least one.'
    if url.strip() != '' and file is not None:
        return '[ERROR]: Both URL and PDF are provided. Please provide only one (either URL or PDF).'
    if question.strip() == '':
        return '[ERROR]: Question field is empty.'
    file_path = 'temp_document.pdf'  # Use a common file path for simplicity
    if url.strip() != '':
        download_pdf(url, file_path)
    else:
        # Gradio's File component supplies a temp file on disk; copy it by path
        shutil.copyfile(file.name, file_path)
    # Process the PDF into per-page texts, then into chunks
    texts = pdf_to_text(file_path)
    chunks = text_to_chunks(texts)
    # Embed the chunks and (re)build the FAISS index over them
    embeddings = model.encode(chunks, show_progress_bar=True)
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings.astype(np.float32))
    # Generate a response based on the user's query
    response = generate_response_from_chunks(question)
    return response
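# Usage sketch outside the UI (the URL is a placeholder):
# print(question_answer('https://example.com/sample.pdf', None, 'What is this document about?'))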
# Gradio UI setup
title = 'PDF Chatbot with Translation Features'
description = """This tool allows you to upload a PDF document or provide a URL to one, ask questions about its contents, and receive answers. It incorporates translation features, making it possible to ask questions in any language and receive responses in that language."""

with gr.Blocks() as demo:
    gr.Markdown(f'<center><h1>{title}</h1></center>')
    gr.Markdown(description)
    with gr.Row():
        with gr.Column(scale=1):
            url = gr.Textbox(label='URL')
            gr.Markdown("<center>or</center>", elem_id="markdown_or")
            file = gr.File(label='PDF', file_types=['pdf'])
            question = gr.Textbox(label='Question')
            submit_btn = gr.Button(value='Submit')
        with gr.Column(scale=1):
            answer = gr.Textbox(label='Answer')
    submit_btn.click(fn=question_answer, inputs=[url, file, question], outputs=[answer])

# Launch the Gradio app
demo.launch()