Spaces:

Girinath11
/

DocVision_AI

Sleeping

App Files Files Community

DocVision_AI / app.py

Girinath11

Update app.py

bfe51e4 verified about 2 months ago

raw

history blame contribute delete

14.2 kB

	import gradio as gr
	from pathlib import Path
	from PIL import Image
	import PyPDF2
	import docx
	from sentence_transformers import SentenceTransformer, util
	import faiss
	import numpy as np
	from transformers import AutoTokenizer, AutoModelForCausalLM, BlipProcessor, BlipForConditionalGeneration
	import torch
	from datetime import datetime
	import fitz # PyMuPDF
	import shutil

	# Load models
	# Half precision is only selected when a CUDA device is present. On a
	# CPU-only host float16 kernels are slow or missing in many PyTorch builds,
	# so full precision is used there instead.
	_DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
	_DTYPE = torch.float16 if _DEVICE == "cuda" else torch.float32

	print("Loading models...")
	embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

	print("Loading LLM...")
	tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
	llm_model = AutoModelForCausalLM.from_pretrained(
	    "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
	    torch_dtype=_DTYPE,
	    device_map="auto"
	)

	print("Loading image caption model...")
	caption_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
	caption_model = BlipForConditionalGeneration.from_pretrained(
	    "Salesforce/blip-image-captioning-large",
	    torch_dtype=_DTYPE
	).to(_DEVICE)

	print("✅ All models loaded!")

	# Storage (module-level state rebuilt by process_files and read by the
	# search / answer functions)
	documents = []          # text chunks: dicts with 'text', 'source' and optionally 'page'
	images = []             # image records: dicts with 'path', 'source', 'page'
	image_captions = []     # caption strings, index-aligned with `images`
	embeddings_index = None  # FAISS index over `documents`, or None when nothing is indexed

	def generate_image_caption(image_path):
	    """Generate a caption for the image stored at ``image_path``.

	    Args:
	        image_path: Path of an image file readable by PIL.

	    Returns:
	        The stripped caption string, or ``""`` when the image cannot be
	        opened or captioning fails (the error is printed, not raised).
	    """
	    try:
	        # Close the file handle as soon as the RGB copy has been made.
	        with Image.open(image_path) as raw:
	            img = raw.convert('RGB')

	        # The processor emits float32 pixel values; cast them to the model's
	        # own dtype as well as its device, otherwise a half-precision model
	        # rejects the input with a dtype mismatch.
	        inputs = caption_processor(img, return_tensors="pt").to(
	            caption_model.device, caption_model.dtype
	        )

	        # Beam search without sampling: `temperature` has no effect in this
	        # mode, so it is not passed.
	        output = caption_model.generate(
	            **inputs,
	            max_length=100,
	            num_beams=5
	        )
	        caption = caption_processor.decode(output[0], skip_special_tokens=True)

	        return caption.strip()
	    except Exception as e:
	        # Best effort: callers treat an empty caption as "skip this image".
	        print(f"Caption error: {e}")
	        return ""

	def extract_images_from_pdf(pdf_path):
	    """Extract the embedded images of a PDF into the temp directory.

	    Only images that are at least 150x150 pixels are kept; smaller ones
	    (icons, logos) are discarded without being written to disk.

	    Args:
	        pdf_path: Path of the PDF file.

	    Returns:
	        A list of dicts with keys ``'path'`` (saved image file), ``'page'``
	        (1-based page number) and ``'source'`` (PDF file name). The list is
	        empty when the PDF cannot be opened; that error is printed.
	    """
	    import hashlib
	    import io
	    import tempfile

	    extracted = []
	    source_name = Path(pdf_path).name
	    out_dir = Path(tempfile.gettempdir())
	    # Tag output files with a digest of the PDF path so that images from
	    # different PDFs never overwrite each other in the shared temp directory.
	    tag = hashlib.sha256(str(pdf_path).encode('utf-8')).hexdigest()[:8]

	    try:
	        # The context manager closes the document even when a page fails.
	        with fitz.open(pdf_path) as doc:
	            for page_number, page in enumerate(doc, start=1):
	                for img_index, img in enumerate(page.get_images(full=True)):
	                    try:
	                        xref = img[0]
	                        base_image = doc.extract_image(xref)
	                        image_bytes = base_image["image"]

	                        # Validate from memory first so rejected images
	                        # leave no file behind.
	                        with Image.open(io.BytesIO(image_bytes)) as probe:
	                            width, height = probe.size

	                        # Only keep meaningful images (not tiny icons/logos)
	                        if width < 150 or height < 150:
	                            continue

	                        # Use the extension reported for the raw bytes
	                        # instead of always naming the file ".png".
	                        ext = base_image.get("ext") or "png"
	                        img_path = str(
	                            out_dir / f"pdf_{tag}_page{page_number}_img{img_index}.{ext}"
	                        )
	                        with open(img_path, "wb") as f:
	                            f.write(image_bytes)

	                        extracted.append({
	                            'path': img_path,
	                            'page': page_number,
	                            'source': source_name
	                        })
	                    except Exception as e:
	                        # Skip the unreadable image but leave a trace.
	                        print(
	                            f"Skipping image {img_index} on page {page_number} "
	                            f"of {source_name}: {e}"
	                        )
	                        continue
	    except Exception as e:
	        print(f"PDF image extraction error: {e}")

	    return extracted

	def extract_pdf_text(pdf_path):
	    """Read a PDF page by page and collect the text of non-blank pages.

	    Returns a list of dicts with keys ``'text'``, ``'page'`` (1-based) and
	    ``'source'`` (the PDF's file name). Pages whose extracted text is empty
	    or whitespace-only are left out.
	    """
	    source_name = Path(pdf_path).name
	    page_records = []
	    with open(pdf_path, 'rb') as handle:
	        reader = PyPDF2.PdfReader(handle)
	        for page_number, pdf_page in enumerate(reader.pages, start=1):
	            page_text = pdf_page.extract_text()
	            if not page_text.strip():
	                continue
	            page_records.append({
	                'text': page_text,
	                'page': page_number,
	                'source': source_name
	            })
	    return page_records

	def extract_docx_text(docx_path):
	    """Return the text of a .docx file as a single-record list.

	    The non-blank paragraphs are joined with newlines into one ``'text'``
	    value; ``'source'`` is the file name.
	    """
	    paragraphs = docx.Document(docx_path).paragraphs
	    non_blank = (para.text for para in paragraphs if para.text.strip())
	    return [{'text': '\n'.join(non_blank), 'source': Path(docx_path).name}]

	def extract_txt_text(txt_path):
	    """Return the content of a text file as a single-record list.

	    The file is decoded as UTF-8. A leading byte-order mark is dropped and
	    bytes that are not valid UTF-8 are replaced with U+FFFD instead of
	    raising, so one legacy-encoded file cannot abort a whole upload batch.

	    Args:
	        txt_path: Path of the text file.

	    Returns:
	        ``[{'text': <file content>, 'source': <file name>}]``
	    """
	    with open(txt_path, 'r', encoding='utf-8-sig', errors='replace') as f:
	        return [{'text': f.read(), 'source': Path(txt_path).name}]

	def chunk_text(text, size=400):
	    """Split ``text`` into consecutive chunks of at most ``size`` words.

	    Words are whitespace-delimited and re-joined with single spaces; the
	    last chunk may be shorter. Text without any words yields ``[]``.
	    """
	    tokens = text.split()
	    pieces = (
	        ' '.join(tokens[start:start + size])
	        for start in range(0, len(tokens), size)
	    )
	    return [piece for piece in pieces if piece.strip()]

	def process_files(files, progress=gr.Progress()):
	    """Ingest uploaded files and rebuild the module-level search state.

	    Text from ``.pdf``, ``.docx`` and ``.txt`` files is split with
	    ``chunk_text`` and embedded into a fresh FAISS ``IndexFlatL2``. Uploaded
	    images and images embedded in PDFs are captioned and kept only when a
	    non-empty caption comes back. Files with any other extension are ignored.

	    A file that raises while being processed is skipped and listed in the
	    returned status instead of aborting the whole batch. The globals
	    ``documents``, ``images``, ``image_captions`` and ``embeddings_index`` are
	    replaced together once the batch is done; ``embeddings_index`` becomes
	    ``None`` when the batch contains no text.

	    Args:
	        files: Uploaded files; each item is either a path string or an object
	            whose ``name`` attribute is the path.
	        progress: Gradio progress tracker.

	    Returns:
	        A human-readable status string.
	    """
	    global documents, images, image_captions, embeddings_index

	    if not files:
	        return "⚠️ Please upload files first"

	    text_extractors = {
	        '.pdf': extract_pdf_text,
	        '.docx': extract_docx_text,
	        '.txt': extract_txt_text,
	    }
	    image_exts = {'.jpg', '.jpeg', '.png', '.gif', '.bmp'}

	    new_documents = []
	    new_images = []
	    new_captions = []
	    skipped = []

	    total = len(files)

	    for idx, file in enumerate(files):
	        path = file if isinstance(file, str) else file.name
	        name = Path(path).name
	        # File processing occupies 0.0-0.9 so the bar never moves backwards
	        # when the embedding step reports 0.9.
	        progress(0.9 * idx / total, desc=f"Processing {name}...")
	        ext = Path(path).suffix.lower()

	        # Stage per-file results and commit them only if the file succeeds.
	        file_docs = []
	        file_images = []
	        file_captions = []

	        try:
	            if ext in text_extractors:
	                for chunk in text_extractors[ext](path):
	                    for small_chunk in chunk_text(chunk['text']):
	                        entry = {'text': small_chunk, 'source': chunk['source']}
	                        # Only PDF chunks carry a page number.
	                        if 'page' in chunk:
	                            entry['page'] = chunk['page']
	                        file_docs.append(entry)

	                if ext == '.pdf':
	                    for img in extract_images_from_pdf(path):
	                        caption = generate_image_caption(img['path'])
	                        if caption:  # Only add if caption generated
	                            file_images.append(img)
	                            file_captions.append(caption)

	            elif ext in image_exts:
	                caption = generate_image_caption(path)
	                if caption:
	                    file_images.append({
	                        'path': path,
	                        'source': name,
	                        'page': ''
	                    })
	                    file_captions.append(caption)
	        except Exception as e:
	            print(f"Error processing {name}: {e}")
	            skipped.append(name)
	            continue

	        new_documents.extend(file_docs)
	        new_images.extend(file_images)
	        new_captions.extend(file_captions)

	    # Create embeddings
	    progress(0.9, desc="Creating embeddings...")
	    new_index = None
	    if new_documents:
	        texts = [doc['text'] for doc in new_documents]
	        embeddings = np.asarray(
	            embedding_model.encode(texts, show_progress_bar=False),
	            dtype='float32'
	        )
	        new_index = faiss.IndexFlatL2(embeddings.shape[1])
	        new_index.add(embeddings)

	    # Publish the new state in one step so the index always matches
	    # `documents` (no stale index survives a text-free batch).
	    documents = new_documents
	    images = new_images
	    image_captions = new_captions
	    embeddings_index = new_index

	    progress(1.0, desc="Done!")

	    lines = [
	        "✅ Processing Complete!\n\n",
	        f"📄 Text chunks: {len(documents)}\n",
	        f"🖼️ Images extracted: {len(images)}\n",
	    ]

	    if skipped:
	        lines.append(f"⚠️ Skipped (could not be processed): {', '.join(skipped)}\n")

	    if images:
	        lines.append("\nSample captions:\n")
	        for i, (img, cap) in enumerate(zip(images[:3], image_captions[:3]), 1):
	            lines.append(f"{i}. {img['source']}")
	            if img.get('page'):
	                lines.append(f" (Page {img['page']})")
	            lines.append(f":\n _{cap}_\n")

	    return "".join(lines)

	def search_documents(query, k=3):
	    """Return up to ``k`` indexed text chunks closest to ``query``.

	    Args:
	        query: Query string to embed and search for.
	        k: Maximum number of chunks to return.

	    Returns:
	        A list of entries from ``documents``, nearest first; empty when
	        nothing has been indexed or ``k`` is not positive.
	    """
	    if not documents or embeddings_index is None:
	        return []

	    # Never request more neighbours than there are chunks.
	    k = min(k, len(documents))
	    if k <= 0:
	        return []

	    query_vec = np.asarray(embedding_model.encode([query]), dtype='float32')
	    _, indices = embeddings_index.search(query_vec, k)

	    # FAISS marks missing neighbours with -1. A negative value must be
	    # rejected explicitly, otherwise it would index `documents` from the end.
	    return [documents[idx] for idx in indices[0] if 0 <= idx < len(documents)]

	def find_relevant_images(query, relevance_threshold=0.25, top_k=3):
	    """Find the images whose captions are most similar to ``query``.

	    Args:
	        query: Query string.
	        relevance_threshold: Minimum cosine similarity (exclusive) between
	            the query and a caption for the image to be considered.
	        top_k: Maximum number of images to return.

	    Returns:
	        ``(paths, explanations)``: two index-aligned lists ordered by
	        decreasing similarity. Both are empty when there are no captioned
	        images or none exceeds the threshold.
	    """
	    if not images or not image_captions:
	        return [], []

	    # Encode query and captions
	    query_emb = embedding_model.encode(query, convert_to_tensor=True)
	    caption_embs = embedding_model.encode(image_captions, convert_to_tensor=True)

	    # Cosine similarity of the query against every caption, as plain floats
	    similarities = util.cos_sim(query_emb, caption_embs)[0].tolist()

	    # Keep each score together with its own image and caption so that
	    # filtering and sorting can never pair a score with the wrong image.
	    scored = [
	        (score, img_info, caption)
	        for score, img_info, caption in zip(similarities, images, image_captions)
	        if score > relevance_threshold
	    ]
	    scored.sort(key=lambda item: item[0], reverse=True)

	    relevant_imgs = []
	    explanations = []
	    for score, img_info, caption in scored[:top_k]:
	        relevant_imgs.append(img_info['path'])

	        # Create explanation
	        exp = f"📄 Source: {img_info['source']}"
	        if img_info.get('page'):
	            exp += f" (Page {img_info['page']})"
	        exp += f"\n💬 Description: {caption}"
	        exp += f"\n🎯 Relevance: {score * 100:.1f}%\n"
	        explanations.append(exp)

	    return relevant_imgs, explanations

	def generate_answer(question, context_docs):
	    """Generate an answer to ``question`` from the retrieved chunks.

	    Args:
	        question: The user's question.
	        context_docs: Iterable of dicts whose ``'text'`` values form the
	            context, joined with blank lines.

	    Returns:
	        The text generated by the language model after the prompt, stripped.
	    """
	    max_prompt_tokens = 1200
	    context = '\n\n'.join([doc['text'] for doc in context_docs])

	    def build_prompt(context_text):
	        return f"""Answer this question based only on the context provided. Be concise and accurate.

	Context:
	{context_text}

	Question: {question}

	Answer:"""

	    # Trim the context, not the finished prompt: cutting the prompt on the
	    # right would drop the question and the trailing "Answer:" cue whenever
	    # the context is long. A few tokens are held back because decoding and
	    # re-encoding the trimmed context may not give the same token count.
	    scaffold_len = len(tokenizer(build_prompt(""))["input_ids"])
	    budget = max(max_prompt_tokens - scaffold_len - 16, 0)
	    context_ids = tokenizer(context, add_special_tokens=False)["input_ids"]
	    if len(context_ids) > budget:
	        context = tokenizer.decode(context_ids[:budget], skip_special_tokens=True)

	    # Final truncation remains as a hard cap (e.g. for a very long question).
	    # The tensors are moved to the model's device before generation.
	    inputs = tokenizer(
	        build_prompt(context),
	        return_tensors="pt",
	        truncation=True,
	        max_length=max_prompt_tokens
	    ).to(llm_model.device)

	    with torch.no_grad():
	        outputs = llm_model.generate(
	            **inputs,  # input_ids together with the attention mask
	            max_new_tokens=200,
	            temperature=0.7,
	            do_sample=True,
	            pad_token_id=tokenizer.eos_token_id
	        )

	    # Decode only the newly generated tokens. This does not rely on finding
	    # "Answer:" in the output, which breaks when the context contains it.
	    prompt_len = inputs["input_ids"].shape[1]
	    answer = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)

	    return answer.strip()

	def answer_query(question, progress=gr.Progress()):
	    """Answer a question from the indexed documents and pick related images.

	    Returns a ``(markdown_response, image_paths)`` pair. ``image_paths`` is
	    ``None`` when no image is related to the question, and also whenever a
	    warning or error message is returned instead of an answer.
	    """
	    # Guard clauses: nothing to ask, or nothing indexed yet.
	    if not question.strip():
	        return "⚠️ Please enter a question", None
	    if not documents:
	        return "⚠️ Please upload and process documents first", None

	    # Retrieve the supporting chunks.
	    progress(0.3, desc="Searching documents...")
	    hits = search_documents(question, k=3)
	    if not hits:
	        return "❌ No relevant information found", None

	    # Ask the language model.
	    progress(0.6, desc="Generating answer...")
	    generated = generate_answer(question, hits)

	    # Assemble the markdown response piece by piece.
	    parts = [
	        f"## 💡 Answer\n\n{generated}\n\n",
	        "## 📚 Text Sources\n\n",
	    ]
	    for rank, hit in enumerate(hits, 1):
	        origin = hit['source']
	        page_no = hit.get('page', '')
	        location = f" (Page {page_no})" if page_no else ""
	        parts.append(f"{rank}. {origin}{location}\n")

	    # Look for images related to the question.
	    progress(0.9, desc="Finding relevant images...")
	    gallery_paths, notes = find_relevant_images(question, relevance_threshold=0.25)

	    if gallery_paths and notes:
	        parts.append("\n## 🖼️ Related Images\n\n")
	        parts.extend(f"{note}\n" for note in notes)
	    else:
	        parts.append("\n_No relevant images found for this query_\n")

	    progress(1.0, desc="Done!")

	    return "".join(parts), (gallery_paths or None)

	# UI
	# Two-column layout: upload/processing on the left, question input on the
	# right; the answer, image gallery and example questions follow below.
	with gr.Blocks(
	    title="DocVision AI",
	    theme=gr.themes.Soft(primary_hue="indigo")
	) as app:

	    gr.Markdown("""
	# 📚 DocVision AI - Intelligent Document Q&A
	### Upload documents and get AI-powered answers with relevant images
	""")

	    with gr.Row():
	        with gr.Column():
	            file_input = gr.File(
	                label="📁 Upload Documents & Images",
	                file_count="multiple",
	                # Matches the extensions process_files handles; .jpeg, .gif
	                # and .bmp were previously missing from this list.
	                file_types=[".pdf", ".docx", ".txt", ".jpg", ".jpeg", ".png", ".gif", ".bmp"]
	            )
	            process_btn = gr.Button(
	                "⚡ Process Documents",
	                variant="primary",
	                size="lg"
	            )
	            status = gr.Markdown(label="📊 Processing Status")

	        with gr.Column():
	            question = gr.Textbox(
	                label="❓ Ask Your Question",
	                placeholder="What would you like to know about your documents?",
	                lines=3
	            )
	            ask_btn = gr.Button(
	                "🔍 Get Answer",
	                variant="primary",
	                size="lg"
	            )

	    answer = gr.Markdown(label="📝 Answer with Sources")

	    gallery = gr.Gallery(
	        label="🖼️ Relevant Images (Only shown if related to your question)",
	        columns=2,
	        height=500,
	        show_label=True
	    )

	    gr.Markdown("### 💡 Example Questions")
	    gr.Examples(
	        examples=[
	            ["What is the main topic of this document?"],
	            ["Explain the workflow or architecture shown"],
	            ["What are the key findings?"],
	            ["Describe any diagrams or charts present"]
	        ],
	        inputs=question
	    )

	    # Event handlers
	    process_btn.click(
	        process_files,
	        inputs=[file_input],
	        outputs=[status]
	    )

	    # The button and pressing Enter in the textbox trigger the same handler.
	    ask_btn.click(
	        answer_query,
	        inputs=[question],
	        outputs=[answer, gallery]
	    )

	    question.submit(
	        answer_query,
	        inputs=[question],
	        outputs=[answer, gallery]
	    )

	if __name__ == "__main__":
	    app.launch()