import os
import subprocess
import json

import cv2 as cv  # Ensure OpenCV is installed
import numpy as np
import pytesseract
from pdf2image import convert_from_path
import gradio as gr
from PIL import Image


# Ensure poppler-utils and tesseract-ocr are installed
def install_dependencies():
    try:
        result = subprocess.run(["bash", "setup.sh"], check=True, capture_output=True, text=True)
        print(result.stdout)
    except subprocess.CalledProcessError as e:
        print(f"An error occurred while installing dependencies: {e.stderr}")
        raise


install_dependencies()


# Function to rescale the frame
def rescale_frame(frame, scale=0.75):
    width = int(frame.shape[1] * scale)
    height = int(frame.shape[0] * scale)
    dimensions = (width, height)
    return cv.resize(frame, dimensions, interpolation=cv.INTER_AREA)


# Image analysis
def analyze_image(image):
    analysis = {}
    gray = cv.cvtColor(image, cv.COLOR_BGR2GRAY)

    # Brightness and contrast
    analysis['mean_brightness'] = np.mean(gray)
    analysis['contrast'] = gray.std()

    # Noise level (variance of the Laplacian)
    analysis['noise'] = cv.Laplacian(gray, cv.CV_64F).var()

    # Skew detection
    analysis['skew_angle'] = detect_skew(gray)

    return analysis


def detect_skew(image):
    # Binarize and invert so only dark (text) pixels are considered; running
    # minAreaRect on the raw grayscale would select nearly every pixel.
    _, thresh = cv.threshold(image, 0, 255, cv.THRESH_BINARY_INV + cv.THRESH_OTSU)
    coords = np.column_stack(np.where(thresh > 0)).astype(np.float32)
    if coords.size == 0:
        return 0.0
    angle = cv.minAreaRect(coords)[-1]
    if angle < -45:
        angle = -(90 + angle)
    else:
        angle = -angle
    return angle


# Adaptive preprocessing pipeline
def preprocess_image_adaptive(image):
    analysis = analyze_image(image)

    # Apply preprocessing steps based on analysis
    if analysis['mean_brightness'] < 50:
        image = adjust_brightness(image, 1.5)
    if analysis['contrast'] < 50:
        image = adjust_contrast(image, 1.5)
    if analysis['noise'] > 1000:
        image = reduce_noise(image)
    if abs(analysis['skew_angle']) > 5:
        image = deskew(image, analysis['skew_angle'])

    # Convert to grayscale and apply adaptive thresholding for binarization
    gray = cv.cvtColor(image, cv.COLOR_BGR2GRAY)
    binary = cv.adaptiveThreshold(gray, 255, cv.ADAPTIVE_THRESH_GAUSSIAN_C,
                                  cv.THRESH_BINARY, 11, 2)
    return binary


def adjust_brightness(image, factor):
    # Multiplicative scaling; factor > 1 brightens the image
    return cv.convertScaleAbs(image, alpha=factor, beta=0)


def adjust_contrast(image, alpha):
    return cv.convertScaleAbs(image, alpha=alpha, beta=0)


def reduce_noise(image):
    return cv.fastNlMeansDenoisingColored(image, None, 30, 30, 7, 21)


def deskew(image, angle):
    (h, w) = image.shape[:2]
    center = (w // 2, h // 2)
    M = cv.getRotationMatrix2D(center, angle, 1.0)
    rotated = cv.warpAffine(image, M, (w, h), flags=cv.INTER_CUBIC,
                            borderMode=cv.BORDER_REPLICATE)
    return rotated


def convert_to_pil(image):
    if image is None or image.size == 0:
        print("Error: Empty image passed to convert_to_pil")
        return None
    print("Converting image to PIL format")
    # Ensure the array is in uint8 format
    if image.dtype != np.uint8:
        image = image.astype(np.uint8)
    # Single-channel (binarized) images can be wrapped directly; colour images
    # are converted from BGR to RGB first.
    if image.ndim == 2:
        return Image.fromarray(image)
    return Image.fromarray(cv.cvtColor(image, cv.COLOR_BGR2RGB))


def extract_text_from_image(image, langs='tel+osd+eng'):
    pil_image = convert_to_pil(image)
    if pil_image is None:
        print("Error: Failed to convert image to PIL format")
        return ""
    custom_config = r'--oem 3 --psm 6'
    try:
        return pytesseract.image_to_string(pil_image, lang=langs, config=custom_config)
    except pytesseract.TesseractError as e:
        print(f"Tesseract error: {e}")
        return ""


def process_image(img):
    preprocessed = preprocess_image_adaptive(img)
    if preprocessed is None:
        return ""
    return extract_text_from_image(preprocessed)


output_dir = "output"
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
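
# install_dependencies() above shells out to a setup.sh that is not shown here.
# A minimal sketch of what it is assumed to contain (Debian/Ubuntu package names;
# adjust for your platform):
#
#   #!/usr/bin/env bash
#   apt-get update
#   apt-get install -y poppler-utils tesseract-ocr tesseract-ocr-tel tesseract-ocr-osd
#
# poppler-utils backs pdf2image's convert_from_path, while tesseract-ocr plus the
# Telugu and OSD language data back the 'tel+osd+eng' string passed to pytesseract.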

all_texts = {}


def save_and_next(page_num, text, extracted_texts, original_images, total_pages):
    page_num = int(page_num)        # Ensure page_num is an integer
    total_pages = int(total_pages)  # Ensure total_pages is an integer
    formatted_text = {
        f"Page number: {page_num}": {
            "Content": [line for line in text.split('\n') if line.strip() != '']
        }
    }
    all_texts.update(formatted_text)

    json_path = os.path.join(output_dir, "all_texts.json")
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(all_texts, f, ensure_ascii=False, indent=4)

    next_page_num = page_num + 1  # Increment to next page
    if next_page_num <= total_pages:
        next_page_image = original_images[next_page_num - 1]
        text = process_image(next_page_image)
        extracted_texts.append(text)
        return (gr.update(value=text), next_page_num,
                gr.update(value=next_page_image, height=None, width=None), json_path)
    else:
        return "All pages processed", page_num, None, json_path


def skip_page(page_num, extracted_texts, original_images, total_pages):
    next_page_num = int(page_num) + 1  # Ensure page_num is an integer and move to the next page
    total_pages = int(total_pages)     # Ensure total_pages is an integer
    if next_page_num <= total_pages:
        next_page_image = original_images[next_page_num - 1]
        text = process_image(next_page_image)
        extracted_texts.append(text)
        return gr.update(value=text), next_page_num, gr.update(value=next_page_image, height=None, width=None)
    else:
        return "All pages processed", page_num, None


def upload_pdf(pdf):
    pdf_path = pdf.name
    pages = convert_from_path(pdf_path)
    if not pages:
        print("Error: No pages found in PDF")
        # Error message goes to the text editor; the image output stays empty
        return None, "Error: No pages found in PDF", 0, [], [], 0
    print(f"PDF converted to {len(pages)} images")

    first_page = np.array(pages[0])
    if first_page is None or first_page.size == 0:
        print("Error: First page is empty")
        return None, "Error: First page is empty", 0, [], [], 0

    text = process_image(first_page)
    original_images = [np.array(page) for page in pages]
    extracted_texts = [text]
    return (gr.update(value=original_images[0], height=None, width=None), gr.update(value=text),
            1, extracted_texts, original_images, len(pages))


def navigate_to_page(page_num, extracted_texts, original_images):
    page_num = int(page_num)  # Ensure page_num is an integer
    if 0 <= page_num - 1 < len(original_images):
        return (gr.update(value=original_images[page_num - 1], height=None, width=None),
                gr.update(value=extracted_texts[page_num - 1]), page_num)
    else:
        # Leave the image unchanged and report the problem in the text editor
        return gr.update(), gr.update(value="Invalid Page Number"), page_num
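
# For reference, output/all_texts.json accumulates one entry per saved page in the
# shape built by save_and_next above (illustrative values only):
#
#   {
#       "Page number: 1": {
#           "Content": [
#               "first non-empty OCR line",
#               "second non-empty OCR line"
#           ]
#       },
#       "Page number: 2": {
#           "Content": ["..."]
#       }
#   }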

def display_pdf_and_text():
    with gr.Blocks() as demo:
        gr.Markdown("## PDF Viewer and Text Editor")
        pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])

        with gr.Row():
            image_output = gr.Image(label="Page Image", type="numpy")
            text_editor = gr.Textbox(label="Extracted Text", lines=10, interactive=True)

        page_num = gr.Number(value=1, label="Page Number", visible=True)
        extracted_texts = gr.State()
        original_images = gr.State()
        total_pages = gr.State()

        save_next_button = gr.Button("Save and Next")
        skip_button = gr.Button("Skip")
        goto_button = gr.Button("Go to Page")
        json_download = gr.File(label="Download JSON")  # Explicit component for the saved JSON

        pdf_input.upload(
            upload_pdf,
            inputs=pdf_input,
            outputs=[image_output, text_editor, page_num, extracted_texts, original_images, total_pages],
        )
        save_next_button.click(
            fn=save_and_next,
            inputs=[page_num, text_editor, extracted_texts, original_images, total_pages],
            outputs=[text_editor, page_num, image_output, json_download],
        )
        skip_button.click(
            fn=skip_page,
            inputs=[page_num, extracted_texts, original_images, total_pages],
            outputs=[text_editor, page_num, image_output],
        )
        # Page navigation: jump to the page typed into the "Page Number" box.
        # (This replaces the original per-page button row that was generated inside a
        # callback: Gradio event handlers cannot return newly created components, and
        # Python ints cannot be passed as event inputs, so that approach does not work.)
        goto_button.click(
            fn=navigate_to_page,
            inputs=[page_num, extracted_texts, original_images],
            outputs=[image_output, text_editor, page_num],
        )

    return demo


iface = display_pdf_and_text()
iface.launch()
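
# Quick way to exercise the OCR pipeline without the UI (illustrative; assumes a
# local scan saved as "sample_page.png" and the Tesseract language packs noted above):
#
#   img = cv.imread("sample_page.png")  # cv.imread returns BGR, matching the pipeline's assumption
#   if img is not None:
#       print(process_image(img))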