Spaces:

Insightly2
/

Article_Segmentation

Sleeping

App Files Files Community

Article_Segmentation / app.py

PriyankaSatish

Rename segmentation.py to app.py

ee019a1 verified 6 months ago

raw

history blame contribute delete

No virus

3.21 kB

	import streamlit as st
	from PIL import Image
	import io
	from dotenv import load_dotenv
	import os
	import openai
	from openai import OpenAI

	# Google Cloud Vision
	from google.cloud import vision

	# Load variables from a .env file (if one exists) into the process environment.
	load_dotenv()

	# Resolve the Google Cloud service-account key file.
	# A path supplied through GOOGLE_APPLICATION_CREDENTIALS (real environment or
	# .env) takes precedence; the hard-coded key file name is only a fallback.
	# Previously the configured value was read and then unconditionally
	# overwritten, so it could never take effect.
	service_account_path = (
	    os.getenv("GOOGLE_APPLICATION_CREDENTIALS")
	    or r'gcv-new-project-dd6ed833cc91.json'
	)
	os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = service_account_path

	# Initialize Google Vision client (created after the credentials variable is
	# set so the client library can pick it up from the environment).
	vision_client = vision.ImageAnnotatorClient()

	def _paragraph_to_text(paragraph):
	    """Join a Vision paragraph's words (each built from its symbols) with single spaces."""
	    words = ("".join(symbol.text for symbol in word.symbols) for word in paragraph.words)
	    return " ".join(words).strip()


	def extract_text_with_google_vision_api(
	    image,
	    *,
	    paragraph_gap_threshold=10,
	    min_paragraph_words=3,
	):
	    """Extract paragraph-structured text from an image via Google Cloud Vision.

	    The image is encoded to bytes, sent to ``document_text_detection``, and the
	    returned page -> block -> paragraph hierarchy is flattened to plain text.
	    Paragraphs of one block are joined with a newline, blocks are joined with a
	    blank line, and an additional empty line is inserted between two kept
	    paragraphs of a block when a large vertical gap was seen between them.

	    Args:
	        image: A PIL image. Its ``format`` attribute selects the encoding; when
	            it is ``None`` (image not loaded from a file) PNG is used.
	        paragraph_gap_threshold: Vertical distance, in bounding-box coordinate
	            units, above which the space between consecutive paragraphs is
	            treated as a significant separation.
	        min_paragraph_words: Paragraphs with fewer whitespace-separated words
	            than this are dropped.

	    Returns:
	        The formatted text, or ``"No structured text found."`` when no
	        paragraph survives filtering.

	    Raises:
	        RuntimeError: If the Vision API response carries an error message.
	    """
	    buffer = io.BytesIO()
	    # Saving to a file object gives Pillow no extension to infer the format
	    # from, so a missing image.format must be replaced by an explicit one.
	    image.save(buffer, format=image.format or "PNG")

	    # Use a separate name so the PIL image parameter is not shadowed.
	    vision_image = vision.Image(content=buffer.getvalue())
	    # Reuse the module-level client instead of building a new one per call.
	    response = vision_client.document_text_detection(image=vision_image)

	    # Per-image failures are reported inside the response; without this check
	    # an API error would be indistinguishable from an image with no text.
	    if response.error.message:
	        raise RuntimeError(
	            f"Google Cloud Vision request failed: {response.error.message}"
	        )

	    structured_texts = []
	    for page in response.full_text_annotation.pages:
	        for block in page.blocks:
	            block_texts = []
	            previous_bottom = None  # y of the previous paragraph's lower edge
	            pending_break = False  # a large gap was seen since the last kept paragraph

	            for paragraph in block.paragraphs:
	                # Assumes vertices run clockwise from the top-left corner, so
	                # index 0 is the top-left and index 2 the bottom-right.
	                vertices = paragraph.bounding_box.vertices
	                gap_start = previous_bottom
	                previous_bottom = vertices[2].y
	                if (
	                    gap_start is not None
	                    and vertices[0].y - gap_start > paragraph_gap_threshold
	                ):
	                    pending_break = True

	                paragraph_text = _paragraph_to_text(paragraph)
	                if len(paragraph_text.split()) < min_paragraph_words:
	                    continue

	                # Emit the separator only between two kept paragraphs. Adding
	                # it eagerly produced leading/trailing blank lines and blocks
	                # made solely of "" that defeated the "nothing found" fallback.
	                if pending_break and block_texts:
	                    block_texts.append("")
	                pending_break = False
	                block_texts.append(paragraph_text)

	            if block_texts:
	                structured_texts.append("\n".join(block_texts))

	    if not structured_texts:
	        return "No structured text found."
	    return "\n\n".join(structured_texts)

	def main():
	    """Render the page: accept an image upload, preview it, and show its extracted text."""
	    st.title("Article Extraction")
	    upload = st.file_uploader("Upload an image", type=["jpg", "jpeg", "png"])

	    # Nothing to process until the user has supplied a file.
	    if upload is None:
	        return

	    picture = Image.open(upload)
	    st.image(picture, caption="Uploaded Image", use_column_width=True)

	    text = extract_text_with_google_vision_api(picture)

	    st.success("Analysis completed successfully!")
	    st.header("Extracted Text:")
	    # Fall back to a placeholder when the extractor yields an empty result.
	    st.write(text or "No text detected.")

	# Start the app only when this file is executed as a script, not on import.
	if __name__ == "__main__":
	    main()