Spaces:

Insightly2
/

Article_Segmentation

Sleeping

App Files Community

PriyankaSatish committed on Apr 5

Commit

7a9b80b

•

1 Parent(s): e822c3d

Upload 2 files

Browse files

Files changed (2) hide show

requirements2.txt +0 -0
segmentation.py +81 -0

requirements2.txt ADDED Viewed

Binary file (3.93 kB). View file

segmentation.py ADDED Viewed

	@@ -0,0 +1,81 @@

+import streamlit as st
+from PIL import Image
+import io
+from dotenv import load_dotenv
+import os
+import openai
+from openai import OpenAI
+# Google Cloud Vision
+from google.cloud import vision
+# Load environment variables
+load_dotenv()
+# Set Google Cloud credentials in environment
+service_account_path = os.getenv("GOOGLE_APPLICATION_CREDENTIALS")
+os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = r'gcv-new-project-dd6ed833cc91.json'
+# Initialize Google Vision client
+vision_client = vision.ImageAnnotatorClient()
+def extract_text_with_google_vision_api(image):
+    """Extract structured text from image using Google Cloud Vision API, with additional formatting based on bounding box analysis."""
+    import io
+    from google.cloud import vision
+    vision_client = vision.ImageAnnotatorClient()
+    img_byte_arr = io.BytesIO()
+    image.save(img_byte_arr, format=image.format)
+    image_bytes = img_byte_arr.getvalue()
+    image = vision.Image(content=image_bytes)
+    response = vision_client.document_text_detection(image=image)
+    structured_texts = []
+    for page in response.full_text_annotation.pages:
+        for block in page.blocks:
+            block_texts = []
+            last_paragraph_bottom = None  # Store the bottom position of the last paragraph to compare spacing
+            for paragraph in block.paragraphs:
+                paragraph_text = ' '.join([''.join([symbol.text for symbol in word.symbols]) for word in paragraph.words])
+                paragraph_text = paragraph_text.strip()
+                # Example for analyzing bounding box position and size (simplified)
+                paragraph_bounds = paragraph.bounding_box
+                top_left = paragraph_bounds.vertices[0]  # Example vertex
+                if last_paragraph_bottom is not None and (top_left.y - last_paragraph_bottom) > 10:
+                    # Add additional break if the spacing exceeds some threshold
+                    block_texts.append("")  # This adds an extra line break to indicate a significant separation
+                # Update last_paragraph_bottom to the current paragraph's bottom position
+                last_paragraph_bottom = paragraph_bounds.vertices[2].y  # Assuming 0 is top-left and going clockwise
+                if len(paragraph_text.split()) > 2:
+                    block_texts.append(paragraph_text)
+            if block_texts:
+                structured_texts.append('\n'.join(block_texts))
+    if structured_texts:
+        return '\n\n'.join(structured_texts)
+    else:
+        return "No structured text found."
+def main():
+    st.title("Article Extraction")
+    uploaded_file = st.file_uploader("Upload an image", type=["jpg", "jpeg", "png"])
+    if uploaded_file is not None:
+        image = Image.open(uploaded_file)
+        st.image(image, caption="Uploaded Image", use_column_width=True)
+        extracted_text = extract_text_with_google_vision_api(image)
+        st.success("Analysis completed successfully!")
+        st.header("Extracted Text:")
+        st.write(extracted_text if extracted_text else "No text detected.")
+if __name__ == "__main__":
+    main()