# Earlier single-image prototype, kept commented out for reference:
# import google.generativeai as genai
# from PIL import Image
# import gradio as gr
# import numpy as np
# import os

# GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")

# # Now you can use GOOGLE_API_KEY in your code

# genai.configure(api_key=GOOGLE_API_KEY)

# model = genai.GenerativeModel('gemini-pro-vision')
# def process_image_and_text(image, text):
#   # Assuming image is the input from Gradio
#   if text:
#     image_array = np.asarray(image.data)  # Convert memoryview to NumPy array
#     image = Image.fromarray(image_array.astype('uint8'), 'RGB')  # Now you can use astype
#     response = model.generate_content([text, image])
#     return response.text
#   else:
#     image_array = np.asarray(image.data)  # Convert memoryview to NumPy array
#     image = Image.fromarray(image_array.astype('uint8'), 'RGB')  # Now you can use astype
#     response = model.generate_content(["Tell me about this image in bulletin format", image])
#     return response.text


# iface = gr.Interface(
#     process_image_and_text,
#     inputs=["image", "textbox"],  # Specify image and text inputs
#     outputs="textbox",          # Specify text output
#     title="Image and Text Processor",  # Set the app title
# )

# iface.launch(debug=True, share=True)  # Launch the Gradio app


from dotenv import load_dotenv
import os

import google.generativeai as genai
import gradio as gr
from pdf2image import convert_from_path

# Load GOOGLE_API_KEY from .env before configuring the Gemini client
load_dotenv()
genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))

model = genai.GenerativeModel('gemini-pro-vision')


def process_image_and_text(images):
    """Send each page image to Gemini and collect the per-page responses."""
    responses = {}
    for i, image in enumerate(images):
        result = model.generate_content(
            ["You are acting as a tutor. Solve every question in this image step by step: ", image]
        )
        responses[i] = result.text
    return responses

def input_pdf_setup(uploaded_pdf):
    """Convert each page of the uploaded PDF to a PIL image (pdf2image requires Poppler)."""
    # Gradio's file input may hand back a file path or a file-like object with a .name attribute
    pdf_path = uploaded_pdf if isinstance(uploaded_pdf, str) else uploaded_pdf.name
    images = convert_from_path(pdf_path, dpi=200)
    return images



def extract_answer(uploaded_pdf):
    """Runs every page of the uploaded PDF through Gemini and presents the answers clearly."""
    images = input_pdf_setup(uploaded_pdf)
    responses = process_image_and_text(images=images)

    # Present results page by page in a user-friendly format
    answers = []
    for i, response in enumerate(responses.values()):
        answers.append(f"Answers for page {i + 1}:\n{response}")

    return "\n\n".join(answers)

# Create the Gradio interface
iface = gr.Interface(
    fn=extract_answer,
    inputs="file",
    outputs="text",
    title="Question-Answering with Gemstone.ai",
    description="Upload a PDF containing questions, and get step-by-step answers!",
    allow_flagging="manual",
)


# Launch the Gradio application
iface.launch(share=True, debug=True)
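

# Optional local sanity check, a minimal sketch that bypasses the Gradio UI and calls
# extract_answer directly. "sample_questions.pdf" is a placeholder path (not part of this
# repo); point it at any PDF of questions on your machine.
#
# if __name__ == "__main__":
#     print(extract_answer("sample_questions.pdf"))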