"""Detect books in an image, read their spines with an OpenAI vision model,
and look the results up in the Google Books API."""

import base64
import concurrent.futures
import os
from io import BytesIO

import cv2
import gradio as gr
import numpy as np
import requests
import supervision as sv
from inference_sdk import InferenceHTTPClient, InferenceConfiguration
from openai import OpenAI

# Use HTTPS so the API key is not sent over the network in clear text.
CLIENT = InferenceHTTPClient(
    api_url="https://detect.roboflow.com",
    api_key=os.environ["ROBOFLOW_API_KEY"],
)

custom_configuration = InferenceConfiguration(confidence_threshold=0.2)

openai_client = OpenAI()

# Prompt sent with every isolated region.
_SPINE_PROMPT = (
    "Read the text on the book spine. Only say the book cover title and "
    "author if you can find them. Say the book that is most prominent. "
    "Return the format [title] [author], with no punctuation."
)

_TITLE_PREFIX = "Title:"

_GOOGLE_BOOKS_URL = "https://www.googleapis.com/books/v1/volumes"

# Upper bound (seconds) for a single Google Books lookup so that one stalled
# request cannot block the worker pool forever.
_GOOGLE_BOOKS_TIMEOUT_SECONDS = 10

# Placeholder used for every metadata field that could not be resolved.
_MISSING = "NULL"


def process_mask(region, task_id):
    """Return the title/author text the vision model reads from ``region``.

    Args:
        region: Image array containing one isolated book; it is rotated 90
            degrees counter-clockwise and JPEG-encoded before being sent.
        task_id: Index of the region in the caller's list. Currently unused;
            kept so existing callers keep working.

    Returns:
        The model's reply as a single line: a leading ``"Title:"`` label is
        removed and newlines are replaced by spaces. An empty string is
        returned when the model sends no text content.
    """
    region = cv2.rotate(region, cv2.ROTATE_90_COUNTERCLOCKWISE)
    # change channels
    # NOTE(review): presumably ``region`` arrives in RGB order and this swap
    # yields the BGR order that cv2.imencode writes - confirm with the caller.
    region = cv2.cvtColor(region, cv2.COLOR_BGR2RGB)

    jpeg_bytes = cv2.imencode(".jpg", region)[1].tobytes()
    base64_image = base64.b64encode(jpeg_bytes).decode("utf-8")

    response = openai_client.chat.completions.create(
        model="gpt-4-vision-preview",
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": _SPINE_PROMPT,
                    },
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
                    },
                ],
            }
        ],
        max_tokens=300,
    )

    # ``content`` may be None when the model returns no text.
    text = (response.choices[0].message.content or "").strip()

    # Drop a leading "Title:" label. ``str.rstrip("Title:")`` would instead
    # strip any trailing run of the characters T/i/t/l/e/: and eat the end of
    # real answers (e.g. "... Frank Herbert" -> "... Frank Herber").
    if text.startswith(_TITLE_PREFIX):
        text = text[len(_TITLE_PREFIX):].lstrip()

    text = text.replace("\n", " ")

    print(text)

    return text


def _extract_book_metadata(payload):
    """Pull ``(isbn, author, link)`` out of a Google Books search payload.

    Each field is looked up independently, so a volume that lacks one of them
    still yields the others. Missing fields are reported as ``"NULL"``.
    """
    isbn, author, link = _MISSING, _MISSING, _MISSING

    items = payload.get("items") if isinstance(payload, dict) else None
    if not items:
        return isbn, author, link

    # Only the first search hit is considered.
    volume_info = items[0].get("volumeInfo") or {}

    identifiers = volume_info.get("industryIdentifiers") or []
    if identifiers:
        isbn = identifiers[0].get("identifier", _MISSING)

    authors = volume_info.get("authors") or []
    if authors:
        author = authors[0]

    link = volume_info.get("infoLink", _MISSING)

    return isbn, author, link


def process_book_with_google_books(book):
    """Look ``book`` up in the Google Books API.

    Args:
        book: Free-text query, typically the title/author text read from a
            spine.

    Returns:
        A tuple ``(isbn, author, link)`` taken from the first search result.
        Any field that is unavailable is the string ``"NULL"``; all three are
        ``"NULL"`` when the request fails or the reply is not valid JSON.
    """
    try:
        response = requests.get(
            _GOOGLE_BOOKS_URL,
            # Let requests URL-encode the query so characters such as "&" or
            # "#" in a title cannot truncate or alter it.
            params={"q": book},
            headers={"User-Agent": "Mozilla/5.0"},
            timeout=_GOOGLE_BOOKS_TIMEOUT_SECONDS,
        )
        payload = response.json()
    except (requests.RequestException, ValueError):
        # Best-effort lookup: a failed request must not abort the whole batch.
        return _MISSING, _MISSING, _MISSING

    return _extract_book_metadata(payload)


# define function that accepts an image
def detect_books(image):
    # infer on a local image
    with CLIENT.use_configuration(custom_configuration):
        results = CLIENT.infer(image, model_id="open-shelves/8")

    results = sv.Detections.from_inference(results)

    mask_annotator = sv.MaskAnnotator()

    # NOTE(review): supervision annotators may draw directly on the array
    # passed as ``scene``; confirm that modifying ``image`` here is intended,
    # since the isolated regions below are copied from ``image`` afterwards.
    annotated_image = mask_annotator.annotate(scene=image, detections=results)

    masks_isolated = []

    polygons = [sv.mask_to_polygons(mask) for mask in results.mask]

    for mask in results.mask:
        masked_region = np.zeros_like(image)
        masked_region[mask] = image[mask]
        masks_isolated.append(masked_region)

    print("Calculated masks...")

    with concurrent.futures.ThreadPoolExecutor() as executor:
        tasks = [
            executor.submit(process_mask, region, task_id)
            for task_id, region in enumerate(masks_isolated)
        ]

        books = [task.result() for task in tasks]

    print("Processed books...")

    links = []
    isbns = []
    authors = []

    with concurrent.futures.ThreadPoolExecutor() as executor:
        tasks = [
            executor.submit(process_book_with_google_books, book) for book in books
        ]

        for task in tasks:
            isbn, author, link = task.result()
isbns.append(isbn) authors.append(author) links.append(link) print("Processed books with Google Books...") annotations = [ { "title": title, "author": author, "isbn": isbn, "polygons": [polygon.tolist() for polygon in polygon_list], "xyxy": xyxy.tolist(), "link": link, } for title, author, isbn, polygon_list, xyxy, link in zip( books, authors, isbns, polygons, results.xyxy, links ) if "sorry" not in title.lower() and "NULL" not in title and "cannot" not in title and "can't" not in title ] # order annotations by x0 annotations = sorted(annotations, key=lambda x: x["xyxy"][0]) books = [annotation["title"] for annotation in annotations] isbns = [annotation["isbn"] for annotation in annotations] width, height = image.shape[1], image.shape[0] svg = f"""