File size: 2,051 Bytes
90d7fd0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6b07f33
 
90d7fd0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
Hugging Face's logo
Hugging Face
Search models, datasets, users...
Models
Datasets
Spaces
Posts
Docs
Solutions
Pricing



Spaces:

Satyacoder
/
vision_test


like
0
App
Files
Community
vision_test
/
app.py

Satyacoder's picture
Satyacoder
Update app.py
8602d39
5 months ago
raw
history
blame
contribute
delete
No virus
1.72 kB
import io

import gradio as gr
import requests
import torch
from PIL import Image
from transformers import DetrImageProcessor, DetrForObjectDetection
from transformers import BlipProcessor, BlipForConditionalGeneration

# Hub checkpoint identifiers, named once so each processor/model pair is
# guaranteed to be loaded from the same checkpoint.
_DETR_CHECKPOINT = "facebook/detr-resnet-50"
_BLIP_CHECKPOINT = "Salesforce/blip-image-captioning-large"

# Object detection: the processor prepares images and post-processes the raw
# model outputs; the model produces the detections.
box_processor = DetrImageProcessor.from_pretrained(_DETR_CHECKPOINT)
box_model = DetrForObjectDetection.from_pretrained(_DETR_CHECKPOINT)

# Image captioning: the processor prepares images and decodes generated token
# ids back into text; the model generates the caption.
caption_processor = BlipProcessor.from_pretrained(_BLIP_CHECKPOINT)
caption_model = BlipForConditionalGeneration.from_pretrained(_BLIP_CHECKPOINT)

# Seconds to wait for the remote host; without a timeout `requests` can block
# indefinitely on a stalled connection.
_REQUEST_TIMEOUT_SECONDS = 10
# Minimum confidence score a detection needs in order to be reported.
_DETECTION_THRESHOLD = 0.70


def predict_bounding_boxes(imageurl: str):
    """Download an image and return its caption and detected objects.

    The image at ``imageurl`` is fetched over HTTP, run through the DETR
    object detector, and then captioned with BLIP.

    Args:
        imageurl: URL of the image to analyse.

    Returns:
        On success, a dict with two keys:

        * ``"image label"``: the generated caption string.
        * ``"detections"``: a list of dicts, one per detection whose score
          exceeds the threshold, each with ``"score"`` (float), ``"label"``
          (class name looked up in the model's ``id2label`` mapping) and
          ``"box"`` (list of four floats as returned by the processor's
          post-processing, scaled to the original image size).

        On any failure, ``{"error": "<message>"}``. The function never
        raises, so the UI/API caller always receives JSON.
    """
    try:
        # NOTE(security): the URL is caller-supplied and fetched from the
        # server, so this endpoint can be used to reach internal hosts.
        # Restrict allowed hosts here if the app is exposed publicly.
        #
        # The context manager releases the connection even on error, and
        # `.content` applies Content-Encoding (gzip/deflate) decoding, which
        # reading `response.raw` directly does not.
        with requests.get(imageurl, timeout=_REQUEST_TIMEOUT_SECONDS) as response:
            response.raise_for_status()
            payload = response.content

        # Normalise to RGB once so both models receive a 3-channel image
        # (grayscale, palette and RGBA sources would otherwise reach DETR
        # unconverted).
        image = Image.open(io.BytesIO(payload)).convert("RGB")

        # Pure inference: skip autograd bookkeeping to save memory and time.
        with torch.no_grad():
            detr_inputs = box_processor(images=image, return_tensors="pt")
            detr_outputs = box_model(**detr_inputs)

            # PIL reports (width, height); post-processing wants (height, width).
            target_sizes = torch.tensor([image.size[::-1]])
            results = box_processor.post_process_object_detection(
                detr_outputs,
                target_sizes=target_sizes,
                threshold=_DETECTION_THRESHOLD,
            )[0]

            caption_inputs = caption_processor(image, return_tensors="pt")
            caption_ids = caption_model.generate(**caption_inputs)

        detections = [
            {
                "score": score.item(),
                "label": box_model.config.id2label[label.item()],
                "box": box.tolist(),
            }
            for score, label, box in zip(
                results["scores"], results["labels"], results["boxes"]
            )
        ]
        caption = caption_processor.decode(caption_ids[0], skip_special_tokens=True)
        return {"image label": caption, "detections": detections}

    except Exception as e:
        # Deliberately broad: this is the API boundary, and every failure
        # (bad URL, HTTP error, undecodable image, model error) is reported
        # to the caller as JSON instead of crashing the request.
        return {"error": str(e)}

# Wrap predict_bounding_boxes in a Gradio interface: one text input (the image
# URL) and one JSON output (the dict the function returns).
app = gr.Interface(fn=predict_bounding_boxes, inputs="text", outputs="json")
# NOTE(review): presumably intended to enable the programmatic API endpoint;
# confirm the installed Gradio version actually reads this attribute.
app.api = True
# Runs at module top level, so the server starts whenever this file is
# executed or imported.
app.launch()