-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathapp.py
64 lines (56 loc) · 2.45 KB
/
app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import streamlit as st
from GroundingDINO.groundingdino.util.inference import load_model, load_image, predict, annotate
import cv2
import numpy as np
import GroundingDINO.groundingdino.datasets.transforms as T
from PIL import Image
import os
import wget
st.title('Grounding DINO')
# Model Backbone
backbone = st.selectbox('Choose Backbone:', ['Swin-T', 'Swin-B'])
if backbone == 'Swin-T':
CONFIG_PATH = os.path.join('GroundingDINO', 'groundingdino', 'config', 'GroundingDINO_SwinT_OGC.py')
WEIGHTS_PATH = "groundingdino_swint_ogc.pth"
if not os.path.exists(WEIGHTS_PATH):
wget.download('https://github.com/IDEA-Research/GroundingDINO/releases/download/v0.1.0-alpha/groundingdino_swint_ogc.pth')
else:
CONFIG_PATH = os.path.join('GroundingDINO', 'groundingdino', 'config', 'GroundingDINO_SwinB_cfg.py')
WEIGHTS_PATH = "groundingdino_swinb_cogcoor.pth"
if not os.path.exists(WEIGHTS_PATH):
wget.download('https://github.com/IDEA-Research/GroundingDINO/releases/download/v0.1.0-alpha2/groundingdino_swinb_cogcoor.pth')
# Transformation
transform = T.Compose(
[
T.RandomResize([800], max_size=1333),
T.ToTensor(),
T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
]
)
BOX_TRESHOLD = st.sidebar.slider('Box Threshold:', min_value=0.0, max_value=1.0, value=0.35)
TEXT_TRESHOLD = st.sidebar.slider('Text Threshold:', min_value=0.0, max_value=1.0, value=0.25)
model = load_model(CONFIG_PATH, WEIGHTS_PATH)
upload_img_file = st.file_uploader(
'Upload Image', type=['jpg', 'jpeg', 'png'])
FRAME_WINDOW = st.image([])
if upload_img_file is not None:
file_bytes = np.asarray(
bytearray(upload_img_file.read()), dtype=np.uint8)
img = cv2.imdecode(file_bytes, 1)
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
img = Image.fromarray(img)
image = np.asarray(img)
image_transformed, _ = transform(img, None)
FRAME_WINDOW.image(img, channels='BGR')
# TEXT_PROMPT = "glass most to the right"
TEXT_PROMPT = st.text_input('Text Prompt:', value="glass most to the right")
if st.checkbox('Predict'):
boxes, logits, phrases = predict(
model=model,
image=image_transformed,
caption=TEXT_PROMPT,
box_threshold=BOX_TRESHOLD,
text_threshold=TEXT_TRESHOLD
)
annotated_frame = annotate(image_source=image, boxes=boxes, logits=logits, phrases=phrases)
FRAME_WINDOW.image(annotated_frame, channels='BGR')