From 1d7f89099297282a37e0e7a2cdb22e5224de24c1 Mon Sep 17 00:00:00 2001 From: crislmfroes Date: Sun, 23 Jun 2024 16:21:19 -0300 Subject: [PATCH 1/9] Add vision language model node. --- butia_vision_language_model/CMakeLists.txt | 202 ++++++++++++++++++ .../config/vision_language_model_node.yaml | 11 + .../launch/vision_language_model.launch | 6 + butia_vision_language_model/package.xml | 59 +++++ .../scripts/vision_language_model_node.py | 87 ++++++++ 5 files changed, 365 insertions(+) create mode 100644 butia_vision_language_model/CMakeLists.txt create mode 100644 butia_vision_language_model/config/vision_language_model_node.yaml create mode 100644 butia_vision_language_model/launch/vision_language_model.launch create mode 100644 butia_vision_language_model/package.xml create mode 100755 butia_vision_language_model/scripts/vision_language_model_node.py diff --git a/butia_vision_language_model/CMakeLists.txt b/butia_vision_language_model/CMakeLists.txt new file mode 100644 index 0000000..9396249 --- /dev/null +++ b/butia_vision_language_model/CMakeLists.txt @@ -0,0 +1,202 @@ +cmake_minimum_required(VERSION 3.0.2) +project(butia_vision_language_model) + +## Compile as C++11, supported in ROS Kinetic and newer +# add_compile_options(-std=c++11) + +## Find catkin macros and libraries +## if COMPONENTS list like find_package(catkin REQUIRED COMPONENTS xyz) +## is used, also find other catkin packages +find_package(catkin REQUIRED) + +## System dependencies are found with CMake's conventions +# find_package(Boost REQUIRED COMPONENTS system) + + +## Uncomment this if the package has a setup.py. This macro ensures +## modules and global scripts declared therein get installed +## See http://ros.org/doc/api/catkin/html/user_guide/setup_dot_py.html +# catkin_python_setup() + +################################################ +## Declare ROS messages, services and actions ## +################################################ + +## To declare and build messages, services or actions from within this +## package, follow these steps: +## * Let MSG_DEP_SET be the set of packages whose message types you use in +## your messages/services/actions (e.g. std_msgs, actionlib_msgs, ...). +## * In the file package.xml: +## * add a build_depend tag for "message_generation" +## * add a build_depend and a exec_depend tag for each package in MSG_DEP_SET +## * If MSG_DEP_SET isn't empty the following dependency has been pulled in +## but can be declared for certainty nonetheless: +## * add a exec_depend tag for "message_runtime" +## * In this file (CMakeLists.txt): +## * add "message_generation" and every package in MSG_DEP_SET to +## find_package(catkin REQUIRED COMPONENTS ...) +## * add "message_runtime" and every package in MSG_DEP_SET to +## catkin_package(CATKIN_DEPENDS ...) +## * uncomment the add_*_files sections below as needed +## and list every .msg/.srv/.action file to be processed +## * uncomment the generate_messages entry below +## * add every package in MSG_DEP_SET to generate_messages(DEPENDENCIES ...) 
+
+## Generate messages in the 'msg' folder
+# add_message_files(
+#   FILES
+#   Message1.msg
+#   Message2.msg
+# )
+
+## Generate services in the 'srv' folder
+# add_service_files(
+#   FILES
+#   Service1.srv
+#   Service2.srv
+# )
+
+## Generate actions in the 'action' folder
+# add_action_files(
+#   FILES
+#   Action1.action
+#   Action2.action
+# )
+
+## Generate added messages and services with any dependencies listed here
+# generate_messages(
+#   DEPENDENCIES
+#   std_msgs  # Or other packages containing msgs
+# )
+
+################################################
+## Declare ROS dynamic reconfigure parameters ##
+################################################
+
+## To declare and build dynamic reconfigure parameters within this
+## package, follow these steps:
+## * In the file package.xml:
+##   * add a build_depend and a exec_depend tag for "dynamic_reconfigure"
+## * In this file (CMakeLists.txt):
+##   * add "dynamic_reconfigure" to
+##     find_package(catkin REQUIRED COMPONENTS ...)
+##   * uncomment the "generate_dynamic_reconfigure_options" section below
+##     and list every .cfg file to be processed
+
+## Generate dynamic reconfigure parameters in the 'cfg' folder
+# generate_dynamic_reconfigure_options(
+#   cfg/DynReconf1.cfg
+#   cfg/DynReconf2.cfg
+# )
+
+###################################
+## catkin specific configuration ##
+###################################
+## The catkin_package macro generates cmake config files for your package
+## Declare things to be passed to dependent projects
+## INCLUDE_DIRS: uncomment this if your package contains header files
+## LIBRARIES: libraries you create in this project that dependent projects also need
+## CATKIN_DEPENDS: catkin_packages dependent projects also need
+## DEPENDS: system dependencies of this project that dependent projects also need
+catkin_package(
+#  INCLUDE_DIRS include
+#  LIBRARIES butia_vision_language_model
+#  CATKIN_DEPENDS other_catkin_pkg
+#  DEPENDS system_lib
+)
+
+###########
+## Build ##
+###########
+
+## Specify additional locations of header files
+## Your package locations should be listed before other locations
+include_directories(
+# include
+# ${catkin_INCLUDE_DIRS}
+)
+
+## Declare a C++ library
+# add_library(${PROJECT_NAME}
+#   src/${PROJECT_NAME}/butia_vision_language_model.cpp
+# )
+
+## Add cmake target dependencies of the library
+## as an example, code may need to be generated before libraries
+## either from message generation or dynamic reconfigure
+# add_dependencies(${PROJECT_NAME} ${${PROJECT_NAME}_EXPORTED_TARGETS} ${catkin_EXPORTED_TARGETS})
+
+## Declare a C++ executable
+## With catkin_make all packages are built within a single CMake context
+## The recommended prefix ensures that target names across packages don't collide
+# add_executable(${PROJECT_NAME}_node src/butia_vision_language_model_node.cpp)
+
+## Rename C++ executable without prefix
+## The above recommended prefix causes long target names, the following renames the
+## target back to the shorter version for ease of user use
+## e.g. "rosrun someones_pkg node" instead of "rosrun someones_pkg someones_pkg_node"
+# set_target_properties(${PROJECT_NAME}_node PROPERTIES OUTPUT_NAME node PREFIX "")
+
+## Add cmake target dependencies of the executable
+## same as for the library above
+# add_dependencies(${PROJECT_NAME}_node ${${PROJECT_NAME}_EXPORTED_TARGETS} ${catkin_EXPORTED_TARGETS})
+
+## Specify libraries to link a library or executable target against
+# target_link_libraries(${PROJECT_NAME}_node
+#   ${catkin_LIBRARIES}
+# )
+
+#############
+## Install ##
+#############
+
+# all install targets should use catkin DESTINATION variables
+# See http://ros.org/doc/api/catkin/html/adv_user_guide/variables.html
+
+## Mark executable scripts (Python etc.) for installation
+## in contrast to setup.py, you can choose the destination
+# catkin_install_python(PROGRAMS
+#   scripts/my_python_script
+#   DESTINATION ${CATKIN_PACKAGE_BIN_DESTINATION}
+# )
+
+## Mark executables for installation
+## See http://docs.ros.org/melodic/api/catkin/html/howto/format1/building_executables.html
+# install(TARGETS ${PROJECT_NAME}_node
+#   RUNTIME DESTINATION ${CATKIN_PACKAGE_BIN_DESTINATION}
+# )
+
+## Mark libraries for installation
+## See http://docs.ros.org/melodic/api/catkin/html/howto/format1/building_libraries.html
+# install(TARGETS ${PROJECT_NAME}
+#   ARCHIVE DESTINATION ${CATKIN_PACKAGE_LIB_DESTINATION}
+#   LIBRARY DESTINATION ${CATKIN_PACKAGE_LIB_DESTINATION}
+#   RUNTIME DESTINATION ${CATKIN_GLOBAL_BIN_DESTINATION}
+# )
+
+## Mark cpp header files for installation
+# install(DIRECTORY include/${PROJECT_NAME}/
+#   DESTINATION ${CATKIN_PACKAGE_INCLUDE_DESTINATION}
+#   FILES_MATCHING PATTERN "*.h"
+#   PATTERN ".svn" EXCLUDE
+# )
+
+## Mark other files for installation (e.g. launch and bag files, etc.)
+# install(FILES
+#   # myfile1
+#   # myfile2
+#   DESTINATION ${CATKIN_PACKAGE_SHARE_DESTINATION}
+# )
+
+#############
+## Testing ##
+#############
+
+## Add gtest based cpp test target and link libraries
+# catkin_add_gtest(${PROJECT_NAME}-test test/test_butia_vision_language_model.cpp)
+# if(TARGET ${PROJECT_NAME}-test)
+#   target_link_libraries(${PROJECT_NAME}-test ${PROJECT_NAME})
+# endif()
+
+## Add folders to be run by python nosetests
+# catkin_add_nosetests(test)
diff --git a/butia_vision_language_model/config/vision_language_model_node.yaml b/butia_vision_language_model/config/vision_language_model_node.yaml
new file mode 100644
index 0000000..184d86b
--- /dev/null
+++ b/butia_vision_language_model/config/vision_language_model_node.yaml
@@ -0,0 +1,11 @@
+vlm_api_type: google-genai #Must be one of: ['openai', 'ollama', 'google-genai']
+vlm_api_host: dummy #Must be set for openai and ollama, but is not used for google-genai
+vlm_api_model: gemini-1.5-flash
+
+subscribers:
+  image_rgb:
+    topic: /butia_vision/bvb/image_rgb
+
+servers:
+  visual_question_answering:
+    service: /butia_vision/bvlm/visual_question_answering/query
diff --git a/butia_vision_language_model/launch/vision_language_model.launch b/butia_vision_language_model/launch/vision_language_model.launch
new file mode 100644
index 0000000..c45d7db
--- /dev/null
+++ b/butia_vision_language_model/launch/vision_language_model.launch
@@ -0,0 +1,6 @@
+<?xml version="1.0"?>
+<launch>
+  <node name="vision_language_model_node" pkg="butia_vision_language_model" type="vision_language_model_node.py" output="screen">
+    <rosparam command="load" file="$(find butia_vision_language_model)/config/vision_language_model_node.yaml"/>
+  </node>
+</launch>
\ No newline at end of file
diff --git a/butia_vision_language_model/package.xml b/butia_vision_language_model/package.xml
new file mode 100644
index 0000000..4129a5f
--- /dev/null
+++ b/butia_vision_language_model/package.xml
@@ -0,0 +1,59 @@
+<?xml version="1.0"?>
+<package format="2">
+  <name>butia_vision_language_model</name>
+  <version>0.0.0</version>
+  <description>The butia_vision_language_model package</description>
+
+  <!-- One maintainer tag required, multiple allowed, one person per tag -->
+  <!-- Example: -->
+  <!-- <maintainer email="jane.doe@example.com">Jane Doe</maintainer> -->
+  <maintainer email="cris@todo.todo">cris</maintainer>
+
+  <!-- One license tag required, multiple allowed, one license per tag -->
+  <!-- Commonly used license strings: -->
+  <!--   BSD, MIT, Boost Software License, GPLv2, GPLv3, LGPLv2.1, LGPLv3 -->
+  <license>TODO</license>
+
+  <!-- Url tags are optional, but multiple are allowed, one per tag -->
+  <!-- Optional attribute type can be: website, bugtracker, or repository -->
+  <!-- Example: -->
+  <!-- <url type="website">http://wiki.ros.org/butia_vision_language_model</url> -->
+
+  <!-- Author tags are optional, multiple are allowed, one per tag -->
+  <!-- Authors do not have to be maintainers, but could be -->
+  <!-- Example: -->
+  <!-- <author email="jane.doe@example.com">Jane Doe</author> -->
+
+  <!-- The *depend tags are used to specify dependencies -->
+  <!-- Dependencies can be catkin packages or system dependencies -->
+  <!-- Examples: -->
+  <!-- Use depend as a shortcut for packages that are both build and exec dependencies -->
+  <!--   <depend>roscpp</depend> -->
+  <!--   Note that this is equivalent to the following: -->
+  <!--   <build_depend>roscpp</build_depend> -->
+  <!--   <exec_depend>roscpp</exec_depend> -->
+  <!-- Use build_depend for packages you need at compile time: -->
+  <!--   <build_depend>message_generation</build_depend> -->
+  <!-- Use build_export_depend for packages you need in order to build against this package: -->
+  <!--   <build_export_depend>message_generation</build_export_depend> -->
+  <!-- Use buildtool_depend for build tool packages: -->
+  <!--   <buildtool_depend>catkin</buildtool_depend> -->
+  <!-- Use exec_depend for packages you need at runtime: -->
+  <!--   <exec_depend>message_runtime</exec_depend> -->
+  <!-- Use test_depend for packages you need only for testing: -->
+  <!--   <test_depend>gtest</test_depend> -->
+  <!-- Use doc_depend for packages you need only for building documentation: -->
+  <!--   <doc_depend>doxygen</doc_depend> -->
+  <buildtool_depend>catkin</buildtool_depend>
+
+  <!-- The export tag contains other, unspecified, tags -->
+  <export>
+    <!-- Other tools can request additional information be placed here -->
+
+  </export>
+</package>
diff --git a/butia_vision_language_model/scripts/vision_language_model_node.py b/butia_vision_language_model/scripts/vision_language_model_node.py
new file mode 100755
index 0000000..73cebe4
--- /dev/null
+++ b/butia_vision_language_model/scripts/vision_language_model_node.py
@@ -0,0 +1,87 @@
+#!/usr/bin/env python3
+import rospy
+from sensor_msgs.msg import Image
+from butia_vision_msgs.srv import VisualQuestionAnswering, VisualQuestionAnsweringRequest, VisualQuestionAnsweringResponse
+import PIL.Image
+from ros_numpy import numpify
+import base64
+from io import BytesIO
+from langchain_core.messages import HumanMessage
+
+# The chat-model backends are optional dependencies; only the configured one must be importable.
+try:
+    from langchain_community.chat_models.ollama import ChatOllama
+except ImportError:
+    pass
+try:
+    from langchain_openai.chat_models import ChatOpenAI
+except ImportError:
+    pass
+try:
+    from langchain_google_genai.chat_models import ChatGoogleGenerativeAI
+except ImportError:
+    pass
+
+
+class VisionLanguageModelNode:
+    def __init__(self):
+        self.read_parameters()
+        if self.vlm_api_type == 'ollama':
+            self.vlm = ChatOllama(model=self.vlm_api_model, base_url=self.vlm_api_host)
+        elif self.vlm_api_type == 'openai':
+            self.vlm = ChatOpenAI(model_name=self.vlm_api_model, openai_api_base=self.vlm_api_host)
+        elif self.vlm_api_type == 'google-genai':
+            self.vlm = ChatGoogleGenerativeAI(model=self.vlm_api_model, convert_system_message_to_human=True)
+        else:
+            raise ValueError(f"VLM API type must be one of: {['ollama', 'openai', 'google-genai']}!")
+        self.image_rgb_subscriber = rospy.Subscriber(self.rgb_image_topic, Image, callback=self._update_rgb_image)
+        self.visual_question_answering_server = rospy.Service(self.visual_question_answering_service, VisualQuestionAnswering, handler=self._handle_visual_question_answering)
+
+    def _update_rgb_image(self, msg: Image):
+        self.rgb_image_msg = msg
+
+    def _handle_visual_question_answering(self, req: VisualQuestionAnsweringRequest):
+        message = HumanMessage(
+            content=[
+                self.get_image_content(),
+                {
+                    'type': 'text',
+                    'text': f'{req.question}'
+                }
+            ]
+        )
+        res = VisualQuestionAnsweringResponse()
+        res.answer = self.vlm.invoke([message,]).content
+        res.confidence = 1.0
+        return res
+
+    def get_image_content(self):
+        # Block until a frame is available so self.rgb_image_msg is guaranteed to be set
+        self.rgb_image_msg = rospy.wait_for_message(self.rgb_image_topic, Image)
+        buffered = BytesIO()
+        img = PIL.Image.fromarray(numpify(self.rgb_image_msg)[:,:,::-1])
+        img.save(buffered, format='JPEG')
+        b64_image_str = base64.b64encode(buffered.getvalue()).decode()
+        if self.vlm_api_type in ('ollama',):
+            return {
+                'type': 'image_url',
+                'image_url': f"data:image/jpeg;base64,{b64_image_str}"
+            }
+        else:
+            return {
+                'type': 'image_url',
+                'image_url': {
+                    'url': f"data:image/jpeg;base64,{b64_image_str}"
+                }
+            }
+
+
+    def read_parameters(self):
+        self.vlm_api_type = rospy.get_param('~vlm_api_type')
+        self.vlm_api_host = rospy.get_param('~vlm_api_host')
+        self.vlm_api_model = rospy.get_param('~vlm_api_model')
+        self.rgb_image_topic = rospy.get_param('~subscribers/image_rgb/topic')
+        self.visual_question_answering_service = rospy.get_param('~servers/visual_question_answering/service')
+
+if __name__ == '__main__':
+    rospy.init_node('vision_language_model_node', anonymous=True)
+    node = VisionLanguageModelNode()
+    rospy.spin()
\ No newline at end of file

From 284600b9e4f4e977fc84029ac4a1d9932819f35e Mon Sep 17 00:00:00 2001
From: crislmfroes
Date: Tue, 25 Jun 2024 23:32:02 -0300
Subject: [PATCH 2/9] Add Paligemma VLM recognition.
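
The node below builds its detection prompt as "detect <class> ; <class>" and
parses PaliGemma's reply with supervision. A minimal sketch of that round trip
(the example reply string and image size are illustrative, not taken from this
patch):

```python
import supervision as sv

# PaliGemma answers a detect prompt with four <locYYYY> tokens per object,
# encoding the box corners on a 1024-bin grid, followed by the class label.
raw_result = "<loc0256><loc0256><loc0768><loc0768> pringles"  # example reply
detections = sv.Detections.from_lmm(
    lmm='paligemma',
    result=raw_result,
    resolution_wh=(640, 480),  # width/height of the source image
    classes=['pringles'],      # same class list used in the prompt
)
print(detections.xyxy, detections.class_id)
```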
---
 .../config/paligemma_object_recognition.yaml  |  45 +++++
 .../paligemma_object_recognition.launch       |  21 +++
 .../paligemma_recognition/__init__.py         |   0
 .../paligemma_recognition.py                  | 174 ++++++++++++++++++
 4 files changed, 240 insertions(+)
 create mode 100644 butia_recognition/config/paligemma_object_recognition.yaml
 create mode 100644 butia_recognition/launch/paligemma_object_recognition.launch
 create mode 100644 butia_recognition/scripts/butia_recognition/paligemma_recognition/__init__.py
 create mode 100755 butia_recognition/scripts/butia_recognition/paligemma_recognition/paligemma_recognition.py

diff --git a/butia_recognition/config/paligemma_object_recognition.yaml b/butia_recognition/config/paligemma_object_recognition.yaml
new file mode 100644
index 0000000..082fc39
--- /dev/null
+++ b/butia_recognition/config/paligemma_object_recognition.yaml
@@ -0,0 +1,45 @@
+threshold: 0.3
+classes_by_category:
+  Transportation: ['bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat']
+  Traffic: ['traffic light', 'fire hydrant', 'stop sign', 'parking meter']
+  Furniture: ['bench', 'chair', 'couch', 'plant', 'bed', 'table', 'toilet']
+  Electronics: ['tv', 'laptop', 'mouse', 'remote', 'keyboard', 'phone', 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'clock', 'drier']
+  Sports: ['frisbee', 'skis', 'snowboard', 'ball', 'kite', 'bat', 'glove', 'skateboard', 'surfboard', 'racket']
+  Utensils: ['bottle', 'glass', 'cup', 'fork', 'knife', 'spoon', 'bowl']
+  Fruits: ['banana', 'apple', 'orange', 'broccoli', 'carrot', 'hotdog', 'pizza', 'donut', 'cake', 'grape', 'pineapple']
+  Animals: ['bird', 'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'teddybear']
+  Household: ['backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'vase', 'scissors', 'book', 'toothbrush']
+all_classes: ['pringles',]
+
+
+max_sizes:
+  - [0.4, 2.5, 0.5]
+
+subscribers:
+
+  queue_size: 1
+  exact_time: false
+  slop: 0.2
+
+  image_rgb: /butia_vision/bvb/image_rgb
+  camera_info: /butia_vision/bvb/camera_info
+  image_depth: /butia_vision/bvb/image_depth
+
+publishers:
+
+  object_recognition:
+    topic: /butia_vision/br/object_recognition
+    queue_size: 1
+
+servers:
+  set_class:
+    service: /butia_vision/br/object_recognition/set_class
+
+  list_classes:
+    service: /butia_vision/br/object_recognition/list_classes
+
+  start:
+    service: /butia_vision/br/object_recognition/start
+
+  stop:
+    service: /butia_vision/br/object_recognition/stop
diff --git a/butia_recognition/launch/paligemma_object_recognition.launch b/butia_recognition/launch/paligemma_object_recognition.launch
new file mode 100644
index 0000000..d4495ac
--- /dev/null
+++ b/butia_recognition/launch/paligemma_object_recognition.launch
@@ -0,0 +1,21 @@
+<?xml version="1.0"?>
+<launch>
+  <arg name="machine" default="localhost"/>
+  <arg name="use_machine" default="true"/>
+  <arg name="output" default="screen"/>
+
+  <machine name="localhost" address="localhost" if="$(arg use_machine)"/>
+
+  <node name="paligemma_object_recognition_node" pkg="butia_recognition" type="paligemma_recognition.py" machine="$(arg machine)" output="$(arg output)">
+    <rosparam command="load" file="$(find butia_recognition)/config/paligemma_object_recognition.yaml"/>
+    <rosparam param="colors">
+      [255,0,0]
+    </rosparam>
+  </node>
+</launch>
diff --git a/butia_recognition/scripts/butia_recognition/paligemma_recognition/__init__.py b/butia_recognition/scripts/butia_recognition/paligemma_recognition/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/butia_recognition/scripts/butia_recognition/paligemma_recognition/paligemma_recognition.py b/butia_recognition/scripts/butia_recognition/paligemma_recognition/paligemma_recognition.py
new file mode 100755
index 0000000..4aa12ed
--- /dev/null
+++ b/butia_recognition/scripts/butia_recognition/paligemma_recognition/paligemma_recognition.py
@@ -0,0 +1,174 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+import rospy
+import ros_numpy
+from butia_recognition import BaseRecognition, ifState
+import numpy as np
+import os
+from copy import copy
+import cv2
+from inference.models.paligemma import PaliGemma
+from inference.models.sam import SegmentAnything
+from std_msgs.msg import Header
+from sensor_msgs.msg import Image
+from geometry_msgs.msg import Vector3
+from butia_vision_msgs.msg import Description2D, Recognitions2D
+from butia_vision_msgs.srv import SetClass, SetClassRequest, SetClassResponse
+import torch
+import gc
+import PIL
+import supervision as sv
+
+
+class PaliGemmaRecognition(BaseRecognition):
+    def __init__(self, state=True):
+        super().__init__(state=state)
+
+        self.readParameters()
+
+        self.colors = dict([(k, np.random.randint(low=0, high=256, size=(3,)).tolist()) for k in self.classes])
+
+        self.loadModel()
+        self.initRosComm()
+
+    def initRosComm(self):
+        self.debug_publisher = rospy.Publisher(self.debug_topic, Image, queue_size=self.debug_qs)
+        self.object_recognition_publisher = rospy.Publisher(self.object_recognition_topic, Recognitions2D, queue_size=self.object_recognition_qs)
+        self.people_detection_publisher = rospy.Publisher(self.people_detection_topic, Recognitions2D, queue_size=self.people_detection_qs)
+        self.set_class_service_server = rospy.Service(self.set_class_service, SetClass, self.serverSetClass)
+        super().initRosComm(callbacks_obj=self)
+
+    def serverSetClass(self, req: SetClassRequest):
+        self.all_classes = [req.class_name,]
+        return SetClassResponse()
+
+    def serverStart(self, req):
+        self.loadModel()
+        return super().serverStart(req)
+
+    def serverStop(self, req):
+        self.unLoadModel()
+        return super().serverStop(req)
+
+    def loadModel(self):
+        self.model = PaliGemma(model_id='paligemma-3b-mix-224')
+        self.sam = SegmentAnything()
+        print('Done loading model!')
+
+    def unLoadModel(self):
+        del self.model
+        del self.sam
+        gc.collect()
+        torch.cuda.empty_cache()
+        self.model = None
+
+    @ifState
+    def callback(self, *args):
+        source_data = self.sourceDataFromArgs(args)
+
+        if 'image_rgb' not in source_data:
+            rospy.logwarn('Source data has no image_rgb.')
+            return None
+
+        img_rgb = source_data['image_rgb']
+        cv_img = ros_numpy.numpify(img_rgb)
+        rospy.loginfo('Image ID: ' + str(img_rgb.header.seq))
+
+        objects_recognition = Recognitions2D()
+        h = Header()
+        h.seq = self.seq # message id
+        self.seq += 1 # next message id
+        h.stamp = rospy.Time.now()
+
+        objects_recognition.header = h
+        objects_recognition = BaseRecognition.addSourceData2Recognitions2D(source_data, objects_recognition)
+        people_recognition = copy(objects_recognition)
+        description_header = img_rgb.header
+        description_header.seq = 0
+
+        results = self.model.predict(image_in=PIL.Image.fromarray(cv2.cvtColor(cv_img, cv2.COLOR_BGR2RGB)), prompt=f"detect " + " ; ".join(self.all_classes))[0]
+        boxes_ = sv.Detections.from_lmm(lmm='paligemma', result=results[0], resolution_wh=(cv_img.shape[1], cv_img.shape[0]), classes=self.all_classes)
+        debug_img = cv_img
+        masks = []
+        embeddings = self.sam.embed_image(image=cv_img)[0]
+        for x1, y1, x2, y2 in boxes_.xyxy:
+            center_x = (x1 + x2)//2
+            center_y = (y1 + y2)//2
+            masks.append(self.sam.segment_image(image=cv_img, embeddings=embeddings, point_labels=[1], point_coords=[[center_x, center_y]])[0])
+        boxes_.mask = np.array(masks)
+        if len(boxes_):
+            for i in range(len(boxes_)):
+                box = boxes_[i]
+                xyxy_box = list(boxes_[i].xyxy.astype(int)[0])
+
+                if int(box.class_id) >= len(self.all_classes):
+                    continue
+
+                label_class = self.all_classes[int(box.class_id)]
+
+
+                description = Description2D()
+                description.header = copy(description_header)
+                description.type = Description2D.DETECTION
+                description.id = description.header.seq
+                description.score = 1.0
+                description.max_size = Vector3(*[0.05, 0.05, 0.05])
+                size = int(xyxy_box[2] - xyxy_box[0]), int(xyxy_box[3] - xyxy_box[1])
+                description.bbox.center.x = int(xyxy_box[0]) + int(size[0]/2)
+                description.bbox.center.y = int(xyxy_box[1]) + int(size[1]/2)
+                description.bbox.size_x = size[0]
+                description.bbox.size_y = size[1]
+                description.mask = ros_numpy.msgify(Image, boxes_.mask[i])
+
+                if ('people' in self.all_classes and label_class in self.classes_by_category['people'] or 'people' in self.all_classes and label_class == 'people'):
+
+                    description.label = 'people' + '/' + label_class
+                    people_recognition.descriptions.append(description)
+
+                elif label_class in self.all_classes:
+                    index = None
+
+                    for value in self.classes_by_category.items():
+                        if label_class in value[1]:
+                            index = value[0]
+
+                    description.label = index + '/' + label_class if index is not None else label_class
+                    objects_recognition.descriptions.append(description)
+
+                debug_img = sv.BoundingBoxAnnotator().annotate(debug_img, boxes_)
+                description_header.seq += 1
+
+            self.debug_publisher.publish(ros_numpy.msgify(Image, debug_img, 'rgb8'))
+
+            if len(objects_recognition.descriptions) > 0:
+                self.object_recognition_publisher.publish(objects_recognition)
+
+            if len(people_recognition.descriptions) > 0:
+                self.people_detection_publisher.publish(people_recognition)
+        else:
+            debug_img = sv.BoundingBoxAnnotator().annotate(debug_img, boxes_)
+            self.debug_publisher.publish(ros_numpy.msgify(Image, debug_img, 'rgb8'))
+
+    def readParameters(self):
+        self.debug_topic = rospy.get_param("~publishers/debug/topic", "/butia_vision/br/debug")
+        self.debug_qs = rospy.get_param("~publishers/debug/queue_size", 1)
+
+        self.object_recognition_topic = rospy.get_param("~publishers/object_recognition/topic", "/butia_vision/br/object_recognition")
+        self.object_recognition_qs = rospy.get_param("~publishers/object_recognition/queue_size", 1)
+
+        self.people_detection_topic = rospy.get_param("~publishers/people_detection/topic", "/butia_vision/br/people_detection")
+        self.people_detection_qs = rospy.get_param("~publishers/people_detection/queue_size", 1)
+
+        self.set_class_service = rospy.get_param("~servers/set_class/service", "/butia_vision/br/object_recognition/set_class")
+
+        self.all_classes = list(rospy.get_param("~all_classes", []))
+        self.classes_by_category = dict(rospy.get_param("~classes_by_category", {}))
+
+        super().readParameters()
+
+if __name__ == '__main__':
+    rospy.init_node('paligemma_recognition_node', anonymous = True)
+
+    paligemma = PaliGemmaRecognition()
+
+    rospy.spin()

From a12f9c48249b539da672a264ac7363cc3fc4ed7d Mon Sep 17 00:00:00 2001
From: crislmfroes
Date: Wed, 26 Jun 2024 10:40:38 -0300
Subject: [PATCH 3/9] Plot masks and labels for PaliGemma.
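
This switches the debug overlay from plain boxes to masks plus labels. A
standalone sketch of the supervision calls being introduced (image, detections,
and classes are placeholders):

```python
import supervision as sv

# draw segmentation masks, then class labels, on a copy of the frame
annotated = sv.MaskAnnotator().annotate(scene=image.copy(), detections=detections)
annotated = sv.LabelAnnotator().annotate(
    scene=annotated,
    detections=detections,
    labels=[classes[class_id] for class_id in detections.class_id],
)
```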
--- .../paligemma_recognition/paligemma_recognition.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/butia_recognition/scripts/butia_recognition/paligemma_recognition/paligemma_recognition.py b/butia_recognition/scripts/butia_recognition/paligemma_recognition/paligemma_recognition.py index 4aa12ed..bf4b7a2 100755 --- a/butia_recognition/scripts/butia_recognition/paligemma_recognition/paligemma_recognition.py +++ b/butia_recognition/scripts/butia_recognition/paligemma_recognition/paligemma_recognition.py @@ -135,7 +135,8 @@ def callback(self, *args): description.label = index + '/' + label_class if index is not None else label_class objects_recognition.descriptions.append(description) - debug_img = sv.BoundingBoxAnnotator().annotate(debug_img, boxes_) + debug_img = sv.MaskAnnotator().annotate(debug_img, boxes_) + debug_img = sv.LabelAnnotator().annotate(debug_img, boxes_, [self.all_classes[idx] for idx in boxes_.class_id]) description_header.seq += 1 self.debug_publisher.publish(ros_numpy.msgify(Image, debug_img, 'rgb8')) @@ -146,7 +147,8 @@ def callback(self, *args): if len(people_recognition.descriptions) > 0: self.people_detection_publisher.publish(people_recognition) else: - debug_img = sv.BoundingBoxAnnotator().annotate(debug_img, boxes_) + debug_img = sv.MaskAnnotator().annotate(debug_img, boxes_) + debug_img = sv.LabelAnnotator().annotate(debug_img, boxes_, [self.all_classes[idx] for idx in boxes_.class_id]) self.debug_publisher.publish(ros_numpy.msgify(Image, debug_img, 'rgb8')) def readParameters(self): From cc66c2944aeda5dfa7cb8fb308851f8f8279ec05 Mon Sep 17 00:00:00 2001 From: crislmfroes Date: Wed, 26 Jun 2024 11:07:48 -0300 Subject: [PATCH 4/9] Add Visual Question Answering to PaliGemma. --- .../config/paligemma_object_recognition.yaml | 3 +++ .../paligemma_recognition/paligemma_recognition.py | 12 +++++++++++- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/butia_recognition/config/paligemma_object_recognition.yaml b/butia_recognition/config/paligemma_object_recognition.yaml index 082fc39..f575752 100644 --- a/butia_recognition/config/paligemma_object_recognition.yaml +++ b/butia_recognition/config/paligemma_object_recognition.yaml @@ -35,6 +35,9 @@ servers: set_class: service: /butia_vision/br/object_recognition/set_class + visual_question_answering: + service: /butia_vision/br/object_recognition/visual_question_answering + list_classes: service: /butia_vision/br/object_recognition/list_classes diff --git a/butia_recognition/scripts/butia_recognition/paligemma_recognition/paligemma_recognition.py b/butia_recognition/scripts/butia_recognition/paligemma_recognition/paligemma_recognition.py index bf4b7a2..85e9910 100755 --- a/butia_recognition/scripts/butia_recognition/paligemma_recognition/paligemma_recognition.py +++ b/butia_recognition/scripts/butia_recognition/paligemma_recognition/paligemma_recognition.py @@ -14,6 +14,7 @@ from geometry_msgs.msg import Vector3 from butia_vision_msgs.msg import Description2D, Recognitions2D from butia_vision_msgs.srv import SetClass, SetClassRequest, SetClassResponse +from butia_vision_msgs.srv import VisualQuestionAnswering, VisualQuestionAnsweringRequest, VisualQuestionAnsweringResponse import torch import gc import PIL @@ -36,12 +37,19 @@ def initRosComm(self): self.object_recognition_publisher = rospy.Publisher(self.object_recognition_topic, Recognitions2D, queue_size=self.object_recognition_qs) self.people_detection_publisher = rospy.Publisher(self.people_detection_topic, Recognitions2D, 
queue_size=self.people_detection_qs)
         self.set_class_service_server = rospy.Service(self.set_class_service, SetClass, self.serverSetClass)
+        self.visual_question_answering_service_server = rospy.Service(self.visual_question_answering_service, VisualQuestionAnswering, self.serverVisualQuestionAnswering)
         super().initRosComm(callbacks_obj=self)
 
-    def serverSetClass(self, req: SetClassRequest):
+    def serverSetClass(self, req):
         self.all_classes = [req.class_name,]
         return SetClassResponse()
 
+    def serverVisualQuestionAnswering(self, req):
+        result = self.model.predict(image_in=self.cv_img, prompt=req.question)
+        res = VisualQuestionAnsweringResponse()
+        res.answer = result[0]
+        return res
+
     def serverStart(self, req):
         self.loadModel()
         return super().serverStart(req)
@@ -72,6 +80,7 @@ def callback(self, *args):
 
         img_rgb = source_data['image_rgb']
         cv_img = ros_numpy.numpify(img_rgb)
+        self.cv_img = cv_img
         rospy.loginfo('Image ID: ' + str(img_rgb.header.seq))
 
@@ -162,6 +171,7 @@ def readParameters(self):
         self.people_detection_qs = rospy.get_param("~publishers/people_detection/queue_size", 1)
 
         self.set_class_service = rospy.get_param("~servers/set_class/service", "/butia_vision/br/object_recognition/set_class")
+        self.visual_question_answering_service = rospy.get_param("~servers/visual_question_answering/service", "/butia_vision/br/object_recognition/visual_question_answering")
 
         self.all_classes = list(rospy.get_param("~all_classes", []))
         self.classes_by_category = dict(rospy.get_param("~classes_by_category", {}))

From 00b328607c13d9f910c4adeeb59e28af62822751 Mon Sep 17 00:00:00 2001
From: crislmfroes
Date: Mon, 1 Jul 2024 23:17:06 -0300
Subject: [PATCH 5/9] Add README.md to explain how to install paligemma_recognition.
---
 .../butia_recognition/paligemma_recognition/README.md | 7 +++++++
 1 file changed, 7 insertions(+)
 create mode 100644 butia_recognition/scripts/butia_recognition/paligemma_recognition/README.md

diff --git a/butia_recognition/scripts/butia_recognition/paligemma_recognition/README.md b/butia_recognition/scripts/butia_recognition/paligemma_recognition/README.md
new file mode 100644
index 0000000..e8a8596
--- /dev/null
+++ b/butia_recognition/scripts/butia_recognition/paligemma_recognition/README.md
@@ -0,0 +1,7 @@
+# Install
+
+Run the following commands on the jetson, and make sure the pre-installed versions of pytorch, numpy, and the other libraries from the JetPack SDK are kept frozen and not updated.
+
+```sh
+pip install inference supervision
+```
\ No newline at end of file

From 8a2f8add047c8fa49394762845328b2b2e6aed5b Mon Sep 17 00:00:00 2001
From: crislmfroes
Date: Mon, 1 Jul 2024 23:22:34 -0300
Subject: [PATCH 6/9] Update readme
---
 .../scripts/butia_recognition/paligemma_recognition/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/butia_recognition/scripts/butia_recognition/paligemma_recognition/README.md b/butia_recognition/scripts/butia_recognition/paligemma_recognition/README.md
index e8a8596..c836d34 100644
--- a/butia_recognition/scripts/butia_recognition/paligemma_recognition/README.md
+++ b/butia_recognition/scripts/butia_recognition/paligemma_recognition/README.md
@@ -1,6 +1,6 @@
 # Install
 
-Run the following commands on the jetson, and make sure the pre-installed versions of pytorch, numpy, and the other libraries from the JetPack SDK are kept frozen and not updated.
+Install [this branch](https://github.com/butia-bots/butia_vision_msgs/tree/feature/gpsr-recognition) of butia_vision_msgs.
Then run the following commands on the jetson, and make sure the pre-installed versions of pytorch, numpy, and the other libraries from the JetPack SDK are kept frozen and not updated during the install process.
 
 ```sh
 pip install inference supervision
 ```
\ No newline at end of file

From 06ac07bd6d9a4156be4b2e1b2b2e745bb59766e6 Mon Sep 17 00:00:00 2001
From: crislmfroes
Date: Thu, 4 Jul 2024 21:25:27 -0300
Subject: [PATCH 7/9] Update readme
---
 .../scripts/butia_recognition/paligemma_recognition/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/butia_recognition/scripts/butia_recognition/paligemma_recognition/README.md b/butia_recognition/scripts/butia_recognition/paligemma_recognition/README.md
index c836d34..92ce27f 100644
--- a/butia_recognition/scripts/butia_recognition/paligemma_recognition/README.md
+++ b/butia_recognition/scripts/butia_recognition/paligemma_recognition/README.md
@@ -3,5 +3,5 @@
 Install [this branch](https://github.com/butia-bots/butia_vision_msgs/tree/feature/gpsr-recognition) of butia_vision_msgs. Then run the following commands on the jetson, and make sure the pre-installed versions of pytorch, numpy, and the other libraries from the JetPack SDK are kept frozen and not updated during the install process.
 
 ```sh
-pip install inference supervision
+pip install inference supervision transformers accelerate peft
 ```
\ No newline at end of file

From f41edac96cd748c6ce338b3f119b4850af6a022e Mon Sep 17 00:00:00 2001
From: Igor Maurell
Date: Sat, 13 Jul 2024 21:36:31 -0300
Subject: [PATCH 8/9] Change VLM library from roboflow inference to huggingface transformers.
---
 .../paligemma_recognition.py | 48 ++++++++++++------
 1 file changed, 32 insertions(+), 16 deletions(-)

diff --git a/butia_recognition/scripts/butia_recognition/paligemma_recognition/paligemma_recognition.py b/butia_recognition/scripts/butia_recognition/paligemma_recognition/paligemma_recognition.py
index 85e9910..c7837f9 100755
--- a/butia_recognition/scripts/butia_recognition/paligemma_recognition/paligemma_recognition.py
+++ b/butia_recognition/scripts/butia_recognition/paligemma_recognition/paligemma_recognition.py
@@ -7,8 +7,8 @@
 import rospy
 import ros_numpy
 from butia_recognition import BaseRecognition, ifState
 import numpy as np
 import os
 from copy import copy
 import cv2
-from inference.models.paligemma import PaliGemma
-from inference.models.sam import SegmentAnything
+from transformers import PaliGemmaForConditionalGeneration, PaliGemmaProcessor
+from transformers import SamModel, SamProcessor
 from std_msgs.msg import Header
 from sensor_msgs.msg import Image
 from geometry_msgs.msg import Vector3
@@ -29,6 +29,8 @@ def __init__(self, state=True):
 
         self.readParameters()
 
         self.colors = dict([(k, np.random.randint(low=0, high=256, size=(3,)).tolist()) for k in self.classes])
 
+        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+
         self.loadModel()
         self.initRosComm()
 
@@ -45,9 +47,9 @@ def serverSetClass(self, req):
         self.all_classes = [req.class_name,]
         return SetClassResponse()
 
     def serverVisualQuestionAnswering(self, req):
-        result = self.model.predict(image_in=self.cv_img, prompt=req.question)
+        result = self.inferPaliGemma(image=PIL.Image.fromarray(cv2.cvtColor(self.cv_img, cv2.COLOR_BGR2RGB)), prompt=req.question)
         res = VisualQuestionAnsweringResponse()
-        res.answer = result[0]
+        res.answer = result
         return res
 
@@ -59,16 +61,33 @@ def serverStop(self, req):
         self.unLoadModel()
         return super().serverStop(req)
 
     def loadModel(self):
-        self.model = PaliGemma(model_id='paligemma-3b-mix-224')
-        self.sam = SegmentAnything()
+        self.pg = PaliGemmaForConditionalGeneration.from_pretrained('google/paligemma-3b-mix-224').to(self.device)
+        self.pg_processor = 
PaliGemmaProcessor.from_pretrained('google/paligemma-3b-mix-224') + self.sam = SamModel.from_pretrained('facebook/sam-vit-base').to(self.device) + self.sam_processor = SamProcessor.from_pretrained('facebook/sam-vit-base') print('Done loading model!') def unLoadModel(self): - del self.model + del self.pg del self.sam gc.collect() torch.cuda.empty_cache() - self.model = None + self.pg = None + self.sam = None + + def inferPaliGemma(self, image, prompt): + inputs = self.pg_processor(text=prompt, images=image, return_tensors="pt").to(self.device) + with torch.inference_mode(): + outputs = self.pg.generate(**inputs, max_new_tokens=50, do_sample=False) + result = self.pg_processor.batch_decode(outputs, skip_special_tokens=True) + return result[0][len(prompt):].lstrip('\n') + + def inferSam(self, image, input_boxes): + inputs = self.sam_processor(images=image, input_boxes=input_boxes, return_tensors="pt").to(self.device) + with torch.inference_mode(): + outputs = self.sam(**inputs) + masks = self.sam_processor.image_processor.post_process_masks(outputs.pred_masks.cpu(), inputs["original_sizes"].cpu(), inputs["reshaped_input_sizes"].cpu()) + return masks[0].detach().cpu().numpy() @ifState def callback(self, *args): @@ -95,17 +114,14 @@ def callback(self, *args): description_header = img_rgb.header description_header.seq = 0 - results = self.model.predict(image_in=PIL.Image.fromarray(cv2.cvtColor(cv_img, cv2.COLOR_BGR2RGB)), prompt=f"detect " + " ; ".join(self.all_classes))[0] - boxes_ = sv.Detections.from_lmm(lmm='paligemma', result=results[0], resolution_wh=(cv_img.shape[1], cv_img.shape[0]), classes=self.all_classes) + results = self.inferPaliGemma(image=PIL.Image.fromarray(cv2.cvtColor(cv_img, cv2.COLOR_BGR2RGB)), prompt=f"detect " + " ; ".join(self.all_classes)) + boxes_ = sv.Detections.from_lmm(lmm='paligemma', result=results, resolution_wh=(cv_img.shape[1], cv_img.shape[0]), classes=self.all_classes) debug_img = cv_img masks = [] - embeddings = self.sam.embed_image(image=cv_img)[0] for x1, y1, x2, y2 in boxes_.xyxy: - center_x = (x1 + x2)//2 - center_y = (y1 + y2)//2 - masks.append(self.sam.segment_image(image=cv_img, embeddings=embeddings, point_labels=[1], point_coords=[[center_x, center_y]])[0]) - boxes_.mask = np.array(masks) + masks.append(self.inferSam(image=PIL.Image.fromarray(cv2.cvtColor(cv_img, cv2.COLOR_BGR2RGB)), input_boxes=[[[x1, y1, x2, y2]]])[:,0,:,:]) if len(boxes_): + boxes_.mask = np.array(masks).reshape((len(masks), cv_img.shape[0], cv_img.shape[1])) for i in range(len(boxes_)): box = boxes_[i] xyxy_box = list(boxes_[i].xyxy.astype(int)[0]) @@ -127,7 +143,7 @@ def callback(self, *args): description.bbox.center.y = int(xyxy_box[1]) + int(size[1]/2) description.bbox.size_x = size[0] description.bbox.size_y = size[1] - description.mask = ros_numpy.msgify(Image, boxes_.mask[i]) + description.mask = ros_numpy.msgify(Image, (boxes_.mask[i]*255).astype(np.uint8), encoding='mono8') if ('people' in self.all_classes and label_class in self.classes_by_category['people'] or 'people' in self.all_classes and label_class == 'people'): From eab80e04440d55c9eb3465867941a7ee981a6372 Mon Sep 17 00:00:00 2001 From: Igor Maurell Date: Sat, 13 Jul 2024 21:38:51 -0300 Subject: [PATCH 9/9] Update VLM install instructions. 
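
The pip line below drops roboflow inference and adds bitsandbytes. A quick,
illustrative sanity check that the JetPack-provided stacks survive the install:

```python
# verify the pinned JetPack builds are still the ones being imported
import numpy, torch, transformers
print('torch', torch.__version__, 'CUDA available:', torch.cuda.is_available())
print('numpy', numpy.__version__)
print('transformers', transformers.__version__)
```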
---
 .../scripts/butia_recognition/paligemma_recognition/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/butia_recognition/scripts/butia_recognition/paligemma_recognition/README.md b/butia_recognition/scripts/butia_recognition/paligemma_recognition/README.md
index 92ce27f..73c7ca7 100644
--- a/butia_recognition/scripts/butia_recognition/paligemma_recognition/README.md
+++ b/butia_recognition/scripts/butia_recognition/paligemma_recognition/README.md
@@ -3,5 +3,5 @@
 Install [this branch](https://github.com/butia-bots/butia_vision_msgs/tree/feature/gpsr-recognition) of butia_vision_msgs. Then run the following commands on the jetson, and make sure the pre-installed versions of pytorch, numpy, and the other libraries from the JetPack SDK are kept frozen and not updated during the install process.
 
 ```sh
-pip install inference supervision transformers accelerate peft
+pip install transformers accelerate peft bitsandbytes supervision
 ```
\ No newline at end of file
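
For reference, a minimal client for the visual question answering service added
in this series (hypothetical node name; the service path is the package default
from paligemma_object_recognition.yaml):

```python
#!/usr/bin/env python3
import rospy
from butia_vision_msgs.srv import VisualQuestionAnswering, VisualQuestionAnsweringRequest

rospy.init_node('vqa_client_example', anonymous=True)
service_name = '/butia_vision/br/object_recognition/visual_question_answering'
rospy.wait_for_service(service_name)
vqa = rospy.ServiceProxy(service_name, VisualQuestionAnswering)

# ask a question about the most recent camera frame
request = VisualQuestionAnsweringRequest(question='What objects are on the table?')
print(vqa(request).answer)
```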