From 1d7f89099297282a37e0e7a2cdb22e5224de24c1 Mon Sep 17 00:00:00 2001 From: crislmfroes Date: Sun, 23 Jun 2024 16:21:19 -0300 Subject: [PATCH 1/9] Add vision language model node. --- butia_vision_language_model/CMakeLists.txt | 202 ++++++++++++++++++ .../config/vision_language_model_node.yaml | 11 + .../launch/vision_language_model.launch | 6 + butia_vision_language_model/package.xml | 59 +++++ .../scripts/vision_language_model_node.py | 87 ++++++++ 5 files changed, 365 insertions(+) create mode 100644 butia_vision_language_model/CMakeLists.txt create mode 100644 butia_vision_language_model/config/vision_language_model_node.yaml create mode 100644 butia_vision_language_model/launch/vision_language_model.launch create mode 100644 butia_vision_language_model/package.xml create mode 100755 butia_vision_language_model/scripts/vision_language_model_node.py diff --git a/butia_vision_language_model/CMakeLists.txt b/butia_vision_language_model/CMakeLists.txt new file mode 100644 index 0000000..9396249 --- /dev/null +++ b/butia_vision_language_model/CMakeLists.txt @@ -0,0 +1,202 @@ +cmake_minimum_required(VERSION 3.0.2) +project(butia_vision_language_model) + +## Compile as C++11, supported in ROS Kinetic and newer +# add_compile_options(-std=c++11) + +## Find catkin macros and libraries +## if COMPONENTS list like find_package(catkin REQUIRED COMPONENTS xyz) +## is used, also find other catkin packages +find_package(catkin REQUIRED) + +## System dependencies are found with CMake's conventions +# find_package(Boost REQUIRED COMPONENTS system) + + +## Uncomment this if the package has a setup.py. This macro ensures +## modules and global scripts declared therein get installed +## See http://ros.org/doc/api/catkin/html/user_guide/setup_dot_py.html +# catkin_python_setup() + +################################################ +## Declare ROS messages, services and actions ## +################################################ + +## To declare and build messages, services or actions from within this +## package, follow these steps: +## * Let MSG_DEP_SET be the set of packages whose message types you use in +## your messages/services/actions (e.g. std_msgs, actionlib_msgs, ...). +## * In the file package.xml: +## * add a build_depend tag for "message_generation" +## * add a build_depend and a exec_depend tag for each package in MSG_DEP_SET +## * If MSG_DEP_SET isn't empty the following dependency has been pulled in +## but can be declared for certainty nonetheless: +## * add a exec_depend tag for "message_runtime" +## * In this file (CMakeLists.txt): +## * add "message_generation" and every package in MSG_DEP_SET to +## find_package(catkin REQUIRED COMPONENTS ...) +## * add "message_runtime" and every package in MSG_DEP_SET to +## catkin_package(CATKIN_DEPENDS ...) +## * uncomment the add_*_files sections below as needed +## and list every .msg/.srv/.action file to be processed +## * uncomment the generate_messages entry below +## * add every package in MSG_DEP_SET to generate_messages(DEPENDENCIES ...) 
+
+## Generate messages in the 'msg' folder
+# add_message_files(
+#   FILES
+#   Message1.msg
+#   Message2.msg
+# )
+
+## Generate services in the 'srv' folder
+# add_service_files(
+#   FILES
+#   Service1.srv
+#   Service2.srv
+# )
+
+## Generate actions in the 'action' folder
+# add_action_files(
+#   FILES
+#   Action1.action
+#   Action2.action
+# )
+
+## Generate added messages and services with any dependencies listed here
+# generate_messages(
+#   DEPENDENCIES
+#   std_msgs  # Or other packages containing msgs
+# )
+
+################################################
+## Declare ROS dynamic reconfigure parameters ##
+################################################
+
+## To declare and build dynamic reconfigure parameters within this
+## package, follow these steps:
+## * In the file package.xml:
+##   * add a build_depend and a exec_depend tag for "dynamic_reconfigure"
+## * In this file (CMakeLists.txt):
+##   * add "dynamic_reconfigure" to
+##     find_package(catkin REQUIRED COMPONENTS ...)
+##   * uncomment the "generate_dynamic_reconfigure_options" section below
+##     and list every .cfg file to be processed
+
+## Generate dynamic reconfigure parameters in the 'cfg' folder
+# generate_dynamic_reconfigure_options(
+#   cfg/DynReconf1.cfg
+#   cfg/DynReconf2.cfg
+# )
+
+###################################
+## catkin specific configuration ##
+###################################
+## The catkin_package macro generates cmake config files for your package
+## Declare things to be passed to dependent projects
+## INCLUDE_DIRS: uncomment this if your package contains header files
+## LIBRARIES: libraries you create in this project that dependent projects also need
+## CATKIN_DEPENDS: catkin_packages dependent projects also need
+## DEPENDS: system dependencies of this project that dependent projects also need
+catkin_package(
+#  INCLUDE_DIRS include
+#  LIBRARIES butia_vision_language_model
+#  CATKIN_DEPENDS other_catkin_pkg
+#  DEPENDS system_lib
+)
+
+###########
+## Build ##
+###########
+
+## Specify additional locations of header files
+## Your package locations should be listed before other locations
+include_directories(
+# include
+# ${catkin_INCLUDE_DIRS}
+)
+
+## Declare a C++ library
+# add_library(${PROJECT_NAME}
+#   src/${PROJECT_NAME}/butia_vision_language_model.cpp
+# )
+
+## Add cmake target dependencies of the library
+## as an example, code may need to be generated before libraries
+## either from message generation or dynamic reconfigure
+# add_dependencies(${PROJECT_NAME} ${${PROJECT_NAME}_EXPORTED_TARGETS} ${catkin_EXPORTED_TARGETS})
+
+## Declare a C++ executable
+## With catkin_make all packages are built within a single CMake context
+## The recommended prefix ensures that target names across packages don't collide
+# add_executable(${PROJECT_NAME}_node src/butia_vision_language_model_node.cpp)
+
+## Rename C++ executable without prefix
+## The above recommended prefix causes long target names, the following renames the
+## target back to the shorter version for ease of user use
+## e.g. "rosrun someones_pkg node" instead of "rosrun someones_pkg someones_pkg_node"
+# set_target_properties(${PROJECT_NAME}_node PROPERTIES OUTPUT_NAME node PREFIX "")
+
+## Add cmake target dependencies of the executable
+## same as for the library above
+# add_dependencies(${PROJECT_NAME}_node ${${PROJECT_NAME}_EXPORTED_TARGETS} ${catkin_EXPORTED_TARGETS})
+
+## Specify libraries to link a library or executable target against
+# target_link_libraries(${PROJECT_NAME}_node
+#   ${catkin_LIBRARIES}
+# )
+
+#############
+## Install ##
+#############
+
+# all install targets should use catkin DESTINATION variables
+# See http://ros.org/doc/api/catkin/html/adv_user_guide/variables.html
+
+## Mark executable scripts (Python etc.) for installation
+## in contrast to setup.py, you can choose the destination
+# catkin_install_python(PROGRAMS
+#   scripts/my_python_script
+#   DESTINATION ${CATKIN_PACKAGE_BIN_DESTINATION}
+# )
+
+## Mark executables for installation
+## See http://docs.ros.org/melodic/api/catkin/html/howto/format1/building_executables.html
+# install(TARGETS ${PROJECT_NAME}_node
+#   RUNTIME DESTINATION ${CATKIN_PACKAGE_BIN_DESTINATION}
+# )
+
+## Mark libraries for installation
+## See http://docs.ros.org/melodic/api/catkin/html/howto/format1/building_libraries.html
+# install(TARGETS ${PROJECT_NAME}
+#   ARCHIVE DESTINATION ${CATKIN_PACKAGE_LIB_DESTINATION}
+#   LIBRARY DESTINATION ${CATKIN_PACKAGE_LIB_DESTINATION}
+#   RUNTIME DESTINATION ${CATKIN_GLOBAL_BIN_DESTINATION}
+# )
+
+## Mark cpp header files for installation
+# install(DIRECTORY include/${PROJECT_NAME}/
+#   DESTINATION ${CATKIN_PACKAGE_INCLUDE_DESTINATION}
+#   FILES_MATCHING PATTERN "*.h"
+#   PATTERN ".svn" EXCLUDE
+# )
+
+## Mark other files for installation (e.g. launch and bag files, etc.)
+# install(FILES
+#   # myfile1
+#   # myfile2
+#   DESTINATION ${CATKIN_PACKAGE_SHARE_DESTINATION}
+# )
+
+#############
+## Testing ##
+#############
+
+## Add gtest based cpp test target and link libraries
+# catkin_add_gtest(${PROJECT_NAME}-test test/test_butia_vision_language_model.cpp)
+# if(TARGET ${PROJECT_NAME}-test)
+#   target_link_libraries(${PROJECT_NAME}-test ${PROJECT_NAME})
+# endif()
+
+## Add folders to be run by python nosetests
+# catkin_add_nosetests(test)
diff --git a/butia_vision_language_model/config/vision_language_model_node.yaml b/butia_vision_language_model/config/vision_language_model_node.yaml
new file mode 100644
index 0000000..184d86b
--- /dev/null
+++ b/butia_vision_language_model/config/vision_language_model_node.yaml
@@ -0,0 +1,11 @@
+vlm_api_type: google-genai #Must be one of: ['openai', 'ollama', 'google-genai']
+vlm_api_host: dummy #Must be set for openai and ollama, but is not used for google-genai
+vlm_api_model: gemini-1.5-flash
+
+subscribers:
+  image_rgb:
+    topic: /butia_vision/bvb/image_rgb
+
+servers:
+  visual_question_answering:
+    service: /butia_vision/bvlm/visual_question_answering/query
diff --git a/butia_vision_language_model/launch/vision_language_model.launch b/butia_vision_language_model/launch/vision_language_model.launch
new file mode 100644
index 0000000..c45d7db
--- /dev/null
+++ b/butia_vision_language_model/launch/vision_language_model.launch
@@ -0,0 +1,6 @@
+<?xml version="1.0"?>
+<launch>
+  <node name="vision_language_model_node" pkg="butia_vision_language_model" type="vision_language_model_node.py" output="screen">
+    <rosparam command="load" file="$(find butia_vision_language_model)/config/vision_language_model_node.yaml"/>
+  </node>
+</launch>
\ No newline at end of file
diff --git a/butia_vision_language_model/package.xml b/butia_vision_language_model/package.xml
new file mode 100644
index 0000000..4129a5f
--- /dev/null
+++ b/butia_vision_language_model/package.xml
@@ -0,0 +1,59 @@
+<?xml version="1.0"?>
+<package format="2">
+  <name>butia_vision_language_model</name>
+  <version>0.0.0</version>
+  <description>The butia_vision_language_model package</description>
+
+  <!-- One maintainer tag required, multiple allowed, one person per tag -->
+  <!-- Example: -->
+  <!-- <maintainer email="jane.doe@example.com">Jane Doe</maintainer> -->
+  <maintainer email="cris@todo.todo">cris</maintainer>
+
+  <!-- One license tag required, multiple allowed, one license per tag -->
+  <!-- Commonly used license strings: -->
+  <!--   BSD, MIT, Boost Software License, GPLv2, GPLv3, LGPLv2.1, LGPLv3 -->
+  <license>TODO</license>
+
+  <!-- Url tags are optional, but multiple are allowed, one per tag -->
+  <!-- Optional attribute type can be: website, bugtracker, or repository -->
+  <!-- Example: -->
+  <!-- <url type="website">http://wiki.ros.org/butia_vision_language_model</url> -->
+
+  <!-- Author tags are optional, multiple are allowed, one per tag -->
+  <!-- Authors do not have to be maintainers, but could be -->
+  <!-- Example: -->
+  <!-- <author email="jane.doe@example.com">Jane Doe</author> -->
+
+  <!-- The *depend tags are used to specify dependencies -->
+  <!-- Dependencies can be catkin packages or system dependencies -->
+  <!-- Examples: -->
+  <!-- Use depend as a shortcut for packages that are both build and exec dependencies -->
+  <!--   <depend>roscpp</depend> -->
+  <!--   Note that this is equivalent to the following: -->
+  <!--   <build_depend>roscpp</build_depend> -->
+  <!--   <exec_depend>roscpp</exec_depend> -->
+  <!-- Use build_depend for packages you need at compile time: -->
+  <!--   <build_depend>message_generation</build_depend> -->
+  <!-- Use build_export_depend for packages you need in order to build against this package: -->
+  <!--   <build_export_depend>message_generation</build_export_depend> -->
+  <!-- Use buildtool_depend for build tool packages: -->
+  <!--   <buildtool_depend>catkin</buildtool_depend> -->
+  <!-- Use exec_depend for packages you need at runtime: -->
+  <!--   <exec_depend>message_runtime</exec_depend> -->
+  <!-- Use test_depend for packages you need only for testing: -->
+  <!--   <test_depend>gtest</test_depend> -->
+  <!-- Use doc_depend for packages you need only for building documentation: -->
+  <!--   <doc_depend>doxygen</doc_depend> -->
+  <buildtool_depend>catkin</buildtool_depend>
+
+  <!-- The export tag contains other, unspecified, tags -->
+  <export>
+    <!-- Other tools can request additional information be placed here -->
+
+  </export>
+</package>
diff --git a/butia_vision_language_model/scripts/vision_language_model_node.py b/butia_vision_language_model/scripts/vision_language_model_node.py
new file mode 100755
index 0000000..73cebe4
--- /dev/null
+++ b/butia_vision_language_model/scripts/vision_language_model_node.py
@@ -0,0 +1,87 @@
+#!/usr/bin/env python3
+import rospy
+from sensor_msgs.msg import Image
+from butia_vision_msgs.srv import VisualQuestionAnswering, VisualQuestionAnsweringRequest, VisualQuestionAnsweringResponse
+import PIL.Image
+from ros_numpy import numpify
+import base64
+from io import BytesIO
+from langchain_core.messages import HumanMessage
+
+# The chat-model backends are optional dependencies; only the configured one must be importable.
+try:
+    from langchain_community.chat_models.ollama import ChatOllama
+except ImportError:
+    pass
+try:
+    from langchain_openai.chat_models import ChatOpenAI
+except ImportError:
+    pass
+try:
+    from langchain_google_genai.chat_models import ChatGoogleGenerativeAI
+except ImportError:
+    pass
+
+
+class VisionLanguageModelNode:
+    def __init__(self):
+        self.read_parameters()
+        if self.vlm_api_type == 'ollama':
+            self.vlm = ChatOllama(model=self.vlm_api_model, base_url=self.vlm_api_host)
+        elif self.vlm_api_type == 'openai':
+            self.vlm = ChatOpenAI(model_name=self.vlm_api_model, openai_api_base=self.vlm_api_host)
+        elif self.vlm_api_type == 'google-genai':
+            self.vlm = ChatGoogleGenerativeAI(model=self.vlm_api_model, convert_system_message_to_human=True)
+        else:
+            raise ValueError(f"VLM API type must be one of: {['ollama', 'openai', 'google-genai']}!")
+        self.image_rgb_subscriber = rospy.Subscriber(self.rgb_image_topic, Image, callback=self._update_rgb_image)
+        self.visual_question_answering_server = rospy.Service(self.visual_question_answering_service, VisualQuestionAnswering, handler=self._handle_visual_question_answering)
+
+    def _update_rgb_image(self, msg: Image):
+        self.rgb_image_msg = msg
+
+    def _handle_visual_question_answering(self, req: VisualQuestionAnsweringRequest):
+        message = HumanMessage(
+            content=[
+                self.get_image_content(),
+                {
+                    'type': 'text',
+                    'text': f'{req.question}'
+                }
+            ]
+        )
+        res = VisualQuestionAnsweringResponse()
+        res.answer = self.vlm.invoke([message,]).content
+        res.confidence = 1.0
+        return res
+
+    def get_image_content(self):
+        # Block until a frame is available so self.rgb_image_msg is guaranteed to be set
+        self.rgb_image_msg = rospy.wait_for_message(self.rgb_image_topic, Image)
+        buffered = BytesIO()
+        img = PIL.Image.fromarray(numpify(self.rgb_image_msg)[:,:,::-1])
+        img.save(buffered, format='JPEG')
+        b64_image_str = base64.b64encode(buffered.getvalue()).decode()
+        if self.vlm_api_type in ('ollama',):
+            return {
+                'type': 'image_url',
+                'image_url': f"data:image/jpeg;base64,{b64_image_str}"
+            }
+        else:
+            return {
+                'type': 'image_url',
+                'image_url': {
+                    'url': f"data:image/jpeg;base64,{b64_image_str}"
+                }
+            }
+
+
+    def read_parameters(self):
+        self.vlm_api_type = rospy.get_param('~vlm_api_type')
+        self.vlm_api_host = rospy.get_param('~vlm_api_host')
+        self.vlm_api_model = rospy.get_param('~vlm_api_model')
+        self.rgb_image_topic = rospy.get_param('~subscribers/image_rgb/topic')
+        self.visual_question_answering_service = rospy.get_param('~servers/visual_question_answering/service')
+
+if __name__ == '__main__':
+    rospy.init_node('vision_language_model_node', anonymous=True)
+    node = VisionLanguageModelNode()
+    rospy.spin()
\ No newline at end of file

From 284600b9e4f4e977fc84029ac4a1d9932819f35e Mon Sep 17 00:00:00 2001
From: crislmfroes
Date: Tue, 25 Jun 2024 23:32:02 -0300
Subject: [PATCH 2/9] Add Paligemma VLM recognition.
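
The node below builds its detection prompt as "detect <class> ; <class>" and
parses PaliGemma's reply with supervision. A minimal sketch of that round trip
(the example reply string and image size are illustrative, not taken from this
patch):

```python
import supervision as sv

# PaliGemma answers a detect prompt with four <locYYYY> tokens per object,
# encoding the box corners on a 1024-bin grid, followed by the class label.
raw_result = "<loc0256><loc0256><loc0768><loc0768> pringles"  # example reply
detections = sv.Detections.from_lmm(
    lmm='paligemma',
    result=raw_result,
    resolution_wh=(640, 480),  # width/height of the source image
    classes=['pringles'],      # same class list used in the prompt
)
print(detections.xyxy, detections.class_id)
```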
---
 .../config/paligemma_object_recognition.yaml  |  45 +++++
 .../paligemma_object_recognition.launch       |  21 +++
 .../paligemma_recognition/__init__.py         |   0
 .../paligemma_recognition.py                  | 174 ++++++++++++++++++
 4 files changed, 240 insertions(+)
 create mode 100644 butia_recognition/config/paligemma_object_recognition.yaml
 create mode 100644 butia_recognition/launch/paligemma_object_recognition.launch
 create mode 100644 butia_recognition/scripts/butia_recognition/paligemma_recognition/__init__.py
 create mode 100755 butia_recognition/scripts/butia_recognition/paligemma_recognition/paligemma_recognition.py

diff --git a/butia_recognition/config/paligemma_object_recognition.yaml b/butia_recognition/config/paligemma_object_recognition.yaml
new file mode 100644
index 0000000..082fc39
--- /dev/null
+++ b/butia_recognition/config/paligemma_object_recognition.yaml
@@ -0,0 +1,45 @@
+threshold: 0.3
+classes_by_category:
+  Transportation: ['bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat']
+  Traffic: ['traffic light', 'fire hydrant', 'stop sign', 'parking meter']
+  Furniture: ['bench', 'chair', 'couch', 'plant', 'bed', 'table', 'toilet']
+  Electronics: ['tv', 'laptop', 'mouse', 'remote', 'keyboard', 'phone', 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'clock', 'drier']
+  Sports: ['frisbee', 'skis', 'snowboard', 'ball', 'kite', 'bat', 'glove', 'skateboard', 'surfboard', 'racket']
+  Utensils: ['bottle', 'glass', 'cup', 'fork', 'knife', 'spoon', 'bowl']
+  Fruits: ['banana', 'apple', 'orange', 'broccoli', 'carrot', 'hotdog', 'pizza', 'donut', 'cake', 'grape', 'pineapple']
+  Animals: ['bird', 'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'teddybear']
+  Household: ['backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'vase', 'scissors', 'book', 'toothbrush']
+all_classes: ['pringles',]
+
+
+max_sizes:
+  - [0.4, 2.5, 0.5]
+
+subscribers:
+
+  queue_size: 1
+  exact_time: false
+  slop: 0.2
+
+  image_rgb: /butia_vision/bvb/image_rgb
+  camera_info: /butia_vision/bvb/camera_info
+  image_depth: /butia_vision/bvb/image_depth
+
+publishers:
+
+  object_recognition:
+    topic: /butia_vision/br/object_recognition
+    queue_size: 1
+
+servers:
+  set_class:
+    service: /butia_vision/br/object_recognition/set_class
+
+  list_classes:
+    service: /butia_vision/br/object_recognition/list_classes
+
+  start:
+    service: /butia_vision/br/object_recognition/start
+
+  stop:
+    service: /butia_vision/br/object_recognition/stop
diff --git a/butia_recognition/launch/paligemma_object_recognition.launch b/butia_recognition/launch/paligemma_object_recognition.launch
new file mode 100644
index 0000000..d4495ac
--- /dev/null
+++ b/butia_recognition/launch/paligemma_object_recognition.launch
@@ -0,0 +1,21 @@
+<?xml version="1.0"?>
+<launch>
+  <arg name="machine" default="localhost"/>
+  <arg name="use_machine" default="true"/>
+  <arg name="output" default="screen"/>
+
+  <machine name="localhost" address="localhost" if="$(arg use_machine)"/>
+
+  <node name="paligemma_object_recognition_node" pkg="butia_recognition" type="paligemma_recognition.py" machine="$(arg machine)" output="$(arg output)">
+    <rosparam command="load" file="$(find butia_recognition)/config/paligemma_object_recognition.yaml"/>
+    <rosparam param="colors">
+      [255,0,0]
+    </rosparam>
+  </node>
+</launch>
diff --git a/butia_recognition/scripts/butia_recognition/paligemma_recognition/__init__.py b/butia_recognition/scripts/butia_recognition/paligemma_recognition/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/butia_recognition/scripts/butia_recognition/paligemma_recognition/paligemma_recognition.py b/butia_recognition/scripts/butia_recognition/paligemma_recognition/paligemma_recognition.py
new file mode 100755
index 0000000..4aa12ed
--- /dev/null
+++ b/butia_recognition/scripts/butia_recognition/paligemma_recognition/paligemma_recognition.py
@@ -0,0 +1,174 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+import rospy
+import ros_numpy
+from butia_recognition import BaseRecognition, ifState
+import numpy as np
+import os
+from copy import copy
+import cv2
+from inference.models.paligemma import PaliGemma
+from inference.models.sam import SegmentAnything
+from std_msgs.msg import Header
+from sensor_msgs.msg import Image
+from geometry_msgs.msg import Vector3
+from butia_vision_msgs.msg import Description2D, Recognitions2D
+from butia_vision_msgs.srv import SetClass, SetClassRequest, SetClassResponse
+import torch
+import gc
+import PIL
+import supervision as sv
+
+
+class PaliGemmaRecognition(BaseRecognition):
+    def __init__(self, state=True):
+        super().__init__(state=state)
+
+        self.readParameters()
+
+        self.colors = dict([(k, np.random.randint(low=0, high=256, size=(3,)).tolist()) for k in self.classes])
+
+        self.loadModel()
+        self.initRosComm()
+
+    def initRosComm(self):
+        self.debug_publisher = rospy.Publisher(self.debug_topic, Image, queue_size=self.debug_qs)
+        self.object_recognition_publisher = rospy.Publisher(self.object_recognition_topic, Recognitions2D, queue_size=self.object_recognition_qs)
+        self.people_detection_publisher = rospy.Publisher(self.people_detection_topic, Recognitions2D, queue_size=self.people_detection_qs)
+        self.set_class_service_server = rospy.Service(self.set_class_service, SetClass, self.serverSetClass)
+        super().initRosComm(callbacks_obj=self)
+
+    def serverSetClass(self, req: SetClassRequest):
+        self.all_classes = [req.class_name,]
+        return SetClassResponse()
+
+    def serverStart(self, req):
+        self.loadModel()
+        return super().serverStart(req)
+
+    def serverStop(self, req):
+        self.unLoadModel()
+        return super().serverStop(req)
+
+    def loadModel(self):
+        self.model = PaliGemma(model_id='paligemma-3b-mix-224')
+        self.sam = SegmentAnything()
+        print('Done loading model!')
+
+    def unLoadModel(self):
+        del self.model
+        del self.sam
+        gc.collect()
+        torch.cuda.empty_cache()
+        self.model = None
+
+    @ifState
+    def callback(self, *args):
+        source_data = self.sourceDataFromArgs(args)
+
+        if 'image_rgb' not in source_data:
+            rospy.logwarn('Source data has no image_rgb.')
+            return None
+
+        img_rgb = source_data['image_rgb']
+        cv_img = ros_numpy.numpify(img_rgb)
+        rospy.loginfo('Image ID: ' + str(img_rgb.header.seq))
+
+        objects_recognition = Recognitions2D()
+        h = Header()
+        h.seq = self.seq # message id
+        self.seq += 1 # next message id
+        h.stamp = rospy.Time.now()
+
+        objects_recognition.header = h
+        objects_recognition = BaseRecognition.addSourceData2Recognitions2D(source_data, objects_recognition)
+        people_recognition = copy(objects_recognition)
+        description_header = img_rgb.header
+        description_header.seq = 0
+
+        results = self.model.predict(image_in=PIL.Image.fromarray(cv2.cvtColor(cv_img, cv2.COLOR_BGR2RGB)), prompt=f"detect " + " ; ".join(self.all_classes))[0]
+        boxes_ = sv.Detections.from_lmm(lmm='paligemma', result=results[0], resolution_wh=(cv_img.shape[1], cv_img.shape[0]), classes=self.all_classes)
+        debug_img = cv_img
+        masks = []
+        embeddings = self.sam.embed_image(image=cv_img)[0]
+        for x1, y1, x2, y2 in boxes_.xyxy:
+            center_x = (x1 + x2)//2
+            center_y = (y1 + y2)//2
+            masks.append(self.sam.segment_image(image=cv_img, embeddings=embeddings, point_labels=[1], point_coords=[[center_x, center_y]])[0])
+        boxes_.mask = np.array(masks)
+        if len(boxes_):
+            for i in range(len(boxes_)):
+                box = boxes_[i]
+                xyxy_box = list(boxes_[i].xyxy.astype(int)[0])
+
+                if int(box.class_id) >= len(self.all_classes):
+                    continue
+
+                label_class = self.all_classes[int(box.class_id)]
+
+
+                description = Description2D()
+                description.header = copy(description_header)
+                description.type = Description2D.DETECTION
+                description.id = description.header.seq
+                description.score = 1.0
+                description.max_size = Vector3(*[0.05, 0.05, 0.05])
+                size = int(xyxy_box[2] - xyxy_box[0]), int(xyxy_box[3] - xyxy_box[1])
+                description.bbox.center.x = int(xyxy_box[0]) + int(size[0]/2)
+                description.bbox.center.y = int(xyxy_box[1]) + int(size[1]/2)
+                description.bbox.size_x = size[0]
+                description.bbox.size_y = size[1]
+                description.mask = ros_numpy.msgify(Image, boxes_.mask[i])
+
+                if ('people' in self.all_classes and label_class in self.classes_by_category['people'] or 'people' in self.all_classes and label_class == 'people'):
+
+                    description.label = 'people' + '/' + label_class
+                    people_recognition.descriptions.append(description)
+
+                elif label_class in self.all_classes:
+                    index = None
+
+                    for value in self.classes_by_category.items():
+                        if label_class in value[1]:
+                            index = value[0]
+
+                    description.label = index + '/' + label_class if index is not None else label_class
+                    objects_recognition.descriptions.append(description)
+
+                debug_img = sv.BoundingBoxAnnotator().annotate(debug_img, boxes_)
+                description_header.seq += 1
+
+            self.debug_publisher.publish(ros_numpy.msgify(Image, debug_img, 'rgb8'))
+
+            if len(objects_recognition.descriptions) > 0:
+                self.object_recognition_publisher.publish(objects_recognition)
+
+            if len(people_recognition.descriptions) > 0:
+                self.people_detection_publisher.publish(people_recognition)
+        else:
+            debug_img = sv.BoundingBoxAnnotator().annotate(debug_img, boxes_)
+            self.debug_publisher.publish(ros_numpy.msgify(Image, debug_img, 'rgb8'))
+
+    def readParameters(self):
+        self.debug_topic = rospy.get_param("~publishers/debug/topic", "/butia_vision/br/debug")
+        self.debug_qs = rospy.get_param("~publishers/debug/queue_size", 1)
+
+        self.object_recognition_topic = rospy.get_param("~publishers/object_recognition/topic", "/butia_vision/br/object_recognition")
+        self.object_recognition_qs = rospy.get_param("~publishers/object_recognition/queue_size", 1)
+
+        self.people_detection_topic = rospy.get_param("~publishers/people_detection/topic", "/butia_vision/br/people_detection")
+        self.people_detection_qs = rospy.get_param("~publishers/people_detection/queue_size", 1)
+
+        self.set_class_service = rospy.get_param("~servers/set_class/service", "/butia_vision/br/object_recognition/set_class")
+
+        self.all_classes = list(rospy.get_param("~all_classes", []))
+        self.classes_by_category = dict(rospy.get_param("~classes_by_category", {}))
+
+        super().readParameters()
+
+if __name__ == '__main__':
+    rospy.init_node('paligemma_recognition_node', anonymous = True)
+
+    paligemma = PaliGemmaRecognition()
+
+    rospy.spin()

From a12f9c48249b539da672a264ac7363cc3fc4ed7d Mon Sep 17 00:00:00 2001
From: crislmfroes
Date: Wed, 26 Jun 2024 10:40:38 -0300
Subject: [PATCH 3/9] Plot masks and labels for PaliGemma.
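
This switches the debug overlay from plain boxes to masks plus labels. A
standalone sketch of the supervision calls being introduced (image, detections,
and classes are placeholders):

```python
import supervision as sv

# draw segmentation masks, then class labels, on a copy of the frame
annotated = sv.MaskAnnotator().annotate(scene=image.copy(), detections=detections)
annotated = sv.LabelAnnotator().annotate(
    scene=annotated,
    detections=detections,
    labels=[classes[class_id] for class_id in detections.class_id],
)
```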
--- .../paligemma_recognition/paligemma_recognition.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/butia_recognition/scripts/butia_recognition/paligemma_recognition/paligemma_recognition.py b/butia_recognition/scripts/butia_recognition/paligemma_recognition/paligemma_recognition.py index 4aa12ed..bf4b7a2 100755 --- a/butia_recognition/scripts/butia_recognition/paligemma_recognition/paligemma_recognition.py +++ b/butia_recognition/scripts/butia_recognition/paligemma_recognition/paligemma_recognition.py @@ -135,7 +135,8 @@ def callback(self, *args): description.label = index + '/' + label_class if index is not None else label_class objects_recognition.descriptions.append(description) - debug_img = sv.BoundingBoxAnnotator().annotate(debug_img, boxes_) + debug_img = sv.MaskAnnotator().annotate(debug_img, boxes_) + debug_img = sv.LabelAnnotator().annotate(debug_img, boxes_, [self.all_classes[idx] for idx in boxes_.class_id]) description_header.seq += 1 self.debug_publisher.publish(ros_numpy.msgify(Image, debug_img, 'rgb8')) @@ -146,7 +147,8 @@ def callback(self, *args): if len(people_recognition.descriptions) > 0: self.people_detection_publisher.publish(people_recognition) else: - debug_img = sv.BoundingBoxAnnotator().annotate(debug_img, boxes_) + debug_img = sv.MaskAnnotator().annotate(debug_img, boxes_) + debug_img = sv.LabelAnnotator().annotate(debug_img, boxes_, [self.all_classes[idx] for idx in boxes_.class_id]) self.debug_publisher.publish(ros_numpy.msgify(Image, debug_img, 'rgb8')) def readParameters(self): From cc66c2944aeda5dfa7cb8fb308851f8f8279ec05 Mon Sep 17 00:00:00 2001 From: crislmfroes Date: Wed, 26 Jun 2024 11:07:48 -0300 Subject: [PATCH 4/9] Add Visual Question Answering to PaliGemma. --- .../config/paligemma_object_recognition.yaml | 3 +++ .../paligemma_recognition/paligemma_recognition.py | 12 +++++++++++- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/butia_recognition/config/paligemma_object_recognition.yaml b/butia_recognition/config/paligemma_object_recognition.yaml index 082fc39..f575752 100644 --- a/butia_recognition/config/paligemma_object_recognition.yaml +++ b/butia_recognition/config/paligemma_object_recognition.yaml @@ -35,6 +35,9 @@ servers: set_class: service: /butia_vision/br/object_recognition/set_class + visual_question_answering: + service: /butia_vision/br/object_recognition/visual_question_answering + list_classes: service: /butia_vision/br/object_recognition/list_classes diff --git a/butia_recognition/scripts/butia_recognition/paligemma_recognition/paligemma_recognition.py b/butia_recognition/scripts/butia_recognition/paligemma_recognition/paligemma_recognition.py index bf4b7a2..85e9910 100755 --- a/butia_recognition/scripts/butia_recognition/paligemma_recognition/paligemma_recognition.py +++ b/butia_recognition/scripts/butia_recognition/paligemma_recognition/paligemma_recognition.py @@ -14,6 +14,7 @@ from geometry_msgs.msg import Vector3 from butia_vision_msgs.msg import Description2D, Recognitions2D from butia_vision_msgs.srv import SetClass, SetClassRequest, SetClassResponse +from butia_vision_msgs.srv import VisualQuestionAnswering, VisualQuestionAnsweringRequest, VisualQuestionAnsweringResponse import torch import gc import PIL @@ -36,12 +37,19 @@ def initRosComm(self): self.object_recognition_publisher = rospy.Publisher(self.object_recognition_topic, Recognitions2D, queue_size=self.object_recognition_qs) self.people_detection_publisher = rospy.Publisher(self.people_detection_topic, Recognitions2D, 
queue_size=self.people_detection_qs)
         self.set_class_service_server = rospy.Service(self.set_class_service, SetClass, self.serverSetClass)
+        self.visual_question_answering_service_server = rospy.Service(self.visual_question_answering_service, VisualQuestionAnswering, self.serverVisualQuestionAnswering)
         super().initRosComm(callbacks_obj=self)
 
-    def serverSetClass(self, req: SetClassRequest):
+    def serverSetClass(self, req):
         self.all_classes = [req.class_name,]
         return SetClassResponse()
 
+    def serverVisualQuestionAnswering(self, req):
+        result = self.model.predict(image_in=self.cv_img, prompt=req.question)
+        res = VisualQuestionAnsweringResponse()
+        res.answer = result[0]
+        return res
+
     def serverStart(self, req):
         self.loadModel()
         return super().serverStart(req)
@@ -72,6 +80,7 @@ def callback(self, *args):
 
         img_rgb = source_data['image_rgb']
         cv_img = ros_numpy.numpify(img_rgb)
+        self.cv_img = cv_img
         rospy.loginfo('Image ID: ' + str(img_rgb.header.seq))
 
@@ -162,6 +171,7 @@ def readParameters(self):
         self.people_detection_qs = rospy.get_param("~publishers/people_detection/queue_size", 1)
 
         self.set_class_service = rospy.get_param("~servers/set_class/service", "/butia_vision/br/object_recognition/set_class")
+        self.visual_question_answering_service = rospy.get_param("~servers/visual_question_answering/service", "/butia_vision/br/object_recognition/visual_question_answering")
 
         self.all_classes = list(rospy.get_param("~all_classes", []))
         self.classes_by_category = dict(rospy.get_param("~classes_by_category", {}))

From 00b328607c13d9f910c4adeeb59e28af62822751 Mon Sep 17 00:00:00 2001
From: crislmfroes
Date: Mon, 1 Jul 2024 23:17:06 -0300
Subject: [PATCH 5/9] Add README.md to explain how to install paligemma_recognition.
---
 .../butia_recognition/paligemma_recognition/README.md | 7 +++++++
 1 file changed, 7 insertions(+)
 create mode 100644 butia_recognition/scripts/butia_recognition/paligemma_recognition/README.md

diff --git a/butia_recognition/scripts/butia_recognition/paligemma_recognition/README.md b/butia_recognition/scripts/butia_recognition/paligemma_recognition/README.md
new file mode 100644
index 0000000..e8a8596
--- /dev/null
+++ b/butia_recognition/scripts/butia_recognition/paligemma_recognition/README.md
@@ -0,0 +1,7 @@
+# Install
+
+Run the following commands on the jetson, and make sure the pre-installed versions of pytorch, numpy, and the other libraries from the JetPack SDK are kept frozen and not updated.
+
+```sh
+pip install inference supervision
+```
\ No newline at end of file

From 8a2f8add047c8fa49394762845328b2b2e6aed5b Mon Sep 17 00:00:00 2001
From: crislmfroes
Date: Mon, 1 Jul 2024 23:22:34 -0300
Subject: [PATCH 6/9] Update readme
---
 .../scripts/butia_recognition/paligemma_recognition/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/butia_recognition/scripts/butia_recognition/paligemma_recognition/README.md b/butia_recognition/scripts/butia_recognition/paligemma_recognition/README.md
index e8a8596..c836d34 100644
--- a/butia_recognition/scripts/butia_recognition/paligemma_recognition/README.md
+++ b/butia_recognition/scripts/butia_recognition/paligemma_recognition/README.md
@@ -1,6 +1,6 @@
 # Install
 
-Run the following commands on the jetson, and make sure the pre-installed versions of pytorch, numpy, and the other libraries from the JetPack SDK are kept frozen and not updated.
+Install [this branch](https://github.com/butia-bots/butia_vision_msgs/tree/feature/gpsr-recognition) of butia_vision_msgs.
Then run the following commands on the jetson, and make sure the pre-installed versions of pytorch, numpy, and the other libraries from the JetPack SDK are kept frozen and not updated during the install process.
 
 ```sh
 pip install inference supervision
 ```
\ No newline at end of file

From 06ac07bd6d9a4156be4b2e1b2b2e745bb59766e6 Mon Sep 17 00:00:00 2001
From: crislmfroes
Date: Thu, 4 Jul 2024 21:25:27 -0300
Subject: [PATCH 7/9] Update readme
---
 .../scripts/butia_recognition/paligemma_recognition/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/butia_recognition/scripts/butia_recognition/paligemma_recognition/README.md b/butia_recognition/scripts/butia_recognition/paligemma_recognition/README.md
index c836d34..92ce27f 100644
--- a/butia_recognition/scripts/butia_recognition/paligemma_recognition/README.md
+++ b/butia_recognition/scripts/butia_recognition/paligemma_recognition/README.md
@@ -3,5 +3,5 @@
 Install [this branch](https://github.com/butia-bots/butia_vision_msgs/tree/feature/gpsr-recognition) of butia_vision_msgs. Then run the following commands on the jetson, and make sure the pre-installed versions of pytorch, numpy, and the other libraries from the JetPack SDK are kept frozen and not updated during the install process.
 
 ```sh
-pip install inference supervision
+pip install inference supervision transformers accelerate peft
 ```
\ No newline at end of file

From f41edac96cd748c6ce338b3f119b4850af6a022e Mon Sep 17 00:00:00 2001
From: Igor Maurell
Date: Sat, 13 Jul 2024 21:36:31 -0300
Subject: [PATCH 8/9] Change VLM library from roboflow inference to huggingface transformers.
---
 .../paligemma_recognition.py | 48 ++++++++++++------
 1 file changed, 32 insertions(+), 16 deletions(-)

diff --git a/butia_recognition/scripts/butia_recognition/paligemma_recognition/paligemma_recognition.py b/butia_recognition/scripts/butia_recognition/paligemma_recognition/paligemma_recognition.py
index 85e9910..c7837f9 100755
--- a/butia_recognition/scripts/butia_recognition/paligemma_recognition/paligemma_recognition.py
+++ b/butia_recognition/scripts/butia_recognition/paligemma_recognition/paligemma_recognition.py
@@ -7,8 +7,8 @@
 import rospy
 import ros_numpy
 from butia_recognition import BaseRecognition, ifState
 import numpy as np
 import os
 from copy import copy
 import cv2
-from inference.models.paligemma import PaliGemma
-from inference.models.sam import SegmentAnything
+from transformers import PaliGemmaForConditionalGeneration, PaliGemmaProcessor
+from transformers import SamModel, SamProcessor
 from std_msgs.msg import Header
 from sensor_msgs.msg import Image
 from geometry_msgs.msg import Vector3
@@ -29,6 +29,8 @@ def __init__(self, state=True):
 
         self.readParameters()
 
         self.colors = dict([(k, np.random.randint(low=0, high=256, size=(3,)).tolist()) for k in self.classes])
 
+        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+
         self.loadModel()
         self.initRosComm()
 
@@ -45,9 +47,9 @@ def serverSetClass(self, req):
         self.all_classes = [req.class_name,]
         return SetClassResponse()
 
     def serverVisualQuestionAnswering(self, req):
-        result = self.model.predict(image_in=self.cv_img, prompt=req.question)
+        result = self.inferPaliGemma(image=PIL.Image.fromarray(cv2.cvtColor(self.cv_img, cv2.COLOR_BGR2RGB)), prompt=req.question)
         res = VisualQuestionAnsweringResponse()
-        res.answer = result[0]
+        res.answer = result
         return res
 
@@ -59,16 +61,33 @@ def serverStop(self, req):
         self.unLoadModel()
         return super().serverStop(req)
 
     def loadModel(self):
-        self.model = PaliGemma(model_id='paligemma-3b-mix-224')
-        self.sam = SegmentAnything()
+        self.pg = PaliGemmaForConditionalGeneration.from_pretrained('google/paligemma-3b-mix-224').to(self.device)
+        self.pg_processor = 
PaliGemmaProcessor.from_pretrained('google/paligemma-3b-mix-224') + self.sam = SamModel.from_pretrained('facebook/sam-vit-base').to(self.device) + self.sam_processor = SamProcessor.from_pretrained('facebook/sam-vit-base') print('Done loading model!') def unLoadModel(self): - del self.model + del self.pg del self.sam gc.collect() torch.cuda.empty_cache() - self.model = None + self.pg = None + self.sam = None + + def inferPaliGemma(self, image, prompt): + inputs = self.pg_processor(text=prompt, images=image, return_tensors="pt").to(self.device) + with torch.inference_mode(): + outputs = self.pg.generate(**inputs, max_new_tokens=50, do_sample=False) + result = self.pg_processor.batch_decode(outputs, skip_special_tokens=True) + return result[0][len(prompt):].lstrip('\n') + + def inferSam(self, image, input_boxes): + inputs = self.sam_processor(images=image, input_boxes=input_boxes, return_tensors="pt").to(self.device) + with torch.inference_mode(): + outputs = self.sam(**inputs) + masks = self.sam_processor.image_processor.post_process_masks(outputs.pred_masks.cpu(), inputs["original_sizes"].cpu(), inputs["reshaped_input_sizes"].cpu()) + return masks[0].detach().cpu().numpy() @ifState def callback(self, *args): @@ -95,17 +114,14 @@ def callback(self, *args): description_header = img_rgb.header description_header.seq = 0 - results = self.model.predict(image_in=PIL.Image.fromarray(cv2.cvtColor(cv_img, cv2.COLOR_BGR2RGB)), prompt=f"detect " + " ; ".join(self.all_classes))[0] - boxes_ = sv.Detections.from_lmm(lmm='paligemma', result=results[0], resolution_wh=(cv_img.shape[1], cv_img.shape[0]), classes=self.all_classes) + results = self.inferPaliGemma(image=PIL.Image.fromarray(cv2.cvtColor(cv_img, cv2.COLOR_BGR2RGB)), prompt=f"detect " + " ; ".join(self.all_classes)) + boxes_ = sv.Detections.from_lmm(lmm='paligemma', result=results, resolution_wh=(cv_img.shape[1], cv_img.shape[0]), classes=self.all_classes) debug_img = cv_img masks = [] - embeddings = self.sam.embed_image(image=cv_img)[0] for x1, y1, x2, y2 in boxes_.xyxy: - center_x = (x1 + x2)//2 - center_y = (y1 + y2)//2 - masks.append(self.sam.segment_image(image=cv_img, embeddings=embeddings, point_labels=[1], point_coords=[[center_x, center_y]])[0]) - boxes_.mask = np.array(masks) + masks.append(self.inferSam(image=PIL.Image.fromarray(cv2.cvtColor(cv_img, cv2.COLOR_BGR2RGB)), input_boxes=[[[x1, y1, x2, y2]]])[:,0,:,:]) if len(boxes_): + boxes_.mask = np.array(masks).reshape((len(masks), cv_img.shape[0], cv_img.shape[1])) for i in range(len(boxes_)): box = boxes_[i] xyxy_box = list(boxes_[i].xyxy.astype(int)[0]) @@ -127,7 +143,7 @@ def callback(self, *args): description.bbox.center.y = int(xyxy_box[1]) + int(size[1]/2) description.bbox.size_x = size[0] description.bbox.size_y = size[1] - description.mask = ros_numpy.msgify(Image, boxes_.mask[i]) + description.mask = ros_numpy.msgify(Image, (boxes_.mask[i]*255).astype(np.uint8), encoding='mono8') if ('people' in self.all_classes and label_class in self.classes_by_category['people'] or 'people' in self.all_classes and label_class == 'people'): From eab80e04440d55c9eb3465867941a7ee981a6372 Mon Sep 17 00:00:00 2001 From: Igor Maurell Date: Sat, 13 Jul 2024 21:38:51 -0300 Subject: [PATCH 9/9] Update VLM install instructions. 
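
The pip line below drops roboflow inference and adds bitsandbytes. A quick,
illustrative sanity check that the JetPack-provided stacks survive the install:

```python
# verify the pinned JetPack builds are still the ones being imported
import numpy, torch, transformers
print('torch', torch.__version__, 'CUDA available:', torch.cuda.is_available())
print('numpy', numpy.__version__)
print('transformers', transformers.__version__)
```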
---
 .../scripts/butia_recognition/paligemma_recognition/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/butia_recognition/scripts/butia_recognition/paligemma_recognition/README.md b/butia_recognition/scripts/butia_recognition/paligemma_recognition/README.md
index 92ce27f..73c7ca7 100644
--- a/butia_recognition/scripts/butia_recognition/paligemma_recognition/README.md
+++ b/butia_recognition/scripts/butia_recognition/paligemma_recognition/README.md
@@ -3,5 +3,5 @@
 Install [this branch](https://github.com/butia-bots/butia_vision_msgs/tree/feature/gpsr-recognition) of butia_vision_msgs. Then run the following commands on the jetson, and make sure the pre-installed versions of pytorch, numpy, and the other libraries from the JetPack SDK are kept frozen and not updated during the install process.
 
 ```sh
-pip install inference supervision transformers accelerate peft
+pip install transformers accelerate peft bitsandbytes supervision
 ```
\ No newline at end of file
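
For reference, a minimal client for the visual question answering service added
in this series (hypothetical node name; the service path is the package default
from paligemma_object_recognition.yaml):

```python
#!/usr/bin/env python3
import rospy
from butia_vision_msgs.srv import VisualQuestionAnswering, VisualQuestionAnsweringRequest

rospy.init_node('vqa_client_example', anonymous=True)
service_name = '/butia_vision/br/object_recognition/visual_question_answering'
rospy.wait_for_service(service_name)
vqa = rospy.ServiceProxy(service_name, VisualQuestionAnswering)

# ask a question about the most recent camera frame
request = VisualQuestionAnsweringRequest(question='What objects are on the table?')
print(vqa(request).answer)
```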