diff --git a/butia_recognition/config/paligemma_object_recognition.yaml b/butia_recognition/config/paligemma_object_recognition.yaml
new file mode 100644
index 0000000..f575752
--- /dev/null
+++ b/butia_recognition/config/paligemma_object_recognition.yaml
@@ -0,0 +1,48 @@
+threshold: 0.3
+classes_by_category:
+ Transportation: ['bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat']
+ Traffic: ['traffic light', 'fire hydrant', 'stop sign', 'parking meter']
+ Furniture: ['bench', 'chair', 'couch', 'plant', 'bed', 'table', 'toilet']
+ Electronics: ['tv', 'laptop', 'mouse', 'remote', 'keyboard', 'phone', 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'clock', 'drier']
+ Sports: ['frisbee', 'skis', 'snowboard', 'ball', 'kite', 'bat', 'glove', 'skateboard', 'surfboard', 'racket']
+ Utensils: ['bottle', 'glass', 'cup', 'fork', 'knife', 'spoon', 'bowl']
+ Fruits: ['banana', 'apple', 'orange', 'broccoli', 'carrot', 'hotdog', 'pizza', 'donut', 'cake', 'grape', 'pineapple']
+ Animals: ['bird', 'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'teddybear']
+ Household: ['backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'vase', 'scissors', 'book', 'toothbrush']
+all_classes: ['pringles',]
+
+
+max_sizes:
+ - [0.4, 2.5, 0.5]
+
+subscribers:
+
+ queue_size: 1
+ exact_time: false
+ slop: 0.2
+
+ image_rgb: /butia_vision/bvb/image_rgb
+ camera_info: /butia_vision/bvb/camera_info
+ image_depth: /butia_vision/bvb/image_depth
+
+publishers:
+
+ object_recognition:
+ topic: /butia_vision/br/object_recognition
+ queue_size: 1
+
+servers:
+ set_class:
+ service: /butia_vision/br/object_recognition/set_class
+
+ visual_question_answering:
+ service: /butia_vision/br/object_recognition/visual_question_answering
+
+ list_classes:
+ service: /butia_vision/br/object_recognition/list_classes
+
+ start:
+ service: /butia_vision/br/object_recognition/start
+
+ stop:
+ service: /butia_vision/br/object_recognition/stop
diff --git a/butia_recognition/launch/paligemma_object_recognition.launch b/butia_recognition/launch/paligemma_object_recognition.launch
new file mode 100644
index 0000000..d4495ac
--- /dev/null
+++ b/butia_recognition/launch/paligemma_object_recognition.launch
@@ -0,0 +1,21 @@
+<launch>
+  <node name="paligemma_object_recognition" pkg="butia_recognition" type="paligemma_recognition.py" output="screen">
+    <rosparam command="load" file="$(find butia_recognition)/config/paligemma_object_recognition.yaml"/>
+    <rosparam param="publishers/debug/color">[255,0,0]</rosparam>
+  </node>
+</launch>
diff --git a/butia_recognition/scripts/butia_recognition/paligemma_recognition/README.md b/butia_recognition/scripts/butia_recognition/paligemma_recognition/README.md
new file mode 100644
index 0000000..73c7ca7
--- /dev/null
+++ b/butia_recognition/scripts/butia_recognition/paligemma_recognition/README.md
@@ -0,0 +1,7 @@
+# Install
+
+Install [this branch](https://github.com/butia-bots/butia_vision_msgs/tree/feature/gpsr-recognition) of butia_vision_msgs. Then run the following command on the Jetson, making sure the versions of PyTorch, NumPy, and the other libraries pre-installed by the JetPack SDK stay frozen and are not upgraded during the install:
+
+```sh
+pip install transformers accelerate peft bitsandbytes supervision
+```
\ No newline at end of file
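After installing, a quick sanity check (a sketch; exact versions depend on the JetPack release) is to import the stack and confirm the JetPack-provided PyTorch build still sees CUDA:

```python
# Sanity check after `pip install`: the JetPack wheels for torch/numpy should
# still be in place and CUDA should still be visible to PyTorch.
import numpy, torch, transformers, supervision

print('numpy', numpy.__version__)
print('torch', torch.__version__, '| CUDA available:', torch.cuda.is_available())
print('transformers', transformers.__version__)
print('supervision', supervision.__version__)
```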
diff --git a/butia_recognition/scripts/butia_recognition/paligemma_recognition/__init__.py b/butia_recognition/scripts/butia_recognition/paligemma_recognition/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/butia_recognition/scripts/butia_recognition/paligemma_recognition/paligemma_recognition.py b/butia_recognition/scripts/butia_recognition/paligemma_recognition/paligemma_recognition.py
new file mode 100755
index 0000000..c7837f9
--- /dev/null
+++ b/butia_recognition/scripts/butia_recognition/paligemma_recognition/paligemma_recognition.py
@@ -0,0 +1,202 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+import rospy
+import ros_numpy
+from butia_recognition import BaseRecognition, ifState
+import numpy as np
+import os
+from copy import copy
+import cv2
+from transformers import PaliGemmaForConditionalGeneration, PaliGemmaProcessor
+from transformers import SamModel, SamProcessor
+from std_msgs.msg import Header
+from sensor_msgs.msg import Image
+from geometry_msgs.msg import Vector3
+from butia_vision_msgs.msg import Description2D, Recognitions2D
+from butia_vision_msgs.srv import SetClass, SetClassRequest, SetClassResponse
+from butia_vision_msgs.srv import VisualQuestionAnswering, VisualQuestionAnsweringRequest, VisualQuestionAnsweringResponse
+import torch
+import gc
+import PIL.Image
+import supervision as sv
+
+
+class PaliGemmaRecognition(BaseRecognition):
+ def __init__(self, state=True):
+ super().__init__(state=state)
+
+ self.readParameters()
+
+ self.colors = dict([(k, np.random.randint(low=0, high=256, size=(3,)).tolist()) for k in self.classes])
+
+ self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+
+ self.loadModel()
+ self.initRosComm()
+
+ def initRosComm(self):
+ self.debug_publisher = rospy.Publisher(self.debug_topic, Image, queue_size=self.debug_qs)
+ self.object_recognition_publisher = rospy.Publisher(self.object_recognition_topic, Recognitions2D, queue_size=self.object_recognition_qs)
+ self.people_detection_publisher = rospy.Publisher(self.people_detection_topic, Recognitions2D, queue_size=self.people_detection_qs)
+ self.set_class_service_server = rospy.Service(self.set_class_service, SetClass, self.serverSetClass)
+ self.visual_question_answering_service_server = rospy.Service(self.visual_question_answering_service, VisualQuestionAnswering, self.serverVisualQuestionAnswering)
+ super().initRosComm(callbacks_obj=self)
+
+ def serverSetClass(self, req):
+ self.all_classes = [req.class_name,]
+ return SetClassResponse()
+
+ def serverVisualQuestionAnswering(self, req):
+ result = self.inferPaliGemma(image=PIL.Image.fromarray(cv2.cvtColor(self.cv_img, cv2.COLOR_BGR2RGB)), prompt=req.question)
+ res = VisualQuestionAnsweringResponse()
+ res.answer = result
+ return res
+
+ def serverStart(self, req):
+ self.loadModel()
+ return super().serverStart(req)
+
+ def serverStop(self, req):
+ self.unLoadModel()
+ return super().serverStop(req)
+
+ def loadModel(self):
+ self.pg = PaliGemmaForConditionalGeneration.from_pretrained('google/paligemma-3b-mix-224').to(self.device)
+ self.pg_processor = PaliGemmaProcessor.from_pretrained('google/paligemma-3b-mix-224')
+ self.sam = SamModel.from_pretrained('facebook/sam-vit-base').to(self.device)
+ self.sam_processor = SamProcessor.from_pretrained('facebook/sam-vit-base')
+ print('Done loading model!')
+
+ def unLoadModel(self):
+ del self.pg
+ del self.sam
+ gc.collect()
+ torch.cuda.empty_cache()
+ self.pg = None
+ self.sam = None
+
+ def inferPaliGemma(self, image, prompt):
+ inputs = self.pg_processor(text=prompt, images=image, return_tensors="pt").to(self.device)
+ with torch.inference_mode():
+ outputs = self.pg.generate(**inputs, max_new_tokens=50, do_sample=False)
+ result = self.pg_processor.batch_decode(outputs, skip_special_tokens=True)
+ return result[0][len(prompt):].lstrip('\n')
+
+ def inferSam(self, image, input_boxes):
+ inputs = self.sam_processor(images=image, input_boxes=input_boxes, return_tensors="pt").to(self.device)
+ with torch.inference_mode():
+ outputs = self.sam(**inputs)
+ masks = self.sam_processor.image_processor.post_process_masks(outputs.pred_masks.cpu(), inputs["original_sizes"].cpu(), inputs["reshaped_input_sizes"].cpu())
+ return masks[0].detach().cpu().numpy()
+
+ @ifState
+ def callback(self, *args):
+ source_data = self.sourceDataFromArgs(args)
+
+ if 'image_rgb' not in source_data:
+            rospy.logwarn('Source data has no image_rgb.')
+ return None
+
+ img_rgb = source_data['image_rgb']
+ cv_img = ros_numpy.numpify(img_rgb)
+ self.cv_img = cv_img
+ rospy.loginfo('Image ID: ' + str(img_rgb.header.seq))
+
+ objects_recognition = Recognitions2D()
+ h = Header()
+        h.seq = self.seq  # message id
+        self.seq += 1  # next id
+ h.stamp = rospy.Time.now()
+
+ objects_recognition.header = h
+ objects_recognition = BaseRecognition.addSourceData2Recognitions2D(source_data, objects_recognition)
+ people_recognition = copy(objects_recognition)
+ description_header = img_rgb.header
+ description_header.seq = 0
+
+ results = self.inferPaliGemma(image=PIL.Image.fromarray(cv2.cvtColor(cv_img, cv2.COLOR_BGR2RGB)), prompt=f"detect " + " ; ".join(self.all_classes))
+ boxes_ = sv.Detections.from_lmm(lmm='paligemma', result=results, resolution_wh=(cv_img.shape[1], cv_img.shape[0]), classes=self.all_classes)
+ debug_img = cv_img
+ masks = []
+ for x1, y1, x2, y2 in boxes_.xyxy:
+ masks.append(self.inferSam(image=PIL.Image.fromarray(cv2.cvtColor(cv_img, cv2.COLOR_BGR2RGB)), input_boxes=[[[x1, y1, x2, y2]]])[:,0,:,:])
+ if len(boxes_):
+ boxes_.mask = np.array(masks).reshape((len(masks), cv_img.shape[0], cv_img.shape[1]))
+ for i in range(len(boxes_)):
+ box = boxes_[i]
+ xyxy_box = list(boxes_[i].xyxy.astype(int)[0])
+
+ if int(box.class_id) >= len(self.all_classes):
+ continue
+
+ label_class = self.all_classes[int(box.class_id)]
+
+
+ description = Description2D()
+ description.header = copy(description_header)
+ description.type = Description2D.DETECTION
+ description.id = description.header.seq
+ description.score = 1.0
+ description.max_size = Vector3(*[0.05, 0.05, 0.05])
+ size = int(xyxy_box[2] - xyxy_box[0]), int(xyxy_box[3] - xyxy_box[1])
+ description.bbox.center.x = int(xyxy_box[0]) + int(size[0]/2)
+ description.bbox.center.y = int(xyxy_box[1]) + int(size[1]/2)
+ description.bbox.size_x = size[0]
+ description.bbox.size_y = size[1]
+ description.mask = ros_numpy.msgify(Image, (boxes_.mask[i]*255).astype(np.uint8), encoding='mono8')
+
+                if 'people' in self.all_classes and (label_class in self.classes_by_category.get('people', []) or label_class == 'people'):
+
+ description.label = 'people' + '/' + label_class
+ people_recognition.descriptions.append(description)
+
+                elif label_class in [val for sublist in self.classes_by_category.values() for val in sublist] or label_class in self.all_classes:
+ index = None
+
+                    for category, class_list in self.classes_by_category.items():
+                        if label_class in class_list:
+                            index = category
+
+ description.label = index + '/' + label_class if index is not None else label_class
+ objects_recognition.descriptions.append(description)
+
+ debug_img = sv.MaskAnnotator().annotate(debug_img, boxes_)
+ debug_img = sv.LabelAnnotator().annotate(debug_img, boxes_, [self.all_classes[idx] for idx in boxes_.class_id])
+ description_header.seq += 1
+
+ self.debug_publisher.publish(ros_numpy.msgify(Image, debug_img, 'rgb8'))
+
+ if len(objects_recognition.descriptions) > 0:
+ self.object_recognition_publisher.publish(objects_recognition)
+
+ if len(people_recognition.descriptions) > 0:
+ self.people_detection_publisher.publish(people_recognition)
+ else:
+ debug_img = sv.MaskAnnotator().annotate(debug_img, boxes_)
+ debug_img = sv.LabelAnnotator().annotate(debug_img, boxes_, [self.all_classes[idx] for idx in boxes_.class_id])
+ self.debug_publisher.publish(ros_numpy.msgify(Image, debug_img, 'rgb8'))
+
+ def readParameters(self):
+ self.debug_topic = rospy.get_param("~publishers/debug/topic", "/butia_vision/br/debug")
+ self.debug_qs = rospy.get_param("~publishers/debug/queue_size", 1)
+
+ self.object_recognition_topic = rospy.get_param("~publishers/object_recognition/topic", "/butia_vision/br/object_recognition")
+ self.object_recognition_qs = rospy.get_param("~publishers/object_recognition/queue_size", 1)
+
+ self.people_detection_topic = rospy.get_param("~publishers/people_detection/topic", "/butia_vision/br/people_detection")
+ self.people_detection_qs = rospy.get_param("~publishers/people_detection/queue_size", 1)
+
+ self.set_class_service = rospy.get_param("~servers/set_class/service", "/butia_vision/br/object_recognition/set_class")
+ self.visual_question_answering_service = rospy.get_param("~servers/visual_question_answering/service", "/butia_vision/br/object_recognition/visual_question_answering")
+
+ self.all_classes = list(rospy.get_param("~all_classes", []))
+ self.classes_by_category = dict(rospy.get_param("~classes_by_category", {}))
+
+ super().readParameters()
+
+if __name__ == '__main__':
+ rospy.init_node('paligemma_recognition_node', anonymous = True)
+
+ paligemma = PaliGemmaRecognition()
+
+ rospy.spin()
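As a usage sketch (not part of the node), the `visual_question_answering` server exposed above can be queried like this, assuming the service path from `paligemma_object_recognition.yaml` and the `question`/`answer` fields of `VisualQuestionAnswering` used in the node:

```python
#!/usr/bin/env python3
# Sketch of a VQA client for the PaliGemma node; assumes the service path from
# paligemma_object_recognition.yaml and the question/answer srv fields used above.
import rospy
from butia_vision_msgs.srv import VisualQuestionAnswering, VisualQuestionAnsweringRequest

rospy.init_node('vqa_client', anonymous=True)
rospy.wait_for_service('/butia_vision/br/object_recognition/visual_question_answering')
vqa = rospy.ServiceProxy('/butia_vision/br/object_recognition/visual_question_answering', VisualQuestionAnswering)
response = vqa(VisualQuestionAnsweringRequest(question='what objects are on the table?'))
print(response.answer)
```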
diff --git a/butia_vision_language_model/CMakeLists.txt b/butia_vision_language_model/CMakeLists.txt
new file mode 100644
index 0000000..9396249
--- /dev/null
+++ b/butia_vision_language_model/CMakeLists.txt
@@ -0,0 +1,202 @@
+cmake_minimum_required(VERSION 3.0.2)
+project(butia_vision_language_model)
+
+## Compile as C++11, supported in ROS Kinetic and newer
+# add_compile_options(-std=c++11)
+
+## Find catkin macros and libraries
+## if COMPONENTS list like find_package(catkin REQUIRED COMPONENTS xyz)
+## is used, also find other catkin packages
+find_package(catkin REQUIRED)
+
+## System dependencies are found with CMake's conventions
+# find_package(Boost REQUIRED COMPONENTS system)
+
+
+## Uncomment this if the package has a setup.py. This macro ensures
+## modules and global scripts declared therein get installed
+## See http://ros.org/doc/api/catkin/html/user_guide/setup_dot_py.html
+# catkin_python_setup()
+
+################################################
+## Declare ROS messages, services and actions ##
+################################################
+
+## To declare and build messages, services or actions from within this
+## package, follow these steps:
+## * Let MSG_DEP_SET be the set of packages whose message types you use in
+## your messages/services/actions (e.g. std_msgs, actionlib_msgs, ...).
+## * In the file package.xml:
+## * add a build_depend tag for "message_generation"
+## * add a build_depend and a exec_depend tag for each package in MSG_DEP_SET
+## * If MSG_DEP_SET isn't empty the following dependency has been pulled in
+## but can be declared for certainty nonetheless:
+## * add a exec_depend tag for "message_runtime"
+## * In this file (CMakeLists.txt):
+## * add "message_generation" and every package in MSG_DEP_SET to
+## find_package(catkin REQUIRED COMPONENTS ...)
+## * add "message_runtime" and every package in MSG_DEP_SET to
+## catkin_package(CATKIN_DEPENDS ...)
+## * uncomment the add_*_files sections below as needed
+## and list every .msg/.srv/.action file to be processed
+## * uncomment the generate_messages entry below
+## * add every package in MSG_DEP_SET to generate_messages(DEPENDENCIES ...)
+
+## Generate messages in the 'msg' folder
+# add_message_files(
+# FILES
+# Message1.msg
+# Message2.msg
+# )
+
+## Generate services in the 'srv' folder
+# add_service_files(
+# FILES
+# Service1.srv
+# Service2.srv
+# )
+
+## Generate actions in the 'action' folder
+# add_action_files(
+# FILES
+# Action1.action
+# Action2.action
+# )
+
+## Generate added messages and services with any dependencies listed here
+# generate_messages(
+# DEPENDENCIES
+# std_msgs # Or other packages containing msgs
+# )
+
+################################################
+## Declare ROS dynamic reconfigure parameters ##
+################################################
+
+## To declare and build dynamic reconfigure parameters within this
+## package, follow these steps:
+## * In the file package.xml:
+## * add a build_depend and a exec_depend tag for "dynamic_reconfigure"
+## * In this file (CMakeLists.txt):
+## * add "dynamic_reconfigure" to
+## find_package(catkin REQUIRED COMPONENTS ...)
+## * uncomment the "generate_dynamic_reconfigure_options" section below
+## and list every .cfg file to be processed
+
+## Generate dynamic reconfigure parameters in the 'cfg' folder
+# generate_dynamic_reconfigure_options(
+# cfg/DynReconf1.cfg
+# cfg/DynReconf2.cfg
+# )
+
+###################################
+## catkin specific configuration ##
+###################################
+## The catkin_package macro generates cmake config files for your package
+## Declare things to be passed to dependent projects
+## INCLUDE_DIRS: uncomment this if your package contains header files
+## LIBRARIES: libraries you create in this project that dependent projects also need
+## CATKIN_DEPENDS: catkin_packages dependent projects also need
+## DEPENDS: system dependencies of this project that dependent projects also need
+catkin_package(
+# INCLUDE_DIRS include
+# LIBRARIES butia_vision_language_model
+# CATKIN_DEPENDS other_catkin_pkg
+# DEPENDS system_lib
+)
+
+###########
+## Build ##
+###########
+
+## Specify additional locations of header files
+## Your package locations should be listed before other locations
+include_directories(
+# include
+# ${catkin_INCLUDE_DIRS}
+)
+
+## Declare a C++ library
+# add_library(${PROJECT_NAME}
+# src/${PROJECT_NAME}/butia_vision_language_model.cpp
+# )
+
+## Add cmake target dependencies of the library
+## as an example, code may need to be generated before libraries
+## either from message generation or dynamic reconfigure
+# add_dependencies(${PROJECT_NAME} ${${PROJECT_NAME}_EXPORTED_TARGETS} ${catkin_EXPORTED_TARGETS})
+
+## Declare a C++ executable
+## With catkin_make all packages are built within a single CMake context
+## The recommended prefix ensures that target names across packages don't collide
+# add_executable(${PROJECT_NAME}_node src/butia_vision_language_model_node.cpp)
+
+## Rename C++ executable without prefix
+## The above recommended prefix causes long target names, the following renames the
+## target back to the shorter version for ease of user use
+## e.g. "rosrun someones_pkg node" instead of "rosrun someones_pkg someones_pkg_node"
+# set_target_properties(${PROJECT_NAME}_node PROPERTIES OUTPUT_NAME node PREFIX "")
+
+## Add cmake target dependencies of the executable
+## same as for the library above
+# add_dependencies(${PROJECT_NAME}_node ${${PROJECT_NAME}_EXPORTED_TARGETS} ${catkin_EXPORTED_TARGETS})
+
+## Specify libraries to link a library or executable target against
+# target_link_libraries(${PROJECT_NAME}_node
+# ${catkin_LIBRARIES}
+# )
+
+#############
+## Install ##
+#############
+
+# all install targets should use catkin DESTINATION variables
+# See http://ros.org/doc/api/catkin/html/adv_user_guide/variables.html
+
+## Mark executable scripts (Python etc.) for installation
+## in contrast to setup.py, you can choose the destination
+# catkin_install_python(PROGRAMS
+# scripts/my_python_script
+# DESTINATION ${CATKIN_PACKAGE_BIN_DESTINATION}
+# )
+
+## Mark executables for installation
+## See http://docs.ros.org/melodic/api/catkin/html/howto/format1/building_executables.html
+# install(TARGETS ${PROJECT_NAME}_node
+# RUNTIME DESTINATION ${CATKIN_PACKAGE_BIN_DESTINATION}
+# )
+
+## Mark libraries for installation
+## See http://docs.ros.org/melodic/api/catkin/html/howto/format1/building_libraries.html
+# install(TARGETS ${PROJECT_NAME}
+# ARCHIVE DESTINATION ${CATKIN_PACKAGE_LIB_DESTINATION}
+# LIBRARY DESTINATION ${CATKIN_PACKAGE_LIB_DESTINATION}
+# RUNTIME DESTINATION ${CATKIN_GLOBAL_BIN_DESTINATION}
+# )
+
+## Mark cpp header files for installation
+# install(DIRECTORY include/${PROJECT_NAME}/
+# DESTINATION ${CATKIN_PACKAGE_INCLUDE_DESTINATION}
+# FILES_MATCHING PATTERN "*.h"
+# PATTERN ".svn" EXCLUDE
+# )
+
+## Mark other files for installation (e.g. launch and bag files, etc.)
+# install(FILES
+# # myfile1
+# # myfile2
+# DESTINATION ${CATKIN_PACKAGE_SHARE_DESTINATION}
+# )
+
+#############
+## Testing ##
+#############
+
+## Add gtest based cpp test target and link libraries
+# catkin_add_gtest(${PROJECT_NAME}-test test/test_butia_vision_language_model.cpp)
+# if(TARGET ${PROJECT_NAME}-test)
+# target_link_libraries(${PROJECT_NAME}-test ${PROJECT_NAME})
+# endif()
+
+## Add folders to be run by python nosetests
+# catkin_add_nosetests(test)
diff --git a/butia_vision_language_model/config/vision_language_model_node.yaml b/butia_vision_language_model/config/vision_language_model_node.yaml
new file mode 100644
index 0000000..184d86b
--- /dev/null
+++ b/butia_vision_language_model/config/vision_language_model_node.yaml
@@ -0,0 +1,11 @@
+vlm_api_type: google-genai # Must be one of: ['openai', 'ollama', 'google-genai']
+vlm_api_host: dummy # Must be set for openai and ollama; not used for google-genai
+vlm_api_model: gemini-1.5-flash
+
+subscribers:
+ image_rgb:
+ topic: /butia_vision/bvb/image_rgb
+
+servers:
+ visual_question_answering:
+ service: /butia_vision/bvlm/visual_question_answering/query
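A minimal offline check of the google-genai backend configured above (a sketch: it assumes a valid `GOOGLE_API_KEY` in the environment and a local `test.jpg`, both hypothetical here, and mirrors the message format the node builds below):

```python
# Standalone sanity check for the configured VLM backend; run outside ROS.
# Assumes GOOGLE_API_KEY is exported and test.jpg exists in the working directory.
import base64
from langchain_core.messages import HumanMessage
from langchain_google_genai.chat_models import ChatGoogleGenerativeAI

with open('test.jpg', 'rb') as f:
    b64_image_str = base64.b64encode(f.read()).decode()

vlm = ChatGoogleGenerativeAI(model='gemini-1.5-flash', convert_system_message_to_human=True)
message = HumanMessage(content=[
    {'type': 'image_url', 'image_url': {'url': f"data:image/jpeg;base64,{b64_image_str}"}},
    {'type': 'text', 'text': 'Describe what is in this image.'},
])
print(vlm.invoke([message]).content)
```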
diff --git a/butia_vision_language_model/launch/vision_language_model.launch b/butia_vision_language_model/launch/vision_language_model.launch
new file mode 100644
index 0000000..c45d7db
--- /dev/null
+++ b/butia_vision_language_model/launch/vision_language_model.launch
@@ -0,0 +1,6 @@
+<launch>
+  <node name="vision_language_model_node" pkg="butia_vision_language_model" type="vision_language_model_node.py" output="screen">
+    <rosparam command="load" file="$(find butia_vision_language_model)/config/vision_language_model_node.yaml"/>
+  </node>
+</launch>
\ No newline at end of file
diff --git a/butia_vision_language_model/package.xml b/butia_vision_language_model/package.xml
new file mode 100644
index 0000000..4129a5f
--- /dev/null
+++ b/butia_vision_language_model/package.xml
@@ -0,0 +1,59 @@
+<?xml version="1.0"?>
+<package format="2">
+  <name>butia_vision_language_model</name>
+  <version>0.0.0</version>
+  <description>The butia_vision_language_model package</description>
+
+  <maintainer email="cris@todo.todo">cris</maintainer>
+
+  <license>TODO</license>
+
+  <buildtool_depend>catkin</buildtool_depend>
+
+  <export>
+  </export>
+</package>
diff --git a/butia_vision_language_model/scripts/vision_language_model_node.py b/butia_vision_language_model/scripts/vision_language_model_node.py
new file mode 100755
index 0000000..73cebe4
--- /dev/null
+++ b/butia_vision_language_model/scripts/vision_language_model_node.py
@@ -0,0 +1,87 @@
+#!/usr/bin/env python3
+import rospy
+from sensor_msgs.msg import Image
+from butia_vision_msgs.srv import VisualQuestionAnswering, VisualQuestionAnsweringRequest, VisualQuestionAnsweringResponse
+import PIL.Image
+from ros_numpy import numpify
+import base64
+from io import BytesIO
+from langchain_core.messages import HumanMessage
+
+try:
+    from langchain_community.chat_models.ollama import ChatOllama
+except ImportError:
+    pass
+try:
+    from langchain_openai.chat_models import ChatOpenAI
+except ImportError:
+    pass
+try:
+    from langchain_google_genai.chat_models import ChatGoogleGenerativeAI
+except ImportError:
+    pass
+
+
+class VisionLanguageModelNode:
+ def __init__(self):
+ self.read_parameters()
+ if self.vlm_api_type == 'ollama':
+ self.vlm = ChatOllama(model=self.vlm_api_model, base_url=self.vlm_api_host)
+ elif self.vlm_api_type == 'openai':
+ self.vlm = ChatOpenAI(model_name=self.vlm_api_model, openai_api_base=self.vlm_api_host)
+        elif self.vlm_api_type == 'google-genai':
+ self.vlm = ChatGoogleGenerativeAI(model=self.vlm_api_model, convert_system_message_to_human=True)
+ else:
+ raise ValueError(f"VLM API type must be one of: {['ollama', 'openai', 'google-genai']}!")
+ self.image_rgb_subscriber = rospy.Subscriber(self.rgb_image_topic, Image, callback=self._update_rgb_image)
+ self.visual_question_answering_server = rospy.Service(self.visual_question_answering_service, VisualQuestionAnswering, handler=self._handle_visual_question_answering)
+
+ def _update_rgb_image(self, msg: Image):
+ self.rgb_image_msg = msg
+
+ def _handle_visual_question_answering(self, req: VisualQuestionAnsweringRequest):
+ message = HumanMessage(
+ content=[
+ self.get_image_content(),
+ {
+ 'type': 'text',
+ 'text': f'{req.question}'
+ }
+ ]
+ )
+ res = VisualQuestionAnsweringResponse()
+ res.answer = self.vlm.invoke([message,]).content
+ res.confidence = 1.0
+ return res
+
+ def get_image_content(self):
+        self.rgb_image_msg = rospy.wait_for_message(self.rgb_image_topic, Image)
+ buffered = BytesIO()
+ img = PIL.Image.fromarray(numpify(self.rgb_image_msg)[:,:,::-1])
+ img.save(buffered, format='JPEG')
+ b64_image_str = base64.b64encode(buffered.getvalue()).decode()
+ if self.vlm_api_type in ('ollama',):
+ return {
+ 'type': 'image_url',
+ 'image_url': f"data:image/jpeg;base64,{b64_image_str}"
+ }
+ else:
+ return {
+ 'type': 'image_url',
+ 'image_url': {
+ 'url': f"data:image/jpeg;base64,{b64_image_str}"
+ }
+ }
+
+
+ def read_parameters(self):
+ self.vlm_api_type = rospy.get_param('~vlm_api_type')
+ self.vlm_api_host = rospy.get_param('~vlm_api_host')
+ self.vlm_api_model = rospy.get_param('~vlm_api_model')
+ self.rgb_image_topic = rospy.get_param('~subscribers/image_rgb/topic')
+ self.visual_question_answering_service = rospy.get_param('~servers/visual_question_answering/service')
+
+if __name__ == '__main__':
+ rospy.init_node('vision_language_model_node', anonymous=True)
+ node = VisionLanguageModelNode()
+ rospy.spin()
\ No newline at end of file
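For completeness, a sketch of querying this node from another ROS process. It assumes the service path from `vision_language_model_node.yaml` and the `question`/`answer`/`confidence` fields used in the handler above:

```python
# Sketch of a client for the standalone VLM node; assumes the bvlm service path
# from the config file and the srv fields referenced in the handler above.
import rospy
from butia_vision_msgs.srv import VisualQuestionAnswering, VisualQuestionAnsweringRequest

rospy.init_node('bvlm_vqa_client', anonymous=True)
rospy.wait_for_service('/butia_vision/bvlm/visual_question_answering/query')
query = rospy.ServiceProxy('/butia_vision/bvlm/visual_question_answering/query', VisualQuestionAnswering)
res = query(VisualQuestionAnsweringRequest(question='Is there a person in front of the robot?'))
print(res.answer, res.confidence)
```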