Skip to content
This repository has been archived by the owner on Feb 17, 2023. It is now read-only.

Add python bindings for skyhook #291

Draft
wants to merge 43 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
43 commits
Select commit Hold shift + click to select a range
e7c6971
Add python bindings for skyhook
JayjeetAtGithub Mar 12, 2022
89fc7a7
Add python bindings for skyhook
JayjeetAtGithub Mar 12, 2022
7870077
Add python bindings for skyhook
JayjeetAtGithub Mar 12, 2022
18f8aca
Add python bindings for skyhook
JayjeetAtGithub Mar 12, 2022
5d59d88
Add python bindings for skyhook
JayjeetAtGithub Mar 12, 2022
c1c6274
Add python bindings for skyhook
JayjeetAtGithub Mar 12, 2022
68118b6
Add python bindings for skyhook
JayjeetAtGithub Mar 12, 2022
aa09840
Add python bindings for skyhook
JayjeetAtGithub Mar 12, 2022
3597f70
Add bindings for RadosConnCtx
JayjeetAtGithub Mar 13, 2022
08204c0
Add bindings for RadosConnCtx
JayjeetAtGithub Mar 13, 2022
0d4a382
Add bindings for RadosConnCtx
JayjeetAtGithub Mar 13, 2022
a576784
Add bindings for RadosConnCtx
JayjeetAtGithub Mar 13, 2022
82d71fe
Add bindings for RadosConnCtx
JayjeetAtGithub Mar 13, 2022
ed02042
Add bindings for RadosConnCtx
JayjeetAtGithub Mar 13, 2022
0eaa308
Add bindings for RadosConnCtx
JayjeetAtGithub Mar 13, 2022
c904963
Add bindings for RadosConnCtx
JayjeetAtGithub Mar 13, 2022
41a8464
Add bindings for RadosConnCtx
JayjeetAtGithub Mar 13, 2022
1e98664
Add bindings for RadosConnCtx
JayjeetAtGithub Mar 13, 2022
f81dc35
Add bindings for RadosConnCtx
JayjeetAtGithub Mar 13, 2022
c632266
Add bindings for RadosConnCtx
JayjeetAtGithub Mar 13, 2022
fbd02ee
Add bindings for RadosConnCtx
JayjeetAtGithub Mar 13, 2022
35f9ff3
Add bindings for RadosConnCtx
JayjeetAtGithub Mar 13, 2022
ba5e071
Add bindings for RadosConnCtx
JayjeetAtGithub Mar 13, 2022
c60b276
Add bindings for RadosConnCtx
JayjeetAtGithub Mar 13, 2022
5ce28fe
Add bindings for RadosConnCtx
JayjeetAtGithub Mar 13, 2022
3deb62f
Add bindings for RadosConnCtx
JayjeetAtGithub Mar 13, 2022
8d0d6a7
Add bindings for RadosConnCtx
JayjeetAtGithub Mar 13, 2022
19cc7df
Add bindings for RadosConnCtx
JayjeetAtGithub Mar 13, 2022
3cdbc3f
updates
JayjeetAtGithub Mar 13, 2022
3ba4026
Add pkg config
JayjeetAtGithub Mar 13, 2022
bb325b3
Updates
JayjeetAtGithub Mar 13, 2022
7512b98
Updates
JayjeetAtGithub Mar 13, 2022
57c2e29
Updates
JayjeetAtGithub Mar 13, 2022
7ad9c3c
Rename libs
JayjeetAtGithub Mar 13, 2022
fa8e178
Rename libs
JayjeetAtGithub Mar 13, 2022
ca2b32c
Rename libs
JayjeetAtGithub Mar 13, 2022
85f41da
Rename libs
JayjeetAtGithub Mar 13, 2022
8eb6efb
Rename libs
JayjeetAtGithub Mar 13, 2022
8e3547d
Fixes
JayjeetAtGithub Mar 13, 2022
0a16d0d
Fixes
JayjeetAtGithub Mar 13, 2022
7759942
Fixes
JayjeetAtGithub Mar 13, 2022
38d146f
Fixes
JayjeetAtGithub Mar 13, 2022
3bec9b1
Fixes
JayjeetAtGithub Mar 13, 2022
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
88 changes: 88 additions & 0 deletions cpp/cmake_modules/FindArrowSkyhook.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

# - Find Arrow Skyhook (arrow/dataset/api.h, libarrow_dataset.a, libarrow_dataset.so)
#
# This module requires Arrow from which it uses
# arrow_find_package()
#
# This module defines
# ARROW_SKYHOOK_FOUND, whether Arrow Skyhook has been found
# ARROW_SKYHOOK_IMPORT_LIB,
# path to libarrow_dataset's import library (Windows only)
# ARROW_SKYHOOK_INCLUDE_DIR, directory containing headers
# ARROW_SKYHOOK_LIB_DIR, directory containing Arrow Skyhook libraries
# ARROW_SKYHOOK_SHARED_LIB, path to libarrow_dataset's shared library
# ARROW_SKYHOOK_STATIC_LIB, path to libarrow_dataset.a

if(DEFINED ARROW_SKYHOOK_FOUND)
return()
endif()

set(find_package_arguments)
if(${CMAKE_FIND_PACKAGE_NAME}_FIND_VERSION)
list(APPEND find_package_arguments "${${CMAKE_FIND_PACKAGE_NAME}_FIND_VERSION}")
endif()
if(${CMAKE_FIND_PACKAGE_NAME}_FIND_REQUIRED)
list(APPEND find_package_arguments REQUIRED)
endif()
if(${CMAKE_FIND_PACKAGE_NAME}_FIND_QUIETLY)
list(APPEND find_package_arguments QUIET)
endif()
find_package(Arrow ${find_package_arguments})

if(ARROW_FOUND)
arrow_find_package(ARROW_SKYHOOK
"${ARROW_HOME}"
arrow_skyhook
skyhook/client/file_skyhook.h
ArrowSkyhook
arrow-skyhook)
if(NOT ARROW_SKYHOOK_VERSION)
set(ARROW_SKYHOOK_VERSION "${ARROW_VERSION}")
endif()
endif()

if("${ARROW_SKYHOOK_VERSION}" VERSION_EQUAL "${ARROW_VERSION}")
set(ARROW_SKYHOOK_VERSION_MATCH TRUE)
else()
set(ARROW_SKYHOOK_VERSION_MATCH FALSE)
endif()

mark_as_advanced(ARROW_SKYHOOK_IMPORT_LIB
ARROW_SKYHOOK_INCLUDE_DIR
ARROW_SKYHOOK_LIBS
ARROW_SKYHOOK_LIB_DIR
ARROW_SKYHOOK_SHARED_IMP_LIB
ARROW_SKYHOOK_SHARED_LIB
ARROW_SKYHOOK_STATIC_LIB
ARROW_SKYHOOK_VERSION
ARROW_SKYHOOK_VERSION_MATCH)

find_package_handle_standard_args(
ArrowSkyhook
REQUIRED_VARS ARROW_SKYHOOK_INCLUDE_DIR ARROW_SKYHOOK_LIB_DIR
ARROW_SKYHOOK_VERSION_MATCH
VERSION_VAR ARROW_SKYHOOK_VERSION)
set(ARROW_SKYHOOK_FOUND ${ArrowSkyhook_FOUND})

if(ArrowSkyhook_FOUND AND NOT ArrowSkyhook_FIND_QUIETLY)
message(STATUS "Found the Arrow Skyhook by ${ARROW_SKYHOOK_FIND_APPROACH}")
message(STATUS "Found the Arrow Skyhook shared library: ${ARROW_SKYHOOK_SHARED_LIB}")
message(STATUS "Found the Arrow Skyhook import library: ${ARROW_SKYHOOK_IMPORT_LIB}")
message(STATUS "Found the Arrow Skyhook static library: ${ARROW_SKYHOOK_STATIC_LIB}")
endif()
38 changes: 38 additions & 0 deletions cpp/src/skyhook/ArrowSkyhookConfig.cmake.in
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
# This config sets the following variables in your project::
#
# ArrowDataset_FOUND - true if Arrow Dataset found on the system
#
# This config sets the following targets in your project::
#
# arrow_dataset_shared - for linked as shared library if shared library is built
# arrow_dataset_static - for linked as static library if static library is built

@PACKAGE_INIT@

include(CMakeFindDependencyMacro)
find_dependency(Arrow)
find_dependency(ArrowDataset)
find_dependency(Parquet)

# Load targets only once. If we load targets multiple times, CMake reports
# already existent target error.
if(NOT (TARGET arrow_dataset_shared OR TARGET arrow_dataset_static))
include("${CMAKE_CURRENT_LIST_DIR}/ArrowDatasetTargets.cmake")
endif()
22 changes: 13 additions & 9 deletions cpp/src/skyhook/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
add_subdirectory(client)

# define the targets to build
add_custom_target(arrow_skyhook_client)
add_custom_target(arrow_skyhook)
add_custom_target(cls_skyhook)

# define the dependencies
Expand All @@ -28,19 +28,23 @@ set(ARROW_SKYHOOK_LINK_STATIC arrow_dataset_static librados::rados)
set(ARROW_SKYHOOK_LINK_SHARED arrow_dataset_shared librados::rados)

# define the client and cls sources
set(ARROW_SKYHOOK_CLIENT_SOURCES client/file_skyhook.cc protocol/rados_protocol.cc
set(ARROW_SKYHOOK_SOURCES client/file_skyhook.cc protocol/rados_protocol.cc
protocol/skyhook_protocol.cc)
set(ARROW_SKYHOOK_CLS_SOURCES cls/cls_skyhook.cc protocol/rados_protocol.cc
protocol/skyhook_protocol.cc)

# define the client library
add_arrow_lib(arrow_skyhook_client
add_arrow_lib(arrow_skyhook
CMAKE_PACKAGE_NAME
ArrowSkyhook
PKG_CONFIG_NAME
skyhook
arrow-skyhook
# PKG_CONFIG_NAME
# skyhook
SOURCES
${ARROW_SKYHOOK_CLIENT_SOURCES}
${ARROW_SKYHOOK_SOURCES}
OUTPUTS
ARROW_SKYHOOK_CLIENT_LIBRARIES
ARROW_SKYHOOK_LIBRARIES
SHARED_LINK_LIBS
${ARROW_SKYHOOK_LINK_SHARED}
STATIC_LINK_LIBS
Expand All @@ -58,15 +62,15 @@ add_arrow_lib(cls_skyhook
${ARROW_SKYHOOK_LINK_STATIC})

# finish building the project
add_dependencies(arrow_skyhook_client ${ARROW_SKYHOOK_CLIENT_LIBRARIES})
add_dependencies(arrow_skyhook ${ARROW_SKYHOOK_LIBRARIES})
add_dependencies(cls_skyhook ${ARROW_SKYHOOK_CLS_LIBRARIES})

# define the test builds
if(ARROW_TEST_LINKAGE STREQUAL "static")
set(ARROW_SKYHOOK_TEST_LINK_LIBS arrow_skyhook_client_static arrow_dataset_static
set(ARROW_SKYHOOK_TEST_LINK_LIBS arrow_skyhook_static arrow_dataset_static
${ARROW_TEST_STATIC_LINK_LIBS})
else()
set(ARROW_SKYHOOK_TEST_LINK_LIBS arrow_skyhook_client_shared arrow_dataset_shared
set(ARROW_SKYHOOK_TEST_LINK_LIBS arrow_skyhook_shared arrow_dataset_shared
${ARROW_TEST_SHARED_LINK_LIBS})
endif()

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,4 +23,4 @@ Name: Skyhook
Description: Skyhook is a plugin for offloading computations into Ceph.
Version: @SKYHOOK_VERSION@
Requires: arrow_dataset
Libs: -L${libdir} -larrow_skyhook_client
Libs: -L${libdir} -larrow_skyhook
2 changes: 2 additions & 0 deletions cpp/src/skyhook/client/file_skyhook.h
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,8 @@ struct RadosConnCtx {
std::string ceph_cluster_name;
std::string ceph_cls_name;

RadosConnCtx() = default;

RadosConnCtx(std::string ceph_config_path, std::string ceph_data_pool,
std::string ceph_user_name, std::string ceph_cluster_name,
std::string ceph_cls_name)
Expand Down
21 changes: 21 additions & 0 deletions python/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}")
option(PYARROW_BUILD_DATASET "Build the PyArrow Dataset integration" OFF)
option(PYARROW_BUILD_GANDIVA "Build the PyArrow Gandiva integration" OFF)
option(PYARROW_BUILD_PARQUET "Build the PyArrow Parquet integration" OFF)
option(PYARROW_BUILD_SKYHOOK "Build the PyArrow Skyhook integration" OFF)
option(PYARROW_PARQUET_USE_SHARED "Rely on parquet shared libraries where relevant" ON)
option(PYARROW_BUILD_PARQUET_ENCRYPTION
"Build the PyArrow Parquet encryption integration" OFF)
Expand Down Expand Up @@ -511,6 +512,23 @@ if(PYARROW_BUILD_ORC)
endif()
endif()

if (PYARROW_BUILD_SKYHOOK)
# Skyhook
find_package(ArrowSkyhook REQUIRED)
include_directories(SYSTEM ${ARROW_SKYHOOK_INCLUDE_DIR})
if(PYARROW_BUILD_DATASET)
if(PYARROW_BUNDLE_ARROW_CPP)
file(COPY ${ARROW_SKYHOOK_INCLUDE_DIR}/skyhook
DESTINATION ${BUILD_OUTPUT_ROOT_DIRECTORY}/include)
bundle_arrow_lib(ARROW_SKYHOOK_SHARED_LIB SO_VERSION ${ARROW_SO_VERSION})
if(MSVC)
bundle_arrow_import_lib(ARROW_SKYHOOK_IMPORT_LIB)
endif()
endif()
set(CYTHON_EXTENSIONS ${CYTHON_EXTENSIONS} _dataset_skyhook)
endif()
endif()

# Flight
if(PYARROW_BUILD_FLIGHT)
# Arrow Flight
Expand Down Expand Up @@ -632,6 +650,9 @@ if(PYARROW_BUILD_DATASET)
if(PYARROW_BUILD_PARQUET)
target_link_libraries(_dataset_parquet PRIVATE ${DATASET_LINK_LIBS})
endif()
if (PYARROW_BUILD_SKYHOOK)
target_link_libraries(_dataset_skyhook PRIVATE ${DATASET_LINK_LIBS})
endif()
endif()

if(PYARROW_BUILD_GANDIVA)
Expand Down
19 changes: 19 additions & 0 deletions python/pyarrow/_dataset.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,24 @@ def _get_parquet_classes():
_dataset_pq = None


def _get_skyhook_fileformat():
"""
Import SkyhookFileFormat on first usage (to avoid circular import issue
when `pyarrow._dataset_skyhook` would be imported first)
"""
global _skyhook_fileformat
global _skyhook_imported
if not _skyhook_imported:
try:
from pyarrow._dataset_skyhook import SkyhookFileFormat
_skyhook_fileformat = SkyhookFileFormat
except ImportError as e:
_skyhook_fileformat = None
finally:
_skyhook_imported = True
return _skyhook_fileformat


def _get_parquet_symbol(name):
"""
Get a symbol from pyarrow.parquet if the latter is importable, otherwise
Expand Down Expand Up @@ -681,6 +699,7 @@ cdef class FileFormat(_Weakrefable):
'csv': CsvFileFormat,
'parquet': _get_parquet_symbol('ParquetFileFormat'),
'orc': _get_orc_fileformat(),
'skyhook': _get_skyhook_fileformat(),
}

class_ = classes.get(type_name, None)
Expand Down
68 changes: 68 additions & 0 deletions python/pyarrow/_dataset_skyhook.pyx
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

# cython: language_level = 3

from pyarrow._dataset cimport FileFormat
from pyarrow.lib cimport *
from pyarrow.lib import frombytes, tobytes
from pyarrow.includes.libarrow_dataset_skyhook cimport *


cdef class SkyhookFileFormat(FileFormat):
"""
A FileFormat implementation that offloads the fragment
scan operations to the Ceph OSDs.
Parameters
---------
file_format: The underlying file format to use.
ceph_config_path: The path to the Ceph config file.
data_pool: Name of the CephFS data pool.
user_name: The username accessing the Ceph cluster.
cluster_name: Name of the cluster.
"""
cdef:
CSkyhookFileFormat* skyhook_format

def __init__(
self,
file_format="parquet",
ceph_config_path="/etc/ceph/ceph.conf",
ceph_data_pool="cephfs_data",
ceph_user_name="client.admin",
ceph_cluster_name="ceph",
ceph_cls_name="arrow"
):
cdef:
CRadosConnCtx ctx

ctx.ceph_config_path = ceph_config_path
ctx.ceph_data_pool = ceph_data_pool
ctx.ceph_user_name = ceph_user_name
ctx.ceph_cluster_name = ceph_cluster_name
ctx.ceph_cls_name = ceph_cls_name

self.init(shared_ptr[CFileFormat](
new CSkyhookFileFormat(
make_shared[CRadosConnCtx](ctx),
tobytes(file_format)
)
))

cdef void init(self, const shared_ptr[CFileFormat]& sp):
FileFormat.init(self, sp)
self.skyhook_format = <CSkyhookFileFormat*> sp.get()
19 changes: 19 additions & 0 deletions python/pyarrow/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,18 @@
from pyarrow.compute import Expression, scalar, field # noqa


_skyhook_available = False
_skyhook_msg = (
"The pyarrow installation is not built with support for the Skyhook file "
"format."
)

try:
from pyarrow._dataset_skyhook import SkyhookFileFormat
_skyhook_available = True
except ImportError:
pass

_orc_available = False
_orc_msg = (
"The pyarrow installation is not built with support for the ORC file "
Expand Down Expand Up @@ -92,6 +104,9 @@ def __getattr__(name):
if name == "ParquetFileFormat" and not _parquet_available:
raise ImportError(_parquet_msg)

if name == "SkyhookFileFormat" and not _skyhook_available:
raise ImportError(_skyhook_msg)

raise AttributeError(
"module 'pyarrow.dataset' has no attribute '{0}'".format(name)
)
Expand Down Expand Up @@ -263,6 +278,10 @@ def _ensure_format(obj):
if not _orc_available:
raise ValueError(_orc_msg)
return OrcFileFormat()
elif obj == "skyhook":
if not _skyhook_available:
raise ValueError(_skyhook_msg)
return SkyhookFileFormat()
else:
raise ValueError("format '{}' is not supported".format(obj))

Expand Down
Loading