- Use case: ML training sets
- Wanted: parallel, random access to objects (blobs)
- Compression of objects, if applicable
- SHA-1 digest for de-duplication, if applicable
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <archive.h>
#include <archive_entry.h>
#include <hdf5.h>
#include <openssl/sha.h>

#define BUF_SIZE 1048576

#ifndef PATH_MAX
#define PATH_MAX 4096
#endif
if (argc != 2) {
    printf("ERROR: Expected a single TAR file argument. Usage: %s <TAR>\n", argv[0]);
    exit(1);
}
We use libarchive for reading TAR archives and extracting archive entries. We need a buffer buff for reading data, and we count the files we read in file_count. We don’t expect large files, and we set flg if an entry’s data doesn’t fit into a single buffer (BUF_SIZE, 1 MiB).
int retval;
char tarname[PATH_MAX];
struct archive *a;
struct archive_entry *entry;
char buff[BUF_SIZE];
size_t file_count = 0;
unsigned int flg = 0;
Create a new archive for read and open the TAR file.
a = archive_read_new();
archive_read_support_filter_all(a);
archive_read_support_format_all(a);
snprintf(tarname, PATH_MAX, "%s", argv[1]);
printf("| %s\n", tarname);
retval = archive_read_open_filename(a, tarname, BUF_SIZE);
if (retval != ARCHIVE_OK) {
printf("ERROR: Can't open TAR file!\n");
exit(1);
}
Keep reading the next header until done. For each header, read the data in chunks of up to BUF_SIZE bytes. A return value of 0 from archive_read_data indicates that we are done with an entry. Set flg after the first read so that we can detect entries whose data exceeds a single buffer. If that’s the case, print a warning. To prevent double counting, we increment the file count only on the first block read. Finally, print the number of (non-empty) files we’ve seen.
while (archive_read_next_header(a, &entry) == ARCHIVE_OK) {
for (;;) {
retval = archive_read_data(a, buff, BUF_SIZE);
assert (retval >= 0);
if (retval == 0) {
flg = 0;
break;
}
if (flg == 1)
printf("WARNING: File > 64K found.\n");
if (flg == 0)
++file_count;
flg = 1;
}
}
printf("%d files can be extracted.\n", file_count);
Close the archive.
retval = archive_read_free(a);
if (retval != ARCHIVE_OK) {
printf("ERROR: Can't close the archive properly!\n");
exit(1);
}
<<std-includes>>
<<archive-includes>>
<<defines>>
int main(int argc, char** argv) {
<<basic-definitions>>
<<arg-check>>
<<open-archive>>
<<archive-checker-main-loop>>
<<close-archive>>
exit(0);
}
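Assuming the chunks above are tangled into a single source file, say tarchk.c (an illustrative name), the checker builds with something along these lines; the exact include and library paths depend on your libarchive, HDF5, and OpenSSL installations:

cc tarchk.c -o tarchk -larchive -lhdf5 -lcrypto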
Let’s assume that all files in the archive do not exceed a certain size, such as 64K. We can then read the whole show in one go and don’t need a more complex control structure.
while (archive_read_next_header(a, &entry) == ARCHIVE_OK) {
retval = archive_read_data(a, buff, BUF_SIZE);
assert (retval >= 0 && retval < 65536);
if (retval > 0)
++file_count;
}
printf("%d files can be extracted.\n", file_count);
Presumably, all files in the archive are smaller than 64K. Let’s create matching HDF5 datasets with a compact storage layout! (The compact layout stores a dataset’s raw data in its object header, which limits it to 64 KiB; hence the assumption.) We use the location in the archive as the dataset path name.
We have the same set of variables as before plus a few HDF5 extras.
<<basic-definitions>>
char h5name[PATH_MAX];
hid_t fapl, file, space, dcpl, lcpl, dset;
hsize_t dims;
Open the archive and create an HDF5 file. Initialize HDF5 property lists that we’ll use in the main loop.
<<open-archive>>
assert((lcpl = H5Pcreate(H5P_LINK_CREATE)) >= 0);
assert(H5Pset_create_intermediate_group(lcpl, 1) >= 0);
assert((dcpl = H5Pcreate(H5P_DATASET_CREATE)) >= 0);
assert(H5Pset_layout(dcpl, H5D_COMPACT) >= 0);
assert((fapl = H5Pcreate(H5P_FILE_ACCESS)) >= 0);
assert(H5Pset_libver_bounds(fapl, H5F_LIBVER_V18, H5F_LIBVER_V110) >= 0);
snprintf(h5name, PATH_MAX, "%s.compactor.h5", tarname);
printf("| %s\n", h5name);
assert((file = H5Fcreate(h5name, H5F_ACC_TRUNC, H5P_DEFAULT, fapl))
>= 0);
assert(H5Pclose(fapl) >= 0);
Retrieve the data as before and write them to the corresponding HDF5 dataset. The name of a file in the archive can be retrieved via archive_entry_pathname.
while (archive_read_next_header(a, &entry) == ARCHIVE_OK) {
retval = archive_read_data(a, buff, BUF_SIZE);
assert (retval >= 0 && retval < BUF_SIZE);
if (retval > 0) {
dims = (hsize_t) retval;
assert((space = H5Screate_simple(1, &dims, NULL)) >= 0);
if ((dset = H5Dcreate(file, archive_entry_pathname(entry),
H5T_NATIVE_UINT8, space, lcpl, dcpl,
H5P_DEFAULT)) >= 0) {
assert(H5Dwrite(dset, H5T_NATIVE_UINT8, H5S_ALL, H5S_ALL,
H5P_DEFAULT, buff)>= 0);
assert(H5Dclose(dset) >= 0);
++file_count;
}
assert(H5Sclose(space) >= 0);
}
}
printf("%d files extracted.\n", file_count);
Do the usual housekeeping.
assert(H5Fclose(file) >= 0);
assert(H5Pclose(dcpl) >= 0);
assert(H5Pclose(lcpl) >= 0);
<<close-archive>>
The resulting HDF5 profile will look like this:
HDF5 "compact.h5" { SUPER_BLOCK { SUPERBLOCK_VERSION 2 FREELIST_VERSION 0 SYMBOLTABLE_VERSION 0 OBJECTHEADER_VERSION 0 OFFSET_SIZE 8 LENGTH_SIZE 8 BTREE_RANK 16 BTREE_LEAF 4 ISTORE_K 32 FILE_SPACE_STRATEGY H5F_FSPACE_STRATEGY_FSM_AGGR FREE_SPACE_PERSIST FALSE FREE_SPACE_SECTION_THRESHOLD 1 FILE_SPACE_PAGE_SIZE 4096 USER_BLOCK { USERBLOCK_SIZE 0 } } GROUP "/" { GROUP "train_frames_12fps_128_center_cropped" { GROUP "0074cdXclLU" { DATASET "000001.jpg" { DATATYPE H5T_STD_U8LE DATASPACE SIMPLE { ( 3991 ) / ( 3991 ) } STORAGE_LAYOUT { COMPACT SIZE 3991 } ... } ... } ... } } }
This variation uses the SHA-1 message digest of the file data as an identifier. The corresponding HDF5 file will contain less metadata because of the flat top-level structure. It is a thought experiment only and problematic for at least two reasons:
- The SHA-1 calculation is rather slow. It needs to be replaced with something like MurmurHash3 or perfect hashing.
- There is a good chance that there will be collisions. What is the right default behavior, and what kind of options would users like?
We need two additional buffers for the SHA-1 digest and the corresponding path name.
<<h5compactor-definitions>>
unsigned char md[SHA_DIGEST_LENGTH];
char hash[2*SHA_DIGEST_LENGTH+1];  /* two hex digits per byte plus NUL */
int i;
<<open-archive>>
assert((lcpl = H5Pcreate(H5P_LINK_CREATE)) >= 0);
assert(H5Pset_create_intermediate_group(lcpl, 1) >= 0);
assert((dcpl = H5Pcreate(H5P_DATASET_CREATE)) >= 0);
assert(H5Pset_layout(dcpl, H5D_COMPACT) >= 0);
assert((fapl = H5Pcreate(H5P_FILE_ACCESS)) >= 0);
assert(H5Pset_libver_bounds(fapl, H5F_LIBVER_V18, H5F_LIBVER_V110) >= 0);
snprintf(h5name, PATH_MAX, "%s.compactor-sha1.h5", tarname);
printf("| %s\n", h5name);
assert((file = H5Fcreate(h5name, H5F_ACC_TRUNC, H5P_DEFAULT, fapl))
>= 0);
assert(H5Pclose(fapl) >= 0);
The main loop is exactly the same as for the regular compactor. We just need to calculate the SHA-1 message digest of the payload and convert it to a human-readable name.
If there is a collision (i.e., the same SHA-1 digest appears twice), the HDF5 library will report an error when we try to create the dataset a second time; we just ignore it and press on. We turn off the HDF5 error stack to keep the noise level down.
assert(H5Eset_auto(H5E_DEFAULT, NULL, NULL) >= 0);
while (archive_read_next_header(a, &entry) == ARCHIVE_OK) {
retval = archive_read_data(a, buff, BUF_SIZE);
assert (retval >= 0 && retval < 65536);
if (retval > 0) {
SHA1((unsigned char*) buff, (size_t) retval, md);
/* Each sprintf writes two hex digits plus a NUL; the next iteration
   overwrites the NUL, and the final one terminates the string. */
for (i = 0; i < SHA_DIGEST_LENGTH; ++i)
    sprintf(&hash[2*i], "%02x", md[i]);
dims = (hsize_t) retval;
assert((space = H5Screate_simple(1, &dims, NULL)) >= 0);
if ((dset = H5Dcreate(file, hash, H5T_NATIVE_UINT8, space,
H5P_DEFAULT, dcpl, H5P_DEFAULT)) >= 0) {
assert(H5Dwrite(dset, H5T_NATIVE_UINT8, H5S_ALL, H5S_ALL,
H5P_DEFAULT, buff)>= 0);
assert(H5Dclose(dset) >= 0);
++file_count;
}
assert(H5Sclose(space) >= 0);
}
}
printf("%d files extracted.\n", file_count);
The resulting HDF5 profile will look like this:
HDF5 "compact-sha1.h5" { SUPER_BLOCK { SUPERBLOCK_VERSION 2 FREELIST_VERSION 0 SYMBOLTABLE_VERSION 0 OBJECTHEADER_VERSION 0 OFFSET_SIZE 8 LENGTH_SIZE 8 BTREE_RANK 16 BTREE_LEAF 4 ISTORE_K 32 FILE_SPACE_STRATEGY H5F_FSPACE_STRATEGY_FSM_AGGR FREE_SPACE_PERSIST FALSE FREE_SPACE_SECTION_THRESHOLD 1 FILE_SPACE_PAGE_SIZE 4096 USER_BLOCK { USERBLOCK_SIZE 0 } } GROUP "/" { DATASET "010307d26fb757f466e7acad477eb2d4d21a42b5" { DATATYPE H5T_STD_U8LE DATASPACE SIMPLE { ( 3148 ) / ( 3148 ) } STORAGE_LAYOUT { COMPACT SIZE 3148 } ... } ... } }
The compactor created separate HDF5 datasets for each file in the TAR archive. The shredder stores all data in a single dataset, image. The bytes from each file begin (and end) at a certain position in that dataset, and we track that information in a separate dataset, image_offset. We do the same for the path names in the TAR archive in the name and name_offset datasets. The element type of the payload dataset will be bytes (H5T_NATIVE_UINT8), and the offsets will be stored as (potentially large) integers (H5T_NATIVE_HSIZE). In other words, the HDF5 shredder represents the file contents of the TAR archive in merely four HDF5 datasets. The main advantage of this representation is that it provides random access to the file data (and names); see the reader sketch at the end.
We have to keep track of the extents of four different datasets and need a few extra variables for the bookkeeping.
<<basic-definitions>>
char h5name[PATH_MAX];
hid_t fapl, file, fspace, mspace, dcpl;
hid_t image, image_offset, name, name_offset;
hsize_t dims, maxdims = H5S_UNLIMITED;
hsize_t extent = 0, offset_extent = 0, offset = 0;
hsize_t nextent = 0, noffset_extent = 0, noffset = 0;
hsize_t start, one = 1, block;
Since we generally don’t know how many files there are in an archive, all datasets will be extendible. In the case of the name dataset, we expect a certain repetitiveness, and it makes sense to apply (gzip) compression.
We use chunks of 2^20 (1,048,576) elements for the byte datasets and 2^17 (131,072) elements for the offset datasets.
<<open-archive>>
assert((fapl = H5Pcreate(H5P_FILE_ACCESS)) >= 0);
assert(H5Pset_libver_bounds(fapl, H5F_LIBVER_V18, H5F_LIBVER_V110) >= 0);
snprintf(h5name, PATH_MAX, "%s.shredder.h5", tarname);
printf("| %s\n", h5name);
assert((file = H5Fcreate(h5name, H5F_ACC_TRUNC, H5P_DEFAULT, fapl))
>= 0);
assert(H5Pclose(fapl) >= 0);
dims = (hsize_t) 0;
assert((fspace = H5Screate_simple(1, &dims, &maxdims)) >= 0);
assert((dcpl = H5Pcreate(H5P_DATASET_CREATE)) >= 0);
dims = 1<<17;
assert(H5Pset_chunk(dcpl, 1, &dims) >= 0);
assert((image_offset = H5Dcreate(file, "image_offset", H5T_NATIVE_HSIZE,
fspace, H5P_DEFAULT, dcpl, H5P_DEFAULT))
>= 0);
assert((name_offset = H5Dcreate(file, "name_offset", H5T_NATIVE_HSIZE,
fspace, H5P_DEFAULT, dcpl, H5P_DEFAULT))
>= 0);
dims = 1<<20;
assert(H5Pset_chunk(dcpl, 1, &dims) >= 0);
assert((image = H5Dcreate(file, "image", H5T_NATIVE_UINT8, fspace,
H5P_DEFAULT, dcpl, H5P_DEFAULT)) >= 0);
assert(H5Pset_deflate(dcpl, 6) >= 0);
assert((name = H5Dcreate(file, "name", H5T_NATIVE_UINT8, fspace, H5P_DEFAULT,
dcpl, H5P_DEFAULT)) >= 0);
assert(H5Pclose(dcpl) >= 0);
assert(H5Sclose(fspace) >= 0);
The main loop is more or less unchanged: fetch a new file from the archive and update the four datasets. The code for the image and image_offset datasets and for the name and name_offset datasets is identical in structure, so only the image-side blocks are shown in full below; a sketch of the omitted <<h5shredder-write-name>> chunk follows them. We have to keep in mind, though, that a given file may require multiple block reads, but we must update the name and offset datasets only once.
while (archive_read_next_header(a, &entry) == ARCHIVE_OK) {
for (;;) {
retval = archive_read_data(a, buff, BUF_SIZE);
assert (retval >= 0);
if (retval == 0) {
flg = 0;
break;
}
<<h5shredder-write-image>>
if (flg == 0) {
<<h5shredder-write-image-offset>>
<<h5shredder-write-name>>
++file_count;
}
flg = 1;
}
}
printf("%d files extracted.\n", file_count);
This is the block <<h5shredder-write-image>> for updating the image dataset. The update/extension is done in standard HDF5 fashion: construct an in-memory selection, extend the dataset, retrieve a handle of the in-file dataspace, (hyperslab-)select the destination in the file, and write. Remember that retval contains the number of bytes returned by the last archive_read_data call (the whole file, if it fits into a single buffer).
offset = start = extent;
block = (hsize_t) retval;
extent += block;
assert((mspace = H5Screate_simple(1, &block, NULL)) >= 0);
assert(H5Sselect_all(mspace) >= 0);
assert(H5Dset_extent(image, &extent) >= 0);
assert((fspace = H5Dget_space(image)) >= 0);
assert(H5Sselect_hyperslab(fspace, H5S_SELECT_SET, &start, NULL, &one,
&block) >= 0);
assert(H5Dwrite(image, H5T_NATIVE_UINT8, mspace, fspace, H5P_DEFAULT,
buff) >= 0);
assert(H5Sclose(fspace) >= 0);
assert(H5Sclose(mspace) >= 0);
The block <<h5shredder-write-image-offset>> appends the file’s starting position (saved in offset) to the image_offset dataset in the same fashion:
start = offset_extent;
block = 1;
offset_extent += block;
assert((mspace = H5Screate_simple(1, &block, NULL)) >= 0);
assert(H5Sselect_all(mspace) >= 0);
assert(H5Dset_extent(image_offset, &offset_extent) >= 0);
assert((fspace = H5Dget_space(image_offset)) >= 0);
assert(H5Sselect_hyperslab(fspace, H5S_SELECT_SET, &start, NULL, &one,
&block) >= 0);
assert(H5Dwrite(image_offset, H5T_NATIVE_HSIZE, mspace, fspace,
H5P_DEFAULT, &offset) >= 0);
assert(H5Sclose(fspace) >= 0);
assert(H5Sclose(mspace) >= 0);
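For completeness, here is a sketch of the omitted <<h5shredder-write-name>> chunk, under the assumption that it mirrors the image-side code: append the entry’s path name bytes to name and the name’s starting position to name_offset. Names are stored without terminators; the offsets delimit them.

/* Append the path name's bytes to the 'name' dataset. */
noffset = start = nextent;
block = (hsize_t) strlen(archive_entry_pathname(entry));
nextent += block;
assert((mspace = H5Screate_simple(1, &block, NULL)) >= 0);
assert(H5Sselect_all(mspace) >= 0);
assert(H5Dset_extent(name, &nextent) >= 0);
assert((fspace = H5Dget_space(name)) >= 0);
assert(H5Sselect_hyperslab(fspace, H5S_SELECT_SET, &start, NULL, &one,
                           &block) >= 0);
assert(H5Dwrite(name, H5T_NATIVE_UINT8, mspace, fspace, H5P_DEFAULT,
                archive_entry_pathname(entry)) >= 0);
assert(H5Sclose(fspace) >= 0);
assert(H5Sclose(mspace) >= 0);
/* Append the name's starting position to 'name_offset'. */
start = noffset_extent;
block = 1;
noffset_extent += block;
assert((mspace = H5Screate_simple(1, &block, NULL)) >= 0);
assert(H5Sselect_all(mspace) >= 0);
assert(H5Dset_extent(name_offset, &noffset_extent) >= 0);
assert((fspace = H5Dget_space(name_offset)) >= 0);
assert(H5Sselect_hyperslab(fspace, H5S_SELECT_SET, &start, NULL, &one,
                           &block) >= 0);
assert(H5Dwrite(name_offset, H5T_NATIVE_HSIZE, mspace, fspace,
                H5P_DEFAULT, &noffset) >= 0);
assert(H5Sclose(fspace) >= 0);
assert(H5Sclose(mspace) >= 0);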
Make sure we close the datasets and the file.
assert(H5Dclose(name_offset) >= 0);
assert(H5Dclose(name) >= 0);
assert(H5Dclose(image_offset) >= 0);
assert(H5Dclose(image) >= 0);
assert(H5Fclose(file) >= 0);
<<close-archive>>
The resulting HDF5 profile will look like this:
HDF5 "shredder.h5" { SUPER_BLOCK { SUPERBLOCK_VERSION 2 FREELIST_VERSION 0 SYMBOLTABLE_VERSION 0 OBJECTHEADER_VERSION 0 OFFSET_SIZE 8 LENGTH_SIZE 8 BTREE_RANK 16 BTREE_LEAF 4 ISTORE_K 32 FILE_SPACE_STRATEGY H5F_FSPACE_STRATEGY_FSM_AGGR FREE_SPACE_PERSIST FALSE FREE_SPACE_SECTION_THRESHOLD 1 FILE_SPACE_PAGE_SIZE 4096 USER_BLOCK { USERBLOCK_SIZE 0 } } GROUP "/" { DATASET "image" { DATATYPE H5T_STD_U8LE DATASPACE SIMPLE { ( 976083 ) / ( H5S_UNLIMITED ) } STORAGE_LAYOUT { CHUNKED ( 1048576 ) SIZE 1048576 } ... } DATASET "image_offset" { DATATYPE H5T_STD_U64LE DATASPACE SIMPLE { ( 242 ) / ( H5S_UNLIMITED ) } STORAGE_LAYOUT { CHUNKED ( 1048576 ) SIZE 8388608 } ... } DATASET "name" { DATATYPE H5T_STD_U8LE DATASPACE SIMPLE { ( 14762 ) / ( H5S_UNLIMITED ) } STORAGE_LAYOUT { CHUNKED ( 1048576 ) SIZE 1894 (7.794:1 COMPRESSION) } FILTERS { COMPRESSION DEFLATE { LEVEL 6 } } ... } DATASET "name_offset" { DATATYPE H5T_STD_U64LE DATASPACE SIMPLE { ( 242 ) / ( H5S_UNLIMITED ) } STORAGE_LAYOUT { CHUNKED ( 1048576 ) SIZE 8388608 } ... } } }