-
Notifications
You must be signed in to change notification settings - Fork 4
Limited random access reading on compressed data
Xiuwen Zheng edited this page Feb 17, 2015
·
5 revisions
- Data: a sequence of 10,000,000 random 0/1 values, each stored as a 32-bit integer
- in each 32-bit integer, one bit holds the random 0/1 value and the other 31 bits are zero
- so the lower bound of the compression percentage is 1/32 = 3.125%
- Testing:
- read one 32-bit integer at each of 10,000 random positions
- each method is run at its maximum compression level
- Raw, ZIP, ZIP_RA, LZ4 and LZ4_RA
- Running time:
- MacBook Pro, Retina, 13-inch, Late 2013, 2.8 GHz Intel Core i7, 16 GB 1600 MHz DDR3
Method | Raw | ZIP | ZIP_RA | LZ4 | LZ4_RA |
---|---|---|---|---|---|
Compression Percent | 100% | 5.08% | 5.42% | 6.98% | 7.33% |
Time (seconds) | 0.35 | 203.99 | 3.54 | 90.03 | 1.47 |
- R code:
library(gdsfmt)

# Create a new GDS file that will hold one copy of the data per
# compression method.
f <- createfn.gds("test.gds")

# 10,000,000 random 0/1 values stored as integers.
# sample.int(2, ...) draws from {1, 2}; subtracting 1L maps that to {0, 1}.
set.seed(100)
v <- sample.int(2, 10 * 1000 * 1000, replace = TRUE) - 1L
table(v)

# Compression settings to compare; the empty string is the "Raw" baseline.
compression <- c("", "ZIP.max", "ZIP_RA.max:16K", "LZ4.max", "LZ4_RA.max:16K")

# Write the same vector once per setting, as nodes "I1", "I2", ...
for (i in seq_along(compression)) {
  cat("Compression:", compression[i], "\n")
  node <- add.gdsn(f, paste0("I", i), val = v, compress = compression[i])
  # switch the freshly written node to read mode before moving on
  readmode.gdsn(node)
}

# Show the file summary (explicit print so it also appears under source())
print(f)

# Close the file
closefn.gds(f)
# Reopen the existing GDS file; the outer parentheses print its summary.
(f <- openfn.gds("test.gds"))

# Look up the node stored for each compression setting ("I1", "I2", ...).
n <- lapply(
  seq_along(compression),
  function(k) index.gdsn(f, paste0("I", k))
)

# Every other node must decode to the same values as the first node.
z1 <- read.gdsn(n[[1]])
# seq_along(n)[-1L] is empty when there is only one node, whereas
# 2:length(n) would count backwards (2, 1) in that case.
for (i in seq_along(n)[-1L]) {
  z <- read.gdsn(n[[i]])
  # check the length too, so recycling in `==` cannot mask a short read
  stopifnot(length(z) == length(z1), all(z1 == z))
}
# Draw 10,000 random positions in the data (fixed seed for repeatability;
# sampling is without replacement, so the positions are distinct).
set.seed(1000)
idx <- sample.int(length(v), 10000)

# Time single-element reads at those positions for each compression setting.
for (i in seq_along(n)) {
  cat("Compression:", compression[i], "\n")
  print(system.time({
    # integer buffer, matching the integer values held in `v`
    vv <- integer(length(idx))
    # One read.gdsn() call per position: the cost of each individual
    # random-access read is what is being timed, so the reads are
    # deliberately not batched into a single call.
    for (k in seq_along(idx)) {
      vv[k] <- read.gdsn(n[[i]], start = idx[k], count = 1)
    }
  }))
  # the values read back must match the in-memory originals
  stopifnot(all(vv == v[idx]))
}

# Close the file
closefn.gds(f)