In a previous post, we showed how to ‘randomize’ names on object-based persistent storage to avoid service rate limits. That, however, did nothing to actually reduce the cost of object storage.
The cost of object storage is the product of a fixed rate, the amount stored and the time it is stored. You also pay for network traffic to and from the object store and for access requests (both PUTs and GETs). Request costs are really only controlled by the design of your application (PUTs are charged at a higher rate than GETs), and transferring data out of the cloud carries a premium (the so-called vendor lock-in). By compressing files before pushing them to the object store, you reduce the cost of both storage (less to store) and bandwidth (less to transfer). Ideally this should happen automatically.
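Before wiring this into every transfer, it is worth checking what compression actually buys you for your kind of data. Here is a minimal back-of-the-envelope sketch; the file name, the zstd level and the per-GB rate are illustrative assumptions, not real prices (check your region's current S3 pricing):

#!/usr/bin/env bash
# rough estimate of monthly storage savings from compressing one sample file
f="sample.bin"   # placeholder: a representative file of your data
rate=0.023       # assumed $/GB-month (illustrative only)
zstd -f -q -3 "$f" -o "$f.zst"
orig=$(wc -c < "$f")       # original size in bytes
comp=$(wc -c < "$f.zst")   # compressed size in bytes
awk -v o="$orig" -v c="$comp" -v r="$rate" 'BEGIN{
  printf "compression ratio: %.2f\n", o/c
  printf "storage saved per month: $%.4f\n", (o-c)/2^30*r
}'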
We will show an implementation here for the AWS S3 object store. Similar ideas can be applied to Azure's Blob Storage and Google Cloud Storage.
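For comparison, the core "compress, then upload" step translates fairly directly to the other providers' CLIs. A rough sketch only; the bucket, container and account names are placeholders, and the hashing/retry logic developed below would still need porting:

# Google Cloud Storage
zstd -q foo -o foo.zst && gsutil cp foo.zst gs://my-bucket/foo.zst

# Azure Blob Storage
zstd -q foo -o foo.zst && \
  az storage blob upload --account-name myaccount \
     --container-name mycontainer --name foo.zst --file foo.zst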
We will create wrapper functions around the AWS CLI commands for S3, and we will also ‘randomize’ the object names to avoid service rate limits. For compression we use zstd, which gives a good tradeoff between computational cost and compressed size for binary data; of course, any compression algorithm could be substituted.
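To see where that tradeoff sits for your own data, zstd's built-in benchmark mode is handy; a quick sketch (sample.bin is a placeholder for a representative file):

$ zstd -b1 -e19 sample.bin   # benchmark levels 1-19: reports ratio and speed per level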
$ cat utils.sh
#!/usr/bin/env bash
# some useful functions for dealing with S3
#   cps3 <source> <destination> <other aws s3 cp options>
#   mvs3 <source> <destination> <other aws s3 mv options>
#   lss3 [ <source> ]
#   finds3 <source> - if exit status is 0, file exists on s3
# hashes 'directories' and file names
# compress/decompress files automatically
#
# these vars are ASSUMED set
#   S3_bucket (w/ trailing slash: "s3://my.bucket/")
#   AWSCLI_DEBUG_UTIL AWSCLI_DEBUG
#   region
# we set the following
#   HASH_LEN  #length of hash to prepend to keep S3 happy
#   zsuf      #suffix for compressed files
#
HASH_LEN=5   #length of hash to prepend to keep S3 happy
zsuf=".zst"

function encodes3 {  #prepend a short md5 hash to each path component (recursively)
  [ -n "$1" ] && [ "$1" != "/" ] && \
    echo "$(echo "${1%%/*}" | md5sum | cut -c1-$HASH_LEN)${1%%/*}$([ "${1#*/}" != "$1" ] && echo "/$(encodes3 "${1#*/}")")" || echo "$1"
}

function decodes3 {  #strip the hash prefix from each path component (recursively)
  local o="${1%%/*}"
  [ -n "$1" ] && [ "$1" != "/" ] && \
    echo "${o:$HASH_LEN}$([ "${1#*/}" != "$1" ] && echo "/$(decodes3 "${1#*/}")")" || echo "$1"
}

function compress {  #compress $1 to $2 [="$1"$zsuf]; if $3 is present, delete the file going into the archive
  local o="${1}${zsuf}"
  [ -n "$2" ] && o="$2"
  if [ -n "$3" ]; then
    zstd -f --rm -q "$1" -o "$o"
  else
    zstd -f -q "$1" -o "$o"
  fi
}

function decompress {  #decompress $1 to $2 [="$1" w/o the $zsuf extension]; if $3 is present, delete the archive
  local o=${1//$zsuf}
  [ -n "$2" ] && o="$2"
  if [ -n "$3" ]; then
    zstd -f --rm -q -d "$1" -o "$o"
  else
    zstd -f -q -d "$1" -o "$o"
  fi
}

function cps3 {
  [ -n "$2" ] && [ "${1::2}" != "--" ] && [ "${2::2}" != "--" ] || return
  ([ "${1::3}" == "s3:" ] || [ "${2::3}" == "s3:" ]) || return  #at least one must be on s3
  local i="${1:3}"
  [ "${1::3}" == "s3:" ] && i="${S3_bucket}$(encodes3 "${i#/}")" || i="$1"  #fetching from s3 or not
  local cmd="aws $AWSCLI_DEBUG_UTIL --region $region s3 cp \"${i}"
  if [ "${1::3}" != "s3:" ]; then [ -f "$i" ] && compress "$i" && [ -f "$i$zsuf" ] || return; fi  #need to compress before push
  [ "${1:(-1)}" != "/" ] && [ "$1" != "." ] && cmd+="$zsuf"  #append zsuf if not directory
  local o="${2:3}"
  [ "${2::3}" == "s3:" ] && o="${S3_bucket}$(encodes3 "${o#/}")" || o="$2"  #pushing to s3 or not
  [ "$o" == "." ] && [ "${1:(-1)}" != "/" ] && o="$(basename "${1:3}")"  #if destination is '.', grab basename for target
  cmd+="\" \"$o"  #construct command, first source
  [ "${o:(-1)}" != "/" ] && [ "$o" != "." ] && cmd+="$zsuf"  #append zsuf if not directory
  cmd+="\" --quiet"  #everything on s3 is compressed
  local s
  shift 2 && for s in "${@}"; do [ "${s::2}" == "--" ] && cmd+=" $s" || cmd+=" \"$s\""; done  # extra parameters?
  s=1 && until eval "$cmd"; do  #retry until success, w/ random exponential backoff
    echo "WARNING: problem with ${cmd}" && sleep $(shuf -i 0-$s -n 1 | awk '{print 2**$1}') && ((s++))
  done
  if [ "${o::3}" != "s3:" ]; then  #pulled from s3: decompress and drop the archive
    [ -f "$o$zsuf" ] && decompress "$o$zsuf" "$o" "delete" || return
  fi
  if [ "${i::3}" != "s3:" ]; then  #pushed to s3: remove the temporary local archive
    rm -f "$i$zsuf"
  fi
}

function mvs3 {
  [ -n "$2" ] && [ "${1::2}" != "--" ] && [ "${2::2}" != "--" ] || return
  local i="${1:3}" && [ "${1::3}" == "s3:" ] || i="$1"
  local o="${2:3}" && [ "${2::3}" == "s3:" ] || o="$2"
  i="${S3_bucket}$(encodes3 "${i#/}")" && [ "$i" == "${i%/}" ] && i+=${zsuf}  #everything on s3 is compressed
  o="${S3_bucket}$(encodes3 "${o#/}")" && [ "$o" == "${o%/}" ] && o+=${zsuf}
  shift 2
  local cmd="aws $AWSCLI_DEBUG_UTIL --region $region s3 mv \"${i}\" \"${o}\" --quiet"
  local s
  for s in "${@}"; do [ "${s::2}" == "--" ] && cmd+=" $s" || cmd+=" \"$s\""; done  # extra parameters?
  s=1
  until eval "$cmd"; do  #retry until success, w/ random exponential backoff
    echo "WARNING: problem with ${cmd}" && sleep $(shuf -i 0-$s -n 1 | awk '{print 2**$1}') && ((s++))
  done
}

function lss3 {  #list 'directory' $1 with the hash prefixes and $zsuf stripped
  local o=""
  [ -n "$1" ] && [ "${1::2}" != "--" ] && o="$1" && shift && [ "${o::3}" == "s3:" ] && o="${o:3}"
  aws --region $region s3 ls "${S3_bucket}$(encodes3 "${o#/}")" $@ | \
    sed -E "s/(.+) (.{$HASH_LEN})(.*)$/\3s \1 \3/" | \
    sort | \
    sed -E "s/^(.*)s (.*)/\2/; s/.zst$//"
}

function finds3 {  #exit status 0 if $1 exists on s3
  [ -n "$1" ] && [ "${1::2}" != "--" ] && [ "$1" == "${1%/}" ] || return
  aws $AWSCLI_DEBUG --region $region s3 ls "${S3_bucket}$(encodes3 "${1#/}")${zsuf}" > /dev/null 2> /dev/null
}

function rms3 {  #rm $1 from s3
  [ -n "$1" ] && [ "${1::2}" != "--" ] || return
  local i="$1" && shift
  [ "${i::3}" == "s3:" ] && i="${i:3}"
  i="\"${S3_bucket}$(encodes3 "${i#/}")" && [ "$i" == "${i%/}" ] && i+=${zsuf}
  i+="\""
  local s
  for s in "${@}"; do [ "${s::2}" == "--" ] && i+=" $s" || i+=" \"$s\""; done  # extra parameters?
  eval "aws $AWSCLI_DEBUG_UTIL --region $region s3 rm $i --quiet"
}

$ source utils.sh
$ echo $S3_bucket
s3://com.entonos.wp/
$ touch foo bar
$ cps3 foo s3:FOO/foo
$ cps3 bar s3:bar
$ lss3
                           PRE FOO/
2019-08-16 20:47:21         13 bar
$ lss3 FOO/
2019-08-16 20:47:12         13 foo
$ aws s3 ls --region $region $S3_bucket
                           PRE 70297FOO/
2019-08-16 20:47:21         13 c157abar.zst
$ aws s3 ls --region $region ${S3_bucket}70297FOO/
2019-08-16 20:47:12         13 d3b07foo.zst
$ rm *
$ cps3 s3:bar .
$ cps3 s3:FOO/foo foobar
$ ls
bar  foobar
$ rms3 bar
$ rms3 FOO/ --recursive
$ lss3
$
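Since finds3 returns exit status 0 when an object already exists, these wrappers compose naturally into scripts that skip work already done. A hypothetical sketch (result.dat and expensive_job are placeholders):

#!/usr/bin/env bash
source utils.sh   # assumes S3_bucket, region, etc. are already set

if finds3 runs/result.dat; then
  cps3 s3:runs/result.dat .            # reuse the cached (compressed) result
else
  ./expensive_job > result.dat         # otherwise compute it ...
  cps3 result.dat s3:runs/result.dat   # ... compress and push for next time
fi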