gzip compression for S3 uploads with boto3

2 Jun '17

boto3 doesn’t do compressed uploading, probably because S3 is pretty cheap, and in most cases it’s simply not worth the effort.

But for text files, compression can be over 10x (e.g. uncompressed 50MiB, compressed 5MiB). And if you allow downloads from S3, and you use gzip, browsers can uncompress the file automatically on download. This is awesome if you have e.g. the sales team download a huge CSV file! (To get this to work, you’ll need to set the correct content type. Browsers care about that, boto3 doesn’t.)

Sadly, Python’s gzip library is a bit confusing to use. Also, S3 requires the final (compressed) content length up front, so the compression needs to be performed in full before the upload starts.

For well-compressible files, I compress them in memory, but for truly large files, you can pass in e.g. a TemporaryFile to allow better scaling. See for yourself:

from io import BytesIO
import gzip
import shutil
def upload_gzipped(bucket, key, fp, compressed_fp=None, content_type='text/plain'):
    """Compress the contents of fp with gzip and upload them to S3.

    Args:
        bucket: boto3 S3 Bucket resource to upload into.
        key: Target object key.
        fp: Readable binary file object holding the uncompressed data.
        compressed_fp: Optional writable, seekable binary file object used as
            scratch space for the gzipped bytes (e.g. a TemporaryFile for
            large inputs). If None, compression is performed in memory.
        content_type: Value stored as the object's ContentType metadata so
            browsers know how to render the decompressed payload.
    """
    # Explicit None check: file objects are always truthy, but "is None"
    # states the intent and won't break for exotic file-like objects.
    if compressed_fp is None:
        compressed_fp = BytesIO()
    # The with-block closes the GzipFile before the upload, which flushes
    # the gzip trailer and makes the compressed stream complete.
    with gzip.GzipFile(fileobj=compressed_fp, mode='wb') as gz:
        shutil.copyfileobj(fp, gz)
    # Rewind so upload_fileobj reads the compressed data from the start.
    compressed_fp.seek(0)
    bucket.upload_fileobj(
        compressed_fp,
        key,
        # ContentEncoding: gzip lets browsers transparently decompress
        # the object when it is downloaded.
        {'ContentType': content_type, 'ContentEncoding': 'gzip'})
def download_gzipped(bucket, key, fp, compressed_fp=None):
    """Download a gzipped object from S3 and write its decompressed contents to fp.

    Args:
        bucket: boto3 S3 Bucket resource to download from.
        key: Object key of the gzipped object.
        fp: Writable binary file object receiving the decompressed data.
        compressed_fp: Optional writable, seekable binary file object used as
            scratch space for the downloaded gzipped bytes (e.g. a
            TemporaryFile for large objects). If None, decompression is
            performed in memory.
    """
    # Explicit None check: file objects are always truthy, so a truthiness
    # test would never detect a caller-supplied buffer anyway.
    if compressed_fp is None:
        compressed_fp = BytesIO()
    bucket.download_fileobj(key, compressed_fp)
    # Rewind so GzipFile reads the compressed stream from the start.
    compressed_fp.seek(0)
    with gzip.GzipFile(fileobj=compressed_fp, mode='rb') as gz:
        shutil.copyfileobj(gz, fp)
import boto3
from tempfile import TemporaryFile
from io import BytesIO
# Module-level S3 resource and the bucket used by the examples below.
s3 = boto3.resource('s3')
bucket = s3.Bucket('test') # CHANGE ME
def example1(bucket):
    """Round-trip foo.txt through S3, compressing entirely in memory."""
    with open('foo.txt', 'rb') as source:
        upload_gzipped(bucket, 'test.txt', source)
    with open('bar.txt', 'wb') as target:
        download_gzipped(bucket, 'test.txt', target)
def example2(bucket):
    """Round-trip foo.txt through S3, staging the gzipped bytes in a temp file."""
    with open('foo.txt', 'rb') as source, TemporaryFile() as scratch:
        upload_gzipped(bucket, 'test.txt', source, compressed_fp=scratch)
    with open('bar.txt', 'wb') as target, TemporaryFile() as scratch:
        download_gzipped(bucket, 'test.txt', target, compressed_fp=scratch)
# Some actual tests
# Round-trip a small payload: the object stored in S3 must differ from the
# plaintext (it is gzipped), while the downloaded-and-decompressed contents
# must match the original byte-for-byte.
original = BytesIO(b'Jackdaws love my big sphinx of quartz.')
original.seek(0)
upload_gzipped(bucket, 'test.txt', original)
# Fetch the raw (still-compressed) object directly, bypassing download_gzipped.
gzipped = BytesIO()
bucket.download_fileobj('test.txt', gzipped)
assert original.getvalue() != gzipped.getvalue()
# Now fetch through the helper, which decompresses on the way down.
ungzipped = BytesIO()
download_gzipped(bucket, 'test.txt', ungzipped)
assert original.getvalue() == ungzipped.getvalue()
view raw boto3-gzip.py hosted with ❤ by GitHub

Python

Newer Older