Source code for girder.utility.s3_assetstore_adapter

# -*- coding: utf-8 -*-
import boto3
import botocore
import cherrypy
import json
import re
import requests
import six
import uuid
from six.moves import urllib

from girder import logger, events
from girder.api.rest import setContentDisposition
from girder.exceptions import GirderException, ValidationException
from girder.models.file import File
from girder.models.folder import Folder
from girder.models.item import Item
from .abstract_assetstore_adapter import AbstractAssetstoreAdapter

# Number of bytes requested per read when a download is piped from S3 through
# this server (used as the ``chunk_size`` of the streaming response iterator).
BUF_LEN = 65536  # Buffer size for download stream
# Region name used when building connection parameters for an assetstore that
# does not specify a region of its own.
DEFAULT_REGION = 'us-east-1'


[docs]class S3AssetstoreAdapter(AbstractAssetstoreAdapter):
    """
    This assetstore type stores files on S3. It is responsible for generating
    HMAC-signed messages that authorize the client to communicate directly with
    the S3 server where the files are stored.
    """

    CHUNK_LEN = 1024 * 1024 * 32  # Chunk size for uploading
    HMAC_TTL = 120  # Number of seconds each signed message is valid

    @staticmethod
    def _s3Client(connectParams):
        """
        Create a boto3 S3 client from a dictionary of connection parameters.

        When the configured ``endpoint_url`` has ``googleapis`` as one of the
        dot-separated labels of its host, the handler that sets the URL
        encoding type on ``ListObjects`` calls is unregistered and the client
        is flagged with ``_useGoogleAccessId`` so that presigned URLs can be
        adjusted later.

        :param connectParams: keyword arguments passed to ``boto3.client``.
        :type connectParams: dict
        :returns: the configured client.
        :raises ValidationException: if anything fails while building the
            client; the underlying exception is logged.
        """
        try:
            s3 = boto3.client('s3', **connectParams)
            endpoint = connectParams.get('endpoint_url', '')
            hostLabels = urllib.parse.urlparse(endpoint).netloc.split('.')
            if 'googleapis' in hostLabels:
                s3.meta.events.unregister(
                    'before-parameter-build.s3.ListObjects',
                    botocore.handlers.set_list_objects_encoding_type_url)
                s3._useGoogleAccessId = True
        except Exception:
            logger.exception('S3 assetstore validation exception')
            raise ValidationException('Unable to connect to S3 assetstore')
        return s3

[docs]    @staticmethod
    def validateInfo(doc):
        """
        Makes sure the root field is a valid absolute path and is writeable.
        """
        if 'prefix' not in doc:
            doc['prefix'] = ''
        # remove slashes from front and back of the prefix
        doc['prefix'] = doc['prefix'].strip('/')
        if not doc.get('bucket'):
            raise ValidationException('Bucket must not be empty.', 'bucket')

        # construct a set of connection parameters based on the keys and the service
        if 'service' not in doc:
            doc['service'] = ''
        if doc['service'] != '':
            if not re.match('^((https?)://)?([^:/]+)(:([0-9]+))?$', doc['service']):
                raise ValidationException(
                    'The service must of the form [http[s]://](host domain)[:(port)].', 'service')
        params = makeBotoConnectParams(
            doc['accessKeyId'], doc['secret'], doc['service'], doc.get('region'),
            doc.get('inferCredentials'))
        client = S3AssetstoreAdapter._s3Client(params)
        if doc.get('readOnly'):
            try:
                client.head_bucket(Bucket=doc['bucket'])
            except Exception:
                logger.exception('S3 assetstore validation exception')
                raise ValidationException(
                    'Unable to connect to bucket "%s".' % doc['bucket'], 'bucket')
        else:
            # Make sure we can write into the given bucket using boto
            try:
                key = '/'.join(filter(None, (doc['prefix'], 'girder_test')))
                client.put_object(Bucket=doc['bucket'], Key=key, Body=b'')
                client.delete_object(Bucket=doc['bucket'], Key=key)
            except Exception:
                logger.exception('S3 assetstore validation exception')
                raise ValidationException(
                    'Unable to write into bucket "%s".' % doc['bucket'], 'bucket')

        return doc

    def __init__(self, assetstore):
        """
        Set up the adapter and, when possible, its S3 client.

        The connection parameters and client are only created when the
        assetstore document contains all of ``accessKeyId``, ``secret`` and
        ``service``; otherwise ``connectParams`` and ``client`` are left unset.

        :param assetstore: the assetstore document, forwarded to the parent
            class initializer.
        """
        super(S3AssetstoreAdapter, self).__init__(assetstore)
        if any(name not in self.assetstore
               for name in ('accessKeyId', 'secret', 'service')):
            return

        store = self.assetstore
        self.connectParams = makeBotoConnectParams(
            store['accessKeyId'], store['secret'], store['service'],
            store.get('region'), store.get('inferCredentials'))
        self.client = S3AssetstoreAdapter._s3Client(self.connectParams)

    def _getRequestHeaders(self, upload):
        """
        Build the HTTP headers that accompany a direct upload request to S3.

        :param upload: the upload document; its ``name``, ``userId`` and
            optional ``mimeType`` are used.
        :type upload: dict
        :returns: a dict of header names to values. The server-side encryption
            header is included only if the assetstore enables it.
        """
        disposition = setContentDisposition(upload['name'], setHeader=False)
        mimeType = upload.get('mimeType', '')
        uploaderId = str(upload['userId'])
        uploaderIp = str(cherrypy.request.remote.ip)

        result = {
            'Content-Disposition': disposition,
            'Content-Type': mimeType,
            'x-amz-acl': 'private',
            'x-amz-meta-uploader-id': uploaderId,
            'x-amz-meta-uploader-ip': uploaderIp
        }

        encrypt = self.assetstore.get('serverSideEncryption')
        if encrypt:
            result['x-amz-server-side-encryption'] = 'AES256'
        return result

    def _generatePresignedUrl(self, *args, **kwargs):
        """
        Wrap self.client.generate_presigned_url to allow it work with
        Google Cloud Storage.

        See https://gist.github.com/gleicon/2b8acb9f9c0f22753eaac227ff997b34
        """
        url = self.client.generate_presigned_url(*args, **kwargs)
        if getattr(self.client, '_useGoogleAccessId', False):
            awskey, gskey = 'AWSAccessKeyId', 'GoogleAccessId'
            parsed = urllib.parse.urlparse(url)
            if awskey in urllib.parse.parse_qs(parsed.query):
                qsl = urllib.parse.parse_qsl(parsed.query)
                qsl = [(key if key != awskey else gskey, value) for key, value in qsl]
                url = urllib.parse.urlunparse((
                    parsed[0], parsed[1], parsed[2], parsed[3],
                    urllib.parse.urlencode(qsl),
                    parsed[5]))
        return url

[docs]    def initUpload(self, upload):
        """
        Build the request required to initiate an authorized upload to S3.
        """
        if upload['size'] <= 0:
            return upload

        uid = uuid.uuid4().hex
        key = '/'.join(filter(
            None, (self.assetstore.get('prefix', ''), uid[:2], uid[2:4], uid)))
        path = '/%s/%s' % (self.assetstore['bucket'], key)
        chunked = upload['size'] > self.CHUNK_LEN
        headers = self._getRequestHeaders(upload)
        params = {
            'Bucket': self.assetstore['bucket'],
            'Key': key,
            'ACL': headers['x-amz-acl'],
            'ContentDisposition': headers['Content-Disposition'],
            'ContentType': headers['Content-Type'],
            'Metadata': {
                'uploader-id': headers['x-amz-meta-uploader-id'],
                'uploader-ip': headers['x-amz-meta-uploader-ip']
            }
        }

        if self.assetstore.get('serverSideEncryption'):
            params['ServerSideEncryption'] = 'AES256'

        requestInfo = {
            'headers': headers,
            'method': 'PUT'
        }
        upload['behavior'] = 's3'
        upload['s3'] = {
            'chunked': chunked,
            'chunkLength': self.CHUNK_LEN,
            'relpath': path,
            'key': key,
            'request': requestInfo
        }

        if chunked:
            method = 'create_multipart_upload'
            requestInfo['method'] = 'POST'
        else:
            method = 'put_object'
            params['ContentLength'] = upload['size']

        requestInfo['url'] = self._generatePresignedUrl(ClientMethod=method, Params=params)
        return upload

[docs]    def uploadChunk(self, upload, chunk):
        """
        Rather than processing actual bytes of the chunk, this will generate
        the signature required to upload the chunk. Clients that do not support
        direct-to-S3 upload can pass the chunk via the request body as with
        other assetstores, and Girder will proxy the data through to S3.

        :param chunk: This should be a JSON string containing the chunk number
            and S3 upload ID. If a normal chunk file-like object is passed,
            we will send the data to S3.
        """
        if isinstance(chunk, six.string_types):
            return self._clientUploadChunk(upload, chunk)
        else:
            return self._proxiedUploadChunk(upload, chunk)

    def _clientUploadChunk(self, upload, chunk):
        """
        Clients that support direct-to-S3 upload behavior will go through this
        method by sending a normally-encoded form string as the chunk parameter,
        containing the required JSON info for uploading. This generates the
        signed URL that the client should use to upload the chunk to S3.
        """
        info = json.loads(chunk)
        index = int(info['partNumber']) - 1
        length = min(self.CHUNK_LEN, upload['size'] - index * self.CHUNK_LEN)

        if 'contentLength' in info and int(info['contentLength']) != length:
            raise ValidationException('Expected chunk size %d, but got %d.' % (
                length, info['contentLength']))

        if length <= 0:
            raise ValidationException('Invalid chunk length %d.' % length)

        url = self._generatePresignedUrl(ClientMethod='upload_part', Params={
            'Bucket': self.assetstore['bucket'],
            'Key': upload['s3']['key'],
            'ContentLength': length,
            'UploadId': info['s3UploadId'],
            'PartNumber': info['partNumber']
        })

        upload['s3']['uploadId'] = info['s3UploadId']
        upload['s3']['partNumber'] = info['partNumber']
        upload['s3']['request'] = {
            'method': 'PUT',
            'url': url
        }

        return upload

    def _proxiedUploadChunk(self, upload, chunk):
        """
        Send a chunk of data to S3 on behalf of the client.

        Clients that do not support direct-to-S3 upload behavior send the
        chunk data as they would for other assetstore types. For chunked
        uploads, a multipart upload is initiated on the first chunk and each
        chunk is sent as the next part through a presigned URL. Non-chunked
        uploads must arrive as a single chunk, which is sent using the request
        stored on the upload document.

        :param upload: the upload document; its ``received`` count and ``s3``
            sub-document are updated.
        :type upload: dict
        :param chunk: the chunk data; it must provide ``getSize()`` and be
            usable as a ``requests`` request body.
        :returns: the updated upload document.
        :raises ValidationException: if a non-chunked upload is sent with
            fewer bytes than its declared size.
        :raises GirderException: if S3 responds with a status other than 200
            or 201.
        """
        if upload['s3']['chunked']:
            if 'uploadId' not in upload['s3']:
                # Initiate a new multipart upload if this is the first chunk
                disp = 'attachment; filename="%s"' % upload['name']
                mime = upload.get('mimeType', '')
                createParams = {
                    'Bucket': self.assetstore['bucket'],
                    'Key': upload['s3']['key'],
                    'ACL': 'private',
                    'ContentDisposition': disp,
                    'ContentType': mime,
                    'Metadata': {
                        'uploader-id': str(upload['userId']),
                        'uploader-ip': str(cherrypy.request.remote.ip)
                    }
                }
                # Honor the assetstore encryption setting here as well, so
                # proxied multipart uploads are stored the same way as
                # direct uploads.
                if self.assetstore.get('serverSideEncryption'):
                    createParams['ServerSideEncryption'] = 'AES256'
                mp = self.client.create_multipart_upload(**createParams)
                upload['s3']['uploadId'] = mp['UploadId']
                upload['s3']['keyName'] = mp['Key']
                upload['s3']['partNumber'] = 0

            upload['s3']['partNumber'] += 1
            size = chunk.getSize()
            headers = {
                'Content-Length': str(size)
            }

            # We can't just call upload_part directly because they require a
            # seekable file object, and ours isn't.
            url = self._generatePresignedUrl(ClientMethod='upload_part', Params={
                'Bucket': self.assetstore['bucket'],
                'Key': upload['s3']['key'],
                'ContentLength': size,
                'UploadId': upload['s3']['uploadId'],
                'PartNumber': upload['s3']['partNumber']
            })

            resp = requests.request(method='PUT', url=url, data=chunk, headers=headers)
            if resp.status_code not in (200, 201):
                logger.error('S3 multipart upload failure %d (uploadId=%s):\n%s' % (
                    resp.status_code, upload['_id'], resp.text))
                raise GirderException('Upload failed (bad gateway)')

            upload['received'] += size
        else:
            size = chunk.getSize()
            if size < upload['size']:
                raise ValidationException('Uploads of this length must be sent in a single chunk.')

            # Reuse the signed request created when the upload was initialized,
            # adding only the length of the body being sent.
            reqInfo = upload['s3']['request']
            resp = requests.request(
                method=reqInfo['method'], url=reqInfo['url'], data=chunk,
                headers=dict(reqInfo['headers'], **{'Content-Length': str(size)}))
            if resp.status_code not in (200, 201):
                logger.error('S3 upload failure %d (uploadId=%s):\n%s' % (
                    resp.status_code, upload['_id'], resp.text))
                raise GirderException('Upload failed (bad gateway)')

            upload['received'] = size

        return upload

[docs]    def requestOffset(self, upload):
        if upload['received'] > 0:
            # This is only set when we are proxying the data to S3
            return upload['received']

        if upload['s3']['chunked']:
            raise ValidationException(
                'You should not call requestOffset on a chunked direct-to-S3 upload.')

        headers = self._getRequestHeaders(upload)
        params = {
            'Bucket': self.assetstore['bucket'],
            'Key': upload['s3']['key'],
            'ACL': headers['x-amz-acl'],
            'ContentDisposition': headers['Content-Disposition'],
            'ContentLength': upload['size'],
            'ContentType': headers['Content-Type'],
            'Metadata': {
                'uploader-id': headers['x-amz-meta-uploader-id'],
                'uploader-ip': headers['x-amz-meta-uploader-ip']
            }
        }

        if self.assetstore.get('serverSideEncryption'):
            params['ServerSideEncryption'] = 'AES256'

        url = self._generatePresignedUrl(ClientMethod='put_object', Params=params)

        return {
            'method': 'PUT',
            'url': url,
            'headers': headers,
            'offset': 0
        }

[docs]    def finalizeUpload(self, upload, file):
        if upload['size'] <= 0:
            return file

        file['relpath'] = upload['s3']['relpath']
        file['s3Key'] = upload['s3']['key']

        if upload['s3']['chunked']:
            if upload['received'] > 0:
                # We proxied the data to S3
                parts = self.client.list_parts(
                    Bucket=self.assetstore['bucket'], Key=file['s3Key'],
                    UploadId=upload['s3']['uploadId'])
                parts = [{
                    'ETag': part['ETag'],
                    'PartNumber': part['PartNumber']
                } for part in parts.get('Parts', [])]
                self.client.complete_multipart_upload(
                    Bucket=self.assetstore['bucket'], Key=file['s3Key'],
                    UploadId=upload['s3']['uploadId'], MultipartUpload={'Parts': parts})
            else:
                url = self._generatePresignedUrl(
                    ClientMethod='complete_multipart_upload', Params={
                        'Bucket': self.assetstore['bucket'],
                        'Key': upload['s3']['key'],
                        'UploadId': upload['s3']['uploadId']
                    })
                file['s3FinalizeRequest'] = {
                    'method': 'POST',
                    'url': url,
                    'headers': {'Content-Type': 'text/plain;charset=UTF-8'}
                }
                file['additionalFinalizeKeys'] = ('s3FinalizeRequest',)
        return file

[docs]    def downloadFile(self, file, offset=0, headers=True, endByte=None,
                     contentDisposition=None, extraParameters=None, **kwargs):
        """
        When downloading a single file with HTTP, we redirect to S3. Otherwise,
        e.g. when downloading as part of a zip stream, we connect to S3 and
        pipe the bytes from S3 through the server to the user agent.
        """
        if file['size'] <= 0:
            if headers:
                self.setContentHeaders(file, 0, 0)

            def stream():
                yield ''
            return stream

        params = {
            'Bucket': self.assetstore['bucket'],
            'Key': file['s3Key']
        }

        if contentDisposition == 'inline' and not file.get('imported'):
            params['ResponseContentDisposition'] = 'inline; filename="%s"' % file['name']

        url = self._generatePresignedUrl(ClientMethod='get_object', Params=params)

        if headers:
            raise cherrypy.HTTPRedirect(url)
        else:
            headers = {}
            if offset or endByte is not None:
                if endByte is None or endByte > file['size']:
                    endByte = file['size']
                headers = {'Range': 'bytes=%d-%d' % (offset, endByte - 1)}

            def stream():
                pipe = requests.get(url, stream=True, headers=headers)
                for chunk in pipe.iter_content(chunk_size=BUF_LEN):
                    if chunk:
                        yield chunk
            return stream

[docs]    def importData(self, parent, parentType, params, progress, user, **kwargs):
        importPath = params.get('importPath', '').strip().lstrip('/')

        if importPath and not importPath.endswith('/'):
            importPath += '/'

        bucket = self.assetstore['bucket']
        paginator = self.client.get_paginator('list_objects')
        pageIterator = paginator.paginate(Bucket=bucket, Prefix=importPath, Delimiter='/')
        for resp in pageIterator:
            # Start with objects
            for obj in resp.get('Contents', []):
                if progress:
                    progress.update(message=obj['Key'])

                name = obj['Key'].rsplit('/', 1)[-1]
                if not name:
                    continue

                if parentType != 'folder':
                    raise ValidationException(
                        'Keys cannot be imported directly underneath a %s.' % parentType)

                if self.shouldImportFile(obj['Key'], params):
                    item = Item().createItem(
                        name=name, creator=user, folder=parent, reuseExisting=True)
                    # Create a file record; delay saving it until we have added
                    # the import information.
                    file = File().createFile(
                        name=name, creator=user, item=item, reuseExisting=True,
                        assetstore=self.assetstore, mimeType=None, size=obj['Size'],
                        saveFile=False)
                    file['s3Key'] = obj['Key']
                    file['imported'] = True
                    File().save(file)

            # Now recurse into subdirectories
            for obj in resp.get('CommonPrefixes', []):
                if progress:
                    progress.update(message=obj['Prefix'])

                name = obj['Prefix'].rstrip('/').rsplit('/', 1)[-1]

                folder = Folder().createFolder(
                    parent=parent, name=name, parentType=parentType, creator=user,
                    reuseExisting=True)
                self.importData(parent=folder, parentType='folder', params={
                    'importPath': obj['Prefix']
                }, progress=progress, user=user, **kwargs)

[docs]    def deleteFile(self, file):
        """
        We want to queue up files to be deleted asynchronously since it requires
        an external HTTP request per file in order to delete them, and we don't
        want to wait on that.

        Files that were imported as pre-existing data will not actually be
        deleted from S3, only their references in Girder will be deleted.
        """
        if file['size'] > 0 and 'relpath' in file:
            q = {
                'relpath': file['relpath'],
                'assetstoreId': self.assetstore['_id']
            }
            matching = File().find(q, limit=2, fields=[])
            if matching.count(True) == 1:
                events.daemon.trigger(info={
                    'client': self.client,
                    'bucket': self.assetstore['bucket'],
                    'key': file['s3Key']
                }, callback=_deleteFileImpl)

[docs]    def fileUpdated(self, file):
        """
        On file update, if the name or the MIME type changed, we must update
        them accordingly on the S3 key so that the file downloads with the
        correct name and content type.
        """
        if file.get('imported'):
            return

        bucket = self.assetstore['bucket']
        try:
            key = self.client.head_object(Bucket=bucket, Key=file['s3Key'])
        except botocore.exceptions.ClientError:
            return

        disp = 'attachment; filename="%s"' % file['name']
        mime = file.get('mimeType') or ''

        if key.get('ContentType') != mime or key.get('ContentDisposition') != disp:
            self.client.copy_object(
                Bucket=bucket, Key=file['s3Key'], Metadata=key['Metadata'],
                CopySource={'Bucket': bucket, 'Key': file['s3Key']}, ContentDisposition=disp,
                ContentType=mime, MetadataDirective='REPLACE')

[docs]    def cancelUpload(self, upload):
        """
        Delete the temporary files associated with a given upload.
        """
        if 'key' not in upload.get('s3', {}):
            return
        bucket = self.assetstore['bucket']
        key = upload['s3']['key']
        self.client.delete_object(Bucket=bucket, Key=key)

        # check if this is an abandoned multipart upload
        if 'uploadId' in upload['s3']:
            try:
                self.client.abort_multipart_upload(
                    Bucket=bucket, Key=key, UploadId=upload['s3']['uploadId'])
            except botocore.exceptions.ClientError:
                pass

[docs]    def untrackedUploads(self, knownUploads=None, delete=False):
        """
        List and optionally discard uploads that are in the assetstore but not
        in the known list.

        :param knownUploads: a list of upload dictionaries of all known incomplete uploads.
        :type knownUploads: list
        :param delete: if True, delete any unknown uploads.
        :type delete: bool
        :returns: a list of unknown uploads.
        """
        if self.assetstore.get('readOnly'):
            return []

        untrackedList = []
        prefix = self.assetstore.get('prefix', '')
        if prefix:
            prefix += '/'

        if knownUploads is None:
            knownUploads = []

        bucket = self.assetstore['bucket']
        getParams = {'Bucket': bucket}

        while True:
            multipartUploads = self.client.list_multipart_uploads(**getParams)
            if not multipartUploads.get('Uploads'):
                break
            for upload in multipartUploads['Uploads']:
                if self._uploadIsKnown(upload, knownUploads):
                    continue
                # don't include uploads with a different prefix; this allows a
                # single bucket to handle multiple assetstores and us to only
                # clean up the one we are in.  We could further validate that
                # the key name was of the format /(prefix)/../../(id)
                if not upload['Key'].startswith(prefix):
                    continue
                untrackedList.append({
                    's3': {
                        'uploadId': upload['UploadId'],
                        'key': upload['Key'],
                        'created': upload['Initiated']
                    }
                })
                if delete:
                    self.client.abort_multipart_upload(
                        Bucket=bucket, Key=upload['Key'], UploadId=upload['UploadId'])
            if not multipartUploads['IsTruncated']:
                break
            getParams['KeyMarker'] = multipartUploads['NextKeyMarker']
            getParams['UploadIdMarker'] = multipartUploads['NextUploadIdMarker']
        return untrackedList

    def _uploadIsKnown(self, multipartUpload, knownUploads):
        """
        Check if a multipartUpload as returned by boto is in our list of known uploads.

        :param multipartUpload: an upload entry from get_all_multipart_uploads.
        :param knownUploads: a list of our known uploads.
        :results: Whether the upload is known
        """
        for upload in knownUploads:
            if ('s3' in upload and 'uploadId' in upload['s3']
                    and 'key' in upload['s3']):
                if (multipartUpload['UploadId'] == upload['s3']['uploadId']
                        and multipartUpload['Key'] == upload['s3']['key']):
                    return True
        return False


def makeBotoConnectParams(accessKeyId, secret, service=None, region=None, inferCredentials=False):
    """
    Build the keyword arguments used to create a boto3 S3 client.

    Three credential modes are supported, checked in this order: inferred
    credentials (boto's own lookup, see
    http://boto3.readthedocs.io/en/latest/guide/configuration.html#configuring-credentials),
    explicitly supplied key and secret, and unsigned anonymous access when
    neither applies.

    :param accessKeyId: the S3 access key ID
    :param secret: the S3 secret key
    :param service: alternate service URL; ``https://`` is assumed when no
        scheme is given.
    :param region: the AWS region name of the bucket (if not "us-east-1")
    :param inferCredentials: Whether or not Boto should infer the credentials
        without directly using accessKeyId and secret.
    :returns: boto connection parameter dictionary.
    """
    if not region:
        region = DEFAULT_REGION

    useExplicitKeys = bool(accessKeyId and secret) and not inferCredentials
    if inferCredentials or useExplicitKeys:
        signatureVersion = 's3v4'
    else:
        # Anonymous access
        signatureVersion = botocore.UNSIGNED

    params = {}
    if useExplicitKeys:
        params['aws_access_key_id'] = accessKeyId
        params['aws_secret_access_key'] = secret
    params['config'] = botocore.client.Config(
        signature_version=signatureVersion, region_name=region)

    if service:
        if not service.startswith(('http://', 'https://')):
            service = 'https://' + service
        params['endpoint_url'] = service

    return params


def _deleteFileImpl(event):
    event.info['client'].delete_object(Bucket=event.info['bucket'], Key=event.info['key'])