@@ -0,0 +1,882 @@
# Copyright 2016 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You
# may not use this file except in compliance with the License. A copy of
# the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.
"""Abstractions over S3's upload/download operations.

This module provides high level abstractions for efficient
uploads/downloads. It handles several things for the user:

* Automatically switching to multipart transfers when
  a file is over a specific size threshold
* Uploading/downloading a file in parallel
* Throttling based on max bandwidth
* Progress callbacks to monitor transfers
* Retries. While botocore handles retries for streaming uploads,
  it is not possible for it to handle retries for streaming
  downloads. This module handles retries for both cases so
  you don't need to implement any retry logic yourself.

This module has a reasonable set of defaults. It also allows you
to configure many aspects of the transfer process including:

* Multipart threshold size
* Max parallel downloads
* Max bandwidth
* Socket timeouts
* Retry amounts

There is no support for s3->s3 multipart copies at this
time.


.. _ref_s3transfer_usage:

Usage
=====

The simplest way to use this module is:

.. code-block:: python

    client = boto3.client('s3', 'us-west-2')
    transfer = S3Transfer(client)
    # Upload /tmp/myfile to s3://bucket/key
    transfer.upload_file('/tmp/myfile', 'bucket', 'key')

    # Download s3://bucket/key to /tmp/myfile
    transfer.download_file('bucket', 'key', '/tmp/myfile')

The ``upload_file`` and ``download_file`` methods also accept
``**kwargs``, which will be forwarded through to the corresponding
client operation. Here are a few examples using ``upload_file``::

    # Making the object public
    transfer.upload_file('/tmp/myfile', 'bucket', 'key',
                         extra_args={'ACL': 'public-read'})

    # Setting metadata
    transfer.upload_file('/tmp/myfile', 'bucket', 'key',
                         extra_args={'Metadata': {'a': 'b', 'c': 'd'}})

    # Setting content type
    transfer.upload_file('/tmp/myfile.json', 'bucket', 'key',
                         extra_args={'ContentType': "application/json"})


The ``S3Transfer`` class also supports progress callbacks so you can
provide transfer progress to users. Both the ``upload_file`` and
``download_file`` methods take an optional ``callback`` parameter.
Here's an example of how to print a simple progress percentage
to the user:

.. code-block:: python

    class ProgressPercentage(object):
        def __init__(self, filename):
            self._filename = filename
            self._size = float(os.path.getsize(filename))
            self._seen_so_far = 0
            self._lock = threading.Lock()

        def __call__(self, bytes_amount):
            # To simplify we'll assume this is hooked up
            # to a single filename.
            with self._lock:
                self._seen_so_far += bytes_amount
                percentage = (self._seen_so_far / self._size) * 100
                sys.stdout.write(
                    "\r%s %s / %s (%.2f%%)" % (
                        self._filename, self._seen_so_far,
                        self._size, percentage))
                sys.stdout.flush()


    transfer = S3Transfer(boto3.client('s3', 'us-west-2'))
    # Upload /tmp/myfile to s3://bucket/key and print upload progress.
    transfer.upload_file('/tmp/myfile', 'bucket', 'key',
                         callback=ProgressPercentage('/tmp/myfile'))


You can also provide a TransferConfig object to the S3Transfer
object that gives you more fine grained control over the
transfer. For example:

.. code-block:: python

    client = boto3.client('s3', 'us-west-2')
    config = TransferConfig(
        multipart_threshold=8 * 1024 * 1024,
        max_concurrency=10,
        num_download_attempts=10,
    )
    transfer = S3Transfer(client, config)
    transfer.upload_file('/tmp/foo', 'bucket', 'key')


"""

import concurrent.futures
import functools
import logging
import math
import os
import queue
import random
import socket
import string
import threading
from logging import NullHandler

from botocore.compat import six  # noqa: F401
from botocore.exceptions import IncompleteReadError, ResponseStreamingError
from botocore.vendored.requests.packages.urllib3.exceptions import (
    ReadTimeoutError,
)

import s3transfer.compat
from s3transfer.exceptions import RetriesExceededError, S3UploadFailedError

__author__ = 'Amazon Web Services'
__version__ = '0.13.1'


logger = logging.getLogger(__name__)
logger.addHandler(NullHandler())

MB = 1024 * 1024
SHUTDOWN_SENTINEL = object()


def random_file_extension(num_digits=8):
    return ''.join(random.choice(string.hexdigits) for _ in range(num_digits))


def disable_upload_callbacks(request, operation_name, **kwargs):
    if operation_name in ['PutObject', 'UploadPart'] and hasattr(
        request.body, 'disable_callback'
    ):
        request.body.disable_callback()


def enable_upload_callbacks(request, operation_name, **kwargs):
    if operation_name in ['PutObject', 'UploadPart'] and hasattr(
        request.body, 'enable_callback'
    ):
        request.body.enable_callback()
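

# A minimal sketch (not part of the original module) showing how the two
# handlers above are wired into botocore's event system;
# S3Transfer.upload_file() below performs an equivalent registration.
# `my_client` is an assumed name for any boto3/botocore S3 client.
def _example_register_callback_toggles(my_client):
    events = my_client.meta.events
    # Disable progress callbacks first, so a body that botocore re-reads
    # while signing or retrying does not double-count bytes...
    events.register_first(
        'request-created.s3',
        disable_upload_callbacks,
        unique_id='s3upload-callback-disable',
    )
    # ...and re-enable them last, right before the body is actually sent.
    events.register_last(
        'request-created.s3',
        enable_upload_callbacks,
        unique_id='s3upload-callback-enable',
    )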


class QueueShutdownError(Exception):
    pass


class ReadFileChunk:
    def __init__(
        self,
        fileobj,
        start_byte,
        chunk_size,
        full_file_size,
        callback=None,
        enable_callback=True,
    ):
        """

        Given a file object shown below:

            |___________________________________________________|
            0          |                 |                 full_file_size
                       |----chunk_size---|
                    start_byte

        :type fileobj: file
        :param fileobj: File like object

        :type start_byte: int
        :param start_byte: The first byte from which to start reading.

        :type chunk_size: int
        :param chunk_size: The max chunk size to read. Trying to read
            past the end of the chunk size will behave like you've
            reached the end of the file.

        :type full_file_size: int
        :param full_file_size: The entire content length associated
            with ``fileobj``.

        :type callback: function(amount_read)
        :param callback: Called whenever data is read from this object.

        """
        self._fileobj = fileobj
        self._start_byte = start_byte
        self._size = self._calculate_file_size(
            self._fileobj,
            requested_size=chunk_size,
            start_byte=start_byte,
            actual_file_size=full_file_size,
        )
        self._fileobj.seek(self._start_byte)
        self._amount_read = 0
        self._callback = callback
        self._callback_enabled = enable_callback

    @classmethod
    def from_filename(
        cls,
        filename,
        start_byte,
        chunk_size,
        callback=None,
        enable_callback=True,
    ):
        """Convenience factory function to create from a filename.

        The full file size is determined from the file itself, so it does
        not need to be passed in.

        :type start_byte: int
        :param start_byte: The first byte from which to start reading.

        :type chunk_size: int
        :param chunk_size: The max chunk size to read. Trying to read
            past the end of the chunk size will behave like you've
            reached the end of the file.

        :type callback: function(amount_read)
        :param callback: Called whenever data is read from this object.

        :type enable_callback: bool
        :param enable_callback: Indicate whether to invoke callback
            during read() calls.

        :rtype: ``ReadFileChunk``
        :return: A new instance of ``ReadFileChunk``

        """
        f = open(filename, 'rb')
        file_size = os.fstat(f.fileno()).st_size
        return cls(
            f, start_byte, chunk_size, file_size, callback, enable_callback
        )

    def _calculate_file_size(
        self, fileobj, requested_size, start_byte, actual_file_size
    ):
        max_chunk_size = actual_file_size - start_byte
        return min(max_chunk_size, requested_size)

    def read(self, amount=None):
        if amount is None:
            amount_to_read = self._size - self._amount_read
        else:
            amount_to_read = min(self._size - self._amount_read, amount)
        data = self._fileobj.read(amount_to_read)
        self._amount_read += len(data)
        if self._callback is not None and self._callback_enabled:
            self._callback(len(data))
        return data

    def enable_callback(self):
        self._callback_enabled = True

    def disable_callback(self):
        self._callback_enabled = False

    def seek(self, where):
        self._fileobj.seek(self._start_byte + where)
        if self._callback is not None and self._callback_enabled:
            # To also rewind the callback() for an accurate progress report
            self._callback(where - self._amount_read)
        self._amount_read = where

    def close(self):
        self._fileobj.close()

    def tell(self):
        return self._amount_read

    def __len__(self):
        # __len__ is defined because requests will try to determine the length
        # of the stream to set a content length. In the normal case
        # of the file it will just stat the file, but we need to change that
        # behavior. By providing a __len__, requests will use that instead
        # of stat'ing the file.
        return self._size

    def __enter__(self):
        return self

    def __exit__(self, *args, **kwargs):
        self.close()

    def __iter__(self):
        # This is a workaround for http://bugs.python.org/issue17575
        # Basically httplib will try to iterate over the contents, even
        # if it's a file-like object. This wasn't noticed because we've
        # already exhausted the stream, so iterating over the file immediately
        # stops, which is what we're simulating here.
        return iter([])
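

# A minimal usage sketch (not part of the original module), assuming a local
# file '/tmp/example' at least 2 MB long: expose the second megabyte of the
# file as its own stream and report read progress through a callback.
def _example_read_file_chunk():
    def progress(amount_read):
        print(f'read {amount_read} bytes')

    with ReadFileChunk.from_filename(
        '/tmp/example',
        start_byte=1024 * 1024,  # begin 1 MB into the file
        chunk_size=1024 * 1024,  # expose at most 1 MB
        callback=progress,
    ) as chunk:
        data = chunk.read()  # stops at the chunk boundary, not at EOF
        assert len(data) <= len(chunk)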


class StreamReaderProgress:
    """Wrapper for a read only stream that adds progress callbacks."""

    def __init__(self, stream, callback=None):
        self._stream = stream
        self._callback = callback

    def read(self, *args, **kwargs):
        value = self._stream.read(*args, **kwargs)
        if self._callback is not None:
            self._callback(len(value))
        return value


class OSUtils:
    def get_file_size(self, filename):
        return os.path.getsize(filename)

    def open_file_chunk_reader(self, filename, start_byte, size, callback):
        return ReadFileChunk.from_filename(
            filename, start_byte, size, callback, enable_callback=False
        )

    def open(self, filename, mode):
        return open(filename, mode)

    def remove_file(self, filename):
        """Remove a file, noop if file does not exist."""
        # Unlike os.remove, if the file does not exist,
        # then this method does nothing.
        try:
            os.remove(filename)
        except OSError:
            pass

    def rename_file(self, current_filename, new_filename):
        s3transfer.compat.rename_file(current_filename, new_filename)


class MultipartUploader:
    # These are the extra_args that need to be forwarded onto
    # subsequent upload_parts.
    UPLOAD_PART_ARGS = [
        'SSECustomerKey',
        'SSECustomerAlgorithm',
        'SSECustomerKeyMD5',
        'RequestPayer',
    ]

    def __init__(
        self,
        client,
        config,
        osutil,
        executor_cls=concurrent.futures.ThreadPoolExecutor,
    ):
        self._client = client
        self._config = config
        self._os = osutil
        self._executor_cls = executor_cls

    def _extra_upload_part_args(self, extra_args):
        # Only the args in UPLOAD_PART_ARGS actually need to be passed
        # onto the upload_part calls.
        upload_parts_args = {}
        for key, value in extra_args.items():
            if key in self.UPLOAD_PART_ARGS:
                upload_parts_args[key] = value
        return upload_parts_args

    def upload_file(self, filename, bucket, key, callback, extra_args):
        response = self._client.create_multipart_upload(
            Bucket=bucket, Key=key, **extra_args
        )
        upload_id = response['UploadId']
        try:
            parts = self._upload_parts(
                upload_id, filename, bucket, key, callback, extra_args
            )
        except Exception as e:
            logger.debug(
                "Exception raised while uploading parts, "
                "aborting multipart upload.",
                exc_info=True,
            )
            self._client.abort_multipart_upload(
                Bucket=bucket, Key=key, UploadId=upload_id
            )
            raise S3UploadFailedError(
                "Failed to upload {} to {}: {}".format(
                    filename, '/'.join([bucket, key]), e
                )
            )
        self._client.complete_multipart_upload(
            Bucket=bucket,
            Key=key,
            UploadId=upload_id,
            MultipartUpload={'Parts': parts},
        )

    def _upload_parts(
        self, upload_id, filename, bucket, key, callback, extra_args
    ):
        upload_parts_extra_args = self._extra_upload_part_args(extra_args)
        parts = []
        part_size = self._config.multipart_chunksize
        num_parts = int(
            math.ceil(self._os.get_file_size(filename) / float(part_size))
        )
        max_workers = self._config.max_concurrency
        with self._executor_cls(max_workers=max_workers) as executor:
            upload_partial = functools.partial(
                self._upload_one_part,
                filename,
                bucket,
                key,
                upload_id,
                part_size,
                upload_parts_extra_args,
                callback,
            )
            for part in executor.map(upload_partial, range(1, num_parts + 1)):
                parts.append(part)
        return parts

    def _upload_one_part(
        self,
        filename,
        bucket,
        key,
        upload_id,
        part_size,
        extra_args,
        callback,
        part_number,
    ):
        open_chunk_reader = self._os.open_file_chunk_reader
        with open_chunk_reader(
            filename, part_size * (part_number - 1), part_size, callback
        ) as body:
            response = self._client.upload_part(
                Bucket=bucket,
                Key=key,
                UploadId=upload_id,
                PartNumber=part_number,
                Body=body,
                **extra_args,
            )
            etag = response['ETag']
            return {'ETag': etag, 'PartNumber': part_number}
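

# A small sketch (not part of the original module) of the offset arithmetic
# used by _upload_one_part above: part numbers are 1-based while byte offsets
# are 0-based, so part N starts at part_size * (N - 1).
def _example_part_offsets(file_size=20 * MB, part_size=8 * MB):
    num_parts = int(math.ceil(file_size / float(part_size)))
    # For a 20 MB file with 8 MB parts: parts 1, 2 and 3 start at offsets
    # 0 MB, 8 MB and 16 MB, and the final part carries only the last 4 MB.
    return [
        (part_number, part_size * (part_number - 1))
        for part_number in range(1, num_parts + 1)
    ]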


class ShutdownQueue(queue.Queue):
    """A queue implementation that can be shutdown.

    Shutting down a queue means that this class adds a
    trigger_shutdown method that will trigger all subsequent
    calls to put() to fail with a ``QueueShutdownError``.

    It purposefully deviates from queue.Queue, and is *not* meant
    to be a drop in replacement for ``queue.Queue``.

    """

    def _init(self, maxsize):
        self._shutdown = False
        self._shutdown_lock = threading.Lock()
        # queue.Queue was an old-style class in Python 2, so we don't
        # use super().
        return queue.Queue._init(self, maxsize)

    def trigger_shutdown(self):
        with self._shutdown_lock:
            self._shutdown = True
            logger.debug("The IO queue is now shutdown.")

    def put(self, item):
        # Note: this is not sufficient, it's still possible to deadlock!
        # Need to hook into the condition vars used by this class.
        with self._shutdown_lock:
            if self._shutdown:
                raise QueueShutdownError(
                    "Cannot put item to queue when queue has been shutdown."
                )
        return queue.Queue.put(self, item)


class MultipartDownloader:
    def __init__(
        self,
        client,
        config,
        osutil,
        executor_cls=concurrent.futures.ThreadPoolExecutor,
    ):
        self._client = client
        self._config = config
        self._os = osutil
        self._executor_cls = executor_cls
        self._ioqueue = ShutdownQueue(self._config.max_io_queue)

    def download_file(
        self, bucket, key, filename, object_size, extra_args, callback=None
    ):
        with self._executor_cls(max_workers=2) as controller:
            # 1 thread for the future that manages the downloading of parts,
            # 1 thread for the future that manages IO writes.
            download_parts_handler = functools.partial(
                self._download_file_as_future,
                bucket,
                key,
                filename,
                object_size,
                callback,
            )
            parts_future = controller.submit(download_parts_handler)

            io_writes_handler = functools.partial(
                self._perform_io_writes, filename
            )
            io_future = controller.submit(io_writes_handler)
            results = concurrent.futures.wait(
                [parts_future, io_future],
                return_when=concurrent.futures.FIRST_EXCEPTION,
            )
            self._process_future_results(results)

    def _process_future_results(self, futures):
        finished, unfinished = futures
        for future in finished:
            future.result()

    def _download_file_as_future(
        self, bucket, key, filename, object_size, callback
    ):
        part_size = self._config.multipart_chunksize
        num_parts = int(math.ceil(object_size / float(part_size)))
        max_workers = self._config.max_concurrency
        download_partial = functools.partial(
            self._download_range,
            bucket,
            key,
            filename,
            part_size,
            num_parts,
            callback,
        )
        try:
            with self._executor_cls(max_workers=max_workers) as executor:
                list(executor.map(download_partial, range(num_parts)))
        finally:
            self._ioqueue.put(SHUTDOWN_SENTINEL)

    def _calculate_range_param(self, part_size, part_index, num_parts):
        start_range = part_index * part_size
        if part_index == num_parts - 1:
            end_range = ''
        else:
            end_range = start_range + part_size - 1
        range_param = f'bytes={start_range}-{end_range}'
        return range_param

    def _download_range(
        self, bucket, key, filename, part_size, num_parts, callback, part_index
    ):
        try:
            range_param = self._calculate_range_param(
                part_size, part_index, num_parts
            )

            max_attempts = self._config.num_download_attempts
            last_exception = None
            for i in range(max_attempts):
                try:
                    logger.debug("Making get_object call.")
                    response = self._client.get_object(
                        Bucket=bucket, Key=key, Range=range_param
                    )
                    streaming_body = StreamReaderProgress(
                        response['Body'], callback
                    )
                    buffer_size = 1024 * 16
                    current_index = part_size * part_index
                    for chunk in iter(
                        lambda: streaming_body.read(buffer_size), b''
                    ):
                        self._ioqueue.put((current_index, chunk))
                        current_index += len(chunk)
                    return
                except (
                    socket.timeout,
                    OSError,
                    ReadTimeoutError,
                    IncompleteReadError,
                    ResponseStreamingError,
                ) as e:
                    logger.debug(
                        "Retrying exception caught (%s), "
                        "retrying request, (attempt %s / %s)",
                        e,
                        i,
                        max_attempts,
                        exc_info=True,
                    )
                    last_exception = e
                    continue
            raise RetriesExceededError(last_exception)
        finally:
            logger.debug("EXITING _download_range for part: %s", part_index)

    def _perform_io_writes(self, filename):
        with self._os.open(filename, 'wb') as f:
            while True:
                task = self._ioqueue.get()
                if task is SHUTDOWN_SENTINEL:
                    logger.debug(
                        "Shutdown sentinel received in IO handler, "
                        "shutting down IO handler."
                    )
                    return
                else:
                    try:
                        offset, data = task
                        f.seek(offset)
                        f.write(data)
                    except Exception as e:
                        logger.debug(
                            "Caught exception in IO thread: %s",
                            e,
                            exc_info=True,
                        )
                        self._ioqueue.trigger_shutdown()
                        raise
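

# A worked example (not part of the original module) of the arithmetic in
# MultipartDownloader._calculate_range_param above: a 12-byte object split
# into 3 parts of part_size=5 yields the HTTP Range values below.
def _example_range_params(part_size=5, num_parts=3):
    ranges = []
    for part_index in range(num_parts):
        start = part_index * part_size
        # The final part uses an open-ended range so any trailing bytes
        # beyond a whole part are still included.
        end = '' if part_index == num_parts - 1 else start + part_size - 1
        ranges.append(f'bytes={start}-{end}')
    return ranges  # ['bytes=0-4', 'bytes=5-9', 'bytes=10-']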


class TransferConfig:
    def __init__(
        self,
        multipart_threshold=8 * MB,
        max_concurrency=10,
        multipart_chunksize=8 * MB,
        num_download_attempts=5,
        max_io_queue=100,
    ):
        # Transfers at or above this size use multipart uploads/ranged
        # downloads; smaller transfers use a single request.
        self.multipart_threshold = multipart_threshold
        # Maximum number of threads used to transfer parts in parallel.
        self.max_concurrency = max_concurrency
        # Size of each part in a multipart transfer.
        self.multipart_chunksize = multipart_chunksize
        # Number of attempts made per download before giving up.
        self.num_download_attempts = num_download_attempts
        # Maximum number of buffered chunks waiting for the IO thread.
        self.max_io_queue = max_io_queue
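

# A short sketch (not part of the original module) of how the two size knobs
# interact: multipart_threshold decides *whether* a transfer goes multipart,
# and multipart_chunksize decides how many parts it is split into.
def _example_config_part_count(file_size=100 * MB):
    config = TransferConfig(
        multipart_threshold=8 * MB,  # files at or above 8 MB go multipart
        multipart_chunksize=8 * MB,  # each part is 8 MB
    )
    if file_size < config.multipart_threshold:
        return 1  # a single put_object/get_object call
    # 100 MB / 8 MB -> 13 parts (the last part is only 4 MB).
    return int(math.ceil(file_size / float(config.multipart_chunksize)))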


class S3Transfer:
    ALLOWED_DOWNLOAD_ARGS = [
        'VersionId',
        'SSECustomerAlgorithm',
        'SSECustomerKey',
        'SSECustomerKeyMD5',
        'RequestPayer',
    ]

    ALLOWED_UPLOAD_ARGS = [
        'ACL',
        'CacheControl',
        'ContentDisposition',
        'ContentEncoding',
        'ContentLanguage',
        'ContentType',
        'Expires',
        'GrantFullControl',
        'GrantRead',
        'GrantReadACP',
        'GrantWriteACL',
        'Metadata',
        'RequestPayer',
        'ServerSideEncryption',
        'StorageClass',
        'SSECustomerAlgorithm',
        'SSECustomerKey',
        'SSECustomerKeyMD5',
        'SSEKMSKeyId',
        'SSEKMSEncryptionContext',
        'Tagging',
    ]

    def __init__(self, client, config=None, osutil=None):
        self._client = client
        self._client.meta.events.register(
            'before-call.s3.*', self._update_checksum_context
        )
        if config is None:
            config = TransferConfig()
        self._config = config
        if osutil is None:
            osutil = OSUtils()
        self._osutil = osutil

    def _update_checksum_context(self, params, **kwargs):
        request_context = params.get("context", {})
        checksum_context = request_context.get("checksum", {})
        if "request_algorithm" in checksum_context:
            # Force request checksum algorithm in the header if specified.
            checksum_context["request_algorithm"]["in"] = "header"

    def upload_file(
        self, filename, bucket, key, callback=None, extra_args=None
    ):
        """Upload a file to an S3 object.

        Variants have also been injected into S3 client, Bucket and Object.
        You don't have to use S3Transfer.upload_file() directly.
        """
        if extra_args is None:
            extra_args = {}
        self._validate_all_known_args(extra_args, self.ALLOWED_UPLOAD_ARGS)
        events = self._client.meta.events
        events.register_first(
            'request-created.s3',
            disable_upload_callbacks,
            unique_id='s3upload-callback-disable',
        )
        events.register_last(
            'request-created.s3',
            enable_upload_callbacks,
            unique_id='s3upload-callback-enable',
        )
        if (
            self._osutil.get_file_size(filename)
            >= self._config.multipart_threshold
        ):
            self._multipart_upload(filename, bucket, key, callback, extra_args)
        else:
            self._put_object(filename, bucket, key, callback, extra_args)

    def _put_object(self, filename, bucket, key, callback, extra_args):
        # We're using open_file_chunk_reader so we can take advantage of the
        # progress callback functionality.
        open_chunk_reader = self._osutil.open_file_chunk_reader
        with open_chunk_reader(
            filename,
            0,
            self._osutil.get_file_size(filename),
            callback=callback,
        ) as body:
            self._client.put_object(
                Bucket=bucket, Key=key, Body=body, **extra_args
            )

    def download_file(
        self, bucket, key, filename, extra_args=None, callback=None
    ):
        """Download an S3 object to a file.

        Variants have also been injected into S3 client, Bucket and Object.
        You don't have to use S3Transfer.download_file() directly.
        """
        # This method will issue a ``head_object`` request to determine
        # the size of the S3 object. This is used to determine if the
        # object is downloaded in parallel.
        if extra_args is None:
            extra_args = {}
        self._validate_all_known_args(extra_args, self.ALLOWED_DOWNLOAD_ARGS)
        object_size = self._object_size(bucket, key, extra_args)
        temp_filename = filename + os.extsep + random_file_extension()
        try:
            self._download_file(
                bucket, key, temp_filename, object_size, extra_args, callback
            )
        except Exception:
            logger.debug(
                "Exception caught in download_file, removing partial file: %s",
                temp_filename,
                exc_info=True,
            )
            self._osutil.remove_file(temp_filename)
            raise
        else:
            self._osutil.rename_file(temp_filename, filename)

    def _download_file(
        self, bucket, key, filename, object_size, extra_args, callback
    ):
        if object_size >= self._config.multipart_threshold:
            self._ranged_download(
                bucket, key, filename, object_size, extra_args, callback
            )
        else:
            self._get_object(bucket, key, filename, extra_args, callback)

    def _validate_all_known_args(self, actual, allowed):
        for kwarg in actual:
            if kwarg not in allowed:
                raise ValueError(
                    f"Invalid extra_args key '{kwarg}', "
                    f"must be one of: {', '.join(allowed)}"
                )

    def _ranged_download(
        self, bucket, key, filename, object_size, extra_args, callback
    ):
        downloader = MultipartDownloader(
            self._client, self._config, self._osutil
        )
        downloader.download_file(
            bucket, key, filename, object_size, extra_args, callback
        )

    def _get_object(self, bucket, key, filename, extra_args, callback):
        # precondition: num_download_attempts > 0
        max_attempts = self._config.num_download_attempts
        last_exception = None
        for i in range(max_attempts):
            try:
                return self._do_get_object(
                    bucket, key, filename, extra_args, callback
                )
            except (
                socket.timeout,
                OSError,
                ReadTimeoutError,
                IncompleteReadError,
                ResponseStreamingError,
            ) as e:
                # TODO: we need a way to reset the callback if the
                # download failed.
                logger.debug(
                    "Retrying exception caught (%s), "
                    "retrying request, (attempt %s / %s)",
                    e,
                    i,
                    max_attempts,
                    exc_info=True,
                )
                last_exception = e
                continue
        raise RetriesExceededError(last_exception)

    def _do_get_object(self, bucket, key, filename, extra_args, callback):
        response = self._client.get_object(
            Bucket=bucket, Key=key, **extra_args
        )
        streaming_body = StreamReaderProgress(response['Body'], callback)
        with self._osutil.open(filename, 'wb') as f:
            for chunk in iter(lambda: streaming_body.read(8192), b''):
                f.write(chunk)

    def _object_size(self, bucket, key, extra_args):
        return self._client.head_object(Bucket=bucket, Key=key, **extra_args)[
            'ContentLength'
        ]

    def _multipart_upload(self, filename, bucket, key, callback, extra_args):
        uploader = MultipartUploader(self._client, self._config, self._osutil)
        uploader.upload_file(filename, bucket, key, callback, extra_args)
@@ -0,0 +1,437 @@
# Copyright 2017 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You
# may not use this file except in compliance with the License. A copy of
# the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.
import threading
import time


class RequestExceededException(Exception):
    def __init__(self, requested_amt, retry_time):
        """Error when requested amount exceeds what is allowed

        The request that raised this error should be retried after waiting
        the time specified by ``retry_time``.

        :type requested_amt: int
        :param requested_amt: The originally requested byte amount

        :type retry_time: float
        :param retry_time: The length in time to wait to retry for the
            requested amount
        """
        self.requested_amt = requested_amt
        self.retry_time = retry_time
        msg = (
            f'Request amount {requested_amt} exceeded the amount available. '
            f'Retry in {retry_time}'
        )
        super().__init__(msg)


class RequestToken:
    """A token to pass as an identifier when consuming from the LeakyBucket"""

    pass


class TimeUtils:
    def time(self):
        """Get the current time back

        :rtype: float
        :returns: The current time in seconds
        """
        return time.time()

    def sleep(self, value):
        """Sleep for a designated time

        :type value: float
        :param value: The time to sleep for in seconds
        """
        return time.sleep(value)


class BandwidthLimiter:
    def __init__(self, leaky_bucket, time_utils=None):
        """Limits bandwidth for shared S3 transfers

        :type leaky_bucket: LeakyBucket
        :param leaky_bucket: The leaky bucket to use to limit bandwidth

        :type time_utils: TimeUtils
        :param time_utils: Time utility to use for interacting with time.
        """
        self._leaky_bucket = leaky_bucket
        self._time_utils = time_utils
        if time_utils is None:
            self._time_utils = TimeUtils()

    def get_bandwith_limited_stream(
        self, fileobj, transfer_coordinator, enabled=True
    ):
        """Wraps a fileobj in a bandwidth limited stream wrapper

        :type fileobj: file-like obj
        :param fileobj: The file-like obj to wrap

        :type transfer_coordinator: s3transfer.futures.TransferCoordinator
        :param transfer_coordinator: The coordinator for the general transfer
            that the wrapped stream is a part of

        :type enabled: boolean
        :param enabled: Whether bandwidth limiting should be enabled to start
        """
        stream = BandwidthLimitedStream(
            fileobj, self._leaky_bucket, transfer_coordinator, self._time_utils
        )
        if not enabled:
            stream.disable_bandwidth_limiting()
        return stream


class BandwidthLimitedStream:
    def __init__(
        self,
        fileobj,
        leaky_bucket,
        transfer_coordinator,
        time_utils=None,
        bytes_threshold=256 * 1024,
    ):
        """Limits bandwidth for reads on a wrapped stream

        :type fileobj: file-like object
        :param fileobj: The file-like object to wrap

        :type leaky_bucket: LeakyBucket
        :param leaky_bucket: The leaky bucket to use to throttle reads on
            the stream

        :type transfer_coordinator: s3transfer.futures.TransferCoordinator
        :param transfer_coordinator: The coordinator for the general transfer
            that the wrapped stream is a part of

        :type time_utils: TimeUtils
        :param time_utils: The time utility to use for interacting with time
        """
        self._fileobj = fileobj
        self._leaky_bucket = leaky_bucket
        self._transfer_coordinator = transfer_coordinator
        self._time_utils = time_utils
        if time_utils is None:
            self._time_utils = TimeUtils()
        self._bandwidth_limiting_enabled = True
        self._request_token = RequestToken()
        self._bytes_seen = 0
        self._bytes_threshold = bytes_threshold

    def enable_bandwidth_limiting(self):
        """Enable bandwidth limiting on reads to the stream"""
        self._bandwidth_limiting_enabled = True

    def disable_bandwidth_limiting(self):
        """Disable bandwidth limiting on reads to the stream"""
        self._bandwidth_limiting_enabled = False

    def read(self, amount):
        """Read a specified amount

        Reads will only be throttled if bandwidth limiting is enabled.
        """
        if not self._bandwidth_limiting_enabled:
            return self._fileobj.read(amount)

        # We do not want to be calling consume on every read as the read
        # amounts can be small, causing the lock of the leaky bucket to
        # introduce noticeable overhead. So instead we keep track of
        # how many bytes we have seen and only call consume once we pass a
        # certain threshold.
        self._bytes_seen += amount
        if self._bytes_seen < self._bytes_threshold:
            return self._fileobj.read(amount)

        self._consume_through_leaky_bucket()
        return self._fileobj.read(amount)

    def _consume_through_leaky_bucket(self):
        # NOTE: If the read amounts on the stream are high, it will result
        # in large bursty behavior as there is not an interface for partial
        # reads. However, given the reads on this abstraction are at most
        # 256KB (via downloads), it reduces the burstiness to be small KB
        # bursts at worst.
        while not self._transfer_coordinator.exception:
            try:
                self._leaky_bucket.consume(
                    self._bytes_seen, self._request_token
                )
                self._bytes_seen = 0
                return
            except RequestExceededException as e:
                self._time_utils.sleep(e.retry_time)
        else:
            raise self._transfer_coordinator.exception

    def signal_transferring(self):
        """Signal that data being read is being transferred to S3"""
        self.enable_bandwidth_limiting()

    def signal_not_transferring(self):
        """Signal that data being read is not being transferred to S3"""
        self.disable_bandwidth_limiting()

    def seek(self, where, whence=0):
        self._fileobj.seek(where, whence)

    def tell(self):
        return self._fileobj.tell()

    def close(self):
        if self._bandwidth_limiting_enabled and self._bytes_seen:
            # This handles the case where the file is small enough to never
            # trigger the threshold and thus is never subjected to the
            # leaky bucket on read(). This specifically happens for small
            # uploads. So instead, to account for those bytes, have
            # them go through the leaky bucket when the file gets closed.
            self._consume_through_leaky_bucket()
        self._fileobj.close()

    def __enter__(self):
        return self

    def __exit__(self, *args, **kwargs):
        self.close()


class LeakyBucket:
    def __init__(
        self,
        max_rate,
        time_utils=None,
        rate_tracker=None,
        consumption_scheduler=None,
    ):
        """A leaky bucket abstraction to limit bandwidth consumption

        :type max_rate: int
        :param max_rate: The maximum rate to allow. This rate is in terms of
            bytes per second.

        :type time_utils: TimeUtils
        :param time_utils: The time utility to use for interacting with time

        :type rate_tracker: BandwidthRateTracker
        :param rate_tracker: Tracks bandwidth consumption

        :type consumption_scheduler: ConsumptionScheduler
        :param consumption_scheduler: Schedules consumption retries when
            necessary
        """
        self._max_rate = float(max_rate)
        self._time_utils = time_utils
        if time_utils is None:
            self._time_utils = TimeUtils()
        self._lock = threading.Lock()
        self._rate_tracker = rate_tracker
        if rate_tracker is None:
            self._rate_tracker = BandwidthRateTracker()
        self._consumption_scheduler = consumption_scheduler
        if consumption_scheduler is None:
            self._consumption_scheduler = ConsumptionScheduler()

    def consume(self, amt, request_token):
        """Consume a requested amount

        :type amt: int
        :param amt: The amount of bytes to request to consume

        :type request_token: RequestToken
        :param request_token: The token associated to the consumption
            request that is used to identify the request. So if a
            RequestExceededException is raised, the token should be used
            in subsequent retry consume() requests.

        :raises RequestExceededException: If the consumption amount would
            exceed the maximum allocated bandwidth

        :rtype: int
        :returns: The amount consumed
        """
        with self._lock:
            time_now = self._time_utils.time()
            if self._consumption_scheduler.is_scheduled(request_token):
                return self._release_requested_amt_for_scheduled_request(
                    amt, request_token, time_now
                )
            elif self._projected_to_exceed_max_rate(amt, time_now):
                self._raise_request_exceeded_exception(
                    amt, request_token, time_now
                )
            else:
                return self._release_requested_amt(amt, time_now)

    def _projected_to_exceed_max_rate(self, amt, time_now):
        projected_rate = self._rate_tracker.get_projected_rate(amt, time_now)
        return projected_rate > self._max_rate

    def _release_requested_amt_for_scheduled_request(
        self, amt, request_token, time_now
    ):
        self._consumption_scheduler.process_scheduled_consumption(
            request_token
        )
        return self._release_requested_amt(amt, time_now)

    def _raise_request_exceeded_exception(self, amt, request_token, time_now):
        allocated_time = amt / float(self._max_rate)
        retry_time = self._consumption_scheduler.schedule_consumption(
            amt, request_token, allocated_time
        )
        raise RequestExceededException(
            requested_amt=amt, retry_time=retry_time
        )

    def _release_requested_amt(self, amt, time_now):
        self._rate_tracker.record_consumption_rate(amt, time_now)
        return amt
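

# A minimal wiring sketch (not part of the original module): cap reads on a
# file-like object at roughly 1 MB/s. The transfer_coordinator argument is
# assumed to be an s3transfer.futures.TransferCoordinator (or anything with
# an ``exception`` attribute); in real use the transfer manager supplies it.
def _example_throttled_read(fileobj, transfer_coordinator):
    bucket = LeakyBucket(max_rate=1024 * 1024)  # bytes per second
    limiter = BandwidthLimiter(bucket)
    stream = limiter.get_bandwith_limited_stream(fileobj, transfer_coordinator)
    with stream:
        # Reads sleep via the leaky bucket once the projected rate would
        # exceed max_rate.
        while stream.read(16 * 1024):
            pass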


class ConsumptionScheduler:
    def __init__(self):
        """Schedules when to consume a desired amount"""
        self._tokens_to_scheduled_consumption = {}
        self._total_wait = 0

    def is_scheduled(self, token):
        """Indicates if a consumption request has been scheduled

        :type token: RequestToken
        :param token: The token associated to the consumption
            request that is used to identify the request.
        """
        return token in self._tokens_to_scheduled_consumption

    def schedule_consumption(self, amt, token, time_to_consume):
        """Schedules a wait time to be able to consume an amount

        :type amt: int
        :param amt: The amount of bytes scheduled to be consumed

        :type token: RequestToken
        :param token: The token associated to the consumption
            request that is used to identify the request.

        :type time_to_consume: float
        :param time_to_consume: The desired time it should take for that
            specific request amount to be consumed, regardless of previously
            scheduled consumption requests

        :rtype: float
        :returns: The amount of time to wait for the specific request before
            actually consuming the specified amount.
        """
        self._total_wait += time_to_consume
        self._tokens_to_scheduled_consumption[token] = {
            'wait_duration': self._total_wait,
            'time_to_consume': time_to_consume,
        }
        return self._total_wait

    def process_scheduled_consumption(self, token):
        """Processes a scheduled consumption request that has completed

        :type token: RequestToken
        :param token: The token associated to the consumption
            request that is used to identify the request.
        """
        scheduled_retry = self._tokens_to_scheduled_consumption.pop(token)
        self._total_wait = max(
            self._total_wait - scheduled_retry['time_to_consume'], 0
        )
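

# A worked example (not part of the original module) of the scheduler's
# bookkeeping: wait times accumulate across outstanding requests and are
# released again as each scheduled request is processed.
def _example_scheduler_waits():
    scheduler = ConsumptionScheduler()
    token_a, token_b = RequestToken(), RequestToken()
    first_wait = scheduler.schedule_consumption(1024, token_a, 0.5)   # 0.5
    second_wait = scheduler.schedule_consumption(1024, token_b, 0.5)  # 1.0
    scheduler.process_scheduled_consumption(token_a)  # total wait back to 0.5
    return first_wait, second_wait, scheduler.is_scheduled(token_b)  # ..., True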


class BandwidthRateTracker:
    def __init__(self, alpha=0.8):
        """Tracks the rate of bandwidth consumption

        :type alpha: float
        :param alpha: The constant to use in calculating the exponential
            moving average of the bandwidth rate. Specifically it is used
            in the following calculation:

            current_rate = alpha * new_rate + (1 - alpha) * current_rate

            The value of this constant should be between 0 and 1.
        """
        self._alpha = alpha
        self._last_time = None
        self._current_rate = None

    @property
    def current_rate(self):
        """The current transfer rate

        :rtype: float
        :returns: The current tracked transfer rate
        """
        if self._last_time is None:
            return 0.0
        return self._current_rate

    def get_projected_rate(self, amt, time_at_consumption):
        """Get the projected rate using a provided amount and time

        :type amt: int
        :param amt: The proposed amount to consume

        :type time_at_consumption: float
        :param time_at_consumption: The proposed time to consume at

        :rtype: float
        :returns: The consumption rate if that amt and time were consumed
        """
        if self._last_time is None:
            return 0.0
        return self._calculate_exponential_moving_average_rate(
            amt, time_at_consumption
        )

    def record_consumption_rate(self, amt, time_at_consumption):
        """Record the consumption rate based off the amount and time point

        :type amt: int
        :param amt: The amount that got consumed

        :type time_at_consumption: float
        :param time_at_consumption: The time at which the amount was consumed
        """
        if self._last_time is None:
            self._last_time = time_at_consumption
            self._current_rate = 0.0
            return
        self._current_rate = self._calculate_exponential_moving_average_rate(
            amt, time_at_consumption
        )
        self._last_time = time_at_consumption

    def _calculate_rate(self, amt, time_at_consumption):
        time_delta = time_at_consumption - self._last_time
        if time_delta <= 0:
            # While it is really unlikely to see this in an actual transfer,
            # we do not want to be returning back a negative rate or try to
            # divide the amount by zero. So instead return back an infinite
            # rate as the time delta is infinitesimally small.
            return float('inf')
        return amt / time_delta

    def _calculate_exponential_moving_average_rate(
        self, amt, time_at_consumption
    ):
        new_rate = self._calculate_rate(amt, time_at_consumption)
        return self._alpha * new_rate + (1 - self._alpha) * self._current_rate
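

# A numeric sketch (not part of the original module) of the moving-average
# update: with alpha=0.8 the tracked rate moves most of the way toward each
# newly observed rate.
def _example_ema_rate():
    tracker = BandwidthRateTracker(alpha=0.8)
    tracker.record_consumption_rate(0, time_at_consumption=0.0)  # seeds 0.0
    tracker.record_consumption_rate(100, time_at_consumption=1.0)
    # new_rate = 100 bytes / 1 s, so rate = 0.8 * 100 + 0.2 * 0.0 = 80.0
    return tracker.current_rate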
@@ -0,0 +1,94 @@
# Copyright 2016 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You
# may not use this file except in compliance with the License. A copy of
# the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.
import errno
import inspect
import os
import socket
import sys

from botocore.compat import six

if sys.platform.startswith('win'):

    def rename_file(current_filename, new_filename):
        try:
            os.remove(new_filename)
        except OSError as e:
            if not e.errno == errno.ENOENT:
                # We only want to ignore trying to remove
                # a file that does not exist. If it fails
                # for any other reason we should be propagating
                # that exception.
                raise
        os.rename(current_filename, new_filename)

else:
    rename_file = os.rename


def accepts_kwargs(func):
    return inspect.getfullargspec(func)[2]


# In python 3, socket.error is OSError, which is too general
# for what we want (i.e. FileNotFoundError is a subclass of OSError).
# In python 3, all the socket related errors were moved into the newly
# created ConnectionError.
SOCKET_ERROR = ConnectionError
MAXINT = None


def seekable(fileobj):
    """Backwards compat function to determine if a fileobj is seekable

    :param fileobj: The file-like object to determine if seekable

    :returns: True, if seekable. False, otherwise.
    """
    # If the fileobj has a seekable attr, try calling the seekable()
    # method on it.
    if hasattr(fileobj, 'seekable'):
        return fileobj.seekable()
    # If there is no seekable attr, check if the object has seek() and
    # tell() methods. If it does, try seeking to the current position.
    elif hasattr(fileobj, 'seek') and hasattr(fileobj, 'tell'):
        try:
            fileobj.seek(0, 1)
            return True
        except OSError:
            # If an io related error was thrown, then it is not seekable.
            return False
    # Else, the fileobj is not seekable
    return False
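

# A small sketch (not part of the original module): in-memory buffers report
# as seekable through their seekable() method, while objects with no seek
# support fall through to False.
def _example_seekable():
    import io

    assert seekable(io.BytesIO(b'data')) is True

    class _NotSeekable:
        pass

    assert seekable(_NotSeekable()) is False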
|
||||
|
||||
|
||||
def readable(fileobj):
|
||||
"""Determines whether or not a file-like object is readable.
|
||||
|
||||
:param fileobj: The file-like object to determine if readable
|
||||
|
||||
:returns: True, if readable. False otherwise.
|
||||
"""
|
||||
if hasattr(fileobj, 'readable'):
|
||||
return fileobj.readable()
|
||||
|
||||
return hasattr(fileobj, 'read')
|
||||
|
||||
|
||||
def fallocate(fileobj, size):
|
||||
if hasattr(os, 'posix_fallocate'):
|
||||
os.posix_fallocate(fileobj.fileno(), 0, size)
|
||||
else:
|
||||
fileobj.truncate(size)
|
||||
|
||||
|
||||
# Import at end of file to avoid circular dependencies
|
||||
from multiprocessing.managers import BaseManager # noqa: F401,E402
|
||||
@@ -0,0 +1,38 @@
|
||||
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License"). You
|
||||
# may not use this file except in compliance with the License. A copy of
|
||||
# the License is located at
|
||||
#
|
||||
# http://aws.amazon.com/apache2.0/
|
||||
#
|
||||
# or in the "license" file accompanying this file. This file is
|
||||
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
|
||||
# ANY KIND, either express or implied. See the License for the specific
|
||||
# language governing permissions and limitations under the License.
|
||||
import s3transfer
|
||||
|
||||
KB = 1024
|
||||
MB = KB * KB
|
||||
GB = MB * KB
|
||||
|
||||
ALLOWED_DOWNLOAD_ARGS = [
|
||||
'ChecksumMode',
|
||||
'VersionId',
|
||||
'SSECustomerAlgorithm',
|
||||
'SSECustomerKey',
|
||||
'SSECustomerKeyMD5',
|
||||
'RequestPayer',
|
||||
'ExpectedBucketOwner',
|
||||
]
|
||||
|
||||
FULL_OBJECT_CHECKSUM_ARGS = [
|
||||
'ChecksumCRC32',
|
||||
'ChecksumCRC32C',
|
||||
'ChecksumCRC64NVME',
|
||||
'ChecksumSHA1',
|
||||
'ChecksumSHA256',
|
||||
]
|
||||
|
||||
USER_AGENT = f's3transfer/{s3transfer.__version__}'
|
||||
PROCESS_USER_AGENT = f'{USER_AGENT} processpool'
|
||||
@@ -0,0 +1,388 @@
|
||||
# Copyright 2016 Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License"). You
|
||||
# may not use this file except in compliance with the License. A copy of
|
||||
# the License is located at
|
||||
#
|
||||
# http://aws.amazon.com/apache2.0/
|
||||
#
|
||||
# or in the "license" file accompanying this file. This file is
|
||||
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
|
||||
# ANY KIND, either express or implied. See the License for the specific
|
||||
# language governing permissions and limitations under the License.
|
||||
import copy
|
||||
import math
|
||||
|
||||
from s3transfer.tasks import (
|
||||
CompleteMultipartUploadTask,
|
||||
CreateMultipartUploadTask,
|
||||
SubmissionTask,
|
||||
Task,
|
||||
)
|
||||
from s3transfer.utils import (
|
||||
ChunksizeAdjuster,
|
||||
calculate_range_parameter,
|
||||
get_callbacks,
|
||||
get_filtered_dict,
|
||||
)
|
||||
|
||||
|
||||
class CopySubmissionTask(SubmissionTask):
|
||||
"""Task for submitting tasks to execute a copy"""
|
||||
|
||||
EXTRA_ARGS_TO_HEAD_ARGS_MAPPING = {
|
||||
'CopySourceIfMatch': 'IfMatch',
|
||||
'CopySourceIfModifiedSince': 'IfModifiedSince',
|
||||
'CopySourceIfNoneMatch': 'IfNoneMatch',
|
||||
'CopySourceIfUnmodifiedSince': 'IfUnmodifiedSince',
|
||||
'CopySourceSSECustomerKey': 'SSECustomerKey',
|
||||
'CopySourceSSECustomerAlgorithm': 'SSECustomerAlgorithm',
|
||||
'CopySourceSSECustomerKeyMD5': 'SSECustomerKeyMD5',
|
||||
'RequestPayer': 'RequestPayer',
|
||||
'ExpectedBucketOwner': 'ExpectedBucketOwner',
|
||||
}
|
||||
|
||||
UPLOAD_PART_COPY_ARGS = [
|
||||
'CopySourceIfMatch',
|
||||
'CopySourceIfModifiedSince',
|
||||
'CopySourceIfNoneMatch',
|
||||
'CopySourceIfUnmodifiedSince',
|
||||
'CopySourceSSECustomerKey',
|
||||
'CopySourceSSECustomerAlgorithm',
|
||||
'CopySourceSSECustomerKeyMD5',
|
||||
'SSECustomerKey',
|
||||
'SSECustomerAlgorithm',
|
||||
'SSECustomerKeyMD5',
|
||||
'RequestPayer',
|
||||
'ExpectedBucketOwner',
|
||||
]
|
||||
|
||||
CREATE_MULTIPART_ARGS_BLACKLIST = [
|
||||
'CopySourceIfMatch',
|
||||
'CopySourceIfModifiedSince',
|
||||
'CopySourceIfNoneMatch',
|
||||
'CopySourceIfUnmodifiedSince',
|
||||
'CopySourceSSECustomerKey',
|
||||
'CopySourceSSECustomerAlgorithm',
|
||||
'CopySourceSSECustomerKeyMD5',
|
||||
'MetadataDirective',
|
||||
'TaggingDirective',
|
||||
]
|
||||
|
||||
COMPLETE_MULTIPART_ARGS = [
|
||||
'SSECustomerKey',
|
||||
'SSECustomerAlgorithm',
|
||||
'SSECustomerKeyMD5',
|
||||
'RequestPayer',
|
||||
'ExpectedBucketOwner',
|
||||
]
|
||||
|
||||
def _submit(
|
||||
self, client, config, osutil, request_executor, transfer_future
|
||||
):
|
||||
"""
|
||||
:param client: The client associated with the transfer manager
|
||||
|
||||
:type config: s3transfer.manager.TransferConfig
|
||||
:param config: The transfer config associated with the transfer
|
||||
manager
|
||||
|
||||
:type osutil: s3transfer.utils.OSUtil
|
||||
:param osutil: The os utility associated to the transfer manager
|
||||
|
||||
:type request_executor: s3transfer.futures.BoundedExecutor
|
||||
:param request_executor: The request executor associated with the
|
||||
transfer manager
|
||||
|
||||
:type transfer_future: s3transfer.futures.TransferFuture
|
||||
:param transfer_future: The transfer future associated with the
|
||||
transfer request that tasks are being submitted for
|
||||
"""
|
||||
# Determine the size if it was not provided
|
||||
if transfer_future.meta.size is None:
|
||||
# If a size was not provided figure out the size for the
|
||||
# user. Note that we will only use the client provided to
|
||||
# the TransferManager. If the object is outside of the region
|
||||
# of the client, they may have to provide the file size themselves
|
||||
# with a completely new client.
|
||||
call_args = transfer_future.meta.call_args
|
||||
head_object_request = (
|
||||
self._get_head_object_request_from_copy_source(
|
||||
call_args.copy_source
|
||||
)
|
||||
)
|
||||
extra_args = call_args.extra_args
|
||||
|
||||
# Map any values that may be used in the head object that is
|
||||
# used in the copy object
|
||||
for param, value in extra_args.items():
|
||||
                if param in self.EXTRA_ARGS_TO_HEAD_ARGS_MAPPING:
                    head_object_request[
                        self.EXTRA_ARGS_TO_HEAD_ARGS_MAPPING[param]
                    ] = value

            response = call_args.source_client.head_object(
                **head_object_request
            )
            transfer_future.meta.provide_transfer_size(
                response['ContentLength']
            )

        # If it is greater than the threshold do a multipart copy; otherwise
        # do a regular copy object.
        if transfer_future.meta.size < config.multipart_threshold:
            self._submit_copy_request(
                client, config, osutil, request_executor, transfer_future
            )
        else:
            self._submit_multipart_request(
                client, config, osutil, request_executor, transfer_future
            )

    def _submit_copy_request(
        self, client, config, osutil, request_executor, transfer_future
    ):
        call_args = transfer_future.meta.call_args

        # Get the needed progress callbacks for the task
        progress_callbacks = get_callbacks(transfer_future, 'progress')

        # Submit the request of a single copy.
        self._transfer_coordinator.submit(
            request_executor,
            CopyObjectTask(
                transfer_coordinator=self._transfer_coordinator,
                main_kwargs={
                    'client': client,
                    'copy_source': call_args.copy_source,
                    'bucket': call_args.bucket,
                    'key': call_args.key,
                    'extra_args': call_args.extra_args,
                    'callbacks': progress_callbacks,
                    'size': transfer_future.meta.size,
                },
                is_final=True,
            ),
        )

    def _submit_multipart_request(
        self, client, config, osutil, request_executor, transfer_future
    ):
        call_args = transfer_future.meta.call_args

        # Submit the request to create a multipart upload and make sure it
        # does not include any of the arguments used for copy part.
        create_multipart_extra_args = {}
        for param, val in call_args.extra_args.items():
            if param not in self.CREATE_MULTIPART_ARGS_BLACKLIST:
                create_multipart_extra_args[param] = val

        create_multipart_future = self._transfer_coordinator.submit(
            request_executor,
            CreateMultipartUploadTask(
                transfer_coordinator=self._transfer_coordinator,
                main_kwargs={
                    'client': client,
                    'bucket': call_args.bucket,
                    'key': call_args.key,
                    'extra_args': create_multipart_extra_args,
                },
            ),
        )

        # Determine how many parts are needed based on filesize and
        # desired chunksize.
        part_size = config.multipart_chunksize
        adjuster = ChunksizeAdjuster()
        part_size = adjuster.adjust_chunksize(
            part_size, transfer_future.meta.size
        )
        num_parts = int(
            math.ceil(transfer_future.meta.size / float(part_size))
        )

        # Submit requests to upload the parts of the file.
        part_futures = []
        progress_callbacks = get_callbacks(transfer_future, 'progress')

        for part_number in range(1, num_parts + 1):
            extra_part_args = self._extra_upload_part_args(
                call_args.extra_args
            )
            # The part number for upload part starts at 1 while the
            # range parameter starts at zero, so just subtract 1 from
            # the part number.
            extra_part_args['CopySourceRange'] = calculate_range_parameter(
                part_size,
                part_number - 1,
                num_parts,
                transfer_future.meta.size,
            )
            # Get the size of the part copy as well for the progress
            # callbacks.
            size = self._get_transfer_size(
                part_size,
                part_number - 1,
                num_parts,
                transfer_future.meta.size,
            )
            # Get the checksum algorithm of the multipart request.
            checksum_algorithm = call_args.extra_args.get("ChecksumAlgorithm")
            part_futures.append(
                self._transfer_coordinator.submit(
                    request_executor,
                    CopyPartTask(
                        transfer_coordinator=self._transfer_coordinator,
                        main_kwargs={
                            'client': client,
                            'copy_source': call_args.copy_source,
                            'bucket': call_args.bucket,
                            'key': call_args.key,
                            'part_number': part_number,
                            'extra_args': extra_part_args,
                            'callbacks': progress_callbacks,
                            'size': size,
                            'checksum_algorithm': checksum_algorithm,
                        },
                        pending_main_kwargs={
                            'upload_id': create_multipart_future
                        },
                    ),
                )
            )

        complete_multipart_extra_args = self._extra_complete_multipart_args(
            call_args.extra_args
        )
        # Submit the request to complete the multipart upload.
        self._transfer_coordinator.submit(
            request_executor,
            CompleteMultipartUploadTask(
                transfer_coordinator=self._transfer_coordinator,
                main_kwargs={
                    'client': client,
                    'bucket': call_args.bucket,
                    'key': call_args.key,
                    'extra_args': complete_multipart_extra_args,
                },
                pending_main_kwargs={
                    'upload_id': create_multipart_future,
                    'parts': part_futures,
                },
                is_final=True,
            ),
        )

    def _get_head_object_request_from_copy_source(self, copy_source):
        if isinstance(copy_source, dict):
            return copy.copy(copy_source)
        else:
            raise TypeError(
                'Expecting dictionary formatted: '
                '{"Bucket": bucket_name, "Key": key} '
                f'but got {copy_source} or type {type(copy_source)}.'
            )

    def _extra_upload_part_args(self, extra_args):
        # Only the args in UPLOAD_PART_COPY_ARGS actually need to be passed
        # onto the upload_part_copy calls.
        return get_filtered_dict(extra_args, self.UPLOAD_PART_COPY_ARGS)

    def _extra_complete_multipart_args(self, extra_args):
        return get_filtered_dict(extra_args, self.COMPLETE_MULTIPART_ARGS)

    def _get_transfer_size(
        self, part_size, part_index, num_parts, total_transfer_size
    ):
        if part_index == num_parts - 1:
            # The last part may differ in size from the rest of the
            # parts.
            return total_transfer_size - (part_index * part_size)
        return part_size
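
    # Illustrative sketch (not part of the library): for a hypothetical
    # 25 MiB copy with an 8 MiB part size, num_parts is ceil(25 / 8) = 4 and
    # the sizes reported to the progress callbacks would be:
    #
    #   _get_transfer_size(8 MiB, 0, 4, 25 MiB) -> 8 MiB
    #   _get_transfer_size(8 MiB, 3, 4, 25 MiB) -> 25 - 3 * 8 = 1 MiB
    #
    # i.e. every part is part_size except the final part, which gets the
    # remainder.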


class CopyObjectTask(Task):
    """Task to do a non-multipart copy"""

    def _main(
        self, client, copy_source, bucket, key, extra_args, callbacks, size
    ):
        """
        :param client: The client to use when calling CopyObject
        :param copy_source: The CopySource parameter to use
        :param bucket: The name of the bucket to copy to
        :param key: The name of the key to copy to
        :param extra_args: A dictionary of any extra arguments that may be
            used in the copy.
        :param callbacks: List of callbacks to call after copy
        :param size: The size of the transfer. This value is passed into
            the callbacks

        """
        client.copy_object(
            CopySource=copy_source, Bucket=bucket, Key=key, **extra_args
        )
        for callback in callbacks:
            callback(bytes_transferred=size)


class CopyPartTask(Task):
    """Task to upload a part in a multipart copy"""

    def _main(
        self,
        client,
        copy_source,
        bucket,
        key,
        upload_id,
        part_number,
        extra_args,
        callbacks,
        size,
        checksum_algorithm=None,
    ):
        """
        :param client: The client to use when calling UploadPartCopy
        :param copy_source: The CopySource parameter to use
        :param bucket: The name of the bucket to upload to
        :param key: The name of the key to upload to
        :param upload_id: The id of the upload
        :param part_number: The number representing the part of the multipart
            upload
        :param extra_args: A dictionary of any extra arguments that may be
            used in the upload.
        :param callbacks: List of callbacks to call after copy part
        :param size: The size of the transfer. This value is passed into
            the callbacks
        :param checksum_algorithm: The algorithm that was used to create the
            multipart upload

        :rtype: dict
        :returns: A dictionary representing a part::

            {'ETag': etag_value, 'PartNumber': part_number}

            This value can be appended to a list to be used to complete
            the multipart upload. If a checksum is in the response,
            it will also be included.
        """
        response = client.upload_part_copy(
            CopySource=copy_source,
            Bucket=bucket,
            Key=key,
            UploadId=upload_id,
            PartNumber=part_number,
            **extra_args,
        )
        for callback in callbacks:
            callback(bytes_transferred=size)
        etag = response['CopyPartResult']['ETag']
        part_metadata = {'ETag': etag, 'PartNumber': part_number}
        if checksum_algorithm:
            checksum_member = f'Checksum{checksum_algorithm.upper()}'
            if checksum_member in response['CopyPartResult']:
                part_metadata[checksum_member] = response['CopyPartResult'][
                    checksum_member
                ]
        return part_metadata
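
# Illustrative sketch (not part of the library): CompleteMultipartUploadTask
# ultimately receives the list built from CopyPartTask results, shaped like:
#
#   parts = [
#       {'ETag': '"abc123"', 'PartNumber': 1},
#       {'ETag': '"def456"', 'PartNumber': 2, 'ChecksumCRC32': 'x1d+fw=='},
#   ]
#   client.complete_multipart_upload(
#       Bucket=bucket, Key=key, UploadId=upload_id,
#       MultipartUpload={'Parts': parts},
#   )
#
# The values here are made-up placeholders; the checksum member only appears
# when the multipart upload was created with a ChecksumAlgorithm.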
@@ -0,0 +1,992 @@
# Copyright 2021 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You
# may not use this file except in compliance with the License. A copy of
# the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.
import logging
import re
import threading
from io import BytesIO

import awscrt.http
import awscrt.s3
import botocore.awsrequest
import botocore.session
from awscrt.auth import (
    AwsCredentials,
    AwsCredentialsProvider,
    AwsSigningAlgorithm,
    AwsSigningConfig,
)
from awscrt.io import (
    ClientBootstrap,
    ClientTlsContext,
    DefaultHostResolver,
    EventLoopGroup,
    TlsContextOptions,
)
from awscrt.s3 import S3Client, S3RequestTlsMode, S3RequestType
from botocore import UNSIGNED
from botocore.compat import urlsplit
from botocore.config import Config
from botocore.exceptions import NoCredentialsError
from botocore.utils import ArnParser, InvalidArnException

from s3transfer.constants import FULL_OBJECT_CHECKSUM_ARGS, MB
from s3transfer.exceptions import TransferNotDoneError
from s3transfer.futures import BaseTransferFuture, BaseTransferMeta
from s3transfer.manager import TransferManager
from s3transfer.utils import (
    CallArgs,
    OSUtils,
    create_nested_client,
    get_callbacks,
    is_s3express_bucket,
)

logger = logging.getLogger(__name__)

CRT_S3_PROCESS_LOCK = None


def acquire_crt_s3_process_lock(name):
    # Currently, the CRT S3 client performs best when there is only one
    # instance of it running on a host. This lock allows an application to
    # signal across processes whether there is another process of the same
    # application using the CRT S3 client and prevent spawning more than
    # one CRT S3 client running on the system for that application.
    #
    # NOTE: When acquiring the CRT process lock, the lock is automatically
    # released when the lock object is garbage collected. So, the CRT process
    # lock is set as a global so that it is not unintentionally garbage
    # collected/released if the reference to the lock is lost.
    global CRT_S3_PROCESS_LOCK
    if CRT_S3_PROCESS_LOCK is None:
        crt_lock = awscrt.s3.CrossProcessLock(name)
        try:
            crt_lock.acquire()
        except RuntimeError:
            # If there is another process that is holding the lock, the CRT
            # raises a RuntimeError. We return None here to signal that our
            # current process was not able to acquire the lock.
            return None
        CRT_S3_PROCESS_LOCK = crt_lock
    return CRT_S3_PROCESS_LOCK
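
# Illustrative sketch (not part of the library): an application would
# typically guard CRT client creation with this lock, e.g.:
#
#   if acquire_crt_s3_process_lock('my-app') is None:
#       # Another process of this application already owns the CRT client;
#       # fall back to the classic (non-CRT) transfer path.
#       ...
#
# The application name 'my-app' is a made-up placeholder.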


def create_s3_crt_client(
    region,
    crt_credentials_provider=None,
    num_threads=None,
    target_throughput=None,
    part_size=8 * MB,
    use_ssl=True,
    verify=None,
):
    """
    :type region: str
    :param region: The region used for signing.

    :type crt_credentials_provider:
        Optional[awscrt.auth.AwsCredentialsProvider]
    :param crt_credentials_provider: CRT AWS credentials provider
        to use to sign requests. If not set, requests will not be signed.

    :type num_threads: Optional[int]
    :param num_threads: Number of worker threads generated. Default
        is the number of processors in the machine.

    :type target_throughput: Optional[int]
    :param target_throughput: Throughput target in bytes per second.
        By default, the CRT will automatically attempt to choose a target
        throughput that matches the system's maximum network throughput.
        Currently, if the CRT is unable to determine the maximum network
        throughput, a fallback target throughput of ``1_250_000_000`` bytes
        per second (which translates to 10 gigabits per second, or 1.16
        gibibytes per second) is used. To set a specific target
        throughput, set a value for this parameter.

    :type part_size: Optional[int]
    :param part_size: Size, in bytes, of parts that files will be downloaded
        or uploaded in.

    :type use_ssl: boolean
    :param use_ssl: Whether or not to use SSL. By default, SSL is used.
        Note that not all services support non-SSL connections.

    :type verify: Optional[boolean/string]
    :param verify: Whether or not to verify SSL certificates.
        By default SSL certificates are verified. You can provide the
        following values:

        * False - do not validate SSL certificates. SSL will still be
          used (unless use_ssl is False), but SSL certificates
          will not be verified.
        * path/to/cert/bundle.pem - A filename of the CA cert bundle to
          use. Specify this argument if you want to use a custom CA cert
          bundle instead of the default one on your system.
    """
    event_loop_group = EventLoopGroup(num_threads)
    host_resolver = DefaultHostResolver(event_loop_group)
    bootstrap = ClientBootstrap(event_loop_group, host_resolver)
    tls_connection_options = None

    tls_mode = (
        S3RequestTlsMode.ENABLED if use_ssl else S3RequestTlsMode.DISABLED
    )
    if verify is not None:
        tls_ctx_options = TlsContextOptions()
        if verify:
            tls_ctx_options.override_default_trust_store_from_path(
                ca_filepath=verify
            )
        else:
            tls_ctx_options.verify_peer = False
        client_tls_option = ClientTlsContext(tls_ctx_options)
        tls_connection_options = client_tls_option.new_connection_options()
    target_gbps = _get_crt_throughput_target_gbps(
        provided_throughput_target_bytes=target_throughput
    )
    return S3Client(
        bootstrap=bootstrap,
        region=region,
        credential_provider=crt_credentials_provider,
        part_size=part_size,
        tls_mode=tls_mode,
        tls_connection_options=tls_connection_options,
        throughput_target_gbps=target_gbps,
        enable_s3express=True,
    )
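
# Illustrative sketch (not part of the library): a typical caller resolves
# credentials through botocore and wires them in, leaving everything else
# at its default:
#
#   wrapper = BotocoreCRTCredentialsWrapper(
#       botocore.session.Session().get_credentials()
#   )
#   crt_client = create_s3_crt_client(
#       'us-west-2',
#       crt_credentials_provider=wrapper.to_crt_credentials_provider(),
#   )
#
# BotocoreCRTCredentialsWrapper is defined later in this module.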


def _get_crt_throughput_target_gbps(provided_throughput_target_bytes=None):
    if provided_throughput_target_bytes is None:
        target_gbps = awscrt.s3.get_recommended_throughput_target_gbps()
        logger.debug(
            'Recommended CRT throughput target in gbps: %s', target_gbps
        )
        if target_gbps is None:
            target_gbps = 10.0
    else:
        # NOTE: The GB constant in s3transfer is technically a gibibyte. The
        # GB constant is not used here because the CRT interprets gigabits
        # for networking as a base power of 10
        # (i.e. 1000 ** 3 instead of 1024 ** 3).
        target_gbps = provided_throughput_target_bytes * 8 / 1_000_000_000
    logger.debug('Using CRT throughput target in gbps: %s', target_gbps)
    return target_gbps
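
# Illustrative sketch (not part of the library): the bytes -> gigabits
# conversion above, worked through for the documented fallback value:
#
#   _get_crt_throughput_target_gbps(1_250_000_000)
#   == 1_250_000_000 * 8 / 1_000_000_000 == 10.0  # gigabits per second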


class CRTTransferManager:
    ALLOWED_DOWNLOAD_ARGS = TransferManager.ALLOWED_DOWNLOAD_ARGS
    ALLOWED_UPLOAD_ARGS = TransferManager.ALLOWED_UPLOAD_ARGS
    ALLOWED_DELETE_ARGS = TransferManager.ALLOWED_DELETE_ARGS

    VALIDATE_SUPPORTED_BUCKET_VALUES = True

    _UNSUPPORTED_BUCKET_PATTERNS = TransferManager._UNSUPPORTED_BUCKET_PATTERNS

    def __init__(self, crt_s3_client, crt_request_serializer, osutil=None):
        """A transfer manager interface for Amazon S3 on top of the CRT
        S3 client.

        :type crt_s3_client: awscrt.s3.S3Client
        :param crt_s3_client: The CRT s3 client, handling all the
            HTTP requests and functions under the hood

        :type crt_request_serializer: s3transfer.crt.BaseCRTRequestSerializer
        :param crt_request_serializer: Serializer, generates unsigned crt HTTP
            request.

        :type osutil: s3transfer.utils.OSUtils
        :param osutil: OSUtils object to use for os-related behavior when
            using with transfer manager.
        """
        self._osutil = osutil
        if self._osutil is None:
            self._osutil = OSUtils()
        self._crt_s3_client = crt_s3_client
        self._s3_args_creator = S3ClientArgsCreator(
            crt_request_serializer, self._osutil
        )
        self._crt_exception_translator = (
            crt_request_serializer.translate_crt_exception
        )
        self._future_coordinators = []
        self._semaphore = threading.Semaphore(128)  # not configurable
        # A counter to create unique id's for each transfer submitted.
        self._id_counter = 0

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, *args):
        cancel = False
        if exc_type:
            cancel = True
        self._shutdown(cancel)

    def download(
        self, bucket, key, fileobj, extra_args=None, subscribers=None
    ):
        if extra_args is None:
            extra_args = {}
        if subscribers is None:
            subscribers = {}
        self._validate_all_known_args(extra_args, self.ALLOWED_DOWNLOAD_ARGS)
        self._validate_if_bucket_supported(bucket)
        callargs = CallArgs(
            bucket=bucket,
            key=key,
            fileobj=fileobj,
            extra_args=extra_args,
            subscribers=subscribers,
        )
        return self._submit_transfer("get_object", callargs)

    def upload(self, fileobj, bucket, key, extra_args=None, subscribers=None):
        if extra_args is None:
            extra_args = {}
        if subscribers is None:
            subscribers = {}
        self._validate_all_known_args(extra_args, self.ALLOWED_UPLOAD_ARGS)
        self._validate_if_bucket_supported(bucket)
        self._validate_checksum_algorithm_supported(extra_args)
        callargs = CallArgs(
            bucket=bucket,
            key=key,
            fileobj=fileobj,
            extra_args=extra_args,
            subscribers=subscribers,
        )
        return self._submit_transfer("put_object", callargs)

    def delete(self, bucket, key, extra_args=None, subscribers=None):
        if extra_args is None:
            extra_args = {}
        if subscribers is None:
            subscribers = {}
        self._validate_all_known_args(extra_args, self.ALLOWED_DELETE_ARGS)
        self._validate_if_bucket_supported(bucket)
        callargs = CallArgs(
            bucket=bucket,
            key=key,
            extra_args=extra_args,
            subscribers=subscribers,
        )
        return self._submit_transfer("delete_object", callargs)

    def shutdown(self, cancel=False):
        self._shutdown(cancel)

    def _validate_if_bucket_supported(self, bucket):
        # S3 high level operations don't support some resources
        # (e.g. S3 Object Lambda); only direct API calls are available
        # for such resources.
        if self.VALIDATE_SUPPORTED_BUCKET_VALUES:
            for resource, pattern in self._UNSUPPORTED_BUCKET_PATTERNS.items():
                match = pattern.match(bucket)
                if match:
                    raise ValueError(
                        f'TransferManager methods do not support {resource} '
                        'resource. Use direct client calls instead.'
                    )

    def _validate_all_known_args(self, actual, allowed):
        for kwarg in actual:
            if kwarg not in allowed:
                raise ValueError(
                    f"Invalid extra_args key '{kwarg}', "
                    f"must be one of: {', '.join(allowed)}"
                )

    def _validate_checksum_algorithm_supported(self, extra_args):
        checksum_algorithm = extra_args.get('ChecksumAlgorithm')
        if checksum_algorithm is None:
            return
        supported_algorithms = list(awscrt.s3.S3ChecksumAlgorithm.__members__)
        if checksum_algorithm.upper() not in supported_algorithms:
            raise ValueError(
                f'ChecksumAlgorithm: {checksum_algorithm} not supported. '
                f'Supported algorithms are: {supported_algorithms}'
            )

    def _cancel_transfers(self):
        for coordinator in self._future_coordinators:
            if not coordinator.done():
                coordinator.cancel()

    def _finish_transfers(self):
        for coordinator in self._future_coordinators:
            coordinator.result()

    def _wait_transfers_done(self):
        for coordinator in self._future_coordinators:
            coordinator.wait_until_on_done_callbacks_complete()

    def _shutdown(self, cancel=False):
        if cancel:
            self._cancel_transfers()
        try:
            self._finish_transfers()
        except KeyboardInterrupt:
            self._cancel_transfers()
        except Exception:
            pass
        finally:
            self._wait_transfers_done()

    def _release_semaphore(self, **kwargs):
        self._semaphore.release()

    def _submit_transfer(self, request_type, call_args):
        on_done_after_calls = [self._release_semaphore]
        coordinator = CRTTransferCoordinator(
            transfer_id=self._id_counter,
            exception_translator=self._crt_exception_translator,
        )
        components = {
            'meta': CRTTransferMeta(self._id_counter, call_args),
            'coordinator': coordinator,
        }
        future = CRTTransferFuture(**components)
        afterdone = AfterDoneHandler(coordinator)
        on_done_after_calls.append(afterdone)

        try:
            self._semaphore.acquire()
            on_queued = self._s3_args_creator.get_crt_callback(
                future, 'queued'
            )
            on_queued()
            crt_callargs = self._s3_args_creator.get_make_request_args(
                request_type,
                call_args,
                coordinator,
                future,
                on_done_after_calls,
            )
            crt_s3_request = self._crt_s3_client.make_request(**crt_callargs)
        except Exception as e:
            coordinator.set_exception(e, True)
            on_done = self._s3_args_creator.get_crt_callback(
                future, 'done', after_subscribers=on_done_after_calls
            )
            on_done(error=e)
        else:
            coordinator.set_s3_request(crt_s3_request)
        self._future_coordinators.append(coordinator)

        self._id_counter += 1
        return future
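
# Illustrative sketch (not part of the library): typical use of the manager
# as a context manager; names like 'bucket' and 'data.bin' are made up.
#
#   crt_serializer = BotocoreCRTRequestSerializer(botocore.session.Session())
#   with CRTTransferManager(crt_client, crt_serializer) as manager:
#       future = manager.upload('/tmp/data.bin', 'bucket', 'data.bin')
#       future.result()  # re-raises translated CRT exceptions, if any
#
# Exiting the with-block calls _shutdown(), which waits for all submitted
# transfers (or cancels them if the block raised).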


class CRTTransferMeta(BaseTransferMeta):
    """Holds metadata about the CRTTransferFuture"""

    def __init__(self, transfer_id=None, call_args=None):
        self._transfer_id = transfer_id
        self._call_args = call_args
        self._user_context = {}

    @property
    def call_args(self):
        return self._call_args

    @property
    def transfer_id(self):
        return self._transfer_id

    @property
    def user_context(self):
        return self._user_context


class CRTTransferFuture(BaseTransferFuture):
    def __init__(self, meta=None, coordinator=None):
        """The future associated with a submitted transfer request via the
        CRT S3 client.

        :type meta: s3transfer.crt.CRTTransferMeta
        :param meta: The metadata associated with the transfer future.

        :type coordinator: s3transfer.crt.CRTTransferCoordinator
        :param coordinator: The coordinator associated with the transfer
            future.
        """
        self._meta = meta
        if meta is None:
            self._meta = CRTTransferMeta()
        self._coordinator = coordinator

    @property
    def meta(self):
        return self._meta

    def done(self):
        return self._coordinator.done()

    def result(self, timeout=None):
        self._coordinator.result(timeout)

    def cancel(self):
        self._coordinator.cancel()

    def set_exception(self, exception):
        """Sets the exception on the future."""
        if not self.done():
            raise TransferNotDoneError(
                'set_exception can only be called once the transfer is '
                'complete.'
            )
        self._coordinator.set_exception(exception, override=True)


class BaseCRTRequestSerializer:
    def serialize_http_request(self, transfer_type, future):
        """Serialize CRT HTTP requests.

        :type transfer_type: string
        :param transfer_type: the type of transfer made,
            e.g. 'put_object', 'get_object', 'delete_object'

        :type future: s3transfer.crt.CRTTransferFuture

        :rtype: awscrt.http.HttpRequest
        :returns: An unsigned HTTP request to be used for the CRT S3 client
        """
        raise NotImplementedError('serialize_http_request()')

    def translate_crt_exception(self, exception):
        raise NotImplementedError('translate_crt_exception()')


class BotocoreCRTRequestSerializer(BaseCRTRequestSerializer):
    def __init__(self, session, client_kwargs=None):
        """Serialize CRT HTTP requests using botocore logic.

        It also takes into account configuration from both the session
        and any keyword arguments that could be passed to
        `Session.create_client()` when serializing the request.

        :type session: botocore.session.Session

        :type client_kwargs: Optional[Dict[str, str]]
        :param client_kwargs: The kwargs for the botocore
            s3 client initialization.
        """
        self._session = session
        if client_kwargs is None:
            client_kwargs = {}
        self._resolve_client_config(session, client_kwargs)
        self._client = create_nested_client(session, **client_kwargs)
        self._client.meta.events.register(
            'request-created.s3.*', self._capture_http_request
        )
        self._client.meta.events.register(
            'after-call.s3.*', self._change_response_to_serialized_http_request
        )
        self._client.meta.events.register(
            'before-send.s3.*', self._make_fake_http_response
        )
        self._client.meta.events.register(
            'before-call.s3.*', self._remove_checksum_context
        )

    def _resolve_client_config(self, session, client_kwargs):
        user_provided_config = None
        if session.get_default_client_config():
            user_provided_config = session.get_default_client_config()
        if 'config' in client_kwargs:
            user_provided_config = client_kwargs['config']

        client_config = Config(signature_version=UNSIGNED)
        if user_provided_config:
            client_config = user_provided_config.merge(client_config)
        client_kwargs['config'] = client_config
        client_kwargs["service_name"] = "s3"

    def _crt_request_from_aws_request(self, aws_request):
        url_parts = urlsplit(aws_request.url)
        crt_path = url_parts.path
        if url_parts.query:
            crt_path = f'{crt_path}?{url_parts.query}'
        headers_list = []
        for name, value in aws_request.headers.items():
            if isinstance(value, str):
                headers_list.append((name, value))
            else:
                headers_list.append((name, str(value, 'utf-8')))

        crt_headers = awscrt.http.HttpHeaders(headers_list)

        crt_request = awscrt.http.HttpRequest(
            method=aws_request.method,
            path=crt_path,
            headers=crt_headers,
            body_stream=aws_request.body,
        )
        return crt_request

    def _convert_to_crt_http_request(self, botocore_http_request):
        # Logic that does CRTUtils.crt_request_from_aws_request
        crt_request = self._crt_request_from_aws_request(botocore_http_request)
        if crt_request.headers.get("host") is None:
            # If host is not set, set it for the request before using CRT s3
            url_parts = urlsplit(botocore_http_request.url)
            crt_request.headers.set("host", url_parts.netloc)
        if crt_request.headers.get('Content-MD5') is not None:
            crt_request.headers.remove("Content-MD5")

        # In general, the CRT S3 client expects a content length header. It
        # only expects a missing content length header if the body is not
        # seekable. However, botocore does not set the content length header
        # for GetObject API requests and so we set the content length to zero
        # to meet the CRT S3 client's expectation that the content length
        # header is set even if there is no body.
        if crt_request.headers.get('Content-Length') is None:
            if botocore_http_request.body is None:
                crt_request.headers.add('Content-Length', "0")

        # Botocore sets the Transfer-Encoding header when it cannot determine
        # the content length of the request body (e.g. it's not seekable).
        # However, CRT does not support this header, but it supports
        # non-seekable bodies. So we remove this header to not cause issues
        # in the downstream CRT S3 request.
        if crt_request.headers.get('Transfer-Encoding') is not None:
            crt_request.headers.remove('Transfer-Encoding')

        return crt_request

    def _capture_http_request(self, request, **kwargs):
        request.context['http_request'] = request

    def _change_response_to_serialized_http_request(
        self, context, parsed, **kwargs
    ):
        request = context['http_request']
        parsed['HTTPRequest'] = request.prepare()

    def _make_fake_http_response(self, request, **kwargs):
        return botocore.awsrequest.AWSResponse(
            None,
            200,
            {},
            FakeRawResponse(b""),
        )

    def _get_botocore_http_request(self, client_method, call_args):
        return getattr(self._client, client_method)(
            Bucket=call_args.bucket, Key=call_args.key, **call_args.extra_args
        )['HTTPRequest']

    def serialize_http_request(self, transfer_type, future):
        botocore_http_request = self._get_botocore_http_request(
            transfer_type, future.meta.call_args
        )
        crt_request = self._convert_to_crt_http_request(botocore_http_request)
        return crt_request

    def translate_crt_exception(self, exception):
        if isinstance(exception, awscrt.s3.S3ResponseError):
            return self._translate_crt_s3_response_error(exception)
        else:
            return None

    def _translate_crt_s3_response_error(self, s3_response_error):
        status_code = s3_response_error.status_code
        if status_code < 301:
            # Botocore's exception parsing only
            # runs on status codes >= 301
            return None

        headers = {k: v for k, v in s3_response_error.headers}
        operation_name = s3_response_error.operation_name
        if operation_name is not None:
            service_model = self._client.meta.service_model
            shape = service_model.operation_model(operation_name).output_shape
        else:
            shape = None

        response_dict = {
            'headers': botocore.awsrequest.HeadersDict(headers),
            'status_code': status_code,
            'body': s3_response_error.body,
        }
        parsed_response = self._client._response_parser.parse(
            response_dict, shape=shape
        )

        error_code = parsed_response.get("Error", {}).get("Code")
        error_class = self._client.exceptions.from_code(error_code)
        return error_class(parsed_response, operation_name=operation_name)

    def _remove_checksum_context(self, params, **kwargs):
        request_context = params.get("context", {})
        if "checksum" in request_context:
            del request_context["checksum"]


class FakeRawResponse(BytesIO):
    def stream(self, amt=1024, decode_content=None):
        while True:
            chunk = self.read(amt)
            if not chunk:
                break
            yield chunk


class BotocoreCRTCredentialsWrapper:
    def __init__(self, resolved_botocore_credentials):
        self._resolved_credentials = resolved_botocore_credentials

    def __call__(self):
        credentials = self._get_credentials().get_frozen_credentials()
        return AwsCredentials(
            credentials.access_key, credentials.secret_key, credentials.token
        )

    def to_crt_credentials_provider(self):
        return AwsCredentialsProvider.new_delegate(self)

    def _get_credentials(self):
        if self._resolved_credentials is None:
            raise NoCredentialsError()
        return self._resolved_credentials
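
# Illustrative sketch (not part of the library): bridging botocore
# credentials into the CRT, similar to the wiring shown earlier for
# create_s3_crt_client.
#
#   session = botocore.session.Session()
#   wrapper = BotocoreCRTCredentialsWrapper(session.get_credentials())
#   provider = wrapper.to_crt_credentials_provider()
#   crt_client = create_s3_crt_client('us-west-2', provider)
#
# Each time the CRT signs a request it invokes the wrapper's __call__,
# which re-freezes the (possibly refreshable) botocore credentials.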


class CRTTransferCoordinator:
    """A helper class for managing CRTTransferFuture"""

    def __init__(
        self, transfer_id=None, s3_request=None, exception_translator=None
    ):
        self.transfer_id = transfer_id
        self._exception_translator = exception_translator
        self._s3_request = s3_request
        self._lock = threading.Lock()
        self._exception = None
        self._crt_future = None
        self._done_event = threading.Event()

    @property
    def s3_request(self):
        return self._s3_request

    def set_done_callbacks_complete(self):
        self._done_event.set()

    def wait_until_on_done_callbacks_complete(self, timeout=None):
        self._done_event.wait(timeout)

    def set_exception(self, exception, override=False):
        with self._lock:
            if not self.done() or override:
                self._exception = exception

    def cancel(self):
        if self._s3_request:
            self._s3_request.cancel()

    def result(self, timeout=None):
        if self._exception:
            raise self._exception
        try:
            self._crt_future.result(timeout)
        except KeyboardInterrupt:
            self.cancel()
            self._crt_future.result(timeout)
            raise
        except Exception as e:
            self.handle_exception(e)
        finally:
            if self._s3_request:
                self._s3_request = None

    def handle_exception(self, exc):
        translated_exc = None
        if self._exception_translator:
            try:
                translated_exc = self._exception_translator(exc)
            except Exception as e:
                # Bail out if we hit an issue translating
                # and raise the original error.
                logger.debug("Unable to translate exception.", exc_info=e)
        if translated_exc is not None:
            raise translated_exc from exc
        else:
            raise exc

    def done(self):
        if self._crt_future is None:
            return False
        return self._crt_future.done()

    def set_s3_request(self, s3_request):
        self._s3_request = s3_request
        self._crt_future = self._s3_request.finished_future


class S3ClientArgsCreator:
    def __init__(self, crt_request_serializer, os_utils):
        self._request_serializer = crt_request_serializer
        self._os_utils = os_utils

    def get_make_request_args(
        self, request_type, call_args, coordinator, future, on_done_after_calls
    ):
        request_args_handler = getattr(
            self,
            f'_get_make_request_args_{request_type}',
            self._default_get_make_request_args,
        )
        return request_args_handler(
            request_type=request_type,
            call_args=call_args,
            coordinator=coordinator,
            future=future,
            on_done_before_calls=[],
            on_done_after_calls=on_done_after_calls,
        )

    def get_crt_callback(
        self,
        future,
        callback_type,
        before_subscribers=None,
        after_subscribers=None,
    ):
        def invoke_all_callbacks(*args, **kwargs):
            callbacks_list = []
            if before_subscribers is not None:
                callbacks_list += before_subscribers
            callbacks_list += get_callbacks(future, callback_type)
            if after_subscribers is not None:
                callbacks_list += after_subscribers
            for callback in callbacks_list:
                # The get_callbacks helper will set the first argument
                # by keyword, so the other arguments need to be set by
                # keyword as well.
                if callback_type == "progress":
                    callback(bytes_transferred=args[0])
                else:
                    callback(*args, **kwargs)

        return invoke_all_callbacks

    def _get_make_request_args_put_object(
        self,
        request_type,
        call_args,
        coordinator,
        future,
        on_done_before_calls,
        on_done_after_calls,
    ):
        send_filepath = None
        if isinstance(call_args.fileobj, str):
            send_filepath = call_args.fileobj
            data_len = self._os_utils.get_file_size(send_filepath)
            call_args.extra_args["ContentLength"] = data_len
        else:
            call_args.extra_args["Body"] = call_args.fileobj

        checksum_config = None
        if not any(
            checksum_arg in call_args.extra_args
            for checksum_arg in FULL_OBJECT_CHECKSUM_ARGS
        ):
            checksum_algorithm = call_args.extra_args.pop(
                'ChecksumAlgorithm', 'CRC32'
            ).upper()
            checksum_config = awscrt.s3.S3ChecksumConfig(
                algorithm=awscrt.s3.S3ChecksumAlgorithm[checksum_algorithm],
                location=awscrt.s3.S3ChecksumLocation.TRAILER,
            )
        # Suppress botocore's automatic MD5 calculation by setting an override
        # value that will get deleted in the BotocoreCRTRequestSerializer.
        # As part of the CRT S3 request, we request the CRT S3 client to
        # automatically add trailing checksums to its uploads.
        call_args.extra_args["ContentMD5"] = "override-to-be-removed"

        make_request_args = self._default_get_make_request_args(
            request_type=request_type,
            call_args=call_args,
            coordinator=coordinator,
            future=future,
            on_done_before_calls=on_done_before_calls,
            on_done_after_calls=on_done_after_calls,
        )
        make_request_args['send_filepath'] = send_filepath
        make_request_args['checksum_config'] = checksum_config
        return make_request_args

    def _get_make_request_args_get_object(
        self,
        request_type,
        call_args,
        coordinator,
        future,
        on_done_before_calls,
        on_done_after_calls,
    ):
        recv_filepath = None
        on_body = None
        checksum_config = awscrt.s3.S3ChecksumConfig(validate_response=True)
        if isinstance(call_args.fileobj, str):
            final_filepath = call_args.fileobj
            recv_filepath = self._os_utils.get_temp_filename(final_filepath)
            on_done_before_calls.append(
                RenameTempFileHandler(
                    coordinator, final_filepath, recv_filepath, self._os_utils
                )
            )
        else:
            on_body = OnBodyFileObjWriter(call_args.fileobj)

        make_request_args = self._default_get_make_request_args(
            request_type=request_type,
            call_args=call_args,
            coordinator=coordinator,
            future=future,
            on_done_before_calls=on_done_before_calls,
            on_done_after_calls=on_done_after_calls,
        )
        make_request_args['recv_filepath'] = recv_filepath
        make_request_args['on_body'] = on_body
        make_request_args['checksum_config'] = checksum_config
        return make_request_args

    def _default_get_make_request_args(
        self,
        request_type,
        call_args,
        coordinator,
        future,
        on_done_before_calls,
        on_done_after_calls,
    ):
        make_request_args = {
            'request': self._request_serializer.serialize_http_request(
                request_type, future
            ),
            'type': getattr(
                S3RequestType, request_type.upper(), S3RequestType.DEFAULT
            ),
            'on_done': self.get_crt_callback(
                future, 'done', on_done_before_calls, on_done_after_calls
            ),
            'on_progress': self.get_crt_callback(future, 'progress'),
        }

        # For DEFAULT requests, CRT requires the official S3 operation name.
        # So transform a string like "delete_object" -> "DeleteObject".
        if make_request_args['type'] == S3RequestType.DEFAULT:
            make_request_args['operation_name'] = ''.join(
                x.title() for x in request_type.split('_')
            )

        arn_handler = _S3ArnParamHandler()
        if (
            accesspoint_arn_details := arn_handler.handle_arn(call_args.bucket)
        ) and accesspoint_arn_details['region'] == "":
            # Configure our region to `*` to propagate in `x-amz-region-set`
            # for multi-region support in MRAP accesspoints.
            # use_double_uri_encode and should_normalize_uri_path default to
            # True, but the SDK has already encoded the URI and this is S3,
            # so set both to False.
            make_request_args['signing_config'] = AwsSigningConfig(
                algorithm=AwsSigningAlgorithm.V4_ASYMMETRIC,
                region="*",
                use_double_uri_encode=False,
                should_normalize_uri_path=False,
            )
            call_args.bucket = accesspoint_arn_details['resource_name']
        elif is_s3express_bucket(call_args.bucket):
            # use_double_uri_encode and should_normalize_uri_path default to
            # True, but the SDK has already encoded the URI and this is S3,
            # so set both to False.
            make_request_args['signing_config'] = AwsSigningConfig(
                algorithm=AwsSigningAlgorithm.V4_S3EXPRESS,
                use_double_uri_encode=False,
                should_normalize_uri_path=False,
            )
        return make_request_args


class RenameTempFileHandler:
    def __init__(self, coordinator, final_filename, temp_filename, osutil):
        self._coordinator = coordinator
        self._final_filename = final_filename
        self._temp_filename = temp_filename
        self._osutil = osutil

    def __call__(self, **kwargs):
        error = kwargs['error']
        if error:
            self._osutil.remove_file(self._temp_filename)
        else:
            try:
                self._osutil.rename_file(
                    self._temp_filename, self._final_filename
                )
            except Exception as e:
                self._osutil.remove_file(self._temp_filename)
                # The CRT future is already done at this point.
                self._coordinator.set_exception(e)


class AfterDoneHandler:
    def __init__(self, coordinator):
        self._coordinator = coordinator

    def __call__(self, **kwargs):
        self._coordinator.set_done_callbacks_complete()


class OnBodyFileObjWriter:
    def __init__(self, fileobj):
        self._fileobj = fileobj

    def __call__(self, chunk, **kwargs):
        self._fileobj.write(chunk)


class _S3ArnParamHandler:
    """Partial port of S3ArnParamHandler from botocore.

    This is used to make a determination on MRAP accesspoints for signing
    purposes. This should be safe to remove once we properly integrate auth
    resolution from Botocore into the CRT transfer integration.
    """

    _RESOURCE_REGEX = re.compile(
        r'^(?P<resource_type>accesspoint|outpost)[/:](?P<resource_name>.+)$'
    )

    def __init__(self):
        self._arn_parser = ArnParser()

    def handle_arn(self, bucket):
        arn_details = self._get_arn_details_from_bucket(bucket)
        if arn_details is None:
            return
        if arn_details['resource_type'] == 'accesspoint':
            return arn_details

    def _get_arn_details_from_bucket(self, bucket):
        try:
            arn_details = self._arn_parser.parse_arn(bucket)
            self._add_resource_type_and_name(arn_details)
            return arn_details
        except InvalidArnException:
            pass
        return None

    def _add_resource_type_and_name(self, arn_details):
        match = self._RESOURCE_REGEX.match(arn_details['resource'])
        if match:
            arn_details['resource_type'] = match.group('resource_type')
            arn_details['resource_name'] = match.group('resource_name')
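
# Illustrative sketch (not part of the library): an MRAP (multi-region
# access point) ARN has an empty region component, which is what triggers
# the sigv4a ("region-set *") signing path above. For example (made-up
# account and alias):
#
#   arn = 'arn:aws:s3::123456789012:accesspoint/mfzwi23gnjvgw.mrap'
#   details = _S3ArnParamHandler().handle_arn(arn)
#   # details['region'] == '' and
#   # details['resource_name'] == 'mfzwi23gnjvgw.mrap'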
@@ -0,0 +1,71 @@
# Copyright 2016 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You
# may not use this file except in compliance with the License. A copy of
# the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.
from s3transfer.tasks import SubmissionTask, Task


class DeleteSubmissionTask(SubmissionTask):
    """Task for submitting tasks to execute an object deletion."""

    def _submit(self, client, request_executor, transfer_future, **kwargs):
        """
        :param client: The client associated with the transfer manager

        :type config: s3transfer.manager.TransferConfig
        :param config: The transfer config associated with the transfer
            manager

        :type osutil: s3transfer.utils.OSUtil
        :param osutil: The os utility associated with the transfer manager

        :type request_executor: s3transfer.futures.BoundedExecutor
        :param request_executor: The request executor associated with the
            transfer manager

        :type transfer_future: s3transfer.futures.TransferFuture
        :param transfer_future: The transfer future associated with the
            transfer request that tasks are being submitted for
        """
        call_args = transfer_future.meta.call_args

        self._transfer_coordinator.submit(
            request_executor,
            DeleteObjectTask(
                transfer_coordinator=self._transfer_coordinator,
                main_kwargs={
                    'client': client,
                    'bucket': call_args.bucket,
                    'key': call_args.key,
                    'extra_args': call_args.extra_args,
                },
                is_final=True,
            ),
        )


class DeleteObjectTask(Task):
    def _main(self, client, bucket, key, extra_args):
        """
        :param client: The S3 client to use when calling DeleteObject

        :type bucket: str
        :param bucket: The name of the bucket.

        :type key: str
        :param key: The name of the object to delete.

        :type extra_args: dict
        :param extra_args: Extra arguments to pass to the DeleteObject call.

        """
        client.delete_object(Bucket=bucket, Key=key, **extra_args)
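
# Illustrative sketch (not part of the library): extra_args flows straight
# through to DeleteObject, so a versioned delete looks like:
#
#   manager.delete('bucket', 'key', extra_args={'VersionId': 'abc123'})
#   # -> client.delete_object(Bucket='bucket', Key='key', VersionId='abc123')
#
# 'bucket', 'key', and the version id are made-up placeholders.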
@@ -0,0 +1,834 @@
# Copyright 2016 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You
# may not use this file except in compliance with the License. A copy of
# the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.
import heapq
import logging
import threading

from botocore.exceptions import ClientError

from s3transfer.compat import seekable
from s3transfer.exceptions import RetriesExceededError, S3DownloadFailedError
from s3transfer.futures import IN_MEMORY_DOWNLOAD_TAG
from s3transfer.tasks import SubmissionTask, Task
from s3transfer.utils import (
    S3_RETRYABLE_DOWNLOAD_ERRORS,
    CountCallbackInvoker,
    DeferredOpenFile,
    FunctionContainer,
    StreamReaderProgress,
    calculate_num_parts,
    calculate_range_parameter,
    get_callbacks,
    invoke_progress_callbacks,
)

logger = logging.getLogger(__name__)


class DownloadOutputManager:
    """Base manager class for handling various types of files for downloads

    This class is typically used for the DownloadSubmissionTask class to help
    determine the following:

    * Provide the fileobj to write downloaded content to
    * Get a task to complete once everything downloaded has been written

    The answers/implementations differ for the various types of file outputs
    that may be accepted. All implementations must subclass and override
    public methods from this class.
    """

    def __init__(self, osutil, transfer_coordinator, io_executor):
        self._osutil = osutil
        self._transfer_coordinator = transfer_coordinator
        self._io_executor = io_executor

    @classmethod
    def is_compatible(cls, download_target, osutil):
        """Determines if the target for the download is compatible with manager

        :param download_target: The target to which the download will write
            data.

        :param osutil: The os utility to be used for the transfer

        :returns: True if the manager can handle the type of target specified
            otherwise returns False.
        """
        raise NotImplementedError('must implement is_compatible()')

    def get_download_task_tag(self):
        """Get the tag (if any) to associate all GetObjectTasks

        :rtype: s3transfer.futures.TaskTag
        :returns: The tag to associate all GetObjectTasks with
        """
        return None

    def get_fileobj_for_io_writes(self, transfer_future):
        """Get file-like object to use for io writes in the io executor

        :type transfer_future: s3transfer.futures.TransferFuture
        :param transfer_future: The future associated with the download
            request

        :returns: A file-like object to write to
        """
        raise NotImplementedError('must implement get_fileobj_for_io_writes()')

    def queue_file_io_task(self, fileobj, data, offset):
        """Queue IO write for submission to the IO executor.

        This method accepts an IO executor and information about the
        downloaded data, and handles submitting this to the IO executor.

        This method may defer submission to the IO executor if necessary.

        """
        self._transfer_coordinator.submit(
            self._io_executor, self.get_io_write_task(fileobj, data, offset)
        )

    def get_io_write_task(self, fileobj, data, offset):
        """Get an IO write task for the requested set of data

        This task can be run immediately or be submitted to the IO executor
        for it to run.

        :type fileobj: file-like object
        :param fileobj: The file-like object to write to

        :type data: bytes
        :param data: The data to write out

        :type offset: integer
        :param offset: The offset to write the data to in the file-like object

        :returns: An IO task to be used to write data to a file-like object
        """
        return IOWriteTask(
            self._transfer_coordinator,
            main_kwargs={
                'fileobj': fileobj,
                'data': data,
                'offset': offset,
            },
        )

    def get_final_io_task(self):
        """Get the final io task to complete the download

        This is needed because based on the architecture of the TransferManager
        the final tasks will be sent to the IO executor, but the executor
        needs a final task for it to signal that the transfer is done and
        all done callbacks can be run.

        :rtype: s3transfer.tasks.Task
        :returns: A final task to be completed in the io executor
        """
        raise NotImplementedError('must implement get_final_io_task()')

    def _get_fileobj_from_filename(self, filename):
        f = DeferredOpenFile(
            filename, mode='wb', open_function=self._osutil.open
        )
        # Make sure the file gets closed and we remove the temporary file
        # if anything goes wrong during the process.
        self._transfer_coordinator.add_failure_cleanup(f.close)
        return f


class DownloadFilenameOutputManager(DownloadOutputManager):
    def __init__(self, osutil, transfer_coordinator, io_executor):
        super().__init__(osutil, transfer_coordinator, io_executor)
        self._final_filename = None
        self._temp_filename = None
        self._temp_fileobj = None

    @classmethod
    def is_compatible(cls, download_target, osutil):
        return isinstance(download_target, str)

    def get_fileobj_for_io_writes(self, transfer_future):
        fileobj = transfer_future.meta.call_args.fileobj
        self._final_filename = fileobj
        self._temp_filename = self._osutil.get_temp_filename(fileobj)
        self._temp_fileobj = self._get_temp_fileobj()
        return self._temp_fileobj

    def get_final_io_task(self):
        # A task to rename the file from the temporary file to its final
        # location is needed. This should be the last task needed to complete
        # the download.
        return IORenameFileTask(
            transfer_coordinator=self._transfer_coordinator,
            main_kwargs={
                'fileobj': self._temp_fileobj,
                'final_filename': self._final_filename,
                'osutil': self._osutil,
            },
            is_final=True,
        )

    def _get_temp_fileobj(self):
        f = self._get_fileobj_from_filename(self._temp_filename)
        self._transfer_coordinator.add_failure_cleanup(
            self._osutil.remove_file, self._temp_filename
        )
        return f


class DownloadSeekableOutputManager(DownloadOutputManager):
    @classmethod
    def is_compatible(cls, download_target, osutil):
        return seekable(download_target)

    def get_fileobj_for_io_writes(self, transfer_future):
        # Return the fileobj provided to the future.
        return transfer_future.meta.call_args.fileobj

    def get_final_io_task(self):
        # This task will serve the purpose of signaling when all of the io
        # writes have finished so done callbacks can be called.
        return CompleteDownloadNOOPTask(
            transfer_coordinator=self._transfer_coordinator
        )


class DownloadNonSeekableOutputManager(DownloadOutputManager):
    def __init__(
        self, osutil, transfer_coordinator, io_executor, defer_queue=None
    ):
        super().__init__(osutil, transfer_coordinator, io_executor)
        if defer_queue is None:
            defer_queue = DeferQueue()
        self._defer_queue = defer_queue
        self._io_submit_lock = threading.Lock()

    @classmethod
    def is_compatible(cls, download_target, osutil):
        return hasattr(download_target, 'write')

    def get_download_task_tag(self):
        return IN_MEMORY_DOWNLOAD_TAG

    def get_fileobj_for_io_writes(self, transfer_future):
        return transfer_future.meta.call_args.fileobj

    def get_final_io_task(self):
        return CompleteDownloadNOOPTask(
            transfer_coordinator=self._transfer_coordinator
        )

    def queue_file_io_task(self, fileobj, data, offset):
        with self._io_submit_lock:
            writes = self._defer_queue.request_writes(offset, data)
            for write in writes:
                data = write['data']
                logger.debug(
                    "Queueing IO offset %s for fileobj: %s",
                    write['offset'],
                    fileobj,
                )
                super().queue_file_io_task(fileobj, data, offset)

    def get_io_write_task(self, fileobj, data, offset):
        return IOStreamingWriteTask(
            self._transfer_coordinator,
            main_kwargs={
                'fileobj': fileobj,
                'data': data,
            },
        )
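
# Illustrative sketch (not part of the library): DeferQueue (defined later
# in this module) only releases writes in offset order, which is what lets
# the manager above stream ranged parts to a non-seekable target. Roughly,
# with 5-byte chunks:
#
#   q = DeferQueue()
#   q.request_writes(0, b'part1')   # -> [{'offset': 0, 'data': b'part1'}]
#   q.request_writes(10, b'part3')  # -> [] (deferred; offset 5 not seen yet)
#   q.request_writes(5, b'part2')   # -> writes for offsets 5 and 10, in order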


class DownloadSpecialFilenameOutputManager(DownloadNonSeekableOutputManager):
    def __init__(
        self, osutil, transfer_coordinator, io_executor, defer_queue=None
    ):
        super().__init__(
            osutil, transfer_coordinator, io_executor, defer_queue
        )
        self._fileobj = None

    @classmethod
    def is_compatible(cls, download_target, osutil):
        return isinstance(download_target, str) and osutil.is_special_file(
            download_target
        )

    def get_fileobj_for_io_writes(self, transfer_future):
        filename = transfer_future.meta.call_args.fileobj
        self._fileobj = self._get_fileobj_from_filename(filename)
        return self._fileobj

    def get_final_io_task(self):
        # Make sure the file gets closed once the transfer is done.
        return IOCloseTask(
            transfer_coordinator=self._transfer_coordinator,
            is_final=True,
            main_kwargs={'fileobj': self._fileobj},
        )


class DownloadSubmissionTask(SubmissionTask):
    """Task for submitting tasks to execute a download"""

    def _get_download_output_manager_cls(self, transfer_future, osutil):
        """Retrieves a class for managing output for a download

        :type transfer_future: s3transfer.futures.TransferFuture
        :param transfer_future: The transfer future for the request

        :type osutil: s3transfer.utils.OSUtils
        :param osutil: The os utility associated with the transfer

        :rtype: class of DownloadOutputManager
        :returns: The appropriate class to use for managing a specific type of
            input for downloads.
        """
        download_manager_resolver_chain = [
            DownloadSpecialFilenameOutputManager,
            DownloadFilenameOutputManager,
            DownloadSeekableOutputManager,
            DownloadNonSeekableOutputManager,
        ]

        fileobj = transfer_future.meta.call_args.fileobj
        for download_manager_cls in download_manager_resolver_chain:
            if download_manager_cls.is_compatible(fileobj, osutil):
                return download_manager_cls
        raise RuntimeError(
            f'Output {fileobj} of type: {type(fileobj)} is not supported.'
        )
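
    # Illustrative sketch (not part of the library): how the resolver chain
    # above maps common download targets; the first compatible class wins.
    #
    #   '/dev/null'         -> DownloadSpecialFilenameOutputManager
    #   '/tmp/file.txt'     -> DownloadFilenameOutputManager
    #   open(path, 'wb')    -> DownloadSeekableOutputManager
    #   sys.stdout.buffer   -> DownloadNonSeekableOutputManager
    #
    # (Whether a given stream is seekable is decided at runtime by
    # s3transfer.compat.seekable, so the last two rows are typical, not
    # guaranteed.)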

    def _submit(
        self,
        client,
        config,
        osutil,
        request_executor,
        io_executor,
        transfer_future,
        bandwidth_limiter=None,
    ):
        """
        :param client: The client associated with the transfer manager

        :type config: s3transfer.manager.TransferConfig
        :param config: The transfer config associated with the transfer
            manager

        :type osutil: s3transfer.utils.OSUtil
        :param osutil: The os utility associated with the transfer manager

        :type request_executor: s3transfer.futures.BoundedExecutor
        :param request_executor: The request executor associated with the
            transfer manager

        :type io_executor: s3transfer.futures.BoundedExecutor
        :param io_executor: The io executor associated with the
            transfer manager

        :type transfer_future: s3transfer.futures.TransferFuture
        :param transfer_future: The transfer future associated with the
            transfer request that tasks are being submitted for

        :type bandwidth_limiter: s3transfer.bandwidth.BandwidthLimiter
        :param bandwidth_limiter: The bandwidth limiter to use when
            downloading streams
        """
        if (
            transfer_future.meta.size is None
            or transfer_future.meta.etag is None
        ):
            response = client.head_object(
                Bucket=transfer_future.meta.call_args.bucket,
                Key=transfer_future.meta.call_args.key,
                **transfer_future.meta.call_args.extra_args,
            )
            # If a size was not provided figure out the size for the
            # user.
            transfer_future.meta.provide_transfer_size(
                response['ContentLength']
            )
            # Provide an etag to ensure a stored object is not modified
            # during a multipart download.
            transfer_future.meta.provide_object_etag(response.get('ETag'))

        download_output_manager = self._get_download_output_manager_cls(
            transfer_future, osutil
        )(osutil, self._transfer_coordinator, io_executor)

        # If it is greater than the threshold do a ranged download; otherwise
        # do a regular GetObject download.
        if transfer_future.meta.size < config.multipart_threshold:
            self._submit_download_request(
                client,
                config,
                osutil,
                request_executor,
                io_executor,
                download_output_manager,
                transfer_future,
                bandwidth_limiter,
            )
        else:
            self._submit_ranged_download_request(
                client,
                config,
                osutil,
                request_executor,
                io_executor,
                download_output_manager,
                transfer_future,
                bandwidth_limiter,
            )

    def _submit_download_request(
        self,
        client,
        config,
        osutil,
        request_executor,
        io_executor,
        download_output_manager,
        transfer_future,
        bandwidth_limiter,
    ):
        call_args = transfer_future.meta.call_args

        # Get a handle to the file that will be used for writing downloaded
        # contents
        fileobj = download_output_manager.get_fileobj_for_io_writes(
            transfer_future
        )

        # Get the needed callbacks for the task
        progress_callbacks = get_callbacks(transfer_future, 'progress')

        # Get any associated tags for the get object task.
        get_object_tag = download_output_manager.get_download_task_tag()

        # Get the final io task to run once the download is complete.
        final_task = download_output_manager.get_final_io_task()

        # Submit the task to download the object.
        self._transfer_coordinator.submit(
            request_executor,
            ImmediatelyWriteIOGetObjectTask(
                transfer_coordinator=self._transfer_coordinator,
                main_kwargs={
                    'client': client,
                    'bucket': call_args.bucket,
                    'key': call_args.key,
                    'fileobj': fileobj,
                    'extra_args': call_args.extra_args,
                    'callbacks': progress_callbacks,
                    'max_attempts': config.num_download_attempts,
                    'download_output_manager': download_output_manager,
                    'io_chunksize': config.io_chunksize,
                    'bandwidth_limiter': bandwidth_limiter,
                },
                done_callbacks=[final_task],
            ),
            tag=get_object_tag,
        )

    def _submit_ranged_download_request(
        self,
        client,
        config,
        osutil,
        request_executor,
        io_executor,
        download_output_manager,
        transfer_future,
        bandwidth_limiter,
    ):
        call_args = transfer_future.meta.call_args

        # Get the needed progress callbacks for the task
        progress_callbacks = get_callbacks(transfer_future, 'progress')
|
||||
|
||||
# Get a handle to the file that will be used for writing downloaded
|
||||
# contents
|
||||
fileobj = download_output_manager.get_fileobj_for_io_writes(
|
||||
transfer_future
|
||||
)
|
||||
|
||||
# Determine the number of parts
|
||||
part_size = config.multipart_chunksize
|
||||
num_parts = calculate_num_parts(transfer_future.meta.size, part_size)
|
||||
|
||||
# Get any associated tags for the get object task.
|
||||
get_object_tag = download_output_manager.get_download_task_tag()
|
||||
|
||||
# Callback invoker to submit the final io task once all downloads
|
||||
# are complete.
|
||||
finalize_download_invoker = CountCallbackInvoker(
|
||||
self._get_final_io_task_submission_callback(
|
||||
download_output_manager, io_executor
|
||||
)
|
||||
)
|
||||
for i in range(num_parts):
|
||||
# Calculate the range parameter
|
||||
range_parameter = calculate_range_parameter(
|
||||
part_size, i, num_parts
|
||||
)
|
||||
|
||||
# Inject extra parameters to be passed in as extra args
|
||||
extra_args = {
|
||||
'Range': range_parameter,
|
||||
}
|
||||
if transfer_future.meta.etag is not None:
|
||||
extra_args['IfMatch'] = transfer_future.meta.etag
|
||||
extra_args.update(call_args.extra_args)
|
||||
finalize_download_invoker.increment()
|
||||
# Submit the ranged downloads
|
||||
self._transfer_coordinator.submit(
|
||||
request_executor,
|
||||
GetObjectTask(
|
||||
transfer_coordinator=self._transfer_coordinator,
|
||||
main_kwargs={
|
||||
'client': client,
|
||||
'bucket': call_args.bucket,
|
||||
'key': call_args.key,
|
||||
'fileobj': fileobj,
|
||||
'extra_args': extra_args,
|
||||
'callbacks': progress_callbacks,
|
||||
'max_attempts': config.num_download_attempts,
|
||||
'start_index': i * part_size,
|
||||
'download_output_manager': download_output_manager,
|
||||
'io_chunksize': config.io_chunksize,
|
||||
'bandwidth_limiter': bandwidth_limiter,
|
||||
},
|
||||
done_callbacks=[finalize_download_invoker.decrement],
|
||||
),
|
||||
tag=get_object_tag,
|
||||
)
|
||||
finalize_download_invoker.finalize()
|
||||
|
||||
def _get_final_io_task_submission_callback(
|
||||
self, download_manager, io_executor
|
||||
):
|
||||
final_task = download_manager.get_final_io_task()
|
||||
return FunctionContainer(
|
||||
self._transfer_coordinator.submit, io_executor, final_task
|
||||
)
|
||||
|
||||
def _calculate_range_param(self, part_size, part_index, num_parts):
|
||||
# Used to calculate the Range parameter
|
||||
start_range = part_index * part_size
|
||||
if part_index == num_parts - 1:
|
||||
end_range = ''
|
||||
else:
|
||||
end_range = start_range + part_size - 1
|
||||
range_param = f'bytes={start_range}-{end_range}'
|
||||
return range_param
|
||||
|
||||
|
||||
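# Illustrative sketch (not part of the library): the Range values produced
# for a ranged download of a 20 MB object with 8 MB parts, mirroring the
# arithmetic in _calculate_range_param above. The helper name
# _example_range_params is hypothetical.
def _example_range_params():
    part_size = 8 * 1024 * 1024
    size = 20 * 1024 * 1024
    num_parts = (size + part_size - 1) // part_size  # 3 parts
    ranges = []
    for part_index in range(num_parts):
        start_range = part_index * part_size
        if part_index == num_parts - 1:
            # The last part's end bound is left open so it reads to the
            # end of the object regardless of the exact object size.
            end_range = ''
        else:
            end_range = start_range + part_size - 1
        ranges.append(f'bytes={start_range}-{end_range}')
    # ranges == ['bytes=0-8388607', 'bytes=8388608-16777215',
    #            'bytes=16777216-']
    return ranges

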
class GetObjectTask(Task):
    def _main(
        self,
        client,
        bucket,
        key,
        fileobj,
        extra_args,
        callbacks,
        max_attempts,
        download_output_manager,
        io_chunksize,
        start_index=0,
        bandwidth_limiter=None,
    ):
        """Downloads an object and places content into io queue

        :param client: The client to use when calling GetObject
        :param bucket: The bucket to download from
        :param key: The key to download from
        :param fileobj: The file handle to write content to
        :param extra_args: Any extra arguments to include in GetObject request
        :param callbacks: List of progress callbacks to invoke on download
        :param max_attempts: The number of retries to do when downloading
        :param download_output_manager: The download output manager associated
            with the current download.
        :param io_chunksize: The size of each io chunk to read from the
            download stream and queue in the io queue.
        :param start_index: The location in the file to start writing the
            content of the key to.
        :param bandwidth_limiter: The bandwidth limiter to use when throttling
            the downloading of data in streams.
        """
        last_exception = None
        for i in range(max_attempts):
            try:
                current_index = start_index
                response = client.get_object(
                    Bucket=bucket, Key=key, **extra_args
                )
                streaming_body = StreamReaderProgress(
                    response['Body'], callbacks
                )
                if bandwidth_limiter:
                    streaming_body = (
                        bandwidth_limiter.get_bandwith_limited_stream(
                            streaming_body, self._transfer_coordinator
                        )
                    )

                chunks = DownloadChunkIterator(streaming_body, io_chunksize)
                for chunk in chunks:
                    # If the transfer is done because of a cancellation
                    # or error somewhere else, stop trying to submit more
                    # data to be written and break out of the download.
                    if not self._transfer_coordinator.done():
                        self._handle_io(
                            download_output_manager,
                            fileobj,
                            chunk,
                            current_index,
                        )
                        current_index += len(chunk)
                    else:
                        return
                return
            except ClientError as e:
                error_code = e.response.get('Error', {}).get('Code')
                if error_code == "PreconditionFailed":
                    raise S3DownloadFailedError(
                        f'Contents of stored object "{key}" in bucket '
                        f'"{bucket}" did not match expected ETag.'
                    )
                else:
                    raise
            except S3_RETRYABLE_DOWNLOAD_ERRORS as e:
                logger.debug(
                    "Retrying exception caught (%s), "
                    "retrying request, (attempt %s / %s)",
                    e,
                    i + 1,
                    max_attempts,
                    exc_info=True,
                )
                last_exception = e
                # Also invoke the progress callbacks to indicate that we
                # are trying to download the stream again and all progress
                # for this GetObject has been lost.
                invoke_progress_callbacks(
                    callbacks, start_index - current_index
                )
                continue
        raise RetriesExceededError(last_exception)

    def _handle_io(self, download_output_manager, fileobj, chunk, index):
        download_output_manager.queue_file_io_task(fileobj, chunk, index)


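# Illustrative sketch (not part of the library): how GetObjectTask rolls
# back progress on a retry. If the part starting at byte 8388608 had
# streamed 1 MB before hitting a retryable error, the callbacks receive a
# negative delta so aggregate progress stays accurate once the part is
# re-downloaded. The helper name is hypothetical.
def _example_retry_progress_rollback():
    start_index = 8 * 1024 * 1024
    current_index = start_index + 1024 * 1024  # 1 MB written, then failure
    rollback = start_index - current_index
    # rollback == -1048576, which is what gets passed to
    # invoke_progress_callbacks(callbacks, start_index - current_index).
    return rollback

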
class ImmediatelyWriteIOGetObjectTask(GetObjectTask):
    """GetObjectTask that immediately writes to the provided file object

    This is useful for downloads where it is known only one thread is
    downloading the object, so there is no reason to go through the
    overhead of using an IO queue and executor.
    """

    def _handle_io(self, download_output_manager, fileobj, chunk, index):
        task = download_output_manager.get_io_write_task(fileobj, chunk, index)
        task()


class IOWriteTask(Task):
    def _main(self, fileobj, data, offset):
        """Pulls off an io queue to write contents to a file

        :param fileobj: The file handle to write content to
        :param data: The data to write
        :param offset: The offset to write the data to.
        """
        fileobj.seek(offset)
        fileobj.write(data)


class IOStreamingWriteTask(Task):
    """Task for writing data to a non-seekable stream."""

    def _main(self, fileobj, data):
        """Write data to a fileobj.

        Data will be written directly to the fileobj without
        any prior seeking.

        :param fileobj: The fileobj to write content to
        :param data: The data to write
        """
        fileobj.write(data)


class IORenameFileTask(Task):
    """A task to rename a temporary file to its final filename

    :param fileobj: The file handle that content was written to.
    :param final_filename: The final name of the file to rename to
        upon completion of writing the contents.
    :param osutil: OS utility
    """

    def _main(self, fileobj, final_filename, osutil):
        fileobj.close()
        osutil.rename_file(fileobj.name, final_filename)


class IOCloseTask(Task):
    """A task to close out a file once the download is complete.

    :param fileobj: The fileobj to close.
    """

    def _main(self, fileobj):
        fileobj.close()


class CompleteDownloadNOOPTask(Task):
    """A NOOP task to serve as an indicator that the download is complete

    Note that the default for is_final is set to True because this should
    always be the last task.
    """

    def __init__(
        self,
        transfer_coordinator,
        main_kwargs=None,
        pending_main_kwargs=None,
        done_callbacks=None,
        is_final=True,
    ):
        super().__init__(
            transfer_coordinator=transfer_coordinator,
            main_kwargs=main_kwargs,
            pending_main_kwargs=pending_main_kwargs,
            done_callbacks=done_callbacks,
            is_final=is_final,
        )

    def _main(self):
        pass


class DownloadChunkIterator:
    def __init__(self, body, chunksize):
        """Iterator to chunk out a downloaded S3 stream

        :param body: A readable file-like object
        :param chunksize: The amount to read each time
        """
        self._body = body
        self._chunksize = chunksize
        self._num_reads = 0

    def __iter__(self):
        return self

    def __next__(self):
        chunk = self._body.read(self._chunksize)
        self._num_reads += 1
        if chunk:
            return chunk
        elif self._num_reads == 1:
            # Even though the response may not have had any
            # content, we still want to account for an empty object's
            # existence, so return the empty chunk for that initial
            # read.
            return chunk
        raise StopIteration()

    next = __next__


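# Illustrative sketch (not part of the library): DownloadChunkIterator
# yields fixed-size chunks and still yields a single empty chunk for a
# zero-byte body, so empty objects result in an (empty) file on disk.
# The helper name is hypothetical.
def _example_chunk_iteration():
    import io

    chunks = list(DownloadChunkIterator(io.BytesIO(b'abcdefg'), 3))
    # chunks == [b'abc', b'def', b'g']
    empty = list(DownloadChunkIterator(io.BytesIO(b''), 3))
    # empty == [b''] -- the first (empty) read is preserved
    return chunks, empty

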
class DeferQueue:
    """IO queue that defers write requests until they are queued sequentially.

    This class is used to track IO data for a *single* fileobj.

    You can send data to this queue, and it will defer any IO write requests
    until it has the next contiguous block available (starting at 0).
    """

    def __init__(self):
        self._writes = []
        self._pending_offsets = {}
        self._next_offset = 0

    def request_writes(self, offset, data):
        """Request any available writes given new incoming data.

        You call this method by providing new data along with the
        offset associated with the data. If that new data unlocks
        any contiguous writes that can now be submitted, this
        method will return all applicable writes.

        This is done with one method call so you don't have to
        make two calls (put(), get()), each of which would acquire
        a lock.
        """
        if offset + len(data) <= self._next_offset:
            # This is a request for a write that we've already
            # seen. This can happen in the event of a retry
            # where if we retry at offset N/2, we'll requeue
            # offsets 0-N/2 again.
            return []
        writes = []
        if offset < self._next_offset:
            # This is a special case where the write request contains
            # both seen AND unseen data. This can happen in the case
            # that we queue part of a chunk due to an incomplete read,
            # then pop the incomplete data for writing, then we receive
            # the retry for the incomplete read which contains both the
            # previously-seen partial chunk followed by the rest of the
            # chunk (unseen).
            #
            # In this case, we discard the bytes of the data we've already
            # queued before, and only queue the unseen bytes.
            seen_bytes = self._next_offset - offset
            data = data[seen_bytes:]
            offset = self._next_offset
        if offset in self._pending_offsets:
            queued_data = self._pending_offsets[offset]
            if len(data) <= len(queued_data):
                # We already have a write request queued with the same
                # offset with at least as much data as is present in this
                # request. In this case we should ignore this request
                # and prefer what's already queued.
                return []
            else:
                # We have a write request queued with the same offset,
                # but this request contains more data. This can happen
                # in the case of a retried request due to an incomplete
                # read, followed by a retry containing the full response
                # body. In this case, we should overwrite the queued
                # request with this one since it contains more data.
                self._pending_offsets[offset] = data
        else:
            heapq.heappush(self._writes, offset)
            self._pending_offsets[offset] = data
        while self._writes and self._writes[0] == self._next_offset:
            next_write_offset = heapq.heappop(self._writes)
            next_write = self._pending_offsets[next_write_offset]
            writes.append({'offset': next_write_offset, 'data': next_write})
            del self._pending_offsets[next_write_offset]
            self._next_offset += len(next_write)
        return writes


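# Illustrative sketch (not part of the library): a write for offset 4
# arrives first and is deferred; once the write for offset 0 arrives, both
# are released in offset order. The helper name is hypothetical.
def _example_defer_queue():
    q = DeferQueue()
    first = q.request_writes(offset=4, data=b'EFGH')
    # first == [] because offset 0 has not been seen yet.
    second = q.request_writes(offset=0, data=b'ABCD')
    # second == [{'offset': 0, 'data': b'ABCD'},
    #            {'offset': 4, 'data': b'EFGH'}]
    return first, second

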
@@ -0,0 +1,41 @@
# Copyright 2016 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You
# may not use this file except in compliance with the License. A copy of
# the License is located at
#
#     http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.
from concurrent.futures import CancelledError


class RetriesExceededError(Exception):
    def __init__(self, last_exception, msg='Max Retries Exceeded'):
        super().__init__(msg)
        self.last_exception = last_exception


class S3UploadFailedError(Exception):
    pass


class S3DownloadFailedError(Exception):
    pass


class InvalidSubscriberMethodError(Exception):
    pass


class TransferNotDoneError(Exception):
    pass


class FatalError(CancelledError):
    """A CancelledError raised from an error in the TransferManager"""

    pass


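# Illustrative sketch (not part of the library): RetriesExceededError
# carries the final retryable error, so callers can surface the root cause.
# The helper name and the ``transfer_future`` argument are assumptions; any
# object whose result() can raise RetriesExceededError would do.
def _example_surface_download_failure(transfer_future):
    try:
        return transfer_future.result()
    except RetriesExceededError as e:
        # e.last_exception is the final socket/read error that was retried.
        raise S3DownloadFailedError(
            f'Download failed after retries: {e.last_exception}'
        )

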
@@ -0,0 +1,628 @@
# Copyright 2016 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You
# may not use this file except in compliance with the License. A copy of
# the License is located at
#
#     http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.
import copy
import logging
import sys
import threading
from collections import namedtuple
from concurrent import futures

from s3transfer.compat import MAXINT
from s3transfer.exceptions import CancelledError, TransferNotDoneError
from s3transfer.utils import FunctionContainer, TaskSemaphore

try:
    from botocore.context import get_context
except ImportError:

    def get_context():
        return None


logger = logging.getLogger(__name__)


class BaseTransferFuture:
    @property
    def meta(self):
        """The metadata associated to the TransferFuture"""
        raise NotImplementedError('meta')

    def done(self):
        """Determines if a TransferFuture has completed

        :returns: True if completed. False, otherwise.
        """
        raise NotImplementedError('done()')

    def result(self):
        """Waits until TransferFuture is done and returns the result

        If the TransferFuture succeeded, it will return the result. If the
        TransferFuture failed, it will raise the exception associated to the
        failure.
        """
        raise NotImplementedError('result()')

    def cancel(self):
        """Cancels the request associated with the TransferFuture"""
        raise NotImplementedError('cancel()')


class BaseTransferMeta:
    @property
    def call_args(self):
        """The call args used in the transfer request"""
        raise NotImplementedError('call_args')

    @property
    def transfer_id(self):
        """The unique id of the transfer"""
        raise NotImplementedError('transfer_id')

    @property
    def user_context(self):
        """A dictionary that requesters can store data in"""
        raise NotImplementedError('user_context')


class TransferFuture(BaseTransferFuture):
    def __init__(self, meta=None, coordinator=None):
        """The future associated to a submitted transfer request

        :type meta: TransferMeta
        :param meta: The metadata associated to the request. This object
            is visible to the requester.

        :type coordinator: TransferCoordinator
        :param coordinator: The coordinator associated to the request. This
            object is not visible to the requester.
        """
        self._meta = meta
        if meta is None:
            self._meta = TransferMeta()

        self._coordinator = coordinator
        if coordinator is None:
            self._coordinator = TransferCoordinator()

    @property
    def meta(self):
        return self._meta

    def done(self):
        return self._coordinator.done()

    def result(self):
        try:
            # Usually the result() method blocks until the transfer is done;
            # however, if a KeyboardInterrupt is raised we want to exit
            # out of this and propagate the exception.
            return self._coordinator.result()
        except KeyboardInterrupt as e:
            self.cancel()
            raise e

    def cancel(self):
        self._coordinator.cancel()

    def set_exception(self, exception):
        """Sets the exception on the future."""
        if not self.done():
            raise TransferNotDoneError(
                'set_exception can only be called once the transfer is '
                'complete.'
            )
        self._coordinator.set_exception(exception, override=True)


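# Illustrative sketch (not part of the library): typical consumption of a
# TransferFuture returned by a TransferManager method. ``manager``,
# ``bucket``, ``key``, and ``filename`` are assumptions supplied by the
# caller; the helper name is hypothetical.
def _example_wait_on_future(manager, bucket, key, filename):
    future = manager.download(bucket, key, filename)
    # result() blocks until the transfer finishes; a KeyboardInterrupt
    # cancels the transfer and propagates, so Ctrl-C does not hang here.
    return future.result()

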
class TransferMeta(BaseTransferMeta):
    """Holds metadata about the TransferFuture"""

    def __init__(self, call_args=None, transfer_id=None):
        self._call_args = call_args
        self._transfer_id = transfer_id
        self._size = None
        self._user_context = {}
        self._etag = None

    @property
    def call_args(self):
        """The call args used in the transfer request"""
        return self._call_args

    @property
    def transfer_id(self):
        """The unique id of the transfer"""
        return self._transfer_id

    @property
    def size(self):
        """The size of the transfer request if known"""
        return self._size

    @property
    def user_context(self):
        """A dictionary that requesters can store data in"""
        return self._user_context

    @property
    def etag(self):
        """The etag of the stored object for validating multipart downloads"""
        return self._etag

    def provide_transfer_size(self, size):
        """A method to provide the size of a transfer request

        By providing this value, the TransferManager will not try to
        call HeadObject or use the OS to determine the size of the
        transfer.
        """
        self._size = size

    def provide_object_etag(self, etag):
        """A method to provide the etag of a transfer request

        By providing this value, the TransferManager will validate
        multipart downloads by supplying an IfMatch parameter with
        the etag as the value to GetObject requests.
        """
        self._etag = etag


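# Illustrative sketch (not part of the library): how a submission task
# seeds TransferMeta so that no redundant HeadObject call is made and
# ranged downloads are pinned to one object version via IfMatch.
# ``head_object_response`` is assumed to be a HeadObject response dict;
# the helper name is hypothetical.
def _example_seed_meta(meta, head_object_response):
    if meta.size is None:
        meta.provide_transfer_size(head_object_response['ContentLength'])
    if meta.etag is None:
        meta.provide_object_etag(head_object_response.get('ETag'))
    return meta.size, meta.etag

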
class TransferCoordinator:
    """A helper class for managing TransferFuture"""

    def __init__(self, transfer_id=None):
        self.transfer_id = transfer_id
        self._status = 'not-started'
        self._result = None
        self._exception = None
        self._associated_futures = set()
        self._failure_cleanups = []
        self._done_callbacks = []
        self._done_event = threading.Event()
        self._lock = threading.Lock()
        self._associated_futures_lock = threading.Lock()
        self._done_callbacks_lock = threading.Lock()
        self._failure_cleanups_lock = threading.Lock()

    def __repr__(self):
        return f'{self.__class__.__name__}(transfer_id={self.transfer_id})'

    @property
    def exception(self):
        return self._exception

    @property
    def associated_futures(self):
        """The set of futures associated to the in-progress TransferFuture

        Once the transfer finishes this set becomes empty as the transfer
        is considered done and there should be no running futures left.
        """
        with self._associated_futures_lock:
            # We return a copy of the set because we do not want the
            # returned copy to be processed while another thread is adding
            # more futures to the actual set.
            return copy.copy(self._associated_futures)

    @property
    def failure_cleanups(self):
        """The list of callbacks to call when the TransferFuture fails"""
        return self._failure_cleanups

    @property
    def status(self):
        """The status of the TransferFuture

        The currently supported states are:
            * not-started - Has yet to start. If in this state, a transfer
              can be canceled immediately and nothing will happen.
            * queued - SubmissionTask is about to submit tasks
            * running - In progress. Currently, in-progress means that the
              SubmissionTask that runs the transfer is being executed, so
              there is no guarantee that any transfer requests have been
              made to S3 when this state is reached.
            * cancelled - Was cancelled
            * failed - An exception other than CancelledError was thrown
            * success - No exceptions were thrown and is done.
        """
        return self._status

    def set_result(self, result):
        """Set a result for the TransferFuture

        Implies that the TransferFuture succeeded. This will always set a
        result because it is invoked on the final task, of which there is
        only ever one, and that task runs at the very end of the transfer
        process. So if a result is being set for this final task, the
        transfer succeeded even if something came along and canceled the
        transfer on the final task.
        """
        with self._lock:
            self._exception = None
            self._result = result
            self._status = 'success'

    def set_exception(self, exception, override=False):
        """Set an exception for the TransferFuture

        Implies the TransferFuture failed.

        :param exception: The exception that caused the transfer to fail.
        :param override: If True, override any existing state.
        """
        with self._lock:
            if not self.done() or override:
                self._exception = exception
                self._status = 'failed'

    def result(self):
        """Waits until TransferFuture is done and returns the result

        If the TransferFuture succeeded, it will return the result. If the
        TransferFuture failed, it will raise the exception associated to the
        failure.
        """
        # Doing a wait() with no timeout cannot be interrupted in python2 but
        # can be interrupted in python3, so we just wait with the largest
        # possible integer value, which is on the scale of billions of
        # years...
        self._done_event.wait(MAXINT)

        # Once done waiting, raise an exception if present or return the
        # final result.
        if self._exception:
            raise self._exception
        return self._result

    def cancel(self, msg='', exc_type=CancelledError):
        """Cancels the TransferFuture

        :param msg: The message to attach to the cancellation
        :param exc_type: The type of exception to set for the cancellation
        """
        with self._lock:
            if not self.done():
                should_announce_done = False
                logger.debug('%s cancel(%s) called', self, msg)
                self._exception = exc_type(msg)
                if self._status == 'not-started':
                    should_announce_done = True
                self._status = 'cancelled'
                if should_announce_done:
                    self.announce_done()

    def set_status_to_queued(self):
        """Sets the TransferFuture's status to queued"""
        self._transition_to_non_done_state('queued')

    def set_status_to_running(self):
        """Sets the TransferFuture's status to running"""
        self._transition_to_non_done_state('running')

    def _transition_to_non_done_state(self, desired_state):
        with self._lock:
            if self.done():
                raise RuntimeError(
                    f'Unable to transition from done state {self.status} to '
                    f'non-done state {desired_state}.'
                )
            self._status = desired_state

    def submit(self, executor, task, tag=None):
        """Submits a task to a provided executor

        :type executor: s3transfer.futures.BoundedExecutor
        :param executor: The executor to submit the callable to

        :type task: s3transfer.tasks.Task
        :param task: The task to submit to the executor

        :type tag: s3transfer.futures.TaskTag
        :param tag: A tag to associate to the submitted task

        :rtype: concurrent.futures.Future
        :returns: A future representing the submitted task
        """
        logger.debug(
            f"Submitting task {task} to executor {executor} for transfer request: {self.transfer_id}."
        )
        future = executor.submit(task, tag=tag)
        # Add this created future to the list of associated futures just
        # in case it is needed during cleanups.
        self.add_associated_future(future)
        future.add_done_callback(
            FunctionContainer(self.remove_associated_future, future)
        )
        return future

    def done(self):
        """Determines if a TransferFuture has completed

        :returns: True if status is equal to 'failed', 'cancelled', or
            'success'. False, otherwise.
        """
        return self.status in ['failed', 'cancelled', 'success']

    def add_associated_future(self, future):
        """Adds a future to be associated with the TransferFuture"""
        with self._associated_futures_lock:
            self._associated_futures.add(future)

    def remove_associated_future(self, future):
        """Removes a future's association to the TransferFuture"""
        with self._associated_futures_lock:
            self._associated_futures.remove(future)

    def add_done_callback(self, function, *args, **kwargs):
        """Add a done callback to be invoked when transfer is done"""
        with self._done_callbacks_lock:
            self._done_callbacks.append(
                FunctionContainer(function, *args, **kwargs)
            )

    def add_failure_cleanup(self, function, *args, **kwargs):
        """Adds a callback to call upon failure"""
        with self._failure_cleanups_lock:
            self._failure_cleanups.append(
                FunctionContainer(function, *args, **kwargs)
            )

    def announce_done(self):
        """Announce that future is done running and run associated callbacks

        This will run any failure cleanups if the transfer failed and they
        have not already been run, unblock result(), and run any done
        callbacks associated to the TransferFuture that have not already
        been run.
        """
        if self.status != 'success':
            self._run_failure_cleanups()
        self._done_event.set()
        self._run_done_callbacks()

    def _run_done_callbacks(self):
        # Run the callbacks and remove the callbacks from the internal
        # list so they do not get run again if done is announced more than
        # once.
        with self._done_callbacks_lock:
            self._run_callbacks(self._done_callbacks)
            self._done_callbacks = []

    def _run_failure_cleanups(self):
        # Run the cleanup callbacks and remove the callbacks from the
        # internal list so they do not get run again if done is announced
        # more than once.
        with self._failure_cleanups_lock:
            self._run_callbacks(self.failure_cleanups)
            self._failure_cleanups = []

    def _run_callbacks(self, callbacks):
        for callback in callbacks:
            self._run_callback(callback)

    def _run_callback(self, callback):
        try:
            callback()
        # We do not want a callback interrupting the process, especially
        # in the failure cleanups. So we log and swallow the exception.
        except Exception:
            logger.debug(f"Exception raised in {callback}.", exc_info=True)


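# Illustrative sketch (not part of the library): the coordinator's state
# machine for a transfer that runs to completion without being cancelled.
# The helper name is hypothetical.
def _example_coordinator_lifecycle():
    coordinator = TransferCoordinator(transfer_id=1)
    coordinator.set_status_to_queued()   # status == 'queued'
    coordinator.set_status_to_running()  # status == 'running'
    coordinator.set_result('done')       # status == 'success'
    coordinator.announce_done()          # unblocks result(), runs callbacks
    return coordinator.result()          # returns 'done'

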
class BoundedExecutor:
    EXECUTOR_CLS = futures.ThreadPoolExecutor

    def __init__(
        self, max_size, max_num_threads, tag_semaphores=None, executor_cls=None
    ):
        """An executor implementation that has a maximum number of queued tasks

        The executor will block if the number of tasks that have been
        submitted and are currently being worked on exceeds its maximum.

        :params max_size: The maximum number of inflight futures. An inflight
            future means that the task is either queued up or is currently
            being executed. A size of None or 0 means that the executor will
            have no bound in terms of the number of inflight futures.

        :params max_num_threads: The maximum number of threads the executor
            uses.

        :type tag_semaphores: dict
        :params tag_semaphores: A dictionary where the key is the name of the
            tag and the value is the semaphore to use when limiting the
            number of tasks the executor is processing at a time.

        :type executor_cls: BaseExecutor
        :param executor_cls: The executor class that gets bounded by this
            executor. If None is provided, the
            concurrent.futures.ThreadPoolExecutor class is used.
        """
        self._max_num_threads = max_num_threads
        if executor_cls is None:
            executor_cls = self.EXECUTOR_CLS
        self._executor = executor_cls(max_workers=self._max_num_threads)
        self._semaphore = TaskSemaphore(max_size)
        self._tag_semaphores = tag_semaphores

    def submit(self, task, tag=None, block=True):
        """Submit a task to complete

        :type task: s3transfer.tasks.Task
        :param task: The task to run __call__ on

        :type tag: s3transfer.futures.TaskTag
        :param tag: An optional tag to associate to the task. This
            is used to override which semaphore to use.

        :type block: boolean
        :param block: True to wait until it is possible to submit a task.
            False to raise an error if it is not possible to submit a task
            without waiting.

        :returns: The future associated to the submitted task
        """
        semaphore = self._semaphore
        # If a tag was provided, use the semaphore associated to that
        # tag.
        if tag:
            semaphore = self._tag_semaphores[tag]

        # Call acquire on the semaphore.
        acquire_token = semaphore.acquire(task.transfer_id, block)
        # Create a callback to invoke when task is done in order to call
        # release on the semaphore.
        release_callback = FunctionContainer(
            semaphore.release, task.transfer_id, acquire_token
        )
        # Submit the task to the underlying executor.
        # Pass the current context to ensure child threads persist the
        # parent thread's context.
        future = ExecutorFuture(self._executor.submit(task, get_context()))
        # Add the Semaphore.release() callback to the future such that
        # it is invoked once the future completes.
        future.add_done_callback(release_callback)
        return future

    def shutdown(self, wait=True):
        self._executor.shutdown(wait)


class ExecutorFuture:
    def __init__(self, future):
        """A future returned from the executor

        Currently, it is just a wrapper around a concurrent.futures.Future.
        However, this can eventually grow to implement the needed
        functionality of concurrent.futures.Future if we move off of the
        library without affecting the rest of the codebase.

        :type future: concurrent.futures.Future
        :param future: The underlying future
        """
        self._future = future

    def result(self):
        return self._future.result()

    def add_done_callback(self, fn):
        """Adds a callback to be completed once future is done

        :param fn: A callable that takes no arguments. Note that this is
            different from concurrent.futures.Future.add_done_callback,
            which requires a single argument for the future.
        """

        # The done callback for concurrent.futures.Future will always pass
        # the future in as the only argument. So we need to create the
        # proper signature wrapper that will invoke the callback provided.
        def done_callback(future_passed_to_callback):
            return fn()

        self._future.add_done_callback(done_callback)

    def done(self):
        return self._future.done()


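# Illustrative sketch (not part of the library): ExecutorFuture adapts
# concurrent.futures' one-argument done callbacks to the zero-argument
# callables used throughout s3transfer. The helper name is hypothetical.
def _example_executor_future_callback():
    from concurrent.futures import ThreadPoolExecutor

    with ThreadPoolExecutor(max_workers=1) as executor:
        future = ExecutorFuture(executor.submit(lambda: 42))
        # The callback takes no arguments, unlike a raw
        # concurrent.futures.Future done callback.
        future.add_done_callback(lambda: logger.debug('part finished'))
        return future.result()  # 42

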
class BaseExecutor:
    """Base Executor class implementation needed to work with s3transfer"""

    def __init__(self, max_workers=None):
        pass

    def submit(self, fn, *args, **kwargs):
        raise NotImplementedError('submit()')

    def shutdown(self, wait=True):
        raise NotImplementedError('shutdown()')


class NonThreadedExecutor(BaseExecutor):
    """A drop-in replacement non-threaded version of ThreadPoolExecutor"""

    def submit(self, fn, *args, **kwargs):
        future = NonThreadedExecutorFuture()
        try:
            result = fn(*args, **kwargs)
            future.set_result(result)
        except Exception:
            e, tb = sys.exc_info()[1:]
            logger.debug(
                'Setting exception for %s to %s with traceback %s',
                future,
                e,
                tb,
            )
            future.set_exception_info(e, tb)
        return future

    def shutdown(self, wait=True):
        pass


class NonThreadedExecutorFuture:
    """The Future returned from NonThreadedExecutor

    Note that this future is **not** thread-safe as it is being used
    from the context of a non-threaded environment.
    """

    def __init__(self):
        self._result = None
        self._exception = None
        self._traceback = None
        self._done = False
        self._done_callbacks = []

    def set_result(self, result):
        self._result = result
        self._set_done()

    def set_exception_info(self, exception, traceback):
        self._exception = exception
        self._traceback = traceback
        self._set_done()

    def result(self, timeout=None):
        if self._exception:
            raise self._exception.with_traceback(self._traceback)
        return self._result

    def _set_done(self):
        self._done = True
        for done_callback in self._done_callbacks:
            self._invoke_done_callback(done_callback)
        self._done_callbacks = []

    def _invoke_done_callback(self, done_callback):
        return done_callback(self)

    def done(self):
        return self._done

    def add_done_callback(self, fn):
        if self._done:
            self._invoke_done_callback(fn)
        else:
            self._done_callbacks.append(fn)


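# Illustrative sketch (not part of the library): NonThreadedExecutor runs
# the callable inline during submit(), which is useful for debugging or
# for environments where threads are undesirable, e.g.
# TransferManager(client, executor_cls=NonThreadedExecutor). The helper
# name is hypothetical.
def _example_non_threaded_executor():
    executor = NonThreadedExecutor()
    future = executor.submit(sum, [1, 2, 3])
    # The future is already done because submit() ran synchronously.
    assert future.done()
    return future.result()  # 6

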
TaskTag = namedtuple('TaskTag', ['name'])

IN_MEMORY_UPLOAD_TAG = TaskTag('in_memory_upload')
IN_MEMORY_DOWNLOAD_TAG = TaskTag('in_memory_download')
@@ -0,0 +1,755 @@
# Copyright 2016 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You
# may not use this file except in compliance with the License. A copy of
# the License is located at
#
#     http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.
import copy
import logging
import re
import threading

from s3transfer.bandwidth import BandwidthLimiter, LeakyBucket
from s3transfer.constants import (
    ALLOWED_DOWNLOAD_ARGS,
    FULL_OBJECT_CHECKSUM_ARGS,
    KB,
    MB,
)
from s3transfer.copies import CopySubmissionTask
from s3transfer.delete import DeleteSubmissionTask
from s3transfer.download import DownloadSubmissionTask
from s3transfer.exceptions import CancelledError, FatalError
from s3transfer.futures import (
    IN_MEMORY_DOWNLOAD_TAG,
    IN_MEMORY_UPLOAD_TAG,
    BoundedExecutor,
    TransferCoordinator,
    TransferFuture,
    TransferMeta,
)
from s3transfer.upload import UploadSubmissionTask
from s3transfer.utils import (
    CallArgs,
    OSUtils,
    SlidingWindowSemaphore,
    TaskSemaphore,
    get_callbacks,
    set_default_checksum_algorithm,
    signal_not_transferring,
    signal_transferring,
)

logger = logging.getLogger(__name__)


class TransferConfig:
    def __init__(
        self,
        multipart_threshold=8 * MB,
        multipart_chunksize=8 * MB,
        max_request_concurrency=10,
        max_submission_concurrency=5,
        max_request_queue_size=1000,
        max_submission_queue_size=1000,
        max_io_queue_size=1000,
        io_chunksize=256 * KB,
        num_download_attempts=5,
        max_in_memory_upload_chunks=10,
        max_in_memory_download_chunks=10,
        max_bandwidth=None,
    ):
        """Configurations for the transfer manager

        :param multipart_threshold: The threshold for which multipart
            transfers occur.

        :param max_request_concurrency: The maximum number of S3 API
            transfer-related requests that can happen at a time.

        :param max_submission_concurrency: The maximum number of threads
            processing a call to a TransferManager method. Processing a
            call usually entails determining which S3 API requests need
            to be enqueued, but does **not** entail making any of the
            S3 API data transferring requests needed to perform the
            transfer. The threads controlled by ``max_request_concurrency``
            are responsible for that.

        :param multipart_chunksize: The size of each transfer if a request
            becomes a multipart transfer.

        :param max_request_queue_size: The maximum amount of S3 API requests
            that can be queued at a time.

        :param max_submission_queue_size: The maximum amount of
            TransferManager method calls that can be queued at a time.

        :param max_io_queue_size: The maximum amount of read parts that
            can be queued to be written to disk per download. Each element
            in this queue is at most ``io_chunksize`` bytes.

        :param io_chunksize: The max size of each chunk in the io queue.
            Currently, this is the size used when reading from the
            downloaded stream as well.

        :param num_download_attempts: The number of download attempts that
            will be tried upon errors with downloading an object in S3. Note
            that these retries account for errors that occur when streaming
            down the data from S3 (i.e. socket errors and read timeouts that
            occur after receiving an OK response from S3).
            Other retryable exceptions such as throttling errors and 5xx
            errors are already retried by botocore (this default is 5). The
            ``num_download_attempts`` does not take into account the
            number of exceptions retried by botocore.

        :param max_in_memory_upload_chunks: The number of chunks that can
            be stored in memory at a time for all ongoing upload requests.
            This pertains to chunks of data that need to be stored in memory
            during an upload if the data is sourced from a file-like object.
            The total maximum memory footprint due to in-memory upload
            chunks is roughly equal to:

                max_in_memory_upload_chunks * multipart_chunksize
                + max_submission_concurrency * multipart_chunksize

            ``max_submission_concurrency`` has an effect on this value
            because each thread pulling data off of a file-like object may
            be waiting with a single read chunk to be submitted for upload
            when the ``max_in_memory_upload_chunks`` value has been reached
            by the threads making the upload request.

        :param max_in_memory_download_chunks: The number of chunks that can
            be buffered in memory and **not** in the io queue at a time for
            all ongoing download requests. This pertains specifically to
            file-like objects that are not seekable. The total maximum
            memory footprint due to in-memory download chunks is roughly
            equal to:

                max_in_memory_download_chunks * multipart_chunksize

        :param max_bandwidth: The maximum bandwidth that will be consumed
            in uploading and downloading file content. The value is in terms
            of bytes per second.
        """
        self.multipart_threshold = multipart_threshold
        self.multipart_chunksize = multipart_chunksize
        self.max_request_concurrency = max_request_concurrency
        self.max_submission_concurrency = max_submission_concurrency
        self.max_request_queue_size = max_request_queue_size
        self.max_submission_queue_size = max_submission_queue_size
        self.max_io_queue_size = max_io_queue_size
        self.io_chunksize = io_chunksize
        self.num_download_attempts = num_download_attempts
        self.max_in_memory_upload_chunks = max_in_memory_upload_chunks
        self.max_in_memory_download_chunks = max_in_memory_download_chunks
        self.max_bandwidth = max_bandwidth
        self._validate_attrs_are_nonzero()

    def _validate_attrs_are_nonzero(self):
        for attr, attr_val in self.__dict__.items():
            if attr_val is not None and attr_val <= 0:
                raise ValueError(
                    f'Provided parameter {attr} of value {attr_val} must '
                    'be greater than 0.'
                )


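# Illustrative sketch (not part of the library): the approximate upload
# memory ceiling described in the docstring above, computed for the
# defaults. The helper name is hypothetical.
def _example_upload_memory_ceiling(config=None):
    config = config or TransferConfig()
    ceiling = (
        config.max_in_memory_upload_chunks * config.multipart_chunksize
        + config.max_submission_concurrency * config.multipart_chunksize
    )
    # With the defaults this is (10 + 5) * 8 MB == 120 MB.
    return ceiling

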
class TransferManager:
|
||||
ALLOWED_DOWNLOAD_ARGS = ALLOWED_DOWNLOAD_ARGS
|
||||
|
||||
_ALLOWED_SHARED_ARGS = [
|
||||
'ACL',
|
||||
'CacheControl',
|
||||
'ChecksumAlgorithm',
|
||||
'ContentDisposition',
|
||||
'ContentEncoding',
|
||||
'ContentLanguage',
|
||||
'ContentType',
|
||||
'ExpectedBucketOwner',
|
||||
'Expires',
|
||||
'GrantFullControl',
|
||||
'GrantRead',
|
||||
'GrantReadACP',
|
||||
'GrantWriteACP',
|
||||
'Metadata',
|
||||
'ObjectLockLegalHoldStatus',
|
||||
'ObjectLockMode',
|
||||
'ObjectLockRetainUntilDate',
|
||||
'RequestPayer',
|
||||
'ServerSideEncryption',
|
||||
'StorageClass',
|
||||
'SSECustomerAlgorithm',
|
||||
'SSECustomerKey',
|
||||
'SSECustomerKeyMD5',
|
||||
'SSEKMSKeyId',
|
||||
'SSEKMSEncryptionContext',
|
||||
'Tagging',
|
||||
'WebsiteRedirectLocation',
|
||||
]
|
||||
|
||||
ALLOWED_UPLOAD_ARGS = (
|
||||
_ALLOWED_SHARED_ARGS
|
||||
+ [
|
||||
'ChecksumType',
|
||||
'MpuObjectSize',
|
||||
]
|
||||
+ FULL_OBJECT_CHECKSUM_ARGS
|
||||
)
|
||||
|
||||
ALLOWED_COPY_ARGS = _ALLOWED_SHARED_ARGS + [
|
||||
'CopySourceIfMatch',
|
||||
'CopySourceIfModifiedSince',
|
||||
'CopySourceIfNoneMatch',
|
||||
'CopySourceIfUnmodifiedSince',
|
||||
'CopySourceSSECustomerAlgorithm',
|
||||
'CopySourceSSECustomerKey',
|
||||
'CopySourceSSECustomerKeyMD5',
|
||||
'MetadataDirective',
|
||||
'TaggingDirective',
|
||||
]
|
||||
|
||||
ALLOWED_DELETE_ARGS = [
|
||||
'MFA',
|
||||
'VersionId',
|
||||
'RequestPayer',
|
||||
'ExpectedBucketOwner',
|
||||
]
|
||||
|
||||
VALIDATE_SUPPORTED_BUCKET_VALUES = True
|
||||
|
||||
_UNSUPPORTED_BUCKET_PATTERNS = {
|
||||
'S3 Object Lambda': re.compile(
|
||||
r'^arn:(aws).*:s3-object-lambda:[a-z\-0-9]+:[0-9]{12}:'
|
||||
r'accesspoint[/:][a-zA-Z0-9\-]{1,63}'
|
||||
),
|
||||
}
|
||||
|
||||
def __init__(self, client, config=None, osutil=None, executor_cls=None):
|
||||
"""A transfer manager interface for Amazon S3
|
||||
|
||||
:param client: Client to be used by the manager
|
||||
:param config: TransferConfig to associate specific configurations
|
||||
:param osutil: OSUtils object to use for os-related behavior when
|
||||
using with transfer manager.
|
||||
|
||||
:type executor_cls: s3transfer.futures.BaseExecutor
|
||||
:param executor_cls: The class of executor to use with the transfer
|
||||
manager. By default, concurrent.futures.ThreadPoolExecutor is used.
|
||||
"""
|
||||
self._client = client
|
||||
self._config = config
|
||||
if config is None:
|
||||
self._config = TransferConfig()
|
||||
self._osutil = osutil
|
||||
if osutil is None:
|
||||
self._osutil = OSUtils()
|
||||
self._coordinator_controller = TransferCoordinatorController()
|
||||
# A counter to create unique id's for each transfer submitted.
|
||||
self._id_counter = 0
|
||||
|
||||
# The executor responsible for making S3 API transfer requests
|
||||
self._request_executor = BoundedExecutor(
|
||||
max_size=self._config.max_request_queue_size,
|
||||
max_num_threads=self._config.max_request_concurrency,
|
||||
tag_semaphores={
|
||||
IN_MEMORY_UPLOAD_TAG: TaskSemaphore(
|
||||
self._config.max_in_memory_upload_chunks
|
||||
),
|
||||
IN_MEMORY_DOWNLOAD_TAG: SlidingWindowSemaphore(
|
||||
self._config.max_in_memory_download_chunks
|
||||
),
|
||||
},
|
||||
executor_cls=executor_cls,
|
||||
)
|
||||
|
||||
# The executor responsible for submitting the necessary tasks to
|
||||
# perform the desired transfer
|
||||
self._submission_executor = BoundedExecutor(
|
||||
max_size=self._config.max_submission_queue_size,
|
||||
max_num_threads=self._config.max_submission_concurrency,
|
||||
executor_cls=executor_cls,
|
||||
)
|
||||
|
||||
# There is one thread available for writing to disk. It will handle
|
||||
# downloads for all files.
|
||||
self._io_executor = BoundedExecutor(
|
||||
max_size=self._config.max_io_queue_size,
|
||||
max_num_threads=1,
|
||||
executor_cls=executor_cls,
|
||||
)
|
||||
|
||||
# The component responsible for limiting bandwidth usage if it
|
||||
# is configured.
|
||||
self._bandwidth_limiter = None
|
||||
if self._config.max_bandwidth is not None:
|
||||
logger.debug(
|
||||
'Setting max_bandwidth to %s', self._config.max_bandwidth
|
||||
)
|
||||
leaky_bucket = LeakyBucket(self._config.max_bandwidth)
|
||||
self._bandwidth_limiter = BandwidthLimiter(leaky_bucket)
|
||||
|
||||
self._register_handlers()
|
||||
|
||||
@property
|
||||
def client(self):
|
||||
return self._client
|
||||
|
||||
@property
|
||||
def config(self):
|
||||
return self._config
|
||||
|
||||
def upload(self, fileobj, bucket, key, extra_args=None, subscribers=None):
|
||||
"""Uploads a file to S3
|
||||
|
||||
:type fileobj: str or seekable file-like object
|
||||
:param fileobj: The name of a file to upload or a seekable file-like
|
||||
object to upload. It is recommended to use a filename because
|
||||
file-like objects may result in higher memory usage.
|
||||
|
||||
:type bucket: str
|
||||
:param bucket: The name of the bucket to upload to
|
||||
|
||||
:type key: str
|
||||
:param key: The name of the key to upload to
|
||||
|
||||
:type extra_args: dict
|
||||
:param extra_args: Extra arguments that may be passed to the
|
||||
client operation
|
||||
|
||||
:type subscribers: list(s3transfer.subscribers.BaseSubscriber)
|
||||
:param subscribers: The list of subscribers to be invoked in the
|
||||
order provided based on the event emit during the process of
|
||||
the transfer request.
|
||||
|
||||
:rtype: s3transfer.futures.TransferFuture
|
||||
:returns: Transfer future representing the upload
|
||||
"""
|
||||
|
||||
extra_args = extra_args.copy() if extra_args else {}
|
||||
if subscribers is None:
|
||||
subscribers = []
|
||||
self._validate_all_known_args(extra_args, self.ALLOWED_UPLOAD_ARGS)
|
||||
self._validate_if_bucket_supported(bucket)
|
||||
self._add_operation_defaults(extra_args)
|
||||
call_args = CallArgs(
|
||||
fileobj=fileobj,
|
||||
bucket=bucket,
|
||||
key=key,
|
||||
extra_args=extra_args,
|
||||
subscribers=subscribers,
|
||||
)
|
||||
extra_main_kwargs = {}
|
||||
if self._bandwidth_limiter:
|
||||
extra_main_kwargs['bandwidth_limiter'] = self._bandwidth_limiter
|
||||
return self._submit_transfer(
|
||||
call_args, UploadSubmissionTask, extra_main_kwargs
|
||||
)
|
||||
|
||||
def download(
|
||||
self, bucket, key, fileobj, extra_args=None, subscribers=None
|
||||
):
|
||||
"""Downloads a file from S3
|
||||
|
||||
:type bucket: str
|
||||
:param bucket: The name of the bucket to download from
|
||||
|
||||
:type key: str
|
||||
:param key: The name of the key to download from
|
||||
|
||||
:type fileobj: str or seekable file-like object
|
||||
:param fileobj: The name of a file to download or a seekable file-like
|
||||
object to download. It is recommended to use a filename because
|
||||
file-like objects may result in higher memory usage.
|
||||
|
||||
:type extra_args: dict
|
||||
:param extra_args: Extra arguments that may be passed to the
|
||||
client operation
|
||||
|
||||
:type subscribers: list(s3transfer.subscribers.BaseSubscriber)
|
||||
:param subscribers: The list of subscribers to be invoked in the
|
||||
order provided based on the event emit during the process of
|
||||
the transfer request.
|
||||
|
||||
:rtype: s3transfer.futures.TransferFuture
|
||||
:returns: Transfer future representing the download
|
||||
"""
|
||||
if extra_args is None:
|
||||
extra_args = {}
|
||||
if subscribers is None:
|
||||
subscribers = []
|
||||
self._validate_all_known_args(extra_args, self.ALLOWED_DOWNLOAD_ARGS)
|
||||
self._validate_if_bucket_supported(bucket)
|
||||
call_args = CallArgs(
|
||||
bucket=bucket,
|
||||
key=key,
|
||||
fileobj=fileobj,
|
||||
extra_args=extra_args,
|
||||
subscribers=subscribers,
|
||||
)
|
||||
extra_main_kwargs = {'io_executor': self._io_executor}
|
||||
if self._bandwidth_limiter:
|
||||
extra_main_kwargs['bandwidth_limiter'] = self._bandwidth_limiter
|
||||
return self._submit_transfer(
|
||||
call_args, DownloadSubmissionTask, extra_main_kwargs
|
||||
)
|
||||
|
||||
    def copy(
        self,
        copy_source,
        bucket,
        key,
        extra_args=None,
        subscribers=None,
        source_client=None,
    ):
        """Copies a file in S3

        :type copy_source: dict
        :param copy_source: The name of the source bucket, key name of the
            source object, and optional version ID of the source object. The
            dictionary format is:
            ``{'Bucket': 'bucket', 'Key': 'key', 'VersionId': 'id'}``. Note
            that the ``VersionId`` key is optional and may be omitted.

        :type bucket: str
        :param bucket: The name of the bucket to copy to

        :type key: str
        :param key: The name of the key to copy to

        :type extra_args: dict
        :param extra_args: Extra arguments that may be passed to the
            client operation

        :type subscribers: a list of subscribers
        :param subscribers: The list of subscribers to be invoked in the
            order provided based on the events emitted during the
            transfer request.

        :type source_client: botocore or boto3 Client
        :param source_client: The client to be used for operations that
            may happen at the source object. For example, this client is
            used for the head_object that determines the size of the copy.
            If no client is provided, the transfer manager's client is used
            as the client for the source object.

        :rtype: s3transfer.futures.TransferFuture
        :returns: Transfer future representing the copy
        """
        if extra_args is None:
            extra_args = {}
        if subscribers is None:
            subscribers = []
        if source_client is None:
            source_client = self._client
        self._validate_all_known_args(extra_args, self.ALLOWED_COPY_ARGS)
        if isinstance(copy_source, dict):
            self._validate_if_bucket_supported(copy_source.get('Bucket'))
        self._validate_if_bucket_supported(bucket)
        call_args = CallArgs(
            copy_source=copy_source,
            bucket=bucket,
            key=key,
            extra_args=extra_args,
            subscribers=subscribers,
            source_client=source_client,
        )
        return self._submit_transfer(call_args, CopySubmissionTask)

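    # A minimal usage sketch for copy() above, assuming a constructed
    # TransferManager named `manager`:
    #
    #   future = manager.copy(
    #       {'Bucket': 'srcbucket', 'Key': 'srckey'},
    #       'destbucket', 'destkey')
    #   future.result()
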
    def delete(self, bucket, key, extra_args=None, subscribers=None):
        """Delete an S3 object.

        :type bucket: str
        :param bucket: The name of the bucket.

        :type key: str
        :param key: The name of the S3 object to delete.

        :type extra_args: dict
        :param extra_args: Extra arguments that may be passed to the
            DeleteObject call.

        :type subscribers: list
        :param subscribers: A list of subscribers to be invoked during the
            process of the transfer request. Note that the ``on_progress``
            callback is not invoked during object deletion.

        :rtype: s3transfer.futures.TransferFuture
        :return: Transfer future representing the deletion.
        """
        if extra_args is None:
            extra_args = {}
        if subscribers is None:
            subscribers = []
        self._validate_all_known_args(extra_args, self.ALLOWED_DELETE_ARGS)
        self._validate_if_bucket_supported(bucket)
        call_args = CallArgs(
            bucket=bucket,
            key=key,
            extra_args=extra_args,
            subscribers=subscribers,
        )
        return self._submit_transfer(call_args, DeleteSubmissionTask)

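    # A minimal usage sketch for delete() above, assuming a constructed
    # TransferManager named `manager`:
    #
    #   future = manager.delete('mybucket', 'mykey')
    #   future.result()  # raises if the DeleteObject call failed
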
    def _validate_if_bucket_supported(self, bucket):
        # S3 high-level operations don't support some resources
        # (e.g. S3 Object Lambda); only direct API calls are available
        # for such resources.
        if self.VALIDATE_SUPPORTED_BUCKET_VALUES:
            for resource, pattern in self._UNSUPPORTED_BUCKET_PATTERNS.items():
                match = pattern.match(bucket)
                if match:
                    raise ValueError(
                        f'TransferManager methods do not support {resource} '
                        'resource. Use direct client calls instead.'
                    )

    def _validate_all_known_args(self, actual, allowed):
        for kwarg in actual:
            if kwarg not in allowed:
                raise ValueError(
                    "Invalid extra_args key '{}', must be one of: {}".format(
                        kwarg, ', '.join(allowed)
                    )
                )

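    # A minimal sketch of what _validate_all_known_args() rejects, assuming
    # a constructed TransferManager named `manager`; the extra_args key
    # below is deliberately made up:
    #
    #   manager.download('bucket', 'key', '/tmp/f',
    #                    extra_args={'NotARealArg': 1})
    #   # ValueError: Invalid extra_args key 'NotARealArg', must be one of: ...
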
    def _add_operation_defaults(self, extra_args):
        if (
            self.client.meta.config.request_checksum_calculation
            == "when_supported"
        ):
            set_default_checksum_algorithm(extra_args)

    def _submit_transfer(
        self, call_args, submission_task_cls, extra_main_kwargs=None
    ):
        if not extra_main_kwargs:
            extra_main_kwargs = {}

        # Create a TransferFuture to return back to the user
        transfer_future, components = self._get_future_with_components(
            call_args
        )

        # Add any provided done callbacks to the created transfer future
        # to be invoked on the transfer future being complete.
        for callback in get_callbacks(transfer_future, 'done'):
            components['coordinator'].add_done_callback(callback)

        # Get the main kwargs needed to instantiate the submission task
        main_kwargs = self._get_submission_task_main_kwargs(
            transfer_future, extra_main_kwargs
        )

        # Submit a SubmissionTask that will submit all of the necessary
        # tasks needed to complete the S3 transfer.
        self._submission_executor.submit(
            submission_task_cls(
                transfer_coordinator=components['coordinator'],
                main_kwargs=main_kwargs,
            )
        )

        # Increment the unique id counter for future transfer requests
        self._id_counter += 1

        return transfer_future

    def _get_future_with_components(self, call_args):
        transfer_id = self._id_counter
        # Creates a new transfer future along with its components
        transfer_coordinator = TransferCoordinator(transfer_id=transfer_id)
        # Track the transfer coordinator for transfers to manage.
        self._coordinator_controller.add_transfer_coordinator(
            transfer_coordinator
        )
        # Also make sure that the transfer coordinator is removed once
        # the transfer completes so it does not stick around in memory.
        transfer_coordinator.add_done_callback(
            self._coordinator_controller.remove_transfer_coordinator,
            transfer_coordinator,
        )
        components = {
            'meta': TransferMeta(call_args, transfer_id=transfer_id),
            'coordinator': transfer_coordinator,
        }
        transfer_future = TransferFuture(**components)
        return transfer_future, components

    def _get_submission_task_main_kwargs(
        self, transfer_future, extra_main_kwargs
    ):
        main_kwargs = {
            'client': self._client,
            'config': self._config,
            'osutil': self._osutil,
            'request_executor': self._request_executor,
            'transfer_future': transfer_future,
        }
        main_kwargs.update(extra_main_kwargs)
        return main_kwargs

    def _register_handlers(self):
        # Register handlers to enable/disable callbacks on uploads.
        event_name = 'request-created.s3'
        self._client.meta.events.register_first(
            event_name,
            signal_not_transferring,
            unique_id='s3upload-not-transferring',
        )
        self._client.meta.events.register_last(
            event_name, signal_transferring, unique_id='s3upload-transferring'
        )

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, *args):
        cancel = False
        cancel_msg = ''
        cancel_exc_type = FatalError
        # If an exception was raised in the context handler, signal to cancel
        # all of the in-progress futures in the shutdown.
        if exc_type:
            cancel = True
            cancel_msg = str(exc_value)
            if not cancel_msg:
                cancel_msg = repr(exc_value)
            # If it was a KeyboardInterrupt, the cancellation was initiated
            # by the user.
            if isinstance(exc_value, KeyboardInterrupt):
                cancel_exc_type = CancelledError
        self._shutdown(cancel, cancel_msg, cancel_exc_type)

    def shutdown(self, cancel=False, cancel_msg=''):
        """Shutdown the TransferManager

        It will wait till all transfers complete before it completely shuts
        down.

        :type cancel: boolean
        :param cancel: If True, calls TransferFuture.cancel() for
            all in-progress transfers. This is useful if you want the
            shutdown to happen quicker.

        :type cancel_msg: str
        :param cancel_msg: The message to specify if canceling all in-progress
            transfers.
        """
        self._shutdown(cancel, cancel_msg)

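    # A minimal sketch of the context-manager behaviour above, assuming a
    # boto3 S3 client named `client`; leaving the block waits for (or, on
    # an exception, cancels) any in-progress transfers:
    #
    #   with TransferManager(client) as manager:
    #       manager.download('mybucket', 'mykey', '/tmp/myfile')
    #   # __exit__ has already run _shutdown() at this point
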
    def _shutdown(self, cancel, cancel_msg, exc_type=CancelledError):
        if cancel:
            # Cancel all in-flight transfers if requested, before waiting
            # for them to complete.
            self._coordinator_controller.cancel(cancel_msg, exc_type)
        try:
            # Wait until there are no more in-progress transfers. This is
            # wrapped in a try statement because this can be interrupted
            # with a KeyboardInterrupt that needs to be caught.
            self._coordinator_controller.wait()
        except KeyboardInterrupt:
            # If no errors were raised in the try block, the cancel should
            # have no coordinators it needs to run cancel on. If there was
            # an error raised in the try statement we want to cancel all of
            # the in-flight transfers before shutting down to speed that
            # process up.
            self._coordinator_controller.cancel('KeyboardInterrupt()')
            raise
        finally:
            # Shutdown all of the executors.
            self._submission_executor.shutdown()
            self._request_executor.shutdown()
            self._io_executor.shutdown()


class TransferCoordinatorController:
    def __init__(self):
        """Abstraction to control all transfer coordinators

        This abstraction allows the manager to wait for in-progress
        transfers to complete and cancel all in-progress transfers.
        """
        self._lock = threading.Lock()
        self._tracked_transfer_coordinators = set()

    @property
    def tracked_transfer_coordinators(self):
        """The set of transfer coordinators being tracked"""
        with self._lock:
            # We return a copy because the set is mutable and if you were to
            # iterate over the set, it may be changing in length due to
            # additions and removals of transfer coordinators.
            return copy.copy(self._tracked_transfer_coordinators)

    def add_transfer_coordinator(self, transfer_coordinator):
        """Adds a transfer coordinator of a transfer to be canceled if needed

        :type transfer_coordinator: s3transfer.futures.TransferCoordinator
        :param transfer_coordinator: The transfer coordinator for the
            particular transfer
        """
        with self._lock:
            self._tracked_transfer_coordinators.add(transfer_coordinator)

    def remove_transfer_coordinator(self, transfer_coordinator):
        """Remove a transfer coordinator from cancellation consideration

        Typically, this method is invoked by the transfer coordinator itself
        to remove itself when it completes its transfer.

        :type transfer_coordinator: s3transfer.futures.TransferCoordinator
        :param transfer_coordinator: The transfer coordinator for the
            particular transfer
        """
        with self._lock:
            self._tracked_transfer_coordinators.remove(transfer_coordinator)

    def cancel(self, msg='', exc_type=CancelledError):
        """Cancels all in-progress transfers

        This cancels the in-progress transfers by calling cancel() on all
        tracked transfer coordinators.

        :param msg: The message to pass on to each transfer coordinator that
            gets cancelled.

        :param exc_type: The type of exception to set for the cancellation
        """
        for transfer_coordinator in self.tracked_transfer_coordinators:
            transfer_coordinator.cancel(msg, exc_type)

    def wait(self):
        """Wait until there are no more in-progress transfers

        This will not stop when failures are encountered, nor will it
        propagate errors from failed transfers, but it can be interrupted
        with a KeyboardInterrupt.
        """
        try:
            transfer_coordinator = None
            for transfer_coordinator in self.tracked_transfer_coordinators:
                transfer_coordinator.result()
        except KeyboardInterrupt:
            logger.debug('Received KeyboardInterrupt in wait()')
            # If a KeyboardInterrupt is raised while waiting for
            # the result, then exit out of the wait and raise the
            # exception.
            if transfer_coordinator:
                logger.debug(
                    'On KeyboardInterrupt was waiting for %s',
                    transfer_coordinator,
                )
            raise
        except Exception:
            # A general exception could have been thrown because
            # of result(). We just want to ignore this and continue
            # because we at least know that the transfer coordinator
            # has completed.
            pass
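

# A minimal sketch of the controller's lifecycle, assuming a
# s3transfer.futures.TransferCoordinator instance named `coordinator`
# (this mirrors what TransferManager._get_future_with_components does):
#
#   controller = TransferCoordinatorController()
#   controller.add_transfer_coordinator(coordinator)
#   coordinator.add_done_callback(
#       controller.remove_transfer_coordinator, coordinator)
#   controller.wait()  # blocks until tracked transfers finish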
@@ -0,0 +1,94 @@
# Copyright 2016 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You
# may not use this file except in compliance with the License. A copy of
# the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.
from functools import lru_cache

from s3transfer.compat import accepts_kwargs
from s3transfer.exceptions import InvalidSubscriberMethodError


class BaseSubscriber:
    """The base subscriber class

    It is recommended that all subscriber implementations subclass and then
    override the subscription methods (i.e. the on_{subscribe_type}()
    methods).
    """

    VALID_SUBSCRIBER_TYPES = ['queued', 'progress', 'done']

    def __new__(cls, *args, **kwargs):
        cls._validate_subscriber_methods()
        return super().__new__(cls)

    @classmethod
    @lru_cache
    def _validate_subscriber_methods(cls):
        for subscriber_type in cls.VALID_SUBSCRIBER_TYPES:
            subscriber_method = getattr(cls, 'on_' + subscriber_type)
            if not callable(subscriber_method):
                raise InvalidSubscriberMethodError(
                    f'Subscriber method {subscriber_method} must be callable.'
                )

            if not accepts_kwargs(subscriber_method):
                raise InvalidSubscriberMethodError(
                    f'Subscriber method {subscriber_method} must accept '
                    'keyword arguments (**kwargs)'
                )

    def on_queued(self, future, **kwargs):
        """Callback to be invoked when a transfer request gets queued

        This callback can be useful for:

            * Keeping track of how many transfers have been requested
            * Providing the expected transfer size through
              future.meta.provide_transfer_size() so a HeadObject would not
              need to be made for copies and downloads.

        :type future: s3transfer.futures.TransferFuture
        :param future: The TransferFuture representing the requested transfer.
        """
        pass

    def on_progress(self, future, bytes_transferred, **kwargs):
        """Callback to be invoked when progress is made on the transfer

        This callback can be useful for:

            * Recording and displaying progress

        :type future: s3transfer.futures.TransferFuture
        :param future: The TransferFuture representing the requested transfer.

        :type bytes_transferred: int
        :param bytes_transferred: The number of bytes transferred for that
            invocation of the callback. Note that a negative amount can be
            provided, which usually indicates that an in-progress request
            needed to be retried and thus progress was rewound.
        """
        pass

    def on_done(self, future, **kwargs):
        """Callback to be invoked once a transfer is done

        This callback can be useful for:

            * Recording and displaying whether the transfer succeeded or
              failed using future.result()
            * Running some task after the transfer completed, like changing
              the last modified time of a downloaded file.

        :type future: s3transfer.futures.TransferFuture
        :param future: The TransferFuture representing the requested transfer.
        """
        pass
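

# A minimal sketch of a concrete subscriber; the **kwargs catch-all is
# required by _validate_subscriber_methods() above. Assumes a constructed
# TransferManager named `manager`:
#
#   class PrintingSubscriber(BaseSubscriber):
#       def on_progress(self, future, bytes_transferred, **kwargs):
#           print(f'{future.meta.call_args.key}: +{bytes_transferred} B')
#
#   manager.download('bucket', 'key', '/tmp/f',
#                    subscribers=[PrintingSubscriber()])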
@@ -0,0 +1,390 @@
# Copyright 2016 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You
# may not use this file except in compliance with the License. A copy of
# the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.
import copy
import logging

from s3transfer.utils import get_callbacks

try:
    from botocore.context import start_as_current_context
except ImportError:
    from contextlib import nullcontext as start_as_current_context


logger = logging.getLogger(__name__)


class Task:
    """A task associated with a TransferFuture request

    This is a base class for other classes to subclass from. All subclassed
    classes must implement the main() method.
    """

    def __init__(
        self,
        transfer_coordinator,
        main_kwargs=None,
        pending_main_kwargs=None,
        done_callbacks=None,
        is_final=False,
    ):
        """
        :type transfer_coordinator: s3transfer.futures.TransferCoordinator
        :param transfer_coordinator: The coordinator for the TransferFuture
            with which this Task is associated.

        :type main_kwargs: dict
        :param main_kwargs: The keyword args that can be immediately supplied
            to the _main() method of the task

        :type pending_main_kwargs: dict
        :param pending_main_kwargs: The keyword args that depend on
            the result from a dependent future(s). The result returned by
            the future(s) will be used as the value for the keyword argument
            when _main() is called. The values for each key can be:

                * a single future - Once completed, its value will be the
                  result of that single future
                * a list of futures - Once all of the futures complete, the
                  value used will be a list of each completed future result
                  value in order of when they were originally supplied.

        :type done_callbacks: list of callbacks
        :param done_callbacks: A list of callbacks to call once the task is
            done completing. Each callback will be called with no arguments
            and will be called no matter if the task succeeds or an exception
            is raised.

        :type is_final: boolean
        :param is_final: True, to indicate that this task is the final task
            for the TransferFuture request. By setting this value to True, it
            will set the result of the entire TransferFuture to the result
            returned by this task's main() method.
        """
        self._transfer_coordinator = transfer_coordinator

        self._main_kwargs = main_kwargs
        if self._main_kwargs is None:
            self._main_kwargs = {}

        self._pending_main_kwargs = pending_main_kwargs
        if pending_main_kwargs is None:
            self._pending_main_kwargs = {}

        self._done_callbacks = done_callbacks
        if self._done_callbacks is None:
            self._done_callbacks = []

        self._is_final = is_final

    def __repr__(self):
        # These are the general main_kwarg parameters that we want to
        # display in the repr.
        params_to_display = [
            'bucket',
            'key',
            'part_number',
            'final_filename',
            'transfer_future',
            'offset',
            'extra_args',
        ]
        main_kwargs_to_display = self._get_kwargs_with_params_to_include(
            self._main_kwargs, params_to_display
        )
        return f'{self.__class__.__name__}(transfer_id={self._transfer_coordinator.transfer_id}, {main_kwargs_to_display})'

    @property
    def transfer_id(self):
        """The id for the transfer request that the task belongs to"""
        return self._transfer_coordinator.transfer_id

    def _get_kwargs_with_params_to_include(self, kwargs, include):
        filtered_kwargs = {}
        for param in include:
            if param in kwargs:
                filtered_kwargs[param] = kwargs[param]
        return filtered_kwargs

    def _get_kwargs_with_params_to_exclude(self, kwargs, exclude):
        filtered_kwargs = {}
        for param, value in kwargs.items():
            if param in exclude:
                continue
            filtered_kwargs[param] = value
        return filtered_kwargs

    def __call__(self, ctx=None):
        """The callable to use when submitting a Task to an executor"""
        with start_as_current_context(ctx):
            try:
                # Wait for all of the futures this task depends on.
                self._wait_on_dependent_futures()
                # Gather up all of the main keyword arguments for main().
                # This includes the immediately provided main_kwargs and
                # the values for pending_main_kwargs that source from the
                # return values of the task's dependent futures.
                kwargs = self._get_all_main_kwargs()
                # If the task is not done (really only if some other related
                # task to the TransferFuture had failed), then execute the
                # task's main() method.
                if not self._transfer_coordinator.done():
                    return self._execute_main(kwargs)
            except Exception as e:
                self._log_and_set_exception(e)
            finally:
                # Run any done callbacks associated to the task no matter
                # what.
                for done_callback in self._done_callbacks:
                    done_callback()

                if self._is_final:
                    # If this is the final task, announce that it is done if
                    # results are waiting on its completion.
                    self._transfer_coordinator.announce_done()

    def _execute_main(self, kwargs):
        # Do not display keyword args that should not be printed, especially
        # if they are going to make the logs hard to follow.
        params_to_exclude = ['data']
        kwargs_to_display = self._get_kwargs_with_params_to_exclude(
            kwargs, params_to_exclude
        )
        # Log what is about to be executed.
        logger.debug(f"Executing task {self} with kwargs {kwargs_to_display}")

        return_value = self._main(**kwargs)
        # If the task is the final task, then set the TransferFuture's
        # value to the return value from main().
        if self._is_final:
            self._transfer_coordinator.set_result(return_value)
        return return_value

    def _log_and_set_exception(self, exception):
        # If an exception is ever thrown, set the exception for the
        # entire TransferFuture.
        logger.debug("Exception raised.", exc_info=True)
        self._transfer_coordinator.set_exception(exception)

    def _main(self, **kwargs):
        """The method that will be run in the executor

        This method must be implemented by subclasses of Task. main() can
        be implemented with any arguments decided upon by the subclass.
        """
        raise NotImplementedError('_main() must be implemented')

    def _wait_on_dependent_futures(self):
        # Gather all of the futures that main() depends on.
        futures_to_wait_on = []
        for _, future in self._pending_main_kwargs.items():
            # If the pending main keyword arg is a list, then extend the
            # list.
            if isinstance(future, list):
                futures_to_wait_on.extend(future)
            # If the pending main keyword arg is a future, append it to
            # the list.
            else:
                futures_to_wait_on.append(future)
        # Now wait for all of the futures to complete.
        self._wait_until_all_complete(futures_to_wait_on)

    def _wait_until_all_complete(self, futures):
        # This is a basic implementation of concurrent.futures.wait().
        #
        # concurrent.futures.wait() is not used because of this
        # reported issue: https://bugs.python.org/issue20319.
        # The issue would occasionally cause multipart uploads to hang
        # when wait() was called. This approach avoids the
        # concurrency bug by removing any association with the
        # concurrent.futures implementation of waiters.
        logger.debug(
            '%s about to wait for the following futures %s', self, futures
        )
        for future in futures:
            try:
                logger.debug('%s about to wait for %s', self, future)
                future.result()
            except Exception:
                # result() can also produce exceptions. We want to ignore
                # these here and defer the error handling down the road.
                pass
        logger.debug('%s done waiting for dependent futures', self)

    def _get_all_main_kwargs(self):
        # Copy over all of the kwargs that we know are available.
        kwargs = copy.copy(self._main_kwargs)

        # Iterate through the kwargs whose values are pending on the result
        # of a future.
        for key, pending_value in self._pending_main_kwargs.items():
            # If the value is a list of futures, iterate through the list,
            # appending on the result from each future.
            if isinstance(pending_value, list):
                result = []
                for future in pending_value:
                    result.append(future.result())
            # Otherwise if the pending_value is a future, just wait for it.
            else:
                result = pending_value.result()
            # Add the retrieved value to the kwargs to be sent to the
            # main() call.
            kwargs[key] = result
        return kwargs


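# A minimal sketch of a Task subclass, assuming a
# s3transfer.futures.TransferCoordinator named `coordinator`:
#
#   class EchoTask(Task):
#       def _main(self, message):
#           return message
#
#   task = EchoTask(coordinator, main_kwargs={'message': 'hi'},
#                   is_final=True)
#   task()  # runs _main() and sets the coordinator's result to 'hi'

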
class SubmissionTask(Task):
    """A base class for any submission task

    Submission tasks are the top-level tasks used to submit a series of tasks
    to execute a particular transfer.
    """

    def _main(self, transfer_future, **kwargs):
        """
        :type transfer_future: s3transfer.futures.TransferFuture
        :param transfer_future: The transfer future associated with the
            transfer request that tasks are being submitted for

        :param kwargs: Any additional kwargs that you may want to pass
            to the _submit() method
        """
        try:
            self._transfer_coordinator.set_status_to_queued()

            # Before submitting any tasks, run all of the on_queued callbacks
            on_queued_callbacks = get_callbacks(transfer_future, 'queued')
            for on_queued_callback in on_queued_callbacks:
                on_queued_callback()

            # Once the callbacks have been run, set the status to running.
            self._transfer_coordinator.set_status_to_running()

            # Call the submit method to start submitting tasks to execute the
            # transfer.
            self._submit(transfer_future=transfer_future, **kwargs)
        except BaseException as e:
            # If there was an exception raised during the submission of
            # tasks, there is a chance that the final task that signals if a
            # transfer is done and runs the cleanup may never have been
            # submitted in the first place, so we need to account for that
            # accordingly.
            #
            # Note that BaseException is caught, instead of Exception, because
            # for some implementations of executors, specifically the serial
            # implementation, the SubmissionTask is directly exposed to
            # KeyboardInterrupts and so needs to clean up and signal done
            # for those as well.

            # Set the exception that caused the process to fail.
            self._log_and_set_exception(e)

            # Wait for all possibly associated futures that may have spawned
            # from this submission task to finish before we announce the
            # transfer as done.
            self._wait_for_all_submitted_futures_to_complete()

            # Announce the transfer as done, which will run any cleanups
            # and done callbacks as well.
            self._transfer_coordinator.announce_done()

    def _submit(self, transfer_future, **kwargs):
        """The submission method to be implemented

        :type transfer_future: s3transfer.futures.TransferFuture
        :param transfer_future: The transfer future associated with the
            transfer request that tasks are being submitted for

        :param kwargs: Any additional keyword arguments you want to be passed
            in
        """
        raise NotImplementedError('_submit() must be implemented')

    def _wait_for_all_submitted_futures_to_complete(self):
        # We want to wait for all futures that were submitted to
        # complete, as we do not want the cleanup callbacks or done callbacks
        # to be called too early. The main problem is any task that was
        # submitted may have submitted even more tasks during its process,
        # and so we need to account for that accordingly.

        # First get all of the futures that were submitted up to this point.
        submitted_futures = self._transfer_coordinator.associated_futures
        while submitted_futures:
            # Wait for those futures to complete.
            self._wait_until_all_complete(submitted_futures)
            # However, more futures may have been submitted as we waited, so
            # we need to check again for any more associated futures.
            possibly_more_submitted_futures = (
                self._transfer_coordinator.associated_futures
            )
            # If the list of submitted futures is unchanged after the wait
            # completes, then no more futures were submitted while waiting on
            # the current list of futures to complete, ultimately meaning all
            # futures that may have spawned from the original submission task
            # have completed.
            if submitted_futures == possibly_more_submitted_futures:
                break
            submitted_futures = possibly_more_submitted_futures


class CreateMultipartUploadTask(Task):
    """Task to initiate a multipart upload"""

    def _main(self, client, bucket, key, extra_args):
        """
        :param client: The client to use when calling CreateMultipartUpload
        :param bucket: The name of the bucket to upload to
        :param key: The name of the key to upload to
        :param extra_args: A dictionary of any extra arguments that may be
            used in the initialization.

        :returns: The upload id of the multipart upload
        """
        # Create the multipart upload.
        response = client.create_multipart_upload(
            Bucket=bucket, Key=key, **extra_args
        )
        upload_id = response['UploadId']

        # Add a cleanup if the multipart upload fails at any point.
        self._transfer_coordinator.add_failure_cleanup(
            client.abort_multipart_upload,
            Bucket=bucket,
            Key=key,
            UploadId=upload_id,
        )
        return upload_id


class CompleteMultipartUploadTask(Task):
    """Task to complete a multipart upload"""

    def _main(self, client, bucket, key, upload_id, parts, extra_args):
        """
        :param client: The client to use when calling CompleteMultipartUpload
        :param bucket: The name of the bucket to upload to
        :param key: The name of the key to upload to
        :param upload_id: The id of the upload
        :param parts: A list of parts to use to complete the multipart
            upload::

                [{'ETag': etag_value, 'PartNumber': part_number}, ...]

            Each element in the list consists of a return value from
            ``UploadPartTask.main()``.
        :param extra_args: A dictionary of any extra arguments that may be
            used in completing the multipart transfer.
        """
        client.complete_multipart_upload(
            Bucket=bucket,
            Key=key,
            UploadId=upload_id,
            MultipartUpload={'Parts': parts},
            **extra_args,
        )
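

# A minimal sketch of the client calls the two tasks above wrap, assuming
# a botocore S3 client named `client` and a single-part upload:
#
#   upload_id = client.create_multipart_upload(
#       Bucket='bucket', Key='key')['UploadId']
#   part = client.upload_part(Bucket='bucket', Key='key', PartNumber=1,
#                             UploadId=upload_id, Body=b'data')
#   client.complete_multipart_upload(
#       Bucket='bucket', Key='key', UploadId=upload_id,
#       MultipartUpload={'Parts': [
#           {'ETag': part['ETag'], 'PartNumber': 1}]})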
@@ -0,0 +1,840 @@
# Copyright 2016 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You
# may not use this file except in compliance with the License. A copy of
# the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.
import math
from io import BytesIO

from s3transfer.compat import readable, seekable
from s3transfer.constants import FULL_OBJECT_CHECKSUM_ARGS
from s3transfer.futures import IN_MEMORY_UPLOAD_TAG
from s3transfer.tasks import (
    CompleteMultipartUploadTask,
    CreateMultipartUploadTask,
    SubmissionTask,
    Task,
)
from s3transfer.utils import (
    ChunksizeAdjuster,
    DeferredOpenFile,
    get_callbacks,
    get_filtered_dict,
)


class AggregatedProgressCallback:
    def __init__(self, callbacks, threshold=1024 * 256):
        """Aggregates progress updates for every provided progress callback

        :type callbacks: A list of functions that accept bytes_transferred
            as a single argument
        :param callbacks: The callbacks to invoke when the threshold is
            reached

        :type threshold: int
        :param threshold: The progress threshold at which to take the
            aggregated progress and invoke the progress callbacks with that
            aggregated progress total
        """
        self._callbacks = callbacks
        self._threshold = threshold
        self._bytes_seen = 0

    def __call__(self, bytes_transferred):
        self._bytes_seen += bytes_transferred
        if self._bytes_seen >= self._threshold:
            self._trigger_callbacks()

    def flush(self):
        """Flushes out any progress that has not been sent to its callbacks"""
        if self._bytes_seen > 0:
            self._trigger_callbacks()

    def _trigger_callbacks(self):
        for callback in self._callbacks:
            callback(bytes_transferred=self._bytes_seen)
        self._bytes_seen = 0


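# A minimal sketch of the aggregation behaviour above; callbacks only
# fire once accumulated bytes reach the threshold:
#
#   updates = []
#   agg = AggregatedProgressCallback(
#       [lambda bytes_transferred: updates.append(bytes_transferred)],
#       threshold=100)
#   agg(60)      # below the threshold, nothing recorded yet
#   agg(60)      # 120 >= 100, so the callback fires with 120
#   agg.flush()  # nothing left to flush here
#   assert updates == [120]

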
class InterruptReader:
    """Wrapper that can interrupt reading using an error

    It uses a transfer coordinator to propagate an error: if an exception
    has been set on the coordinator while the file is being read from, the
    next read raises that exception.

    :type fileobj: file-like obj
    :param fileobj: The file-like object to read from

    :type transfer_coordinator: s3transfer.futures.TransferCoordinator
    :param transfer_coordinator: The transfer coordinator to use if the
        reader needs to be interrupted.
    """

    def __init__(self, fileobj, transfer_coordinator):
        self._fileobj = fileobj
        self._transfer_coordinator = transfer_coordinator

    def read(self, amount=None):
        # If there is an exception, then raise the exception.
        # We raise an error instead of returning no bytes because for
        # requests where the content length and md5 were sent, it would
        # cause md5 mismatches and retries as there was no indication that
        # the stream being read from encountered any issues.
        if self._transfer_coordinator.exception:
            raise self._transfer_coordinator.exception
        return self._fileobj.read(amount)

    def seek(self, where, whence=0):
        self._fileobj.seek(where, whence)

    def tell(self):
        return self._fileobj.tell()

    def close(self):
        self._fileobj.close()

    def __enter__(self):
        return self

    def __exit__(self, *args, **kwargs):
        self.close()


class UploadInputManager:
    """Base manager class for handling various types of files for uploads

    This class is typically used by the UploadSubmissionTask class to help
    determine the following:

        * How to determine the size of the file
        * How to determine if a multipart upload is required
        * How to retrieve the body for a PutObject
        * How to retrieve the bodies for a set of UploadParts

    The answers/implementations differ for the various types of file inputs
    that may be accepted. All implementations must subclass and override
    public methods from this class.
    """

    def __init__(self, osutil, transfer_coordinator, bandwidth_limiter=None):
        self._osutil = osutil
        self._transfer_coordinator = transfer_coordinator
        self._bandwidth_limiter = bandwidth_limiter

    @classmethod
    def is_compatible(cls, upload_source):
        """Determines if the source for the upload is compatible with manager

        :param upload_source: The source for which the upload will pull data
            from.

        :returns: True if the manager can handle the type of source specified
            otherwise returns False.
        """
        raise NotImplementedError('must implement is_compatible()')

    def stores_body_in_memory(self, operation_name):
        """Whether the bodies it provides are stored in memory

        :type operation_name: str
        :param operation_name: The name of the client operation that the body
            is being used for. Valid operation_names are ``put_object`` and
            ``upload_part``.

        :rtype: boolean
        :returns: True if the body returned by the manager will be stored in
            memory. False if the manager will not directly store the body in
            memory.
        """
        raise NotImplementedError('must implement stores_body_in_memory()')

    def provide_transfer_size(self, transfer_future):
        """Provides the transfer size of an upload

        :type transfer_future: s3transfer.futures.TransferFuture
        :param transfer_future: The future associated with upload request
        """
        raise NotImplementedError('must implement provide_transfer_size()')

    def requires_multipart_upload(self, transfer_future, config):
        """Determines whether a multipart upload is required

        :type transfer_future: s3transfer.futures.TransferFuture
        :param transfer_future: The future associated with upload request

        :type config: s3transfer.manager.TransferConfig
        :param config: The config associated to the transfer manager

        :rtype: boolean
        :returns: True, if the upload should be multipart based on
            configuration and size. False, otherwise.
        """
        raise NotImplementedError('must implement requires_multipart_upload()')

    def get_put_object_body(self, transfer_future):
        """Returns the body to use for PutObject

        :type transfer_future: s3transfer.futures.TransferFuture
        :param transfer_future: The future associated with upload request

        :rtype: s3transfer.utils.ReadFileChunk
        :returns: A ReadFileChunk including all progress callbacks
            associated with the transfer future.
        """
        raise NotImplementedError('must implement get_put_object_body()')

    def yield_upload_part_bodies(self, transfer_future, chunksize):
        """Yields the part number and body to use for each UploadPart

        :type transfer_future: s3transfer.futures.TransferFuture
        :param transfer_future: The future associated with upload request

        :type chunksize: int
        :param chunksize: The chunksize to use for this upload.

        :rtype: int, s3transfer.utils.ReadFileChunk
        :returns: Yields the part number and the ReadFileChunk including all
            progress callbacks associated with the transfer future for that
            specific yielded part.
        """
        raise NotImplementedError('must implement yield_upload_part_bodies()')

    def _wrap_fileobj(self, fileobj):
        fileobj = InterruptReader(fileobj, self._transfer_coordinator)
        if self._bandwidth_limiter:
            fileobj = self._bandwidth_limiter.get_bandwith_limited_stream(
                fileobj, self._transfer_coordinator, enabled=False
            )
        return fileobj

    def _get_progress_callbacks(self, transfer_future):
        callbacks = get_callbacks(transfer_future, 'progress')
        # We only want to wrap the callbacks if there are callbacks to
        # invoke because we do not want to be doing any unnecessary work if
        # there are no callbacks to invoke.
        if callbacks:
            return [AggregatedProgressCallback(callbacks)]
        return []

    def _get_close_callbacks(self, aggregated_progress_callbacks):
        return [callback.flush for callback in aggregated_progress_callbacks]


class UploadFilenameInputManager(UploadInputManager):
    """Upload utility for filenames"""

    @classmethod
    def is_compatible(cls, upload_source):
        return isinstance(upload_source, str)

    def stores_body_in_memory(self, operation_name):
        return False

    def provide_transfer_size(self, transfer_future):
        transfer_future.meta.provide_transfer_size(
            self._osutil.get_file_size(transfer_future.meta.call_args.fileobj)
        )

    def requires_multipart_upload(self, transfer_future, config):
        return transfer_future.meta.size >= config.multipart_threshold

    def get_put_object_body(self, transfer_future):
        # Get a file-like object for the given input
        fileobj, full_size = self._get_put_object_fileobj_with_full_size(
            transfer_future
        )

        # Wrap fileobj with interrupt reader that will quickly cancel
        # uploads if needed instead of having to wait for the socket
        # to completely read all of the data.
        fileobj = self._wrap_fileobj(fileobj)

        callbacks = self._get_progress_callbacks(transfer_future)
        close_callbacks = self._get_close_callbacks(callbacks)
        size = transfer_future.meta.size
        # Return the file-like object wrapped into a ReadFileChunk to get
        # progress.
        return self._osutil.open_file_chunk_reader_from_fileobj(
            fileobj=fileobj,
            chunk_size=size,
            full_file_size=full_size,
            callbacks=callbacks,
            close_callbacks=close_callbacks,
        )

    def yield_upload_part_bodies(self, transfer_future, chunksize):
        full_file_size = transfer_future.meta.size
        num_parts = self._get_num_parts(transfer_future, chunksize)
        for part_number in range(1, num_parts + 1):
            callbacks = self._get_progress_callbacks(transfer_future)
            close_callbacks = self._get_close_callbacks(callbacks)
            start_byte = chunksize * (part_number - 1)
            # Get a file-like object for that part and the size of the full
            # file size for the associated file-like object for that part.
            fileobj, full_size = self._get_upload_part_fileobj_with_full_size(
                transfer_future.meta.call_args.fileobj,
                start_byte=start_byte,
                part_size=chunksize,
                full_file_size=full_file_size,
            )

            # Wrap fileobj with interrupt reader that will quickly cancel
            # uploads if needed instead of having to wait for the socket
            # to completely read all of the data.
            fileobj = self._wrap_fileobj(fileobj)

            # Wrap the file-like object into a ReadFileChunk to get progress.
            read_file_chunk = self._osutil.open_file_chunk_reader_from_fileobj(
                fileobj=fileobj,
                chunk_size=chunksize,
                full_file_size=full_size,
                callbacks=callbacks,
                close_callbacks=close_callbacks,
            )
            yield part_number, read_file_chunk

    def _get_deferred_open_file(self, fileobj, start_byte):
        fileobj = DeferredOpenFile(
            fileobj, start_byte, open_function=self._osutil.open
        )
        return fileobj

    def _get_put_object_fileobj_with_full_size(self, transfer_future):
        fileobj = transfer_future.meta.call_args.fileobj
        size = transfer_future.meta.size
        return self._get_deferred_open_file(fileobj, 0), size

    def _get_upload_part_fileobj_with_full_size(self, fileobj, **kwargs):
        start_byte = kwargs['start_byte']
        full_size = kwargs['full_file_size']
        return self._get_deferred_open_file(fileobj, start_byte), full_size

    def _get_num_parts(self, transfer_future, part_size):
        return int(math.ceil(transfer_future.meta.size / float(part_size)))


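# A worked example of the part-count arithmetic in _get_num_parts()
# above: a 100 MiB upload with an 8 MiB chunksize needs
# ceil(100 / 8) = 13 parts, where the final part is only 4 MiB:
#
#   import math
#   size, chunksize = 100 * 1024 * 1024, 8 * 1024 * 1024
#   assert int(math.ceil(size / float(chunksize))) == 13

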
class UploadSeekableInputManager(UploadFilenameInputManager):
    """Upload utility for an open file object"""

    @classmethod
    def is_compatible(cls, upload_source):
        return readable(upload_source) and seekable(upload_source)

    def stores_body_in_memory(self, operation_name):
        if operation_name == 'put_object':
            return False
        else:
            return True

    def provide_transfer_size(self, transfer_future):
        fileobj = transfer_future.meta.call_args.fileobj
        # To determine the size, first determine the starting position.
        # Seek to the end and then find the difference in the length
        # between the end and start positions.
        start_position = fileobj.tell()
        fileobj.seek(0, 2)
        end_position = fileobj.tell()
        fileobj.seek(start_position)
        transfer_future.meta.provide_transfer_size(
            end_position - start_position
        )

    def _get_upload_part_fileobj_with_full_size(self, fileobj, **kwargs):
        # Note: It is unfortunate that in order to do a multithreaded
        # multipart upload we cannot simply copy the file-like object,
        # since there is not really a mechanism in python to do so
        # (os.dup points to the same OS filehandle, which causes
        # concurrency issues). So instead we need to read from the fileobj
        # and chunk the data out to separate file-like objects in memory.
        data = fileobj.read(kwargs['part_size'])
        # We return the length of the data instead of the full_file_size
        # because we partitioned the data into separate BytesIO objects,
        # meaning the BytesIO object has no knowledge of its start position
        # relative to the input source nor access to the rest of the input
        # source. So we must treat it as its own standalone file.
        return BytesIO(data), len(data)

    def _get_put_object_fileobj_with_full_size(self, transfer_future):
        fileobj = transfer_future.meta.call_args.fileobj
        # The current position needs to be taken into account when retrieving
        # the full size of the file.
        size = fileobj.tell() + transfer_future.meta.size
        return fileobj, size


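# A minimal sketch of the size probe in provide_transfer_size() above:
# seek to the end, diff against the starting offset, then seek back:
#
#   from io import BytesIO
#   f = BytesIO(b'0123456789')
#   f.seek(3)
#   start = f.tell()
#   f.seek(0, 2)             # whence=2 seeks relative to the end
#   size = f.tell() - start  # 7 bytes remain from offset 3
#   f.seek(start)
#   assert size == 7

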
class UploadNonSeekableInputManager(UploadInputManager):
    """Upload utility for a file-like object that cannot seek."""

    def __init__(self, osutil, transfer_coordinator, bandwidth_limiter=None):
        super().__init__(osutil, transfer_coordinator, bandwidth_limiter)
        self._initial_data = b''

    @classmethod
    def is_compatible(cls, upload_source):
        return readable(upload_source)

    def stores_body_in_memory(self, operation_name):
        return True

    def provide_transfer_size(self, transfer_future):
        # No-op because there is no way to do this short of reading the entire
        # body into memory.
        return

    def requires_multipart_upload(self, transfer_future, config):
        # If the user has set the size, we can use that.
        if transfer_future.meta.size is not None:
            return transfer_future.meta.size >= config.multipart_threshold

        # This is tricky to determine in this case because we can't know how
        # large the input is. So to figure it out, we read data into memory
        # up until the threshold and compare how much data was actually read
        # against the threshold.
        fileobj = transfer_future.meta.call_args.fileobj
        threshold = config.multipart_threshold
        self._initial_data = self._read(fileobj, threshold, False)
        if len(self._initial_data) < threshold:
            return False
        else:
            return True

    def get_put_object_body(self, transfer_future):
        callbacks = self._get_progress_callbacks(transfer_future)
        close_callbacks = self._get_close_callbacks(callbacks)
        fileobj = transfer_future.meta.call_args.fileobj

        body = self._wrap_data(
            self._initial_data + fileobj.read(), callbacks, close_callbacks
        )

        # Zero out the stored data so we don't have additional copies
        # hanging around in memory.
        self._initial_data = None
        return body

    def yield_upload_part_bodies(self, transfer_future, chunksize):
        file_object = transfer_future.meta.call_args.fileobj
        part_number = 0

        # Continue reading parts from the file-like object until it is empty.
        while True:
            callbacks = self._get_progress_callbacks(transfer_future)
            close_callbacks = self._get_close_callbacks(callbacks)
            part_number += 1
            part_content = self._read(file_object, chunksize)
            if not part_content:
                break
            part_object = self._wrap_data(
                part_content, callbacks, close_callbacks
            )

            # Zero out part_content to avoid hanging on to additional data.
            part_content = None
            yield part_number, part_object

    def _read(self, fileobj, amount, truncate=True):
        """
        Reads a specific amount of data from a stream and returns it. If there
        is any data in initial_data, that will be popped out first.

        :type fileobj: A file-like object that implements read
        :param fileobj: The stream to read from.

        :type amount: int
        :param amount: The number of bytes to read from the stream.

        :type truncate: bool
        :param truncate: Whether or not to truncate initial_data after
            reading from it.

        :return: The data read from the stream, preceded by any remaining
            initial data.
        """
        # If the initial data is empty, we simply read from the fileobj.
        if len(self._initial_data) == 0:
            return fileobj.read(amount)

        # If the requested number of bytes is less than the amount of
        # initial data, pull entirely from initial data.
        if amount <= len(self._initial_data):
            data = self._initial_data[:amount]
            # Truncate initial data so we don't hang onto the data longer
            # than we need.
            if truncate:
                self._initial_data = self._initial_data[amount:]
            return data

        # At this point there is some initial data left, but not enough to
        # satisfy the number of bytes requested. Pull out the remaining
        # initial data and read the rest from the fileobj.
        amount_to_read = amount - len(self._initial_data)
        data = self._initial_data + fileobj.read(amount_to_read)

        # Zero out initial data so we don't hang onto the data any more.
        if truncate:
            self._initial_data = b''
        return data

    def _wrap_data(self, data, callbacks, close_callbacks):
        """
        Wraps data with the interrupt reader and the file chunk reader.

        :type data: bytes
        :param data: The data to wrap.

        :type callbacks: list
        :param callbacks: The callbacks associated with the transfer future.

        :type close_callbacks: list
        :param close_callbacks: The callbacks to be called when closing the
            wrapper for the data.

        :return: Fully wrapped data.
        """
        fileobj = self._wrap_fileobj(BytesIO(data))
        return self._osutil.open_file_chunk_reader_from_fileobj(
            fileobj=fileobj,
            chunk_size=len(data),
            full_file_size=len(data),
            callbacks=callbacks,
            close_callbacks=close_callbacks,
        )


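# A minimal sketch of the buffered-read semantics of _read() above:
# bytes consumed while probing the multipart threshold are replayed
# before any new reads from the stream:
#
#   from io import BytesIO
#   mgr = UploadNonSeekableInputManager(
#       osutil=None, transfer_coordinator=None)
#   mgr._initial_data = b'abc'  # as if left over from the probe
#   assert mgr._read(BytesIO(b'def'), 5) == b'abcde'

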
class UploadSubmissionTask(SubmissionTask):
    """Task for submitting tasks to execute an upload"""

    PUT_OBJECT_BLOCKLIST = ["ChecksumType", "MpuObjectSize"]

    CREATE_MULTIPART_BLOCKLIST = FULL_OBJECT_CHECKSUM_ARGS + ["MpuObjectSize"]

    UPLOAD_PART_ARGS = [
        'ChecksumAlgorithm',
        'SSECustomerKey',
        'SSECustomerAlgorithm',
        'SSECustomerKeyMD5',
        'RequestPayer',
        'ExpectedBucketOwner',
    ]

    COMPLETE_MULTIPART_ARGS = [
        'SSECustomerKey',
        'SSECustomerAlgorithm',
        'SSECustomerKeyMD5',
        'RequestPayer',
        'ExpectedBucketOwner',
        'ChecksumType',
        'MpuObjectSize',
    ] + FULL_OBJECT_CHECKSUM_ARGS

    def _get_upload_input_manager_cls(self, transfer_future):
        """Retrieves a class for managing input for an upload based on file type

        :type transfer_future: s3transfer.futures.TransferFuture
        :param transfer_future: The transfer future for the request

        :rtype: class of UploadInputManager
        :returns: The appropriate class to use for managing a specific type of
            input for uploads.
        """
        upload_manager_resolver_chain = [
            UploadFilenameInputManager,
            UploadSeekableInputManager,
            UploadNonSeekableInputManager,
        ]

        fileobj = transfer_future.meta.call_args.fileobj
        for upload_manager_cls in upload_manager_resolver_chain:
            if upload_manager_cls.is_compatible(fileobj):
                return upload_manager_cls
        raise RuntimeError(
            f'Input {fileobj} of type: {type(fileobj)} is not supported.'
        )

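    # A minimal sketch of the resolver chain above: the first manager
    # class whose is_compatible() accepts the input wins, so a filename
    # string resolves before any file-like object:
    #
    #   from io import BytesIO
    #   assert UploadFilenameInputManager.is_compatible('/tmp/f')
    #   assert not UploadFilenameInputManager.is_compatible(BytesIO())
    #   assert UploadSeekableInputManager.is_compatible(BytesIO())
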
    def _submit(
        self,
        client,
        config,
        osutil,
        request_executor,
        transfer_future,
        bandwidth_limiter=None,
    ):
        """
        :param client: The client associated with the transfer manager

        :type config: s3transfer.manager.TransferConfig
        :param config: The transfer config associated with the transfer
            manager

        :type osutil: s3transfer.utils.OSUtil
        :param osutil: The os utility associated to the transfer manager

        :type request_executor: s3transfer.futures.BoundedExecutor
        :param request_executor: The request executor associated with the
            transfer manager

        :type transfer_future: s3transfer.futures.TransferFuture
        :param transfer_future: The transfer future associated with the
            transfer request that tasks are being submitted for
        """
        upload_input_manager = self._get_upload_input_manager_cls(
            transfer_future
        )(osutil, self._transfer_coordinator, bandwidth_limiter)

        # Determine the size if it was not provided
        if transfer_future.meta.size is None:
            upload_input_manager.provide_transfer_size(transfer_future)

        # Do a multipart upload if needed, otherwise do a regular put object.
        if not upload_input_manager.requires_multipart_upload(
            transfer_future, config
        ):
            self._submit_upload_request(
                client,
                config,
                osutil,
                request_executor,
                transfer_future,
                upload_input_manager,
            )
        else:
            self._submit_multipart_request(
                client,
                config,
                osutil,
                request_executor,
                transfer_future,
                upload_input_manager,
            )

    def _submit_upload_request(
        self,
        client,
        config,
        osutil,
        request_executor,
        transfer_future,
        upload_input_manager,
    ):
        call_args = transfer_future.meta.call_args

        put_object_extra_args = self._extra_put_object_args(
            call_args.extra_args
        )

        # Get any tags that need to be associated to the put object task
        put_object_tag = self._get_upload_task_tag(
            upload_input_manager, 'put_object'
        )

        # Submit the request of a single upload.
        self._transfer_coordinator.submit(
            request_executor,
            PutObjectTask(
                transfer_coordinator=self._transfer_coordinator,
                main_kwargs={
                    'client': client,
                    'fileobj': upload_input_manager.get_put_object_body(
                        transfer_future
                    ),
                    'bucket': call_args.bucket,
                    'key': call_args.key,
                    'extra_args': put_object_extra_args,
                },
                is_final=True,
            ),
            tag=put_object_tag,
        )

def _submit_multipart_request(
|
||||
self,
|
||||
client,
|
||||
config,
|
||||
osutil,
|
||||
request_executor,
|
||||
transfer_future,
|
||||
upload_input_manager,
|
||||
):
|
||||
call_args = transfer_future.meta.call_args
|
||||
|
||||
# When a user provided checksum is passed, set "ChecksumType" to "FULL_OBJECT"
|
||||
# and "ChecksumAlgorithm" to the related algorithm.
|
||||
for checksum in FULL_OBJECT_CHECKSUM_ARGS:
|
||||
if checksum in call_args.extra_args:
|
||||
call_args.extra_args["ChecksumType"] = "FULL_OBJECT"
|
||||
call_args.extra_args["ChecksumAlgorithm"] = checksum.replace(
|
||||
"Checksum", ""
|
||||
)
|
||||
|
||||
create_multipart_extra_args = self._extra_create_multipart_args(
|
||||
call_args.extra_args
|
||||
)
|
||||
|
||||
# Submit the request to create a multipart upload.
|
||||
create_multipart_future = self._transfer_coordinator.submit(
|
||||
request_executor,
|
||||
CreateMultipartUploadTask(
|
||||
transfer_coordinator=self._transfer_coordinator,
|
||||
main_kwargs={
|
||||
'client': client,
|
||||
'bucket': call_args.bucket,
|
||||
'key': call_args.key,
|
||||
'extra_args': create_multipart_extra_args,
|
||||
},
|
||||
),
|
||||
)
|
||||
|
||||
# Submit requests to upload the parts of the file.
|
||||
part_futures = []
|
||||
extra_part_args = self._extra_upload_part_args(call_args.extra_args)
|
||||
|
||||
# Get any tags that need to be associated to the submitted task
|
||||
# for upload the data
|
||||
upload_part_tag = self._get_upload_task_tag(
|
||||
upload_input_manager, 'upload_part'
|
||||
)
|
||||
|
||||
size = transfer_future.meta.size
|
||||
adjuster = ChunksizeAdjuster()
|
||||
chunksize = adjuster.adjust_chunksize(config.multipart_chunksize, size)
|
||||
part_iterator = upload_input_manager.yield_upload_part_bodies(
|
||||
transfer_future, chunksize
|
||||
)
|
||||
|
||||
for part_number, fileobj in part_iterator:
|
||||
part_futures.append(
|
||||
self._transfer_coordinator.submit(
|
||||
request_executor,
|
||||
UploadPartTask(
|
||||
transfer_coordinator=self._transfer_coordinator,
|
||||
main_kwargs={
|
||||
'client': client,
|
||||
'fileobj': fileobj,
|
||||
'bucket': call_args.bucket,
|
||||
'key': call_args.key,
|
||||
'part_number': part_number,
|
||||
'extra_args': extra_part_args,
|
||||
},
|
||||
pending_main_kwargs={
|
||||
'upload_id': create_multipart_future
|
||||
},
|
||||
),
|
||||
tag=upload_part_tag,
|
||||
)
|
||||
)
|
||||
|
||||
complete_multipart_extra_args = self._extra_complete_multipart_args(
|
||||
call_args.extra_args
|
||||
)
|
||||
# Submit the request to complete the multipart upload.
|
||||
self._transfer_coordinator.submit(
|
||||
request_executor,
|
||||
CompleteMultipartUploadTask(
|
||||
transfer_coordinator=self._transfer_coordinator,
|
||||
main_kwargs={
|
||||
'client': client,
|
||||
'bucket': call_args.bucket,
|
||||
'key': call_args.key,
|
||||
'extra_args': complete_multipart_extra_args,
|
||||
},
|
||||
pending_main_kwargs={
|
||||
'upload_id': create_multipart_future,
|
||||
'parts': part_futures,
|
||||
},
|
||||
is_final=True,
|
||||
),
|
||||
)
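
    # The submission above wires up a three-phase task graph through futures:
    # CreateMultipartUpload -> N UploadPart tasks (each blocked on the
    # upload_id future) -> CompleteMultipartUpload (blocked on upload_id and
    # every part future). Each resolved part future yields a dict such as
    # {'ETag': '"abc123"', 'PartNumber': 1} (values hypothetical), the shape
    # S3 expects in MultipartUpload={'Parts': [...]}.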

    def _extra_upload_part_args(self, extra_args):
        # Only the args in UPLOAD_PART_ARGS actually need to be passed
        # onto the upload_part calls.
        return get_filtered_dict(extra_args, self.UPLOAD_PART_ARGS)

    def _extra_complete_multipart_args(self, extra_args):
        return get_filtered_dict(extra_args, self.COMPLETE_MULTIPART_ARGS)

    def _extra_create_multipart_args(self, extra_args):
        return get_filtered_dict(
            extra_args, blocklisted_keys=self.CREATE_MULTIPART_BLOCKLIST
        )

    def _extra_put_object_args(self, extra_args):
        return get_filtered_dict(
            extra_args, blocklisted_keys=self.PUT_OBJECT_BLOCKLIST
        )

    def _get_upload_task_tag(self, upload_input_manager, operation_name):
        tag = None
        if upload_input_manager.stores_body_in_memory(operation_name):
            tag = IN_MEMORY_UPLOAD_TAG
        return tag


class PutObjectTask(Task):
    """Task to do a nonmultipart upload"""

    def _main(self, client, fileobj, bucket, key, extra_args):
        """
        :param client: The client to use when calling PutObject
        :param fileobj: The file to upload.
        :param bucket: The name of the bucket to upload to
        :param key: The name of the key to upload to
        :param extra_args: A dictionary of any extra arguments that may be
            used in the upload.
        """
        with fileobj as body:
            client.put_object(Bucket=bucket, Key=key, Body=body, **extra_args)


class UploadPartTask(Task):
    """Task to upload a part in a multipart upload"""

    def _main(
        self, client, fileobj, bucket, key, upload_id, part_number, extra_args
    ):
        """
        :param client: The client to use when calling UploadPart
        :param fileobj: The file to upload.
        :param bucket: The name of the bucket to upload to
        :param key: The name of the key to upload to
        :param upload_id: The id of the upload
        :param part_number: The number representing the part of the multipart
            upload
        :param extra_args: A dictionary of any extra arguments that may be
            used in the upload.

        :rtype: dict
        :returns: A dictionary representing a part::

            {'ETag': etag_value, 'PartNumber': part_number}

            This value can be appended to a list to be used to complete
            the multipart upload.
        """
        with fileobj as body:
            response = client.upload_part(
                Bucket=bucket,
                Key=key,
                UploadId=upload_id,
                PartNumber=part_number,
                Body=body,
                **extra_args,
            )
        etag = response['ETag']
        part_metadata = {'ETag': etag, 'PartNumber': part_number}
        if 'ChecksumAlgorithm' in extra_args:
            algorithm_name = extra_args['ChecksumAlgorithm'].upper()
            checksum_member = f'Checksum{algorithm_name}'
            if checksum_member in response:
                part_metadata[checksum_member] = response[checksum_member]
        return part_metadata
@@ -0,0 +1,848 @@
# Copyright 2016 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You
# may not use this file except in compliance with the License. A copy of
# the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.
import functools
import logging
import math
import os
import random
import socket
import stat
import string
import threading
from collections import defaultdict

from botocore.exceptions import (
    IncompleteReadError,
    ReadTimeoutError,
    ResponseStreamingError,
)
from botocore.httpchecksum import DEFAULT_CHECKSUM_ALGORITHM, AwsChunkedWrapper
from botocore.utils import is_s3express_bucket

from s3transfer.compat import SOCKET_ERROR, fallocate, rename_file
from s3transfer.constants import FULL_OBJECT_CHECKSUM_ARGS

MAX_PARTS = 10000
# The maximum file size you can upload via S3 per request.
# See: http://docs.aws.amazon.com/AmazonS3/latest/dev/UploadingObjects.html
# and: http://docs.aws.amazon.com/AmazonS3/latest/dev/qfacts.html
MAX_SINGLE_UPLOAD_SIZE = 5 * (1024**3)
MIN_UPLOAD_CHUNKSIZE = 5 * (1024**2)
logger = logging.getLogger(__name__)


S3_RETRYABLE_DOWNLOAD_ERRORS = (
    socket.timeout,
    SOCKET_ERROR,
    ReadTimeoutError,
    IncompleteReadError,
    ResponseStreamingError,
)


def random_file_extension(num_digits=8):
    return ''.join(random.choice(string.hexdigits) for _ in range(num_digits))


def signal_not_transferring(request, operation_name, **kwargs):
    if operation_name in ['PutObject', 'UploadPart'] and hasattr(
        request.body, 'signal_not_transferring'
    ):
        request.body.signal_not_transferring()


def signal_transferring(request, operation_name, **kwargs):
    if operation_name in ['PutObject', 'UploadPart']:
        body = request.body
        if isinstance(body, AwsChunkedWrapper):
            body = getattr(body, '_raw', None)
        if hasattr(body, 'signal_transferring'):
            body.signal_transferring()
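
# Registration sketch (illustrative): the transfer manager hooks these two
# handlers into botocore's event system so that upload bodies pause their
# progress callbacks while a request is being (re)built and resume them once
# the request is actually sent. Assuming a botocore client ``client``:
#
# >>> client.meta.events.register_first(
# ...     'request-created.s3', signal_not_transferring
# ... )
# >>> client.meta.events.register_last(
# ...     'request-created.s3', signal_transferring
# ... )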


def calculate_num_parts(size, part_size):
    return int(math.ceil(size / float(part_size)))


def calculate_range_parameter(
    part_size, part_index, num_parts, total_size=None
):
    """Calculate the range parameter for multipart downloads/copies

    :type part_size: int
    :param part_size: The size of the part

    :type part_index: int
    :param part_index: The index at which this part starts. This index starts
        at zero.

    :type num_parts: int
    :param num_parts: The total number of parts in the transfer

    :returns: The value to use for the Range parameter on downloads or
        the CopySourceRange parameter for copies
    """
    # Used to calculate the Range parameter
    start_range = part_index * part_size
    if part_index == num_parts - 1:
        end_range = ''
        if total_size is not None:
            end_range = str(total_size - 1)
    else:
        end_range = start_range + part_size - 1
    range_param = f'bytes={start_range}-{end_range}'
    return range_param
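
# Worked example (illustrative values): an 8 MiB part size over a 20 MiB
# object yields three parts, and the last part's range is capped by the
# total size when it is known:
#
# >>> calculate_num_parts(20971520, 8388608)
# 3
# >>> calculate_range_parameter(8388608, 0, 3)
# 'bytes=0-8388607'
# >>> calculate_range_parameter(8388608, 2, 3, total_size=20971520)
# 'bytes=16777216-20971519'
# >>> calculate_range_parameter(8388608, 2, 3)  # unknown total size
# 'bytes=16777216-'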


def get_callbacks(transfer_future, callback_type):
    """Retrieves callbacks from a subscriber

    :type transfer_future: s3transfer.futures.TransferFuture
    :param transfer_future: The transfer future the subscriber is associated
        with.

    :type callback_type: str
    :param callback_type: The type of callback to retrieve from the subscriber.
        Valid types include:
            * 'queued'
            * 'progress'
            * 'done'

    :returns: A list of callbacks for the type specified. All callbacks are
        preinjected with the transfer future.
    """
    callbacks = []
    for subscriber in transfer_future.meta.call_args.subscribers:
        callback_name = 'on_' + callback_type
        if hasattr(subscriber, callback_name):
            callbacks.append(
                functools.partial(
                    getattr(subscriber, callback_name), future=transfer_future
                )
            )
    return callbacks


def invoke_progress_callbacks(callbacks, bytes_transferred):
    """Calls all progress callbacks

    :param callbacks: A list of progress callbacks to invoke
    :param bytes_transferred: The number of bytes transferred. This is passed
        to the callbacks. If no bytes were transferred the callbacks will not
        be invoked because no progress was achieved. It is also possible
        to receive a negative amount which comes from retrying a transfer
        request.
    """
    # Only invoke the callbacks if bytes were actually transferred.
    if bytes_transferred:
        for callback in callbacks:
            callback(bytes_transferred=bytes_transferred)


def get_filtered_dict(
    original_dict, whitelisted_keys=None, blocklisted_keys=None
):
    """Gets a dictionary filtered by whitelisted and blocklisted keys.

    :param original_dict: The original dictionary of arguments to source keys
        and values.
    :param whitelisted_keys: A list of keys to include in the filtered
        dictionary.
    :param blocklisted_keys: A list of keys to exclude from the filtered
        dictionary.

    :returns: A dictionary containing key/values from the original dictionary
        whose key was included in the whitelist and/or not included in the
        blocklist.
    """
    filtered_dict = {}
    for key, value in original_dict.items():
        if (whitelisted_keys and key in whitelisted_keys) or (
            blocklisted_keys and key not in blocklisted_keys
        ):
            filtered_dict[key] = value
    return filtered_dict
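
# Usage sketch (illustrative values): the same ExtraArgs dict can be split
# between API calls with either an allow list or a block list:
#
# >>> extra_args = {'ACL': 'public-read', 'Metadata': {'k': 'v'}}
# >>> get_filtered_dict(extra_args, whitelisted_keys=['Metadata'])
# {'Metadata': {'k': 'v'}}
# >>> get_filtered_dict(extra_args, blocklisted_keys=['ACL'])
# {'Metadata': {'k': 'v'}}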


class CallArgs:
    def __init__(self, **kwargs):
        """A class that records call arguments

        The call arguments must be passed as keyword arguments. It will set
        each keyword argument as an attribute of the object along with its
        associated value.
        """
        for arg, value in kwargs.items():
            setattr(self, arg, value)


class FunctionContainer:
    """An object that contains a function and any args or kwargs to call it

    When called the provided function will be called with provided args
    and kwargs.
    """

    def __init__(self, func, *args, **kwargs):
        self._func = func
        self._args = args
        self._kwargs = kwargs

    def __repr__(self):
        return f'Function: {self._func} with args {self._args} and kwargs {self._kwargs}'

    def __call__(self):
        return self._func(*self._args, **self._kwargs)
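
# Usage sketch (illustrative): FunctionContainer freezes a call so it can be
# handed off and invoked later with no arguments, e.g. as a cleanup hook:
#
# >>> container = FunctionContainer(print, 'hello', 'world', sep='-')
# >>> container()
# hello-world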


class CountCallbackInvoker:
    """An abstraction to invoke a callback when a shared count reaches zero

    :param callback: Callback to invoke when the finalized count reaches zero
    """

    def __init__(self, callback):
        self._lock = threading.Lock()
        self._callback = callback
        self._count = 0
        self._is_finalized = False

    @property
    def current_count(self):
        with self._lock:
            return self._count

    def increment(self):
        """Increment the count by one"""
        with self._lock:
            if self._is_finalized:
                raise RuntimeError(
                    'Counter has been finalized; it can no longer be '
                    'incremented.'
                )
            self._count += 1

    def decrement(self):
        """Decrement the count by one"""
        with self._lock:
            if self._count == 0:
                raise RuntimeError(
                    'Counter is at zero. It cannot dip below zero.'
                )
            self._count -= 1
            if self._is_finalized and self._count == 0:
                self._callback()

    def finalize(self):
        """Finalize the counter

        Once finalized, the counter can never be incremented and the callback
        will be invoked once the count reaches zero.
        """
        with self._lock:
            self._is_finalized = True
            if self._count == 0:
                self._callback()
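
# Usage sketch (illustrative): the callback fires only once the counter has
# been finalized *and* drained back to zero, letting producers keep adding
# work until they declare the set of tasks complete:
#
# >>> invoker = CountCallbackInvoker(lambda: print('all parts done'))
# >>> invoker.increment()  # part 1 submitted
# >>> invoker.increment()  # part 2 submitted
# >>> invoker.decrement()  # part 1 finished
# >>> invoker.finalize()   # no more parts will be added
# >>> invoker.decrement()  # part 2 finished
# all parts done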


class OSUtils:
    _MAX_FILENAME_LEN = 255

    def get_file_size(self, filename):
        return os.path.getsize(filename)

    def open_file_chunk_reader(self, filename, start_byte, size, callbacks):
        return ReadFileChunk.from_filename(
            filename, start_byte, size, callbacks, enable_callbacks=False
        )

    def open_file_chunk_reader_from_fileobj(
        self,
        fileobj,
        chunk_size,
        full_file_size,
        callbacks,
        close_callbacks=None,
    ):
        return ReadFileChunk(
            fileobj,
            chunk_size,
            full_file_size,
            callbacks=callbacks,
            enable_callbacks=False,
            close_callbacks=close_callbacks,
        )

    def open(self, filename, mode):
        return open(filename, mode)

    def remove_file(self, filename):
        """Remove a file, noop if file does not exist."""
        # Unlike os.remove, if the file does not exist,
        # then this method does nothing.
        try:
            os.remove(filename)
        except OSError:
            pass

    def rename_file(self, current_filename, new_filename):
        rename_file(current_filename, new_filename)

    @classmethod
    def is_special_file(cls, filename):
        """Checks to see if a file is a special UNIX file.

        It checks if the file is a character special device, block special
        device, FIFO, or socket.

        :param filename: Name of the file

        :returns: True if the file is a special file. False, if it is not.
        """
        # If it does not exist, it must be a new file so it cannot be
        # a special file.
        if not os.path.exists(filename):
            return False
        mode = os.stat(filename).st_mode
        # Character special device.
        if stat.S_ISCHR(mode):
            return True
        # Block special device
        if stat.S_ISBLK(mode):
            return True
        # Named pipe / FIFO
        if stat.S_ISFIFO(mode):
            return True
        # Socket.
        if stat.S_ISSOCK(mode):
            return True
        return False

    def get_temp_filename(self, filename):
        suffix = os.extsep + random_file_extension()
        path = os.path.dirname(filename)
        name = os.path.basename(filename)
        temp_filename = name[: self._MAX_FILENAME_LEN - len(suffix)] + suffix
        return os.path.join(path, temp_filename)

    def allocate(self, filename, size):
        try:
            with self.open(filename, 'wb') as f:
                fallocate(f, size)
        except OSError:
            self.remove_file(filename)
            raise
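
# Usage sketch (illustrative; this is the pattern the download path in this
# package follows): write to a temporary sibling file, then rename it into
# place so a failed transfer never leaves a partial final file behind:
#
# >>> osutil = OSUtils()
# >>> temp = osutil.get_temp_filename('/tmp/report.csv')
# >>> # ... write the downloaded bytes to `temp` ...
# >>> osutil.rename_file(temp, '/tmp/report.csv')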


class DeferredOpenFile:
    def __init__(self, filename, start_byte=0, mode='rb', open_function=open):
        """A class that defers the opening of a file till needed

        This is useful for deferring opening of a file till it is needed
        in a separate thread, as there is a limit on how many open files
        there can be in a single process for most operating systems. The
        file gets opened in the following methods: ``read()``, ``seek()``,
        and ``__enter__()``

        :type filename: str
        :param filename: The name of the file to open

        :type start_byte: int
        :param start_byte: The byte to seek to when the file is opened.

        :type mode: str
        :param mode: The mode to use to open the file

        :type open_function: function
        :param open_function: The function to use to open the file
        """
        self._filename = filename
        self._fileobj = None
        self._start_byte = start_byte
        self._mode = mode
        self._open_function = open_function

    def _open_if_needed(self):
        if self._fileobj is None:
            self._fileobj = self._open_function(self._filename, self._mode)
            if self._start_byte != 0:
                self._fileobj.seek(self._start_byte)

    @property
    def name(self):
        return self._filename

    def read(self, amount=None):
        self._open_if_needed()
        return self._fileobj.read(amount)

    def write(self, data):
        self._open_if_needed()
        self._fileobj.write(data)

    def seek(self, where, whence=0):
        self._open_if_needed()
        self._fileobj.seek(where, whence)

    def tell(self):
        if self._fileobj is None:
            return self._start_byte
        return self._fileobj.tell()

    def close(self):
        if self._fileobj:
            self._fileobj.close()

    def __enter__(self):
        self._open_if_needed()
        return self

    def __exit__(self, *args, **kwargs):
        self.close()
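
# Usage sketch (illustrative paths): construction does not touch the
# filesystem; the file descriptor is only consumed on first use:
#
# >>> lazy = DeferredOpenFile('/tmp/large.bin', start_byte=1024)
# >>> lazy.tell()             # no file handle has been opened yet
# 1024
# >>> data = lazy.read(4096)  # opens the file and seeks to byte 1024
# >>> lazy.close()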


class ReadFileChunk:
    def __init__(
        self,
        fileobj,
        chunk_size,
        full_file_size,
        callbacks=None,
        enable_callbacks=True,
        close_callbacks=None,
    ):
        """

        Given a file object shown below::

            |___________________________________________________|
            0          |                 |                       full_file_size
                       |----chunk_size---|
                    f.tell()

        :type fileobj: file
        :param fileobj: File like object

        :type chunk_size: int
        :param chunk_size: The max chunk size to read. Trying to read
            past the end of the chunk size will behave like you've
            reached the end of the file.

        :type full_file_size: int
        :param full_file_size: The entire content length associated
            with ``fileobj``.

        :type callbacks: A list of function(amount_read)
        :param callbacks: Called whenever data is read from this object in the
            order provided.

        :type enable_callbacks: boolean
        :param enable_callbacks: Whether to run the registered callbacks when
            data is read.

        :type close_callbacks: A list of function()
        :param close_callbacks: Called when close is called. The function
            should take no arguments.
        """
        self._fileobj = fileobj
        self._start_byte = self._fileobj.tell()
        self._size = self._calculate_file_size(
            self._fileobj,
            requested_size=chunk_size,
            start_byte=self._start_byte,
            actual_file_size=full_file_size,
        )
        # _amount_read represents the position in the chunk and may exceed
        # the chunk size, but won't allow reads out of bounds.
        self._amount_read = 0
        self._callbacks = callbacks
        if callbacks is None:
            self._callbacks = []
        self._callbacks_enabled = enable_callbacks
        self._close_callbacks = close_callbacks
        if close_callbacks is None:
            self._close_callbacks = []

    @classmethod
    def from_filename(
        cls,
        filename,
        start_byte,
        chunk_size,
        callbacks=None,
        enable_callbacks=True,
    ):
        """Convenience factory function to create from a filename.

        :type start_byte: int
        :param start_byte: The first byte from which to start reading.

        :type chunk_size: int
        :param chunk_size: The max chunk size to read. Trying to read
            past the end of the chunk size will behave like you've
            reached the end of the file.

        :type callbacks: function(amount_read)
        :param callbacks: Called whenever data is read from this object.

        :type enable_callbacks: bool
        :param enable_callbacks: Indicate whether to invoke callback
            during read() calls.

        :rtype: ``ReadFileChunk``
        :return: A new instance of ``ReadFileChunk``
        """
        f = open(filename, 'rb')
        f.seek(start_byte)
        file_size = os.fstat(f.fileno()).st_size
        return cls(f, chunk_size, file_size, callbacks, enable_callbacks)

    def _calculate_file_size(
        self, fileobj, requested_size, start_byte, actual_file_size
    ):
        max_chunk_size = actual_file_size - start_byte
        return min(max_chunk_size, requested_size)

    def read(self, amount=None):
        amount_left = max(self._size - self._amount_read, 0)
        if amount is None:
            amount_to_read = amount_left
        else:
            amount_to_read = min(amount_left, amount)
        data = self._fileobj.read(amount_to_read)
        self._amount_read += len(data)
        if self._callbacks is not None and self._callbacks_enabled:
            invoke_progress_callbacks(self._callbacks, len(data))
        return data

    def signal_transferring(self):
        self.enable_callback()
        if hasattr(self._fileobj, 'signal_transferring'):
            self._fileobj.signal_transferring()

    def signal_not_transferring(self):
        self.disable_callback()
        if hasattr(self._fileobj, 'signal_not_transferring'):
            self._fileobj.signal_not_transferring()

    def enable_callback(self):
        self._callbacks_enabled = True

    def disable_callback(self):
        self._callbacks_enabled = False

    def seek(self, where, whence=0):
        if whence not in (0, 1, 2):
            # Mimic io's error for invalid whence values
            raise ValueError(f"invalid whence ({whence}, should be 0, 1 or 2)")

        # Recalculate where based on chunk attributes so seek from file
        # start (whence=0) is always used
        where += self._start_byte
        if whence == 1:
            where += self._amount_read
        elif whence == 2:
            where += self._size

        self._fileobj.seek(max(where, self._start_byte))
        if self._callbacks is not None and self._callbacks_enabled:
            # To also rewind the callback() for an accurate progress report
            bounded_where = max(min(where - self._start_byte, self._size), 0)
            bounded_amount_read = min(self._amount_read, self._size)
            amount = bounded_where - bounded_amount_read
            invoke_progress_callbacks(
                self._callbacks, bytes_transferred=amount
            )
        self._amount_read = max(where - self._start_byte, 0)

    def close(self):
        if self._close_callbacks is not None and self._callbacks_enabled:
            for callback in self._close_callbacks:
                callback()
        self._fileobj.close()

    def tell(self):
        return self._amount_read

    def __len__(self):
        # __len__ is defined because requests will try to determine the length
        # of the stream to set a content length. In the normal case
        # of the file it will just stat the file, but we need to change that
        # behavior. By providing a __len__, requests will use that instead
        # of stat'ing the file.
        return self._size

    def __enter__(self):
        return self

    def __exit__(self, *args, **kwargs):
        self.close()

    def __iter__(self):
        # This is a workaround for http://bugs.python.org/issue17575
        # Basically httplib will try to iterate over the contents, even
        # if it's a file like object. This wasn't noticed because we've
        # already exhausted the stream so iterating over the file immediately
        # stops, which is what we're simulating here.
        return iter([])
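
# Usage sketch (illustrative values, assuming /tmp/example.txt holds at least
# 15 bytes): a chunk bounded to 5 bytes behaves like a small standalone file:
#
# >>> chunk = ReadFileChunk.from_filename(
# ...     '/tmp/example.txt', start_byte=10, chunk_size=5
# ... )
# >>> len(chunk)            # the content length advertised for the body
# 5
# >>> data = chunk.read()   # at most 5 bytes starting at offset 10
# >>> chunk.read()          # reading past the chunk boundary acts like EOF
# b''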


class StreamReaderProgress:
    """Wrapper for a read only stream that adds progress callbacks."""

    def __init__(self, stream, callbacks=None):
        self._stream = stream
        self._callbacks = callbacks
        if callbacks is None:
            self._callbacks = []

    def read(self, *args, **kwargs):
        value = self._stream.read(*args, **kwargs)
        invoke_progress_callbacks(self._callbacks, len(value))
        return value
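
# Usage sketch (illustrative; ``client`` is a hypothetical boto3 S3 client):
# wrap a streaming response body so every read drives progress callbacks:
#
# >>> response = client.get_object(Bucket='bucket', Key='key')
# >>> stream = StreamReaderProgress(
# ...     response['Body'],
# ...     callbacks=[lambda bytes_transferred: print(bytes_transferred)],
# ... )
# >>> _ = stream.read(8192)  # prints the number of bytes actually read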


class NoResourcesAvailable(Exception):
    pass


class TaskSemaphore:
    def __init__(self, count):
        """A semaphore for the purpose of limiting the number of tasks

        :param count: The size of the semaphore
        """
        self._semaphore = threading.Semaphore(count)

    def acquire(self, tag, blocking=True):
        """Acquire the semaphore

        :param tag: A tag identifying what is acquiring the semaphore. Note
            that this is not really needed to directly use this class but is
            needed for API compatibility with the SlidingWindowSemaphore
            implementation.
        :param blocking: If True, block until it can be acquired. If False,
            do not block and raise an exception if it cannot be acquired.

        :returns: A token (can be None) to use when releasing the semaphore
        """
        logger.debug("Acquiring %s", tag)
        if not self._semaphore.acquire(blocking):
            raise NoResourcesAvailable(f"Cannot acquire tag '{tag}'")

    def release(self, tag, acquire_token):
        """Release the semaphore

        :param tag: A tag identifying what is releasing the semaphore
        :param acquire_token: The token returned from when the semaphore was
            acquired. Note that this is not really needed to directly use this
            class but is needed for API compatibility with the
            SlidingWindowSemaphore implementation.
        """
        logger.debug(f"Releasing acquire {tag}/{acquire_token}")
        self._semaphore.release()


class SlidingWindowSemaphore(TaskSemaphore):
    """A semaphore used to coordinate sequential resource access.

    This class is similar to the stdlib BoundedSemaphore:

    * It's initialized with a count.
    * Each call to ``acquire()`` decrements the counter.
    * If the count is at zero, then ``acquire()`` will either block until the
      count increases, or if ``blocking=False``, then it will raise
      a NoResourcesAvailable exception indicating that it failed to acquire
      the semaphore.

    The main difference is that this semaphore is used to limit
    access to a resource that requires sequential access. For example,
    if I want to access resource R that has 20 subresources R_0 - R_19,
    this semaphore can also enforce that you only have a max range of
    10 at any given point in time. You must also specify a tag name
    when you acquire the semaphore. The sliding window semantics apply
    on a per tag basis. The internal count will only be incremented
    when the minimum sequence number for a tag is released.
    """

    def __init__(self, count):
        self._count = count
        # Dict[tag, next_sequence_number].
        self._tag_sequences = defaultdict(int)
        self._lowest_sequence = {}
        self._lock = threading.Lock()
        self._condition = threading.Condition(self._lock)
        # Dict[tag, List[sequence_number]]
        self._pending_release = {}

    def current_count(self):
        with self._lock:
            return self._count

    def acquire(self, tag, blocking=True):
        logger.debug("Acquiring %s", tag)
        self._condition.acquire()
        try:
            if self._count == 0:
                if not blocking:
                    raise NoResourcesAvailable(f"Cannot acquire tag '{tag}'")
                else:
                    while self._count == 0:
                        self._condition.wait()
            # self._count is no longer zero.
            # First, check if this is the first time we're seeing this tag.
            sequence_number = self._tag_sequences[tag]
            if sequence_number == 0:
                # First time seeing the tag, so record we're at 0.
                self._lowest_sequence[tag] = sequence_number
            self._tag_sequences[tag] += 1
            self._count -= 1
            return sequence_number
        finally:
            self._condition.release()

    def release(self, tag, acquire_token):
        sequence_number = acquire_token
        logger.debug("Releasing acquire %s/%s", tag, sequence_number)
        self._condition.acquire()
        try:
            if tag not in self._tag_sequences:
                raise ValueError(f"Attempted to release unknown tag: {tag}")
            max_sequence = self._tag_sequences[tag]
            if self._lowest_sequence[tag] == sequence_number:
                # We can immediately process this request and free up
                # resources.
                self._lowest_sequence[tag] += 1
                self._count += 1
                self._condition.notify()
                queued = self._pending_release.get(tag, [])
                while queued:
                    if self._lowest_sequence[tag] == queued[-1]:
                        queued.pop()
                        self._lowest_sequence[tag] += 1
                        self._count += 1
                    else:
                        break
            elif self._lowest_sequence[tag] < sequence_number < max_sequence:
                # We can't do anything right now because we're still waiting
                # for the min sequence for the tag to be released. We have
                # to queue this for pending release.
                self._pending_release.setdefault(tag, []).append(
                    sequence_number
                )
                self._pending_release[tag].sort(reverse=True)
            else:
                raise ValueError(
                    "Attempted to release unknown sequence number "
                    f"{sequence_number} for tag: {tag}"
                )
        finally:
            self._condition.release()
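
# Behavior sketch (illustrative): with a window of 2, releasing a later
# sequence number alone does not reopen the window; capacity is only
# returned once the *lowest* outstanding sequence number is released:
#
# >>> sem = SlidingWindowSemaphore(2)
# >>> first = sem.acquire('part')   # sequence 0
# >>> second = sem.acquire('part')  # sequence 1
# >>> sem.release('part', second)   # queued; window still full
# >>> sem.release('part', first)    # frees sequence 0 *and* the queued 1
# >>> sem.current_count()
# 2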


class ChunksizeAdjuster:
    def __init__(
        self,
        max_size=MAX_SINGLE_UPLOAD_SIZE,
        min_size=MIN_UPLOAD_CHUNKSIZE,
        max_parts=MAX_PARTS,
    ):
        self.max_size = max_size
        self.min_size = min_size
        self.max_parts = max_parts

    def adjust_chunksize(self, current_chunksize, file_size=None):
        """Get a chunksize close to current that fits within all S3 limits.

        :type current_chunksize: int
        :param current_chunksize: The currently configured chunksize.

        :type file_size: int or None
        :param file_size: The size of the file to upload. This might be None
            if the object being transferred has an unknown size.

        :returns: A valid chunksize that fits within configured limits.
        """
        chunksize = current_chunksize
        if file_size is not None:
            chunksize = self._adjust_for_max_parts(chunksize, file_size)
        return self._adjust_for_chunksize_limits(chunksize)

    def _adjust_for_chunksize_limits(self, current_chunksize):
        if current_chunksize > self.max_size:
            logger.debug(
                "Chunksize greater than maximum chunksize. "
                f"Setting to {self.max_size} from {current_chunksize}."
            )
            return self.max_size
        elif current_chunksize < self.min_size:
            logger.debug(
                "Chunksize less than minimum chunksize. "
                f"Setting to {self.min_size} from {current_chunksize}."
            )
            return self.min_size
        else:
            return current_chunksize

    def _adjust_for_max_parts(self, current_chunksize, file_size):
        chunksize = current_chunksize
        num_parts = int(math.ceil(file_size / float(chunksize)))

        while num_parts > self.max_parts:
            chunksize *= 2
            num_parts = int(math.ceil(file_size / float(chunksize)))

        if chunksize != current_chunksize:
            logger.debug(
                "Chunksize would result in the number of parts exceeding the "
                f"maximum. Setting to {chunksize} from {current_chunksize}."
            )

        return chunksize
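
# Worked example (illustrative): an 8 MiB chunksize over a 1 TiB file would
# require 131072 parts, so the adjuster doubles the chunksize until the part
# count fits under MAX_PARTS:
#
# >>> adjuster = ChunksizeAdjuster()
# >>> adjuster.adjust_chunksize(8 * 1024**2, 1024**4)
# 134217728
#
# That is 128 MiB, giving ceil(1 TiB / 128 MiB) = 8192 parts (<= 10000).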


def add_s3express_defaults(bucket, extra_args):
    """
    This function has been deprecated, but is kept for backwards compatibility.
    This function is subject to removal in a future release.
    """
    if is_s3express_bucket(bucket) and "ChecksumAlgorithm" not in extra_args:
        # Default Transfer Operations to S3Express to use CRC32
        extra_args["ChecksumAlgorithm"] = "crc32"


def set_default_checksum_algorithm(extra_args):
    """Set the default algorithm to CRC32 if not specified by the user."""
    if any(checksum in extra_args for checksum in FULL_OBJECT_CHECKSUM_ARGS):
        return
    extra_args.setdefault("ChecksumAlgorithm", DEFAULT_CHECKSUM_ALGORITHM)


# NOTE: The following interfaces are considered private and are subject
# to abrupt breaking changes. Please do not use them directly.

try:
    from botocore.utils import create_nested_client as create_client
except ImportError:

    def create_client(session, *args, **kwargs):
        return session.create_client(*args, **kwargs)


def create_nested_client(session, service_name, **kwargs):
    return create_client(session, service_name, **kwargs)