Skip to content

Commit 4d8d798

Browse files
committed
add parallel multipart upload example to boto3 and upload-objects pages
1 parent 1193e9b commit 4d8d798

2 files changed

Lines changed: 123 additions & 71 deletions

File tree

src/content/docs/r2/examples/aws/boto3.mdx

Lines changed: 98 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -9,95 +9,140 @@ import { Render } from "~/components";
99
<Render file="keys" product="r2" />
1010
<br />
1111

12-
You must configure [`boto3`](https://boto3.amazonaws.com/v1/documentation/api/latest/index.html) to use a preconstructed `endpoint_url` value. This can be done through any `boto3` usage that accepts connection arguments; for example:
12+
Configure [`boto3`](https://boto3.amazonaws.com/v1/documentation/api/latest/index.html) to use your R2 endpoint:
1313

1414
```python
1515
import boto3
1616

17-
s3 = boto3.resource('s3',
18-
# Provide your Cloudflare account ID
19-
endpoint_url = 'https://<ACCOUNT_ID>.r2.cloudflarestorage.com',
20-
# Retrieve your S3 API credentials for your R2 bucket via API tokens (see: https://developers.cloudflare.com/r2/api/tokens)
21-
aws_access_key_id = '<ACCESS_KEY_ID>',
22-
aws_secret_access_key = '<SECRET_ACCESS_KEY>'
17+
s3 = boto3.client(
18+
service_name="s3",
19+
endpoint_url="https://<ACCOUNT_ID>.r2.cloudflarestorage.com",
20+
aws_access_key_id="<ACCESS_KEY_ID>",
21+
aws_secret_access_key="<SECRET_ACCESS_KEY>",
22+
region_name="auto",
2323
)
2424
```
2525

26-
You may, however, omit the `aws_access_key_id` and `aws_secret_access_key ` arguments and allow `boto3` to rely on the `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY` [environment variables](https://boto3.amazonaws.com/v1/documentation/api/latest/guide/configuration.html#using-environment-variables) instead.
26+
You can omit `aws_access_key_id` and `aws_secret_access_key` if you set the `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY` [environment variables](https://boto3.amazonaws.com/v1/documentation/api/latest/guide/configuration.html#using-environment-variables).
2727

28-
An example script may look like the following:
28+
Common operations using the client:
2929

3030
```python
31-
import boto3
31+
# Get object metadata
32+
s3.head_object(Bucket="my-bucket", Key="dog.png")
3233

33-
s3 = boto3.client(
34-
service_name="s3",
35-
# Provide your Cloudflare account ID
36-
endpoint_url='https://<ACCOUNT_ID>.r2.cloudflarestorage.com',
37-
# Retrieve your S3 API credentials for your R2 bucket via API tokens (see: https://developers.cloudflare.com/r2/api/tokens)
38-
aws_access_key_id='<ACCESS_KEY_ID>',
39-
aws_secret_access_key='<SECRET_ACCESS_KEY>',
40-
region_name="auto", # Required by SDK but not used by R2
41-
)
42-
43-
# Get object information
44-
object_information = s3.head_object(Bucket='my-bucket', Key='dog.png')
34+
# Get object
35+
response = s3.get_object(Bucket="my-bucket", Key="dog.png")
4536

46-
# Upload/Update single file
47-
s3.upload_fileobj(io.BytesIO(file_content), 'my-bucket', 'dog.png')
37+
# Upload single file
38+
s3.upload_fileobj(io.BytesIO(file_content), "my-bucket", "dog.png")
4839

4940
# Delete object
50-
s3.delete_object(Bucket='my-bucket', Key='dog.png')
41+
s3.delete_object(Bucket="my-bucket", Key="dog.png")
5142
```
5243

53-
## Generate presigned URLs
44+
## Optimizing upload performance
45+
46+
For large objects (multi-GB files such as training data or video), `upload_fileobj` can become a throughput bottleneck. Its internal thread pool is limited by Python's [GIL](https://en.wikipedia.org/wiki/Global_interpreter_lock), and increasing `max_concurrency` via `TransferConfig` gives diminishing returns beyond ~10 threads.
5447

55-
You can also generate presigned links that can be used to share public read or write access to a bucket temporarily.
48+
Use the low-level multipart API with `ThreadPoolExecutor` instead. Note that this example submits all parts without backpressure, so the main thread may read most of the file into memory before earlier uploads finish; for very large files, bound the number of in-flight parts if memory is a concern:
5649

5750
```python
5851
import boto3
52+
import math
53+
import os
54+
from concurrent.futures import ThreadPoolExecutor
5955

6056
s3 = boto3.client(
6157
service_name="s3",
62-
# Provide your Cloudflare account ID
63-
endpoint_url='https://<ACCOUNT_ID>.r2.cloudflarestorage.com',
64-
# Retrieve your S3 API credentials for your R2 bucket via API tokens (see: https://developers.cloudflare.com/r2/api/tokens)
65-
aws_access_key_id='<ACCESS_KEY_ID>',
66-
aws_secret_access_key='<SECRET_ACCESS_KEY>',
67-
region_name="auto", # Required by SDK but not used by R2
58+
endpoint_url="https://<ACCOUNT_ID>.r2.cloudflarestorage.com",
59+
aws_access_key_id="<ACCESS_KEY_ID>",
60+
aws_secret_access_key="<SECRET_ACCESS_KEY>",
61+
region_name="auto",
6862
)
6963

64+
bucket = "my-bucket"
65+
key = "large-file.bin"
66+
file_path = "./large-file.bin"
67+
part_size = 16 * 1024 * 1024 # 16 MiB per part
68+
max_workers = 10
69+
70+
# Step 1: Create the multipart upload
71+
mpu = s3.create_multipart_upload(Bucket=bucket, Key=key)
72+
upload_id = mpu["UploadId"]
73+
74+
def upload_part(part_number, data):
75+
response = s3.upload_part(
76+
Bucket=bucket,
77+
Key=key,
78+
UploadId=upload_id,
79+
PartNumber=part_number,
80+
Body=data,
81+
)
82+
return {"PartNumber": part_number, "ETag": response["ETag"]}
83+
84+
try:
85+
file_size = os.path.getsize(file_path)
86+
part_count = math.ceil(file_size / part_size)
87+
88+
# Step 2: Upload parts in parallel
89+
with ThreadPoolExecutor(max_workers=max_workers) as pool:
90+
futures = []
91+
with open(file_path, "rb") as f:
92+
for i in range(part_count):
93+
data = f.read(part_size)
94+
futures.append(pool.submit(upload_part, i + 1, data))
95+
96+
parts = [future.result() for future in futures]
97+
98+
# Step 3: Complete the upload
99+
s3.complete_multipart_upload(
100+
Bucket=bucket,
101+
Key=key,
102+
UploadId=upload_id,
103+
MultipartUpload={"Parts": parts},
104+
)
105+
print("Multipart upload complete.")
106+
except Exception:
107+
try:
108+
s3.abort_multipart_upload(Bucket=bucket, Key=key, UploadId=upload_id)
109+
except Exception:
110+
pass
111+
raise
112+
```
113+
114+
For more on multipart uploads, including part size limits and lifecycle management, refer to [Upload objects](/r2/objects/upload-objects/).
115+
116+
## Generate presigned URLs
117+
118+
Generate presigned links to share temporary public read or write access to a bucket.
119+
120+
```python
70121
# Generate presigned URL for reading (GET)
71-
# The ExpiresIn parameter determines how long the presigned link is valid (in seconds)
72122
get_url = s3.generate_presigned_url(
73-
'get_object',
74-
Params={'Bucket': 'my-bucket', 'Key': 'dog.png'},
75-
ExpiresIn=3600 # Valid for 1 hour
123+
"get_object",
124+
Params={"Bucket": "my-bucket", "Key": "dog.png"},
125+
ExpiresIn=3600, # Valid for 1 hour
76126
)
77127

78-
print(get_url)
79-
80128
# Generate presigned URL for writing (PUT)
81-
# Specify ContentType to restrict uploads to a specific file type
82129
put_url = s3.generate_presigned_url(
83-
'put_object',
130+
"put_object",
84131
Params={
85-
'Bucket': 'my-bucket',
86-
'Key': 'dog.png',
87-
'ContentType': 'image/png'
132+
"Bucket": "my-bucket",
133+
"Key": "dog.png",
134+
"ContentType": "image/png",
88135
},
89-
ExpiresIn=3600
136+
ExpiresIn=3600,
90137
)
91-
92-
print(put_url)
93138
```
94139

95140
```sh output
96141
https://<ACCOUNT_ID>.r2.cloudflarestorage.com/my-bucket/dog.png?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=...&X-Amz-Date=<timestamp>&X-Amz-Expires=3600&X-Amz-SignedHeaders=host&X-Amz-Signature=<signature>
97142
https://<ACCOUNT_ID>.r2.cloudflarestorage.com/my-bucket/dog.png?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=...&X-Amz-Date=<timestamp>&X-Amz-Expires=3600&X-Amz-SignedHeaders=content-type%3Bhost&X-Amz-Signature=<signature>
98143
```
99144

100-
You can use the link generated by the `put_object` example to upload to the specified bucket and key, until the presigned link expires. When using a presigned URL with `ContentType`, the client must include a matching `Content-Type` header in the request.
145+
Upload using the presigned PUT URL. When using a presigned URL with `ContentType`, the client must include a matching `Content-Type` header:
101146

102147
```sh
103148
curl -X PUT "https://<ACCOUNT_ID>.r2.cloudflarestorage.com/my-bucket/dog.png?X-Amz-Algorithm=..." \
@@ -128,16 +173,14 @@ When generating presigned URLs for uploads, you can limit abuse and misuse by:
128173
Then generate a presigned URL with a Content-Type restriction:
129174

130175
```python
131-
# Generate a presigned URL with Content-Type restriction
132-
# The upload will only succeed if the client sends Content-Type: image/png
133176
put_url = s3.generate_presigned_url(
134-
'put_object',
177+
"put_object",
135178
Params={
136-
'Bucket': 'my-bucket',
137-
'Key': 'dog.png',
138-
'ContentType': 'image/png'
179+
"Bucket": "my-bucket",
180+
"Key": "dog.png",
181+
"ContentType": "image/png",
139182
},
140-
ExpiresIn=3600
183+
ExpiresIn=3600,
141184
)
142185
```
143186

src/content/docs/r2/objects/upload-objects.mdx

Lines changed: 25 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -426,7 +426,8 @@ s3 = boto3.client(
426426
region_name="auto",
427427
)
428428
429-
# upload_file automatically uses multipart for large files
429+
# upload_file automatically uses multipart for large files.
430+
# For better throughput with large objects, use the manual multipart example below.
430431
s3.upload_file(
431432
Filename="./large-file.bin",
432433
Bucket="my-bucket",
@@ -601,6 +602,7 @@ try {
601602
import boto3
602603
import math
603604
import os
605+
from concurrent.futures import ThreadPoolExecutor, as_completed
604606

605607
s3 = boto3.client(
606608
service_name="s3",
@@ -613,29 +615,36 @@ s3 = boto3.client(
613615
bucket = "my-bucket"
614616
key = "large-file.bin"
615617
file_path = "./large-file.bin"
616-
part_size = 10 * 1024 * 1024 # 10 MiB per part
618+
part_size = 16 * 1024 * 1024 # 16 MiB per part
619+
max_workers = 10 # Number of parallel upload threads
617620

618621
# Step 1: Create the multipart upload
619622
mpu = s3.create_multipart_upload(Bucket=bucket, Key=key)
620623
upload_id = mpu["UploadId"]
621624

625+
def upload_part(part_number, data):
626+
response = s3.upload_part(
627+
Bucket=bucket,
628+
Key=key,
629+
UploadId=upload_id,
630+
PartNumber=part_number,
631+
Body=data,
632+
)
633+
return {"PartNumber": part_number, "ETag": response["ETag"]}
634+
622635
try:
623636
file_size = os.path.getsize(file_path)
624637
part_count = math.ceil(file_size / part_size)
625-
parts = []
626-
627-
# Step 2: Upload each part
628-
with open(file_path, "rb") as f:
629-
for i in range(part_count):
630-
data = f.read(part_size)
631-
response = s3.upload_part(
632-
Bucket=bucket,
633-
Key=key,
634-
UploadId=upload_id,
635-
PartNumber=i + 1,
636-
Body=data,
637-
)
638-
parts.append({"PartNumber": i + 1, "ETag": response["ETag"]})
638+
639+
# Step 2: Upload parts in parallel
640+
with ThreadPoolExecutor(max_workers=max_workers) as pool:
641+
futures = []
642+
with open(file_path, "rb") as f:
643+
for i in range(part_count):
644+
data = f.read(part_size)
645+
futures.append(pool.submit(upload_part, i + 1, data))
646+
647+
parts = [future.result() for future in futures]
639648

640649
# Step 3: Complete the upload
641650
s3.complete_multipart_upload(

0 commit comments

Comments
 (0)