-
Notifications
You must be signed in to change notification settings - Fork 213
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[FEATURE REQUEST #32] On-Premises S3 / S3 Compatible... #389
base: main
Are you sure you want to change the base?
Changes from all commits
d8c2079
1ec4f07
0327f15
1fa3a97
bd90304
d41da3d
a737b23
819f773
c000bc7
882b84f
d76756e
eedee82
ac951e3
f8acadb
7a97aec
006baaf
870101f
ca04dc1
025ac8c
fedc68c
6801079
fb0756f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -38,12 +38,14 @@ | |
import org.apache.polaris.core.admin.model.FileStorageConfigInfo; | ||
import org.apache.polaris.core.admin.model.GcpStorageConfigInfo; | ||
import org.apache.polaris.core.admin.model.PolarisCatalog; | ||
import org.apache.polaris.core.admin.model.S3CompatibleStorageConfigInfo; | ||
import org.apache.polaris.core.admin.model.StorageConfigInfo; | ||
import org.apache.polaris.core.storage.FileStorageConfigurationInfo; | ||
import org.apache.polaris.core.storage.PolarisStorageConfigurationInfo; | ||
import org.apache.polaris.core.storage.aws.AwsStorageConfigurationInfo; | ||
import org.apache.polaris.core.storage.azure.AzureStorageConfigurationInfo; | ||
import org.apache.polaris.core.storage.gcp.GcpStorageConfigurationInfo; | ||
import org.apache.polaris.core.storage.s3compatible.S3CompatibleStorageConfigurationInfo; | ||
|
||
/** | ||
* Catalog specific subclass of the {@link PolarisEntity} that handles conversion from the {@link | ||
|
@@ -141,6 +143,22 @@ private StorageConfigInfo getStorageInfo(Map<String, String> internalProperties) | |
.setRegion(awsConfig.getRegion()) | ||
.build(); | ||
} | ||
if (configInfo instanceof S3CompatibleStorageConfigurationInfo) { | ||
S3CompatibleStorageConfigurationInfo s3Config = | ||
(S3CompatibleStorageConfigurationInfo) configInfo; | ||
return S3CompatibleStorageConfigInfo.builder() | ||
.setStorageType(StorageConfigInfo.StorageTypeEnum.S3_COMPATIBLE) | ||
.setS3Endpoint(s3Config.getS3Endpoint()) | ||
.setS3ProfileName(s3Config.getS3ProfileName()) | ||
.setS3PathStyleAccess(s3Config.getS3PathStyleAccess()) | ||
.setAllowedLocations(s3Config.getAllowedLocations()) | ||
.setS3CredentialsCatalogAccessKeyEnvVar(s3Config.getS3CredentialsCatalogAccessKeyId()) | ||
.setS3CredentialsCatalogSecretAccessKeyEnvVar( | ||
s3Config.getS3CredentialsCatalogSecretAccessKey()) | ||
.setS3Region(s3Config.getS3Region()) | ||
.setS3RoleArn(s3Config.getS3RoleArn()) | ||
.build(); | ||
} | ||
if (configInfo instanceof AzureStorageConfigurationInfo) { | ||
AzureStorageConfigurationInfo azureConfig = (AzureStorageConfigurationInfo) configInfo; | ||
return AzureStorageConfigInfo.builder() | ||
|
@@ -250,6 +268,21 @@ public Builder setStorageConfigurationInfo( | |
awsConfig.validateArn(awsConfigModel.getRoleArn()); | ||
config = awsConfig; | ||
break; | ||
|
||
case S3_COMPATIBLE: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I am still trying to figure out why we opted to introduce a new source type There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Correct. To complete the response given to @dimas-b a few lines higher : Before the security arbitrage favoring vending only STS there were more options to be used by onPrem ecosystems habits (keys that do not expire quickly, for example). Since then, the difference between the AWS class and the S3compatible class is smaller. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks for the info. I am fine with it as it is, as long as we have a plan to unify it at some point |
||
S3CompatibleStorageConfigInfo s3ConfigModel = | ||
(S3CompatibleStorageConfigInfo) storageConfigModel; | ||
config = | ||
new S3CompatibleStorageConfigurationInfo( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Don't we also need to pass a custom STS endpoint in order to initialize STSClient with a custom endpoint that points to MinIO, as an example? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This implementation explicitly passes the S3 endpoint to the stsClient. Apart from AWS, which exposes AssumeRole, STS and globally IAM on a separate endpoint from the S3 APIs, other S3 providers seem to use the same endpoint. In a mixed AWS + other approach, it would be necessary to explicitly specify the STS endpoint and the S3 endpoint. In the non-AWS approach, it is not mandatory; it is the same endpoint (MinIO, Ceph, Backblaze, Dell ECS, ...) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Also, Backblaze (only one 'l'! 🙂) B2, and I'm guessing some other S3-compatible services, doesn't have an STS, so there's no point in specifying an endpoint. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. got it. Thanks for the info :) |
||
s3ConfigModel.getS3Endpoint(), | ||
s3ConfigModel.getS3ProfileName(), | ||
s3ConfigModel.getS3CredentialsCatalogAccessKeyEnvVar(), | ||
s3ConfigModel.getS3CredentialsCatalogSecretAccessKeyEnvVar(), | ||
s3ConfigModel.getS3PathStyleAccess(), | ||
s3ConfigModel.getS3Region(), | ||
s3ConfigModel.getS3RoleArn(), | ||
new ArrayList<>(allowedLocations)); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: why |
||
break; | ||
case AZURE: | ||
AzureStorageConfigInfo azureConfigModel = (AzureStorageConfigInfo) storageConfigModel; | ||
AzureStorageConfigurationInfo azureConfigInfo = | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -47,6 +47,7 @@ | |
import org.apache.polaris.core.storage.aws.AwsStorageConfigurationInfo; | ||
import org.apache.polaris.core.storage.azure.AzureStorageConfigurationInfo; | ||
import org.apache.polaris.core.storage.gcp.GcpStorageConfigurationInfo; | ||
import org.apache.polaris.core.storage.s3compatible.S3CompatibleStorageConfigurationInfo; | ||
import org.slf4j.Logger; | ||
import org.slf4j.LoggerFactory; | ||
|
||
|
@@ -62,6 +63,7 @@ | |
@JsonTypeInfo(use = JsonTypeInfo.Id.NAME) | ||
@JsonSubTypes({ | ||
@JsonSubTypes.Type(value = AwsStorageConfigurationInfo.class), | ||
@JsonSubTypes.Type(value = S3CompatibleStorageConfigurationInfo.class), | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What is the reason to model non-AWS S3 storage as a separate config info class as opposed to adding a few optional settings to My understanding is that the same AWS SDK is used for actual access anyway, so storage config info basically goes into the same AWS client, right? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The question of modifying the current implementation arose at the beginning of the PR. The direction given was to propose a variant in order not to disrupt the existing. Possibly in a second step, unify. So this PR follows the instructions and avoids modifying the existing and proposes a variant adapted to the S3 alternatives. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Fair enough... I'll review closer soon :) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. AWS config is an S3 configuration. Having two different configurations for the same thing doesn't really make sense to me. @lefebsy, mind doing this? |
||
@JsonSubTypes.Type(value = AzureStorageConfigurationInfo.class), | ||
@JsonSubTypes.Type(value = GcpStorageConfigurationInfo.class), | ||
@JsonSubTypes.Type(value = FileStorageConfigurationInfo.class), | ||
|
@@ -241,6 +243,7 @@ public void validateMaxAllowedLocations(int maxAllowedLocations) { | |
/** Polaris' storage type, each has a fixed prefix for its location */ | ||
public enum StorageType { | ||
S3("s3://"), | ||
S3_COMPATIBLE("s3://"), | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I wondered: if we only support "s3://", can we also support "s3a://" for locations? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It could be added to the list, but I do not know how to verify with a unit test that it works. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I believe |
||
AZURE(List.of("abfs://", "wasb://", "abfss://", "wasbs://")), | ||
GCS("gs://"), | ||
FILE("file://"), | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -20,6 +20,11 @@ | |
|
||
import jakarta.annotation.Nonnull; | ||
import java.net.URI; | ||
import java.util.HashMap; | ||
import java.util.Map; | ||
import java.util.Set; | ||
import java.util.stream.Stream; | ||
import software.amazon.awssdk.policybuilder.iam.*; | ||
|
||
public class StorageUtil { | ||
/** | ||
|
@@ -62,4 +67,136 @@ public class StorageUtil { | |
public static @Nonnull String getBucket(URI uri) { | ||
return uri.getAuthority(); | ||
} | ||
|
||
/** | ||
* Given a path, return it without leading slash | ||
* | ||
* @param path A path to parse | ||
* @return Same path without leading slash | ||
*/ | ||
private static @Nonnull String trimLeadingSlash(String path) { | ||
if (path.startsWith("/")) { | ||
path = path.substring(1); | ||
} | ||
return path; | ||
} | ||
|
||
/** | ||
* Given a URI, format an S3 path | ||
* | ||
* @param uri A path to parse | ||
* @return A bucket and a path joined by slash | ||
*/ | ||
private static @Nonnull String parseS3Path(URI uri) { | ||
String bucket = getBucket(uri); | ||
String path = trimLeadingSlash(uri.getPath()); | ||
return String.join("/", bucket, path); | ||
} | ||
|
||
/** | ||
* Given a roleArn, return the prefix | ||
* | ||
* @param roleArn A roleArn to parse | ||
* @return The prefix of the roleArn | ||
*/ | ||
private static String getArnPrefixFor(String roleArn) { | ||
if (roleArn.contains("aws-cn")) { | ||
return "arn:aws-cn:s3:::"; | ||
} else if (roleArn.contains("aws-us-gov")) { | ||
return "arn:aws-us-gov:s3:::"; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I believe this kind of special cases should not be in Polaris code, but it's ok for this PR as I understand it mimics previous AWS-specific logic. |
||
} else { | ||
return "arn:aws:s3:::"; | ||
} | ||
} | ||
|
||
/** | ||
* generate an IamPolicy from the input readLocations and writeLocations, optionally with list | ||
* support. Credentials will be scoped to exactly the resources provided. If read and write | ||
* locations are empty, a non-empty policy will be generated that grants GetObject and optionally | ||
* ListBucket privileges with no resources. This prevents us from sending an empty policy to AWS | ||
* and just assuming the role with full privileges. | ||
* | ||
* @param roleArn A roleArn | ||
* @param allowList Allow list or not | ||
* @param readLocations A list of input read locations | ||
* @param writeLocations A list of input write locations | ||
* @return A policy limiting scope access | ||
*/ | ||
// TODO - add KMS key access | ||
public static IamPolicy policyString( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why not reuse the policy from the code the deals with AWS S3? |
||
String roleArn, boolean allowList, Set<String> readLocations, Set<String> writeLocations) { | ||
IamPolicy.Builder policyBuilder = IamPolicy.builder(); | ||
IamStatement.Builder allowGetObjectStatementBuilder = | ||
IamStatement.builder() | ||
.effect(IamEffect.ALLOW) | ||
.addAction("s3:GetObject") | ||
.addAction("s3:GetObjectVersion"); | ||
Map<String, IamStatement.Builder> bucketListStatementBuilder = new HashMap<>(); | ||
Map<String, IamStatement.Builder> bucketGetLocationStatementBuilder = new HashMap<>(); | ||
|
||
String arnPrefix = getArnPrefixFor(roleArn); | ||
Stream.concat(readLocations.stream(), writeLocations.stream()) | ||
.distinct() | ||
.forEach( | ||
location -> { | ||
URI uri = URI.create(location); | ||
allowGetObjectStatementBuilder.addResource( | ||
// TODO add support for CN and GOV | ||
IamResource.create( | ||
arnPrefix + StorageUtil.concatFilePrefixes(parseS3Path(uri), "*", "/"))); | ||
final var bucket = arnPrefix + StorageUtil.getBucket(uri); | ||
if (allowList) { | ||
bucketListStatementBuilder | ||
.computeIfAbsent( | ||
bucket, | ||
(String key) -> | ||
IamStatement.builder() | ||
.effect(IamEffect.ALLOW) | ||
.addAction("s3:ListBucket") | ||
.addResource(key)) | ||
.addCondition( | ||
IamConditionOperator.STRING_LIKE, | ||
"s3:prefix", | ||
StorageUtil.concatFilePrefixes(trimLeadingSlash(uri.getPath()), "*", "/")); | ||
} | ||
bucketGetLocationStatementBuilder.computeIfAbsent( | ||
bucket, | ||
key -> | ||
IamStatement.builder() | ||
.effect(IamEffect.ALLOW) | ||
.addAction("s3:GetBucketLocation") | ||
.addResource(key)); | ||
}); | ||
|
||
if (!writeLocations.isEmpty()) { | ||
IamStatement.Builder allowPutObjectStatementBuilder = | ||
IamStatement.builder() | ||
.effect(IamEffect.ALLOW) | ||
.addAction("s3:PutObject") | ||
.addAction("s3:DeleteObject"); | ||
writeLocations.forEach( | ||
location -> { | ||
URI uri = URI.create(location); | ||
// TODO add support for CN and GOV | ||
allowPutObjectStatementBuilder.addResource( | ||
IamResource.create( | ||
arnPrefix + StorageUtil.concatFilePrefixes(parseS3Path(uri), "*", "/"))); | ||
}); | ||
policyBuilder.addStatement(allowPutObjectStatementBuilder.build()); | ||
} | ||
if (!bucketListStatementBuilder.isEmpty()) { | ||
bucketListStatementBuilder | ||
.values() | ||
.forEach(statementBuilder -> policyBuilder.addStatement(statementBuilder.build())); | ||
} else if (allowList) { | ||
// add list privilege with 0 resources | ||
policyBuilder.addStatement( | ||
IamStatement.builder().effect(IamEffect.ALLOW).addAction("s3:ListBucket").build()); | ||
} | ||
|
||
bucketGetLocationStatementBuilder | ||
.values() | ||
.forEach(statementBuilder -> policyBuilder.addStatement(statementBuilder.build())); | ||
return policyBuilder.addStatement(allowGetObjectStatementBuilder.build()).build(); | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
this kind of attribute-copy block of code is easy to mess up with accidental typos... Could you add a unit test for this conversion?