Parquet column encryption#

The following example defines aarrow::Table instance, and then writesit to an encrypted Parquet file. Metadata footer and columns are encrypted withdifferent encryption keys.

// Licensed to the Apache Software Foundation (ASF) under one// or more contributor license agreements. See the NOTICE file// distributed with this work for additional information// regarding copyright ownership. The ASF licenses this file// to you under the Apache License, Version 2.0 (the// "License"); you may not use this file except in compliance// with the License. You may obtain a copy of the License at//// http://www.apache.org/licenses/LICENSE-2.0//// Unless required by applicable law or agreed to in writing,// software distributed under the License is distributed on an// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY// KIND, either express or implied. See the License for the// specific language governing permissions and limitations// under the License.#include"arrow/api.h"#include"arrow/dataset/file_parquet.h"#include"arrow/dataset/parquet_encryption_config.h"#include"arrow/filesystem/localfs.h"#include"arrow/util/secure_string.h"#include"parquet/encryption/crypto_factory.h"#include"parquet/encryption/test_in_memory_kms.h"#include<iostream>namespacefs=arrow::fs;namespaceds=arrow::dataset;arrow::Result<std::shared_ptr<arrow::Table>>GetTable(){autoint_builder=arrow::Int32Builder();std::shared_ptr<arrow::Array>arr_i;ARROW_RETURN_NOT_OK(int_builder.AppendValues({1,3,5,7,1}));ARROW_RETURN_NOT_OK(int_builder.Finish(&arr_i));autostruct_type=arrow::struct_({{"a",arrow::int32()},{"b",arrow::int64()}});autopool=arrow::default_memory_pool();autoa_builder=std::make_shared<arrow::Int32Builder>();autob_builder=std::make_shared<arrow::Int64Builder>();autostruct_builder=arrow::StructBuilder(struct_type,pool,{a_builder,b_builder});std::shared_ptr<arrow::Array>arr_struct;ARROW_RETURN_NOT_OK(struct_builder.Append());ARROW_RETURN_NOT_OK(a_builder->Append(2));ARROW_RETURN_NOT_OK(b_builder->Append(20));ARROW_RETURN_NOT_OK(struct_builder.Append());ARROW_RETURN_NOT_OK(a_builder->Append(4));ARROW_RETURN_NOT_OK(b_builder->Append(40));ARROW_RETURN_NOT_OK(struct_builder.Append());ARROW_RETURN_NOT_OK(a_builder->Append(6));ARROW_RETURN_NOT_OK(b_builder->Append(60));ARROW_RETURN_NOT_OK(struct_builder.Append());ARROW_RETURN_NOT_OK(a_builder->Append(8));ARROW_RETURN_NOT_OK(b_builder->Append(80));ARROW_RETURN_NOT_OK(struct_builder.Append());ARROW_RETURN_NOT_OK(a_builder->Append(10));ARROW_RETURN_NOT_OK(b_builder->Append(100));ARROW_RETURN_NOT_OK(struct_builder.Finish(&arr_struct));automap_type=arrow::map(arrow::int32(),arrow::utf8());autokey_builder=std::make_shared<arrow::Int32Builder>();autoitem_builder=std::make_shared<arrow::StringBuilder>();automap_builder=arrow::MapBuilder(pool,key_builder,item_builder,map_type);std::shared_ptr<arrow::Array>arr_map;ARROW_RETURN_NOT_OK(map_builder.Append());ARROW_RETURN_NOT_OK(key_builder->AppendValues({2,4}));ARROW_RETURN_NOT_OK(item_builder->AppendValues({"2","4"}));ARROW_RETURN_NOT_OK(map_builder.Append());ARROW_RETURN_NOT_OK(key_builder->AppendValues({6}));ARROW_RETURN_NOT_OK(item_builder->AppendValues({"6"}));ARROW_RETURN_NOT_OK(map_builder.Append());ARROW_RETURN_NOT_OK(map_builder.Append());ARROW_RETURN_NOT_OK(key_builder->AppendValues({8,10}));ARROW_RETURN_NOT_OK(item_builder->AppendValues({"8","10"}));ARROW_RETURN_NOT_OK(map_builder.Append());ARROW_RETURN_NOT_OK(map_builder.Finish(&arr_map));autolist_type=arrow::list(arrow::int32());autovalue_builder=std::make_shared<arrow::Int32Builder>();autolist_builder=arrow::ListBuilder(pool,value_builder,list_type);std::shared_ptr<arrow::Array>arr_list;ARROW_RETURN_NOT_OK(list_builder.Append());ARROW_RETURN_NOT_OK(value_builder->AppendValues({1,2,3}));ARROW_RETURN_NOT_OK(list_builder.Append());ARROW_RETURN_NOT_OK(value_builder->AppendValues({4,5,6}));ARROW_RETURN_NOT_OK(list_builder.Append());ARROW_RETURN_NOT_OK(value_builder->AppendValues({7}));ARROW_RETURN_NOT_OK(list_builder.Append());ARROW_RETURN_NOT_OK(value_builder->AppendValues({8}));ARROW_RETURN_NOT_OK(list_builder.Append());ARROW_RETURN_NOT_OK(list_builder.Finish(&arr_list));autoschema=arrow::schema({arrow::field("i",arrow::int32()),arrow::field("s",struct_type),arrow::field("m",map_type),arrow::field("l",list_type),});returnarrow::Table::Make(schema,{arr_i,arr_struct,arr_map,arr_list});}std::shared_ptr<parquet::encryption::CryptoFactory>GetCryptoFactory(){// Configure KMS.std::unordered_map<std::string,arrow::util::SecureString>key_map;key_map.emplace("footerKeyId",arrow::util::SecureString("0123456789012345"));key_map.emplace("columnKeyId",arrow::util::SecureString("1234567890123456"));autocrypto_factory=std::make_shared<parquet::encryption::CryptoFactory>();autokms_client_factory=// for testing only, do not use it as an example of KmsClientFactory implementationstd::make_shared<parquet::encryption::TestOnlyInMemoryKmsClientFactory>(/*wrap_locally=*/true,key_map);crypto_factory->RegisterKmsClientFactory(std::move(kms_client_factory));returncrypto_factory;}arrow::StatusWriteEncryptedFile(conststd::string&path_to_file){usingarrow::internal::checked_pointer_cast;// Get a configured crypto factory and kms connection conf.autocrypto_factory=GetCryptoFactory();autokms_connection_config=std::make_shared<parquet::encryption::KmsConnectionConfig>();// Set write options with encryption configuration.autoencryption_config=std::make_shared<parquet::encryption::EncryptionConfiguration>(std::string("footerKeyId"));encryption_config->column_keys="columnKeyId: i, s.a, s.b, m.key_value.key, m.key_value.value, l.list.element";autoparquet_encryption_config=std::make_shared<ds::ParquetEncryptionConfig>();// Directly assign shared_ptr objects to ParquetEncryptionConfig members.parquet_encryption_config->crypto_factory=crypto_factory;parquet_encryption_config->kms_connection_config=kms_connection_config;parquet_encryption_config->encryption_config=std::move(encryption_config);autofile_format=std::make_shared<ds::ParquetFileFormat>();autoparquet_file_write_options=checked_pointer_cast<ds::ParquetFileWriteOptions>(file_format->DefaultWriteOptions());parquet_file_write_options->parquet_encryption_config=std::move(parquet_encryption_config);// Write dataset.ARROW_ASSIGN_OR_RAISE(std::shared_ptr<arrow::Table>table,GetTable());printf("%s",table->ToString().c_str());autodataset=std::make_shared<ds::InMemoryDataset>(table);ARROW_ASSIGN_OR_RAISE(autoscanner_builder,dataset->NewScan());ARROW_ASSIGN_OR_RAISE(autoscanner,scanner_builder->Finish());autofile_system=std::make_shared<fs::LocalFileSystem>();autopartitioning=std::make_shared<ds::HivePartitioning>(arrow::schema({arrow::field("part",arrow::utf8())}));ds::FileSystemDatasetWriteOptionswrite_options;write_options.file_write_options=parquet_file_write_options;write_options.filesystem=file_system;write_options.base_dir=path_to_file;write_options.partitioning=partitioning;write_options.basename_template="part{i}.parquet";returnds::FileSystemDataset::Write(write_options,std::move(scanner));}arrow::StatusReadEncryptedFile(conststd::string&path_to_file){// Get a configured crypto factory and kms connection confautocrypto_factory=GetCryptoFactory();autokms_connection_config=std::make_shared<parquet::encryption::KmsConnectionConfig>();// Create decryption properties.autodecryption_config=std::make_shared<parquet::encryption::DecryptionConfiguration>();autoparquet_decryption_config=std::make_shared<ds::ParquetDecryptionConfig>();parquet_decryption_config->crypto_factory=crypto_factory;parquet_decryption_config->kms_connection_config=kms_connection_config;parquet_decryption_config->decryption_config=std::move(decryption_config);// Set scan options.autoparquet_scan_options=std::make_shared<ds::ParquetFragmentScanOptions>();parquet_scan_options->parquet_decryption_config=std::move(parquet_decryption_config);// Get configured Parquet file formatautofile_format=std::make_shared<ds::ParquetFileFormat>();file_format->default_fragment_scan_options=std::move(parquet_scan_options);// Get the FileSystem.autofile_system=std::make_shared<fs::LocalFileSystem>();// Get FileInfo objects for all files under the base directoryfs::FileSelectorselector;selector.base_dir=path_to_file;selector.recursive=true;// Create the datasetds::FileSystemFactoryOptionsfactory_options;ARROW_ASSIGN_OR_RAISE(autodataset_factory,ds::FileSystemDatasetFactory::Make(file_system,selector,file_format,factory_options));ARROW_ASSIGN_OR_RAISE(autodataset,dataset_factory->Finish());ARROW_ASSIGN_OR_RAISE(autoscanner_builder,dataset->NewScan());ARROW_ASSIGN_OR_RAISE(autoscanner,scanner_builder->Finish());ARROW_ASSIGN_OR_RAISE(autotable,scanner->ToTable());std::cout<<"Table size: "<<table->num_rows()<<"\n";returnarrow::Status::OK();}arrow::StatusRunExamples(conststd::string&path_to_file){ARROW_RETURN_NOT_OK(WriteEncryptedFile(path_to_file));ARROW_RETURN_NOT_OK(ReadEncryptedFile(path_to_file));returnarrow::Status::OK();}intmain(intargc,char**argv){if(argc!=2){// Fake success for CI purposes.returnEXIT_SUCCESS;}std::stringpath_to_file=argv[1];arrow::Statusstatus=RunExamples(path_to_file);if(!status.ok()){std::cerr<<"Error occurred: "<<status.message()<<std::endl;returnEXIT_FAILURE;}returnEXIT_SUCCESS;}