|
97 | 97 | ), |
98 | 98 | ] |
99 | 99 |
|
# Sample files used by the `reference_file_schema_uri` system tests.
# Within each format the three files share a location but carry different
# schemas (a-twitter has the most columns, c-twitter the fewest).
SOURCE_URIS_AVRO = [
    "gs://cloud-samples-data/bigquery/federated-formats-reference-file-schema/a-twitter.avro",
    "gs://cloud-samples-data/bigquery/federated-formats-reference-file-schema/b-twitter.avro",
    "gs://cloud-samples-data/bigquery/federated-formats-reference-file-schema/c-twitter.avro",
]
SOURCE_URIS_PARQUET = [
    "gs://cloud-samples-data/bigquery/federated-formats-reference-file-schema/a-twitter.parquet",
    "gs://cloud-samples-data/bigquery/federated-formats-reference-file-schema/b-twitter.parquet",
    "gs://cloud-samples-data/bigquery/federated-formats-reference-file-schema/c-twitter.parquet",
]

# The reference schema file is the a-twitter file of each format, i.e. the
# first entry of the corresponding source list.
REFERENCE_FILE_SCHEMA_URI_AVRO = SOURCE_URIS_AVRO[0]
REFERENCE_FILE_SCHEMA_URI_PARQUET = SOURCE_URIS_PARQUET[0]
| 112 | + |
| 113 | + |
100 | 114 | # The VPC-SC team maintains a mirror of the GCS bucket used for code |
101 | 115 | # samples. The public bucket crosses the configured security boundary. |
102 | 116 | # See: https://github.com/googleapis/google-cloud-python/issues/8550 |
@@ -1052,6 +1066,195 @@ def test_load_table_from_file_w_explicit_location(self): |
1052 | 1066 | table_ref,"gs://{}/letters-us.csv".format(bucket_name),location="US" |
1053 | 1067 | ).result() |
1054 | 1068 |
|
def test_create_external_table_with_reference_file_schema_uri_avro(self):
    # Creates an external table over the Avro sample files and checks that
    # its schema is taken from the configured reference file.
    client = Config.CLIENT
    dataset_id = _make_dataset_id("external_reference_file_avro")
    self.temp_dataset(dataset_id)
    dataset_ref = bigquery.DatasetReference(client.project, dataset_id)
    table_id = "test_ref_file_avro"
    table_ref = bigquery.TableReference(dataset_ref=dataset_ref, table_id=table_id)

    expected_schema = [
        bigquery.SchemaField("username", "STRING", mode="NULLABLE"),
        bigquery.SchemaField("tweet", "STRING", mode="NULLABLE"),
        bigquery.SchemaField("timestamp", "STRING", mode="NULLABLE"),
        bigquery.SchemaField("likes", "INTEGER", mode="NULLABLE"),
    ]

    # By default, the table should have the c-twitter schema because it is
    # lexicographically last in the `SOURCE_URIS_AVRO` list:
    #   a-twitter schema: (username, tweet, timestamp, likes)
    #   b-twitter schema: (username, tweet, timestamp)
    #   c-twitter schema: (username, tweet)
    #
    # Because `referenceFileSchemaUri` is set to a-twitter, the table will
    # have the a-twitter schema instead.

    # Create external data configuration
    external_config = bigquery.ExternalConfig(bigquery.ExternalSourceFormat.AVRO)
    external_config.source_uris = SOURCE_URIS_AVRO
    external_config.reference_file_schema_uri = REFERENCE_FILE_SCHEMA_URI_AVRO

    table = bigquery.Table(table_ref)
    table.external_data_configuration = external_config

    client.create_table(table)

    # Get table created by the create_table API call
    generated_table = client.get_table(table_ref)

    # Register the table for cleanup before asserting, so that a failed
    # assertion below does not skip the cleanup registration.
    self.to_delete.insert(0, generated_table)

    self.assertEqual(generated_table.schema, expected_schema)
    # Inspect the raw resource properties for the `referenceFileSchemaUri` field.
    self.assertEqual(
        generated_table.external_data_configuration._properties[
            "referenceFileSchemaUri"
        ],
        REFERENCE_FILE_SCHEMA_URI_AVRO,
    )
| 1115 | + |
def test_load_table_from_uri_with_reference_file_schema_uri_avro(self):
    # Loads the Avro sample files into a new table and checks that the
    # table schema is taken from the configured reference file.
    dataset_id = _make_dataset_id("test_reference_file_avro")
    self.temp_dataset(dataset_id)
    client = Config.CLIENT
    dataset_ref = bigquery.DatasetReference(client.project, dataset_id)
    table_id = "test_ref_file_avro"
    table_ref = bigquery.TableReference(dataset_ref=dataset_ref, table_id=table_id)

    expected_schema = [
        bigquery.SchemaField("username", "STRING", mode="NULLABLE"),
        bigquery.SchemaField("tweet", "STRING", mode="NULLABLE"),
        bigquery.SchemaField("timestamp", "STRING", mode="NULLABLE"),
        bigquery.SchemaField("likes", "INTEGER", mode="NULLABLE"),
    ]

    # By default, the table should have the c-twitter schema because it is
    # lexicographically last in the `SOURCE_URIS_AVRO` list:
    #   a-twitter schema: (username, tweet, timestamp, likes)
    #   b-twitter schema: (username, tweet, timestamp)
    #   c-twitter schema: (username, tweet)
    #
    # Because `referenceFileSchemaUri` is set to a-twitter, the table will
    # have the a-twitter schema instead.

    # Create load job configuration
    load_job_config = bigquery.LoadJobConfig(
        source_format=bigquery.SourceFormat.AVRO
    )
    load_job_config.reference_file_schema_uri = REFERENCE_FILE_SCHEMA_URI_AVRO

    load_job = client.load_table_from_uri(
        source_uris=SOURCE_URIS_AVRO,
        destination=table_ref,
        job_config=load_job_config,
    )
    # Wait for load job to complete
    result = load_job.result()

    # Get table created by the load job
    generated_table = client.get_table(table_ref)

    # Register the table for cleanup before asserting, so that a failed
    # assertion below does not skip the cleanup registration.
    self.to_delete.insert(0, generated_table)

    self.assertEqual(generated_table.schema, expected_schema)
    # Inspect the raw job resource for the `referenceFileSchemaUri` field.
    self.assertEqual(
        result._properties["configuration"]["load"]["referenceFileSchemaUri"],
        REFERENCE_FILE_SCHEMA_URI_AVRO,
    )
| 1163 | + |
def test_create_external_table_with_reference_file_schema_uri_parquet(self):
    # Creates an external table over the Parquet sample files and checks
    # that its schema is taken from the configured reference file.
    client = Config.CLIENT
    dataset_id = _make_dataset_id("external_table_ref_file_parquet")
    self.temp_dataset(dataset_id)
    dataset_ref = bigquery.DatasetReference(client.project, dataset_id)
    table_id = "test_ref_file_parquet"
    table_ref = bigquery.TableReference(dataset_ref=dataset_ref, table_id=table_id)

    expected_schema = [
        bigquery.SchemaField("username", "STRING", mode="NULLABLE"),
        bigquery.SchemaField("tweet", "STRING", mode="NULLABLE"),
        bigquery.SchemaField("timestamp", "STRING", mode="NULLABLE"),
        bigquery.SchemaField("likes", "INTEGER", mode="NULLABLE"),
    ]

    # By default, the table should have the c-twitter schema because it is
    # lexicographically last in the `SOURCE_URIS_PARQUET` list:
    #   a-twitter schema: (username, tweet, timestamp, likes)
    #   b-twitter schema: (username, tweet, timestamp)
    #   c-twitter schema: (username, tweet)
    #
    # Because `referenceFileSchemaUri` is set to a-twitter, the table will
    # have the a-twitter schema instead.

    # Create external data configuration
    external_config = bigquery.ExternalConfig(bigquery.ExternalSourceFormat.PARQUET)
    external_config.source_uris = SOURCE_URIS_PARQUET
    external_config.reference_file_schema_uri = REFERENCE_FILE_SCHEMA_URI_PARQUET

    table = bigquery.Table(table_ref)
    table.external_data_configuration = external_config

    client.create_table(table)

    # Get table created by the create_table API call
    generated_table = client.get_table(table_ref)

    # Register the table for cleanup before asserting, so that a failed
    # assertion below does not skip the cleanup registration.
    self.to_delete.insert(0, generated_table)

    self.assertEqual(generated_table.schema, expected_schema)
    # Inspect the raw resource properties for the `referenceFileSchemaUri` field.
    self.assertEqual(
        generated_table.external_data_configuration._properties[
            "referenceFileSchemaUri"
        ],
        REFERENCE_FILE_SCHEMA_URI_PARQUET,
    )
| 1209 | + |
def test_load_table_from_uri_with_reference_file_schema_uri_parquet(self):
    # Loads the Parquet sample files into a new table and checks that the
    # table schema is taken from the configured reference file.
    dataset_id = _make_dataset_id("test_reference_file_parquet")
    self.temp_dataset(dataset_id)
    client = Config.CLIENT
    dataset_ref = bigquery.DatasetReference(client.project, dataset_id)
    table_id = "test_ref_file_parquet"
    table_ref = bigquery.TableReference(dataset_ref=dataset_ref, table_id=table_id)

    expected_schema = [
        bigquery.SchemaField("username", "STRING", mode="NULLABLE"),
        bigquery.SchemaField("tweet", "STRING", mode="NULLABLE"),
        bigquery.SchemaField("timestamp", "STRING", mode="NULLABLE"),
        bigquery.SchemaField("likes", "INTEGER", mode="NULLABLE"),
    ]

    # By default, the table should have the c-twitter schema because it is
    # lexicographically last in the `SOURCE_URIS_PARQUET` list:
    #   a-twitter schema: (username, tweet, timestamp, likes)
    #   b-twitter schema: (username, tweet, timestamp)
    #   c-twitter schema: (username, tweet)
    #
    # Because `referenceFileSchemaUri` is set to a-twitter, the table will
    # have the a-twitter schema instead.

    # Create load job configuration
    load_job_config = bigquery.LoadJobConfig(
        source_format=bigquery.SourceFormat.PARQUET
    )
    load_job_config.reference_file_schema_uri = REFERENCE_FILE_SCHEMA_URI_PARQUET

    load_job = client.load_table_from_uri(
        source_uris=SOURCE_URIS_PARQUET,
        destination=table_ref,
        job_config=load_job_config,
    )
    # Wait for load job to complete
    result = load_job.result()

    # Get table created by the load job
    generated_table = client.get_table(table_ref)

    # Register the table for cleanup before asserting, so that a failed
    # assertion below does not skip the cleanup registration.
    self.to_delete.insert(0, generated_table)

    self.assertEqual(generated_table.schema, expected_schema)
    # Inspect the raw job resource for the `referenceFileSchemaUri` field.
    self.assertEqual(
        result._properties["configuration"]["load"]["referenceFileSchemaUri"],
        REFERENCE_FILE_SCHEMA_URI_PARQUET,
    )
| 1257 | + |
1055 | 1258 | def_write_csv_to_storage(self,bucket_name,blob_name,header_row,data_rows): |
1056 | 1259 | fromgoogle.cloud._testingimport_NamedTemporaryFile |
1057 | 1260 |
|
|