Quick Start Guide#
Arrow Java provides several building blocks. Data types describe the types of values; ValueVectors are sequences of typed values; fields describe the types of columns in tabular data; schemas describe a sequence of columns in tabular data, and VectorSchemaRoot represents tabular data. Arrow also provides readers and writers for loading data from and persisting data to storage.
Create a ValueVector#
ValueVectors represent a sequence of values of the same type. They are also known as “arrays” in the columnar format.
Example: create a vector of 32-bit integers representing [1, null, 2]:
importorg.apache.arrow.memory.BufferAllocator;importorg.apache.arrow.memory.RootAllocator;importorg.apache.arrow.vector.IntVector;try(BufferAllocatorallocator=newRootAllocator();IntVectorintVector=newIntVector("fixed-size-primitive-layout",allocator);){intVector.allocateNew(3);intVector.set(0,1);intVector.setNull(1);intVector.set(2,2);intVector.setValueCount(3);System.out.println("Vector created in memory: "+intVector);}
Vector created in memory: [1, null, 2]
Example: create a vector of UTF-8 encoded strings representing ["one", "two", "three"]:
importorg.apache.arrow.memory.BufferAllocator;importorg.apache.arrow.memory.RootAllocator;importorg.apache.arrow.vector.VarCharVector;try(BufferAllocatorallocator=newRootAllocator();VarCharVectorvarCharVector=newVarCharVector("variable-size-primitive-layout",allocator);){varCharVector.allocateNew(3);varCharVector.set(0,"one".getBytes());varCharVector.set(1,"two".getBytes());varCharVector.set(2,"three".getBytes());varCharVector.setValueCount(3);System.out.println("Vector created in memory: "+varCharVector);}
Vector created in memory: [one, two, three]
Create a Field#
Fields are used to denote the particular columns of tabular data. They consist of a name, a data type, a flag indicating whether the column can have null values, and optional key-value metadata.
Example: create a field named “document” of string type:
import org.apache.arrow.vector.types.pojo.ArrowType;
import org.apache.arrow.vector.types.pojo.Field;
import org.apache.arrow.vector.types.pojo.FieldType;
import java.util.HashMap;
import java.util.Map;

// Optional key-value metadata attached to the field.
Map<String, String> metadata = new HashMap<>();
metadata.put("A", "Id card");
metadata.put("B", "Passport");
metadata.put("C", "Visa");

// A UTF-8 string field named "document" with no dictionary encoding and no child fields.
Field document = new Field(
    "document",
    new FieldType(true, new ArrowType.Utf8(), /*dictionary*/ null, metadata),
    /*children*/ null);
System.out.println("Field created: " + document + ", Metadata: " + document.getMetadata());
Field created: document: Utf8, Metadata: {A=Id card, B=Passport, C=Visa}
Create a Schema#
Schemas hold a sequence of fields together with some optional metadata.
Example: Create a schema describing datasets with two columns: an int32 column “A” and a UTF8-encoded string column “B”:
import org.apache.arrow.vector.types.pojo.ArrowType;
import org.apache.arrow.vector.types.pojo.Field;
import org.apache.arrow.vector.types.pojo.FieldType;
import org.apache.arrow.vector.types.pojo.Schema;
import java.util.HashMap;
import java.util.Map;
import static java.util.Arrays.asList;

// Schema-level key-value metadata.
Map<String, String> metadata = new HashMap<>();
metadata.put("K1", "V1");
metadata.put("K2", "V2");

// Two nullable columns: "A" is Int(32, true) and "B" is a UTF-8 string.
Field a = new Field("A", FieldType.nullable(new ArrowType.Int(32, true)), /*children*/ null);
Field b = new Field("B", FieldType.nullable(new ArrowType.Utf8()), /*children*/ null);

Schema schema = new Schema(asList(a, b), metadata);
System.out.println("Schema created: " + schema);
Schema created: Schema<A: Int(32, true), B: Utf8>(metadata: {K1=V1, K2=V2})
Create a VectorSchemaRoot#
A VectorSchemaRoot combines ValueVectors with a Schema to represent tabular data.
Example: Create a dataset of names (strings) and ages (32-bit signed integers).
importorg.apache.arrow.memory.BufferAllocator;importorg.apache.arrow.memory.RootAllocator;importorg.apache.arrow.vector.IntVector;importorg.apache.arrow.vector.VarCharVector;importorg.apache.arrow.vector.VectorSchemaRoot;importorg.apache.arrow.vector.types.pojo.ArrowType;importorg.apache.arrow.vector.types.pojo.Field;importorg.apache.arrow.vector.types.pojo.FieldType;importorg.apache.arrow.vector.types.pojo.Schema;importjava.nio.charset.StandardCharsets;importjava.util.HashMap;importjava.util.Map;import staticjava.util.Arrays.asList;Fieldage=newField("age",FieldType.nullable(newArrowType.Int(32,true)),/*children*/null);Fieldname=newField("name",FieldType.nullable(newArrowType.Utf8()),/*children*/null);Schemaschema=newSchema(asList(age,name),/*metadata*/null);try(BufferAllocatorallocator=newRootAllocator();VectorSchemaRootroot=VectorSchemaRoot.create(schema,allocator);IntVectorageVector=(IntVector)root.getVector("age");VarCharVectornameVector=(VarCharVector)root.getVector("name");){ageVector.allocateNew(3);ageVector.set(0,10);ageVector.set(1,20);ageVector.set(2,30);nameVector.allocateNew(3);nameVector.set(0,"Dave".getBytes(StandardCharsets.UTF_8));nameVector.set(1,"Peter".getBytes(StandardCharsets.UTF_8));nameVector.set(2,"Mary".getBytes(StandardCharsets.UTF_8));root.setRowCount(3);System.out.println("VectorSchemaRoot created: \n"+root.contentToTSVString());}
VectorSchemaRoot created:
age	name
10	Dave
20	Peter
30	Mary
Interprocess Communication (IPC)#
Arrow data can be written to and read from disk, and both of these can be done in a streaming and/or random-access fashion depending on application requirements.
Write data to an arrow file
Example: Write the dataset from the previous example to an Arrow IPC file (random-access).
importorg.apache.arrow.memory.BufferAllocator;importorg.apache.arrow.memory.RootAllocator;importorg.apache.arrow.vector.IntVector;importorg.apache.arrow.vector.VarCharVector;importorg.apache.arrow.vector.VectorSchemaRoot;importorg.apache.arrow.vector.ipc.ArrowFileWriter;importorg.apache.arrow.vector.types.pojo.ArrowType;importorg.apache.arrow.vector.types.pojo.Field;importorg.apache.arrow.vector.types.pojo.FieldType;importorg.apache.arrow.vector.types.pojo.Schema;importjava.io.File;importjava.io.FileOutputStream;importjava.io.IOException;importjava.nio.charset.StandardCharsets;importjava.util.HashMap;importjava.util.Map;import staticjava.util.Arrays.asList;Fieldage=newField("age",FieldType.nullable(newArrowType.Int(32,true)),/*children*/null);Fieldname=newField("name",FieldType.nullable(newArrowType.Utf8()),/*children*/null);Schemaschema=newSchema(asList(age,name));try(BufferAllocatorallocator=newRootAllocator();VectorSchemaRootroot=VectorSchemaRoot.create(schema,allocator);IntVectorageVector=(IntVector)root.getVector("age");VarCharVectornameVector=(VarCharVector)root.getVector("name");){ageVector.allocateNew(3);ageVector.set(0,10);ageVector.set(1,20);ageVector.set(2,30);nameVector.allocateNew(3);nameVector.set(0,"Dave".getBytes(StandardCharsets.UTF_8));nameVector.set(1,"Peter".getBytes(StandardCharsets.UTF_8));nameVector.set(2,"Mary".getBytes(StandardCharsets.UTF_8));root.setRowCount(3);Filefile=newFile("random_access_file.arrow");try(FileOutputStreamfileOutputStream=newFileOutputStream(file);ArrowFileWriterwriter=newArrowFileWriter(root,/*provider*/null,fileOutputStream.getChannel());){writer.start();writer.writeBatch();writer.end();System.out.println("Record batches written: "+writer.getRecordBlocks().size()+". Number of rows written: "+root.getRowCount());}catch(IOExceptione){e.printStackTrace();}}
Record batches written: 1. Number of rows written: 3
Read data from an arrow file
Example: Read the dataset from the previous example from an Arrow IPC file (random-access).
importorg.apache.arrow.memory.RootAllocator;importorg.apache.arrow.vector.ipc.ArrowFileReader;importorg.apache.arrow.vector.ipc.message.ArrowBlock;importorg.apache.arrow.vector.VectorSchemaRoot;importjava.io.File;importjava.io.FileInputStream;importjava.io.FileOutputStream;importjava.io.IOException;try(BufferAllocatorallocator=newRootAllocator(Long.MAX_VALUE);FileInputStreamfileInputStream=newFileInputStream(newFile("random_access_file.arrow"));ArrowFileReaderreader=newArrowFileReader(fileInputStream.getChannel(),allocator);){System.out.println("Record batches in file: "+reader.getRecordBlocks().size());for(ArrowBlockarrowBlock:reader.getRecordBlocks()){reader.loadRecordBatch(arrowBlock);VectorSchemaRootroot=reader.getVectorSchemaRoot();System.out.println("VectorSchemaRoot read: \n"+root.contentToTSVString());}}catch(IOExceptione){e.printStackTrace();}
Record batches in file: 1
VectorSchemaRoot read:
age	name
10	Dave
20	Peter
30	Mary
More examples are available in the Arrow Java Cookbook.

