Quick Start Guide

Arrow Java provides several building blocks. Data types describe the types of values; ValueVectors are sequences of typed values; fields describe the types of columns in tabular data; schemas describe a sequence of columns in tabular data, and VectorSchemaRoot represents tabular data. Arrow also provides readers and writers for loading data from and persisting data to storage.

Create a ValueVector

ValueVectors represent a sequence of values of the same type. They are also known as “arrays” in the columnar format.

Example: create a vector of 32-bit integers representing [1, null, 2]:

importorg.apache.arrow.memory.BufferAllocator;importorg.apache.arrow.memory.RootAllocator;importorg.apache.arrow.vector.IntVector;try(BufferAllocatorallocator=newRootAllocator();IntVectorintVector=newIntVector("fixed-size-primitive-layout",allocator);){intVector.allocateNew(3);intVector.set(0,1);intVector.setNull(1);intVector.set(2,2);intVector.setValueCount(3);System.out.println("Vector created in memory: "+intVector);}
Vector created in memory: [1, null, 2]

Example: create a vector of UTF-8 encoded strings representing ["one", "two", "three"]:

importorg.apache.arrow.memory.BufferAllocator;importorg.apache.arrow.memory.RootAllocator;importorg.apache.arrow.vector.VarCharVector;try(BufferAllocatorallocator=newRootAllocator();VarCharVectorvarCharVector=newVarCharVector("variable-size-primitive-layout",allocator);){varCharVector.allocateNew(3);varCharVector.set(0,"one".getBytes());varCharVector.set(1,"two".getBytes());varCharVector.set(2,"three".getBytes());varCharVector.setValueCount(3);System.out.println("Vector created in memory: "+varCharVector);}
Vector created in memory: [one, two, three]

Create a Field

Fields are used to denote the particular columns of tabular data. They consist of a name, a data type, a flag indicating whether the column can have null values, and optional key-value metadata.

Example: create a field named “document” of string type:

import org.apache.arrow.vector.types.pojo.ArrowType;
import org.apache.arrow.vector.types.pojo.Field;
import org.apache.arrow.vector.types.pojo.FieldType;
import java.util.HashMap;
import java.util.Map;

// Key-value metadata to attach to the field.
Map<String, String> metadata = new HashMap<>();
metadata.put("A", "Id card");
metadata.put("B", "Passport");
metadata.put("C", "Visa");

// A UTF-8 string field named "document" carrying the metadata above.
Field document = new Field(
    "document",
    new FieldType(/*nullable*/ true, new ArrowType.Utf8(), /*dictionary*/ null, metadata),
    /*children*/ null);

System.out.println("Field created: " + document + ", Metadata: " + document.getMetadata());
Field created: document: Utf8, Metadata: {A=Id card, B=Passport, C=Visa}

Create a Schema

Schemas hold a sequence of fields together with some optional metadata.

Example: Create a schema describing datasets with two columns: an int32 column “A” and a UTF8-encoded string column “B”.

import org.apache.arrow.vector.types.pojo.ArrowType;
import org.apache.arrow.vector.types.pojo.Field;
import org.apache.arrow.vector.types.pojo.FieldType;
import org.apache.arrow.vector.types.pojo.Schema;
import java.util.HashMap;
import java.util.Map;
import static java.util.Arrays.asList;

// Schema-level key-value metadata.
Map<String, String> metadata = new HashMap<>();
metadata.put("K1", "V1");
metadata.put("K2", "V2");

// Two nullable columns: "A" (Int(32, true)) and "B" (Utf8).
Field a = new Field("A", FieldType.nullable(new ArrowType.Int(32, true)), /*children*/ null);
Field b = new Field("B", FieldType.nullable(new ArrowType.Utf8()), /*children*/ null);

Schema schema = new Schema(asList(a, b), metadata);

System.out.println("Schema created: " + schema);
Schema created: Schema<A: Int(32, true), B: Utf8>(metadata: {K1=V1, K2=V2})

Create a VectorSchemaRoot

A VectorSchemaRoot combines ValueVectors with a Schema to represent tabular data.

Example: Create a dataset of names (strings) and ages (32-bit signed integers).

importorg.apache.arrow.memory.BufferAllocator;importorg.apache.arrow.memory.RootAllocator;importorg.apache.arrow.vector.IntVector;importorg.apache.arrow.vector.VarCharVector;importorg.apache.arrow.vector.VectorSchemaRoot;importorg.apache.arrow.vector.types.pojo.ArrowType;importorg.apache.arrow.vector.types.pojo.Field;importorg.apache.arrow.vector.types.pojo.FieldType;importorg.apache.arrow.vector.types.pojo.Schema;importjava.nio.charset.StandardCharsets;importjava.util.HashMap;importjava.util.Map;import staticjava.util.Arrays.asList;Fieldage=newField("age",FieldType.nullable(newArrowType.Int(32,true)),/*children*/null);Fieldname=newField("name",FieldType.nullable(newArrowType.Utf8()),/*children*/null);Schemaschema=newSchema(asList(age,name),/*metadata*/null);try(BufferAllocatorallocator=newRootAllocator();VectorSchemaRootroot=VectorSchemaRoot.create(schema,allocator);IntVectorageVector=(IntVector)root.getVector("age");VarCharVectornameVector=(VarCharVector)root.getVector("name");){ageVector.allocateNew(3);ageVector.set(0,10);ageVector.set(1,20);ageVector.set(2,30);nameVector.allocateNew(3);nameVector.set(0,"Dave".getBytes(StandardCharsets.UTF_8));nameVector.set(1,"Peter".getBytes(StandardCharsets.UTF_8));nameVector.set(2,"Mary".getBytes(StandardCharsets.UTF_8));root.setRowCount(3);System.out.println("VectorSchemaRoot created: \n"+root.contentToTSVString());}
VectorSchemaRoot created:
age	name
10	Dave
20	Peter
30	Mary

Interprocess Communication (IPC)

Arrow data can be written to and read from disk, and both of these can be done in a streaming and/or random-access fashion depending on application requirements.

Write data to an arrow file

Example: Write the dataset from the previous example to an Arrow IPC file (random-access).

importorg.apache.arrow.memory.BufferAllocator;importorg.apache.arrow.memory.RootAllocator;importorg.apache.arrow.vector.IntVector;importorg.apache.arrow.vector.VarCharVector;importorg.apache.arrow.vector.VectorSchemaRoot;importorg.apache.arrow.vector.ipc.ArrowFileWriter;importorg.apache.arrow.vector.types.pojo.ArrowType;importorg.apache.arrow.vector.types.pojo.Field;importorg.apache.arrow.vector.types.pojo.FieldType;importorg.apache.arrow.vector.types.pojo.Schema;importjava.io.File;importjava.io.FileOutputStream;importjava.io.IOException;importjava.nio.charset.StandardCharsets;importjava.util.HashMap;importjava.util.Map;import staticjava.util.Arrays.asList;Fieldage=newField("age",FieldType.nullable(newArrowType.Int(32,true)),/*children*/null);Fieldname=newField("name",FieldType.nullable(newArrowType.Utf8()),/*children*/null);Schemaschema=newSchema(asList(age,name));try(BufferAllocatorallocator=newRootAllocator();VectorSchemaRootroot=VectorSchemaRoot.create(schema,allocator);IntVectorageVector=(IntVector)root.getVector("age");VarCharVectornameVector=(VarCharVector)root.getVector("name");){ageVector.allocateNew(3);ageVector.set(0,10);ageVector.set(1,20);ageVector.set(2,30);nameVector.allocateNew(3);nameVector.set(0,"Dave".getBytes(StandardCharsets.UTF_8));nameVector.set(1,"Peter".getBytes(StandardCharsets.UTF_8));nameVector.set(2,"Mary".getBytes(StandardCharsets.UTF_8));root.setRowCount(3);Filefile=newFile("random_access_file.arrow");try(FileOutputStreamfileOutputStream=newFileOutputStream(file);ArrowFileWriterwriter=newArrowFileWriter(root,/*provider*/null,fileOutputStream.getChannel());){writer.start();writer.writeBatch();writer.end();System.out.println("Record batches written: "+writer.getRecordBlocks().size()+". Number of rows written: "+root.getRowCount());}catch(IOExceptione){e.printStackTrace();}}
Record batches written: 1. Number of rows written: 3

Read data from an arrow file

Example: Read the dataset from the previous example from an Arrow IPC file (random-access).

importorg.apache.arrow.memory.RootAllocator;importorg.apache.arrow.vector.ipc.ArrowFileReader;importorg.apache.arrow.vector.ipc.message.ArrowBlock;importorg.apache.arrow.vector.VectorSchemaRoot;importjava.io.File;importjava.io.FileInputStream;importjava.io.FileOutputStream;importjava.io.IOException;try(BufferAllocatorallocator=newRootAllocator(Long.MAX_VALUE);FileInputStreamfileInputStream=newFileInputStream(newFile("random_access_file.arrow"));ArrowFileReaderreader=newArrowFileReader(fileInputStream.getChannel(),allocator);){System.out.println("Record batches in file: "+reader.getRecordBlocks().size());for(ArrowBlockarrowBlock:reader.getRecordBlocks()){reader.loadRecordBatch(arrowBlock);VectorSchemaRootroot=reader.getVectorSchemaRoot();System.out.println("VectorSchemaRoot read: \n"+root.contentToTSVString());}}catch(IOExceptione){e.printStackTrace();}
Record batches in file: 1
VectorSchemaRoot read:
age	name
10	Dave
20	Peter
30	Mary

More examples are available at the Arrow Java Cookbook.