Embed presentation


























































































![For example: matrix multiplication• We would write it like this:void MatrixMul_sequential(int dim, float *A, float *B, float *C) {for(int iRow=0; iRow<dim;++iRow) {for(int iCol=0; iCol<dim;++iCol) {float result = 0.f;for(int i=0; i<dim;++i) {result += A[iRow*dim + i]*B[i*dim + iCol];}C[iRow*dim + iCol] = result;}}}91](/image.pl?url=https%3a%2f%2fimage.slidesharecdn.com%2fgpuandjava-171110065815%2f75%2fJava-on-the-GPU-Where-are-we-now-91-2048.jpg&f=jpg&w=240)

![For example: matrix multiplication• So on GPU:void MatrixMul_kernel_basic(int dim,__global float *A, __global float *B, __global float *C) {//Get the index of the work-itemint iCol = get_global_id(0);int iRow = get_global_id(1);float result = 0.0;for(int i=0;i< dim;++i) {result += A[iRow*dim + i]*B[i*dim + iCol];}C[iRow*dim + iCol] = result;}93](/image.pl?url=https%3a%2f%2fimage.slidesharecdn.com%2fgpuandjava-171110065815%2f75%2fJava-on-the-GPU-Where-are-we-now-93-2048.jpg&f=jpg&w=240)
![For example: matrix multiplication• So on GPU:void MatrixMul_kernel_basic(int dim,__global float *A, __global float *B, __global float *C) {//Get the index of the work-itemint iCol = get_global_id(0);int iRow = get_global_id(1);float result = 0.0;for(int i=0;i< dim;++i) {result += A[iRow*dim + i]*B[i*dim + iCol];}C[iRow*dim + iCol] = result;}94](/image.pl?url=https%3a%2f%2fimage.slidesharecdn.com%2fgpuandjava-171110065815%2f75%2fJava-on-the-GPU-Where-are-we-now-94-2048.jpg&f=jpg&w=240)





![CUDA kernel#define N 10__global__ void add( int *a, int *b, int *c ) {int tid = blockIdx.x; // this thread handles the data at its thread idif (tid < N)c[tid] = a[tid] + b[tid];}100](/image.pl?url=https%3a%2f%2fimage.slidesharecdn.com%2fgpuandjava-171110065815%2f75%2fJava-on-the-GPU-Where-are-we-now-100-2048.jpg&f=jpg&w=240)
![CUDA setupint a[N], b[N], c[N];int *dev_a, *dev_b, *dev_c;// allocate the memory on the GPUcudaMalloc( (void**)&dev_a, N * sizeof(int) );cudaMalloc( (void**)&dev_b, N * sizeof(int) );cudaMalloc( (void**)&dev_c, N * sizeof(int) );// fill the arrays 'a' and 'b' on the CPUfor (int i=0; i<N; i++) {a[i] = -i;b[i] = i * i;}101](/image.pl?url=https%3a%2f%2fimage.slidesharecdn.com%2fgpuandjava-171110065815%2f75%2fJava-on-the-GPU-Where-are-we-now-101-2048.jpg&f=jpg&w=240)

![CUDA get results// display the resultsfor (int i=0; i<N; i++) {printf( "%d + %d = %dn", a[i], b[i], c[i] );}// free the memory allocated on the GPUcudaFree( dev_a );cudaFree( dev_b );cudaFree( dev_c );103](/image.pl?url=https%3a%2f%2fimage.slidesharecdn.com%2fgpuandjava-171110065815%2f75%2fJava-on-the-GPU-Where-are-we-now-103-2048.jpg&f=jpg&w=240)

![For example we need a good randint n = 100;curandGenerator generator = new curandGenerator();float hostData[] = new float[n];Pointer deviceData = new Pointer();cudaMalloc(deviceData, n * Sizeof.FLOAT);curandCreateGenerator(generator, CURAND_RNG_PSEUDO_DEFAULT);curandSetPseudoRandomGeneratorSeed(generator, 1234);curandGenerateUniform(generator, deviceData, n);cudaMemcpy(Pointer.to(hostData), deviceData,n * Sizeof.FLOAT, cudaMemcpyDeviceToHost);System.out.println(Arrays.toString(hostData));curandDestroyGenerator(generator);cudaFree(deviceData);105](/image.pl?url=https%3a%2f%2fimage.slidesharecdn.com%2fgpuandjava-171110065815%2f75%2fJava-on-the-GPU-Where-are-we-now-105-2048.jpg&f=jpg&w=240)



![Optimizations…__kernel void MatrixMul_kernel_basic(int dim,__global float *A,__global float *B,__global float *C){int iCol = get_global_id(0);int iRow = get_global_id(1);float result = 0.0;for(int i=0;i< dim;++i){result +=A[iRow*dim + i]*B[i*dim + iCol];}C[iRow*dim + iCol] = result;}109](/image.pl?url=https%3a%2f%2fimage.slidesharecdn.com%2fgpuandjava-171110065815%2f75%2fJava-on-the-GPU-Where-are-we-now-109-2048.jpg&f=jpg&w=240)
![<—Optimizations#define VECTOR_SIZE 4__kernel void MatrixMul_kernel_basic_vector4(int dim,__global float4 *A,__global float4 *B,__global float *C)int localIdx = get_global_id(0);int localIdy = get_global_id(1);float result = 0.0;float4 Bvector[4];float4 Avector, temp;float4 resultVector[4] = {0,0,0,0};int rowElements = dim/VECTOR_SIZE;for(int i=0; i<rowElements; ++i){Avector = A[localIdy*rowElements + i];Bvector[0] = B[dim*i + localIdx];Bvector[1] = B[dim*i + rowElements + localIdx];Bvector[2] = B[dim*i + 2*rowElements + localIdx];Bvector[3] = B[dim*i + 3*rowElements + localIdx];temp = (float4)(Bvector[0].x, Bvector[1].x, Bvector[2].x, Bvector[3].x);resultVector[0] += Avector * temp;temp = (float4)(Bvector[0].y, Bvector[1].y, Bvector[2].y, Bvector[3].y);resultVector[1] += Avector * temp;temp = (float4)(Bvector[0].z, Bvector[1].z, Bvector[2].z, Bvector[3].z);resultVector[2] += Avector * temp;temp = (float4)(Bvector[0].w, Bvector[1].w, Bvector[2].w, Bvector[3].w);resultVector[3] += Avector * temp;}C[localIdy*dim + localIdx*VECTOR_SIZE] = resultVector[0].x + resultVector[0].y + resultVector[0].z + resultVector[0].w;C[localIdy*dim + localIdx*VECTOR_SIZE + 1] = resultVector[1].x + resultVector[1].y + resultVector[1].z + resultVector[1].w;C[localIdy*dim + localIdx*VECTOR_SIZE + 2] = resultVector[2].x + resultVector[2].y + resultVector[2].z + resultVector[2].w;C[localIdy*dim + localIdx*VECTOR_SIZE + 3] = resultVector[3].x + resultVector[3].y + resultVector[3].z + resultVector[3].w;} 110](/image.pl?url=https%3a%2f%2fimage.slidesharecdn.com%2fgpuandjava-171110065815%2f75%2fJava-on-the-GPU-Where-are-we-now-110-2048.jpg&f=jpg&w=240)
![<—Optimizations#define VECTOR_SIZE 4__kernel void MatrixMul_kernel_basic_vector4(int dim,__global float4 *A,__global float4 *B,__global float *C)int localIdx = get_global_id(0);int localIdy = get_global_id(1);float result = 0.0;float4 Bvector[4];float4 Avector, temp;float4 resultVector[4] = {0,0,0,0};int rowElements = dim/VECTOR_SIZE;for(int i=0; i<rowElements; ++i){Avector = A[localIdy*rowElements + i];Bvector[0] = B[dim*i + localIdx];Bvector[1] = B[dim*i + rowElements + localIdx];Bvector[2] = B[dim*i + 2*rowElements + localIdx];Bvector[3] = B[dim*i + 3*rowElements + localIdx];temp = (float4)(Bvector[0].x, Bvector[1].x, Bvector[2].x, Bvector[3].x);resultVector[0] += Avector * temp;temp = (float4)(Bvector[0].y, Bvector[1].y, Bvector[2].y, Bvector[3].y);resultVector[1] += Avector * temp;temp = (float4)(Bvector[0].z, Bvector[1].z, Bvector[2].z, Bvector[3].z);resultVector[2] += Avector * temp;temp = (float4)(Bvector[0].w, Bvector[1].w, Bvector[2].w, Bvector[3].w);resultVector[3] += Avector * temp;}C[localIdy*dim + localIdx*VECTOR_SIZE] = resultVector[0].x + resultVector[0].y + resultVector[0].z + resultVector[0].w;C[localIdy*dim + localIdx*VECTOR_SIZE + 1] = resultVector[1].x + resultVector[1].y + resultVector[1].z + resultVector[1].w;C[localIdy*dim + localIdx*VECTOR_SIZE + 2] = resultVector[2].x + resultVector[2].y + resultVector[2].z + resultVector[2].w;C[localIdy*dim + localIdx*VECTOR_SIZE + 3] = resultVector[3].x + resultVector[3].y + resultVector[3].z + resultVector[3].w;} 111](/image.pl?url=https%3a%2f%2fimage.slidesharecdn.com%2fgpuandjava-171110065815%2f75%2fJava-on-the-GPU-Where-are-we-now-111-2048.jpg&f=jpg&w=240)













![IBM patched JVM for GPUImagine:void fooJava(float A[], float B[], int n) {// similar to for (idx = 0; i < n; i++)IntStream.range(0, N).parallel().forEach(i -> { b[i] = a[i] * 2.0; });}125](/image.pl?url=https%3a%2f%2fimage.slidesharecdn.com%2fgpuandjava-171110065815%2f75%2fJava-on-the-GPU-Where-are-we-now-125-2048.jpg&f=jpg&w=240)
![IBM patched JVM for GPUImagine:void fooJava(float A[], float B[], int n) {// similar to for (idx = 0; i < n; i++)IntStream.range(0, N).parallel().forEach(i -> { b[i] = a[i] * 2.0; });}… we would like the lambda to be automatically converted to GPU code…126](/image.pl?url=https%3a%2f%2fimage.slidesharecdn.com%2fgpuandjava-171110065815%2f75%2fJava-on-the-GPU-Where-are-we-now-126-2048.jpg&f=jpg&w=240)
![IBM patched JVM for GPUWhen n is big the lambda code is executed on GPU:class Par {void foo(float[] a, float[] b, float[] c, int n) {IntStream.range(0, n).parallel().forEach(i -> {b[i] = a[i] * 2.0;c[i] = a[i] * 3.0;});}}*only lambdas with primitive types in one dimension arrays.127](/image.pl?url=https%3a%2f%2fimage.slidesharecdn.com%2fgpuandjava-171110065815%2f75%2fJava-on-the-GPU-Where-are-we-now-127-2048.jpg&f=jpg&w=240)















![Aparapi – now its so much simple!public static void main(String[] _args) {final int size = 512;final float[] a = new float[size];final float[] b = new float[size];for (int i = 0; i < size; i++) {a[i] = (float) (Math.random() * 100);b[i] = (float) (Math.random() * 100);}final float[] sum = new float[size];Kernel kernel = new Kernel(){@Override public void run() {int gid = getGlobalId();sum[gid] = a[gid] + b[gid];}};kernel.execute(Range.create(size));for (int i = 0; i < size; i++) {System.out.printf("%6.2f + %6.2f = %8.2fn", a[i], b[i], sum[i]);}kernel.dispose();}143](/image.pl?url=https%3a%2f%2fimage.slidesharecdn.com%2fgpuandjava-171110065815%2f75%2fJava-on-the-GPU-Where-are-we-now-143-2048.jpg&f=jpg&w=240)






















1. The document discusses the history and evolution of GPUs and GPGPU programming, describing how GPUs began as dedicated graphics cards and gained general-purpose programmability through programmable shaders.
2. It explains the key concepts of GPGPU programming, including the host/device model, the memory model, and the execution model, using concepts such as work-items, work-groups, and ND-ranges.
3. The document uses OpenCL as an example programming model, covering memory transfers between host and device, data types, and how a matrix-multiplication kernel can be implemented with the OpenCL execution model.


























































































![For example: matrix multiplication• We would write it like this:void MatrixMul_sequential(int dim, float *A, float *B, float *C) {for(int iRow=0; iRow<dim;++iRow) {for(int iCol=0; iCol<dim;++iCol) {float result = 0.f;for(int i=0; i<dim;++i) {result += A[iRow*dim + i]*B[i*dim + iCol];}C[iRow*dim + iCol] = result;}}}91](/image.pl?url=https%3a%2f%2fimage.slidesharecdn.com%2fgpuandjava-171110065815%2f75%2fJava-on-the-GPU-Where-are-we-now-91-2048.jpg&f=jpg&w=240)

![For example: matrix multiplication• So on GPU:void MatrixMul_kernel_basic(int dim,__global float *A, __global float *B, __global float *C) {//Get the index of the work-itemint iCol = get_global_id(0);int iRow = get_global_id(1);float result = 0.0;for(int i=0;i< dim;++i) {result += A[iRow*dim + i]*B[i*dim + iCol];}C[iRow*dim + iCol] = result;}93](/image.pl?url=https%3a%2f%2fimage.slidesharecdn.com%2fgpuandjava-171110065815%2f75%2fJava-on-the-GPU-Where-are-we-now-93-2048.jpg&f=jpg&w=240)
![For example: matrix multiplication• So on GPU:void MatrixMul_kernel_basic(int dim,__global float *A, __global float *B, __global float *C) {//Get the index of the work-itemint iCol = get_global_id(0);int iRow = get_global_id(1);float result = 0.0;for(int i=0;i< dim;++i) {result += A[iRow*dim + i]*B[i*dim + iCol];}C[iRow*dim + iCol] = result;}94](/image.pl?url=https%3a%2f%2fimage.slidesharecdn.com%2fgpuandjava-171110065815%2f75%2fJava-on-the-GPU-Where-are-we-now-94-2048.jpg&f=jpg&w=240)





![CUDA kernel#define N 10__global__ void add( int *a, int *b, int *c ) {int tid = blockIdx.x; // this thread handles the data at its thread idif (tid < N)c[tid] = a[tid] + b[tid];}100](/image.pl?url=https%3a%2f%2fimage.slidesharecdn.com%2fgpuandjava-171110065815%2f75%2fJava-on-the-GPU-Where-are-we-now-100-2048.jpg&f=jpg&w=240)
![CUDA setupint a[N], b[N], c[N];int *dev_a, *dev_b, *dev_c;// allocate the memory on the GPUcudaMalloc( (void**)&dev_a, N * sizeof(int) );cudaMalloc( (void**)&dev_b, N * sizeof(int) );cudaMalloc( (void**)&dev_c, N * sizeof(int) );// fill the arrays 'a' and 'b' on the CPUfor (int i=0; i<N; i++) {a[i] = -i;b[i] = i * i;}101](/image.pl?url=https%3a%2f%2fimage.slidesharecdn.com%2fgpuandjava-171110065815%2f75%2fJava-on-the-GPU-Where-are-we-now-101-2048.jpg&f=jpg&w=240)

![CUDA get results// display the resultsfor (int i=0; i<N; i++) {printf( "%d + %d = %dn", a[i], b[i], c[i] );}// free the memory allocated on the GPUcudaFree( dev_a );cudaFree( dev_b );cudaFree( dev_c );103](/image.pl?url=https%3a%2f%2fimage.slidesharecdn.com%2fgpuandjava-171110065815%2f75%2fJava-on-the-GPU-Where-are-we-now-103-2048.jpg&f=jpg&w=240)

![For example we need a good randint n = 100;curandGenerator generator = new curandGenerator();float hostData[] = new float[n];Pointer deviceData = new Pointer();cudaMalloc(deviceData, n * Sizeof.FLOAT);curandCreateGenerator(generator, CURAND_RNG_PSEUDO_DEFAULT);curandSetPseudoRandomGeneratorSeed(generator, 1234);curandGenerateUniform(generator, deviceData, n);cudaMemcpy(Pointer.to(hostData), deviceData,n * Sizeof.FLOAT, cudaMemcpyDeviceToHost);System.out.println(Arrays.toString(hostData));curandDestroyGenerator(generator);cudaFree(deviceData);105](/image.pl?url=https%3a%2f%2fimage.slidesharecdn.com%2fgpuandjava-171110065815%2f75%2fJava-on-the-GPU-Where-are-we-now-105-2048.jpg&f=jpg&w=240)



![Optimizations…__kernel void MatrixMul_kernel_basic(int dim,__global float *A,__global float *B,__global float *C){int iCol = get_global_id(0);int iRow = get_global_id(1);float result = 0.0;for(int i=0;i< dim;++i){result +=A[iRow*dim + i]*B[i*dim + iCol];}C[iRow*dim + iCol] = result;}109](/image.pl?url=https%3a%2f%2fimage.slidesharecdn.com%2fgpuandjava-171110065815%2f75%2fJava-on-the-GPU-Where-are-we-now-109-2048.jpg&f=jpg&w=240)
![<—Optimizations#define VECTOR_SIZE 4__kernel void MatrixMul_kernel_basic_vector4(int dim,__global float4 *A,__global float4 *B,__global float *C)int localIdx = get_global_id(0);int localIdy = get_global_id(1);float result = 0.0;float4 Bvector[4];float4 Avector, temp;float4 resultVector[4] = {0,0,0,0};int rowElements = dim/VECTOR_SIZE;for(int i=0; i<rowElements; ++i){Avector = A[localIdy*rowElements + i];Bvector[0] = B[dim*i + localIdx];Bvector[1] = B[dim*i + rowElements + localIdx];Bvector[2] = B[dim*i + 2*rowElements + localIdx];Bvector[3] = B[dim*i + 3*rowElements + localIdx];temp = (float4)(Bvector[0].x, Bvector[1].x, Bvector[2].x, Bvector[3].x);resultVector[0] += Avector * temp;temp = (float4)(Bvector[0].y, Bvector[1].y, Bvector[2].y, Bvector[3].y);resultVector[1] += Avector * temp;temp = (float4)(Bvector[0].z, Bvector[1].z, Bvector[2].z, Bvector[3].z);resultVector[2] += Avector * temp;temp = (float4)(Bvector[0].w, Bvector[1].w, Bvector[2].w, Bvector[3].w);resultVector[3] += Avector * temp;}C[localIdy*dim + localIdx*VECTOR_SIZE] = resultVector[0].x + resultVector[0].y + resultVector[0].z + resultVector[0].w;C[localIdy*dim + localIdx*VECTOR_SIZE + 1] = resultVector[1].x + resultVector[1].y + resultVector[1].z + resultVector[1].w;C[localIdy*dim + localIdx*VECTOR_SIZE + 2] = resultVector[2].x + resultVector[2].y + resultVector[2].z + resultVector[2].w;C[localIdy*dim + localIdx*VECTOR_SIZE + 3] = resultVector[3].x + resultVector[3].y + resultVector[3].z + resultVector[3].w;} 110](/image.pl?url=https%3a%2f%2fimage.slidesharecdn.com%2fgpuandjava-171110065815%2f75%2fJava-on-the-GPU-Where-are-we-now-110-2048.jpg&f=jpg&w=240)
![<—Optimizations#define VECTOR_SIZE 4__kernel void MatrixMul_kernel_basic_vector4(int dim,__global float4 *A,__global float4 *B,__global float *C)int localIdx = get_global_id(0);int localIdy = get_global_id(1);float result = 0.0;float4 Bvector[4];float4 Avector, temp;float4 resultVector[4] = {0,0,0,0};int rowElements = dim/VECTOR_SIZE;for(int i=0; i<rowElements; ++i){Avector = A[localIdy*rowElements + i];Bvector[0] = B[dim*i + localIdx];Bvector[1] = B[dim*i + rowElements + localIdx];Bvector[2] = B[dim*i + 2*rowElements + localIdx];Bvector[3] = B[dim*i + 3*rowElements + localIdx];temp = (float4)(Bvector[0].x, Bvector[1].x, Bvector[2].x, Bvector[3].x);resultVector[0] += Avector * temp;temp = (float4)(Bvector[0].y, Bvector[1].y, Bvector[2].y, Bvector[3].y);resultVector[1] += Avector * temp;temp = (float4)(Bvector[0].z, Bvector[1].z, Bvector[2].z, Bvector[3].z);resultVector[2] += Avector * temp;temp = (float4)(Bvector[0].w, Bvector[1].w, Bvector[2].w, Bvector[3].w);resultVector[3] += Avector * temp;}C[localIdy*dim + localIdx*VECTOR_SIZE] = resultVector[0].x + resultVector[0].y + resultVector[0].z + resultVector[0].w;C[localIdy*dim + localIdx*VECTOR_SIZE + 1] = resultVector[1].x + resultVector[1].y + resultVector[1].z + resultVector[1].w;C[localIdy*dim + localIdx*VECTOR_SIZE + 2] = resultVector[2].x + resultVector[2].y + resultVector[2].z + resultVector[2].w;C[localIdy*dim + localIdx*VECTOR_SIZE + 3] = resultVector[3].x + resultVector[3].y + resultVector[3].z + resultVector[3].w;} 111](/image.pl?url=https%3a%2f%2fimage.slidesharecdn.com%2fgpuandjava-171110065815%2f75%2fJava-on-the-GPU-Where-are-we-now-111-2048.jpg&f=jpg&w=240)













![IBM patched JVM for GPUImagine:void fooJava(float A[], float B[], int n) {// similar to for (idx = 0; i < n; i++)IntStream.range(0, N).parallel().forEach(i -> { b[i] = a[i] * 2.0; });}125](/image.pl?url=https%3a%2f%2fimage.slidesharecdn.com%2fgpuandjava-171110065815%2f75%2fJava-on-the-GPU-Where-are-we-now-125-2048.jpg&f=jpg&w=240)
![IBM patched JVM for GPUImagine:void fooJava(float A[], float B[], int n) {// similar to for (idx = 0; i < n; i++)IntStream.range(0, N).parallel().forEach(i -> { b[i] = a[i] * 2.0; });}… we would like the lambda to be automatically converted to GPU code…126](/image.pl?url=https%3a%2f%2fimage.slidesharecdn.com%2fgpuandjava-171110065815%2f75%2fJava-on-the-GPU-Where-are-we-now-126-2048.jpg&f=jpg&w=240)
![IBM patched JVM for GPUWhen n is big the lambda code is executed on GPU:class Par {void foo(float[] a, float[] b, float[] c, int n) {IntStream.range(0, n).parallel().forEach(i -> {b[i] = a[i] * 2.0;c[i] = a[i] * 3.0;});}}*only lambdas with primitive types in one dimension arrays.127](/image.pl?url=https%3a%2f%2fimage.slidesharecdn.com%2fgpuandjava-171110065815%2f75%2fJava-on-the-GPU-Where-are-we-now-127-2048.jpg&f=jpg&w=240)















![Aparapi – now its so much simple!public static void main(String[] _args) {final int size = 512;final float[] a = new float[size];final float[] b = new float[size];for (int i = 0; i < size; i++) {a[i] = (float) (Math.random() * 100);b[i] = (float) (Math.random() * 100);}final float[] sum = new float[size];Kernel kernel = new Kernel(){@Override public void run() {int gid = getGlobalId();sum[gid] = a[gid] + b[gid];}};kernel.execute(Range.create(size));for (int i = 0; i < size; i++) {System.out.printf("%6.2f + %6.2f = %8.2fn", a[i], b[i], sum[i]);}kernel.dispose();}143](/image.pl?url=https%3a%2f%2fimage.slidesharecdn.com%2fgpuandjava-171110065815%2f75%2fJava-on-the-GPU-Where-are-we-now-143-2048.jpg&f=jpg&w=240)




















