Embed presentation
Download as PDF, PPTX











































![例)C++で自然数1..500の平方根を求 めるプログラムconst int N = 500;void calculate(float* x,float* y){ for (int i = 0; i < N; ++i) { y[i] = sqrt(x[i]); }} • これをCUDAで書きint main () { vector<float> x(N), y(N); なおしてみる for (int i = 0; i < N; ++i) { x[i] = i+1; } calculate(&x[0], &y[0]); for (int i = 0; i < N; ++i) { cout << y[i] << endl; }} 44](/image.pl?url=https%3a%2f%2fimage.slidesharecdn.com%2f2009-12-10-pfi-gpu-computing-091210041054-phpapp01%2f75%2f2009-12-10-GPU-44-2048.jpg&f=jpg&w=240)
![例)自然数1..500の平方根を求めるプ ログラム,CUDA C版#include <iostream>#include <thrust/thrust_vector.h>__global__ void calculate(float* x, float* y) { int i = threadIdx.x; y[i] = sqrt(x[i]);} thrust_vectorはthrustを軽く ラップする独自のライブラリでint main () { す(nHDに入ってます)。完全な const int N = 500; thrust_vector<float> x(N), y(N); コードは2ページ後の黒背景 for (int i = 0; i < 500; ++i) { の部分を参照してください。 x[i] = i+1; } calculate <<<1,N>>> (x.ptr(), y.ptr()); for (int i = 0; i < 500; ++i) { cout << y[i] << endl; }} 45](/image.pl?url=https%3a%2f%2fimage.slidesharecdn.com%2f2009-12-10-pfi-gpu-computing-091210041054-phpapp01%2f75%2f2009-12-10-GPU-45-2048.jpg&f=jpg&w=240)
![1..500の平方根を求めるプログラム,比較 C++ CUDAconst int N = 500; const int N = 500;void calculate(float* x,float* y){ __global__ void calculate(float* x, for (int i = 0; i < N; ++i) { float* y) { y[i] = sqrt(x[i]); int i = threadIdx.x; } y[i] = sqrt(x[i]);} }int main () { int main () { vector<float> x(N), y(N); thrust_vector<float> x(N), y(N); for (int i = 0; i < N; ++i) { for (int i = 0; i < N; ++i) { x[i] = i+1; x[i] = i+1; } } calculate(&x[0], &y[0]); calculate <<<1,N>>> for (int i = 0; i < N; ++i) { (x.ptr(), y.ptr()); cout << y[i] << endl; for (int i = 0; i < N; ++i) { } cout << y[i] << endl;} } } CUDAではループのかわりに、カーネル関数を並列的に実行 スレッドごとにthreadIdxというのが受け取れる(MPIっぽい) 46](/image.pl?url=https%3a%2f%2fimage.slidesharecdn.com%2f2009-12-10-pfi-gpu-computing-091210041054-phpapp01%2f75%2f2009-12-10-GPU-46-2048.jpg&f=jpg&w=240)
![実行してみるthrust> cat thrust_vector.cu ys = dev_ys;#include <thrust/device_vector.h> for (int i = 0 ; i < N; ++i) {#include <thrust/host_vector.h> cout << "sqrt " << xs[i] << " is " << ys[i] <<#include <iostream> endl;using namespace std; } }const int N = 500; thrust> nvcc thrust_vector.cu__global__ thrust> ./a.outvoid calculate (float *px, float *py) { sqrt 0 is 0 int tid = threadIdx.x; sqrt 1 is 1 float x = px[tid]; sqrt 2 is 1.41421 float y = sqrtf(x); sqrt 3 is 1.73205 py[tid] = y; sqrt 4 is 2} sqrt 5 is 2.23607 sqrt 6 is 2.44949int main () { sqrt 7 is 2.64575 thrust::host_vector<float> xs(N),ys(N); sqrt 8 is 2.82843 thrust::device_vector<float> dev_xs(N),dev_ys(N); sqrt 9 is 3 for (int i = 0 ; i < N; ++i) { sqrt 10 is 3.16228 xs[i] = i; ・・・・・ } thrust> dev_xs = xs; calculate <<< 1 , N >>> (thrust::raw_pointer_cast(&*dev_xs.begin()), thrust::raw_pointer_cast(&*dev_ys.begin())); 47](/image.pl?url=https%3a%2f%2fimage.slidesharecdn.com%2f2009-12-10-pfi-gpu-computing-091210041054-phpapp01%2f75%2f2009-12-10-GPU-47-2048.jpg&f=jpg&w=240)
![ptxを生成しGPU機械語に迫るthrust> nvcc thrust_vector.cu --ptx ・・・ .entry _Z9calculatePfS_ (thrust> less thrust_vector.ptx .param .u32 __cudaparm__Z9calculatePfS__px, .param .u32 __cudaparm__Z9calculatePfS__py) { .reg .u16 %rh<3>;thrust> less thrust_vector.cu .reg .u32 %r<8>;・・・ .reg .f32 %f<4>;__global__ .loc 26 8 0 $LBB1__Z9calculatePfS_:void calculate (float *px, float *py) { .loc 26 10 0 int tid = threadIdx.x;//A cvt.s32.u16 %r1, %tid.x; //A float x = px[tid]; //B cvt.u16.u32 %rh1, %r1; //A float y = sqrtf(x); //C mul.wide.u16 %r2, %rh1, 4; //r2 = 4*tid.x py[tid] = y; //D ld.param.u32 %r3,} [__cudaparm__Z9calculatePfS__px]; //B・・・ add.u32 %r4, %r3, %r2; //B ld.global.f32 %f1, [%r4+0]; //B .loc 26 12 0 sqrt.approx.f32 %f2, %f1; //C ld.param.u32 %r5, [__cudaparm__Z9calculatePfS__py]; //D add.u32 %r6, %r5, %r2; //D st.global.f32 [%r6+0], %f2; //D .loc 26 13 0 exit; $LDWend__Z9calculatePfS_: } // _Z9calculatePfS_ ・・・ 48](/image.pl?url=https%3a%2f%2fimage.slidesharecdn.com%2f2009-12-10-pfi-gpu-computing-091210041054-phpapp01%2f75%2f2009-12-10-GPU-48-2048.jpg&f=jpg&w=240)




































































PFI社内セミナー 2009年12月10日 20:00-21:00(予定) 「GPUコンピューティングの現状とスーパーコンピューティングの未来」 発表者: 村主 崇行(プリファードインフラストラクチャー 研究開発部門・京都大学大学院 物理学第二教室) セミナー録画URL: http://www.ustream.tv/recorded/2837689 このスライドは、発表後にみなさまからいただいた貴重な意見をもとに改訂した版です。発表時点での版はこちら: http://www.slideshare.net/pfi/20091210-gpu-2735685











































![例)C++で自然数1..500の平方根を求 めるプログラムconst int N = 500;void calculate(float* x,float* y){ for (int i = 0; i < N; ++i) { y[i] = sqrt(x[i]); }} • これをCUDAで書きint main () { vector<float> x(N), y(N); なおしてみる for (int i = 0; i < N; ++i) { x[i] = i+1; } calculate(&x[0], &y[0]); for (int i = 0; i < N; ++i) { cout << y[i] << endl; }} 44](/image.pl?url=https%3a%2f%2fimage.slidesharecdn.com%2f2009-12-10-pfi-gpu-computing-091210041054-phpapp01%2f75%2f2009-12-10-GPU-44-2048.jpg&f=jpg&w=240)
![例)自然数1..500の平方根を求めるプ ログラム,CUDA C版#include <iostream>#include <thrust/thrust_vector.h>__global__ void calculate(float* x, float* y) { int i = threadIdx.x; y[i] = sqrt(x[i]);} thrust_vectorはthrustを軽く ラップする独自のライブラリでint main () { す(nHDに入ってます)。完全な const int N = 500; thrust_vector<float> x(N), y(N); コードは2ページ後の黒背景 for (int i = 0; i < 500; ++i) { の部分を参照してください。 x[i] = i+1; } calculate <<<1,N>>> (x.ptr(), y.ptr()); for (int i = 0; i < 500; ++i) { cout << y[i] << endl; }} 45](/image.pl?url=https%3a%2f%2fimage.slidesharecdn.com%2f2009-12-10-pfi-gpu-computing-091210041054-phpapp01%2f75%2f2009-12-10-GPU-45-2048.jpg&f=jpg&w=240)
![1..500の平方根を求めるプログラム,比較 C++ CUDAconst int N = 500; const int N = 500;void calculate(float* x,float* y){ __global__ void calculate(float* x, for (int i = 0; i < N; ++i) { float* y) { y[i] = sqrt(x[i]); int i = threadIdx.x; } y[i] = sqrt(x[i]);} }int main () { int main () { vector<float> x(N), y(N); thrust_vector<float> x(N), y(N); for (int i = 0; i < N; ++i) { for (int i = 0; i < N; ++i) { x[i] = i+1; x[i] = i+1; } } calculate(&x[0], &y[0]); calculate <<<1,N>>> for (int i = 0; i < N; ++i) { (x.ptr(), y.ptr()); cout << y[i] << endl; for (int i = 0; i < N; ++i) { } cout << y[i] << endl;} } } CUDAではループのかわりに、カーネル関数を並列的に実行 スレッドごとにthreadIdxというのが受け取れる(MPIっぽい) 46](/image.pl?url=https%3a%2f%2fimage.slidesharecdn.com%2f2009-12-10-pfi-gpu-computing-091210041054-phpapp01%2f75%2f2009-12-10-GPU-46-2048.jpg&f=jpg&w=240)
![実行してみるthrust> cat thrust_vector.cu ys = dev_ys;#include <thrust/device_vector.h> for (int i = 0 ; i < N; ++i) {#include <thrust/host_vector.h> cout << "sqrt " << xs[i] << " is " << ys[i] <<#include <iostream> endl;using namespace std; } }const int N = 500; thrust> nvcc thrust_vector.cu__global__ thrust> ./a.outvoid calculate (float *px, float *py) { sqrt 0 is 0 int tid = threadIdx.x; sqrt 1 is 1 float x = px[tid]; sqrt 2 is 1.41421 float y = sqrtf(x); sqrt 3 is 1.73205 py[tid] = y; sqrt 4 is 2} sqrt 5 is 2.23607 sqrt 6 is 2.44949int main () { sqrt 7 is 2.64575 thrust::host_vector<float> xs(N),ys(N); sqrt 8 is 2.82843 thrust::device_vector<float> dev_xs(N),dev_ys(N); sqrt 9 is 3 for (int i = 0 ; i < N; ++i) { sqrt 10 is 3.16228 xs[i] = i; ・・・・・ } thrust> dev_xs = xs; calculate <<< 1 , N >>> (thrust::raw_pointer_cast(&*dev_xs.begin()), thrust::raw_pointer_cast(&*dev_ys.begin())); 47](/image.pl?url=https%3a%2f%2fimage.slidesharecdn.com%2f2009-12-10-pfi-gpu-computing-091210041054-phpapp01%2f75%2f2009-12-10-GPU-47-2048.jpg&f=jpg&w=240)
![ptxを生成しGPU機械語に迫るthrust> nvcc thrust_vector.cu --ptx ・・・ .entry _Z9calculatePfS_ (thrust> less thrust_vector.ptx .param .u32 __cudaparm__Z9calculatePfS__px, .param .u32 __cudaparm__Z9calculatePfS__py) { .reg .u16 %rh<3>;thrust> less thrust_vector.cu .reg .u32 %r<8>;・・・ .reg .f32 %f<4>;__global__ .loc 26 8 0 $LBB1__Z9calculatePfS_:void calculate (float *px, float *py) { .loc 26 10 0 int tid = threadIdx.x;//A cvt.s32.u16 %r1, %tid.x; //A float x = px[tid]; //B cvt.u16.u32 %rh1, %r1; //A float y = sqrtf(x); //C mul.wide.u16 %r2, %rh1, 4; //r2 = 4*tid.x py[tid] = y; //D ld.param.u32 %r3,} [__cudaparm__Z9calculatePfS__px]; //B・・・ add.u32 %r4, %r3, %r2; //B ld.global.f32 %f1, [%r4+0]; //B .loc 26 12 0 sqrt.approx.f32 %f2, %f1; //C ld.param.u32 %r5, [__cudaparm__Z9calculatePfS__py]; //D add.u32 %r6, %r5, %r2; //D st.global.f32 [%r6+0], %f2; //D .loc 26 13 0 exit; $LDWend__Z9calculatePfS_: } // _Z9calculatePfS_ ・・・ 48](/image.pl?url=https%3a%2f%2fimage.slidesharecdn.com%2f2009-12-10-pfi-gpu-computing-091210041054-phpapp01%2f75%2f2009-12-10-GPU-48-2048.jpg&f=jpg&w=240)


































































