Acceleration of FFT computation using NVIDIA GPUs
Akira Nukada
Tokyo Institute of Technology
Research History

[Timeline, 1999-2007: The University of Tokyo, then Tokyo Institute of Technology, across systems such as Sun E10000, Merced, SGI Origin2000, ES, SGI Altix, BG/L, POWER5, CRAY XT3, PS3, GPGPU, and Intel MIC.]

•  Software tuning: hand tuning, sometimes in assembly languages.
•  Auto-tuning: FFT, SpMV, DGEMM.
•  Checkpoint for CUDA.
•  TSUBAME-KFC.
FFT (Fast Fourier Transform)

FFT is a fast algorithm to compute the DFT (Discrete Fourier Transform).
When the input size N can be factorized as N = L x M, an N-point FFT is replaced by L M-point FFTs, multiplications by twiddle factors, and M L-point FFTs.

[Diagram: N-point FFT decomposed into M-point FFTs, twiddle factors, and L-point FFTs.]
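Written out (with input index n = L n1 + n2 and output index k = k1 + M k2), the decomposition is:

X_{k_1 + M k_2} \;=\; \sum_{n_2=0}^{L-1} \omega_L^{n_2 k_2} \left[\, \omega_N^{n_2 k_1} \sum_{n_1=0}^{M-1} x_{L n_1 + n_2}\, \omega_M^{n_1 k_1} \right], \qquad \omega_P = e^{-2\pi i/P}

The inner sums are the L M-point FFTs, the factors \omega_N^{n_2 k_1} are the twiddle factors, and the outer sums are the M L-point FFTs.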
Fine-grain parallel computation of FFT

[Dataflow diagram: data is read from memory; several L-point FFTs run in parallel; a data-exchange step follows; each stream applies its twiddle-factor multiplication; several M-point FFTs run in parallel; results are written back to memory.]
1-D FFT on CUDA GPUs

1. Data is read from global memory.
2. Many threads simultaneously compute small FFTs using registers.
3. Threads exchange data using shared memory.
4. Finally, data is written back to global memory.

Three CUDA FFTs in 2008 were based on this fine-grain parallel implementation:
(1) ours, (2) N. Govindaraju et al., (3) V. Volkov et al.
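As an illustration of steps 1-4, here is a minimal sketch (not the actual library code; the 16-point size, the 4-threads-per-transform layout, and all helper names are ours):

// Hedged sketch of the fine-grain pattern: 4 threads cooperate on one
// 16-point FFT, doing radix-4 butterflies in registers and exchanging
// intermediate results through shared memory.
__device__ float2 cadd(float2 a, float2 b) { return make_float2(a.x + b.x, a.y + b.y); }
__device__ float2 csub(float2 a, float2 b) { return make_float2(a.x - b.x, a.y - b.y); }
__device__ float2 cmul(float2 a, float2 b) { return make_float2(a.x*b.x - a.y*b.y, a.x*b.y + a.y*b.x); }

__device__ float2 twiddle(int k, int n)     // exp(-2*pi*i*k/n)
{
    float s, c;
    sincosf(-2.0f * 3.14159265f * k / n, &s, &c);
    return make_float2(c, s);
}

__device__ void fft4(float2 r[4])           // 4-point DFT in registers
{
    float2 t0 = cadd(r[0], r[2]), t1 = csub(r[0], r[2]);
    float2 t2 = cadd(r[1], r[3]), t3 = csub(r[1], r[3]);
    float2 t3i = make_float2(t3.y, -t3.x);  // -i * t3
    r[0] = cadd(t0, t2);  r[2] = csub(t0, t2);
    r[1] = cadd(t1, t3i); r[3] = csub(t1, t3i);
}

// One 16-point transform per block of 4 threads; a real kernel packs many
// transforms into one block and batches over blockIdx.
__global__ void fft16(const float2 *in, float2 *out)
{
    __shared__ float2 buf[16];
    float2 r[4];
    const int t = threadIdx.x;              // 0..3
    const int base = blockIdx.x * 16;

    for (int i = 0; i < 4; i++) r[i] = in[base + 4 * i + t];   // (1) global load

    fft4(r);                                // (2) small FFT in registers

    // (3) twiddle multiplication, then exchange via shared memory
    for (int i = 0; i < 4; i++) buf[4 * t + i] = cmul(r[i], twiddle(t * i, 16));
    __syncthreads();
    for (int i = 0; i < 4; i++) r[i] = buf[4 * i + t];

    fft4(r);                                // second radix-4 stage

    for (int i = 0; i < 4; i++) out[base + t + 4 * i] = r[i];  // (4) global store
}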
Twiddle factor multiplication in CUDA FFT

Twiddle factors are trigonometric functions, and thread-dependent values.
In CUDA, they should come from one of:
(1) registers,
(2) a table in constant (cached) memory,
(3) a table in texture (cached) memory,
(4) a table in shared memory, or
(5) recomputation on the SFU each time.
We selected the 'texture plan' to reduce the number of instructions and registers.
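For instance, options (3) and (5) might look like the following sketch (the table name and its binding are ours; tex1Dfetch and __sincosf are the texture fetch and fast-math intrinsic of that CUDA generation):

// Option (3): look the twiddle up in a texture-backed table
// (one cached fetch, few registers).
texture<float2, 1, cudaReadModeElementType> twiddle_tex;  // bound to a precomputed cos/sin table

__device__ float2 twiddle_from_texture(int idx)
{
    return tex1Dfetch(twiddle_tex, idx);
}

// Option (5): recompute on the SFU each time
// (costs extra instructions and registers).
__device__ float2 twiddle_from_sfu(int k, int n)
{
    float s, c;
    __sincosf(-2.0f * 3.14159265f * k / n, &s, &c);       // fast-math intrinsic
    return make_float2(c, s);
}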
Fast Fourier transform on AMD GPUs

The implementation with RADEON / OpenCL is similar to NVIDIA / CUDA:
•  Each thread computes a small FFT.
•  Data exchange between threads:
   •  via shared memory (CUDA),
   •  via local memory (OpenCL).
•  Twiddle factor (cos & sin) table:
   •  in texture memory (CUDA),
   •  in constant memory (OpenCL).

[Diagram: a 16-point FFT using 4 threads; read from device memory, two stages of 4-point FFTs with a data exchange between threads in between, write to device memory.]
Performance comparison with NVIDIA GPUs

•  NVIDIA setup: GeForce and Tesla, Intel Core i7 CPU, X58 Express chipset, CUDA 4.0, CUFFT library 4.0 or NukadaFFT.
•  AMD setup: RADEON, AMD Phenom 9500 CPU, AMD APP SDK 2.4, custom FFT code in OpenCL.

[Chart: GFLOPS (0-200) for 256-point and 512-point FFTs on RADEON HD 6970, GTX 580 (CUFFT), GTX 580 (NukadaFFT), C2050 (CUFFT), and C2050 (NukadaFFT).]
Bottleneck & efficiency (ratio to theoretical peak)

•  DP performance is the bottleneck on GeForce.
•  Memory access efficiency: double-complex (double2) data is good for RADEON, bad for GeForce.

                       AMD RADEON          NVIDIA GeForce      NVIDIA Tesla
                       HD 6970             GTX 580             C2050
Peak DP performance    675 GFLOPS          197 GFLOPS          515 GFLOPS
Achieved performance   171 GFLOPS (25.3%)  144 GFLOPS (73.1%)  114 GFLOPS (22.1%)
Peak memory B/W        176 GB/s            192 GB/s            144 GB/s (128 GB/s x 9/8)
Achieved B/W           137 GB/s (77.8%)    115 GB/s (59.9%)    91 GB/s (71.1%)
Bottleneck             Memory              DP perf.            Memory
Number of floating-point operations

5N log2 N is the pseudo number of FP operations. Real numbers of FP ops and FP instructions for powers-of-two FFTs:
•  Large number of FPADD/FPSUB ops.
•  Low ratio of FPMAD combination.
•  The RADEON GPU architecture can execute one of the following in a cycle:
   •  2 FPADD/FPSUB
   •  1 FPMUL
   •  1 FPMAD

                          256-point    512-point
5N log2 N                 10,240       23,040
ADD/SUB ops.              4,672        11,776
MUL ops.                  2,304        4,352
FPADD/FPSUB               3,520        9,984
FPMUL                     1,152        2,560
FPMAD                     1,152        1,792
Min. FP cycles (AMD)      4,064        9,344
Min. FP cycles (NVIDIA)   5,824        14,336
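The minimum cycle counts follow from the issue rules above: on RADEON, min cycles = FPADD/FPSUB / 2 + FPMUL + FPMAD, while the NVIDIA figures are consistent with one FP instruction per issue slot, i.e. min cycles = FPADD/FPSUB + FPMUL + FPMAD. For the 256-point case:

AMD:    3,520 / 2 + 1,152 + 1,152 = 4,064
NVIDIA: 3,520     + 1,152 + 1,152 = 5,824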
Performance of 1-D FFT (GTX 280)

[Chart: GFLOPS (0-300) versus transform size (128, 144, 192, 243, 256, 320, 324, 375, 432, 512). Powers of two are fastest; for odd sizes, memory access is only partially coalesced.]
Auto-Tuning of FFT on CUDA GPUs

There are many varieties in the computing environment:
•  Generation of GPU (G8X/G9X, GT2XX)
•  # of registers per SM
•  # of cores, clock frequency (shader, memory)
•  Compiler version (optimization of PTX JIT)
   •  Actual register usage is determined by the compiler.
•  Driver version (CUDA & display driver)
Tuning Parameters

(1) Selection of radices of FFT kernels (generic for FFT)
    - combination & ordering
    ex) 240-point => 4, 4, 3, 5
    On CPUs, this parameter determines the # of floating-point ops and the # of loads/stores to cache memory. On GPUs, it also determines the # of threads per thread block. (A sketch of enumerating these factorizations follows below.)

(2) Selection of the number of threads (generic for GPUs)
    Sufficient thread blocks are needed to exploit the memory bandwidth.
    In the above example, 3 is the smallest radix, so the # of threads is set to 80 (240/3).

(3) Avoidance of bank conflicts on shared memory (CUDA & FFT)
    Insert padding in a specific pattern/rule.
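A minimal host-side sketch of parameter (1), enumerating the ordered radix factorizations of a transform size (the radix set {2..8} is illustrative, not the library's actual list):

#include <cstdio>
#include <vector>

// Recursively enumerate ordered radix factorizations of n,
// e.g. 240 -> (4,4,3,5), (5,4,4,3), ...; the auto-tuner would
// then benchmark each candidate.
static void enumerate(int n, std::vector<int> &radices)
{
    if (n == 1) {
        for (size_t i = 0; i < radices.size(); i++)
            printf("%d%s", radices[i], i + 1 < radices.size() ? "," : "\n");
        return;
    }
    for (int r = 2; r <= 8; r++) {        // illustrative radix set
        if (n % r == 0) {
            radices.push_back(r);
            enumerate(n / r, radices);
            radices.pop_back();
        }
    }
}

int main()
{
    std::vector<int> radices;
    enumerate(240, radices);              // prints every ordered factorization
    return 0;
}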
Shared memory access

•  As fast as registers.
•  Consists of 16 banks of 32 bits each.
•  Up to 16 threads can access simultaneously, if there are no bank conflicts.

[Diagram: 32-bit words 0-63 distributed round-robin over Bank #0 - Bank #15; word i resides in bank i mod 16, so words 0, 16, 32, 48 all fall into Bank #0.]
Shared memory access pattern in FFT

The implementation is based on the Stockham auto-sort algorithm.
A 64-point FFT using 16 threads is described as follows:

[Diagram: per-thread write and read indices for threads 0-15 across the stages. There are no bank conflicts in WRITEs; READs may cause some bank conflicts.]
Default access pattern

[Diagram: the 64 words at indices 0-63 laid out contiguously in shared memory, with no padding.]
Padding (1): after every block stride

[Diagram: the same 64 words with one padding word inserted after every block stride, shifting the following words to different banks.]
Padding (2): after every 16n elements

[Diagram: the same 64 words with one padding word inserted after every 16 elements.]
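The padding amounts to an index transformation when addressing shared memory. A minimal sketch of pattern (2); which rule the auto-tuner actually selects depends on the radices:

// Illustration of padding pattern (2): skip one extra word after every
// 16 elements, so strided accesses no longer pile onto a single bank.
__device__ int padded_index(int i)
{
    return i + i / 16;
}

// The shared array is over-allocated accordingly, e.g. for 64 points:
//   __shared__ float2 buf[64 + 64 / 16];
//   buf[padded_index(i)] = value;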
Adjustment of the number of thread blocks

[Chart: GFLOPS (0-350) versus # of active thread blocks per SM (1-11), for GeForce GTX 280 and Tesla S1070.]
Auto-Tuning FFT for CUDA

[Flowchart] Start -> choose radices and ordering -> choose padding -> choose # of threads -> Generate & Compile (generate PTX code in memory, or generate code to a FILE and call the NVCC command to compile; load the PTX into a MODULE) -> Run & Measure (send data to the GPU, execute the kernel, receive data from the GPU, check the result on the CPU) -> update each parameter in turn and repeat until the goal is reached.
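The Run & Measure box might be realized with the CUDA driver API roughly as follows (a sketch: the kernel name, launch configuration, and omitted error checking are placeholders, not the tuner's actual code):

#include <cuda.h>

// Load one auto-generated kernel variant from PTX and time it.
float benchmark_variant(const char *ptx_path, CUdeviceptr d_in, CUdeviceptr d_out)
{
    CUmodule mod;
    CUfunction fn;
    CUevent t0, t1;
    float ms = 0.0f;
    void *args[2];

    cuModuleLoad(&mod, ptx_path);                 /* "Load PTX to MODULE" */
    cuModuleGetFunction(&fn, mod, "fft_kernel");  /* hypothetical kernel name */

    cuEventCreate(&t0, CU_EVENT_DEFAULT);
    cuEventCreate(&t1, CU_EVENT_DEFAULT);

    args[0] = &d_in;
    args[1] = &d_out;
    cuEventRecord(t0, 0);
    cuLaunchKernel(fn, 65536, 1, 1,   /* grid:  # of thread blocks      */
                       64, 1, 1,      /* block: # of threads (tuned)    */
                   0, 0, args, NULL); /* "Execute Kernel"               */
    cuEventRecord(t1, 0);
    cuEventSynchronize(t1);
    cuEventElapsedTime(&ms, t0, t1);  /* measured time of this variant  */

    cuModuleUnload(mod);
    return ms;
}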
Performance improvement by padding

[Chart: GFLOPS (0-350), batch = 65,536, with vs. without padding, across transform sizes 128, 140, 150, 180, 192, 240, 243, 256, 320, 400, 420, 512.]
Padding @ 192-point FFT (GeForce GTX 285)

[Chart: GFLOPS (0-250), batch = 65,536, with vs. without padding for the radix orderings 3,4,4,4 / 3,8,8 / 4,3,4,4 / 4,4,3,4 / 4,4,4,3 / 4,6,8 / 6,4,8. Without padding, the orderings suffer 2-way to 8-way bank conflicts.]
Time of auto-tuning for 1-D FFTs

[Chart: tuning time in seconds (0-80), batch = 65,536, versus transform size (128, 140, 150, 180, 240, 243, 256, 320, 400, 420, 512).]

The tuning time is O(data size) x O(# of variations in factorizations).
1-D FFT with kernels optimized for Tesla

[Chart: relative performance (%) (0-100), batch = 65,536, versus transform size (128, 140, 150, 180, 240, 243, 256, 320, 400, 420, 512).]
Performance of 1-D FFT

[Chart: GFLOPS (0-350), batch = 65,536, for NukadaFFT, CUFFT 2.3, FFTW 3.2.1, and MKL 11.0 across transform sizes 128, 140, 150, 180, 240, 243, 256, 320, 400, 420, 512.]

The FFTW and MKL libraries use four cores of an Intel Core i7 920.
1-D FFT in double precision

[Chart: GFLOPS (0-60), batch = 32,768, for NukadaFFT, CUFFT 2.3, FFTW 3.2.1, and MKL 11.0 across transform sizes 128, 140, 150, 180, 240, 243, 256, 400, 420, 512.]
Warp-level optimizations

The # of threads per thread block is constant within a kernel, so some threads may perform unnecessary operations.

Example: a 160-point FFT (radices 4, 5, 8) by 40 threads. The white part of the diagram marks unnecessary operations:
•  4-point FFT x 40 threads: all 40 threads have work.
•  5-point FFT x 32 threads: 8 threads are idle.
•  8-point FFT x 20 threads: 20 threads are idle.

At the warp level, 32 threads share instructions, so the 40 threads form warp 0 (threads 0-31) and warp 1 (threads 32-39). Warp 1 is entirely idle in the 5-point and 8-point stages; we try to eliminate its instructions there. The idle threads inside warp 0 during the 8-point stage cannot be removed, so we give up on those.
Warp-level flow control

Warp 0 executes the full pipeline:

for (....) {
    global_load();
    radix4_x();
    shmem();
    radix5_x();
    shmem();
    radix8_x();
    global_store();
}

Warp 1 has no work after the radix-4 stage, so it keeps only the matching synchronization:

for (....) {
    global_load();
    radix4_x();
    shmem();
    barrier();      /* matches warp 0's second shmem() synchronization */
}
Warp-level (result)

160-point FFT by 40 threads (warp 0 + warp 1):
•  4-point FFT x 40 threads: executed by both warps.
•  5-point FFT x 32 threads: executed by warp 0 only.
•  8-point FFT x 20 threads: executed by warp 0 only.
Performance improvements by the warp-level optimization

[Chart: GFLOPS (0-300) for Proposed, Normal, and CUFFT 3.1 across transform sizes 231, 234, 260, 264, 273, 297, 308, 312, 325, 330, 350, 351, 352, 363, 364, 375, 385, 390, 396, 400.]
NukadaFFT library and Updates

•  Initial version for GT200 GPUs in 2010.
•  Minor update (from PTX 1.4 to 2.0).
•  Recompile for CUDA 4.0.
•  Recompile for CUDA 5.0.
•  Recompile for CUDA 6.0.

http://matsu-www.is.titech.ac.jp/~nukada/nufft/
On Maxwell GPU (GeForce 750 Ti)

65,536 batched 1-D FFTs (N = 100-600), counts by outcome:

NukadaFFT fails                    274  (sizes which need prime factors > 32)
CUFFT fails                         31  (several sizes greater than 512)
Both work, NukadaFFT is faster     225  (up to 4.48x speed-up)
Both work, CUFFT is faster           2  (128, 256)
[Chart: GFLOPS (0-250) for CUFFT 6.0 and NukadaFFT across batched 1-D FFT sizes from 100 to 595.]
Multi-node 3-D FFT
FFT (Fast Fourier Transform)

Used in many kinds of applications:
•  1-D FFT -> signal/audio processing, ...
•  2-D FFT -> image processing, ...
•  3-D FFT -> large-scale HPC applications:
   •  bioinformatics, molecular dynamics, ...
Computational aspects of 3-D FFT

Suitable for the GPU architecture due to:
•  Really memory-bound computation: only O(N log N) FP ops for O(N) data.
•  Sufficient parallelism: many 1-D FFTs are performed for each dimension.
   ex) a CUDA thread block computes one 1-D FFT.

[Diagram: 1-D FFTs along dimension X, then Y, then Z.]
Multi-GPU 3-D FFT

•  Requires all-to-all communication (like a parallel 3-D FFT on clusters).

[Diagram: with data distributed as (NX, NY, NZ/P), 1-D FFTs are performed along X and Y; an all-to-all redistributes the data to (NX, NY/P, NZ); then 1-D FFTs run along Z.]
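A minimal per-process sketch of this scheme using CUFFT and MPI (a simplification: it stages the exchange through a single MPI_Alltoall on host buffers and omits the local transposes and error handling):

#include <mpi.h>
#include <cuda_runtime.h>
#include <cufft.h>

// One distributed forward 3-D FFT step (slab decomposition, sketch).
// plan_xy batches the local X/Y transforms, plan_z the Z transforms.
void fft3d_slab(cufftHandle plan_xy, cufftHandle plan_z,
                cufftDoubleComplex *d_data,
                cufftDoubleComplex *h_send, cufftDoubleComplex *h_recv,
                size_t local_elems, int nprocs, MPI_Comm comm)
{
    size_t chunk = local_elems / nprocs;   // complex elements per peer

    cufftExecZ2Z(plan_xy, d_data, d_data, CUFFT_FORWARD);    // FFTs for dims X & Y

    cudaMemcpy(h_send, d_data, local_elems * sizeof(*h_send),
               cudaMemcpyDeviceToHost);                       // D2H
    MPI_Alltoall(h_send, (int)(chunk * 2), MPI_DOUBLE,        // all-to-all
                 h_recv, (int)(chunk * 2), MPI_DOUBLE, comm); // (2 doubles/complex)
    cudaMemcpy(d_data, h_recv, local_elems * sizeof(*h_recv),
               cudaMemcpyHostToDevice);                       // H2D

    cufftExecZ2Z(plan_z, d_data, d_data, CUFFT_FORWARD);      // FFTs for dim Z
}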
TSUBAME 2.0 Compute Nodes (Thin)

HP SL390s G7:
•  2 Intel Xeon X5670 (Westmere)
•  3 NVIDIA Tesla M2050
•  2 Mellanox QDR InfiniBand
•  54 or 96 GB memory and SSDs

[Block diagram: two Westmere-EP CPUs, each with three DDR3 RAM channels, linked by QPI to two Tylersburg IO-Hubs; the IO-Hubs drive PCI-E x16 links to the three Fermi GPUs (GPU 0-2) and PCI-E x8 links to the two QDR x4 InfiniBand HCAs.]
TSUBAME 2.0 InfiniBand network

[Diagram: Voltaire GridDirector 4700 core switches (primary and secondary rails) connected to GridDirector 4036 edge switches, which serve the 1,408 thin compute nodes, the fat nodes, and the I/O nodes.]
Typical all-to-all between GPUs using MPI

•  Pipelined stages (D2H, MPI, H2D)
•  Bi-directional IB and PCI-E
•  Four DMA transfers simultaneously

[Timeline on node #0: D2H copies of outgoing chunks (0->1, 0->2, ...) overlap with MPI_Send of staged chunks, MPI_Recv of incoming chunks (99->0, 98->0, ...), and H2D copies of received chunks.]

/* Software pipeline: iteration i copies chunk i D2H, exchanges
   chunk i-1 via MPI, and copies chunk i-2 back H2D. */
for (i = 1; i < np + 2; i++) {
    if (i < np)
        cuMemcpyDtoH(....);
    if (i > 2)
        cuMemcpyHtoD(.....);
    if (i > 1 && i < np + 1) {
        MPI_Irecv(.......);
        MPI_Isend(........);
        MPI_Waitall(.....);
    }
    cuCtxSynchronize();    /* wait for the asynchronous copies */
}
Performance of 3-D FFT using MPI (1 GPU/node)

[Chart: performance in GFLOPS (log scale, 50-1600) versus # of nodes (4-128), for sizes 256, 384, 512, and 1024.]

Single GPU: 68 GF @ 256, 48.5 GF @ 384, 50.7 GF @ 512.
Multi-GPU per node, multiple nodes

(1) Hybrid: multiple GPUs per process.
    Hybrid results in a larger MPI message size.
(2) Flat: one GPU per process.

The placement of processes also determines the placement of the buffers used for MPI and CUDA.

[Diagram: node layout with CPU0/CPU1, GPU0-2, and QDR x4 IB. Configurations compared: 1GPU (0), 2GPU (0), 3GPU (0); 2GPU (0-0), 2GPU (0-1); 3GPU (0-0-0), 3GPU (0-0-1), 3GPU (0-1-1), where the digits give the CPU socket of each process.]
Performance with 64 nodes

Memory on both CPUs is used.

[Chart: performance in GFLOPS (0-1400) for sizes 256, 384, 512, and 1024 under the different process/GPU placements.]
On this limited scalability, many people say many things:

Hardware people -> hardware off-loading, etc.
System people -> new routing algorithms, tuned runtimes, etc.
Sales people -> "Please buy our network hardware and network-analyzer software."
Users -> "No problem. It is normal. Please GIVE UP!"

Our answer is: "It's too early to give up!"
Optimizations

Two major issues:
•  Overhead of sending small messages
•  Network congestion

To stabilize and improve the all-to-all performance, we need the following optimizations:
1)  Low-level IB verbs API
2)  NUMA-aware buffer allocation: 3-11% improvement
3)  Simultaneous RDMA transfers and scheduling
4)  Dynamic rail-assignment control
MPI overheads (size: 256x256x256)

•  MPI splits messages across the multiple IB rails.
•  IB verbs uses only one rail for each peer node.

[Chart: performance in GFLOPS (0-800) versus # of nodes (4-128), comparing IB verbs with MPI_Isend/Irecv.]
NUMA and rail assignment

[Chart: performance in GFLOPS (0-500) versus # of communications in each stage (= chunk size, 2-64), comparing "CPU0 & CPU1: CPU0--HCA0, CPU1--HCA1" with "CPU0 only: CPU0--HCA0&1".]

3-11% improvement by using the memory regions of all CPUs.
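NUMA-aware buffer allocation can be sketched with libnuma: each staging buffer is placed on the socket that owns the HCA it feeds (the socket/HCA pairing below is illustrative, not the measured configuration):

#include <stddef.h>
#include <numa.h>

/* Allocate the staging buffer for each rail on the NUMA node next to
   its HCA: rail 0 buffers near HCA0 (socket 0), rail 1 near HCA1. */
void *alloc_rail_buffer(size_t bytes, int rail)
{
    int node = (rail == 0) ? 0 : 1;   /* illustrative socket/HCA pairing */
    return numa_alloc_onnode(bytes, node);
}

/* Buffers allocated this way are released with numa_free(ptr, bytes). */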
All-to-all using IB verbs RDMA

Synchronization is required before starting an RDMA read, to ensure that message packing has completed.

[Timeline on node #0: D2H copies of outgoing chunks (0->1 ... 0->7) interleave with rdma_read operations pulling chunks 99->0 ... 94->0, each followed by its H2D copy.]
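Posting one of these rdma_read operations with the verbs API looks roughly like this (a sketch: queue-pair setup, memory registration, and the exchange of the peer's address/rkey are assumed done elsewhere; error checks omitted):

#include <stdint.h>
#include <string.h>
#include <infiniband/verbs.h>

/* Post an RDMA READ that pulls one packed chunk from the peer's
   registered buffer into our local staging buffer. */
int post_rdma_read(struct ibv_qp *qp, struct ibv_mr *local_mr,
                   void *local_addr, uint64_t remote_addr, uint32_t rkey,
                   uint32_t bytes, uint64_t wr_id)
{
    struct ibv_sge sge;
    struct ibv_send_wr wr, *bad_wr;

    memset(&sge, 0, sizeof(sge));
    sge.addr   = (uintptr_t)local_addr;
    sge.length = bytes;
    sge.lkey   = local_mr->lkey;

    memset(&wr, 0, sizeof(wr));
    wr.wr_id               = wr_id;
    wr.opcode              = IBV_WR_RDMA_READ;
    wr.send_flags          = IBV_SEND_SIGNALED;  /* completion = chunk arrived */
    wr.sg_list             = &sge;
    wr.num_sge             = 1;
    wr.wr.rdma.remote_addr = remote_addr;
    wr.wr.rdma.rkey        = rkey;

    return ibv_post_send(qp, &wr, &bad_wr);
}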
All-to-all using multi-rail IB-RDMA

[Timeline on node #0: outgoing chunks 0->1 ... 0->12 are staged D2H and distributed over two rails (odd chunks on Rail 0, even chunks on Rail 1); each rail issues its own rdma_read stream (99->0, 97->0, ... on Rail 0; 98->0, 96->0, ... on Rail 1), and completed chunks are copied back H2D.]
Reduction of synchronizations

Multiple RDMA operations are executed concurrently, with relatively smaller overhead.

[Timeline on node #0: chunks are grouped so that each synchronization point launches several rdma_reads per rail at once (e.g., 99->0 and 97->0 on Rail 0, 98->0 and 96->0 on Rail 1), reducing the number of synchronizations between packing and RDMA.]
An example to show the advantage

          Step 1    Step 2
Rank #0   Line A    Line B
Rank #1   Line B    Line D
Rank #2   Line A    Line C
Rank #3   Line C    Line D

[Diagram: ranks #0-#3 attached to edge switches; lines A-D are the switch-to-switch links used by each rank in each step.]
Chunk size and performance (256x256x256 3-D FFT using 64 nodes)

[Chart: performance in GFLOPS (0-500) versus # of communications in each stage (= chunk size, 2-64).]

Small chunk size: many synchronizations, high % of overlap, critical w.r.t. network contention.
Large chunk size: few synchronizations, low % of overlap, robust against network contention.
Dynamic Rail Selection

•  Exchange the assigned rails for slow connections.

[Diagram: chunks 1-8 split across Rail 0 and Rail 1; when one connection turns out slow, the rails assigned to the remaining transfers are exchanged.]

Changing the rail may avoid the slow-down thanks to:
•  less contention due to different routing,
•  not using bad/unstable links.
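The control logic can be sketched as follows (entirely illustrative: the 2x-average threshold and the timing source are placeholders, not the actual implementation):

/* Per-peer rail assignment: swap the rail of any connection that was
   much slower than the per-stage average in the last measurement. */
void reassign_rails(int npeers, int rail[], const double elapsed[])
{
    double avg = 0.0;
    for (int p = 0; p < npeers; p++)
        avg += elapsed[p];
    avg /= npeers;

    for (int p = 0; p < npeers; p++)
        if (elapsed[p] > 2.0 * avg)   /* illustrative slowness threshold */
            rail[p] ^= 1;             /* use the other rail next time    */
}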
Effect of dynamic rail selection (size: 256x256x256)

7 jobs (and many other users' jobs) are executed concurrently.

[Chart: performance in GFLOPS (0-600), rail selection vs. no selection, on compute-node Blocks 1-7 (each block has 64 nodes, 3 GPUs per node).]

Blocks 1-6: 5-6 edge switches (H/X queue); Block 7: 9 edge switches (S queue).
Effect of dynamic rail selection

The dynamic rail selection can avoid terrible network contention.

[Chart: performance in GFLOPS (0-600) over roughly 2,000 consecutive runs, rail selection vs. no selection.]
Final results

Performance improvement (256x256x256 3-D FFT using 64 nodes):

[Chart: performance in GFLOPS (0-600), before vs. after the optimizations.]
3-D FFT using 1 GPU per node

[Chart: performance in GFLOPS (log scale, 50-1600) versus # of nodes (4-128), for sizes 256, 384, 512, and 1024.]

Single GPU: 68 GF @ 256, 48.5 GF @ 384, 50.7 GF @ 512.
3-D FFT using 3 GPUs per node

4.8 TF with 256 nodes (size: 2048^3).

[Chart: performance in GFLOPS (log scale, 50-3200) versus # of nodes (4-128), for sizes 256, 384, 512, and 1024.]

Single GPU: 68 GF @ 256, 48.5 GF @ 384, 50.7 GF @ 512.
Breakdown of the execution time (256x256x256, 1 GPU per node)

[Stacked chart: time in msec (0-45) versus # of nodes (4-128), split into FFT (X&Y), all-to-all, and FFT (Z).]
Breakdown of the execution time (256x256x256, 3 GPUs per node)

[Stacked chart: time in msec (0-45) versus # of nodes (4-128), split into FFT (X&Y), all-to-all, and FFT (Z).]
Breakdown of the execution time (256x256x256, 1 GPU or 3 GPUs per node)

•  The on-board FFT is faster with 3 GPUs per node.
•  The all-to-all part is:
   •  faster with 3 GPUs for large message sizes,
   •  faster with 1 GPU for small message sizes.

[Stacked chart: time in msec versus # of nodes, split into FFT (X&Y), all-to-all, and FFT (Z).]
Summary

•  Auto-tuning FFT for CUDA GPUs.
•  Scalable all-to-all communication between GPUs on TSUBAME 2.0.