Slides - LRR - Technische Universität München

Parallel Programming Exercises
Andreas Hollmann
Technische Universität München
June 06, 2014
Organization
I
Andreas Wilhelm will do the Dependcy Analysis
I
Tutorial will take place next Wednesday (June 11) instead
of the lecture. No lecture next week.
I
Two lectures the week after: June 17 and 18
I
MPI tutorial will start on June 24
Merge Sort with OpenMP Tasks and Sections 1 / 5
I
The assignments consists of two parts
I
Use the provided OpenMP implementation.
I
Parallelize it with OpenMP tasks and try to optimize it.
I
Reduce the overhead. Use the variable threshold.
I
We accept only solution that do not create more tasks
then necessary for a given threshold.
I
---
I
Compare it to the parallelization with OpenMP Sections
and nested parallelism.
I
You can define the nesting level with
omp_set_max_active_levels(n)
I
Enable nesting with omp_set_nested(1)
I
Use the threshold for defining the nesting level.
Merge Sort with OpenMP Tasks and Sections 2 / 5
Assignment: Merge Sort with OpenMP Tasks and
Sections 3 / 5
1
2
3
4
5
void merge_sort ( int64_t * a , s i z e _ t num_elements ,
i n t num_threads , i n t threshold )
{
s i z e _ t size = num_elements * sizeof ( int64_t ) ;
int64_t * b = malloc ( size ) ;
6
i f (b == NULL)
e x i t ( EXIT_FAILURE ) ;
7
8
9
memcpy(b , a , size ) ;
10
11
s p l i t ( a , b , 0 , num_elements , threshold ) ;
12
13
free (b ) ;
14
15
}
Merge Sort with OpenMP Tasks and Sections 4 / 5
1
2
3
4
5
void s p l i t ( int64_t * a , int64_t * b , s i z e _ t begin ,
s i z e _ t end , i n t threshold )
{
i f (end − begin < 2)
return ;
6
swap(&a , &b ) ;
7
8
s i z e _ t mid = ( begin + end) / 2;
9
10
s p l i t ( a , b , begin , mid , threshold ) ;
s p l i t ( a , b , mid , end , threshold ) ;
11
12
13
merge( a , b , begin , mid , end ) ;
14
15
}
Merge Sort with OpenMP Tasks and Sections 5 / 5
1
2
3
4
void merge( int64_t * a , int64_t * b , s i z e _ t begin ,
s i z e _ t mid , s i z e _ t end)
{
s i z e _ t l = begin , r = mid , idx = begin ;
5
while ( l < mid && r < end)
b[ idx++] = a[ l ] < a[ r ] ? a[ l++] : a[ r++];
6
7
8
while ( l < mid)
b[ idx++] = a[ l ++];
9
10
11
while ( r < end)
b[ idx++] = a[ r++];
12
13
14
}
Solution: Merge Sort with OpenMP Tasks 1 / 2
1
2
3
4
void merge_sort ( int64_t * a , s i z e _ t num_elements ,
i n t num_threads , i n t threshold )
{
omp_set_num_threads ( num_threads ) ;
5
s i z e _ t size = num_elements * sizeof ( int64_t ) ;
int64_t * b = malloc ( size ) ;
6
7
8
i f (b == NULL)
e x i t ( EXIT_FAILURE ) ;
9
10
11
memcpy(b , a , size ) ;
12
13
#pragma omp p a r a l l e l
#pragma omp single
s p l i t ( a , b , 0 , num_elements , threshold ) ;
14
15
16
17
free (b ) ;
18
19
}
Solution: Merge Sort with OpenMP Tasks 2 / 2
1
2
3
4
5
void s p l i t ( int64_t * a , int64_t * b , s i z e _ t begin ,
s i z e _ t end , i n t threshold )
{
i f (end − begin < 2)
return ;
6
swap(&a , &b ) ;
7
8
s i z e _ t mid = ( begin + end) / 2;
9
10
#pragma omp task i f (end − begin > threshold )
s p l i t ( a , b , begin , mid , threshold ) ;
s p l i t ( a , b , mid , end , threshold ) ;
11
12
13
14
#pragma omp taskwait
merge( a , b , begin , mid , end ) ;
15
16
17
}
Solution: Merge Sort with OpenMP Sections 1 / 2
1
2
3
4
5
void merge_sort ( int64_t * a , s i z e _ t num_elements ,
i n t num_threads , i n t threshold )
{
s i z e _ t size = num_elements * sizeof ( int64_t ) ;
int64_t * b = malloc ( size ) ;
6
i f (b == NULL)
e x i t ( EXIT_FAILURE ) ;
7
8
9
memcpy(b , a , size ) ;
10
11
omp_set_num_threads ( num_threads ) ;
omp_set_nested ( 1 ) ;
omp_set_max_active_levels ( threshold ) ;
12
13
14
15
s p l i t ( a , b , 0 , num_elements , threshold ) ;
16
17
free (b ) ;
18
19
}
Solution: Merge Sort with OpenMP Sections 2 / 2
1
2
3
4
void s p l i t ( int64_t * a , int64_t * b , s i z e _ t begin ,
s i z e _ t end , i n t threshold )
{ i f (end − begin < 2)
return ;
5
swap(&a , &b ) ;
6
7
s i z e _ t mid = ( begin + end) / 2;
8
9
#pragma omp p a r a l l e l sections
{
#pragma omp section
s p l i t ( a , b , begin , mid , threshold ) ;
10
11
12
13
14
#pragma omp section
s p l i t ( a , b , mid , end , threshold ) ;
15
16
}
merge( a , b , begin , mid , end ) ;
17
18
19
}
Which level to parallelize? -> Dependency Analysis
1
2
3
4
5
6
7
8
9
10
f o r ( i n t i = 1; i < I_MAX ; i++)
{
f o r ( i n t j = 1; j < J_MAX ; j++)
{
f o r ( i n t k = 1; k < K_MAX; k++)
{
A[ i ] [ j ] [ k ] = 2 * A[ i −1][ j −1][k ] ;
}
}
}
Which level to parallelize? -> Dependency Analysis
1
2
3
4
5
6
7
8
9
10
11
f o r ( i n t i = 1; i < I_MAX ; i++)
{
f o r ( i n t j = 1; j < J_MAX ; j++)
{
#pragma omp p a r a l l e l f o r
f o r ( i n t k = 1; k < K_MAX; k++)
{
A[ i ] [ j ] [ k ] = 2 * A[ i −1][ j −1][k ] ;
}
}
}
Which level to parallelize? -> Dependency Analysis
1
2
3
4
5
6
7
8
9
10
11
f o r ( i n t i = 1; i < I_MAX ; i++)
{
f o r ( i n t j = 1; j < J_MAX ; j++)
{
#pragma omp p a r a l l e l f o r / / <−−− What happens here?
f o r ( i n t k = 1; k < K_MAX; k++)
{
A[ i ] [ j ] [ k ] = 2 * A[ i −1][ j −1][k ] ;
} / / <−−− What happens here?
}
}
Which level to parallelize? -> Dependency Analysis
1
2
3
4
5
6
7
8
9
10
11
12
#pragma omp p a r a l l e l
f o r ( i n t i = 1; i < I_MAX ; i++)
{
f o r ( i n t j = 1; j < J_MAX ; j++)
{
#pragma omp f o r
f o r ( i n t k = 1; k < K_MAX; k++)
{
A[ i ] [ j ] [ k ] = 2 * A[ i −1][ j −1][k ] ;
} / / <−−− What happens here?
}
}
Reducing the Overhead of inner Loop Parallelization
I
Starting a parallel region and synchronizing it in the end
is costly
I
Use ‘nowait‘ clause reduces the overhead
I
‘nowait‘ only works with static scheduling
I
Removes the synchronization at the end of a #pragma
omp for
Using ‘nowait‘ in inner Loop
1
2
3
4
5
6
7
8
9
10
11
12
#pragma omp p a r a l l e l
f o r ( i n t i = 1; i < I_MAX ; i++)
{
f o r ( i n t j = 1; j < J_MAX ; j++)
{
#pragma omp f o r nowait
f o r ( i n t k = 1; k < K_MAX; k++)
{
A[ i ] [ j ] [ k ] = 2 * A[ i −1][ j −1][k ] ;
}
}
}
Using ‘nowait‘ in inner Loop
1
2
3
#define I_MAX 1000
#define J_MAX 1000
#define K_MAX 10
4
5
6
7
$ . / omp_inner_loop
Inner Loop with nowait takes : 0.121344 seconds
Inner Loop without nowait takes : 1.163177 seconds