SuperMatrix: Out-of-Order Scheduling of Matrix Operations

Anuncio
$
%
! "
& #
(
" #
"
)
'
%
-
.+
+
/)
)
0
/660 0 0
1 ,
6
7
!
# 8
-
,
9
#
"
%
6
2
) 3$ 4
5
#
%
+
#
# /
7
7
: . " " . ; <=; >*? @.
,
#
: . " " . ; B; *B = @.
!
#
#
# #
!
#
A
A
*
-
%
.+
.+
#
C
-
%
%
+
,
9
"
Algorithms
Architectures
(hardware)
-
.+
.+
Compilers,
languages, and libraries
#
=
%
-
%
7
7
#
:
@"
+
A9
>J<
>><
µ
I
G
" FG
" F
/
"
>B<
H+
+:
,
H+
*G H +
+
" F
9 0
*; ; <
"
CG
%
G. 2
$
2
+
+
" FG
" F
*; <
DE
+
,
G
DE
DE
!E
<
%
-
K
#
.
!G
#
G
4
#
>B<
µ
H+
+:
G
" FG
" F
H+
*G H +
+
" F
!
,
, ! 0
3
>><
*; ; <
"
CG
!E
, !G
>J<
I
,
G. 2
$
2
+
+
" FG
" F
L
DE
0 5
*; <
G
DE
DE
?
%
-
K
.
#
!G
#
G
.
: 0
3
,
, !G
3
L
!E
!
,
0 5
!5
Algorithms
: 0
: 0
Architectures
(hardware)
Compilers,
languages, and libraries
B
%
-
9 0
%
+
,
!E
#
.+
4
#
3
#
5
# /
#
Algorithms
1 0
Architectures
(hardware)
Compilers,
languages, and libraries
J
-
%
.+
M "
N
1 %
7
7
7
7
-
.+
H
G&
) G @4
# AG " 4
µ
2
G
!
%
2
G
G
,
G *; ; <
:
: 0
: 0
: 0
#
>
.+
:
: 0
: 0
: 0
/
.+
- "
!/"
P
O:
L!
8
3
5#
P + +4 P
!
#
P,
;
.+
:
: 0
: 0
: 0
.+
/ !
%
#
M "
N/ !
" #
, #
P
#
P + +4 P
4
#
+
J
0
#
0
/
/
/ 3R P R45 Q 3R S ; 5
/ 341 +3 5 P +5 Q 3++4 P R5
Q
*
/
/
%
#
4+
+4+
41
41
P
H+
H1
+H +
H1
+H + +H +4
.+
:
: 0
: 0
: 0
.+
/ !
#
/
%
.+
, #
P
P + +4 P
!
#
#
*
.+
:
: 0
: 0
: 0
.+
-
/ !
"
#
%
/
0
H+
#
-
/
#
)
GH +
+
*G H +
CG
" F
/
!
#
%
+
: 0
3
# #
#
!5
Algorithms
: 0
Architectures
(hardware)
Compilers,
Languages, and libraries
: 0
C
.+
:
: 0
: 0
: 0
.+
/
# #
M "
#
4
"
/
N/ "
E
1 int Chol_unb( FLA_Obj A )
2 {
3
4
5
6
7 }
...
FLA_Part_2x2( A,
&ATL, &ATR,
&ABL, &ABR,
0, 0, FLA_TL );
...
=
.+
:
: 0
: 0
: 0
.+
-
/
/
# #
"
/
" G.
#
-
#
,G + ,% 0
+ 4 U
/
4
L
0
3
# 5
#
T
0
0
%
: 0
3
!5
Algorithms
: 0
Architectures
(hardware)
Compilers,
languages, and libraries
: 0
<
.+
:
: 0
: 0
: 0
.+
/
#
#
.
#
G
%
#
P + +4 P
P
# 8
/
E
C
C,
#
%
L
%
/
#
?
.+
:
: 0
: 0
: 0
#
#
%
#
!
#
Performance on a single processor of SGI Altix 350
6
4
V1
V2
V3
I C,
3
2
1
10000
9000
8000
7000
6000
5000
4000
3000
0
500
# 8
GigaFLOPS
5
2000
/
1000
.+
/
Matrix size
# *(
<2 9 8 3
?=5 +C ? H
B
.+
:
: 0
: 0
: 0
.+
/
#
/
#
#
Performance on 16 processors of SGI Altix 350
90
,
60
V1
V2
V3
50
40
30
20
10
10000
9000
8000
7000
6000
5000
4000
3000
0
2000
# 8
I
70
1000
GigaFLOPS
80
Matrix size
?
# *(
V 2
<2 9 8 3 ?=5 +C ? H
:$
L
J
.+
:
: 0
: 0
: 0
.+
/
#
#
M "
#
/
N/ 2
#
#
G
%
G
G *GWG
#
# 8
#
I G I *GWGI
>
.+
:
: 0
: 0
: 0
.+
/
#
#
M "
#
/
N/
!
#
#
#
I G I *GWGI
1
+
# 8
!
# 8
+*
+CE
1
L6 #
#
#
#
#
*;
.+
:
: 0
: 0
: 0
.+
-
/
"
#
#
/ ,. +
; >
0
#
-
#
/
)
+
H+
" FG
GH +
C*G
*G H +
?=
CG
/
,
#
: 0
3
!5
%
3
#
5
Algorithms
@: 0 A
Architectures
(hardware)
Compilers,
languages, and libraries
: 0
*
.+
:
: 0
: 0
: 0
.+
/
# #
/
!
%
#
#
+
.
,
9 "
#
%
Family of
algorithms
1 0
1
+
/#
!
# 8
+*
+CE
1
L6 #
#
#
Library
libFLAME 0.9
LGPL, 2006
**
-
%
.+
.+
#
M "
*N
"
@
G
G2
G1 %
#
2
) G
#
#
AG >
"
G *; ; B
#
7
7
7
7
%
#
#
1
%
%
, !
!
*C
.+
"
/
%
, !
!
+
1
.+
-
#
/#
.+
0
#
%
,
X
"
H
V
!V 4
$ :$
.
, "
:
?= * 3*
#
3
++ H
5
/ VJ
4 3J
5
5G
"
3=
5G
E
!
,
0
J;
G#
!
Y
*=
.+
"
/
%
+
1
.+
-
#
"
/#
Z
7
7
7
#
, !
!
%
!
3
5
/
9
!
"
3
G !
#
0
0
,
5
8
"
"
:
:
:
:
+
+
+
+
"
+
1
/
+
+*
1
L
+
1
+*
: 0
+
1
6
# #
, 0
"
*<
.+
"
/
%
+
1
.+
#
-
#
1
7
7
7
-
.
+
/#
K
0
7
7
7
%
,
"
, !
, ! 3
!
/
!5
3L
0 3
# #
5
5E
#
#
+
#
, !
!
!
!
!
,
!
,
L
#
*?
.+
"
/
%
+
1
.+
#
-
8
/
, !
!
, !
/
0
0
W
$ .
$ .
+ %
3
*
C
G
$ .
W
*
%
# 5
34 #
5
*B
.+
"
/
%
+
1
.+
#
-
8
/
, !
!
, !
,
"
4
3
5
L
0
0
W
:
#
:*
L
:
W
3
64
L + %
# 5
*
C
!
#
G
%
34 #
5
*J
.+
"
/
%
, !
!
+
1
.+
#
/
#
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
, !
L 3
#
5
int Chol_blk( FLA_Obj A, int b)
{
...
while ( FLA_Obj_length( ATL ) < FLA_Obj_length( A ) ){
...
/* -------------------------------------------------*/
/* A11 := Chol_unb( A11 ) */
Chol_unb( A11 );
,. +
4
0
4
4*
/* A21 := A21 * TRIL( A11 )^-T */
FLA_Trsm( ..., A11, A21 );
4C
L
0
W
W
/* A22 := A22 - TRIL( A21 * A21^T ) */
FLA_Syrk( ..., A21,..., A22 );
/* -----------------------------------------------------*/
...
}
7
7
7
#
1
4 L
%
8 /,
#
#
# #
T*
, !
, !
*>
.+
"
/
%
, !
!
+
1
.+
#
*
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
!
/
#
, !
P
#
L
int Chol_blk( FLA_Obj A, int b)
{
...
while ( FLA_Obj_length( ATL ) < FLA_Obj_length( A ) ){
...
/* -------------------------------------------------*/
/* A11 := Chol_unb( A11 ) */
Chol_unb( A11 );
,. +
4
0
4
0
W
W
4*
/* A21 := A21 * TRIL( A11 )^-T */
FLA_Trsm( ..., A11, A21 );
L
4C
/* A22 := A22 - TRIL( A21 * A21^T ) */
FLA_Syrk( ..., A21,..., A22 );
/* -----------------------------------------------------*/
...
}
7
#
L
!
#
,
C;
.+
"
/
%
+
1
.+
#
/
, !
P4 #
C
, !
!
#
4
L
L
0
0
W
7
7
3#
5
:
:*
# #
L
,
:
W
!T
0
C
.+
"
/
%
+
1
.+
#
/
, !
!
, !
C
*
C
1
W
7
7
"
F
2
4
3
5
6
!
! #
,!
L
7
8
9
10
/
/
0
#
#
6
L
#
4 #
[
#
C*
.+
"
/
%
+
1
.+
-
#
/
1
, !
!
!
# #
"
:
:
:
:
+
+
+
+
"
1
+*
1
!
*
1
L G
%
G
#
CC
.+
"
/
%
+
1
.+
#
/
!
, !
!
!
L G
G
"
6
:
:
:
:
+
+
+
+
"
#
4
4*
1
+*
1
7
G0
5
3
7
!
#
!
L
#
!
G,
# /
!
C=
.+
"
/
%
+
1
.+
*
#
1
/
%
H
:
3" /
L0
,! ,
7
0 0
5
W
0
L6
.+
!
#
4
7
7
, !
!
%
L ,
W
L
L
# ,
G
#
64+H
,
#
#
% /
9
C<
.+
"
/
%
+
1
.+
#
-
#
, !
!
/
≈"
!/
90
80
Scalability+Affinity+Recursive
storage
Scalability+Affinity
60
50
Scalability
40
V3
30
LAPACK
20
10
10000
9000
8000
7000
6000
5000
4000
2000
3000
0
1000
GigaFLOPS
70
Matrix size
?
# *(
<2 9 8 3
?=5 +C ? H
C?
.+
"
/
%
+
1
.+
-
#
/
"
"
+$ 0
#
!
/
0
-
# #
, !
!
L!
1
+$ 0 6
8
!5
#
%
%
8
%
GH +
CT
#
/
4
L
0
3 ,
T
#
: 0
3
!
# 3
%
V
5
!5
Algorithms
: 0
Architectures
(hardware)
Compilers,
languages, and libraries
: 0
CB
.+
"
/
%
+
1
.+
/
# #
, !
!
!
%
#
+
,
#
libFLAME
9
3
libFLASH
5
!
#
!
W
#
#
L
CJ
Descargar