@@ -386,6 +386,77 @@ void _ccv_nnc_tensor_set_cpu_ref_f16(ccv_nnc_tensor_view_t* const a, const float
 }
 }
 
+void _ccv_nnc_tensor_set_cpu_ref_bf16(ccv_nnc_tensor_view_t* const a, const float b)
+{
+	// Assuming this is bfloat 16.
+	int dim[CCV_NNC_MAX_DIM_ALLOC];
+	int astride[CCV_NNC_MAX_DIM_ALLOC];
+	short h;
+	ccv_float_to_bfloat((float*)&b, (uint16_t*)&h, 1);
+	int x;
+	if (!CCV_IS_TENSOR_VIEW(a))
+	{
+		// Super optimal case, just do one for-loop for set.
+		const int tensor_count = ccv_nnc_tensor_count(a->info);
+		for (x = 0; x < tensor_count; x++)
+			a->data.f16[x].v = h;
+		return;
+	}
+	assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
+	ccv_nnc_tensor_view_get_dim(a, dim);
+	ccv_nnc_tensor_view_get_stride(a, astride);
+	int i[CCV_NNC_MAX_DIM + 2];
+	short* const ap = (short*)a->data.f16;
+	const int count = dim[2] * dim[3];
+	if (astride[2] == dim[3])
+	{
+		// Special casing if astride[2] is the same as dim[3] (the last two dimensions are contiguous).
+		for (i[0] = 0; i[0] < dim[0]; i[0]++)
+		{
+			short* ap0 = ap + i[0] * astride[0];
+			for (i[1] = 0; i[1] < dim[1]; i[1]++)
+			{
+				for (x = 0; x < count; x++)
+					ap0[x] = h;
+				ap0 += astride[1];
+			}
+		}
+		return;
+	} else if (astride[3] == 1) {
+		// The case where the last dimension is packed.
+		for (i[0] = 0; i[0] < dim[0]; i[0]++)
+		{
+			short* const ap0 = ap + i[0] * astride[0];
+			for (i[1] = 0; i[1] < dim[1]; i[1]++)
+			{
+				short* ap1 = ap0 + i[1] * astride[1];
+				for (i[2] = 0; i[2] < dim[2]; i[2]++)
+				{
+					for (x = 0; x < dim[3]; x++)
+						ap1[x] = h;
+					ap1 += astride[2];
+				}
+			}
+		}
+		return;
+	}
+	// Non-optimal case, need to do skip copy.
+	for (i[0] = 0; i[0] < dim[0]; i[0]++)
+	{
+		short* const ap0 = ap + i[0] * astride[0];
+		for (i[1] = 0; i[1] < dim[1]; i[1]++)
+		{
+			short* ap1 = ap0 + i[1] * astride[1];
+			for (i[2] = 0; i[2] < dim[2]; i[2]++)
+			{
+				for (x = 0; x < dim[3]; x++)
+					ap1[x * astride[3]] = h;
+				ap1 += astride[2];
+			}
+		}
+	}
+}
+
 void _ccv_nnc_tensor_set_cpu_ref_f32(ccv_nnc_tensor_view_t* const a, const float b)
 {
 	// Assuming this is float 32.
@@ -603,7 +674,7 @@ static int _ccv_nnc_data_transfer(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t
 		if (a != b) // Only do transfer if these are two different tensors.
 		{
 			assert(a->info.datatype == b->info.datatype);
-			if (a->info.datatype == CCV_16F)
+			if (a->info.datatype == CCV_16F || a->info.datatype == CCV_16BF)
 				_ccv_nnc_tensor_transfer_cpu_ref_f16(a, b);
 			else if (a->info.datatype == CCV_32F || a->info.datatype == CCV_32S)
 				_ccv_nnc_tensor_transfer_cpu_ref_f32(a, b);
@@ -619,7 +690,7 @@ static int _ccv_nnc_data_transfer(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t
 REGISTER_COMMAND_BACKEND(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
 {
 	registry->tensor_formats = CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_CHWN;
-	registry->tensor_datatypes = CCV_64F | CCV_32F | CCV_16F | CCV_32S;
+	registry->tensor_datatypes = CCV_64F | CCV_32F | CCV_16F | CCV_32S | CCV_16BF;
 	registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
 	registry->algorithms = 1;
 	registry->exec = _ccv_nnc_data_transfer;
@@ -628,7 +699,7 @@ REGISTER_COMMAND_BACKEND(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_CPU_REF)
 REGISTER_COMMAND_BACKEND(CCV_NNC_DATA_TRANSFER_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
 {
 	registry->tensor_formats = CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_CHWN;
-	registry->tensor_datatypes = CCV_64F | CCV_32F | CCV_16F | CCV_32S;
+	registry->tensor_datatypes = CCV_64F | CCV_32F | CCV_16F | CCV_32S | CCV_16BF;
 	registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
 	registry->algorithms = 1;
 	registry->exec = _ccv_nnc_data_transfer;
@@ -644,6 +715,8 @@ static int _ccv_nnc_set_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint,
 	for (i = 0; i < output_size; i++)
 		if (outputs[i]->info.datatype == CCV_16F)
 			_ccv_nnc_tensor_set_cpu_ref_f16((ccv_nnc_tensor_view_t*)outputs[i], cmd.info.blas.a[0]);
+		else if (outputs[i]->info.datatype == CCV_16BF)
+			_ccv_nnc_tensor_set_cpu_ref_bf16((ccv_nnc_tensor_view_t*)outputs[i], cmd.info.blas.a[0]);
 		else if (outputs[i]->info.datatype == CCV_32F)
 			_ccv_nnc_tensor_set_cpu_ref_f32((ccv_nnc_tensor_view_t*)outputs[i], cmd.info.blas.a[0]);
 		else if (outputs[i]->info.datatype == CCV_64F)
@@ -666,7 +739,7 @@ static int _ccv_nnc_set_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint,
 REGISTER_COMMAND_BACKEND(CCV_NNC_SET_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
 {
 	registry->tensor_formats = CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_CHWN;
-	registry->tensor_datatypes = CCV_64F | CCV_32F | CCV_16F | CCV_32S;
+	registry->tensor_datatypes = CCV_64F | CCV_32F | CCV_16F | CCV_32S | CCV_16BF;
 	registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
 	registry->algorithms = 1;
 	registry->exec = _ccv_nnc_set_forw;
@@ -675,7 +748,7 @@ REGISTER_COMMAND_BACKEND(CCV_NNC_SET_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_c
 REGISTER_COMMAND_BACKEND(CCV_NNC_SET_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
 {
 	registry->tensor_formats = CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_CHWN;
-	registry->tensor_datatypes = CCV_64F | CCV_32F | CCV_16F;
+	registry->tensor_datatypes = CCV_64F | CCV_32F | CCV_16F | CCV_32S | CCV_16BF;
 	registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
 	registry->algorithms = 1;
 	registry->exec = _ccv_nnc_set_back;
@@ -1040,7 +1113,7 @@ static int _ccv_nnc_format_transform(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint
 		} else if (a->info.format == CCV_TENSOR_FORMAT_CHWN && b->info.format == CCV_TENSOR_FORMAT_NCHW) {
 			assert(0);
 		}
-	} else if (a->info.datatype == CCV_16F) {
+	} else if (a->info.datatype == CCV_16F || a->info.datatype == CCV_16BF) {
 		if (a->info.format == b->info.format) {
 			// If it is the same, just do a normal data transfer.
 			_ccv_nnc_tensor_transfer_cpu_ref_f16(a, b);
@@ -1084,7 +1157,7 @@ static int _ccv_nnc_format_transform(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint
 REGISTER_COMMAND_BACKEND(CCV_NNC_FORMAT_TRANSFORM_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
 {
 	registry->tensor_formats = CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_CHWN;
-	registry->tensor_datatypes = CCV_64F | CCV_32F | CCV_32S | CCV_16F | CCV_8U;
+	registry->tensor_datatypes = CCV_64F | CCV_32F | CCV_32S | CCV_16F | CCV_8U | CCV_16BF;
 	registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
 	registry->algorithms = 1;
 	registry->exec = _ccv_nnc_format_transform;
@@ -1093,7 +1166,7 @@ REGISTER_COMMAND_BACKEND(CCV_NNC_FORMAT_TRANSFORM_FORWARD, CCV_NNC_BACKEND_CPU_R
 REGISTER_COMMAND_BACKEND(CCV_NNC_FORMAT_TRANSFORM_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
 {
 	registry->tensor_formats = CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_CHWN;
-	registry->tensor_datatypes = CCV_64F | CCV_32F | CCV_32S | CCV_16F | CCV_8U;
+	registry->tensor_datatypes = CCV_64F | CCV_32F | CCV_32S | CCV_16F | CCV_8U | CCV_16BF;
 	registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
 	registry->algorithms = 1;
 	registry->exec = _ccv_nnc_format_transform;
@@ -1209,7 +1282,7 @@ static int _ccv_nnc_datatype_conversion(const ccv_nnc_cmd_t cmd, const ccv_nnc_h
 		assert(a->info.format == b->info.format);
 		if (a->info.datatype == b->info.datatype) {
 			// If it is the same, just do a normal data transfer.
-			if (a->info.datatype == CCV_16F)
+			if (a->info.datatype == CCV_16F || a->info.datatype == CCV_16BF)
 				_ccv_nnc_tensor_transfer_cpu_ref_f16(a, b);
 			else if (a->info.datatype == CCV_32F)
 				_ccv_nnc_tensor_transfer_cpu_ref_f32(a, b);
@@ -1254,6 +1327,42 @@ static int _ccv_nnc_datatype_conversion(const ccv_nnc_cmd_t cmd, const ccv_nnc_h
 			const int tensor_count = ccv_nnc_tensor_count(a->info);
 			assert(tensor_count == ccv_nnc_tensor_count(b->info));
 			ccv_half_precision_to_double((uint16_t*)a->data.f16, b->data.f64, tensor_count);
+		} else if (a->info.datatype == CCV_16F && b->info.datatype == CCV_16BF) {
+			assert(CCV_IS_TENSOR_CONTIGUOUS(a));
+			assert(CCV_IS_TENSOR_CONTIGUOUS(b));
+			const size_t tensor_count = ccv_nnc_tensor_count(a->info);
+			assert(tensor_count == ccv_nnc_tensor_count(b->info));
+			ccv_half_precision_to_bfloat((uint16_t*)a->data.f16, (uint16_t*)b->data.f16, tensor_count);
+		} else if (a->info.datatype == CCV_16BF && b->info.datatype == CCV_16F) {
+			assert(CCV_IS_TENSOR_CONTIGUOUS(a));
+			assert(CCV_IS_TENSOR_CONTIGUOUS(b));
+			const int tensor_count = ccv_nnc_tensor_count(a->info);
+			assert(tensor_count == ccv_nnc_tensor_count(b->info));
+			ccv_bfloat_to_half_precision((uint16_t*)a->data.f16, (uint16_t*)b->data.f16, tensor_count);
+		} else if (a->info.datatype == CCV_32F && b->info.datatype == CCV_16BF) {
+			assert(CCV_IS_TENSOR_CONTIGUOUS(a));
+			assert(CCV_IS_TENSOR_CONTIGUOUS(b));
+			const size_t tensor_count = ccv_nnc_tensor_count(a->info);
+			assert(tensor_count == ccv_nnc_tensor_count(b->info));
+			ccv_float_to_bfloat(a->data.f32, (uint16_t*)b->data.f16, tensor_count);
+		} else if (a->info.datatype == CCV_16BF && b->info.datatype == CCV_32F) {
+			assert(CCV_IS_TENSOR_CONTIGUOUS(a));
+			assert(CCV_IS_TENSOR_CONTIGUOUS(b));
+			const int tensor_count = ccv_nnc_tensor_count(a->info);
+			assert(tensor_count == ccv_nnc_tensor_count(b->info));
+			ccv_bfloat_to_float((uint16_t*)a->data.f16, b->data.f32, tensor_count);
+		} else if (a->info.datatype == CCV_64F && b->info.datatype == CCV_16BF) {
+			assert(CCV_IS_TENSOR_CONTIGUOUS(a));
+			assert(CCV_IS_TENSOR_CONTIGUOUS(b));
+			const size_t tensor_count = ccv_nnc_tensor_count(a->info);
+			assert(tensor_count == ccv_nnc_tensor_count(b->info));
+			ccv_double_to_bfloat(a->data.f64, (uint16_t*)b->data.f16, tensor_count);
+		} else if (a->info.datatype == CCV_16BF && b->info.datatype == CCV_64F) {
+			assert(CCV_IS_TENSOR_CONTIGUOUS(a));
+			assert(CCV_IS_TENSOR_CONTIGUOUS(b));
+			const int tensor_count = ccv_nnc_tensor_count(a->info);
+			assert(tensor_count == ccv_nnc_tensor_count(b->info));
+			ccv_bfloat_to_double((uint16_t*)a->data.f16, b->data.f64, tensor_count);
 		}
 	}
 	return CCV_NNC_EXEC_SUCCESS;
@@ -1262,7 +1371,7 @@ static int _ccv_nnc_datatype_conversion(const ccv_nnc_cmd_t cmd, const ccv_nnc_h
 REGISTER_COMMAND_BACKEND(CCV_NNC_DATATYPE_CONVERSION_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
 {
 	registry->tensor_formats = CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_CHWN;
-	registry->tensor_datatypes = CCV_64F | CCV_32F | CCV_16F;
+	registry->tensor_datatypes = CCV_64F | CCV_32F | CCV_16F | CCV_16BF;
 	registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
 	registry->algorithms = 1;
 	registry->exec = _ccv_nnc_datatype_conversion;
@@ -1271,7 +1380,7 @@ REGISTER_COMMAND_BACKEND(CCV_NNC_DATATYPE_CONVERSION_FORWARD, CCV_NNC_BACKEND_CP
 REGISTER_COMMAND_BACKEND(CCV_NNC_DATATYPE_CONVERSION_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
 {
 	registry->tensor_formats = CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_CHWN;
-	registry->tensor_datatypes = CCV_64F | CCV_32F | CCV_16F;
+	registry->tensor_datatypes = CCV_64F | CCV_32F | CCV_16F | CCV_16BF;
 	registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
 	registry->algorithms = 1;
 	registry->exec = _ccv_nnc_datatype_conversion;
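
Not part of the diff, but a minimal usage sketch of the paths this change enables, written in the style of the nnc test programs. It assumes the CPU_TENSOR_NHWC, TENSOR_LIST and CMD_* convenience macros from nnc/ccv_nnc_easy.h accept the 16BF datatype token once this change is in place, and that 1.5 survives the round trip because it is exactly representable in bfloat16.

#include "ccv.h"
#include "nnc/ccv_nnc.h"
#include "nnc/ccv_nnc_easy.h"
#include <stdio.h>

int main(void)
{
	ccv_nnc_init();
	// Fill a float32 tensor with a constant via SET_FORWARD (now also registered for CCV_16BF outputs).
	ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 3), 0);
	ccv_nnc_cmd_exec(CMD_SET_FORWARD(1.5), ccv_nnc_no_hint, 0, 0, 0, TENSOR_LIST(a), 0);
	// Round-trip float32 -> bfloat16 -> float32 through the new DATATYPE_CONVERSION branches.
	ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, 2, 3), 0);
	ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 3), 0);
	ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a), TENSOR_LIST(b), 0);
	ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(c), 0);
	printf("%f -> %f\n", a->data.f32[0], c->data.f32[0]);
	ccv_nnc_tensor_free(a);
	ccv_nnc_tensor_free(b);
	ccv_nnc_tensor_free(c);
	return 0;
}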