@@ -74,7 +74,7 @@ inline std::vector<int> generateIntRandomVector(size_t n, int low, int hi) {
74
74
* @param threads_per_block threads per block (defaults to THREADS_PER_BLOCK)
75
75
* @return number of blocks
76
76
*/
77
- constexpr size_t numBlocks (size_t n, size_t threads_per_block= THREADS_PER_BLOCK) {
77
+ constexpr size_t numBlocks (size_t n, size_t threads_per_block = THREADS_PER_BLOCK) {
78
78
return (n / threads_per_block + (n % threads_per_block != 0 ));
79
79
}
80
80
@@ -390,6 +390,51 @@ public:
390
390
*/
391
391
T minAbs () const ;
392
392
393
+ /* *
394
+ * Applied the Givens rotation G(i, j, c, s) on row i and column j,
395
+ * with cos θ = c, and sin θ = s. The rotation is applied in-place
396
+ * to all slices of the tensor. Recall that the Givens rotation is
397
+ *
398
+ * i j
399
+ * |1 0 ... 0 |
400
+ * |0 1 ... 0 |
401
+ * | |
402
+ * | 1 |
403
+ * i | c s |
404
+ * G' = ' '
405
+ * j | -s c |
406
+ * | 1 |
407
+ * | |
408
+ * | 0 |
409
+ *
410
+ * The right Givens rotation consists in multiplying from the right
411
+ * by G.
412
+ *
413
+ * Equivalently, the right Givens transformation performs the following
414
+ * operation on the i and j columns of this matrix:
415
+ *
416
+ * A[:, i] <-- c * A[:, i] - s * A[:, j]
417
+ * A[:, j] <-- s * A[:, i] + c * A[:, j]
418
+ *
419
+ * @param i first column index
420
+ * @param j second column index
421
+ * @param c cos θ
422
+ * @param minus_s minus sin θ
423
+ * @throws std::invalid_argument if either i or j is greater or equal ncols
424
+ */
425
+ void applyRightGivensRotation (size_t i, size_t j, const T *c, const T *minus_s);
426
+
427
+ /* *
428
+ * Performs a Givens rotation (left multiplication by G')
429
+ *
430
+ * @param i first row index
431
+ * @param j second row index
432
+ * @param c cos θ
433
+ * @param minus_s minus sin θ
434
+ * @throws std::invalid_argument if either i or j is greater or equal nrows
435
+ */
436
+ void applyLeftGivensRotation (size_t i, size_t j, const T *c, const T *minus_s);
437
+
393
438
/* *
394
439
* Batch solves `A \ b`.
395
440
* Solves `bi <- Ai \ bi` for each k-index `i`.
@@ -449,7 +494,6 @@ public:
449
494
}
450
495
451
496
friend DTensor<T> operator *(T a, DTensor &B) {
452
- size_t nrA = B.m_numRows , ncB = B.m_numCols , nmB = B.m_numMats ;
453
497
DTensor<T> result (B);
454
498
result *= a;
455
499
return result;
@@ -471,6 +515,8 @@ DTensor<T> DTensor<T>::createRandomTensor(size_t numRows, size_t numCols, size_t
471
515
auto randVec = generateIntRandomVector (numRows * numCols * numMats, low, hi);
472
516
DTensor<T> a (randVec, numRows, numCols, numMats);
473
517
return a;
518
+ } else {
519
+ throw std::invalid_argument (" [createRandomTensor] unsupported type T" );
474
520
}
475
521
}
476
522
@@ -675,6 +721,38 @@ inline double DTensor<double>::minAbs() const {
675
721
return std::signbit (hostDst) ? -hostDst : hostDst;
676
722
}
677
723
724
+ template <typename T>
725
+ void DTensor<T>::applyRightGivensRotation(size_t i, size_t j, const T *c, const T *minus_s) {
726
+ if (m_numMats > 1 ) throw std::invalid_argument (" [applyRightGivensRotation] tensors (nMat>1) not supported" );
727
+ T *col_i = m_d_data + i * m_numRows;
728
+ T *col_j = m_d_data + j * m_numRows;
729
+ if constexpr (std::is_same_v<T, double >) {
730
+ gpuErrChk (cublasDrot (Session::getInstance ().cuBlasHandle (), m_numRows,
731
+ col_i, 1 , col_j, 1 , c, minus_s));
732
+ } else if constexpr (std::is_same_v<T, float >) {
733
+ gpuErrChk (cublasSrot (Session::getInstance ().cuBlasHandle (), m_numRows,
734
+ col_i, 1 , col_j, 1 , c, minus_s));
735
+ }
736
+ }
737
+
738
+ template <typename T>
739
+ void DTensor<T>::applyLeftGivensRotation(size_t i, size_t j, const T *c, const T *minus_s) {
740
+ if (m_numMats > 1 ) throw std::invalid_argument (" [applyLeftGivensRotation] tensors (nMat>1) not supported" );
741
+ if constexpr (std::is_same_v<T, double >) {
742
+ gpuErrChk (cublasDrot (Session::getInstance ().cuBlasHandle (), m_numCols,
743
+ m_d_data + i, m_numRows,
744
+ m_d_data + j, m_numRows,
745
+ c, minus_s));
746
+ } else if constexpr (std::is_same_v<T, float >) {
747
+ gpuErrChk (cublasSrot (Session::getInstance ().cuBlasHandle (), m_numCols,
748
+ m_d_data + i, m_numRows,
749
+ m_d_data + j, m_numRows,
750
+ c, minus_s));
751
+ } else {
752
+ throw std::invalid_argument (" [applyLeftGivensRotation] Unsupported type T" );
753
+ }
754
+ }
755
+
678
756
template <typename T>
679
757
inline bool DTensor<T>::allocateOnDevice(size_t size, bool zero) {
680
758
if (size <= 0 ) return false ;
@@ -1132,7 +1210,7 @@ public:
1132
1210
DTensor<T> Si (*m_S, 2 , i, i);
1133
1211
DTensor<unsigned int > rankI (*m_rank, 2 , i, i);
1134
1212
k_countNonzeroSingularValues<T><<<numBlocks(numElS), THREADS_PER_BLOCK>>> (Si.raw (), numElS,
1135
- rankI.raw (), epsilon);
1213
+ rankI.raw (), epsilon);
1136
1214
}
1137
1215
return *m_rank;
1138
1216
}
@@ -1380,14 +1458,19 @@ public:
1380
1458
* @param b provided matrix
1381
1459
* @return status code of computation
1382
1460
*/
1383
- int leastSquares (DTensor<T> &);
1461
+ int leastSquares (DTensor<T> &b );
1384
1462
1385
1463
/* *
1386
1464
* Populate the given tensors with Q and R.
1387
1465
* Caution! This is an inefficient method: only to be used for debugging.
1388
- * @return resulting Q and R from factorisation
1466
+ *
1467
+ * @param Q matrix Q (preallocated)
1468
+ * @param R matrix R (preallocated)
1469
+ * @return status code
1470
+ *
1471
+ * @throws std::invalid_argument if Q or R have invalid dimensions
1389
1472
*/
1390
- int getQR (DTensor<T> &, DTensor<T> &);
1473
+ int getQR (DTensor<T> &Q , DTensor<T> &R );
1391
1474
1392
1475
};
1393
1476
@@ -1759,4 +1842,118 @@ inline void CholeskyBatchFactoriser<float>::solve(DTensor<float> &b) {
1759
1842
m_numMats));
1760
1843
}
1761
1844
1845
+
1846
+
1847
+ /* ================================================================================================
1848
+ * GIVENS ANNIHILATOR
1849
+ * ================================================================================================ */
1850
+
1851
+ /* *
1852
+ * GivensAnnihilator is used to apply a left Givens rotation that
1853
+ * makes a particular element (k, j) of a matrix zero by applying
1854
+ * an appropriate Givens rotation G(i, k, c, s).
1855
+ *
1856
+ * @tparam T data type of tensor (must be float or double)
1857
+ */
1858
+ TEMPLATE_WITH_TYPE_T TEMPLATE_CONSTRAINT_REQUIRES_FPX
1859
+ class GivensAnnihilator {
1860
+
1861
+ private:
1862
+ DTensor<T> *m_matrix;
1863
+ /* *
1864
+ * Auxiliary memory on the device of length 3 used to store
1865
+ * rhypot(xij, xkj), cos θ, and sin θ.
1866
+ */
1867
+ std::unique_ptr<DTensor<T>> m_d_rhyp_cos_sin;
1868
+
1869
+ void init () {
1870
+ m_d_rhyp_cos_sin = std::make_unique<DTensor<T>>(3 );
1871
+ }
1872
+
1873
+ public:
1874
+
1875
+ GivensAnnihilator () {
1876
+ init ();
1877
+ }
1878
+
1879
+ /* *
1880
+ * Constructor of GivensAnnihilator
1881
+ * @param a matrix
1882
+ * @throws std::invalid_argument if a.numMats() > 1
1883
+ */
1884
+ GivensAnnihilator (DTensor<T> &a) {
1885
+ if (a.numMats () > 1 ) {
1886
+ throw std::invalid_argument (" [GivensAnnihilator] tensors (numMats > 1) not supported" );
1887
+ }
1888
+ m_matrix = &a;
1889
+ init ();
1890
+ }
1891
+
1892
+ /* *
1893
+ * Set the reference to a matrix; this way the current
1894
+ * object can be reused
1895
+ *
1896
+ * @param a
1897
+ */
1898
+ void setMatrix (DTensor<T> &a) {
1899
+ if (a.numMats () > 1 ) {
1900
+ throw std::invalid_argument (" [GivensAnnihilator] tensors (numMats > 1) not supported" );
1901
+ }
1902
+ m_matrix = &a;
1903
+ }
1904
+
1905
+ /* *
1906
+ * Applies a left Givens rotation G(i, k, c, s) that eliminates
1907
+ * the (k, j) element of the given matrix.
1908
+ *
1909
+ * @param i row index i
1910
+ * @param k row index k
1911
+ * @param j column index j
1912
+ *
1913
+ * @throws std::invalid_argument if i, k, or j are out of bounds
1914
+ */
1915
+ void annihilate (size_t i, size_t k, size_t j);
1916
+
1917
+ };
1918
+
1919
+ TEMPLATE_WITH_TYPE_T TEMPLATE_CONSTRAINT_REQUIRES_FPX
1920
+ __global__ void k_givensAnnihilateRHypot (const T *data,
1921
+ T *res,
1922
+ size_t i, size_t k, size_t j,
1923
+ size_t nRows) {
1924
+ T xij = data[i + j * nRows];
1925
+ T xkj = data[k + j * nRows];
1926
+ res[0 ] = rhypot (xij, xkj);
1927
+ res[1 ] = xij * (*res); // cos
1928
+ res[2 ] = xkj * (*res); // -sin
1929
+ }
1930
+
1931
+ template <typename T>
1932
+ inline void GivensAnnihilator<T>::annihilate(size_t i, size_t k, size_t j) {
1933
+ /* A few checks */
1934
+ size_t nR = m_matrix->numRows (), nC = m_matrix->numCols ();
1935
+ if (i >= nR or k >= nR) throw std::invalid_argument (" [GivensAnnihilator::annihilate] invalid row index" );
1936
+ if (j >= nC) std::invalid_argument (" [GivensAnnihilator::annihilate] invalid column index j" );
1937
+
1938
+ /*
1939
+ * Pass cosine and sine as device pointers
1940
+ * (Avoid having to download first)
1941
+ */
1942
+ gpuErrChk (cublasSetPointerMode (Session::getInstance ().cuBlasHandle (), CUBLAS_POINTER_MODE_DEVICE));
1943
+
1944
+ /* Useful definitions */
1945
+ T *aux = m_d_rhyp_cos_sin->raw ();
1946
+ T *matData = m_matrix->raw ();
1947
+
1948
+ /* Call kernel to determine 1/sqrt(Ai^2 + Ak^2) */
1949
+ k_givensAnnihilateRHypot<<<1 , 1 >>> (m_matrix->raw (), aux, i, k, j, nR);
1950
+
1951
+ /* Apply Givens rotation */
1952
+ m_matrix->applyLeftGivensRotation (i, k, aux + 1 , aux + 2 );
1953
+
1954
+ /* Change back to default behaviour */
1955
+ gpuErrChk (cublasSetPointerMode (Session::getInstance ().cuBlasHandle (), CUBLAS_POINTER_MODE_HOST));
1956
+ }
1957
+
1958
+
1762
1959
#endif
0 commit comments