🥭CUDA(C)磁态蒙特卡洛和传输矩阵多GPU并行计算分析
1. 使用英伟达GPU、大都会和并行回火算法模拟蒙特卡洛。 2. 使用兰佐斯算法计算传输矩阵特征值。 3. 使用 Suzuki-Trotter 公式归一化量子无序系统。 4. 算法模型特征:多CUDA线程,多GPU和多任务式并行计算。
Last updated
1. 使用英伟达GPU、大都会和并行回火算法模拟蒙特卡洛。 2. 使用兰佐斯算法计算传输矩阵特征值。 3. 使用 Suzuki-Trotter 公式归一化量子无序系统。 4. 算法模型特征:多CUDA线程,多GPU和多任务式并行计算。
Last updated
Python和MATLAB自旋玻璃投资组合神经网络广义方程
Python和C++及MATLAB低温磁态机器学习模型
代码实现如下:
template <typename T, size_t BLOCK_TILE_SIZE_X, size_t BLOCK_TILE_SIZE_Y,
size_t BLOCK_TILE_SIZE_K, size_t NUM_THREADS, size_t BLOCK_TILE_SKEW_SIZE_X = 0U, size_t BLOCK_TILE_SKEW_SIZE_K = 0U>
__device__ void load_data_to_shared_memory(T const* A, size_t lda,
T const* B, size_t ldb,
T A_thread_block_tile[BLOCK_TILE_SIZE_Y][BLOCK_TILE_SIZE_K + BLOCK_TILE_SKEW_SIZE_K],
T B_thread_block_tile[BLOCK_TILE_SIZE_K][BLOCK_TILE_SIZE_X + BLOCK_TILE_SKEW_SIZE_X],
size_t thread_block_tile_idx,
size_t thread_linear_idx,
size_t m, size_t n,
size_t k)
{
#pragma unroll
for (size_t load_idx{0U};
load_idx <
(BLOCK_TILE_SIZE_Y * BLOCK_TILE_SIZE_K + NUM_THREADS - 1U) /
NUM_THREADS;
++load_idx)
{
size_t const A_thread_block_tile_row_idx{
(thread_linear_idx + load_idx * NUM_THREADS) /
BLOCK_TILE_SIZE_K};
size_t const A_thread_block_tile_col_idx{
(thread_linear_idx + load_idx * NUM_THREADS) %
BLOCK_TILE_SIZE_K};
size_t const A_row_idx{blockIdx.y * BLOCK_TILE_SIZE_Y +
A_thread_block_tile_row_idx};
size_t const A_col_idx{thread_block_tile_idx * BLOCK_TILE_SIZE_K +
A_thread_block_tile_col_idx};
T val{static_cast<T>(0)};
if (A_row_idx < m && A_col_idx < k)
{
val = A[A_row_idx * lda + A_col_idx];
}
static_assert(BLOCK_TILE_SIZE_K * BLOCK_TILE_SIZE_Y % NUM_THREADS ==
0U);
A_thread_block_tile[A_thread_block_tile_row_idx]
[A_thread_block_tile_col_idx] = val;
}
#pragma unroll
for (size_t load_idx{0U};
load_idx <
(BLOCK_TILE_SIZE_K * BLOCK_TILE_SIZE_X + NUM_THREADS - 1U) /
NUM_THREADS;
++load_idx)
{
size_t const B_thread_block_tile_row_idx{
(thread_linear_idx + load_idx * NUM_THREADS) /
BLOCK_TILE_SIZE_X};
size_t const B_thread_block_tile_col_idx{
(thread_linear_idx + load_idx * NUM_THREADS) %
BLOCK_TILE_SIZE_X};
size_t const B_row_idx{thread_block_tile_idx * BLOCK_TILE_SIZE_K +
B_thread_block_tile_row_idx};
size_t const B_col_idx{blockIdx.x * BLOCK_TILE_SIZE_X +
B_thread_block_tile_col_idx};
T val{static_cast<T>(0)};
if (B_row_idx < k && B_col_idx < n)
{
val = B[B_row_idx * ldb + B_col_idx];
}
static_assert(BLOCK_TILE_SIZE_X * BLOCK_TILE_SIZE_K % NUM_THREADS ==
0U);
B_thread_block_tile[B_thread_block_tile_row_idx]
[B_thread_block_tile_col_idx] = val;
}
}
template <typename T, size_t BLOCK_TILE_SIZE_X, size_t BLOCK_TILE_SIZE_Y,
size_t BLOCK_TILE_SIZE_K>
__global__ void gemm_v02(size_t m, size_t n, size_t k, T alpha, T const* A,
size_t lda, T const* B, size_t ldb, T beta, T* C,
size_t ldc)
{
constexpr size_t NUM_THREADS{BLOCK_TILE_SIZE_X * BLOCK_TILE_SIZE_Y};
size_t const thread_linear_idx{threadIdx.y * blockDim.x + threadIdx.x};
size_t const C_col_idx{blockIdx.x * blockDim.x + threadIdx.x};
size_t const C_row_idx{blockIdx.y * blockDim.y + threadIdx.y};
__shared__ T A_thread_block_tile[BLOCK_TILE_SIZE_Y][BLOCK_TILE_SIZE_K];
__shared__ T B_thread_block_tile[BLOCK_TILE_SIZE_K][BLOCK_TILE_SIZE_X];
size_t const num_thread_block_tiles{(k + BLOCK_TILE_SIZE_K - 1) /
BLOCK_TILE_SIZE_K};
T sum{static_cast<T>(0)};
for (size_t thread_block_tile_idx{0U};
thread_block_tile_idx < num_thread_block_tiles;
++thread_block_tile_idx)
{
load_data_to_shared_memory<T, BLOCK_TILE_SIZE_X, BLOCK_TILE_SIZE_Y,
BLOCK_TILE_SIZE_K, NUM_THREADS>(
A, lda, B, ldb, A_thread_block_tile, B_thread_block_tile,
thread_block_tile_idx, thread_linear_idx, m, n, k);
__syncthreads();
#pragma unroll
for (size_t k_i{0U}; k_i < BLOCK_TILE_SIZE_K; ++k_i)
{
sum += A_thread_block_tile[threadIdx.y][k_i] *
B_thread_block_tile[k_i][threadIdx.x];
}
__syncthreads();
}
if (C_row_idx < m && C_col_idx < n)
{
C[C_row_idx * ldc + C_col_idx] =
alpha * sum + beta * C[C_row_idx * ldc + C_col_idx];
}
}
template <typename T>
void launch_gemm_kernel_v02(size_t m, size_t n, size_t k, T const* alpha,
T const* A, size_t lda, T const* B, size_t ldb,
T const* beta, T* C, size_t ldc,
cudaStream_t stream)
{
constexpr unsigned int BLOCK_TILE_SIZE_X{32U};
constexpr unsigned int BLOCK_TILE_SIZE_Y{32U};
constexpr unsigned int BLOCK_TILE_SIZE_K{32U};
constexpr unsigned int NUM_THREADS{BLOCK_TILE_SIZE_X * BLOCK_TILE_SIZE_Y};
static_assert(BLOCK_TILE_SIZE_K * BLOCK_TILE_SIZE_Y % NUM_THREADS == 0U);
static_assert(BLOCK_TILE_SIZE_X * BLOCK_TILE_SIZE_K % NUM_THREADS == 0U);
dim3 const block_dim{BLOCK_TILE_SIZE_X, BLOCK_TILE_SIZE_Y, 1U};
dim3 const grid_dim{
(static_cast<unsigned int>(n) + block_dim.x - 1U) / block_dim.x,
(static_cast<unsigned int>(m) + block_dim.y - 1U) / block_dim.y, 1U};
gemm_v02<T, BLOCK_TILE_SIZE_X, BLOCK_TILE_SIZE_Y, BLOCK_TILE_SIZE_K>
<<<grid_dim, block_dim, 0U, stream>>>(m, n, k, *alpha, A, lda, B, ldb,
*beta, C, ldc);
CHECK_LAST_CUDA_ERROR();
}
此 FP32 广义矩阵乘法实现的性能在 NVIDIA GeForce RTX 3090 GPU 上达到 2.66 TFLOPS,比之前的实现好很多。不过,它距离 GPU 的理论峰值性能还相差甚远。
template <typename T, size_t BLOCK_TILE_SIZE_X, size_t BLOCK_TILE_SIZE_Y,
size_t BLOCK_TILE_SIZE_K, size_t THREAD_TILE_SIZE_Y>
__global__ void gemm_v03(size_t m, size_t n, size_t k, T alpha, T const* A,
size_t lda, T const* B, size_t ldb, T beta, T* C,
size_t ldc)
{
constexpr size_t NUM_THREADS{BLOCK_TILE_SIZE_X * BLOCK_TILE_SIZE_Y /
THREAD_TILE_SIZE_Y};
size_t const thread_linear_idx{threadIdx.y * blockDim.x + threadIdx.x};
__shared__ T A_thread_block_tile[BLOCK_TILE_SIZE_Y][BLOCK_TILE_SIZE_K];
__shared__ T B_thread_block_tile[BLOCK_TILE_SIZE_K][BLOCK_TILE_SIZE_X];
size_t const num_thread_block_tiles{(k + BLOCK_TILE_SIZE_K - 1) /
BLOCK_TILE_SIZE_K};
T C_thread_results[THREAD_TILE_SIZE_Y] = {static_cast<T>(0)};
for (size_t thread_block_tile_idx{0U};
thread_block_tile_idx < num_thread_block_tiles;
++thread_block_tile_idx)
{
load_data_to_shared_memory<T, BLOCK_TILE_SIZE_X, BLOCK_TILE_SIZE_Y,
BLOCK_TILE_SIZE_K, NUM_THREADS>(
A, lda, B, ldb, A_thread_block_tile, B_thread_block_tile,
thread_block_tile_idx, thread_linear_idx, m, n, k);
__syncthreads();
#pragma unroll
for (size_t k_i{0U}; k_i < BLOCK_TILE_SIZE_K; ++k_i)
{
size_t const B_thread_block_tile_row_idx{k_i};
T const B_val{
B_thread_block_tile[B_thread_block_tile_row_idx]
[thread_linear_idx % BLOCK_TILE_SIZE_X]};
#pragma unroll
for (size_t thread_tile_row_idx{0U};
thread_tile_row_idx < THREAD_TILE_SIZE_Y;
++thread_tile_row_idx)
{
size_t const A_thread_block_tile_row_idx{
thread_linear_idx / BLOCK_TILE_SIZE_X * THREAD_TILE_SIZE_Y +
thread_tile_row_idx};
size_t const A_thread_block_tile_col_idx{k_i};
T const A_val{A_thread_block_tile[A_thread_block_tile_row_idx]
[A_thread_block_tile_col_idx]};
C_thread_results[thread_tile_row_idx] += A_val * B_val;
}
}
__syncthreads();
}
#pragma unroll
for (size_t thread_tile_row_idx{0U};
thread_tile_row_idx < THREAD_TILE_SIZE_Y; ++thread_tile_row_idx)
{
size_t const C_row_idx{blockIdx.y * BLOCK_TILE_SIZE_Y +
thread_linear_idx / BLOCK_TILE_SIZE_X *
THREAD_TILE_SIZE_Y +
thread_tile_row_idx};
size_t const C_col_idx{blockIdx.x * BLOCK_TILE_SIZE_X +
thread_linear_idx % BLOCK_TILE_SIZE_X};
if (C_row_idx < m && C_col_idx < n)
{
C[C_row_idx * ldc + C_col_idx] =
alpha * C_thread_results[thread_tile_row_idx] +
beta * C[C_row_idx * ldc + C_col_idx];
}
}
}
template <typename T>
void launch_gemm_kernel_v03(size_t m, size_t n, size_t k, T const* alpha,
T const* A, size_t lda, T const* B, size_t ldb,
T const* beta, T* C, size_t ldc,
cudaStream_t stream)
{
constexpr unsigned int BLOCK_TILE_SIZE_X{64U};
constexpr unsigned int BLOCK_TILE_SIZE_Y{64U};
constexpr unsigned int BLOCK_TILE_SIZE_K{8U};
constexpr unsigned int THREAD_TILE_SIZE_Y{8U};
constexpr unsigned int NUM_THREADS_PER_BLOCK{
BLOCK_TILE_SIZE_X * BLOCK_TILE_SIZE_Y / THREAD_TILE_SIZE_Y};
static_assert(BLOCK_TILE_SIZE_Y % THREAD_TILE_SIZE_Y == 0U);
static_assert(NUM_THREADS_PER_BLOCK % BLOCK_TILE_SIZE_K == 0U);
static_assert(NUM_THREADS_PER_BLOCK % BLOCK_TILE_SIZE_X == 0U);
dim3 const block_dim{NUM_THREADS_PER_BLOCK, 1U, 1U};
dim3 const grid_dim{
(static_cast<unsigned int>(n) + BLOCK_TILE_SIZE_X - 1U) /
BLOCK_TILE_SIZE_X,
(static_cast<unsigned int>(m) + BLOCK_TILE_SIZE_Y - 1U) /
BLOCK_TILE_SIZE_Y,
1U};
gemm_v03<T, BLOCK_TILE_SIZE_X, BLOCK_TILE_SIZE_Y, BLOCK_TILE_SIZE_K,
THREAD_TILE_SIZE_Y><<<grid_dim, block_dim, 0U, stream>>>(
m, n, k, *alpha, A, lda, B, ldb, *beta, C, ldc);
CHECK_LAST_CUDA_ERROR();
}
从数学上讲,给定一个广义矩阵乘法运算 D=AB+C,其中 D∈Rm×n,A∈Rm×k,B∈Rk×n,C∈Rm×n,矩阵可以分成更小的矩阵。
我们首先将矩阵 B 的数据从共享内存缓存到寄存器中。每个具有块线程索引 (tm,tn)的线程,其中 tm∈[1,dtm/dtm]和 tn∈[1,dmm],现在负责计算小矩阵的dtm 个元素,其中dtm是线程块大小。