Skip to content

Commit 4d1e384

Browse files
committed
Fix MFA v2 multi-head.
1 parent 05c73e6 commit 4d1e384

File tree

2 files changed: +35 -11 lines changed

2 files changed: +35 -11 lines changed

lib/nnc/mfa/ccv_nnc_mfa_attention.cpp

+30 -10
Original file line number | Diff line number | Diff line change
@@ -98,11 +98,11 @@ void ccv_nnc_mfa_encode_attention(mfa::context* context, ccv_nnc_mfa_attention_p
9898
pool->drain();
9999
auto kernel = pipelineValue->kernel;
100100
auto pipeline = pipelineValue->pipeline;
101-
// Allocate a new command.
101+
// Allocate a new command.
102102
auto encoder = command_batch->startCommand();
103103
encoder->setComputePipelineState(pipeline.get());
104104
encoder->setThreadgroupMemoryLength(kernel->threadgroupMemoryAllocation, 0);
105-
105+
106106
// Bind the function arguments.
107107
encoder->useResource(tensors[0], MTL::ResourceUsageRead);
108108
encoder->useResource(tensors[1], MTL::ResourceUsageRead);
@@ -146,17 +146,37 @@ void ccv_nnc_mfa_encode_attention(mfa::context* context, ccv_nnc_mfa_attention_p
146146
encoder->setBuffer(scratch, 0, AttentionOperand(AttentionOperand::L).bufferIndex());
147147
}
148148
}
149-
149+
150150
MTL::Size gridSize
151-
(ceilDivide(int64_t(hash.R), kernel->blockDimensions[0]),
152-
hash.Hq,
153-
attentionDesc.batchDimension);
151+
(ceilDivide(int64_t(hash.R), kernel->blockDimensions[0]), 1, 1);
154152
MTL::Size groupSize
155153
(int64_t(kernel->threadgroupSize), 1, 1);
156-
157-
// Dispatch the required number of threads.
158-
encoder->dispatchThreadgroups(gridSize, groupSize);
159-
154+
155+
const size_t bytesPerElement = attentionDesc.lowPrecisionInputs ? sizeof(uint16_t) : sizeof(float);
156+
for (int i = 0; i < attentionDesc.batchDimension; i++) {
157+
for (int j = 0; j < hash.Hq; j++) {
158+
encoder->setBufferOffset(tensor_offsets[0] + bytesPerElement * (i * hash.R * hash.D * hash.Hq + j * hash.D), AttentionOperand(AttentionOperand::Q).bufferIndex());
159+
encoder->setBufferOffset(tensor_offsets[1] + bytesPerElement * (i * hash.C * hash.D * hash.Hk + j * hash.D), AttentionOperand(AttentionOperand::K).bufferIndex());
160+
encoder->setBufferOffset(tensor_offsets[2] + bytesPerElement * (i * hash.C * hash.D * hash.Hk + j * hash.D), AttentionOperand(AttentionOperand::V).bufferIndex());
161+
if (attentionDesc.lowPrecisionInputs) {
162+
encoder->setBufferOffset(sizeof(float) * (i * hash.R * hash.D * hash.Hq + j * hash.D), AttentionOperand(AttentionOperand::O).bufferIndex());
163+
if (tensors[5]) {
164+
encoder->setBufferOffset(tensor_offsets[5] + sizeof(float) * (i * hash.R * hash.Hq + j * hash.R), AttentionOperand(AttentionOperand::L).bufferIndex());
165+
} else {
166+
encoder->setBufferOffset(sizeof(float) * hash.R * hash.D * hash.Hq * attentionDesc.batchDimension + sizeof(float) * (i * hash.R * hash.Hq + j * hash.R), AttentionOperand(AttentionOperand::L).bufferIndex());
167+
}
168+
} else {
169+
encoder->setBufferOffset(tensor_offsets[3] + sizeof(float) * (i * hash.R * hash.D * hash.Hq + j * hash.D), AttentionOperand(AttentionOperand::O).bufferIndex());
170+
if (tensors[5]) {
171+
encoder->setBufferOffset(tensor_offsets[5] + sizeof(float) * (i * hash.R * hash.Hq + j * hash.R), AttentionOperand(AttentionOperand::L).bufferIndex());
172+
} else {
173+
encoder->setBufferOffset(sizeof(float) * (i * hash.R * hash.Hq + j * hash.R), AttentionOperand(AttentionOperand::L).bufferIndex());
174+
}
175+
}
176+
// Dispatch the required number of threads.
177+
encoder->dispatchThreadgroups(gridSize, groupSize);
178+
}
179+
}
160180
// Finish the command.
161181
command_batch->finishCommand(encoder);
162182
if (attentionDesc.lowPrecisionInputs) {

lib/nnc/mfa/v2/AttentionKernel.cpp

+5 -1
Original file line number | Diff line number | Diff line change
@@ -520,7 +520,11 @@ std::string AttentionKernel::operandLocationWithHeadOffsetValue(AttentionOperand
520520
CodeWriter source;
521521
source.SetValue("OPERAND", operand.name());
522522
if (operand.value == AttentionOperand::L || operand.value == AttentionOperand::D) {
523-
source += "{{OPERAND}} + (gid.z * Hq + gid.y) * R\\";
523+
if (Hq > 1) {
524+
source += "{{OPERAND}} + (gid.z * Hq + gid.y) * R\\";
525+
} else {
526+
source += "{{OPERAND}} + gid.z * R\\";
527+
}
524528
} else if (Hq > 1) {
525529
source.SetValue("HEAD_DIMENSION", std::to_string(headDimension));
526530
if (!transposed(operand)) {

0 commit comments

Comments
 (0)