summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJakub Kuderski <jakub@nod-labs.com>2023-12-21 11:55:43 -0500
committerGitHub <noreply@github.com>2023-12-21 11:55:43 -0500
commit72003adf6bd44e91778c22e42e94a28c28be2339 (patch)
treefee659d46edac78de6bca626dd02199b77ba5946
parente6751c1a128320420801370ab662f213df5791b5 (diff)
[mlir][gpu] Allow subgroup reductions over 1-d vector types (#76015)
Each vector element is reduced independently, which is a form of multi-reduction. The plan is to allow for gradual lowering of multi-reduction that results in fewer `gpu.shuffle` ops at the end: 1d `vector.multi_reduction` --> 1d `gpu.subgroup_reduce` --> smaller 1d `gpu.subgroup_reduce` --> packed `gpu.shuffle` over i32. For example, we can perform 2 independent f16 reductions with a series of `gpu.shuffle` ops over i32, reducing the final number of `gpu.shuffle` ops by 2x.
-rw-r--r--mlir/include/mlir/Dialect/GPU/IR/GPUOps.td16
-rw-r--r--mlir/lib/Conversion/GPUToSPIRV/GPUToSPIRV.cpp12
-rw-r--r--mlir/lib/Dialect/GPU/IR/GPUDialect.cpp11
-rw-r--r--mlir/test/Conversion/GPUToSPIRV/reductions.mlir38
-rw-r--r--mlir/test/Dialect/GPU/invalid.mlir14
-rw-r--r--mlir/test/Dialect/GPU/ops.mlir5
6 files changed, 84 insertions, 12 deletions
diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
index c72fde2ab351..b536b6c97cef 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -19,10 +19,11 @@ include "mlir/Dialect/GPU/IR/CompilationAttrInterfaces.td"
include "mlir/Dialect/GPU/IR/CompilationAttrs.td"
include "mlir/Dialect/GPU/IR/ParallelLoopMapperAttr.td"
include "mlir/Dialect/GPU/TransformOps/GPUDeviceMappingAttr.td"
+include "mlir/IR/CommonTypeConstraints.td"
include "mlir/IR/EnumAttr.td"
-include "mlir/Interfaces/FunctionInterfaces.td"
include "mlir/IR/SymbolInterfaces.td"
include "mlir/Interfaces/DataLayoutInterfaces.td"
+include "mlir/Interfaces/FunctionInterfaces.td"
include "mlir/Interfaces/InferIntRangeInterface.td"
include "mlir/Interfaces/InferTypeOpInterface.td"
include "mlir/Interfaces/SideEffectInterfaces.td"
@@ -1023,16 +1024,23 @@ def GPU_AllReduceOp : GPU_Op<"all_reduce",
let hasRegionVerifier = 1;
}
+def AnyIntegerOrFloatOr1DVector :
+ AnyTypeOf<[AnyIntegerOrFloat, VectorOfRankAndType<[1], [AnyIntegerOrFloat]>]>;
+
def GPU_SubgroupReduceOp : GPU_Op<"subgroup_reduce", [SameOperandsAndResultType]> {
let summary = "Reduce values among subgroup.";
let description = [{
The `subgroup_reduce` op reduces the value of every work item across a
subgroup. The result is equal for all work items of a subgroup.
+ When the reduced value is of a vector type, each vector element is reduced
+ independently. Only 1-d vector types are allowed.
+
Example:
```mlir
- %1 = gpu.subgroup_reduce add %0 : (f32) -> (f32)
+ %1 = gpu.subgroup_reduce add %a : (f32) -> (f32)
+ %2 = gpu.subgroup_reduce add %b : (vector<4xf16>) -> (vector<4xf16>)
```
If `uniform` flag is set either none or all work items of a subgroup
@@ -1045,11 +1053,11 @@ def GPU_SubgroupReduceOp : GPU_Op<"subgroup_reduce", [SameOperandsAndResultType]
}];
let arguments = (ins
- AnyIntegerOrFloat:$value,
+ AnyIntegerOrFloatOr1DVector:$value,
GPU_AllReduceOperationAttr:$op,
UnitAttr:$uniform
);
- let results = (outs AnyIntegerOrFloat:$result);
+ let results = (outs AnyIntegerOrFloatOr1DVector:$result);
let assemblyFormat = [{ custom<AllReduceOperation>($op) $value
(`uniform` $uniform^)? attr-dict
diff --git a/mlir/lib/Conversion/GPUToSPIRV/GPUToSPIRV.cpp b/mlir/lib/Conversion/GPUToSPIRV/GPUToSPIRV.cpp
index d383c16949f0..d7885e035959 100644
--- a/mlir/lib/Conversion/GPUToSPIRV/GPUToSPIRV.cpp
+++ b/mlir/lib/Conversion/GPUToSPIRV/GPUToSPIRV.cpp
@@ -16,10 +16,12 @@
#include "mlir/Dialect/SPIRV/IR/SPIRVDialect.h"
#include "mlir/Dialect/SPIRV/IR/SPIRVEnums.h"
#include "mlir/Dialect/SPIRV/IR/SPIRVOps.h"
+#include "mlir/Dialect/SPIRV/IR/SPIRVTypes.h"
#include "mlir/Dialect/SPIRV/IR/TargetAndABI.h"
#include "mlir/Dialect/SPIRV/Transforms/SPIRVConversion.h"
#include "mlir/IR/BuiltinOps.h"
#include "mlir/IR/Matchers.h"
+#include "mlir/Support/LogicalResult.h"
#include "mlir/Transforms/DialectConversion.h"
#include <optional>
@@ -591,10 +593,12 @@ public:
LogicalResult
matchAndRewrite(gpu::SubgroupReduceOp op, OpAdaptor adaptor,
ConversionPatternRewriter &rewriter) const override {
- auto opType = op.getOp();
- auto result =
- createGroupReduceOp(rewriter, op.getLoc(), adaptor.getValue(), opType,
- /*isGroup*/ false, op.getUniform());
+ if (!isa<spirv::ScalarType>(adaptor.getValue().getType()))
+ return rewriter.notifyMatchFailure(op, "reduction type is not a scalar");
+
+ auto result = createGroupReduceOp(rewriter, op.getLoc(), adaptor.getValue(),
+ adaptor.getOp(),
+ /*isGroup=*/false, adaptor.getUniform());
if (!result)
return failure();
diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
index 7c3330f4c238..dd482f305fcb 100644
--- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
+++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
@@ -19,6 +19,7 @@
#include "mlir/IR/BuiltinAttributes.h"
#include "mlir/IR/BuiltinOps.h"
#include "mlir/IR/BuiltinTypes.h"
+#include "mlir/IR/Diagnostics.h"
#include "mlir/IR/DialectImplementation.h"
#include "mlir/IR/Matchers.h"
#include "mlir/IR/OpImplementation.h"
@@ -588,8 +589,16 @@ static void printAllReduceOperation(AsmPrinter &printer, Operation *op,
//===----------------------------------------------------------------------===//
LogicalResult gpu::SubgroupReduceOp::verify() {
+ Type elemType = getType();
+ if (auto vecTy = dyn_cast<VectorType>(elemType)) {
+ if (vecTy.isScalable())
+ return emitOpError() << "is not compatible with scalable vector types";
+
+ elemType = vecTy.getElementType();
+ }
+
gpu::AllReduceOperation opName = getOp();
- if (failed(verifyReduceOpAndType(opName, getType()))) {
+ if (failed(verifyReduceOpAndType(opName, elemType))) {
return emitError() << '`' << gpu::stringifyAllReduceOperation(opName)
<< "` reduction operation is not compatible with type "
<< getType();
diff --git a/mlir/test/Conversion/GPUToSPIRV/reductions.mlir b/mlir/test/Conversion/GPUToSPIRV/reductions.mlir
index af58f4173136..44f85f68587f 100644
--- a/mlir/test/Conversion/GPUToSPIRV/reductions.mlir
+++ b/mlir/test/Conversion/GPUToSPIRV/reductions.mlir
@@ -655,6 +655,26 @@ gpu.module @kernels {
// -----
+module attributes {
+ gpu.container_module,
+ spirv.target_env = #spirv.target_env<#spirv.vce<v1.3, [Kernel, Addresses, Groups, GroupNonUniformArithmetic, GroupUniformArithmeticKHR], []>, #spirv.resource_limits<>>
+} {
+
+gpu.module @kernels {
+ // CHECK-LABEL: spirv.func @test
+ // CHECK-SAME: (%[[ARG:.*]]: i32)
+ gpu.func @test(%arg : vector<1xi32>) kernel
+ attributes {spirv.entry_point_abi = #spirv.entry_point_abi<workgroup_size = [16, 1, 1]>} {
+ // CHECK: %{{.*}} = spirv.GroupNonUniformSMax "Subgroup" "Reduce" %[[ARG]] : i32
+ %r0 = gpu.subgroup_reduce maxsi %arg : (vector<1xi32>) -> (vector<1xi32>)
+ gpu.return
+ }
+}
+
+}
+
+// -----
+
// TODO: Handle boolean reductions.
module attributes {
@@ -751,3 +771,21 @@ gpu.module @kernels {
}
}
}
+
+// -----
+
+// Vector reductions need to be lowered to scalar reductions first.
+
+module attributes {
+ gpu.container_module,
+ spirv.target_env = #spirv.target_env<#spirv.vce<v1.3, [Kernel, Addresses, Groups, GroupNonUniformArithmetic, GroupUniformArithmeticKHR], []>, #spirv.resource_limits<>>
+} {
+gpu.module @kernels {
+ gpu.func @maxui(%arg : vector<2xi32>) kernel
+ attributes {spirv.entry_point_abi = #spirv.entry_point_abi<workgroup_size = [16, 1, 1]>} {
+ // expected-error @+1 {{failed to legalize operation 'gpu.subgroup_reduce'}}
+ %r0 = gpu.subgroup_reduce maxui %arg : (vector<2xi32>) -> (vector<2xi32>)
+ gpu.return
+ }
+}
+}
diff --git a/mlir/test/Dialect/GPU/invalid.mlir b/mlir/test/Dialect/GPU/invalid.mlir
index d8a40f89f80a..8a34d6432607 100644
--- a/mlir/test/Dialect/GPU/invalid.mlir
+++ b/mlir/test/Dialect/GPU/invalid.mlir
@@ -333,9 +333,17 @@ func.func @reduce_invalid_op_type_maximumf(%arg0 : i32) {
// -----
-func.func @subgroup_reduce_bad_type(%arg0 : vector<2xf32>) {
- // expected-error@+1 {{'gpu.subgroup_reduce' op operand #0 must be Integer or Float}}
- %res = gpu.subgroup_reduce add %arg0 : (vector<2xf32>) -> vector<2xf32>
+func.func @subgroup_reduce_bad_type(%arg0 : vector<2x2xf32>) {
+ // expected-error@+1 {{'gpu.subgroup_reduce' op operand #0 must be Integer or Float or vector of}}
+ %res = gpu.subgroup_reduce add %arg0 : (vector<2x2xf32>) -> vector<2x2xf32>
+ return
+}
+
+// -----
+
+func.func @subgroup_reduce_bad_type_scalable(%arg0 : vector<[2]xf32>) {
+ // expected-error@+1 {{is not compatible with scalable vector types}}
+ %res = gpu.subgroup_reduce add %arg0 : (vector<[2]xf32>) -> vector<[2]xf32>
return
}
diff --git a/mlir/test/Dialect/GPU/ops.mlir b/mlir/test/Dialect/GPU/ops.mlir
index 481934364156..605124243830 100644
--- a/mlir/test/Dialect/GPU/ops.mlir
+++ b/mlir/test/Dialect/GPU/ops.mlir
@@ -84,6 +84,8 @@ module attributes {gpu.container_module} {
%one = arith.constant 1.0 : f32
+ %vec = vector.broadcast %arg0 : f32 to vector<4xf32>
+
// CHECK: %{{.*}} = gpu.all_reduce add %{{.*}} {
// CHECK-NEXT: } : (f32) -> f32
%sum = gpu.all_reduce add %one {} : (f32) -> (f32)
@@ -98,6 +100,9 @@ module attributes {gpu.container_module} {
// CHECK: %{{.*}} = gpu.subgroup_reduce add %{{.*}} uniform : (f32) -> f32
%sum_subgroup1 = gpu.subgroup_reduce add %one uniform : (f32) -> f32
+ // CHECK: %{{.*}} = gpu.subgroup_reduce add %{{.*}} : (vector<4xf32>) -> vector<4xf32>
+ %sum_subgroup2 = gpu.subgroup_reduce add %vec : (vector<4xf32>) -> vector<4xf32>
+
%width = arith.constant 7 : i32
%offset = arith.constant 3 : i32
// CHECK: gpu.shuffle xor %{{.*}}, %{{.*}}, %{{.*}} : f32