diff --git a/cpp/command/sandbox.cpp b/cpp/command/sandbox.cpp index a8dbc358c..573df0c59 100644 --- a/cpp/command/sandbox.cpp +++ b/cpp/command/sandbox.cpp @@ -273,8 +273,9 @@ int MainCmds::sandbox() { if(!builder) throw StringError("sandbox: failed to create TensorRT builder"); - const auto explicitBatch = 1U << static_cast(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH); - auto network = unique_ptr(builder->createNetworkV2(explicitBatch)); + // TensorRT 11 networks are always explicit-batch and strongly typed; createNetworkV2 takes no + // flags (the kEXPLICIT_BATCH NetworkDefinitionCreationFlag was removed). + auto network = unique_ptr(builder->createNetworkV2(0U)); if(!network) throw StringError("sandbox: failed to create TensorRT network"); diff --git a/cpp/neuralnet/onnxmodelbuilder.cpp b/cpp/neuralnet/onnxmodelbuilder.cpp index 991c79e90..c85580c80 100644 --- a/cpp/neuralnet/onnxmodelbuilder.cpp +++ b/cpp/neuralnet/onnxmodelbuilder.cpp @@ -1,6 +1,12 @@ #include "../neuralnet/onnxmodelbuilder.h" #include +#include +#include +#include +#include +#include +#include #include "../core/global.h" #include "../core/test.h" @@ -806,12 +812,190 @@ struct Builder { namespace OnnxModelBuilder { +// ---- FP16 conversion (for the TensorRT 11 strongly-typed backend) ---- +// +// The builder above always emits an FP32 graph. TensorRT 11 networks are strongly typed, so the +// only way to run the trunk in FP16 is to make the ONNX graph itself FP16. This pass rewrites a +// finished FP32 graph into mixed precision: every node runs FP16 except the numerically-sensitive +// nodes named in keepFP32 (RMSNorm square/reduce/sqrt reductions + the trunk tip and policy/value +// heads), which stay FP32. Graph inputs and outputs stay FP32 (KataGo feeds/reads FP32 buffers), +// so casts are inserted wherever an edge crosses an FP16<->FP32 boundary, and float weight +// initializers consumed only by FP16 nodes are converted to FP16. 
// This reproduces the precision policy the old weakly-typed path expressed via
// setPrecision()+kOBEY_PRECISION_CONSTRAINTS.

// Convert an IEEE-754 single-precision value to half precision (binary16) bits.
//
// Rounding is round-to-nearest, ties-to-even. Special cases:
//   - +-Inf maps to +-Inf (0x7c00 | sign).
//   - Any NaN maps to a canonical quiet NaN (0x7e00 | sign); the payload is NOT preserved.
//   - Any *finite* input whose magnitude would not fit in a finite half - either because its exponent
//     is already out of range, or because rounding carries it past the largest finite half - is
//     clamped to the max finite half (+-65504) rather than promoted to Inf. NN weights are never
//     meant to be infinite, and KataGo uses large sentinel constants (e.g. the 1e9 attention
//     off-board mask bias) that must stay finite in FP16 - an Inf there yields 0*Inf = NaN in the
//     attention softmax. Clamping preserves the intended semantics (a huge-but-finite negative bias
//     still drives softmax to ~0). Matches onnxconverter-common.
//   - Magnitudes below half of the smallest half subnormal flush to signed zero; float subnormals
//     therefore always become signed zero.
//
// Returns the 16 raw bits of the half-precision value.
static uint16_t floatToHalf(float f) {
  constexpr uint32_t kHalfInf = 0x7c00u;        // exponent all ones, mantissa zero
  constexpr uint32_t kHalfQuietNaN = 0x7e00u;   // exponent all ones, top mantissa bit set
  constexpr uint32_t kHalfMaxFinite = 0x7bffu;  // 65504

  uint32_t bits;
  std::memcpy(&bits, &f, sizeof(bits));
  const uint32_t sign = (bits >> 16) & 0x8000u;
  const uint32_t mant = bits & 0x007fffffu;
  const int32_t rawExp = static_cast<int32_t>((bits >> 23) & 0xffu);

  if(rawExp == 0xff)  // Inf / NaN
    return static_cast<uint16_t>(sign | (mant != 0 ? kHalfQuietNaN : kHalfInf));

  const int32_t exp = rawExp - 127 + 15;  // rebias to half
  if(exp >= 0x1f)  // finite but too large for half even before rounding: clamp
    return static_cast<uint16_t>(sign | kHalfMaxFinite);

  if(exp <= 0) {  // subnormal half or zero
    if(exp < -10)
      return static_cast<uint16_t>(sign);  // too small even for a subnormal
    const uint32_t m = mant | 0x00800000u;  // restore implicit leading 1
    const int shift = 14 - exp;             // in [14, 24]
    uint32_t half = m >> shift;
    const uint32_t rem = m & ((1u << shift) - 1u);
    const uint32_t halfway = 1u << (shift - 1);
    if(rem > halfway || (rem == halfway && (half & 1u)))
      half += 1;  // round to nearest even (may carry up to the smallest normal, which is correct)
    return static_cast<uint16_t>(sign | half);
  }

  // Normalized. Work on the sign-less magnitude so the post-rounding overflow check is a plain compare.
  uint32_t magnitude = (static_cast<uint32_t>(exp) << 10) | (mant >> 13);
  const uint32_t rem = mant & 0x1fffu;
  // Round to nearest even; a mantissa carry naturally propagates into the exponent field.
  if(rem > 0x1000u || (rem == 0x1000u && (magnitude & 1u)))
    magnitude += 1;
  // That carry can turn exponent 30 / mantissa 0x3ff into 0x7c00 (Inf) for inputs in [65520, 65536).
  // Those inputs are finite, so they get the same clamp as the exp >= 0x1f case above.
  if(magnitude >= kHalfInf)
    magnitude = kHalfMaxFinite;
  return static_cast<uint16_t>(sign | magnitude);
}
Most weights have exactly one consumer. + std::unordered_map initSawFP16, initSawFP32; + for(int i = 0; i < graph->node_size(); i++) { + const bool fp16 = nodeIsFP16(graph->node(i)); + for(const string& in : graph->node(i).input()) + if(floatInitNames.count(in)) + (fp16 ? initSawFP16 : initSawFP32)[in] = true; + } + std::set initIsFP16; + for(const string& name : floatInitNames) { + if(initSawFP16.count(name) && !initSawFP32.count(name)) { + convertInitializerToFP16(initByName[name]); + initIsFP16.insert(name); + } + } + + auto isFloatTensor = [&](const string& name) -> bool { + if(graphInputNames.count(name)) + return true; // all KataGo graph inputs are FLOAT + auto it = initByName.find(name); + if(it != initByName.end()) + return floatInitNames.count(name) > 0; // INT64 initializers are not float + return producer.count(name) > 0; // node outputs in this graph are all float + }; + auto tensorIsFP16 = [&](const string& name) -> bool { + if(graphInputNames.count(name)) + return false; + if(initByName.count(name)) + return initIsFP16.count(name) > 0; + auto it = producer.find(name); + return it != producer.end() && nodeIsFP16(graph->node(it->second)); + }; + + // Rebuild the node list, emitting any required Cast nodes just before the node that needs them so + // the result stays topologically ordered. Casts are cached by (source tensor, target precision). + std::map, string> castCache; + int castCounter = 0; + google::protobuf::RepeatedPtrField newNodes; + for(int i = 0; i < graph->node_size(); i++) { + const onnx::NodeProto& orig = graph->node(i); + const bool fp16 = nodeIsFP16(orig); + std::vector rewritten; + for(const string& in : orig.input()) { + if(in.empty() || !isFloatTensor(in) || tensorIsFP16(in) == fp16) { + rewritten.push_back(in); + continue; + } + const auto key = std::make_pair(in, fp16); + auto cit = castCache.find(key); + if(cit != castCache.end()) { + rewritten.push_back(cit->second); + continue; + } + const string castOut = in + (fp16 ? 
"__tofp16_" : "__tofp32_") + Global::intToString(castCounter++); + onnx::NodeProto* c = newNodes.Add(); + c->set_op_type("Cast"); + c->set_name(castOut + "/cast"); + c->add_input(in); + c->add_output(castOut); + onnx::AttributeProto* a = c->add_attribute(); + a->set_name("to"); + a->set_type(onnx::AttributeProto::INT); + a->set_i(fp16 ? onnx::TensorProto::FLOAT16 : onnx::TensorProto::FLOAT); + castCache[key] = castOut; + rewritten.push_back(castOut); + } + onnx::NodeProto* nn = newNodes.Add(); + *nn = orig; + nn->clear_input(); + for(const string& in : rewritten) + nn->add_input(in); + } + + // Sanity (checked against the original node list, before the swap below): graph outputs must remain + // FP32, since their producers are in keepFP32 and getOutput does a flat FP32 cudaMemcpy of each + // output binding. Fail loudly rather than silently producing garbage if that ever stops holding. + for(const auto& vo : graph->output()) { + auto it = producer.find(vo.name()); + if(it != producer.end() && nodeIsFP16(graph->node(it->second))) + throw StringError("OnnxModelBuilder: FP16 conversion left graph output '" + vo.name() + "' in FP16"); + } + + graph->mutable_node()->Swap(&newNodes); +} + Result build( const ModelDesc& desc, int nnXLen, int nnYLen, bool requireExactNNLen, bool transformerNHWC, + bool useFP16, Logger* logger ) { if(desc.metaEncoderVersion > 0) @@ -1051,6 +1235,17 @@ Result build( b.recordNodesSince(trunkTipAndHeadStart, b.trunkTipAndHeadNodeNames); + // For TensorRT 11 strongly-typed engines: rewrite the finished FP32 graph into mixed FP16/FP32, + // keeping the RMSNorm reductions and the trunk-tip + heads in FP32 (the same regions the old + // weakly-typed path pinned via setPrecision). Inputs/outputs stay FP32. 
+ if(useFP16) { + std::set keepFP32(b.trunkTipAndHeadNodeNames.begin(), b.trunkTipAndHeadNodeNames.end()); + keepFP32.insert(b.rmsNormNodeNames.begin(), b.rmsNormNodeNames.end()); + convertGraphToFloat16(graph, keepFP32); + if(logger != NULL) + logger->write("OnnxModelBuilder: converted trunk to FP16 (" + Global::intToString((int)keepFP32.size()) + " nodes kept FP32)"); + } + // DEBUG (kept commented out): expose every internal node output as an extra FP32 graph output so the // backend can dump per-layer activations for FP16-vs-FP32 *numerical* divergence analysis. This is // complementary to the trtDumpDebugPlanToDir engine dump (which shows fusion structure and boundary diff --git a/cpp/neuralnet/onnxmodelbuilder.h b/cpp/neuralnet/onnxmodelbuilder.h index 10d381915..3837592fa 100644 --- a/cpp/neuralnet/onnxmodelbuilder.h +++ b/cpp/neuralnet/onnxmodelbuilder.h @@ -31,13 +31,17 @@ namespace OnnxModelBuilder { std::vector rmsNormNodeNames; // every RMSNorm (transformer + trunk-tip) op }; - // Build a serialized ONNX ModelProto for the given model. + // Build a serialized ONNX ModelProto for the given model. When useFP16 is set, the finished graph + // is rewritten to run the trunk in FP16 with the numerically-sensitive regions (RMSNorm reductions, + // trunk tip, policy/value heads) and the graph inputs/outputs kept in FP32 (see convertGraphToFloat16 + // in the .cpp). This is what makes FP16 possible under TensorRT 11's strongly-typed networks. Result build( const ModelDesc& desc, int nnXLen, int nnYLen, bool requireExactNNLen, bool transformerNHWC, + bool useFP16, Logger* logger ); } diff --git a/cpp/neuralnet/trtbackend.cpp b/cpp/neuralnet/trtbackend.cpp index 8af6e623a..7422d6944 100644 --- a/cpp/neuralnet/trtbackend.cpp +++ b/cpp/neuralnet/trtbackend.cpp @@ -192,7 +192,9 @@ struct ModelParser { // Bump this when between katago versions we want to forcibly drop old timing caches and plan caches. 
// Bumped 7->8 for the TensorRT ONNX overhaul (ONNX emitter as default path, NHWC trunk, FP32 pinning). - static constexpr int tuneSalt = 8; + // Bumped 8->9 for the TensorRT 11 strongly-typed port (FP32 engines; old FP16-pinned plans invalid). + // Bumped 9->10 for the strongly-typed FP16 path (FP16 trunk emitted in the ONNX graph with casts). + static constexpr int tuneSalt = 10; unique_ptr build( unique_ptr net, @@ -267,12 +269,12 @@ struct ModelParser { } else { debugOutputLayer = network->addIdentity(*tensor); } - debugOutputLayer->setOutputType(0, DataType::kFLOAT); string debugOutputName = "DBG" + to_string(hash{}(description)); auto debugOutput = debugOutputLayer->getOutput(0); network->markOutput(*debugOutput); debugOutput->setName(debugOutputName.c_str()); - debugOutput->setType(DataType::kFLOAT); + // Strongly-typed (TensorRT 11): the network is FP32, so this output is already FP32; just pin + // the layout. ITensor::setType / ILayer::setOutputType were removed. debugOutput->setAllowedFormats(1U << static_cast(TensorFormat::kLINEAR)); model->debugOutputs.emplace_back(debugOutputName, description); #else @@ -367,11 +369,9 @@ struct ModelParser { if(!model->requireExactNNLen) { maskSumLayer = network->addReduce(*inputMask, ReduceOperation::kSUM, 1U << 2 | 1U << 3, true); maskSumLayer->setName("InputMask/sum"); - maskSumLayer->setPrecision(DataType::kFLOAT); auto maskWidthLayer = network->addUnary(*maskSumLayer->getOutput(0), UnaryOperation::kSQRT); maskWidthLayer->setName("InputMask/width"); - maskWidthLayer->setPrecision(DataType::kFLOAT); auto maskScaleWeightsShift = make_unique(1); auto maskScaleWeightsScale = make_unique(1); @@ -384,7 +384,6 @@ struct ModelParser { {DataType::kFLOAT, maskScaleWeightsScale.get(), 1}, {DataType::kFLOAT, nullptr, 0}); maskScaleLayer->setName("InputMask/scale"); - maskScaleLayer->setPrecision(DataType::kFLOAT); model->extraWeights.push_back(move(maskScaleWeightsShift)); 
model->extraWeights.push_back(move(maskScaleWeightsScale)); @@ -399,7 +398,6 @@ struct ModelParser { {DataType::kFLOAT, nullptr, 0}, {DataType::kFLOAT, maskCenterSquareWeightsPower.get(), 1}); maskCenterSquareLayer->setName("InputMask/centersquare"); - maskCenterSquareLayer->setPrecision(DataType::kFLOAT); model->extraWeights.push_back(move(maskCenterSquareWeightsShift)); model->extraWeights.push_back(move(maskCenterSquareWeightsPower)); @@ -414,7 +412,6 @@ struct ModelParser { {DataType::kFLOAT, maskQuadWeightsScale.get(), 1}, {DataType::kFLOAT, nullptr, 0}); maskQuadLayer->setName("InputMask/quad"); - maskQuadLayer->setPrecision(DataType::kFLOAT); model->extraWeights.push_back(move(maskQuadWeightsShift)); model->extraWeights.push_back(move(maskQuadWeightsScale)); } else { @@ -545,7 +542,6 @@ struct ModelParser { *p1CastLayer->getOutput(0), *gpoolToBiasMulLayer->getOutput(0), ElementWiseOperation::kSUM); auto gpoolBiasLayerName = name + "/gpbias"; gpoolBiasLayer->setName(gpoolBiasLayerName.c_str()); - gpoolBiasLayer->setPrecision(DataType::kFLOAT); auto p1BatchNormLayer = buildBatchNormLayer(gpoolBiasLayer->getOutput(0), &desc->p1BN, true); auto p1ActivationLayer = buildActivationLayer(p1BatchNormLayer->getOutput(0), &desc->p1Activation, true); auto p1MaskLayer = applyMaskLayer(p1ActivationLayer, true); @@ -561,35 +557,28 @@ struct ModelParser { testAssert(desc->p2Conv.convYSize == 1); auto p2ConvLayer = buildConvLayer(p1MaskLayer->getOutput(0), &desc->p2Conv, true); - p2ConvLayer->setPrecision(DataType::kFLOAT); if(model->modelVersion >= 15) { auto gpoolToPassMulLayer = buildMatMulLayer(gpoolLayer->getOutput(0), &desc->gpoolToPassMul, true); - gpoolToPassMulLayer->setPrecision(DataType::kFLOAT); auto gpoolToPassBiasLayer = buildMatBiasLayer(gpoolToPassMulLayer->getOutput(0), &desc->gpoolToPassBias, true); auto gpoolToPassActLayer = buildActivationLayer(gpoolToPassBiasLayer->getOutput(0), &desc->passActivation, true); auto gpoolToPassMul2Layer = 
buildMatMulLayer(gpoolToPassActLayer->getOutput(0), &desc->gpoolToPassMul2, true); - gpoolToPassMul2Layer->setPrecision(DataType::kFLOAT); auto outputPolicyPass = gpoolToPassMul2Layer->getOutput(0); network->markOutput(*outputPolicyPass); outputPolicyPass->setName("OutputPolicyPass"); - outputPolicyPass->setType(DataType::kFLOAT); outputPolicyPass->setAllowedFormats(1U << static_cast(TensorFormat::kLINEAR)); } else { auto gpoolToPassMulLayer = buildMatMulLayer(gpoolLayer->getOutput(0), &desc->gpoolToPassMul, true); - gpoolToPassMulLayer->setPrecision(DataType::kFLOAT); auto outputPolicyPass = gpoolToPassMulLayer->getOutput(0); network->markOutput(*outputPolicyPass); outputPolicyPass->setName("OutputPolicyPass"); - outputPolicyPass->setType(DataType::kFLOAT); outputPolicyPass->setAllowedFormats(1U << static_cast(TensorFormat::kLINEAR)); } auto outputPolicy = p2ConvLayer->getOutput(0); network->markOutput(*outputPolicy); outputPolicy->setName("OutputPolicy"); - outputPolicy->setType(DataType::kFLOAT); outputPolicy->setAllowedFormats(1U << static_cast(TensorFormat::kLINEAR)); } @@ -628,19 +617,16 @@ struct ModelParser { auto outputValue = v3BiasLayer->getOutput(0); network->markOutput(*outputValue); outputValue->setName("OutputValue"); - outputValue->setType(DataType::kFLOAT); outputValue->setAllowedFormats(1U << static_cast(TensorFormat::kLINEAR)); auto outputScoreValue = sv3BiasLayer->getOutput(0); network->markOutput(*outputScoreValue); outputScoreValue->setName("OutputScoreValue"); - outputScoreValue->setType(DataType::kFLOAT); outputScoreValue->setAllowedFormats(1U << static_cast(TensorFormat::kLINEAR)); auto outputOwnership = vOwnershipCastLayer->getOutput(0); network->markOutput(*outputOwnership); outputOwnership->setName("OutputOwnership"); - outputOwnership->setType(DataType::kFLOAT); outputOwnership->setAllowedFormats(1U << static_cast(TensorFormat::kLINEAR)); auto modelDesc = &model->rawModel->modelDesc; @@ -756,10 +742,10 @@ struct ModelParser { 
{DataType::kFLOAT, transposedWeights.get(), static_cast(desc->weights.size())}, {DataType::kFLOAT, nullptr, 0}); matMulLayer->setName(desc->name.c_str()); - - if(forceFP32) { - matMulLayer->setPrecision(DataType::kFLOAT); - } + // Under TensorRT 11 (strongly-typed networks) there is no per-layer FP32 pinning; the whole + // ModelParser graph is FP32. The forceFP32 flag is kept on these build helpers only as a marker + // of the numerically-sensitive layers, for a future explicitly-typed FP16 emitter path. + (void)forceFP32; model->extraWeights.push_back(move(transposedWeights)); @@ -781,10 +767,7 @@ struct ModelParser { {DataType::kFLOAT, nullptr, 0}, {DataType::kFLOAT, nullptr, 0}); matBiasLayer->setName(desc->name.c_str()); - - if(forceFP32) { - matBiasLayer->setPrecision(DataType::kFLOAT); - } + (void)forceFP32; return matBiasLayer; } @@ -819,10 +802,7 @@ struct ModelParser { convLayer->setDilationNd({2, {dilationY, dilationX}}); convLayer->setPaddingMode(PaddingMode::kSAME_UPPER); convLayer->setName(desc->name.c_str()); - - if(forceFP32) { - convLayer->setPrecision(DataType::kFLOAT); - } + (void)forceFP32; return convLayer; } @@ -847,30 +827,22 @@ struct ModelParser { {DataType::kFLOAT, desc->mergedScale.data(), static_cast(numChannels)}, {DataType::kFLOAT, nullptr, 0}); bnLayer->setName(desc->name.c_str()); - - if(forceFP32) { - bnLayer->setPrecision(DataType::kFLOAT); - } + (void)forceFP32; return bnLayer; } ILayer* buildActivationLayer(ITensor* input, const ActivationLayerDesc* desc, bool forceFP32 = false) { tuneDesc += Global::strprintf(R"|("%s"(%d))|", desc->name.c_str(), desc->activation); + (void)forceFP32; // No per-layer FP32 pinning under TensorRT 11; graph is FP32 (see buildMatMulLayer). 
if(desc->activation == ACTIVATION_IDENTITY) { auto activationLayer = model->network->addIdentity(*input); activationLayer->setName(desc->name.c_str()); - if(forceFP32) { - activationLayer->setPrecision(DataType::kFLOAT); - } return activationLayer; } else if(desc->activation == ACTIVATION_RELU) { auto activationLayer = model->network->addActivation(*input, ActivationType::kRELU); activationLayer->setName(desc->name.c_str()); - if(forceFP32) { - activationLayer->setPrecision(DataType::kFLOAT); - } return activationLayer; } else if(desc->activation == ACTIVATION_MISH) { @@ -882,11 +854,6 @@ struct ModelParser { tanhLayer->setName(tanhLayerName.c_str()); auto mergeLayer = model->network->addElementWise(*input, *tanhLayer->getOutput(0), ElementWiseOperation::kPROD); mergeLayer->setName(desc->name.c_str()); - if(forceFP32) { - softplusLayer->setPrecision(DataType::kFLOAT); - tanhLayer->setPrecision(DataType::kFLOAT); - mergeLayer->setPrecision(DataType::kFLOAT); - } return mergeLayer; } else if(desc->activation == ACTIVATION_MISH_SCALE8) { @@ -900,11 +867,6 @@ struct ModelParser { tanhLayer->setName(tanhLayerName.c_str()); auto mergeLayer = model->network->addElementWise(*input, *tanhLayer->getOutput(0), ElementWiseOperation::kPROD); mergeLayer->setName(desc->name.c_str()); - if(forceFP32) { - softplusLayer->setPrecision(DataType::kFLOAT); - tanhLayer->setPrecision(DataType::kFLOAT); - mergeLayer->setPrecision(DataType::kFLOAT); - } return mergeLayer; } else { @@ -920,6 +882,7 @@ struct ModelParser { ILayer* applyGPoolLayer(ILayer* inputLayer, bool forceFP32 = false, bool isValueHead = false) { auto& network = model->network; string name = inputLayer->getName(); + (void)forceFP32; // No per-layer FP32 pinning under TensorRT 11; graph is FP32 (see buildMatMulLayer). 
ILayer* gpoolSumLayer = nullptr; ILayer* gpoolMeanLayer = nullptr; @@ -987,22 +950,6 @@ struct ModelParser { gpoolConcatLayer->setAxis(1); gpoolConcatLayer->setName(gpoolConcatLayerName.c_str()); - if(forceFP32) { - if(gpoolSumLayer) { - gpoolSumLayer->setPrecision(DataType::kFLOAT); - } - if(gpoolMaskAddLayer) { - gpoolMaskAddLayer->setPrecision(DataType::kFLOAT); - } - if(gpoolMaskShiftLayer) { - gpoolMaskShiftLayer->setPrecision(DataType::kFLOAT); - } - gpoolMeanLayer->setPrecision(DataType::kFLOAT); - gpoolMeanScaleLayer->setPrecision(DataType::kFLOAT); - gpoolConcatInputLayer3->setPrecision(DataType::kFLOAT); - gpoolConcatLayer->setPrecision(DataType::kFLOAT); - } - return gpoolConcatLayer; } @@ -1012,11 +959,10 @@ struct ModelParser { model->network->addElementWise(*inputLayer->getOutput(0), *inputMask, ElementWiseOperation::kPROD); auto maskLayerName = string(inputLayer->getName()) + "/mask"; maskLayer->setName(maskLayerName.c_str()); - if(forceFP32) { - maskLayer->setPrecision(DataType::kFLOAT); - } + (void)forceFP32; // No per-layer FP32 pinning under TensorRT 11; graph is FP32 (see buildMatMulLayer). return maskLayer; } else { + (void)forceFP32; return inputLayer; } } @@ -1182,19 +1128,19 @@ struct ComputeHandle { throw StringError("TensorRT backend: failed to create builder config"); } - usingFP16 = false; - if(builder->platformHasFastFp16()) { - if(ctx->useFP16Mode == enabled_t::True || ctx->useFP16Mode == enabled_t::Auto) { - config->setFlag(BuilderFlag::kFP16); - usingFP16 = true; - } - } else if(ctx->useFP16Mode == enabled_t::True) { - throw StringError("CUDA device does not support useFP16=true"); - } - // The ONNX path may pin specific layers to FP32 below and needs the constraint to be hard - // (kOBEY) so TensorRT cannot silently fall back to an FP16 path. The ModelParser path uses the - // softer kPREFER. We set the flag after building the network, once forceObeyPrecision is known. 
- bool forceObeyPrecision = false; + // TensorRT 11 removed weakly-typed networks and the builder-driven mixed-precision machinery this + // backend used to rely on (the kFP16 builder flag, platformHasFastFp16(), per-layer + // setPrecision()/setOutputType(), and kOBEY/kPREFER_PRECISION_CONSTRAINTS). Every network is now + // strongly typed: precision is whatever the network (or parsed ONNX graph) declares. FP16 is + // therefore expressed in the ONNX graph itself - the ONNX emitter rewrites the trunk to FP16 while + // keeping the RMSNorm reductions, trunk tip, heads, and the graph inputs/outputs in FP32 (see + // OnnxModelBuilder::build / convertGraphToFloat16). useFP16=false forces a fully-FP32 engine; the + // hand-built ModelParser path (trtDisableOnnx) has no FP16 support and always runs FP32. + usingFP16 = (ctx->useFP16Mode != enabled_t::False) && useOnnxEmit; + if(ctx->useFP16Mode == enabled_t::True && !useOnnxEmit) + logger->write( + "TensorRT backend: WARNING useFP16=true is not supported by the non-ONNX ModelParser path " + "(trtDisableOnnx); running in FP32."); // Debug plan/engine dump (trtDumpDebugPlanToDir). Build a base path inside that dir, disambiguated // by board size + precision + exact/max so the multiple engines built in one process don't collide. @@ -1207,8 +1153,9 @@ struct ComputeHandle { (usingFP16 ? "_fp16" : "_fp32") + (requireExactNNLen ? "_exact" : "_max"); } - auto network = unique_ptr( - builder->createNetworkV2(1U << static_cast(NetworkDefinitionCreationFlag::kEXPLICIT_BATCH))); + // TensorRT 11 networks are always explicit-batch and strongly typed; createNetworkV2 takes no + // flags (the kEXPLICIT_BATCH NetworkDefinitionCreationFlag was removed). 
+ auto network = unique_ptr(builder->createNetworkV2(0U)); if(!network) { throw StringError("TensorRT backend: failed to create network definition"); } @@ -1230,7 +1177,7 @@ struct ComputeHandle { if(useOnnxEmit) { logger->write("TensorRT backend: building network via ONNX emitter"); const ModelDesc& desc = loadedModel->modelDesc; - OnnxModelBuilder::Result onnxResult = OnnxModelBuilder::build(desc, ctx->nnXLen, ctx->nnYLen, requireExactNNLen, ctx->transformerNHWC, logger); + OnnxModelBuilder::Result onnxResult = OnnxModelBuilder::build(desc, ctx->nnXLen, ctx->nnYLen, requireExactNNLen, ctx->transformerNHWC, usingFP16, logger); onnxBytes = std::move(onnxResult.serializedModel); if(dumpDebugPlan) { @@ -1252,37 +1199,17 @@ struct ComputeHandle { throw StringError(msg); } - // Constrain all graph outputs to linear FP32, matching what ModelParser sets on its outputs. - // getOutput does a flat cudaMemcpy of each output buffer assuming linear layout, so without - // this the parser may leave outputs in a reformatted layout and the copy reads garbage. + // Graph outputs are always FP32 (the heads are kept FP32 even in FP16 mode); we only constrain + // them to a linear layout, since getOutput does a flat cudaMemcpy of each output buffer assuming + // linear layout. Under TensorRT 11 strongly-typed networks, ITensor::setType no longer exists - + // the tensor type is fixed by the graph. for(int i = 0; i < network->getNbOutputs(); i++) { - ITensor* out = network->getOutput(i); - out->setType(DataType::kFLOAT); - out->setAllowedFormats(1U << static_cast(TensorFormat::kLINEAR)); + network->getOutput(i)->setAllowedFormats(1U << static_cast(TensorFormat::kLINEAR)); } - // Force the numerically-sensitive regions to FP32: every RMSNorm reduction (square->reduce-> - // sqrt, which sums over many elements and loses too much precision in FP16) plus the trunk-tip - // norm and policy/value heads. 
The emitter records these layer names; we pin them via per-layer - // setPrecision + kOBEY_PRECISION_CONSTRAINTS (a hard constraint) so correctness does not depend - // on TensorRT declining to fuse a numerically-equivalent FP16 path back in. This matches the - // FP32-forcing the hand-built ModelParser path already does for its heads/gpool. - std::set fp32Names; - fp32Names.insert(onnxResult.trunkTipAndHeadNodeNames.begin(), onnxResult.trunkTipAndHeadNodeNames.end()); - fp32Names.insert(onnxResult.rmsNormNodeNames.begin(), onnxResult.rmsNormNodeNames.end()); - int pinned = 0; - for(int i = 0; i < network->getNbLayers(); i++) { - ILayer* layer = network->getLayer(i); - const char* lname = layer->getName(); - if(lname != nullptr && fp32Names.count(string(lname))) { - layer->setPrecision(DataType::kFLOAT); - for(int o = 0; o < layer->getNbOutputs(); o++) - layer->setOutputType(o, DataType::kFLOAT); - pinned++; - } - } - forceObeyPrecision = true; - logger->write(Global::strprintf("TensorRT backend: pinned %d layers to FP32 (rmsnorm + heads)", pinned)); + // No per-layer FP32 pinning here: precision is fixed in the ONNX graph. In FP16 mode the emitter + // already emitted the numerically-sensitive regions (onnxResult.trunkTipAndHeadNodeNames and + // rmsNormNodeNames) in FP32 with explicit casts; in FP32 mode the whole graph is FP32. // Set optimization profile dims for each input the parser created. auto setProfile = [&](const char* name, Dims4 minDims, Dims4 optMaxDims) { @@ -1321,9 +1248,8 @@ struct ComputeHandle { debugOutputs = model->debugOutputs; config->addOptimizationProfile(profile); - // Honor per-layer precision constraints. The ONNX path pins some layers to FP32 and needs a hard - // constraint (kOBEY) so TensorRT cannot fall back to FP16; the ModelParser path uses kPREFER. - config->setFlag(forceObeyPrecision ? 
BuilderFlag::kOBEY_PRECISION_CONSTRAINTS : BuilderFlag::kPREFER_PRECISION_CONSTRAINTS); + // No precision-constraint flag under TensorRT 11: strongly-typed networks carry their precision + // in the graph itself, so kOBEY/kPREFER_PRECISION_CONSTRAINTS no longer exist. #if NV_TENSORRT_MAJOR == 8 && NV_TENSORRT_MINOR == 5 // This is to avoid external tactic sources and tactics that have shape switching overhead