diff --git a/cpp/command/sandbox.cpp b/cpp/command/sandbox.cpp
index a8dbc358c..573df0c59 100644
--- a/cpp/command/sandbox.cpp
+++ b/cpp/command/sandbox.cpp
@@ -273,8 +273,9 @@ int MainCmds::sandbox() {
   if(!builder)
     throw StringError("sandbox: failed to create TensorRT builder");
 
-  const auto explicitBatch = 1U << static_cast<uint32_t>(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
-  auto network = unique_ptr<nvinfer1::INetworkDefinition>(builder->createNetworkV2(explicitBatch));
+  // TensorRT 11 networks are always explicit-batch and strongly typed, so createNetworkV2 is called
+  // with no flags set (the kEXPLICIT_BATCH NetworkDefinitionCreationFlag was removed).
+  auto network = unique_ptr<nvinfer1::INetworkDefinition>(builder->createNetworkV2(0U));
   if(!network)
     throw StringError("sandbox: failed to create TensorRT network");
 
diff --git a/cpp/neuralnet/onnxmodelbuilder.cpp b/cpp/neuralnet/onnxmodelbuilder.cpp
index 991c79e90..c85580c80 100644
--- a/cpp/neuralnet/onnxmodelbuilder.cpp
+++ b/cpp/neuralnet/onnxmodelbuilder.cpp
@@ -1,6 +1,12 @@
 #include "../neuralnet/onnxmodelbuilder.h"
 
 #include <cmath>
+#include <cstdint>
+#include <cstring>
+#include <map>
+#include <set>
+#include <unordered_map>
+#include <utility>
 
 #include "../core/global.h"
 #include "../core/test.h"
@@ -806,12 +812,190 @@ struct Builder {
 
 namespace OnnxModelBuilder {
 
+// ---- FP16 conversion (for the TensorRT 11 strongly-typed backend) ----
+//
+// The builder above always emits an FP32 graph. TensorRT 11 networks are strongly typed, so the
+// only way to run the trunk in FP16 is to make the ONNX graph itself FP16. This pass rewrites a
+// finished FP32 graph into mixed precision: every node runs FP16 except the numerically-sensitive
+// nodes named in keepFP32 (RMSNorm square/reduce/sqrt reductions + the trunk tip and policy/value
+// heads), which stay FP32. Graph inputs and outputs stay FP32 (KataGo feeds/reads FP32 buffers),
+// so casts are inserted wherever an edge crosses an FP16<->FP32 boundary, and float weight
+// initializers consumed only by FP16 nodes are converted to FP16. This reproduces the precision
+// policy the old weakly-typed path expressed via setPrecision()+kOBEY_PRECISION_CONSTRAINTS.
+
+// IEEE-754 single -> half (binary16) bit pattern, round-to-nearest-even. Inf stays Inf, any NaN maps
+// to the quiet NaN 0x7e00, tiny values become subnormals or signed zero, finite values stay finite.
+static uint16_t floatToHalf(float f) {
+  uint32_t x;
+  std::memcpy(&x, &f, sizeof(x));
+  const uint32_t sign = (x >> 16) & 0x8000u;
+  const uint32_t mant = x & 0x007fffffu;
+  const int32_t rawExp = (int32_t)((x >> 23) & 0xffu);
+  if(rawExp == 0xff)  // Inf stays Inf; a NaN payload is dropped in favour of one canonical quiet NaN
+    return (uint16_t)(sign | (mant != 0 ? 0x7e00u : 0x7c00u));
+  // A *finite* value too large for half is clamped to the max finite half (+-65504) rather than
+  // promoted to Inf. NN weights are never meant to be infinite, and KataGo uses large sentinel
+  // constants (e.g. the 1e9 attention off-board mask bias) that must stay finite in FP16 - an Inf
+  // there yields 0*Inf = NaN in the attention softmax. Clamping preserves the intended semantics
+  // (a huge-but-finite negative bias still drives softmax to ~0). Matches onnxconverter-common.
+  const uint16_t maxFinite = (uint16_t)(sign | 0x7bffu);
+  const int32_t exp = rawExp - 127 + 15;  // rebias to half
+  if(exp >= 0x1f)
+    return maxFinite;
+  if(exp <= 0) {  // subnormal half or zero
+    if(exp < -10)
+      return (uint16_t)sign;  // too small even for a subnormal
+    const uint32_t m = mant | 0x00800000u;  // restore implicit leading 1
+    const int shift = 14 - exp;             // in [14, 24]
+    const uint32_t half = m >> shift;
+    const uint32_t rem = m & ((1u << shift) - 1u);
+    const uint32_t halfway = 1u << (shift - 1);
+    const uint32_t roundUp = (rem > halfway || (rem == halfway && (half & 1u))) ? 1u : 0u;
+    return (uint16_t)(sign | (half + roundUp));  // may carry up to the smallest normal, which is correct
+  }
+  // normalized: round to nearest even. A mantissa carry propagates into the exponent field, which
+  // for exponent 30 (inputs in [65520, 65536)) would encode Inf - clamp that like any other overflow.
+  const uint32_t trunc = (uint32_t)(exp << 10) | (mant >> 13);
+  const uint32_t rem = mant & 0x1fffu;
+  const uint32_t half = trunc + ((rem > 0x1000u || (rem == 0x1000u && (trunc & 1u))) ? 1u : 0u);
+  return half >= 0x7c00u ? maxFinite : (uint16_t)(sign | half);
+}
+
+// FLOAT initializer -> FLOAT16 in place. NOTE(review): reads float_data only; confirm raw_data is unused.
+static void convertInitializerToFP16(onnx::TensorProto* init) {
+  const int count = init->float_data_size();
+  std::string packed((size_t)count * 2, '\0');  // two bytes per half, filled in below
+  for(int idx = 0; idx < count; idx++) {
+    const uint16_t bits = floatToHalf(init->float_data(idx));
+    packed[(size_t)2 * idx] = (char)(bits & 0xffu);             // low byte first (little-endian)
+    packed[(size_t)2 * idx + 1] = (char)((bits >> 8) & 0xffu);  // then the high byte
+  }
+  init->clear_float_data();
+  init->set_data_type(onnx::TensorProto::FLOAT16);
+  init->set_raw_data(packed);
+}
+
+// Rewrite an all-FP32 graph into mixed FP16/FP32 in place (mutates graph's initializers and nodes).
+// keepFP32 holds the node *names* that stay FP32; every other node becomes FP16. Casts are inserted
+// before the consuming node on precision-crossing edges; FP16-only float initializers are converted.
+static void convertGraphToFloat16(onnx::GraphProto* graph, const std::set<std::string>& keepFP32) {
+  using std::string;
+
+  auto nodeIsFP16 = [&](const onnx::NodeProto& n) { return keepFP32.count(n.name()) == 0; };
+
+  std::set<string> graphInputNames;
+  for(const auto& vi : graph->input())
+    graphInputNames.insert(vi.name());
+
+  // tensor name -> producing node index (assumes output names are unique; a duplicate overwrites)
+  std::unordered_map<string, int> producer;
+  for(int i = 0; i < graph->node_size(); i++)
+    for(const string& o : graph->node(i).output())
+      producer[o] = i;
+
+  // Classify initializers: FLOAT ones are candidates for FP16; any other dtype is left alone.
+  std::unordered_map<string, onnx::TensorProto*> initByName;
+  std::set<string> floatInitNames;
+  for(int i = 0; i < graph->initializer_size(); i++) {
+    onnx::TensorProto* init = graph->mutable_initializer(i);
+    initByName[init->name()] = init;
+    if(init->data_type() == onnx::TensorProto::FLOAT)
+      floatInitNames.insert(init->name());
+  }
+
+  // A float initializer becomes FP16 iff every node consuming it is FP16 (otherwise keep it FP32 and
+  // let cast insertion handle any FP16 consumer). An initializer with no consumer at all stays FP32.
+  std::unordered_map<string, bool> initSawFP16, initSawFP32;
+  for(int i = 0; i < graph->node_size(); i++) {
+    const bool fp16 = nodeIsFP16(graph->node(i));
+    for(const string& in : graph->node(i).input())
+      if(floatInitNames.count(in))
+        (fp16 ? initSawFP16 : initSawFP32)[in] = true;
+  }
+  std::set<string> initIsFP16;
+  for(const string& name : floatInitNames) {
+    if(initSawFP16.count(name) && !initSawFP32.count(name)) {
+      convertInitializerToFP16(initByName[name]);
+      initIsFP16.insert(name);
+    }
+  }
+
+  auto isFloatTensor = [&](const string& name) -> bool {
+    if(graphInputNames.count(name))
+      return true;  // assumes every graph input is FLOAT (not checked here)
+    auto it = initByName.find(name);
+    if(it != initByName.end())
+      return floatInitNames.count(name) > 0;  // non-FLOAT initializers never get a cast
+    return producer.count(name) > 0;          // assumes every node output is float (not checked here)
+  };
+  auto tensorIsFP16 = [&](const string& name) -> bool {
+    if(graphInputNames.count(name))
+      return false;
+    if(initByName.count(name))
+      return initIsFP16.count(name) > 0;
+    auto it = producer.find(name);
+    return it != producer.end() && nodeIsFP16(graph->node(it->second));
+  };
+
+  // Rebuild the node list, emitting any required Cast nodes just before the node that needs them so
+  // the result stays topologically ordered. Casts are cached by (source tensor, target precision).
+  std::map<std::pair<string, bool>, string> castCache;
+  int castCounter = 0;
+  google::protobuf::RepeatedPtrField<onnx::NodeProto> newNodes;
+  for(int i = 0; i < graph->node_size(); i++) {
+    const onnx::NodeProto& orig = graph->node(i);
+    const bool fp16 = nodeIsFP16(orig);
+    std::vector<string> rewritten;
+    for(const string& in : orig.input()) {
+      if(in.empty() || !isFloatTensor(in) || tensorIsFP16(in) == fp16) {  // no cast needed
+        rewritten.push_back(in);
+        continue;
+      }
+      const auto key = std::make_pair(in, fp16);  // fp16 here is the precision the consumer needs
+      auto cit = castCache.find(key);
+      if(cit != castCache.end()) {
+        rewritten.push_back(cit->second);
+        continue;
+      }
+      const string castOut = in + (fp16 ? "__tofp16_" : "__tofp32_") + Global::intToString(castCounter++);
+      onnx::NodeProto* c = newNodes.Add();
+      c->set_op_type("Cast");
+      c->set_name(castOut + "/cast");
+      c->add_input(in);
+      c->add_output(castOut);
+      onnx::AttributeProto* a = c->add_attribute();
+      a->set_name("to");
+      a->set_type(onnx::AttributeProto::INT);
+      a->set_i(fp16 ? onnx::TensorProto::FLOAT16 : onnx::TensorProto::FLOAT);
+      castCache[key] = castOut;
+      rewritten.push_back(castOut);
+    }
+    onnx::NodeProto* nn = newNodes.Add();
+    *nn = orig;  // copy the whole node, then replace only its input list
+    nn->clear_input();
+    for(const string& in : rewritten)
+      nn->add_input(in);
+  }
+
+  // Sanity (checked against the original node list, before the swap below): graph outputs must remain
+  // FP32, i.e. their producers must be in keepFP32 - the backend does a flat FP32 cudaMemcpy of each
+  // output binding. Throw StringError rather than silently producing garbage if that stops holding.
+  for(const auto& vo : graph->output()) {
+    auto it = producer.find(vo.name());
+    if(it != producer.end() && nodeIsFP16(graph->node(it->second)))
+      throw StringError("OnnxModelBuilder: FP16 conversion left graph output '" + vo.name() + "' in FP16");
+  }
+
+  graph->mutable_node()->Swap(&newNodes);  // install the rebuilt node list
+}
+
 Result build(
   const ModelDesc& desc,
   int nnXLen,
   int nnYLen,
   bool requireExactNNLen,
   bool transformerNHWC,
+  bool useFP16,
   Logger* logger
 ) {
   if(desc.metaEncoderVersion > 0)
@@ -1051,6 +1235,17 @@ Result build(
 
   b.recordNodesSince(trunkTipAndHeadStart, b.trunkTipAndHeadNodeNames);
 
+  // For TensorRT 11 strongly-typed engines: rewrite the finished FP32 graph into mixed FP16/FP32,
+  // keeping the RMSNorm reductions and the trunk-tip + heads in FP32 (the same regions the old
+  // weakly-typed path pinned via setPrecision). Inputs/outputs stay FP32.
+  if(useFP16) {
+    std::set<string> keepFP32(b.trunkTipAndHeadNodeNames.begin(), b.trunkTipAndHeadNodeNames.end());
+    keepFP32.insert(b.rmsNormNodeNames.begin(), b.rmsNormNodeNames.end());
+    convertGraphToFloat16(graph, keepFP32);
+    if(logger != NULL)
+      logger->write("OnnxModelBuilder: converted trunk to FP16 (" + Global::intToString((int)keepFP32.size()) + " nodes kept FP32)");
+  }
+
   // DEBUG (kept commented out): expose every internal node output as an extra FP32 graph output so the
   // backend can dump per-layer activations for FP16-vs-FP32 *numerical* divergence analysis. This is
   // complementary to the trtDumpDebugPlanToDir engine dump (which shows fusion structure and boundary
diff --git a/cpp/neuralnet/onnxmodelbuilder.h b/cpp/neuralnet/onnxmodelbuilder.h
index 10d381915..3837592fa 100644
--- a/cpp/neuralnet/onnxmodelbuilder.h
+++ b/cpp/neuralnet/onnxmodelbuilder.h
@@ -31,13 +31,17 @@ namespace OnnxModelBuilder {
     std::vector<std::string> rmsNormNodeNames;          // every RMSNorm (transformer + trunk-tip) op
   };
 
-  // Build a serialized ONNX ModelProto for the given model.
+  // Build a serialized ONNX ModelProto for the given model. When useFP16 is set, the finished graph
+  // is rewritten to run the trunk in FP16 with the numerically-sensitive regions (RMSNorm reductions,
+  // trunk tip, policy/value heads) and the graph inputs/outputs kept in FP32 (see convertGraphToFloat16
+  // in the .cpp). This is what makes FP16 possible under TensorRT 11's strongly-typed networks.
   Result build(
     const ModelDesc& desc,
     int nnXLen,
     int nnYLen,
     bool requireExactNNLen,
     bool transformerNHWC,
+    bool useFP16,
     Logger* logger
   );
 }
diff --git a/cpp/neuralnet/trtbackend.cpp b/cpp/neuralnet/trtbackend.cpp
index 8af6e623a..7422d6944 100644
--- a/cpp/neuralnet/trtbackend.cpp
+++ b/cpp/neuralnet/trtbackend.cpp
@@ -192,7 +192,9 @@ struct ModelParser {
 
   // Bump this when between katago versions we want to forcibly drop old timing caches and plan caches.
   // Bumped 7->8 for the TensorRT ONNX overhaul (ONNX emitter as default path, NHWC trunk, FP32 pinning).
-  static constexpr int tuneSalt = 8;
+  // Bumped 8->9 for the TensorRT 11 strongly-typed port (FP32 engines; old FP16-pinned plans invalid).
+  // Bumped 9->10 for the strongly-typed FP16 path (FP16 trunk emitted in the ONNX graph with casts).
+  static constexpr int tuneSalt = 10;
 
   unique_ptr<TRTModel> build(
     unique_ptr<INetworkDefinition> net,
@@ -267,12 +269,12 @@ struct ModelParser {
     } else {
       debugOutputLayer = network->addIdentity(*tensor);
     }
-    debugOutputLayer->setOutputType(0, DataType::kFLOAT);
     string debugOutputName = "DBG" + to_string(hash<string>{}(description));
     auto debugOutput = debugOutputLayer->getOutput(0);
     network->markOutput(*debugOutput);
     debugOutput->setName(debugOutputName.c_str());
-    debugOutput->setType(DataType::kFLOAT);
+    // Strongly-typed (TensorRT 11): the network is FP32, so this output is already FP32; just pin
+    // the layout. ITensor::setType / ILayer::setOutputType were removed.
     debugOutput->setAllowedFormats(1U << static_cast<int>(TensorFormat::kLINEAR));
     model->debugOutputs.emplace_back(debugOutputName, description);
 #else
@@ -367,11 +369,9 @@ struct ModelParser {
     if(!model->requireExactNNLen) {
       maskSumLayer = network->addReduce(*inputMask, ReduceOperation::kSUM, 1U << 2 | 1U << 3, true);
       maskSumLayer->setName("InputMask/sum");
-      maskSumLayer->setPrecision(DataType::kFLOAT);
 
       auto maskWidthLayer = network->addUnary(*maskSumLayer->getOutput(0), UnaryOperation::kSQRT);
       maskWidthLayer->setName("InputMask/width");
-      maskWidthLayer->setPrecision(DataType::kFLOAT);
 
       auto maskScaleWeightsShift = make_unique<float[]>(1);
       auto maskScaleWeightsScale = make_unique<float[]>(1);
@@ -384,7 +384,6 @@ struct ModelParser {
         {DataType::kFLOAT, maskScaleWeightsScale.get(), 1},
         {DataType::kFLOAT, nullptr, 0});
       maskScaleLayer->setName("InputMask/scale");
-      maskScaleLayer->setPrecision(DataType::kFLOAT);
       model->extraWeights.push_back(move(maskScaleWeightsShift));
       model->extraWeights.push_back(move(maskScaleWeightsScale));
 
@@ -399,7 +398,6 @@ struct ModelParser {
         {DataType::kFLOAT, nullptr, 0},
         {DataType::kFLOAT, maskCenterSquareWeightsPower.get(), 1});
       maskCenterSquareLayer->setName("InputMask/centersquare");
-      maskCenterSquareLayer->setPrecision(DataType::kFLOAT);
       model->extraWeights.push_back(move(maskCenterSquareWeightsShift));
       model->extraWeights.push_back(move(maskCenterSquareWeightsPower));
 
@@ -414,7 +412,6 @@ struct ModelParser {
         {DataType::kFLOAT, maskQuadWeightsScale.get(), 1},
         {DataType::kFLOAT, nullptr, 0});
       maskQuadLayer->setName("InputMask/quad");
-      maskQuadLayer->setPrecision(DataType::kFLOAT);
       model->extraWeights.push_back(move(maskQuadWeightsShift));
       model->extraWeights.push_back(move(maskQuadWeightsScale));
     } else {
@@ -545,7 +542,6 @@ struct ModelParser {
       *p1CastLayer->getOutput(0), *gpoolToBiasMulLayer->getOutput(0), ElementWiseOperation::kSUM);
     auto gpoolBiasLayerName = name + "/gpbias";
     gpoolBiasLayer->setName(gpoolBiasLayerName.c_str());
-    gpoolBiasLayer->setPrecision(DataType::kFLOAT);
     auto p1BatchNormLayer = buildBatchNormLayer(gpoolBiasLayer->getOutput(0), &desc->p1BN, true);
     auto p1ActivationLayer = buildActivationLayer(p1BatchNormLayer->getOutput(0), &desc->p1Activation, true);
     auto p1MaskLayer = applyMaskLayer(p1ActivationLayer, true);
@@ -561,35 +557,28 @@ struct ModelParser {
     testAssert(desc->p2Conv.convYSize == 1);
 
     auto p2ConvLayer = buildConvLayer(p1MaskLayer->getOutput(0), &desc->p2Conv, true);
-    p2ConvLayer->setPrecision(DataType::kFLOAT);
     if(model->modelVersion >= 15) {
       auto gpoolToPassMulLayer = buildMatMulLayer(gpoolLayer->getOutput(0), &desc->gpoolToPassMul, true);
-      gpoolToPassMulLayer->setPrecision(DataType::kFLOAT);
       auto gpoolToPassBiasLayer = buildMatBiasLayer(gpoolToPassMulLayer->getOutput(0), &desc->gpoolToPassBias, true);
       auto gpoolToPassActLayer = buildActivationLayer(gpoolToPassBiasLayer->getOutput(0), &desc->passActivation, true);
       auto gpoolToPassMul2Layer = buildMatMulLayer(gpoolToPassActLayer->getOutput(0), &desc->gpoolToPassMul2, true);
-      gpoolToPassMul2Layer->setPrecision(DataType::kFLOAT);
 
       auto outputPolicyPass = gpoolToPassMul2Layer->getOutput(0);
       network->markOutput(*outputPolicyPass);
       outputPolicyPass->setName("OutputPolicyPass");
-      outputPolicyPass->setType(DataType::kFLOAT);
       outputPolicyPass->setAllowedFormats(1U << static_cast<int>(TensorFormat::kLINEAR));
     } else {
       auto gpoolToPassMulLayer = buildMatMulLayer(gpoolLayer->getOutput(0), &desc->gpoolToPassMul, true);
-      gpoolToPassMulLayer->setPrecision(DataType::kFLOAT);
 
       auto outputPolicyPass = gpoolToPassMulLayer->getOutput(0);
       network->markOutput(*outputPolicyPass);
       outputPolicyPass->setName("OutputPolicyPass");
-      outputPolicyPass->setType(DataType::kFLOAT);
       outputPolicyPass->setAllowedFormats(1U << static_cast<int>(TensorFormat::kLINEAR));
     }
 
     auto outputPolicy = p2ConvLayer->getOutput(0);
     network->markOutput(*outputPolicy);
     outputPolicy->setName("OutputPolicy");
-    outputPolicy->setType(DataType::kFLOAT);
     outputPolicy->setAllowedFormats(1U << static_cast<int>(TensorFormat::kLINEAR));
   }
 
@@ -628,19 +617,16 @@ struct ModelParser {
     auto outputValue = v3BiasLayer->getOutput(0);
     network->markOutput(*outputValue);
     outputValue->setName("OutputValue");
-    outputValue->setType(DataType::kFLOAT);
     outputValue->setAllowedFormats(1U << static_cast<int>(TensorFormat::kLINEAR));
 
     auto outputScoreValue = sv3BiasLayer->getOutput(0);
     network->markOutput(*outputScoreValue);
     outputScoreValue->setName("OutputScoreValue");
-    outputScoreValue->setType(DataType::kFLOAT);
     outputScoreValue->setAllowedFormats(1U << static_cast<int>(TensorFormat::kLINEAR));
 
     auto outputOwnership = vOwnershipCastLayer->getOutput(0);
     network->markOutput(*outputOwnership);
     outputOwnership->setName("OutputOwnership");
-    outputOwnership->setType(DataType::kFLOAT);
     outputOwnership->setAllowedFormats(1U << static_cast<int>(TensorFormat::kLINEAR));
 
     auto modelDesc = &model->rawModel->modelDesc;
@@ -756,10 +742,10 @@ struct ModelParser {
       {DataType::kFLOAT, transposedWeights.get(), static_cast<int64_t>(desc->weights.size())},
       {DataType::kFLOAT, nullptr, 0});
     matMulLayer->setName(desc->name.c_str());
-
-    if(forceFP32) {
-      matMulLayer->setPrecision(DataType::kFLOAT);
-    }
+    // Under TensorRT 11 (strongly-typed networks) there is no per-layer FP32 pinning; the whole
+    // ModelParser graph is FP32. The forceFP32 flag is kept on these build helpers only as a marker
+    // of the numerically-sensitive layers, for a future explicitly-typed FP16 emitter path.
+    (void)forceFP32;
 
     model->extraWeights.push_back(move(transposedWeights));
 
@@ -781,10 +767,7 @@ struct ModelParser {
       {DataType::kFLOAT, nullptr, 0},
       {DataType::kFLOAT, nullptr, 0});
     matBiasLayer->setName(desc->name.c_str());
-
-    if(forceFP32) {
-      matBiasLayer->setPrecision(DataType::kFLOAT);
-    }
+    (void)forceFP32;
 
     return matBiasLayer;
   }
@@ -819,10 +802,7 @@ struct ModelParser {
     convLayer->setDilationNd({2, {dilationY, dilationX}});
     convLayer->setPaddingMode(PaddingMode::kSAME_UPPER);
     convLayer->setName(desc->name.c_str());
-
-    if(forceFP32) {
-      convLayer->setPrecision(DataType::kFLOAT);
-    }
+    (void)forceFP32;
 
     return convLayer;
   }
@@ -847,30 +827,22 @@ struct ModelParser {
       {DataType::kFLOAT, desc->mergedScale.data(), static_cast<int64_t>(numChannels)},
       {DataType::kFLOAT, nullptr, 0});
     bnLayer->setName(desc->name.c_str());
-
-    if(forceFP32) {
-      bnLayer->setPrecision(DataType::kFLOAT);
-    }
+    (void)forceFP32;
 
     return bnLayer;
   }
 
   ILayer* buildActivationLayer(ITensor* input, const ActivationLayerDesc* desc, bool forceFP32 = false) {
     tuneDesc += Global::strprintf(R"|("%s"(%d))|", desc->name.c_str(), desc->activation);
+    (void)forceFP32;  // No per-layer FP32 pinning under TensorRT 11; graph is FP32 (see buildMatMulLayer).
     if(desc->activation == ACTIVATION_IDENTITY) {
       auto activationLayer = model->network->addIdentity(*input);
       activationLayer->setName(desc->name.c_str());
-      if(forceFP32) {
-        activationLayer->setPrecision(DataType::kFLOAT);
-      }
       return activationLayer;
     }
     else if(desc->activation == ACTIVATION_RELU) {
       auto activationLayer = model->network->addActivation(*input, ActivationType::kRELU);
       activationLayer->setName(desc->name.c_str());
-      if(forceFP32) {
-        activationLayer->setPrecision(DataType::kFLOAT);
-      }
       return activationLayer;
     }
     else if(desc->activation == ACTIVATION_MISH) {
@@ -882,11 +854,6 @@ struct ModelParser {
       tanhLayer->setName(tanhLayerName.c_str());
       auto mergeLayer = model->network->addElementWise(*input, *tanhLayer->getOutput(0), ElementWiseOperation::kPROD);
       mergeLayer->setName(desc->name.c_str());
-      if(forceFP32) {
-        softplusLayer->setPrecision(DataType::kFLOAT);
-        tanhLayer->setPrecision(DataType::kFLOAT);
-        mergeLayer->setPrecision(DataType::kFLOAT);
-      }
       return mergeLayer;
     }
     else if(desc->activation == ACTIVATION_MISH_SCALE8) {
@@ -900,11 +867,6 @@ struct ModelParser {
       tanhLayer->setName(tanhLayerName.c_str());
       auto mergeLayer = model->network->addElementWise(*input, *tanhLayer->getOutput(0), ElementWiseOperation::kPROD);
       mergeLayer->setName(desc->name.c_str());
-      if(forceFP32) {
-        softplusLayer->setPrecision(DataType::kFLOAT);
-        tanhLayer->setPrecision(DataType::kFLOAT);
-        mergeLayer->setPrecision(DataType::kFLOAT);
-      }
       return mergeLayer;
     }
     else {
@@ -920,6 +882,7 @@ struct ModelParser {
   ILayer* applyGPoolLayer(ILayer* inputLayer, bool forceFP32 = false, bool isValueHead = false) {
     auto& network = model->network;
     string name = inputLayer->getName();
+    (void)forceFP32;  // No per-layer FP32 pinning under TensorRT 11; graph is FP32 (see buildMatMulLayer).
 
     ILayer* gpoolSumLayer = nullptr;
     ILayer* gpoolMeanLayer = nullptr;
@@ -987,22 +950,6 @@ struct ModelParser {
     gpoolConcatLayer->setAxis(1);
     gpoolConcatLayer->setName(gpoolConcatLayerName.c_str());
 
-    if(forceFP32) {
-      if(gpoolSumLayer) {
-        gpoolSumLayer->setPrecision(DataType::kFLOAT);
-      }
-      if(gpoolMaskAddLayer) {
-        gpoolMaskAddLayer->setPrecision(DataType::kFLOAT);
-      }
-      if(gpoolMaskShiftLayer) {
-        gpoolMaskShiftLayer->setPrecision(DataType::kFLOAT);
-      }
-      gpoolMeanLayer->setPrecision(DataType::kFLOAT);
-      gpoolMeanScaleLayer->setPrecision(DataType::kFLOAT);
-      gpoolConcatInputLayer3->setPrecision(DataType::kFLOAT);
-      gpoolConcatLayer->setPrecision(DataType::kFLOAT);
-    }
-
     return gpoolConcatLayer;
   }
 
@@ -1012,11 +959,10 @@ struct ModelParser {
         model->network->addElementWise(*inputLayer->getOutput(0), *inputMask, ElementWiseOperation::kPROD);
       auto maskLayerName = string(inputLayer->getName()) + "/mask";
       maskLayer->setName(maskLayerName.c_str());
-      if(forceFP32) {
-        maskLayer->setPrecision(DataType::kFLOAT);
-      }
+      (void)forceFP32;  // No per-layer FP32 pinning under TensorRT 11; graph is FP32 (see buildMatMulLayer).
       return maskLayer;
     } else {
+      (void)forceFP32;
       return inputLayer;
     }
   }
@@ -1182,19 +1128,19 @@ struct ComputeHandle {
       throw StringError("TensorRT backend: failed to create builder config");
     }
 
-    usingFP16 = false;
-    if(builder->platformHasFastFp16()) {
-      if(ctx->useFP16Mode == enabled_t::True || ctx->useFP16Mode == enabled_t::Auto) {
-        config->setFlag(BuilderFlag::kFP16);
-        usingFP16 = true;
-      }
-    } else if(ctx->useFP16Mode == enabled_t::True) {
-      throw StringError("CUDA device does not support useFP16=true");
-    }
-    // The ONNX path may pin specific layers to FP32 below and needs the constraint to be hard
-    // (kOBEY) so TensorRT cannot silently fall back to an FP16 path. The ModelParser path uses the
-    // softer kPREFER. We set the flag after building the network, once forceObeyPrecision is known.
-    bool forceObeyPrecision = false;
+    // TensorRT 11 removed weakly-typed networks and the builder-driven mixed-precision machinery this
+    // backend used to rely on (the kFP16 builder flag, platformHasFastFp16(), per-layer
+    // setPrecision()/setOutputType(), and kOBEY/kPREFER_PRECISION_CONSTRAINTS). Every network is now
+    // strongly typed: precision is whatever the network (or parsed ONNX graph) declares. FP16 is
+    // therefore expressed in the ONNX graph itself - the ONNX emitter rewrites the trunk to FP16 while
+    // keeping the RMSNorm reductions, trunk tip, heads, and the graph inputs/outputs in FP32 (see
+    // OnnxModelBuilder::build / convertGraphToFloat16). useFP16=false forces a fully-FP32 engine; the
+    // hand-built ModelParser path (trtDisableOnnx) has no FP16 support and always runs FP32.
+    usingFP16 = (ctx->useFP16Mode != enabled_t::False) && useOnnxEmit;
+    if(ctx->useFP16Mode == enabled_t::True && !useOnnxEmit)
+      logger->write(
+        "TensorRT backend: WARNING useFP16=true is not supported by the non-ONNX ModelParser path "
+        "(trtDisableOnnx); running in FP32.");
 
     // Debug plan/engine dump (trtDumpDebugPlanToDir). Build a base path inside that dir, disambiguated
     // by board size + precision + exact/max so the multiple engines built in one process don't collide.
@@ -1207,8 +1153,9 @@ struct ComputeHandle {
         (usingFP16 ? "_fp16" : "_fp32") + (requireExactNNLen ? "_exact" : "_max");
     }
 
-    auto network = unique_ptr<INetworkDefinition>(
-      builder->createNetworkV2(1U << static_cast<int>(NetworkDefinitionCreationFlag::kEXPLICIT_BATCH)));
+    // TensorRT 11 networks are always explicit-batch and strongly typed, so createNetworkV2 is called
+    // with no flags set (the kEXPLICIT_BATCH NetworkDefinitionCreationFlag was removed).
+    auto network = unique_ptr<INetworkDefinition>(builder->createNetworkV2(0U));
     if(!network) {
       throw StringError("TensorRT backend: failed to create network definition");
     }
@@ -1230,7 +1177,7 @@ struct ComputeHandle {
     if(useOnnxEmit) {
       logger->write("TensorRT backend: building network via ONNX emitter");
       const ModelDesc& desc = loadedModel->modelDesc;
-      OnnxModelBuilder::Result onnxResult = OnnxModelBuilder::build(desc, ctx->nnXLen, ctx->nnYLen, requireExactNNLen, ctx->transformerNHWC, logger);
+      OnnxModelBuilder::Result onnxResult = OnnxModelBuilder::build(desc, ctx->nnXLen, ctx->nnYLen, requireExactNNLen, ctx->transformerNHWC, usingFP16, logger);
       onnxBytes = std::move(onnxResult.serializedModel);
 
       if(dumpDebugPlan) {
@@ -1252,37 +1199,17 @@ struct ComputeHandle {
         throw StringError(msg);
       }
 
-      // Constrain all graph outputs to linear FP32, matching what ModelParser sets on its outputs.
-      // getOutput does a flat cudaMemcpy of each output buffer assuming linear layout, so without
-      // this the parser may leave outputs in a reformatted layout and the copy reads garbage.
+      // Graph outputs are always FP32 (the heads are kept FP32 even in FP16 mode); we only constrain
+      // them to a linear layout, since getOutput does a flat cudaMemcpy of each output buffer assuming
+      // linear layout. Under TensorRT 11 strongly-typed networks, ITensor::setType no longer exists -
+      // the tensor type is fixed by the graph.
       for(int i = 0; i < network->getNbOutputs(); i++) {
-        ITensor* out = network->getOutput(i);
-        out->setType(DataType::kFLOAT);
-        out->setAllowedFormats(1U << static_cast<int>(TensorFormat::kLINEAR));
+        network->getOutput(i)->setAllowedFormats(1U << static_cast<int>(TensorFormat::kLINEAR));
       }
 
-      // Force the numerically-sensitive regions to FP32: every RMSNorm reduction (square->reduce->
-      // sqrt, which sums over many elements and loses too much precision in FP16) plus the trunk-tip
-      // norm and policy/value heads. The emitter records these layer names; we pin them via per-layer
-      // setPrecision + kOBEY_PRECISION_CONSTRAINTS (a hard constraint) so correctness does not depend
-      // on TensorRT declining to fuse a numerically-equivalent FP16 path back in. This matches the
-      // FP32-forcing the hand-built ModelParser path already does for its heads/gpool.
-      std::set<string> fp32Names;
-      fp32Names.insert(onnxResult.trunkTipAndHeadNodeNames.begin(), onnxResult.trunkTipAndHeadNodeNames.end());
-      fp32Names.insert(onnxResult.rmsNormNodeNames.begin(), onnxResult.rmsNormNodeNames.end());
-      int pinned = 0;
-      for(int i = 0; i < network->getNbLayers(); i++) {
-        ILayer* layer = network->getLayer(i);
-        const char* lname = layer->getName();
-        if(lname != nullptr && fp32Names.count(string(lname))) {
-          layer->setPrecision(DataType::kFLOAT);
-          for(int o = 0; o < layer->getNbOutputs(); o++)
-            layer->setOutputType(o, DataType::kFLOAT);
-          pinned++;
-        }
-      }
-      forceObeyPrecision = true;
-      logger->write(Global::strprintf("TensorRT backend: pinned %d layers to FP32 (rmsnorm + heads)", pinned));
+      // No per-layer FP32 pinning here: precision is fixed in the ONNX graph. In FP16 mode the emitter
+      // already emitted the numerically-sensitive regions (onnxResult.trunkTipAndHeadNodeNames and
+      // rmsNormNodeNames) in FP32 with explicit casts; in FP32 mode the whole graph is FP32.
 
       // Set optimization profile dims for each input the parser created.
       auto setProfile = [&](const char* name, Dims4 minDims, Dims4 optMaxDims) {
@@ -1321,9 +1248,8 @@ struct ComputeHandle {
     debugOutputs = model->debugOutputs;
     config->addOptimizationProfile(profile);
 
-    // Honor per-layer precision constraints. The ONNX path pins some layers to FP32 and needs a hard
-    // constraint (kOBEY) so TensorRT cannot fall back to FP16; the ModelParser path uses kPREFER.
-    config->setFlag(forceObeyPrecision ? BuilderFlag::kOBEY_PRECISION_CONSTRAINTS : BuilderFlag::kPREFER_PRECISION_CONSTRAINTS);
+    // No precision-constraint flag under TensorRT 11: strongly-typed networks carry their precision
+    // in the graph itself, so kOBEY/kPREFER_PRECISION_CONSTRAINTS no longer exist.
 
 #if NV_TENSORRT_MAJOR == 8 && NV_TENSORRT_MINOR == 5
     // This is to avoid external tactic sources and tactics that have shape switching overhead