From ac01dd197f9fafb0301b0432da90698cd7e9517a Mon Sep 17 00:00:00 2001
From: Ben Howe <bhowe@nvidia.com>
Date: Fri, 11 Oct 2024 16:52:57 +0000
Subject: [PATCH 01/18] DCO Remediation Commit for Ben Howe <bhowe@nvidia.com>

I, Ben Howe <bhowe@nvidia.com>, hereby add my Signed-off-by to this commit: 86681ef67d3b76c0e468f6595e2c2524cf9b4b6c

Signed-off-by: Ben Howe <bhowe@nvidia.com>
Signed-off-by: Anna Gringauze <agringauze@nvidia.com>

From 21a87c1646f168a6465c3e51dc4fc510c1de9c43 Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Tue, 17 Sep 2024 14:40:45 -0700
Subject: [PATCH 02/18] State pointer synthesis for quantum hardware

Signed-off-by: Anna Gringauze <agringauze@nvidia.com>
---
 include/cudaq/Optimizer/Builder/Intrinsics.h  |   4 +
 include/cudaq/Optimizer/Transforms/Passes.td  |  38 ++++
 lib/Optimizer/Builder/Intrinsics.cpp          |   4 +
 lib/Optimizer/CodeGen/VerifyNVQIRCalls.cpp    |   3 +-
 lib/Optimizer/Transforms/CMakeLists.txt       |   1 +
 lib/Optimizer/Transforms/LiftArrayAlloc.cpp   |  11 +-
 .../Transforms/StateInitialization.cpp        | 146 +++++++++++++++
 python/runtime/cudaq/algorithms/py_state.cpp  |   5 +-
 .../cudaq/platform/py_alt_launch_kernel.cpp   |   2 +-
 runtime/common/ArgumentConversion.cpp         | 167 ++++++++++++++++--
 runtime/common/ArgumentConversion.h           |  22 ++-
 runtime/common/BaseRemoteRESTQPU.h            |  33 ++--
 runtime/common/BaseRestRemoteClient.h         |   4 +-
 runtime/common/CMakeLists.txt                 |   2 +-
 runtime/common/SimulationState.h              |  11 ++
 runtime/cudaq/CMakeLists.txt                  |   1 +
 runtime/cudaq/algorithms/get_state.h          |  12 ++
 .../rest/helpers/quantinuum/quantinuum.yml    |   2 +
 runtime/cudaq/qis/quantum_state.cpp           | 113 ++++++++++++
 runtime/cudaq/qis/quantum_state.h             | 151 ++++++++++++++++
 runtime/cudaq/qis/remote_state.cpp            |   2 +-
 runtime/cudaq/qis/remote_state.h              |   3 +-
 .../Remote-Sim/qvector_init_from_state.cpp    |  16 ++
 .../execution/qvector_init_from_state.cpp     | 147 +++++++++++++++
 targettests/execution/state_init.cpp          |   2 +-
 test/Quake/arg_subst-5.txt                    |  15 ++
 test/Quake/arg_subst-6.txt                    |  11 ++
 test/Quake/arg_subst_func.qke                 |  37 +++-
 test/Quake/state_init.qke                     |  37 ++++
 test/Quake/state_prep.qke                     |   2 +-
 tpls/Stim                                     |   2 +-
 31 files changed, 955 insertions(+), 51 deletions(-)
 create mode 100644 lib/Optimizer/Transforms/StateInitialization.cpp
 create mode 100644 runtime/cudaq/qis/quantum_state.cpp
 create mode 100644 runtime/cudaq/qis/quantum_state.h
 create mode 100644 targettests/execution/qvector_init_from_state.cpp
 create mode 100644 test/Quake/arg_subst-5.txt
 create mode 100644 test/Quake/arg_subst-6.txt
 create mode 100644 test/Quake/state_init.qke

diff --git a/include/cudaq/Optimizer/Builder/Intrinsics.h b/include/cudaq/Optimizer/Builder/Intrinsics.h
index 30ab0e696a..c05021b879 100644
--- a/include/cudaq/Optimizer/Builder/Intrinsics.h
+++ b/include/cudaq/Optimizer/Builder/Intrinsics.h
@@ -55,6 +55,10 @@ static constexpr const char createCudaqStateFromDataFP32[] =
 // Delete a state created by the runtime functions above.
 static constexpr const char deleteCudaqState[] = "__nvqpp_cudaq_state_delete";
 
+// Get the state of a kernel. This is a placeholder function: calls to it are
+// always replaced by optimization passes.
+static constexpr const char getCudaqState[] = "__nvqpp_cudaq_state_get";
+
 /// Builder for lowering the clang AST to an IR for CUDA-Q. Lowering includes
 /// the transformation of both quantum and classical computation. Different
 /// features of the CUDA-Q programming model are lowered into different dialects
diff --git a/include/cudaq/Optimizer/Transforms/Passes.td b/include/cudaq/Optimizer/Transforms/Passes.td
index 9ca3810f39..66eb4cfcb0 100644
--- a/include/cudaq/Optimizer/Transforms/Passes.td
+++ b/include/cudaq/Optimizer/Transforms/Passes.td
@@ -779,6 +779,44 @@ def DeleteStates : Pass<"delete-states", "mlir::ModuleOp"> {
   }];
 }
 
+def StateInitialization : Pass<"state-initialization", "mlir::ModuleOp"> {
+  let summary =
+    "Replace `quake.init_state` instructions with call to the kernel generating the state";
+  let description = [{
+    Argument synthesis for state pointers on quantum devices substitutes a state
+    argument with a new state created by the `__nvqpp_cudaq_state_get` intrinsic,
+    which in turn accepts the name of the synthesized kernel that generated the state.
+
+    This pass completes the replacement of `quake.init_state` instructions by:
+
+    - Replacing `quake.init_state` with a call to the kernel that the `get_state` call refers to.
+    - Removing all instructions that are no longer needed.
+
+    For example:
+
+    Before StateInitialization (state-initialization):
+    ```
+    func.func @foo() attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {
+      %0 = cc.string_literal "__nvqpp__mlirgen__test_init_state.modified_0" : !cc.ptr<!cc.array<i8 x 45>>
+      %1 = cc.cast %0 : (!cc.ptr<!cc.array<i8 x 45>>) -> !cc.ptr<i8>
+      %2 = call @__nvqpp_cudaq_state_get(%1) : (!cc.ptr<i8>) -> !cc.ptr<!cc.state>
+      %3 = call @__nvqpp_cudaq_state_numberOfQubits(%2) : (!cc.ptr<!cc.state>) -> i64
+      %4 = quake.alloca !quake.veq<?>[%3 : i64]
+      %5 = quake.init_state %4, %2 : (!quake.veq<?>, !cc.ptr<!cc.state>) -> !quake.veq<?>
+      return
+    }
+    ```
+
+    After StateInitialization (state-initialization):
+    ```
+    func.func @foo() attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {
+      %5 = call @__nvqpp__mlirgen__test_init_state.modified_0() : () -> !quake.veq<?>
+      return
+    }
+    ```
+  }];
+}
+
 def StatePreparation : Pass<"state-prep", "mlir::ModuleOp"> {
   let summary =
     "Convert state vector data into gates";
diff --git a/lib/Optimizer/Builder/Intrinsics.cpp b/lib/Optimizer/Builder/Intrinsics.cpp
index 12e430dc03..57c636e31d 100644
--- a/lib/Optimizer/Builder/Intrinsics.cpp
+++ b/lib/Optimizer/Builder/Intrinsics.cpp
@@ -261,6 +261,10 @@ static constexpr IntrinsicCode intrinsicTable[] = {
 
     {cudaq::deleteCudaqState, {}, R"#(
   func.func private @__nvqpp_cudaq_state_delete(%p : !cc.ptr<!cc.state>) -> ()
+  )#"},
+
+    {cudaq::getCudaqState, {}, R"#(
+  func.func private @__nvqpp_cudaq_state_get(%p : !cc.ptr<i8>) -> !cc.ptr<!cc.state>
   )#"},
 
     {cudaq::getNumQubitsFromCudaqState, {}, R"#(
diff --git a/lib/Optimizer/CodeGen/VerifyNVQIRCalls.cpp b/lib/Optimizer/CodeGen/VerifyNVQIRCalls.cpp
index 4de20fd7be..04eac5b06f 100644
--- a/lib/Optimizer/CodeGen/VerifyNVQIRCalls.cpp
+++ b/lib/Optimizer/CodeGen/VerifyNVQIRCalls.cpp
@@ -49,7 +49,8 @@ struct VerifyNVQIRCallOpsPass
           cudaq::getNumQubitsFromCudaqState,
           cudaq::createCudaqStateFromDataFP32,
           cudaq::createCudaqStateFromDataFP64,
-          cudaq::deleteCudaqState};
+          cudaq::deleteCudaqState,
+          cudaq::getCudaqState};
       // It must be either NVQIR extension functions or in the allowed list.
       return std::find(NVQIR_FUNCS.begin(), NVQIR_FUNCS.end(), functionName) !=
                  NVQIR_FUNCS.end() ||
diff --git a/lib/Optimizer/Transforms/CMakeLists.txt b/lib/Optimizer/Transforms/CMakeLists.txt
index a6b94d9a59..f107d78bde 100644
--- a/lib/Optimizer/Transforms/CMakeLists.txt
+++ b/lib/Optimizer/Transforms/CMakeLists.txt
@@ -50,6 +50,7 @@ add_cudaq_library(OptTransforms
   QuakeSynthesizer.cpp
   RefToVeqAlloc.cpp
   RegToMem.cpp
+  StateInitialization.cpp
   StatePreparation.cpp
   UnitarySynthesis.cpp
   WiresToWiresets.cpp
diff --git a/lib/Optimizer/Transforms/LiftArrayAlloc.cpp b/lib/Optimizer/Transforms/LiftArrayAlloc.cpp
index 9328b78896..8cf6a019f8 100644
--- a/lib/Optimizer/Transforms/LiftArrayAlloc.cpp
+++ b/lib/Optimizer/Transforms/LiftArrayAlloc.cpp
@@ -170,9 +170,10 @@ class AllocaPattern : public OpRewritePattern<cudaq::cc::AllocaOp> {
         if (auto load = dyn_cast<cudaq::cc::LoadOp>(useuser)) {
           rewriter.setInsertionPointAfter(useuser);
           LLVM_DEBUG(llvm::dbgs() << "replaced load\n");
-          rewriter.replaceOpWithNewOp<cudaq::cc::ExtractValueOp>(
-              load, eleTy, conArr,
-              ArrayRef<cudaq::cc::ExtractValueArg>{offset});
+          auto extract = rewriter.create<cudaq::cc::ExtractValueOp>(
+              loc, eleTy, conArr, ArrayRef<cudaq::cc::ExtractValueArg>{offset});
+          rewriter.replaceAllUsesWith(load, extract);
+          toErase.push_back(load);
           continue;
         }
         if (isa<cudaq::cc::StoreOp>(useuser))
@@ -199,8 +200,10 @@ class AllocaPattern : public OpRewritePattern<cudaq::cc::AllocaOp> {
       toErase.push_back(alloc);
     }
 
-    for (auto *op : toErase)
+    for (auto *op : toErase) {
+      op->dropAllUses();
       rewriter.eraseOp(op);
+    }
 
     return success();
   }
diff --git a/lib/Optimizer/Transforms/StateInitialization.cpp b/lib/Optimizer/Transforms/StateInitialization.cpp
new file mode 100644
index 0000000000..3a122f02a7
--- /dev/null
+++ b/lib/Optimizer/Transforms/StateInitialization.cpp
@@ -0,0 +1,146 @@
+/*******************************************************************************
+ * Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates.                  *
+ * All rights reserved.                                                        *
+ *                                                                             *
+ * This source code and the accompanying materials are made available under    *
+ * the terms of the Apache License 2.0 which accompanies this distribution.    *
+ ******************************************************************************/
+
+#include "PassDetails.h"
+#include "cudaq/Optimizer/Builder/Intrinsics.h"
+#include "cudaq/Optimizer/Dialect/CC/CCOps.h"
+#include "cudaq/Optimizer/Dialect/Quake/QuakeOps.h"
+#include "cudaq/Optimizer/Transforms/Passes.h"
+#include "mlir/Dialect/Complex/IR/Complex.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/IR/BuiltinOps.h"
+#include "mlir/IR/PatternMatch.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+#include "mlir/Transforms/Passes.h"
+#include <span>
+
+namespace cudaq::opt {
+#define GEN_PASS_DEF_STATEINITIALIZATION
+#include "cudaq/Optimizer/Transforms/Passes.h.inc"
+} // namespace cudaq::opt
+
+#define DEBUG_TYPE "state-initialization"
+
+using namespace mlir;
+
+namespace {
+
+static bool isCall(Operation *callOp, std::vector<const char *> &&names) {
+  // A null operation or anything other than `func.call` never matches.
+  auto call = dyn_cast_or_null<func::CallOp>(callOp);
+  if (!call)
+    return false;
+  // Without a callee symbol there is nothing to compare against `names`.
+  auto symbol = call.getCalleeAttr();
+  if (!symbol)
+    return false;
+  const std::string target = symbol.getValue().str();
+  return std::find(names.begin(), names.end(), target) != names.end();
+}
+/// True if `callOp` is a `func.call` to `cudaq::getCudaqState`.
+static bool isGetStateCall(Operation *callOp) {
+  return isCall(callOp, {cudaq::getCudaqState});
+}
+/// True if `callOp` is a `func.call` to `cudaq::getNumQubitsFromCudaqState`.
+static bool isNumberOfQubitsCall(Operation *callOp) {
+  return isCall(callOp, {cudaq::getNumQubitsFromCudaqState});
+}
+
+// clang-format off
+/// Replace `quake.init_state` by a call to a (modified) kernel that produced the state.
+/// ```
+///  %0 = cc.string_literal "callee.modified_0" : !cc.ptr<!cc.array<i8 x 27>>
+///  %1 = cc.cast %0 : (!cc.ptr<!cc.array<i8 x 27>>) -> !cc.ptr<i8>
+///  %2 = call @__nvqpp_cudaq_state_get(%1) : (!cc.ptr<i8>) -> !cc.ptr<!cc.state>
+///  %3 = call @__nvqpp_cudaq_state_numberOfQubits(%2) : (!cc.ptr<!cc.state>) -> i64
+///  %4 = quake.alloca !quake.veq<?>[%3 : i64]
+///  %5 = quake.init_state %4, %2 : (!quake.veq<?>, !cc.ptr<!cc.state>) -> !quake.veq<?>
+/// ───────────────────────────────────────────
+/// ...
+///  %5 = call @callee.modified_0() : () -> !quake.veq<?>
+/// ```
+// clang-format on
+class StateInitPattern : public OpRewritePattern<quake::InitializeStateOp> {
+public:
+  using OpRewritePattern::OpRewritePattern;
+
+  /// Matches only `init_state` ops whose state comes from a
+  /// `__nvqpp_cudaq_state_get` call fed by a casted string literal; the
+  /// literal names the kernel to call. Anything else is left untouched.
+  LogicalResult matchAndRewrite(quake::InitializeStateOp initState,
+                                PatternRewriter &rewriter) const override {
+    // Either operand may be a block argument, i.e. have no defining op.
+    auto *allocaOp = initState.getOperand(0).getDefiningOp();
+    auto *getStateOp = initState.getOperand(1).getDefiningOp();
+    if (!allocaOp || !isGetStateCall(getStateOp))
+      return failure();
+
+    auto cast = getStateOp->getOperand(0).getDefiningOp<cudaq::cc::CastOp>();
+    if (!cast)
+      return failure();
+    auto literal =
+        cast.getOperand().getDefiningOp<cudaq::cc::CreateStringLiteralOp>();
+    if (!literal)
+      return failure();
+
+    // A statically sized alloca has no size operand to inspect.
+    Operation *numOfQubits =
+        allocaOp->getNumOperands() > 0
+            ? allocaOp->getOperand(0).getDefiningOp()
+            : nullptr;
+
+    auto call = rewriter.create<func::CallOp>(
+        initState.getLoc(), initState.getType(), literal.getStringLiteral(),
+        mlir::ValueRange{});
+    // Go through the rewriter so the greedy driver learns of the erasure.
+    rewriter.replaceOp(initState, call.getResult(0));
+
+    // Tear down the now-dead producer chain, users before definitions.
+    SmallVector<Operation *> dead{allocaOp};
+    if (isNumberOfQubitsCall(numOfQubits))
+      dead.push_back(numOfQubits);
+    dead.append({getStateOp, cast.getOperation(), literal.getOperation()});
+    for (Operation *op : dead) {
+      op->dropAllUses();
+      rewriter.eraseOp(op);
+    }
+    return success();
+  }
+};
+
+class StateInitializationPass
+    : public cudaq::opt::impl::StateInitializationBase<
+          StateInitializationPass> {
+public:
+  using StateInitializationBase::StateInitializationBase;
+
+  /// Greedily applies `StateInitPattern` to every `func.func` that is an
+  /// immediate child of the module; other top-level ops are ignored.
+  void runOnOperation() override {
+    auto *ctx = &getContext();
+    auto module = getOperation();
+    // The pattern set is identical for all functions, so freeze it once
+    // rather than rebuilding it on every iteration.
+    RewritePatternSet patterns(ctx);
+    patterns.insert<StateInitPattern>(ctx);
+    FrozenRewritePatternSet frozen(std::move(patterns));
+
+    for (auto func : module.getOps<func::FuncOp>()) {
+      LLVM_DEBUG(llvm::dbgs()
+                 << "Before state initialization: " << func << '\n');
+
+      // A failure is recorded but the remaining functions are still visited.
+      if (failed(applyPatternsAndFoldGreedily(func.getOperation(), frozen)))
+        signalPassFailure();
+
+      LLVM_DEBUG(llvm::dbgs()
+                 << "After state initialization: " << func << '\n');
+    }
+  }
+};
+} // namespace
diff --git a/python/runtime/cudaq/algorithms/py_state.cpp b/python/runtime/cudaq/algorithms/py_state.cpp
index 77a8e4a36d..74e098ebbf 100644
--- a/python/runtime/cudaq/algorithms/py_state.cpp
+++ b/python/runtime/cudaq/algorithms/py_state.cpp
@@ -96,8 +96,9 @@ class PyRemoteSimulationState : public RemoteSimulationState {
     }
   }
 
-  std::pair<std::string, std::vector<void *>> getKernelInfo() const override {
-    return {kernelName, argsData->getArgs()};
+  std::optional<std::pair<std::string, std::vector<void *>>>
+  getKernelInfo() const override {
+    return std::make_pair(kernelName, argsData->getArgs());
   }
 
   std::complex<double> overlap(const cudaq::SimulationState &other) override {
diff --git a/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp b/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp
index b91627de9f..a7531f9caa 100644
--- a/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp
+++ b/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp
@@ -517,7 +517,7 @@ MlirModule synthesizeKernel(const std::string &name, MlirModule module,
   auto isLocalSimulator = platform.is_simulator() && !platform.is_emulated();
   auto isSimulator = isLocalSimulator || isRemoteSimulator;
 
-  cudaq::opt::ArgumentConverter argCon(name, unwrap(module), isSimulator);
+  cudaq::opt::ArgumentConverter argCon(name, unwrap(module));
   argCon.gen(runtimeArgs.getArgs());
   std::string kernName = cudaq::runtime::cudaqGenPrefixName + name;
   SmallVector<StringRef> kernels = {kernName};
diff --git a/runtime/common/ArgumentConversion.cpp b/runtime/common/ArgumentConversion.cpp
index 424cbd8873..83e4dd3725 100644
--- a/runtime/common/ArgumentConversion.cpp
+++ b/runtime/common/ArgumentConversion.cpp
@@ -10,6 +10,8 @@
 #include "cudaq.h"
 #include "cudaq/Optimizer/Builder/Intrinsics.h"
 #include "cudaq/Optimizer/Builder/Runtime.h"
+#include "cudaq/Optimizer/Dialect/Quake/QuakeOps.h"
+#include "cudaq/Optimizer/Dialect/Quake/QuakeTypes.h"
 #include "cudaq/Todo.h"
 #include "cudaq/qis/pauli_word.h"
 #include "cudaq/utils/registry.h"
@@ -97,11 +99,25 @@ static Value genConstant(OpBuilder &, cudaq::cc::ArrayType, void *,
                          ModuleOp substMod, llvm::DataLayout &);
 
 static Value genConstant(OpBuilder &builder, const cudaq::state *v,
-                         ModuleOp substMod, llvm::DataLayout &layout,
-                         llvm::StringRef kernelName, bool isSimulator) {
-  if (isSimulator) {
-    // The program is executed remotely, materialize the simulation data
-    // into an array and create a new state from it.
+                         llvm::DataLayout &layout,
+                         cudaq::opt::ArgumentConverter &converter) {
+  auto simState =
+      cudaq::state_helper::getSimulationState(const_cast<cudaq::state *>(v));
+
+  auto kernelName = converter.getKernelName();
+  auto sourceMod = converter.getSourceModule();
+  auto substMod = converter.getSubstitutionModule();
+
+  // If the state has amplitude data, we materialize the data as a state
+  // vector and create a new state from it.
+  // TODO: how to handle density matrices? Should we just inline calls?
+  if (simState->hasData()) {
+    // The call below might cause lazy execution of the state kernel.
+    // TODO: For the lazy execution scenario on remote simulators, the kernel
+    // info is available on the state as well, before we need to run the
+    // state kernel and compute its data (which might cause significant data
+    // transfer). Investigate if it is more performant to use the other
+    // synthesis option in that case (see the next `if`).
     auto numQubits = v->get_num_qubits();
 
     // We currently only synthesize small states.
@@ -130,11 +146,11 @@ static Value genConstant(OpBuilder &builder, const cudaq::state *v,
       std::string name =
           kernelName.str() + ".rodata_synth_" + std::to_string(counter++);
       irBuilder.genVectorOfConstants(loc, substMod, name, vec);
-      auto conGlobal = builder.create<cudaq::cc::AddressOfOp>(loc, ptrTy, name);
-      return builder.create<cudaq::cc::LoadOp>(loc, arrTy, conGlobal);
+
+      return builder.create<cudaq::cc::AddressOfOp>(loc, ptrTy, name);
     };
 
-    auto conArr = is64Bit ? genConArray.template operator()<double>()
+    auto buffer = is64Bit ? genConArray.template operator()<double>()
                           : genConArray.template operator()<float>();
 
     auto createState = is64Bit ? cudaq::createCudaqStateFromDataFP64
@@ -146,21 +162,111 @@ static Value genConstant(OpBuilder &builder, const cudaq::state *v,
     auto stateTy = cudaq::cc::StateType::get(ctx);
     auto statePtrTy = cudaq::cc::PointerType::get(stateTy);
     auto i8PtrTy = cudaq::cc::PointerType::get(builder.getI8Type());
-    auto buffer = builder.create<cudaq::cc::AllocaOp>(loc, arrTy);
-    builder.create<cudaq::cc::StoreOp>(loc, conArr, buffer);
 
     auto cast = builder.create<cudaq::cc::CastOp>(loc, i8PtrTy, buffer);
     auto statePtr = builder
                         .create<func::CallOp>(loc, statePtrTy, createState,
                                               ValueRange{cast, arrSize})
                         .getResult(0);
+    return builder.create<cudaq::cc::CastOp>(loc, statePtrTy, statePtr);
+  }
+
+  // For quantum hardware, replace states with calls to kernels that generated
+  // them.
+  if (simState->getKernelInfo().has_value()) {
+    auto [calleeName, calleeArgs] = simState->getKernelInfo().value();
+
+    std::string calleeKernelName =
+        cudaq::runtime::cudaqGenPrefixName + calleeName;
+
+    auto ctx = builder.getContext();
+    auto loc = builder.getUnknownLoc();
 
-    // TODO: Delete the new state before function exit.
+    auto code = cudaq::get_quake_by_name(calleeName, /*throwException=*/false);
+    assert(!code.empty() && "Quake code not found for callee");
+    auto fromModule = parseSourceString<ModuleOp>(code, ctx);
+
+    static unsigned counter = 0;
+    std::string modifiedCalleeName =
+        calleeName + ".modified_" + std::to_string(counter++);
+    std::string modifiedCalleeKernelName =
+        cudaq::runtime::cudaqGenPrefixName + modifiedCalleeName;
+
+    // Create callee.modified that returns concat of veq allocations.
+    auto calleeFunc = fromModule->lookupSymbol<func::FuncOp>(calleeKernelName);
+    assert(calleeFunc && "callee is missing");
+    auto argTypes = calleeFunc.getArgumentTypes();
+    auto retType = quake::VeqType::getUnsized(ctx);
+    auto funcTy = FunctionType::get(ctx, argTypes, {retType});
+
+    {
+      OpBuilder::InsertionGuard guard(builder);
+      builder.setInsertionPointToEnd(sourceMod.getBody());
+
+      auto modifiedCalleeFunc = cast<func::FuncOp>(builder.clone(*calleeFunc));
+      modifiedCalleeFunc.setName(modifiedCalleeKernelName);
+      modifiedCalleeFunc.setType(funcTy);
+      modifiedCalleeFunc.setPrivate();
+
+      OpBuilder modifiedBuilder(ctx);
+      SmallVector<Value> allocations;
+      SmallVector<Operation *> cleanUps;
+      for (auto &op : modifiedCalleeFunc.getOps()) {
+        if (auto alloc = dyn_cast<quake::AllocaOp>(op)) {
+          allocations.push_back(alloc.getResult());
+          // Replace by the result of quake.init_state if used by it
+          for (auto *user : op.getUsers()) {
+            if (auto init = dyn_cast<quake::InitializeStateOp>(*user)) {
+              allocations.pop_back();
+              allocations.push_back(init.getResult());
+            }
+          }
+        }
+        if (auto retOp = dyn_cast<func::ReturnOp>(op)) {
+          if (retOp.getOperands().size() == 0) {
+            modifiedBuilder.setInsertionPointAfter(retOp);
+            assert(allocations.size() > 0 && "No veq allocations found");
+            Value ret = modifiedBuilder.create<quake::ConcatOp>(
+                loc, quake::VeqType::getUnsized(ctx), allocations);
+            modifiedBuilder.create<func::ReturnOp>(loc, ret);
+            cleanUps.push_back(retOp);
+          }
+        }
+      }
+      for (auto *op : cleanUps) {
+        op->dropAllUses();
+        op->erase();
+      }
+    }
+
+    // Create substitutions for the `callee.modified.N`.
+    converter.genCallee(modifiedCalleeName, calleeArgs);
+
+    // Create a subst for state pointer.
+    auto strLitTy = cudaq::cc::PointerType::get(
+        cudaq::cc::ArrayType::get(builder.getContext(), builder.getI8Type(),
+                                  modifiedCalleeKernelName.size() + 1));
+    auto callee = builder.create<cudaq::cc::CreateStringLiteralOp>(
+        loc, strLitTy, builder.getStringAttr(modifiedCalleeKernelName));
+
+    auto i8PtrTy = cudaq::cc::PointerType::get(builder.getI8Type());
+    auto calleeCast = builder.create<cudaq::cc::CastOp>(loc, i8PtrTy, callee);
+
+    cudaq::IRBuilder irBuilder(ctx);
+    auto result = irBuilder.loadIntrinsic(substMod, cudaq::getCudaqState);
+    assert(succeeded(result) && "loading intrinsic should never fail");
+
+    auto statePtrTy =
+        cudaq::cc::PointerType::get(cudaq::cc::StateType::get(ctx));
+    auto statePtr =
+        builder
+            .create<func::CallOp>(loc, statePtrTy, cudaq::getCudaqState,
+                                  ValueRange{calleeCast})
+            .getResult(0);
     return builder.create<cudaq::cc::CastOp>(loc, statePtrTy, statePtr);
   }
-  // The program is executed on quantum hardware, state data is not
-  // available and needs to be regenerated.
-  TODO("cudaq::state* argument synthesis for quantum hardware");
+
+  TODO("cudaq::state* argument synthesis for quantum hardware for c functions");
   return {};
 }
 
@@ -326,7 +432,7 @@ cudaq::opt::ArgumentConverter::ArgumentConverter(StringRef kernelName,
                                                  ModuleOp sourceModule,
                                                  bool isSimulator)
     : sourceModule(sourceModule), builder(sourceModule.getContext()),
-      kernelName(kernelName), isSimulator(isSimulator) {
+      kernelName(kernelName) {
   substModule = builder.create<ModuleOp>(builder.getUnknownLoc());
 }
 
@@ -335,7 +441,7 @@ void cudaq::opt::ArgumentConverter::gen(const std::vector<void *> &arguments) {
   // We should look up the input type signature here.
 
   auto fun = sourceModule.lookupSymbol<func::FuncOp>(
-      cudaq::runtime::cudaqGenPrefixName + kernelName.str());
+      cudaq::runtime::cudaqGenPrefixName + kernelName);
   FunctionType fromFuncTy = fun.getFunctionType();
   for (auto iter :
        llvm::enumerate(llvm::zip(fromFuncTy.getInputs(), arguments))) {
@@ -403,8 +509,7 @@ void cudaq::opt::ArgumentConverter::gen(const std::vector<void *> &arguments) {
             .Case([&](cc::PointerType ptrTy) -> cc::ArgumentSubstitutionOp {
               if (ptrTy.getElementType() == cc::StateType::get(ctx))
                 return buildSubst(static_cast<const state *>(argPtr),
-                                  substModule, dataLayout, kernelName,
-                                  isSimulator);
+                                  dataLayout, *this);
               return {};
             })
             .Case([&](cc::StdvecType ty) {
@@ -457,3 +562,29 @@ void cudaq::opt::ArgumentConverter::gen_drop_front(
   }
   gen(partialArgs);
 }
+
+std::pair<std::vector<std::string>, std::vector<std::string>>
+cudaq::opt::ArgumentConverter::collectAllSubstitutions() {
+  // Parallel lists built in pre-order: kernels[i] is the prefixed kernel name
+  // whose printed substitution module is substs[i].
+  std::vector<std::string> kernels;
+  std::vector<std::string> substs;
+
+  std::function<void(ArgumentConverter &)> visit =
+      [&kernels, &substs, &visit](ArgumentConverter &con) {
+        kernels.push_back(cudaq::runtime::cudaqGenPrefixName +
+                          con.getKernelName().str());
+
+        std::string printed;
+        llvm::raw_string_ostream os(printed);
+        os << con.getSubstitutionModule();
+        os.flush(); // make sure `printed` holds the complete text
+        substs.push_back(std::move(printed));
+
+        for (auto &calleeCon : con.getCalleeConverters())
+          visit(calleeCon);
+      };
+
+  visit(*this);
+  return {std::move(kernels), std::move(substs)};
+}
diff --git a/runtime/common/ArgumentConversion.h b/runtime/common/ArgumentConversion.h
index 45e6607b0c..be438fe66c 100644
--- a/runtime/common/ArgumentConversion.h
+++ b/runtime/common/ArgumentConversion.h
@@ -14,6 +14,7 @@
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/Types.h"
 #include <unordered_set>
+#include <vector>
 
 namespace cudaq::opt {
 
@@ -47,13 +48,30 @@ class ArgumentConverter {
   /// created.
   mlir::ModuleOp getSubstitutionModule() { return substModule; }
 
+  mlir::ModuleOp getSourceModule() { return sourceModule; }
+  /// Kernel name as stored; `cudaq::runtime::cudaqGenPrefixName` not included.
+  mlir::StringRef getKernelName() { return kernelName; }
+  /// Generate substitutions for callee `calleeName`; keep its converter.
+  void genCallee(std::string &calleeName, std::vector<void *> &args) {
+    auto converter = ArgumentConverter(calleeName, sourceModule);
+    converter.gen(args);
+    calleeConverters.push_back(std::move(converter)); // avoid a deep copy
+  }
+  /// Converters recorded by `genCallee`, in insertion order (mutable access).
+  std::vector<ArgumentConverter> &getCalleeConverters() {
+    return calleeConverters;
+  }
+  /// Kernel names and printed substitution modules, including all callees.
+  std::pair<std::vector<std::string>, std::vector<std::string>>
+  collectAllSubstitutions();
+
 private:
   mlir::ModuleOp sourceModule;
   mlir::ModuleOp substModule;
   mlir::OpBuilder builder;
-  mlir::StringRef kernelName;
+  std::string kernelName;
   mlir::SmallVector<cc::ArgumentSubstitutionOp> substitutions;
-  bool isSimulator;
+  std::vector<ArgumentConverter> calleeConverters;
 };
 
 } // namespace cudaq::opt
diff --git a/runtime/common/BaseRemoteRESTQPU.h b/runtime/common/BaseRemoteRESTQPU.h
index 61c26dc791..41f45b6b75 100644
--- a/runtime/common/BaseRemoteRESTQPU.h
+++ b/runtime/common/BaseRemoteRESTQPU.h
@@ -393,15 +393,18 @@ class BaseRemoteRESTQPU : public cudaq::QPU {
     if (!func->hasAttr(cudaq::entryPointAttrName))
       func->setAttr(cudaq::entryPointAttrName, builder.getUnitAttr());
     auto moduleOp = builder.create<mlir::ModuleOp>();
-    moduleOp.push_back(func.clone());
     moduleOp->setAttrs(m_module->getAttrDictionary());
 
     for (auto &op : m_module.getOps()) {
-      // Add any global symbols, including global constant arrays.
-      // Global constant arrays can be created during compilation,
-      // `lift-array-value`, `quake-synthesizer`, and `get-concrete-matrix`
-      // passes.
-      if (auto globalOp = dyn_cast<cudaq::cc::GlobalOp>(op))
+      if (auto funcOp = dyn_cast<mlir::func::FuncOp>(op)) {
+        // Add quantum kernels defined in the module.
+        if (funcOp->hasAttr(cudaq::kernelAttrName) ||
+            funcOp.getName().startswith("__nvqpp__mlirgen__") ||
+            funcOp.getBody().empty())
+          moduleOp.push_back(funcOp.clone());
+      }
+      // Add globals defined in the module.
+      if (auto globalOp = dyn_cast<cc::GlobalOp>(op))
         moduleOp.push_back(globalOp.clone());
     }
 
@@ -428,16 +431,18 @@ class BaseRemoteRESTQPU : public cudaq::QPU {
       mlir::PassManager pm(&context);
       if (!rawArgs.empty()) {
         cudaq::info("Run Argument Synth.\n");
-        opt::ArgumentConverter argCon(kernelName, moduleOp, false);
+        opt::ArgumentConverter argCon(kernelName, moduleOp);
         argCon.gen(rawArgs);
-        std::string kernName = cudaq::runtime::cudaqGenPrefixName + kernelName;
-        mlir::SmallVector<mlir::StringRef> kernels = {kernName};
-        std::string substBuff;
-        llvm::raw_string_ostream ss(substBuff);
-        ss << argCon.getSubstitutionModule();
-        mlir::SmallVector<mlir::StringRef> substs = {substBuff};
+        auto [kernels, substs] = argCon.collectAllSubstitutions();
         pm.addNestedPass<mlir::func::FuncOp>(
-            opt::createArgumentSynthesisPass(kernels, substs));
+            cudaq::opt::createArgumentSynthesisPass(
+                mlir::SmallVector<mlir::StringRef>{kernels.begin(),
+                                                   kernels.end()},
+                mlir::SmallVector<mlir::StringRef>{substs.begin(),
+                                                   substs.end()}));
+        pm.addPass(mlir::createCanonicalizerPass());
+        pm.addPass(opt::createDeleteStates());
+        pm.addPass(opt::createStateInitialization());
       } else if (updatedArgs) {
         cudaq::info("Run Quake Synth.\n");
         pm.addPass(cudaq::opt::createQuakeSynthesizer(kernelName, updatedArgs));
diff --git a/runtime/common/BaseRestRemoteClient.h b/runtime/common/BaseRestRemoteClient.h
index b938815d92..5384d71008 100644
--- a/runtime/common/BaseRestRemoteClient.h
+++ b/runtime/common/BaseRestRemoteClient.h
@@ -329,8 +329,8 @@ class BaseRemoteRestRuntimeClient : public cudaq::RemoteRuntimeClient {
       if (!castedState1 || !castedState2)
         throw std::runtime_error(
             "Invalid execution context: input states are not compatible");
-      auto [kernelName1, args1] = castedState1->getKernelInfo();
-      auto [kernelName2, args2] = castedState2->getKernelInfo();
+      auto [kernelName1, args1] = castedState1->getKernelInfo().value();
+      auto [kernelName2, args2] = castedState2->getKernelInfo().value();
       cudaq::IRPayLoad stateIrPayload1, stateIrPayload2;
 
       stateIrPayload1.entryPoint = kernelName1;
diff --git a/runtime/common/CMakeLists.txt b/runtime/common/CMakeLists.txt
index bb8a5ecaba..e1a38c4e25 100644
--- a/runtime/common/CMakeLists.txt
+++ b/runtime/common/CMakeLists.txt
@@ -102,7 +102,7 @@ set_source_files_properties(
     JIT.cpp
     Logger.cpp
     RuntimeMLIR.cpp
-  PROPERTIES COMPILE_FLAGS -fno-rtti
+#  PROPERTIES COMPILE_FLAGS -fno-rtti
 )
 
 target_include_directories(cudaq-mlir-runtime
diff --git a/runtime/common/SimulationState.h b/runtime/common/SimulationState.h
index 3ec97f2568..694770fa48 100644
--- a/runtime/common/SimulationState.h
+++ b/runtime/common/SimulationState.h
@@ -11,6 +11,7 @@
 #include <algorithm>
 #include <complex>
 #include <memory>
+#include <optional>
 #include <variant>
 #include <vector>
 
@@ -140,6 +141,16 @@ class SimulationState {
     return createFromSizeAndPtr(size, ptr, data.index());
   }
 
+  /// @brief True if the state's amplitudes or density matrix are
+  /// available or can be computed.
+  virtual bool hasData() const { return true; }
+
+  /// @brief Helper to retrieve (kernel name, `args` pointers)
+  virtual std::optional<std::pair<std::string, std::vector<void *>>>
+  getKernelInfo() const {
+    return std::nullopt;
+  }
+
   /// @brief Return the tensor at the given index. Throws
   /// for an invalid tensor index.
   virtual Tensor getTensor(std::size_t tensorIdx = 0) const = 0;
diff --git a/runtime/cudaq/CMakeLists.txt b/runtime/cudaq/CMakeLists.txt
index 9c08eef354..2efb8824e7 100644
--- a/runtime/cudaq/CMakeLists.txt
+++ b/runtime/cudaq/CMakeLists.txt
@@ -20,6 +20,7 @@ add_library(${LIBRARY_NAME}
                 platform/quantum_platform.cpp
                 qis/execution_manager_c_api.cpp
                 qis/execution_manager.cpp
+                qis/quantum_state.cpp
                 qis/remote_state.cpp
                 qis/state.cpp
                 utils/cudaq_utils.cpp
diff --git a/runtime/cudaq/algorithms/get_state.h b/runtime/cudaq/algorithms/get_state.h
index bbb64ebcbf..a57fa0194e 100644
--- a/runtime/cudaq/algorithms/get_state.h
+++ b/runtime/cudaq/algorithms/get_state.h
@@ -14,6 +14,7 @@
 #include "cudaq/host_config.h"
 #include "cudaq/platform.h"
 #include "cudaq/platform/QuantumExecutionQueue.h"
+#include "cudaq/qis/quantum_state.h"
 #include "cudaq/qis/remote_state.h"
 #include "cudaq/qis/state.h"
 #include <complex>
@@ -118,6 +119,17 @@ auto get_state(QuantumKernel &&kernel, Args &&...args) {
     return state(new RemoteSimulationState(std::forward<QuantumKernel>(kernel),
                                            std::forward<Args>(args)...));
   }
+#endif
+#if defined(CUDAQ_QUANTUM_DEVICE)
+  // Store kernel name and arguments for quantum states.
+  if (!cudaq::get_quake_by_name(cudaq::getKernelName(kernel), false).empty()) {
+    return state(new QuantumState(std::forward<QuantumKernel>(kernel),
+                                  std::forward<Args>(args)...));
+  } else {
+    throw std::runtime_error(
+        "cudaq::state* argument synthesis is not supported for quantum hardware"
+        " for c-like functions, use class kernels instead");
+  }
 #endif
   return details::extractState([&]() mutable {
     cudaq::invokeKernel(std::forward<QuantumKernel>(kernel),
diff --git a/runtime/cudaq/platform/default/rest/helpers/quantinuum/quantinuum.yml b/runtime/cudaq/platform/default/rest/helpers/quantinuum/quantinuum.yml
index 21cc45be1e..0a291a240d 100644
--- a/runtime/cudaq/platform/default/rest/helpers/quantinuum/quantinuum.yml
+++ b/runtime/cudaq/platform/default/rest/helpers/quantinuum/quantinuum.yml
@@ -13,6 +13,8 @@ config:
   platform-qpu: remote_rest
   # Tell NVQ++ to generate glue code to set the target backend name
   gen-target-backend: true
+  # Add preprocessor defines to compilation
+  preprocessor-defines: ["-D CUDAQ_QUANTUM_DEVICE"]
   # Add the rest-qpu library to the link list
   link-libs: ["-lcudaq-rest-qpu"]
   # Define the lowering pipeline
diff --git a/runtime/cudaq/qis/quantum_state.cpp b/runtime/cudaq/qis/quantum_state.cpp
new file mode 100644
index 0000000000..faaae5b510
--- /dev/null
+++ b/runtime/cudaq/qis/quantum_state.cpp
@@ -0,0 +1,113 @@
+/*******************************************************************************
+ * Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates.                  *
+ * All rights reserved.                                                        *
+ *                                                                             *
+ * This source code and the accompanying materials are made available under    *
+ * the terms of the Apache License 2.0 which accompanies this distribution.    *
+ ******************************************************************************/
+
+#include "quantum_state.h"
+#include "common/Logger.h"
+
+namespace cudaq {
+
+QuantumState::~QuantumState() {
+  if (!platformExecutionLog.empty()) {
+    // Flush any info log from the remote execution
+    printf("%s\n", platformExecutionLog.c_str());
+    platformExecutionLog.clear();
+  }
+
+  for (std::size_t counter = 0; auto &ptr : args)
+    deleters[counter++](ptr);
+
+  args.clear();
+  deleters.clear();
+}
+
+std::size_t QuantumState::getNumQubits() const {
+  throw std::runtime_error(
+      "getNumQubits is not implemented for quantum hardware");
+}
+
+cudaq::SimulationState::Tensor
+QuantumState::getTensor(std::size_t tensorIdx) const {
+  throw std::runtime_error("getTensor is not implemented for quantum hardware");
+}
+
+/// @brief Return all tensors that represent this state. Not supported for
+/// quantum hardware states, so this always throws `std::runtime_error`.
+std::vector<cudaq::SimulationState::Tensor> QuantumState::getTensors() const {
+  throw std::runtime_error(
+      "getTensors is not implemented for quantum hardware");
+}
+
+/// @brief Return the number of tensors that represent this state.
+std::size_t QuantumState::getNumTensors() const {
+  throw std::runtime_error(
+      "getNumTensors is not implemented for quantum hardware");
+}
+
+std::complex<double>
+QuantumState::operator()(std::size_t tensorIdx,
+                         const std::vector<std::size_t> &indices) {
+  throw std::runtime_error(
+      "operator() is not implemented for quantum hardware");
+}
+
+std::unique_ptr<SimulationState>
+QuantumState::createFromSizeAndPtr(std::size_t size, void *ptr, std::size_t) {
+  throw std::runtime_error(
+      "createFromSizeAndPtr is not implemented for quantum hardware");
+}
+
+void QuantumState::dump(std::ostream &os) const {
+  throw std::runtime_error("dump is not implemented for quantum hardware");
+}
+
+cudaq::SimulationState::precision QuantumState::getPrecision() const {
+  throw std::runtime_error(
+      "getPrecision is not implemented for quantum hardware");
+}
+
+void QuantumState::destroyState() {
+  // There is no state data so nothing to destroy.
+}
+
+bool QuantumState::isDeviceData() const {
+  throw std::runtime_error(
+      "isDeviceData is not implemented for quantum hardware");
+}
+
+void QuantumState::toHost(std::complex<double> *clientAllocatedData,
+                          std::size_t numElements) const {
+  throw std::runtime_error("toHost is not implemented for quantum hardware");
+}
+
+void QuantumState::toHost(std::complex<float> *clientAllocatedData,
+                          std::size_t numElements) const {
+  throw std::runtime_error("toHost is not implemented for quantum hardware");
+}
+
+std::optional<std::pair<std::string, std::vector<void *>>>
+QuantumState::getKernelInfo() const {
+  return std::make_pair(kernelName, args);
+}
+
+std::vector<std::complex<double>>
+QuantumState::getAmplitudes(const std::vector<std::vector<int>> &basisStates) {
+  throw std::runtime_error(
+      "getAmplitudes is not implemented for quantum hardware");
+}
+
+std::complex<double>
+QuantumState::getAmplitude(const std::vector<int> &basisState) {
+  throw std::runtime_error(
+      "getAmplitude is not implemented for quantum hardware");
+}
+
+std::complex<double>
+QuantumState::overlap(const cudaq::SimulationState &other) {
+  throw std::runtime_error("overlap is not implemented for quantum hardware");
+}
+} // namespace cudaq
diff --git a/runtime/cudaq/qis/quantum_state.h b/runtime/cudaq/qis/quantum_state.h
new file mode 100644
index 0000000000..63117eb462
--- /dev/null
+++ b/runtime/cudaq/qis/quantum_state.h
@@ -0,0 +1,151 @@
+/****************************************************************-*- C++ -*-****
+ * Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates.                  *
+ * All rights reserved.                                                        *
+ *                                                                             *
+ * This source code and the accompanying materials are made available under    *
+ * the terms of the Apache License 2.0 which accompanies this distribution.    *
+ ******************************************************************************/
+
+#pragma once
+
+#include "common/SimulationState.h"
+#include "cudaq.h"
+#include "cudaq/utils/cudaq_utils.h"
+
+namespace cudaq {
+/// Implementation of `SimulationState` for quantum device backends.
+/// The state is represented by a quantum kernel.
+/// The quantum state contains all the information needed to replicate
+/// the call to the kernel that created the state.
+class QuantumState : public cudaq::SimulationState {
+protected:
+  std::string kernelName;
+  // Lazily-evaluated state data (just keeping the kernel name and arguments).
+  // e.g., to be evaluated at amplitude accessor APIs (const APIs, hence needs
+  // to be mutable) or overlap calculation with another remote state (combining
+  // the IR of both states for remote evaluation)
+  mutable std::unique_ptr<cudaq::SimulationState> state;
+  // Cache log messages from the remote execution.
+  // Mutable to support lazy execution during `const` API calls.
+  mutable std::string platformExecutionLog;
+  using ArgDeleter = std::function<void(void *)>;
+  /// @brief Vector of arguments
+  /// Note: each argument is copied; pointer arguments have the pointee copied.
+  std::vector<void *> args;
+  /// @brief Deletion functions for the arguments.
+  std::vector<std::function<void(void *)>> deleters;
+
+public:
+  template <typename T>
+  void addArgument(const T &arg) {
+    if constexpr (std::is_pointer_v<std::decay_t<T>>) {
+      if constexpr (std::is_copy_constructible_v<
+                        std::remove_pointer_t<std::decay_t<T>>>) {
+        auto ptr = new std::remove_pointer_t<std::decay_t<T>>(*arg);
+        args.push_back(ptr);
+        deleters.push_back([](void *ptr) {
+          delete static_cast<std::remove_pointer_t<std::decay_t<T>> *>(ptr);
+        });
+      } else {
+        throw std::invalid_argument(
+            "Unsupported argument type: only pointers to copy-constructible "
+            "types and copy-constructible types are supported.");
+      }
+    } else if constexpr (std::is_copy_constructible_v<std::decay_t<T>>) {
+      auto *ptr = new std::decay_t<T>(arg);
+      args.push_back(ptr);
+      deleters.push_back(
+          [](void *ptr) { delete static_cast<std::decay_t<T> *>(ptr); });
+    } else {
+      throw std::invalid_argument(
+          "Unsupported argument type: only pointers to copy-constructible "
+          "types and copy-constructible types are supported.");
+    }
+  }
+
+  /// @brief Constructor
+  template <typename QuantumKernel, typename... Args>
+  QuantumState(QuantumKernel &&kernel, Args &&...args) {
+    if constexpr (has_name<QuantumKernel>::value) {
+      // kernel_builder kernel: need to JIT code to get it registered.
+      static_cast<cudaq::details::kernel_builder_base &>(kernel).jitCode();
+      kernelName = kernel.name();
+    } else {
+      kernelName = cudaq::getKernelName(kernel);
+    }
+    (addArgument(args), ...);
+  }
+  QuantumState() = default;
+  virtual ~QuantumState();
+
+  /// @brief True if the state has amplitudes or density matrix available.
+  virtual bool hasData() const override { return false; }
+
+  /// @brief Helper to retrieve (kernel name, `args` pointers)
+  virtual std::optional<std::pair<std::string, std::vector<void *>>>
+  getKernelInfo() const override;
+
+  /// @brief Return the number of qubits this state represents.
+  std::size_t getNumQubits() const override;
+
+  /// @brief Compute the overlap of this state representation with
+  /// the provided `other` state, e.g. `<this | other>`.
+  std::complex<double> overlap(const cudaq::SimulationState &other) override;
+
+  /// @brief Return the amplitude of the given computational
+  /// basis state.
+  std::complex<double>
+  getAmplitude(const std::vector<int> &basisState) override;
+
+  /// @brief Return the amplitudes of the given list of computational
+  /// basis states.
+  std::vector<std::complex<double>>
+  getAmplitudes(const std::vector<std::vector<int>> &basisState) override;
+
+  /// @brief Return the tensor at the given index. Throws
+  /// for an invalid tensor index.
+  Tensor getTensor(std::size_t tensorIdx = 0) const override;
+
+  /// @brief Return all tensors that represent this state
+  std::vector<Tensor> getTensors() const override;
+
+  /// @brief Return the number of tensors that represent this state.
+  std::size_t getNumTensors() const override;
+
+  /// @brief Return the element from the tensor at the
+  /// given tensor index and at the given indices.
+  std::complex<double>
+  operator()(std::size_t tensorIdx,
+             const std::vector<std::size_t> &indices) override;
+
+  /// @brief Create a new subclass specific SimulationState
+  /// from the user provided data set.
+  std::unique_ptr<SimulationState>
+  createFromSizeAndPtr(std::size_t size, void *ptr, std::size_t) override;
+
+  /// @brief Dump a representation of the state to the
+  /// given output stream.
+  void dump(std::ostream &os) const override;
+
+  /// @brief Return the floating point precision used by the simulation state.
+  precision getPrecision() const override;
+
+  /// @brief Destroy the state representation, frees all associated memory.
+  void destroyState() override;
+
+  /// @brief Return true if this `SimulationState` wraps data on the GPU.
+  bool isDeviceData() const override;
+
+  /// @brief Transfer data from device to host, return the data
+  /// to the pointer provided by the client. Clients must specify the number of
+  /// elements.
+  void toHost(std::complex<double> *clientAllocatedData,
+              std::size_t numElements) const override;
+
+  /// @brief Transfer data from device to host, return the data
+  /// to the pointer provided by the client. Clients must specify the number of
+  /// elements.
+  void toHost(std::complex<float> *clientAllocatedData,
+              std::size_t numElements) const override;
+};
+} // namespace cudaq
diff --git a/runtime/cudaq/qis/remote_state.cpp b/runtime/cudaq/qis/remote_state.cpp
index 713a462e46..84c9bf9410 100644
--- a/runtime/cudaq/qis/remote_state.cpp
+++ b/runtime/cudaq/qis/remote_state.cpp
@@ -128,7 +128,7 @@ void RemoteSimulationState::toHost(std::complex<float> *clientAllocatedData,
   }
 }
 
-std::pair<std::string, std::vector<void *>>
+std::optional<std::pair<std::string, std::vector<void *>>>
 RemoteSimulationState::getKernelInfo() const {
   return std::make_pair(kernelName, args);
 }
diff --git a/runtime/cudaq/qis/remote_state.h b/runtime/cudaq/qis/remote_state.h
index 878bb098dd..ba7929dea4 100644
--- a/runtime/cudaq/qis/remote_state.h
+++ b/runtime/cudaq/qis/remote_state.h
@@ -83,7 +83,8 @@ class RemoteSimulationState : public cudaq::SimulationState {
   virtual void execute() const;
 
   /// @brief Helper to retrieve (kernel name, `args` pointers)
-  virtual std::pair<std::string, std::vector<void *>> getKernelInfo() const;
+  virtual std::optional<std::pair<std::string, std::vector<void *>>>
+  getKernelInfo() const override;
 
   /// @brief Return the number of qubits this state represents.
   std::size_t getNumQubits() const override;
diff --git a/targettests/Remote-Sim/qvector_init_from_state.cpp b/targettests/Remote-Sim/qvector_init_from_state.cpp
index 5899c2f598..1f94b47f06 100644
--- a/targettests/Remote-Sim/qvector_init_from_state.cpp
+++ b/targettests/Remote-Sim/qvector_init_from_state.cpp
@@ -148,6 +148,22 @@ int main() {
 // CHECK: 10
   // clang-format on
 
+  {
+    std::cout << "Passing state from another kernel as argument"
+                 " with pauli word arg (kernel mode)"
+              << std::endl;
+    auto state = cudaq::get_state(test_init_state);
+    auto counts = cudaq::sample(test_state_param2, &state, cudaq::pauli_word{"XX"});
+    printCounts(counts);
+  }
+  // clang-format off
+// CHECK: Passing state from another kernel as argument with pauli word arg (kernel mode)
+// CHECK: 00
+// CHECK: 01
+// CHECK: 10
+// CHECK: 11
+  // clang-format on
+
   {
     std::cout << "Passing state from another kernel as argument iteratively "
                  "with vector args (kernel mode)"
diff --git a/targettests/execution/qvector_init_from_state.cpp b/targettests/execution/qvector_init_from_state.cpp
new file mode 100644
index 0000000000..afaba5a2c0
--- /dev/null
+++ b/targettests/execution/qvector_init_from_state.cpp
@@ -0,0 +1,147 @@
+/*******************************************************************************
+ * Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates.                  *
+ * All rights reserved.                                                        *
+ *                                                                             *
+ * This source code and the accompanying materials are made available under    *
+ * the terms of the Apache License 2.0 which accompanies this distribution.    *
+ ******************************************************************************/
+
+// clang-format off
+// RUN: nvq++ %cpp_std --enable-mlir                                     %s -o %t  && %t | FileCheck %s
+// RUN: nvq++ %cpp_std --target quantinuum --emulate -fkernel-exec-kind=2 %s -o %t && %t | FileCheck %s
+// clang-format on
+
+#include <cudaq.h>
+#include <iostream>
+#include <string>
+#include <vector>
+
+struct test_init_state {
+  void operator()(int n) __qpu__ {
+    cudaq::qvector q(n);
+    ry(M_PI/2.0, q[0]);
+  }
+};
+
+struct test_state_param {
+  void operator()(cudaq::state *state) __qpu__ {
+    cudaq::qvector q(state);
+    x(q);
+  }
+};
+
+struct test_state_param2 {
+  void operator()(cudaq::state *state, cudaq::pauli_word w) __qpu__ {
+    cudaq::qvector q(state);
+    cudaq::exp_pauli(1.0, q, w);
+  }
+};
+
+void printCounts(cudaq::sample_result &result) {
+  std::vector<std::string> values{};
+  for (auto &&[bits, counts] : result) {
+    values.push_back(bits);
+  }
+
+  std::sort(values.begin(), values.end());
+  for (auto &&bits : values) {
+    std::cout << bits << std::endl;
+  }
+}
+
+int main() {
+  std::vector<cudaq::complex> vec{M_SQRT1_2, M_SQRT1_2, 0., 0., 0., 0., 0., 0.};
+  std::vector<cudaq::complex> vec1{0., 0., 0.,        0.,
+                                   0., 0., M_SQRT1_2, M_SQRT1_2};
+  auto state = cudaq::state::from_data(vec);
+  auto state1 = cudaq::state::from_data(vec1);
+  {
+    std::cout << "Passing state created from data as argument (kernel mode)"
+              << std::endl;
+    auto counts = cudaq::sample(test_state_param{}, &state);
+    printCounts(counts);
+
+    counts = cudaq::sample(test_state_param{}, &state1);
+    printCounts(counts);
+  }
+
+  // clang-format off
+// CHECK: Passing state created from data as argument (kernel mode)
+// CHECK: 011
+// CHECK: 111
+
+// CHECK: 000
+// CHECK: 100
+  // clang-format on
+
+  {
+    std::cout << "Passing state from another kernel as argument (kernel mode)"
+              << std::endl;
+    auto state = cudaq::get_state(test_init_state{}, 2);
+    auto counts = cudaq::sample(test_state_param{}, &state);
+    printCounts(counts);
+  }
+  // clang-format off
+// CHECK: Passing state from another kernel as argument (kernel mode)
+// CHECK: 01
+// CHECK: 11
+  // clang-format on
+
+  {
+    std::cout
+        << "Passing large state from another kernel as argument (kernel mode)"
+        << std::endl;
+    auto largeState = cudaq::get_state(test_init_state{}, 14);
+    auto counts = cudaq::sample(test_state_param{}, &largeState);
+    printCounts(counts);
+  }
+  // clang-format off
+// CHECK: Passing large state from another kernel as argument (kernel mode)
+// CHECK: 01111111111111
+// CHECK: 11111111111111
+  // clang-format on
+
+  {
+    std::cout << "Passing state from another kernel as argument"
+                 " with pauli word arg (kernel mode)"
+              << std::endl;
+    auto state = cudaq::get_state(test_init_state{}, 2);
+    auto counts = cudaq::sample(test_state_param2{}, &state, cudaq::pauli_word{"XX"});
+    printCounts(counts);
+  }
+  // clang-format off
+// CHECK: Passing state from another kernel as argument with pauli word arg (kernel mode)
+// CHECK: 00
+// CHECK: 01
+// CHECK: 10
+// CHECK: 11
+  // clang-format on
+
+  {
+    std::cout << "Passing state from another kernel as argument iteratively "
+                 "(kernel mode)"
+              << std::endl;
+    auto state = cudaq::get_state(test_init_state{}, 2);
+    for (auto i = 0; i < 4; i++) {
+      auto counts = cudaq::sample(test_state_param{}, &state);
+      std::cout << "Iteration: " << i << std::endl;
+      printCounts(counts);
+      state = cudaq::get_state(test_state_param{}, &state);
+    }
+  }
+  // clang-format off
+// CHECK: Passing state from another kernel as argument iteratively (kernel mode)
+// CHECK: Iteration: 0
+// CHECK: 01
+// CHECK: 11
+// CHECK: Iteration: 1
+// CHECK: 00
+// CHECK: 10
+// CHECK: Iteration: 2
+// CHECK: 01
+// CHECK: 11
+// CHECK: Iteration: 3
+// CHECK: 00
+// CHECK: 10
+  // clang-format on
+}
diff --git a/targettests/execution/state_init.cpp b/targettests/execution/state_init.cpp
index 31e946147d..e9b8456513 100644
--- a/targettests/execution/state_init.cpp
+++ b/targettests/execution/state_init.cpp
@@ -40,4 +40,4 @@ int main() {
 }
 
 // CHECK: 00
-// CHECK: 10
+// CHECK: 10
\ No newline at end of file
diff --git a/test/Quake/arg_subst-5.txt b/test/Quake/arg_subst-5.txt
new file mode 100644
index 0000000000..c5e727bb79
--- /dev/null
+++ b/test/Quake/arg_subst-5.txt
@@ -0,0 +1,15 @@
+// ========================================================================== //
+// Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates.                 //
+// All rights reserved.                                                       //
+//                                                                            //
+// This source code and the accompanying materials are made available under   //
+// the terms of the Apache License 2.0 which accompanies this distribution.   //
+// ========================================================================== //
+
+cc.arg_subst[0] {
+  %0 = cc.string_literal "init" : !cc.ptr<!cc.array<i8 x 46>>
+  %1 = cc.cast %0 : (!cc.ptr<!cc.array<i8 x 46>>) -> !cc.ptr<i8>
+  %2 = func.call @__nvqpp_cudaq_state_get(%1) : (!cc.ptr<i8>) -> !cc.ptr<!cc.state>
+  %3 = cc.cast %2 : (!cc.ptr<!cc.state>) -> !cc.ptr<!cc.state>
+}
+func.func private @__nvqpp_cudaq_state_get(!cc.ptr<i8>) -> !cc.ptr<!cc.state>
diff --git a/test/Quake/arg_subst-6.txt b/test/Quake/arg_subst-6.txt
new file mode 100644
index 0000000000..4c3a55d883
--- /dev/null
+++ b/test/Quake/arg_subst-6.txt
@@ -0,0 +1,11 @@
+// ========================================================================== //
+// Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates.                 //
+// All rights reserved.                                                       //
+//                                                                            //
+// This source code and the accompanying materials are made available under   //
+// the terms of the Apache License 2.0 which accompanies this distribution.   //
+// ========================================================================== //
+
+cc.arg_subst[0] {
+  %c2_i32 = arith.constant 2 : i32
+}
diff --git a/test/Quake/arg_subst_func.qke b/test/Quake/arg_subst_func.qke
index e96e04b63a..4bf6e10155 100644
--- a/test/Quake/arg_subst_func.qke
+++ b/test/Quake/arg_subst_func.qke
@@ -6,7 +6,7 @@
 // the terms of the Apache License 2.0 which accompanies this distribution.   //
 // ========================================================================== //
 
-// RUN: cudaq-opt --argument-synthesis=functions=foo:%S/arg_subst.txt,blink:%S/arg_subst.txt,testy1:%S/arg_subst-1.txt,testy2:%S/arg_subst-2.txt,testy3:%S/arg_subst-3.txt,testy4:%S/arg_subst-4.txt --canonicalize %s | FileCheck %s
+// RUN: cudaq-opt --argument-synthesis=functions=foo:%S/arg_subst.txt,blink:%S/arg_subst.txt,testy1:%S/arg_subst-1.txt,testy2:%S/arg_subst-2.txt,testy3:%S/arg_subst-3.txt,testy4:%S/arg_subst-4.txt,testy5:%S/arg_subst-5.txt,init:%S/arg_subst-6.txt --canonicalize %s | FileCheck %s
 
 func.func private @bar(i32)
 func.func private @baz(f32)
@@ -146,3 +146,38 @@ func.func @testy4(%arg0: !cc.stdvec<!cc.struct<{i32, f64, i8, i16}>>) {
 // CHECK:           call @callee4(%[[VAL_32]]) : (!cc.stdvec<!cc.struct<{i32, f64, i8, i16}>>) -> ()
 // CHECK:           return
 // CHECK:         }
+
+func.func @testy5(%arg0: !cc.ptr<!cc.state>) {
+  %3 = call @__nvqpp_cudaq_state_numberOfQubits(%arg0) : (!cc.ptr<!cc.state>) -> i64
+  %4 = quake.alloca !quake.veq<?>[%3 : i64]
+  %5 = quake.init_state %4, %arg0 : (!quake.veq<?>, !cc.ptr<!cc.state>) -> !quake.veq<?>
+  return
+}
+
+func.func private @__nvqpp_cudaq_state_numberOfQubits(!cc.ptr<!cc.state>) -> i64
+func.func private @__nvqpp_cudaq_state_get(!cc.ptr<i8>) -> !cc.ptr<!cc.state>
+
+func.func private @init(%arg0: i32) -> !quake.veq<?> attributes {"cudaq-entrypoint", "cudaq-kernel"} {
+  %cst = arith.constant 1.5707963267948966 : f64
+  %0 = cc.cast signed %arg0 : (i32) -> i64
+  %1 = quake.alloca !quake.veq<?>[%0 : i64]
+  %2 = quake.concat %1 : (!quake.veq<?>) -> !quake.veq<?>
+  return %2 : !quake.veq<?>
+}
+
+// CHECK-LABEL:   func.func @testy5() {
+// CHECK:           %[[VAL_0:.*]] = cc.string_literal "init" : !cc.ptr<!cc.array<i8 x 46>>
+// CHECK:           %[[VAL_1:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr<!cc.array<i8 x 46>>) -> !cc.ptr<i8>
+// CHECK:           %[[VAL_2:.*]] = call @__nvqpp_cudaq_state_get(%[[VAL_1]]) : (!cc.ptr<i8>) -> !cc.ptr<!cc.state>
+// CHECK:           %[[VAL_3:.*]] = call @__nvqpp_cudaq_state_numberOfQubits(%[[VAL_2]]) : (!cc.ptr<!cc.state>) -> i64
+// CHECK:           %[[VAL_4:.*]] = quake.alloca !quake.veq<?>[%[[VAL_3]] : i64]
+// CHECK:           %[[VAL_5:.*]] = quake.init_state %[[VAL_4]], %[[VAL_2]] : (!quake.veq<?>, !cc.ptr<!cc.state>) -> !quake.veq<?>
+// CHECK:           return
+// CHECK:         }
+// CHECK:         func.func private @__nvqpp_cudaq_state_numberOfQubits(!cc.ptr<!cc.state>) -> i64
+// CHECK:         func.func private @__nvqpp_cudaq_state_get(!cc.ptr<i8>) -> !cc.ptr<!cc.state>
+// CHECK:         func.func private @init() -> !quake.veq<?> attributes {"cudaq-entrypoint", "cudaq-kernel"} {
+// CHECK:           %[[VAL_7:.*]] = quake.alloca !quake.veq<2>
+// CHECK:           %[[VAL_8:.*]] = quake.relax_size %[[VAL_7:.*]] : (!quake.veq<2>) -> !quake.veq<?>
+// CHECK:           return %[[VAL_8]] : !quake.veq<?>
+// CHECK:         }
diff --git a/test/Quake/state_init.qke b/test/Quake/state_init.qke
new file mode 100644
index 0000000000..9f43a965a4
--- /dev/null
+++ b/test/Quake/state_init.qke
@@ -0,0 +1,37 @@
+// ========================================================================== //
+// Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates.                 //
+// All rights reserved.                                                       //
+//                                                                            //
+// This source code and the accompanying materials are made available under   //
+// the terms of the Apache License 2.0 which accompanies this distribution.   //
+// ========================================================================== //
+
+// RUN: cudaq-opt -state-initialization -canonicalize %s | FileCheck %s
+
+module {
+  func.func @foo() attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {
+    %0 = cc.string_literal "callee.modified_0" : !cc.ptr<!cc.array<i8 x 27>>
+    %1 = cc.cast %0 : (!cc.ptr<!cc.array<i8 x 27>>) -> !cc.ptr<i8>
+    %2 = call @__nvqpp_cudaq_state_get(%1) : (!cc.ptr<i8>) -> !cc.ptr<!cc.state>
+    %3 = call @__nvqpp_cudaq_state_numberOfQubits(%2) : (!cc.ptr<!cc.state>) -> i64
+    %4 = quake.alloca !quake.veq<?>[%3 : i64]
+    %5 = quake.init_state %4, %2 : (!quake.veq<?>, !cc.ptr<!cc.state>) -> !quake.veq<?>
+    return
+  }
+
+  func.func private @__nvqpp_cudaq_state_numberOfQubits(!cc.ptr<!cc.state>) -> i64
+  func.func private @__nvqpp_cudaq_state_get(!cc.ptr<i8>) -> !cc.ptr<!cc.state>
+
+  func.func private @callee.modified_0() -> !quake.veq<?> attributes {"cudaq-entrypoint", "cudaq-kernel"} {
+    %cst = arith.constant 1.5707963267948966 : f64
+    %0 = quake.alloca !quake.veq<2>
+    %1 = quake.extract_ref %0[0] : (!quake.veq<2>) -> !quake.ref
+    quake.ry (%cst) %1 : (f64, !quake.ref) -> ()
+    %2 = quake.relax_size %0 : (!quake.veq<2>) -> !quake.veq<?>
+    return %2 : !quake.veq<?>
+  }
+// CHECK-LABEL:   func.func @foo() attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {
+// CHECK:           %[[VAL_0:.*]] = call @callee.modified_0() : () -> !quake.veq<?>
+// CHECK:           return
+// CHECK:         }
+}
diff --git a/test/Quake/state_prep.qke b/test/Quake/state_prep.qke
index 4289571b33..3072a19218 100644
--- a/test/Quake/state_prep.qke
+++ b/test/Quake/state_prep.qke
@@ -31,7 +31,7 @@ module {
 // CHECK:           return
 // CHECK:         }
 
- func.func @__nvqpp__mlirgen__function_test_real_constant_array._Z24test_real_constant_arrayv() attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {
+  func.func @__nvqpp__mlirgen__function_test_real_constant_array._Z24test_real_constant_arrayv() attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {
     %0 = cc.address_of @__nvqpp__mlirgen__function_test_real_constant_array._Z24test_real_constant_arrayv.rodata_0 : !cc.ptr<!cc.array<f64 x 4>>
     %1 = quake.alloca !quake.veq<2>
     %2 = quake.init_state %1, %0 : (!quake.veq<2>, !cc.ptr<!cc.array<f64 x 4>>) -> !quake.veq<2>
diff --git a/tpls/Stim b/tpls/Stim
index 47190f4a3a..b01e423915 160000
--- a/tpls/Stim
+++ b/tpls/Stim
@@ -1 +1 @@
-Subproject commit 47190f4a3afb104c9f0068d0be9fea87d2894a70
+Subproject commit b01e42391583d03db4266b387d907eda1d7ae488

From 3fc56de6f0c911888fc8f3ae6356b8613653f0f9 Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Thu, 17 Oct 2024 14:25:47 -0700
Subject: [PATCH 03/18] Merge with main

Signed-off-by: Anna Gringauze <agringauze@nvidia.com>
---
 python/tests/interop/quantum_lib/CMakeLists.txt | 1 +
 runtime/common/BaseRemoteRESTQPU.h              | 7 +++----
 targettests/execution/state_init.cpp            | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/python/tests/interop/quantum_lib/CMakeLists.txt b/python/tests/interop/quantum_lib/CMakeLists.txt
index 34fb024188..21bb37a4d7 100644
--- a/python/tests/interop/quantum_lib/CMakeLists.txt
+++ b/python/tests/interop/quantum_lib/CMakeLists.txt
@@ -11,3 +11,4 @@ set(CMAKE_CXX_COMPILE_OBJECT "<CMAKE_CXX_COMPILER> -fPIC --enable-mlir --disable
 
 # FIXME Error with SHARED, it pulls in all the mlir libraries anyway
 add_library(quantum_lib OBJECT quantum_lib.cpp)
+add_dependencies(quantum_lib nvq++ cudaq-opt cudaq-quake cudaq-translate)
diff --git a/runtime/common/BaseRemoteRESTQPU.h b/runtime/common/BaseRemoteRESTQPU.h
index 41f45b6b75..32a097cfc5 100644
--- a/runtime/common/BaseRemoteRESTQPU.h
+++ b/runtime/common/BaseRemoteRESTQPU.h
@@ -393,14 +393,13 @@ class BaseRemoteRESTQPU : public cudaq::QPU {
     if (!func->hasAttr(cudaq::entryPointAttrName))
       func->setAttr(cudaq::entryPointAttrName, builder.getUnitAttr());
     auto moduleOp = builder.create<mlir::ModuleOp>();
+    moduleOp.push_back(func.clone());
     moduleOp->setAttrs(m_module->getAttrDictionary());
 
     for (auto &op : m_module.getOps()) {
       if (auto funcOp = dyn_cast<mlir::func::FuncOp>(op)) {
-        // Add quantum kernels defined in the module.
-        if (funcOp->hasAttr(cudaq::kernelAttrName) ||
-            funcOp.getName().startswith("__nvqpp__mlirgen__") ||
-            funcOp.getBody().empty())
+        // Add function definitions for runtime functions.
+        if (funcOp.getBody().empty())
           moduleOp.push_back(funcOp.clone());
       }
       // Add globals defined in the module.
diff --git a/targettests/execution/state_init.cpp b/targettests/execution/state_init.cpp
index e9b8456513..31e946147d 100644
--- a/targettests/execution/state_init.cpp
+++ b/targettests/execution/state_init.cpp
@@ -40,4 +40,4 @@ int main() {
 }
 
 // CHECK: 00
-// CHECK: 10
\ No newline at end of file
+// CHECK: 10

From 7969a755986157cdb04625a8680516432d00e352 Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Thu, 17 Oct 2024 14:37:56 -0700
Subject: [PATCH 04/18] Merge with main

Signed-off-by: Anna Gringauze <agringauze@nvidia.com>
---
 tpls/Stim | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tpls/Stim b/tpls/Stim
index b01e423915..47190f4a3a 160000
--- a/tpls/Stim
+++ b/tpls/Stim
@@ -1 +1 @@
-Subproject commit b01e42391583d03db4266b387d907eda1d7ae488
+Subproject commit 47190f4a3afb104c9f0068d0be9fea87d2894a70

From 755d0d1971bc489093ab2e541db759352f4506eb Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Thu, 17 Oct 2024 15:24:55 -0700
Subject: [PATCH 05/18] Fix test failure on anyon platform

Signed-off-by: Anna Gringauze <agringauze@nvidia.com>
---
 runtime/common/BaseRemoteRESTQPU.h | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/runtime/common/BaseRemoteRESTQPU.h b/runtime/common/BaseRemoteRESTQPU.h
index 32a097cfc5..989649d9fa 100644
--- a/runtime/common/BaseRemoteRESTQPU.h
+++ b/runtime/common/BaseRemoteRESTQPU.h
@@ -18,6 +18,7 @@
 #include "common/RuntimeMLIR.h"
 #include "cudaq.h"
 #include "cudaq/Frontend/nvqpp/AttributeNames.h"
+#include "cudaq/Optimizer/Builder/Intrinsics.h"
 #include "cudaq/Optimizer/Builder/Runtime.h"
 #include "cudaq/Optimizer/CodeGen/OpenQASMEmitter.h"
 #include "cudaq/Optimizer/CodeGen/Passes.h"
@@ -398,8 +399,13 @@ class BaseRemoteRESTQPU : public cudaq::QPU {
 
     for (auto &op : m_module.getOps()) {
       if (auto funcOp = dyn_cast<mlir::func::FuncOp>(op)) {
-        // Add function definitions for runtime functions.
-        if (funcOp.getBody().empty())
+        // Add function definitions for runtime functions that must
+        // be removed after synthesis in cleanup ops.
+        if (funcOp.getBody().empty() &&
+            (funcOp.getName().equals(cudaq::getNumQubitsFromCudaqState) ||
+             funcOp.getName().equals(cudaq::createCudaqStateFromDataFP64) ||
+             funcOp.getName().equals(cudaq::createCudaqStateFromDataFP32) ||
+             funcOp.getName().equals(cudaq::getCudaqState)))
           moduleOp.push_back(funcOp.clone());
       }
       // Add globals defined in the module.

From 382bc99adda74bcae5cab1965096dac12d6e2b37 Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Thu, 17 Oct 2024 15:40:34 -0700
Subject: [PATCH 06/18] Make StateInitialization a funcOp pass

Signed-off-by: Anna Gringauze <agringauze@nvidia.com>
---
 include/cudaq/Optimizer/Transforms/Passes.td  |  2 +-
 .../Transforms/StateInitialization.cpp        | 25 ++++++-------------
 runtime/common/BaseRemoteRESTQPU.h            |  2 +-
 3 files changed, 10 insertions(+), 19 deletions(-)

diff --git a/include/cudaq/Optimizer/Transforms/Passes.td b/include/cudaq/Optimizer/Transforms/Passes.td
index 66eb4cfcb0..70ae6c7138 100644
--- a/include/cudaq/Optimizer/Transforms/Passes.td
+++ b/include/cudaq/Optimizer/Transforms/Passes.td
@@ -779,7 +779,7 @@ def DeleteStates : Pass<"delete-states", "mlir::ModuleOp"> {
   }];
 }
 
-def StateInitialization : Pass<"state-initialization", "mlir::ModuleOp"> {
+def StateInitialization : Pass<"state-initialization", "mlir::func::FuncOp"> {
   let summary =
     "Replace `quake.init_state` instructions with call to the kernel generating the state";
   let description = [{
diff --git a/lib/Optimizer/Transforms/StateInitialization.cpp b/lib/Optimizer/Transforms/StateInitialization.cpp
index 3a122f02a7..f641eb04f6 100644
--- a/lib/Optimizer/Transforms/StateInitialization.cpp
+++ b/lib/Optimizer/Transforms/StateInitialization.cpp
@@ -121,26 +121,17 @@ class StateInitializationPass
 
   void runOnOperation() override {
     auto *ctx = &getContext();
-    auto module = getOperation();
-    for (Operation &op : *module.getBody()) {
-      auto func = dyn_cast<func::FuncOp>(op);
-      if (!func)
-        continue;
+    auto func = getOperation();
+    RewritePatternSet patterns(ctx);
+    patterns.insert<StateInitPattern>(ctx);
 
-      std::string funcName = func.getName().str();
-      RewritePatternSet patterns(ctx);
-      patterns.insert<StateInitPattern>(ctx);
+    LLVM_DEBUG(llvm::dbgs() << "Before state initialization: " << func << '\n');
 
-      LLVM_DEBUG(llvm::dbgs()
-                 << "Before state initialization: " << func << '\n');
+    if (failed(applyPatternsAndFoldGreedily(func.getOperation(),
+                                            std::move(patterns))))
+      signalPassFailure();
 
-      if (failed(applyPatternsAndFoldGreedily(func.getOperation(),
-                                              std::move(patterns))))
-        signalPassFailure();
-
-      LLVM_DEBUG(llvm::dbgs()
-                 << "After state initialization: " << func << '\n');
-    }
+    LLVM_DEBUG(llvm::dbgs() << "After state initialization: " << func << '\n');
   }
 };
 } // namespace
diff --git a/runtime/common/BaseRemoteRESTQPU.h b/runtime/common/BaseRemoteRESTQPU.h
index 989649d9fa..a37d5bf706 100644
--- a/runtime/common/BaseRemoteRESTQPU.h
+++ b/runtime/common/BaseRemoteRESTQPU.h
@@ -447,7 +447,7 @@ class BaseRemoteRESTQPU : public cudaq::QPU {
                                                    substs.end()}));
         pm.addPass(mlir::createCanonicalizerPass());
         pm.addPass(opt::createDeleteStates());
-        pm.addPass(opt::createStateInitialization());
+        pm.addNestedPass<mlir::func::FuncOp>(opt::createStateInitialization());
       } else if (updatedArgs) {
         cudaq::info("Run Quake Synth.\n");
         pm.addPass(cudaq::opt::createQuakeSynthesizer(kernelName, updatedArgs));

From d3a05d4432d41acaae68fea86eeac6f3e34d4cc7 Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Fri, 18 Oct 2024 11:09:12 -0700
Subject: [PATCH 07/18] Fix issues and tests for the rest of quantum
 architectures

Signed-off-by: Anna Gringauze <agringauze@nvidia.com>
---
 include/cudaq/Optimizer/Transforms/Passes.td  |  11 ++
 lib/Optimizer/Transforms/CMakeLists.txt       |   1 +
 .../Transforms/StateInitialization.cpp        |  16 +--
 lib/Optimizer/Transforms/StateValidation.cpp  | 130 ++++++++++++++++++
 runtime/common/BaseRemoteRESTQPU.h            |   2 +
 .../default/rest/helpers/anyon/anyon.yml      |   2 +
 .../default/rest/helpers/ionq/ionq.yml        |   2 +
 .../platform/default/rest/helpers/iqm/iqm.yml |   2 +
 .../platform/default/rest/helpers/oqc/oqc.yml |   2 +
 .../execution/qvector_init_from_state.cpp     |  17 ++-
 10 files changed, 174 insertions(+), 11 deletions(-)
 create mode 100644 lib/Optimizer/Transforms/StateValidation.cpp

diff --git a/include/cudaq/Optimizer/Transforms/Passes.td b/include/cudaq/Optimizer/Transforms/Passes.td
index 70ae6c7138..aa8f038c41 100644
--- a/include/cudaq/Optimizer/Transforms/Passes.td
+++ b/include/cudaq/Optimizer/Transforms/Passes.td
@@ -866,6 +866,17 @@ def StatePreparation : Pass<"state-prep", "mlir::ModuleOp"> {
   ];
 }
 
+def StateValidation : Pass<"state-validation", "mlir::ModuleOp"> {
+  let summary =
+    "Make sure MLIR is valid after synthesis for quantum devices";
+  let description = [{
+    Argument synthesis should replace all `quake.init` from state instructions
+    and calls to state-related runtime functions.
+    Make sure none of them left, and remove definitions for state-related
+    runtime functions.
+  }];
+}
+
 def PromoteRefToVeqAlloc : Pass<"promote-qubit-allocation"> {
   let summary = "Promote single qubit allocations.";
   let description = [{
diff --git a/lib/Optimizer/Transforms/CMakeLists.txt b/lib/Optimizer/Transforms/CMakeLists.txt
index f107d78bde..7eae39e35f 100644
--- a/lib/Optimizer/Transforms/CMakeLists.txt
+++ b/lib/Optimizer/Transforms/CMakeLists.txt
@@ -52,6 +52,7 @@ add_cudaq_library(OptTransforms
   RegToMem.cpp
   StateInitialization.cpp
   StatePreparation.cpp
+  StateValidation.cpp
   UnitarySynthesis.cpp
   WiresToWiresets.cpp
 
diff --git a/lib/Optimizer/Transforms/StateInitialization.cpp b/lib/Optimizer/Transforms/StateInitialization.cpp
index f641eb04f6..c46273b747 100644
--- a/lib/Optimizer/Transforms/StateInitialization.cpp
+++ b/lib/Optimizer/Transforms/StateInitialization.cpp
@@ -30,10 +30,10 @@ using namespace mlir;
 
 namespace {
 
-static bool isCall(Operation *callOp, std::vector<const char *> &&names) {
-  if (callOp) {
-    if (auto createStateCall = dyn_cast<func::CallOp>(callOp)) {
-      if (auto calleeAttr = createStateCall.getCalleeAttr()) {
+static bool isCall(Operation *op, std::vector<const char *> &&names) {
+  if (op) {
+    if (auto callOp = dyn_cast<func::CallOp>(op)) {
+      if (auto calleeAttr = callOp.getCalleeAttr()) {
         auto funcName = calleeAttr.getValue().str();
         if (std::find(names.begin(), names.end(), funcName) != names.end())
           return true;
@@ -43,12 +43,12 @@ static bool isCall(Operation *callOp, std::vector<const char *> &&names) {
   return false;
 }
 
-static bool isGetStateCall(Operation *callOp) {
-  return isCall(callOp, {cudaq::getCudaqState});
+static bool isGetStateCall(Operation *op) {
+  return isCall(op, {cudaq::getCudaqState});
 }
 
-static bool isNumberOfQubitsCall(Operation *callOp) {
-  return isCall(callOp, {cudaq::getNumQubitsFromCudaqState});
+static bool isNumberOfQubitsCall(Operation *op) {
+  return isCall(op, {cudaq::getNumQubitsFromCudaqState});
 }
 
 // clang-format off
diff --git a/lib/Optimizer/Transforms/StateValidation.cpp b/lib/Optimizer/Transforms/StateValidation.cpp
new file mode 100644
index 0000000000..be20dd4ede
--- /dev/null
+++ b/lib/Optimizer/Transforms/StateValidation.cpp
@@ -0,0 +1,130 @@
+/*******************************************************************************
+ * Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates.                  *
+ * All rights reserved.                                                        *
+ *                                                                             *
+ * This source code and the accompanying materials are made available under    *
+ * the terms of the Apache License 2.0 which accompanies this distribution.    *
+ ******************************************************************************/
+
+#include "PassDetails.h"
+#include "cudaq/Optimizer/Builder/Intrinsics.h"
+#include "cudaq/Optimizer/Dialect/CC/CCOps.h"
+#include "cudaq/Optimizer/Dialect/Quake/QuakeOps.h"
+#include "cudaq/Optimizer/Transforms/Passes.h"
+#include "mlir/Dialect/Complex/IR/Complex.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/IR/BuiltinOps.h"
+#include "mlir/IR/PatternMatch.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+#include "mlir/Transforms/Passes.h"
+
+namespace cudaq::opt {
+#define GEN_PASS_DEF_STATEVALIDATION
+#include "cudaq/Optimizer/Transforms/Passes.h.inc"
+} // namespace cudaq::opt
+
+#define DEBUG_TYPE "state-validation"
+
+using namespace mlir;
+
+
+/// Validate that quantum code does not contain runtime calls and remove runtime function definitions.
+namespace {
+
+static bool isRuntimeStateCallName(llvm::StringRef funcName) {
+  static std::vector<const char *> names = {
+    cudaq::getCudaqState,
+    cudaq::createCudaqStateFromDataFP32,
+    cudaq::createCudaqStateFromDataFP64,
+    cudaq::deleteCudaqState,
+    cudaq::getNumQubitsFromCudaqState
+  };
+  if (std::find(names.begin(), names.end(), funcName) != names.end())
+      return true; 
+  return false;
+}
+
+static bool isRuntimeStateCall(Operation *callOp) {
+  if (callOp) {
+    if (auto call = dyn_cast<func::CallOp>(callOp)) {
+      if (auto calleeAttr = call.getCalleeAttr()) {
+        auto funcName = calleeAttr.getValue().str();
+        if (isRuntimeStateCallName(funcName))
+          return true;
+      }
+    }
+  }
+  return false;
+}
+
+class ValidateStateCallPattern : public OpRewritePattern<func::CallOp> {
+public:
+  using OpRewritePattern::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(func::CallOp callOp,
+                                PatternRewriter &rewriter) const override {
+    if (isRuntimeStateCall(callOp)) {
+      auto name = callOp.getCalleeAttr().getValue();
+      callOp.emitError("Unsupported call for quantum platform: " + name);
+    }
+    return failure();
+  }
+};
+
+class ValidateStateInitPattern : public OpRewritePattern<quake::InitializeStateOp> {
+public:
+  using OpRewritePattern::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(quake::InitializeStateOp initState,
+                                PatternRewriter &rewriter) const override {
+    auto stateOp = initState.getOperand(1);
+    if (isa<cudaq::cc::StateType>(stateOp.getType())) 
+      initState.emitError("Synthesis did not remove `quake.init_state <state>` instruction");
+    
+    return failure();
+  }
+};
+
+
+class StateValidationPass
+    : public cudaq::opt::impl::StateValidationBase<StateValidationPass> {
+protected:
+public:
+  using StateValidationBase::StateValidationBase;
+
+  mlir::ModuleOp getModule() { return getOperation(); }
+
+  void runOnOperation() override final {
+    auto *ctx = &getContext();
+    auto module = getModule();
+    SmallVector<Operation *> toErase;
+
+    for (Operation &op : *module.getBody()) {
+      auto func = dyn_cast<func::FuncOp>(op);
+      if (!func)
+        continue;
+
+      RewritePatternSet patterns(ctx);
+      patterns.insert<ValidateStateCallPattern, ValidateStateInitPattern>(ctx);
+
+      LLVM_DEBUG(llvm::dbgs()
+                 << "Before state validation: " << func << '\n');
+
+      if (failed(applyPatternsAndFoldGreedily(func.getOperation(),
+                                              std::move(patterns))))
+        signalPassFailure();
+
+      // Delete runtime function definitions.
+      if (func.getBody().empty() && isRuntimeStateCallName(func.getName()))
+        toErase.push_back(func);
+
+      LLVM_DEBUG(llvm::dbgs()
+                 << "After state validation: " << func << '\n');
+    }
+
+    for (auto *op : toErase)
+      op->erase();
+  }
+};
+
+} // namespace
diff --git a/runtime/common/BaseRemoteRESTQPU.h b/runtime/common/BaseRemoteRESTQPU.h
index a37d5bf706..0eab2c7fba 100644
--- a/runtime/common/BaseRemoteRESTQPU.h
+++ b/runtime/common/BaseRemoteRESTQPU.h
@@ -405,6 +405,7 @@ class BaseRemoteRESTQPU : public cudaq::QPU {
             (funcOp.getName().equals(cudaq::getNumQubitsFromCudaqState) ||
              funcOp.getName().equals(cudaq::createCudaqStateFromDataFP64) ||
              funcOp.getName().equals(cudaq::createCudaqStateFromDataFP32) ||
+             funcOp.getName().equals(cudaq::deleteCudaqState) ||
              funcOp.getName().equals(cudaq::getCudaqState)))
           moduleOp.push_back(funcOp.clone());
       }
@@ -448,6 +449,7 @@ class BaseRemoteRESTQPU : public cudaq::QPU {
         pm.addPass(mlir::createCanonicalizerPass());
         pm.addPass(opt::createDeleteStates());
         pm.addNestedPass<mlir::func::FuncOp>(opt::createStateInitialization());
+        pm.addPass(opt::createStateValidation());
       } else if (updatedArgs) {
         cudaq::info("Run Quake Synth.\n");
         pm.addPass(cudaq::opt::createQuakeSynthesizer(kernelName, updatedArgs));
diff --git a/runtime/cudaq/platform/default/rest/helpers/anyon/anyon.yml b/runtime/cudaq/platform/default/rest/helpers/anyon/anyon.yml
index 3ecb49f302..e0fb208f9c 100644
--- a/runtime/cudaq/platform/default/rest/helpers/anyon/anyon.yml
+++ b/runtime/cudaq/platform/default/rest/helpers/anyon/anyon.yml
@@ -13,6 +13,8 @@ config:
   platform-qpu: remote_rest
   # Tell NVQ++ to generate glue code to set the target backend name
   gen-target-backend: true
+  # Add preprocessor defines to compilation
+  preprocessor-defines: ["-D CUDAQ_QUANTUM_DEVICE"]
   # Add the rest-qpu library to the link list
   link-libs: ["-lcudaq-rest-qpu"]
   # Define the lowering pipeline
diff --git a/runtime/cudaq/platform/default/rest/helpers/ionq/ionq.yml b/runtime/cudaq/platform/default/rest/helpers/ionq/ionq.yml
index 238d4c3316..802cdc2e0a 100644
--- a/runtime/cudaq/platform/default/rest/helpers/ionq/ionq.yml
+++ b/runtime/cudaq/platform/default/rest/helpers/ionq/ionq.yml
@@ -13,6 +13,8 @@ config:
   platform-qpu: remote_rest
   # Tell NVQ++ to generate glue code to set the target backend name
   gen-target-backend: true
+  # Add preprocessor defines to compilation
+  preprocessor-defines: ["-D CUDAQ_QUANTUM_DEVICE"]
   # Add the rest-qpu library to the link list
   link-libs: ["-lcudaq-rest-qpu"]
   # Define the lowering pipeline
diff --git a/runtime/cudaq/platform/default/rest/helpers/iqm/iqm.yml b/runtime/cudaq/platform/default/rest/helpers/iqm/iqm.yml
index 0e90a1e2af..2c928bda87 100644
--- a/runtime/cudaq/platform/default/rest/helpers/iqm/iqm.yml
+++ b/runtime/cudaq/platform/default/rest/helpers/iqm/iqm.yml
@@ -13,6 +13,8 @@ config:
   platform-qpu: remote_rest
   # Tell NVQ++ to generate glue code to set the target backend name
   gen-target-backend: true
+  # Add preprocessor defines to compilation
+  preprocessor-defines: ["-D CUDAQ_QUANTUM_DEVICE"]
   # Add the rest-qpu library to the link list
   link-libs: ["-lcudaq-rest-qpu"]
   # Define the lowering pipeline
diff --git a/runtime/cudaq/platform/default/rest/helpers/oqc/oqc.yml b/runtime/cudaq/platform/default/rest/helpers/oqc/oqc.yml
index 6a8a46c066..cde626676c 100644
--- a/runtime/cudaq/platform/default/rest/helpers/oqc/oqc.yml
+++ b/runtime/cudaq/platform/default/rest/helpers/oqc/oqc.yml
@@ -13,6 +13,8 @@ config:
   platform-qpu: remote_rest
   # Tell NVQ++ to generate glue code to set the target backend name
   gen-target-backend: true
+  # Add preprocessor defines to compilation
+  preprocessor-defines: ["-D CUDAQ_QUANTUM_DEVICE"]
   # Add the rest-qpu library to the link list
   link-libs: ["-lcudaq-rest-qpu"]
   # Define the lowering pipeline
diff --git a/targettests/execution/qvector_init_from_state.cpp b/targettests/execution/qvector_init_from_state.cpp
index afaba5a2c0..06c97b1e6a 100644
--- a/targettests/execution/qvector_init_from_state.cpp
+++ b/targettests/execution/qvector_init_from_state.cpp
@@ -7,8 +7,16 @@
  ******************************************************************************/
 
 // clang-format off
-// RUN: nvq++ %cpp_std --enable-mlir                                     %s -o %t  && %t | FileCheck %s
-// RUN: nvq++ %cpp_std --target quantinuum --emulate -fkernel-exec-kind=2 %s -o %t && %t | FileCheck %s
+// Simulators
+// RUN: nvq++ %cpp_std --enable-mlir  %s                              -o %t && %t | FileCheck %s
+
+// Quantum emulators
+// RUN: nvq++ %cpp_std --target quantinuum               --emulate %s -o %t && %t | FileCheck %s
+// RUN: nvq++ %cpp_std --target ionq                     --emulate %s -o %t && %t | FileCheck %s
+// 2 different IQM machines for 2 different topologies
+// RUN: nvq++ %cpp_std --target iqm --iqm-machine Adonis --emulate %s -o %t && %t | FileCheck %s
+// RUN: nvq++ %cpp_std --target iqm --iqm-machine Apollo --emulate %s -o %t && %t | FileCheck %s
+// RUN: nvq++ %cpp_std --target oqc                      --emulate %s -o %t && %t | FileCheck %s
 // clang-format on
 
 #include <cudaq.h>
@@ -91,7 +99,10 @@ int main() {
     std::cout
         << "Passing large state from another kernel as argument (kernel mode)"
         << std::endl;
-    auto largeState = cudaq::get_state(test_init_state{}, 14);
+    // TODO: State larger than 5 qubits fails on iqm machines with Adonis architecture
+    // TODO: State larger than 8 qubits fails on oqc and anyon
+    // Up to 14 qubits works with quantinuum and ionq
+    auto largeState = cudaq::get_state(test_init_state{}, 5);
     auto counts = cudaq::sample(test_state_param{}, &largeState);
     printCounts(counts);
   }

From 51ef054c14df334252e389e2244d24974486661e Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Fri, 18 Oct 2024 15:48:39 -0700
Subject: [PATCH 08/18] Fix failing quantinuum state prep tests

Signed-off-by: Anna Gringauze <agringauze@nvidia.com>
---
 .../Transforms/StateInitialization.cpp        | 68 ++++++++++---------
 lib/Optimizer/Transforms/StateValidation.cpp  |  7 +-
 2 files changed, 40 insertions(+), 35 deletions(-)

diff --git a/lib/Optimizer/Transforms/StateInitialization.cpp b/lib/Optimizer/Transforms/StateInitialization.cpp
index c46273b747..0ed6867670 100644
--- a/lib/Optimizer/Transforms/StateInitialization.cpp
+++ b/lib/Optimizer/Transforms/StateInitialization.cpp
@@ -73,39 +73,43 @@ class StateInitPattern : public OpRewritePattern<quake::InitializeStateOp> {
                                 PatternRewriter &rewriter) const override {
     auto loc = initState.getLoc();
     auto allocaOp = initState.getOperand(0).getDefiningOp();
-    auto getStateOp = initState.getOperand(1).getDefiningOp();
-    auto numOfQubits = allocaOp->getOperand(0).getDefiningOp();
-
-    if (isGetStateCall(getStateOp)) {
-      auto calleeNameOp = getStateOp->getOperand(0);
-      if (auto cast =
-              dyn_cast<cudaq::cc::CastOp>(calleeNameOp.getDefiningOp())) {
-        calleeNameOp = cast.getOperand();
-
-        if (auto literal = dyn_cast<cudaq::cc::CreateStringLiteralOp>(
-                calleeNameOp.getDefiningOp())) {
-          auto calleeName = literal.getStringLiteral();
-
-          Value result =
-              rewriter
-                  .create<func::CallOp>(loc, initState.getType(), calleeName,
-                                        mlir::ValueRange{})
-                  .getResult(0);
-          rewriter.replaceAllUsesWith(initState, result);
-          initState.erase();
-          allocaOp->dropAllUses();
-          rewriter.eraseOp(allocaOp);
-          if (isNumberOfQubitsCall(numOfQubits)) {
-            numOfQubits->dropAllUses();
-            rewriter.eraseOp(numOfQubits);
+    auto stateOp = initState.getOperand(1);
+
+    if (isa<cudaq::cc::StateType>(stateOp.getType())) {
+      auto getStateOp = stateOp.getDefiningOp();
+      auto numOfQubits = allocaOp->getOperand(0).getDefiningOp();
+
+      if (isGetStateCall(getStateOp)) {
+        auto calleeNameOp = getStateOp->getOperand(0);
+        if (auto cast =
+                dyn_cast<cudaq::cc::CastOp>(calleeNameOp.getDefiningOp())) {
+          calleeNameOp = cast.getOperand();
+
+          if (auto literal = dyn_cast<cudaq::cc::CreateStringLiteralOp>(
+                  calleeNameOp.getDefiningOp())) {
+            auto calleeName = literal.getStringLiteral();
+
+            Value result =
+                rewriter
+                    .create<func::CallOp>(loc, initState.getType(), calleeName,
+                                          mlir::ValueRange{})
+                    .getResult(0);
+            rewriter.replaceAllUsesWith(initState, result);
+            initState.erase();
+            allocaOp->dropAllUses();
+            rewriter.eraseOp(allocaOp);
+            if (isNumberOfQubitsCall(numOfQubits)) {
+              numOfQubits->dropAllUses();
+              rewriter.eraseOp(numOfQubits);
+            }
+            getStateOp->dropAllUses();
+            rewriter.eraseOp(getStateOp);
+            cast->dropAllUses();
+            rewriter.eraseOp(cast);
+            literal->dropAllUses();
+            rewriter.eraseOp(literal);
+            return success();
           }
-          getStateOp->dropAllUses();
-          rewriter.eraseOp(getStateOp);
-          cast->dropAllUses();
-          rewriter.eraseOp(cast);
-          literal->dropAllUses();
-          rewriter.eraseOp(literal);
-          return success();
         }
       }
     }
diff --git a/lib/Optimizer/Transforms/StateValidation.cpp b/lib/Optimizer/Transforms/StateValidation.cpp
index f0b25cdc10..c9d301740c 100644
--- a/lib/Optimizer/Transforms/StateValidation.cpp
+++ b/lib/Optimizer/Transforms/StateValidation.cpp
@@ -62,7 +62,8 @@ class ValidateStateCallPattern : public OpRewritePattern<func::CallOp> {
                                 PatternRewriter &rewriter) const override {
     if (isRuntimeStateCall(callOp)) {
       auto name = callOp.getCalleeAttr().getValue();
-      callOp.emitError("Unsupported call for quantum platform: " + name);
+      callOp.emitError(
+          "Synthesis did not remove func call for quantum platform: " + name);
     }
     return failure();
   }
@@ -77,8 +78,8 @@ class ValidateStateInitPattern
                                 PatternRewriter &rewriter) const override {
     auto stateOp = initState.getOperand(1);
     if (isa<cudaq::cc::StateType>(stateOp.getType()))
-      initState.emitError(
-          "Synthesis did not remove `quake.init_state <state>` instruction");
+      initState.emitError("Synthesis did not remove `quake.init_state <veq> "
+                          "<state>` instruction");
 
     return failure();
   }

From a7f5387e10c181704ff36c37504fea72ea2e3486 Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Mon, 21 Oct 2024 15:11:34 -0700
Subject: [PATCH 09/18] Address CR comments

Signed-off-by: Anna Gringauze <agringauze@nvidia.com>
---
 include/cudaq/Optimizer/Transforms/Passes.td  |  17 +--
 lib/Optimizer/Transforms/CMakeLists.txt       |   3 +-
 ...ization.cpp => ReplaceStateWithKernel.cpp} |  98 +++++++-------
 lib/Optimizer/Transforms/StateValidation.cpp  | 127 ------------------
 runtime/common/BaseRemoteRESTQPU.h            |  10 +-
 ...init.qke => replace_state_with_kernel.qke} |   2 +-
 6 files changed, 64 insertions(+), 193 deletions(-)
 rename lib/Optimizer/Transforms/{StateInitialization.cpp => ReplaceStateWithKernel.cpp} (56%)
 delete mode 100644 lib/Optimizer/Transforms/StateValidation.cpp
 rename test/Quake/{state_init.qke => replace_state_with_kernel.qke} (96%)

diff --git a/include/cudaq/Optimizer/Transforms/Passes.td b/include/cudaq/Optimizer/Transforms/Passes.td
index aa8f038c41..ef446a3812 100644
--- a/include/cudaq/Optimizer/Transforms/Passes.td
+++ b/include/cudaq/Optimizer/Transforms/Passes.td
@@ -779,7 +779,7 @@ def DeleteStates : Pass<"delete-states", "mlir::ModuleOp"> {
   }];
 }
 
-def StateInitialization : Pass<"state-initialization", "mlir::func::FuncOp"> {
+def ReplaceStateWithKernel : Pass<"replace-state-with-kernel", "mlir::func::FuncOp"> {
   let summary =
     "Replace `quake.init_state` instructions with call to the kernel generating the state";
   let description = [{
@@ -794,7 +794,7 @@ def StateInitialization : Pass<"state-initialization", "mlir::func::FuncOp"> {
 
     For example:
 
-    Before StateInitialization (state-initialization):
+    Before ReplaceStateWithKernel (replace-state-with-kernel):
     ```
     func.func @foo() attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {
       %0 = cc.string_literal "__nvqpp__mlirgen__test_init_state.modified_0" : !cc.ptr<!cc.array<i8 x 45>>
@@ -807,7 +807,7 @@ def StateInitialization : Pass<"state-initialization", "mlir::func::FuncOp"> {
     }
     ```
 
-    After StateInitialization (state-initialization):
+    After ReplaceStateWithKernel (replace-state-with-kernel):
     ```
     func.func @foo() attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {
       %5 = call @__nvqpp__mlirgen__test_init_state.modified_0() : () -> !quake.veq<?>
@@ -866,17 +866,6 @@ def StatePreparation : Pass<"state-prep", "mlir::ModuleOp"> {
   ];
 }
 
-def StateValidation : Pass<"state-validation", "mlir::ModuleOp"> {
-  let summary =
-    "Make sure MLIR is valid after synthesis for quantum devices";
-  let description = [{
-    Argument synthesis should replace all `quake.init` from state instructions
-    and calls to state-related runtime functions.
-    Make sure none of them left, and remove definitions for state-related
-    runtime functions.
-  }];
-}
-
 def PromoteRefToVeqAlloc : Pass<"promote-qubit-allocation"> {
   let summary = "Promote single qubit allocations.";
   let description = [{
diff --git a/lib/Optimizer/Transforms/CMakeLists.txt b/lib/Optimizer/Transforms/CMakeLists.txt
index 7eae39e35f..153e095e1f 100644
--- a/lib/Optimizer/Transforms/CMakeLists.txt
+++ b/lib/Optimizer/Transforms/CMakeLists.txt
@@ -50,9 +50,8 @@ add_cudaq_library(OptTransforms
   QuakeSynthesizer.cpp
   RefToVeqAlloc.cpp
   RegToMem.cpp
-  StateInitialization.cpp
+  ReplaceStateWithKernel.cpp
   StatePreparation.cpp
-  StateValidation.cpp
   UnitarySynthesis.cpp
   WiresToWiresets.cpp
 
diff --git a/lib/Optimizer/Transforms/StateInitialization.cpp b/lib/Optimizer/Transforms/ReplaceStateWithKernel.cpp
similarity index 56%
rename from lib/Optimizer/Transforms/StateInitialization.cpp
rename to lib/Optimizer/Transforms/ReplaceStateWithKernel.cpp
index 0ed6867670..d588f09216 100644
--- a/lib/Optimizer/Transforms/StateInitialization.cpp
+++ b/lib/Optimizer/Transforms/ReplaceStateWithKernel.cpp
@@ -20,11 +20,11 @@
 #include <span>
 
 namespace cudaq::opt {
-#define GEN_PASS_DEF_STATEINITIALIZATION
+#define GEN_PASS_DEF_REPLACESTATEWITHKERNEL
 #include "cudaq/Optimizer/Transforms/Passes.h.inc"
 } // namespace cudaq::opt
 
-#define DEBUG_TYPE "state-initialization"
+#define DEBUG_TYPE "replace-state-with-kernel"
 
 using namespace mlir;
 
@@ -52,7 +52,9 @@ static bool isNumberOfQubitsCall(Operation *op) {
 }
 
 // clang-format off
-/// Replace `quake.init_state` by a call to a (modified) kernel that produced the state.
+/// Replace `quake.init_state` by a call to a (modified) kernel that produced
+/// the state.
+///
 /// ```
 ///  %0 = cc.string_literal "callee.modified_0" : !cc.ptr<!cc.array<i8 x 27>>
 ///  %1 = cc.cast %0 : (!cc.ptr<!cc.array<i8 x 27>>) -> !cc.ptr<i8>
@@ -65,50 +67,54 @@ static bool isNumberOfQubitsCall(Operation *op) {
 ///  %5 = call @callee.modified_0() : () -> !quake.veq<?>
 /// ```
 // clang-format on
-class StateInitPattern : public OpRewritePattern<quake::InitializeStateOp> {
+class ReplaceStateWithKernelPattern : public OpRewritePattern<quake::InitializeStateOp> {
 public:
   using OpRewritePattern::OpRewritePattern;
 
   LogicalResult matchAndRewrite(quake::InitializeStateOp initState,
                                 PatternRewriter &rewriter) const override {
-    auto loc = initState.getLoc();
-    auto allocaOp = initState.getOperand(0).getDefiningOp();
+    //auto loc = initState.getLoc();
+    auto *alloca = initState.getOperand(0).getDefiningOp();
     auto stateOp = initState.getOperand(1);
 
-    if (isa<cudaq::cc::StateType>(stateOp.getType())) {
-      auto getStateOp = stateOp.getDefiningOp();
-      auto numOfQubits = allocaOp->getOperand(0).getDefiningOp();
-
-      if (isGetStateCall(getStateOp)) {
-        auto calleeNameOp = getStateOp->getOperand(0);
-        if (auto cast =
-                dyn_cast<cudaq::cc::CastOp>(calleeNameOp.getDefiningOp())) {
-          calleeNameOp = cast.getOperand();
-
-          if (auto literal = dyn_cast<cudaq::cc::CreateStringLiteralOp>(
-                  calleeNameOp.getDefiningOp())) {
-            auto calleeName = literal.getStringLiteral();
-
-            Value result =
-                rewriter
-                    .create<func::CallOp>(loc, initState.getType(), calleeName,
-                                          mlir::ValueRange{})
-                    .getResult(0);
-            rewriter.replaceAllUsesWith(initState, result);
-            initState.erase();
-            allocaOp->dropAllUses();
-            rewriter.eraseOp(allocaOp);
-            if (isNumberOfQubitsCall(numOfQubits)) {
-              numOfQubits->dropAllUses();
-              rewriter.eraseOp(numOfQubits);
+    if (auto ptrTy = dyn_cast<cudaq::cc::PointerType>(stateOp.getType())) {
+      if (isa<cudaq::cc::StateType>(ptrTy.getElementType())) {
+        auto *getState = stateOp.getDefiningOp();
+        auto *numOfQubits = alloca->getOperand(0).getDefiningOp();
+
+        if (isGetStateCall(getState)) {
+          auto calleeNameOp = getState->getOperand(0);
+          if (auto cast = calleeNameOp.getDefiningOp<cudaq::cc::CastOp>()) {
+            calleeNameOp = cast.getOperand();
+
+            if (auto literal = 
+                    calleeNameOp.getDefiningOp<cudaq::cc::CreateStringLiteralOp>()) {
+              auto calleeName = literal.getStringLiteral();
+              rewriter.replaceOpWithNewOp<func::CallOp>(initState, initState.getType(), calleeName,
+                                            mlir::ValueRange{});
+
+              if (alloca->getUses().empty()) 
+                rewriter.eraseOp(alloca);
+              else  {
+                alloca->emitError("Failed to remove `quake.alloca` in state synthesis");
+                return failure();
+              }
+              if (isNumberOfQubitsCall(numOfQubits)) {
+                if (numOfQubits->getUses().empty())
+                  rewriter.eraseOp(numOfQubits);
+                else  {
+                  numOfQubits->emitError("Failed to remove runtime call to get number of qubits in state synthesis");
+                  return failure();
+                }
+              }
+              if (getState->getUses().empty())
+                rewriter.eraseOp(getState);
+              else  {
+                alloca->emitError("Failed to remove runtime call to get state in state synthesis");
+                return failure();
+              }
+              return success();
             }
-            getStateOp->dropAllUses();
-            rewriter.eraseOp(getStateOp);
-            cast->dropAllUses();
-            rewriter.eraseOp(cast);
-            literal->dropAllUses();
-            rewriter.eraseOp(literal);
-            return success();
           }
         }
       }
@@ -117,25 +123,25 @@ class StateInitPattern : public OpRewritePattern<quake::InitializeStateOp> {
   }
 };
 
-class StateInitializationPass
-    : public cudaq::opt::impl::StateInitializationBase<
-          StateInitializationPass> {
+class ReplaceStateWithKernelPass
+    : public cudaq::opt::impl::ReplaceStateWithKernelBase<
+          ReplaceStateWithKernelPass> {
 public:
-  using StateInitializationBase::StateInitializationBase;
+  using ReplaceStateWithKernelBase::ReplaceStateWithKernelBase;
 
   void runOnOperation() override {
     auto *ctx = &getContext();
     auto func = getOperation();
     RewritePatternSet patterns(ctx);
-    patterns.insert<StateInitPattern>(ctx);
+    patterns.insert<ReplaceStateWithKernelPattern>(ctx);
 
-    LLVM_DEBUG(llvm::dbgs() << "Before state initialization: " << func << '\n');
+    LLVM_DEBUG(llvm::dbgs() << "Before replace state with kernel: " << func << '\n');
 
     if (failed(applyPatternsAndFoldGreedily(func.getOperation(),
                                             std::move(patterns))))
       signalPassFailure();
 
-    LLVM_DEBUG(llvm::dbgs() << "After state initialization: " << func << '\n');
+    LLVM_DEBUG(llvm::dbgs() << "After replace state with kerenl: " << func << '\n');
   }
 };
 } // namespace
diff --git a/lib/Optimizer/Transforms/StateValidation.cpp b/lib/Optimizer/Transforms/StateValidation.cpp
deleted file mode 100644
index c9d301740c..0000000000
--- a/lib/Optimizer/Transforms/StateValidation.cpp
+++ /dev/null
@@ -1,127 +0,0 @@
-/*******************************************************************************
- * Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates.                  *
- * All rights reserved.                                                        *
- *                                                                             *
- * This source code and the accompanying materials are made available under    *
- * the terms of the Apache License 2.0 which accompanies this distribution.    *
- ******************************************************************************/
-
-#include "PassDetails.h"
-#include "cudaq/Optimizer/Builder/Intrinsics.h"
-#include "cudaq/Optimizer/Dialect/CC/CCOps.h"
-#include "cudaq/Optimizer/Dialect/Quake/QuakeOps.h"
-#include "cudaq/Optimizer/Transforms/Passes.h"
-#include "mlir/Dialect/Complex/IR/Complex.h"
-#include "mlir/Dialect/Func/IR/FuncOps.h"
-#include "mlir/IR/BuiltinOps.h"
-#include "mlir/IR/PatternMatch.h"
-#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
-#include "mlir/Transforms/Passes.h"
-
-namespace cudaq::opt {
-#define GEN_PASS_DEF_STATEVALIDATION
-#include "cudaq/Optimizer/Transforms/Passes.h.inc"
-} // namespace cudaq::opt
-
-#define DEBUG_TYPE "state-validation"
-
-using namespace mlir;
-
-/// Validate that quantum code does not contain runtime calls and remove runtime
-/// function definitions.
-namespace {
-
-static bool isRuntimeStateCallName(llvm::StringRef funcName) {
-  static std::vector<const char *> names = {
-      cudaq::getCudaqState, cudaq::createCudaqStateFromDataFP32,
-      cudaq::createCudaqStateFromDataFP64, cudaq::deleteCudaqState,
-      cudaq::getNumQubitsFromCudaqState};
-  if (std::find(names.begin(), names.end(), funcName) != names.end())
-    return true;
-  return false;
-}
-
-static bool isRuntimeStateCall(Operation *callOp) {
-  if (callOp) {
-    if (auto call = dyn_cast<func::CallOp>(callOp)) {
-      if (auto calleeAttr = call.getCalleeAttr()) {
-        auto funcName = calleeAttr.getValue().str();
-        if (isRuntimeStateCallName(funcName))
-          return true;
-      }
-    }
-  }
-  return false;
-}
-
-class ValidateStateCallPattern : public OpRewritePattern<func::CallOp> {
-public:
-  using OpRewritePattern::OpRewritePattern;
-
-  LogicalResult matchAndRewrite(func::CallOp callOp,
-                                PatternRewriter &rewriter) const override {
-    if (isRuntimeStateCall(callOp)) {
-      auto name = callOp.getCalleeAttr().getValue();
-      callOp.emitError(
-          "Synthesis did not remove func call for quantum platform: " + name);
-    }
-    return failure();
-  }
-};
-
-class ValidateStateInitPattern
-    : public OpRewritePattern<quake::InitializeStateOp> {
-public:
-  using OpRewritePattern::OpRewritePattern;
-
-  LogicalResult matchAndRewrite(quake::InitializeStateOp initState,
-                                PatternRewriter &rewriter) const override {
-    auto stateOp = initState.getOperand(1);
-    if (isa<cudaq::cc::StateType>(stateOp.getType()))
-      initState.emitError("Synthesis did not remove `quake.init_state <veq> "
-                          "<state>` instruction");
-
-    return failure();
-  }
-};
-
-class StateValidationPass
-    : public cudaq::opt::impl::StateValidationBase<StateValidationPass> {
-protected:
-public:
-  using StateValidationBase::StateValidationBase;
-
-  mlir::ModuleOp getModule() { return getOperation(); }
-
-  void runOnOperation() override final {
-    auto *ctx = &getContext();
-    auto module = getModule();
-    SmallVector<Operation *> toErase;
-
-    for (Operation &op : *module.getBody()) {
-      auto func = dyn_cast<func::FuncOp>(op);
-      if (!func)
-        continue;
-
-      RewritePatternSet patterns(ctx);
-      patterns.insert<ValidateStateCallPattern, ValidateStateInitPattern>(ctx);
-
-      LLVM_DEBUG(llvm::dbgs() << "Before state validation: " << func << '\n');
-
-      if (failed(applyPatternsAndFoldGreedily(func.getOperation(),
-                                              std::move(patterns))))
-        signalPassFailure();
-
-      // Delete runtime function definitions.
-      if (func.getBody().empty() && isRuntimeStateCallName(func.getName()))
-        toErase.push_back(func);
-
-      LLVM_DEBUG(llvm::dbgs() << "After state validation: " << func << '\n');
-    }
-
-    for (auto *op : toErase)
-      op->erase();
-  }
-};
-
-} // namespace
diff --git a/runtime/common/BaseRemoteRESTQPU.h b/runtime/common/BaseRemoteRESTQPU.h
index c467811a66..a9053411fa 100644
--- a/runtime/common/BaseRemoteRESTQPU.h
+++ b/runtime/common/BaseRemoteRESTQPU.h
@@ -437,6 +437,9 @@ class BaseRemoteRESTQPU : public cudaq::QPU {
       mlir::PassManager pm(&context);
       if (!rawArgs.empty()) {
         cudaq::info("Run Argument Synth.\n");
+        // For quantum hardware, we collect substitutions for the
+        // whole call tree of states, which are treated as calls to
+        // the kernels and their arguments that produced the state.
         opt::ArgumentConverter argCon(kernelName, moduleOp);
         argCon.gen(rawArgs);
         auto [kernels, substs] = argCon.collectAllSubstitutions();
@@ -446,10 +449,11 @@ class BaseRemoteRESTQPU : public cudaq::QPU {
                                                    kernels.end()},
                 mlir::SmallVector<mlir::StringRef>{substs.begin(),
                                                    substs.end()}));
-        pm.addPass(mlir::createCanonicalizerPass());
         pm.addPass(opt::createDeleteStates());
-        pm.addNestedPass<mlir::func::FuncOp>(opt::createStateInitialization());
-        pm.addPass(opt::createStateValidation());
+        pm.addNestedPass<mlir::func::FuncOp>(
+            opt::createReplaceStateWithKernel());
+        pm.addPass(mlir::createCanonicalizerPass());
+        pm.addPass(mlir::createSymbolDCEPass());
       } else if (updatedArgs) {
         cudaq::info("Run Quake Synth.\n");
         pm.addPass(cudaq::opt::createQuakeSynthesizer(kernelName, updatedArgs));
diff --git a/test/Quake/state_init.qke b/test/Quake/replace_state_with_kernel.qke
similarity index 96%
rename from test/Quake/state_init.qke
rename to test/Quake/replace_state_with_kernel.qke
index 9f43a965a4..70b04e3103 100644
--- a/test/Quake/state_init.qke
+++ b/test/Quake/replace_state_with_kernel.qke
@@ -6,7 +6,7 @@
 // the terms of the Apache License 2.0 which accompanies this distribution.   //
 // ========================================================================== //
 
-// RUN: cudaq-opt -state-initialization -canonicalize %s | FileCheck %s
+// RUN: cudaq-opt -replace-state-with-kernel -canonicalize %s | FileCheck %s
 
 module {
   func.func @foo() attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {

From 9f0937fcb022663cf1e94216e7acb9bd7c429572 Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Mon, 21 Oct 2024 15:41:40 -0700
Subject: [PATCH 10/18] Format

Signed-off-by: Anna Gringauze <agringauze@nvidia.com>
---
 .../Transforms/ReplaceStateWithKernel.cpp     | 37 +++++++++++--------
 runtime/common/BaseRemoteRESTQPU.h            |  2 +-
 2 files changed, 23 insertions(+), 16 deletions(-)

diff --git a/lib/Optimizer/Transforms/ReplaceStateWithKernel.cpp b/lib/Optimizer/Transforms/ReplaceStateWithKernel.cpp
index d588f09216..5300f57415 100644
--- a/lib/Optimizer/Transforms/ReplaceStateWithKernel.cpp
+++ b/lib/Optimizer/Transforms/ReplaceStateWithKernel.cpp
@@ -67,13 +67,13 @@ static bool isNumberOfQubitsCall(Operation *op) {
 ///  %5 = call @callee.modified_0() : () -> !quake.veq<?>
 /// ```
 // clang-format on
-class ReplaceStateWithKernelPattern : public OpRewritePattern<quake::InitializeStateOp> {
+class ReplaceStateWithKernelPattern
+    : public OpRewritePattern<quake::InitializeStateOp> {
 public:
   using OpRewritePattern::OpRewritePattern;
 
   LogicalResult matchAndRewrite(quake::InitializeStateOp initState,
                                 PatternRewriter &rewriter) const override {
-    //auto loc = initState.getLoc();
     auto *alloca = initState.getOperand(0).getDefiningOp();
     auto stateOp = initState.getOperand(1);
 
@@ -87,30 +87,35 @@ class ReplaceStateWithKernelPattern : public OpRewritePattern<quake::InitializeS
           if (auto cast = calleeNameOp.getDefiningOp<cudaq::cc::CastOp>()) {
             calleeNameOp = cast.getOperand();
 
-            if (auto literal = 
-                    calleeNameOp.getDefiningOp<cudaq::cc::CreateStringLiteralOp>()) {
+            if (auto literal =
+                    calleeNameOp
+                        .getDefiningOp<cudaq::cc::CreateStringLiteralOp>()) {
               auto calleeName = literal.getStringLiteral();
-              rewriter.replaceOpWithNewOp<func::CallOp>(initState, initState.getType(), calleeName,
-                                            mlir::ValueRange{});
+              rewriter.replaceOpWithNewOp<func::CallOp>(
+                  initState, initState.getType(), calleeName,
+                  mlir::ValueRange{});
 
-              if (alloca->getUses().empty()) 
+              if (alloca->getUses().empty())
                 rewriter.eraseOp(alloca);
-              else  {
-                alloca->emitError("Failed to remove `quake.alloca` in state synthesis");
+              else {
+                alloca->emitError(
+                    "Failed to remove `quake.alloca` in state synthesis");
                 return failure();
               }
               if (isNumberOfQubitsCall(numOfQubits)) {
                 if (numOfQubits->getUses().empty())
                   rewriter.eraseOp(numOfQubits);
-                else  {
-                  numOfQubits->emitError("Failed to remove runtime call to get number of qubits in state synthesis");
+                else {
+                  numOfQubits->emitError("Failed to remove runtime call to get "
+                                         "number of qubits in state synthesis");
                   return failure();
                 }
               }
               if (getState->getUses().empty())
                 rewriter.eraseOp(getState);
-              else  {
-                alloca->emitError("Failed to remove runtime call to get state in state synthesis");
+              else {
+                alloca->emitError("Failed to remove runtime call to get state "
+                                  "in state synthesis");
                 return failure();
               }
               return success();
@@ -135,13 +140,15 @@ class ReplaceStateWithKernelPass
     RewritePatternSet patterns(ctx);
     patterns.insert<ReplaceStateWithKernelPattern>(ctx);
 
-    LLVM_DEBUG(llvm::dbgs() << "Before replace state with kernel: " << func << '\n');
+    LLVM_DEBUG(llvm::dbgs()
+               << "Before replace state with kernel: " << func << '\n');
 
     if (failed(applyPatternsAndFoldGreedily(func.getOperation(),
                                             std::move(patterns))))
       signalPassFailure();
 
-    LLVM_DEBUG(llvm::dbgs() << "After replace state with kerenl: " << func << '\n');
+    LLVM_DEBUG(llvm::dbgs()
+               << "After replace state with kernel: " << func << '\n');
   }
 };
 } // namespace
diff --git a/runtime/common/BaseRemoteRESTQPU.h b/runtime/common/BaseRemoteRESTQPU.h
index cd57a245d6..2253b4a996 100644
--- a/runtime/common/BaseRemoteRESTQPU.h
+++ b/runtime/common/BaseRemoteRESTQPU.h
@@ -411,7 +411,7 @@ class BaseRemoteRESTQPU : public cudaq::QPU {
       }
       // Add any global symbols, including global constant arrays.
       // Global constant arrays can be created during compilation,
-      // `lift-array-alloc`, `argument-synthesis`, `quake-synthesizer`, 
+      // `lift-array-alloc`, `argument-synthesis`, `quake-synthesizer`,
       // and `get-concrete-matrix`passes.
       if (auto globalOp = dyn_cast<cudaq::cc::GlobalOp>(op))
         moduleOp.push_back(globalOp.clone());

From 2f3a62327293e5c79b49c2249ecdf241467e6d9b Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Tue, 22 Oct 2024 09:54:47 -0700
Subject: [PATCH 11/18] Fix failing test

Signed-off-by: Anna Gringauze <agringauze@nvidia.com>
---
 targettests/execution/qvector_init_from_state.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/targettests/execution/qvector_init_from_state.cpp b/targettests/execution/qvector_init_from_state.cpp
index 06c97b1e6a..681e42eee0 100644
--- a/targettests/execution/qvector_init_from_state.cpp
+++ b/targettests/execution/qvector_init_from_state.cpp
@@ -108,8 +108,8 @@ int main() {
   }
   // clang-format off
 // CHECK: Passing large state from another kernel as argument (kernel mode)
-// CHECK: 01111111111111
-// CHECK: 11111111111111
+// CHECK: 01111
+// CHECK: 11111
   // clang-format on
 
   {

From b3813503b148b98f4d7d074075a6a7496b1082c9 Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Tue, 22 Oct 2024 09:56:28 -0700
Subject: [PATCH 12/18] Format

Signed-off-by: Anna Gringauze <agringauze@nvidia.com>
---
 targettests/execution/qvector_init_from_state.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/targettests/execution/qvector_init_from_state.cpp b/targettests/execution/qvector_init_from_state.cpp
index 681e42eee0..d75a7e30d8 100644
--- a/targettests/execution/qvector_init_from_state.cpp
+++ b/targettests/execution/qvector_init_from_state.cpp
@@ -109,7 +109,7 @@ int main() {
   // clang-format off
 // CHECK: Passing large state from another kernel as argument (kernel mode)
 // CHECK: 01111
-// CHECK: 11111
+// CHECK: 111111
   // clang-format on
 
   {

From dc87ca4c9b31d7d1037c5f103adc58a353822135 Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Tue, 22 Oct 2024 09:57:02 -0700
Subject: [PATCH 13/18] Format

Signed-off-by: Anna Gringauze <agringauze@nvidia.com>
---
 targettests/execution/qvector_init_from_state.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/targettests/execution/qvector_init_from_state.cpp b/targettests/execution/qvector_init_from_state.cpp
index d75a7e30d8..681e42eee0 100644
--- a/targettests/execution/qvector_init_from_state.cpp
+++ b/targettests/execution/qvector_init_from_state.cpp
@@ -109,7 +109,7 @@ int main() {
   // clang-format off
 // CHECK: Passing large state from another kernel as argument (kernel mode)
 // CHECK: 01111
-// CHECK: 111111
+// CHECK: 11111
   // clang-format on
 
   {

From 53a34c97759a619a9298523705392412a2fc7974 Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Tue, 22 Oct 2024 14:46:03 -0700
Subject: [PATCH 14/18] Replaced getState intrinsic by cc.get_state op

Signed-off-by: Anna Gringauze <agringauze@nvidia.com>
---
 include/cudaq/Optimizer/Builder/Intrinsics.h  |  4 -
 include/cudaq/Optimizer/Dialect/CC/CCOps.td   | 20 +++++
 lib/Optimizer/Builder/Intrinsics.cpp          |  4 -
 .../Transforms/ReplaceStateWithKernel.cpp     | 77 +++++++------------
 runtime/common/ArgumentConversion.cpp         | 21 +----
 runtime/common/BaseRemoteRESTQPU.h            |  1 -
 runtime/test/test_argument_conversion.cpp     | 22 ++----
 test/Quake/replace_state_with_kernel.qke      | 15 ++--
 8 files changed, 63 insertions(+), 101 deletions(-)

diff --git a/include/cudaq/Optimizer/Builder/Intrinsics.h b/include/cudaq/Optimizer/Builder/Intrinsics.h
index d545a57602..fa9ce53097 100644
--- a/include/cudaq/Optimizer/Builder/Intrinsics.h
+++ b/include/cudaq/Optimizer/Builder/Intrinsics.h
@@ -55,10 +55,6 @@ static constexpr const char createCudaqStateFromDataFP32[] =
 // Delete a state created by the runtime functions above.
 static constexpr const char deleteCudaqState[] = "__nvqpp_cudaq_state_delete";
 
-// Get state of a kernel (placeholder function, calls are always replaced in
-// opts)
-static constexpr const char getCudaqState[] = "__nvqpp_cudaq_state_get";
-
 /// Builder for lowering the clang AST to an IR for CUDA-Q. Lowering includes
 /// the transformation of both quantum and classical computation. Different
 /// features of the CUDA-Q programming model are lowered into different dialects
diff --git a/include/cudaq/Optimizer/Dialect/CC/CCOps.td b/include/cudaq/Optimizer/Dialect/CC/CCOps.td
index a58e3d403d..cda02c7a23 100644
--- a/include/cudaq/Optimizer/Dialect/CC/CCOps.td
+++ b/include/cudaq/Optimizer/Dialect/CC/CCOps.td
@@ -898,6 +898,26 @@ def cc_AddressOfOp : CCOp<"address_of", [Pure,
   }];
 }
 
+def cc_GetStateOp : CCOp<"get_state", [Pure] > {
+  let summary = "Get state from kernel with the provided name.";
+  let description = [{
+    This operation is created by argument synthesis of state pointer arguments
+    for quantum devices. It takes a kernel name as ASCIIZ string literal value
+    and returns the kernel's quantum state. The operation is replaced by a call
+    to the kernel with the provided name in the ReplaceStateWithKernel pass.
+
+    ```mlir
+      %0 = cc.get_state "callee" : !cc.ptr<!cc.state>
+    ```
+  }];
+
+  let arguments = (ins StrAttr:$calleeName);
+  let results = (outs cc_PointerType:$result);
+  let assemblyFormat = [{
+     $calleeName `:` qualified(type(results)) attr-dict
+  }];
+}
+
 def cc_GlobalOp : CCOp<"global", [IsolatedFromAbove, Symbol]> {
   let summary = "Create a global constant or variable";
   let description = [{
diff --git a/lib/Optimizer/Builder/Intrinsics.cpp b/lib/Optimizer/Builder/Intrinsics.cpp
index e0ed794264..315743f057 100644
--- a/lib/Optimizer/Builder/Intrinsics.cpp
+++ b/lib/Optimizer/Builder/Intrinsics.cpp
@@ -269,10 +269,6 @@ static constexpr IntrinsicCode intrinsicTable[] = {
 
     {cudaq::deleteCudaqState, {}, R"#(
   func.func private @__nvqpp_cudaq_state_delete(%p : !cc.ptr<!cc.state>) -> ()
-  )#"},
-
-    {cudaq::getCudaqState, {}, R"#(
-  func.func private @__nvqpp_cudaq_state_get(%p : !cc.ptr<i8>) -> !cc.ptr<!cc.state>
   )#"},
 
     {cudaq::getNumQubitsFromCudaqState, {}, R"#(
diff --git a/lib/Optimizer/Transforms/ReplaceStateWithKernel.cpp b/lib/Optimizer/Transforms/ReplaceStateWithKernel.cpp
index 5300f57415..80907bfec1 100644
--- a/lib/Optimizer/Transforms/ReplaceStateWithKernel.cpp
+++ b/lib/Optimizer/Transforms/ReplaceStateWithKernel.cpp
@@ -43,10 +43,6 @@ static bool isCall(Operation *op, std::vector<const char *> &&names) {
   return false;
 }
 
-static bool isGetStateCall(Operation *op) {
-  return isCall(op, {cudaq::getCudaqState});
-}
-
 static bool isNumberOfQubitsCall(Operation *op) {
   return isCall(op, {cudaq::getNumQubitsFromCudaqState});
 }
@@ -56,12 +52,10 @@ static bool isNumberOfQubitsCall(Operation *op) {
 /// the state.
 ///
 /// ```
-///  %0 = cc.string_literal "callee.modified_0" : !cc.ptr<!cc.array<i8 x 27>>
-///  %1 = cc.cast %0 : (!cc.ptr<!cc.array<i8 x 27>>) -> !cc.ptr<i8>
-///  %2 = call @__nvqpp_cudaq_state_get(%1) : (!cc.ptr<i8>) -> !cc.ptr<!cc.state>
-///  %3 = call @__nvqpp_cudaq_state_numberOfQubits(%2) : (!cc.ptr<!cc.state>) -> i64
-///  %4 = quake.alloca !quake.veq<?>[%3 : i64]
-///  %5 = quake.init_state %4, %2 : (!quake.veq<?>, !cc.ptr<!cc.state>) -> !quake.veq<?>
+///  %0 = cc.get_state "__nvqpp__mlirgen__test_init_state.modified_0" : !cc.ptr<!cc.state>
+///  %1 = call @__nvqpp_cudaq_state_numberOfQubits(%0) : (!cc.ptr<!cc.state>) -> i64
+///  %2 = quake.alloca !quake.veq<?>[%1 : i64]
+///  %3 = quake.init_state %2, %0 : (!quake.veq<?>, !cc.ptr<!cc.state>) -> !quake.veq<?>
 /// ───────────────────────────────────────────
 /// ...
 ///  %5 = call @callee.modified_0() : () -> !quake.veq<?>
@@ -79,49 +73,34 @@ class ReplaceStateWithKernelPattern
 
     if (auto ptrTy = dyn_cast<cudaq::cc::PointerType>(stateOp.getType())) {
       if (isa<cudaq::cc::StateType>(ptrTy.getElementType())) {
-        auto *getState = stateOp.getDefiningOp();
         auto *numOfQubits = alloca->getOperand(0).getDefiningOp();
-
-        if (isGetStateCall(getState)) {
-          auto calleeNameOp = getState->getOperand(0);
-          if (auto cast = calleeNameOp.getDefiningOp<cudaq::cc::CastOp>()) {
-            calleeNameOp = cast.getOperand();
-
-            if (auto literal =
-                    calleeNameOp
-                        .getDefiningOp<cudaq::cc::CreateStringLiteralOp>()) {
-              auto calleeName = literal.getStringLiteral();
-              rewriter.replaceOpWithNewOp<func::CallOp>(
-                  initState, initState.getType(), calleeName,
-                  mlir::ValueRange{});
-
-              if (alloca->getUses().empty())
-                rewriter.eraseOp(alloca);
-              else {
-                alloca->emitError(
-                    "Failed to remove `quake.alloca` in state synthesis");
-                return failure();
-              }
-              if (isNumberOfQubitsCall(numOfQubits)) {
-                if (numOfQubits->getUses().empty())
-                  rewriter.eraseOp(numOfQubits);
-                else {
-                  numOfQubits->emitError("Failed to remove runtime call to get "
-                                         "number of qubits in state synthesis");
-                  return failure();
-                }
-              }
-              if (getState->getUses().empty())
-                rewriter.eraseOp(getState);
-              else {
-                alloca->emitError("Failed to remove runtime call to get state "
-                                  "in state synthesis");
-                return failure();
-              }
-              return success();
+        stateOp.getDefiningOp()->dump();
+
+        if (auto getState = stateOp.getDefiningOp<cudaq::cc::GetStateOp>()) {
+          auto calleeName = getState.getCalleeName();
+          rewriter.replaceOpWithNewOp<func::CallOp>(
+              initState, initState.getType(), calleeName, mlir::ValueRange{});
+
+          if (alloca->getUses().empty())
+            rewriter.eraseOp(alloca);
+          else {
+            alloca->emitError(
+                "Failed to remove `quake.alloca` in state synthesis");
+            return failure();
+          }
+          if (isNumberOfQubitsCall(numOfQubits)) {
+            if (numOfQubits->getUses().empty())
+              rewriter.eraseOp(numOfQubits);
+            else {
+              numOfQubits->emitError("Failed to remove runtime call to get "
+                                     "number of qubits in state synthesis");
+              return failure();
             }
           }
+          return success();
         }
+        numOfQubits->emitError(
+            "Failed to replace `quake.init_state` in state synthesis");
       }
     }
     return failure();
diff --git a/runtime/common/ArgumentConversion.cpp b/runtime/common/ArgumentConversion.cpp
index 42b228dd3b..c548d23523 100644
--- a/runtime/common/ArgumentConversion.cpp
+++ b/runtime/common/ArgumentConversion.cpp
@@ -243,27 +243,10 @@ static Value genConstant(OpBuilder &builder, const cudaq::state *v,
     converter.genCallee(modifiedCalleeName, calleeArgs);
 
     // Create a subst for state pointer.
-    auto strLitTy = cudaq::cc::PointerType::get(
-        cudaq::cc::ArrayType::get(builder.getContext(), builder.getI8Type(),
-                                  modifiedCalleeKernelName.size() + 1));
-    auto callee = builder.create<cudaq::cc::CreateStringLiteralOp>(
-        loc, strLitTy, builder.getStringAttr(modifiedCalleeKernelName));
-
-    auto i8PtrTy = cudaq::cc::PointerType::get(builder.getI8Type());
-    auto calleeCast = builder.create<cudaq::cc::CastOp>(loc, i8PtrTy, callee);
-
-    cudaq::IRBuilder irBuilder(ctx);
-    auto result = irBuilder.loadIntrinsic(substMod, cudaq::getCudaqState);
-    assert(succeeded(result) && "loading intrinsic should never fail");
-
     auto statePtrTy =
         cudaq::cc::PointerType::get(cudaq::cc::StateType::get(ctx));
-    auto statePtr =
-        builder
-            .create<func::CallOp>(loc, statePtrTy, cudaq::getCudaqState,
-                                  ValueRange{calleeCast})
-            .getResult(0);
-    return builder.create<cudaq::cc::CastOp>(loc, statePtrTy, statePtr);
+    return builder.create<cudaq::cc::GetStateOp>(
+        loc, statePtrTy, builder.getStringAttr(modifiedCalleeKernelName));
   }
 
   TODO("cudaq::state* argument synthesis for quantum hardware for c functions");
diff --git a/runtime/common/BaseRemoteRESTQPU.h b/runtime/common/BaseRemoteRESTQPU.h
index 2c8654d540..0421cde877 100644
--- a/runtime/common/BaseRemoteRESTQPU.h
+++ b/runtime/common/BaseRemoteRESTQPU.h
@@ -458,7 +458,6 @@ class BaseRemoteRESTQPU : public cudaq::QPU {
         pm.addPass(opt::createDeleteStates());
         pm.addNestedPass<mlir::func::FuncOp>(
             opt::createReplaceStateWithKernel());
-        pm.addPass(mlir::createCanonicalizerPass());
         pm.addPass(mlir::createSymbolDCEPass());
       } else if (updatedArgs) {
         cudaq::info("Run Quake Synth.\n");
diff --git a/runtime/test/test_argument_conversion.cpp b/runtime/test/test_argument_conversion.cpp
index 9fe3d92f8f..93939125c1 100644
--- a/runtime/test/test_argument_conversion.cpp
+++ b/runtime/test/test_argument_conversion.cpp
@@ -380,13 +380,10 @@ void test_state(mlir::MLIRContext *ctx) {
 
 // CHECK-LABEL:   cc.arg_subst[0] {
 // CHECK:           %[[VAL_0:.*]] = cc.address_of @[[VAL_GC:.*]] : !cc.ptr<!cc.array<complex<f64> x 8>>
-// CHECK:           %[[VAL_1:.*]] = cc.load %[[VAL_0]] : !cc.ptr<!cc.array<complex<f64> x 8>>
-// CHECK:           %[[VAL_2:.*]] = arith.constant 8 : i64
-// CHECK:           %[[VAL_3:.*]] = cc.alloca !cc.array<complex<f64> x 8>
-// CHECK:           cc.store %[[VAL_1]], %[[VAL_3]] : !cc.ptr<!cc.array<complex<f64> x 8>>
-// CHECK:           %[[VAL_4:.*]] = cc.cast %[[VAL_3]] : (!cc.ptr<!cc.array<complex<f64> x 8>>) -> !cc.ptr<i8>
-// CHECK:           %[[VAL_5:.*]] = func.call @__nvqpp_cudaq_state_createFromData_fp64(%[[VAL_4]], %[[VAL_2]]) : (!cc.ptr<i8>, i64) -> !cc.ptr<!cc.state>
-// CHECK:           %[[VAL_6:.*]] = cc.cast %[[VAL_5]] : (!cc.ptr<!cc.state>) -> !cc.ptr<!cc.state>
+// CHECK:           %[[VAL_1:.*]] = arith.constant 8 : i64
+// CHECK:           %[[VAL_2:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr<!cc.array<complex<f64> x 8>>) -> !cc.ptr<i8>
+// CHECK:           %[[VAL_3:.*]] = func.call @__nvqpp_cudaq_state_createFromData_fp64(%[[VAL_2]], %[[VAL_1]]) : (!cc.ptr<i8>, i64) -> !cc.ptr<!cc.state>
+// CHECK:           %[[VAL_4:.*]] = cc.cast %[[VAL_3]] : (!cc.ptr<!cc.state>) -> !cc.ptr<!cc.state>
 // CHECK:        }
 // CHECK-DAG:    cc.global constant @[[VAL_GC]] (dense<[(0.70710678118654757,0.000000e+00), (0.70710678118654757,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00)]> : tensor<8xcomplex<f64>>) : !cc.array<complex<f64> x 8>
 // CHECK-DAG:    func.func private @__nvqpp_cudaq_state_createFromData_fp64(!cc.ptr<i8>, i64) -> !cc.ptr<!cc.state>
@@ -490,13 +487,10 @@ void test_combinations(mlir::MLIRContext *ctx) {
 // CHECK:         }
 // CHECK-LABEL:   cc.arg_subst[1] {
 // CHECK:           %[[VAL_0:.*]] = cc.address_of @[[VAL_GC:.*]] : !cc.ptr<!cc.array<complex<f64> x 8>>
-// CHECK:           %[[VAL_1:.*]] = cc.load %[[VAL_0]] : !cc.ptr<!cc.array<complex<f64> x 8>>
-// CHECK:           %[[VAL_2:.*]] = arith.constant 8 : i64
-// CHECK:           %[[VAL_3:.*]] = cc.alloca !cc.array<complex<f64> x 8>
-// CHECK:           cc.store %[[VAL_1]], %[[VAL_3]] : !cc.ptr<!cc.array<complex<f64> x 8>>
-// CHECK:           %[[VAL_4:.*]] = cc.cast %[[VAL_3]] : (!cc.ptr<!cc.array<complex<f64> x 8>>) -> !cc.ptr<i8>
-// CHECK:           %[[VAL_5:.*]] = func.call @__nvqpp_cudaq_state_createFromData_fp64(%[[VAL_4]], %[[VAL_2]]) : (!cc.ptr<i8>, i64) -> !cc.ptr<!cc.state>
-// CHECK:           %[[VAL_6:.*]] = cc.cast %[[VAL_5]] : (!cc.ptr<!cc.state>) -> !cc.ptr<!cc.state>
+// CHECK:           %[[VAL_1:.*]] = arith.constant 8 : i64
+// CHECK:           %[[VAL_2:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr<!cc.array<complex<f64> x 8>>) -> !cc.ptr<i8>
+// CHECK:           %[[VAL_3:.*]] = func.call @__nvqpp_cudaq_state_createFromData_fp64(%[[VAL_2]], %[[VAL_1]]) : (!cc.ptr<i8>, i64) -> !cc.ptr<!cc.state>
+// CHECK:           %[[VAL_4:.*]] = cc.cast %[[VAL_3]] : (!cc.ptr<!cc.state>) -> !cc.ptr<!cc.state>
 // CHECK:         }
 // CHECK-DAG:     cc.global constant @[[VAL_GC]] (dense<[(0.70710678118654757,0.000000e+00), (0.70710678118654757,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00)]> : tensor<8xcomplex<f64>>) : !cc.array<complex<f64> x 8>
 // CHECK-DAG:     func.func private @__nvqpp_cudaq_state_createFromData_fp64(!cc.ptr<i8>, i64) -> !cc.ptr<!cc.state>
diff --git a/test/Quake/replace_state_with_kernel.qke b/test/Quake/replace_state_with_kernel.qke
index 70b04e3103..751e29775a 100644
--- a/test/Quake/replace_state_with_kernel.qke
+++ b/test/Quake/replace_state_with_kernel.qke
@@ -10,18 +10,13 @@
 
 module {
   func.func @foo() attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {
-    %0 = cc.string_literal "callee.modified_0" : !cc.ptr<!cc.array<i8 x 27>>
-    %1 = cc.cast %0 : (!cc.ptr<!cc.array<i8 x 27>>) -> !cc.ptr<i8>
-    %2 = call @__nvqpp_cudaq_state_get(%1) : (!cc.ptr<i8>) -> !cc.ptr<!cc.state>
-    %3 = call @__nvqpp_cudaq_state_numberOfQubits(%2) : (!cc.ptr<!cc.state>) -> i64
-    %4 = quake.alloca !quake.veq<?>[%3 : i64]
-    %5 = quake.init_state %4, %2 : (!quake.veq<?>, !cc.ptr<!cc.state>) -> !quake.veq<?>
-    return
+    %0 = cc.get_state "__nvqpp__mlirgen__test_init_state.modified_0" : !cc.ptr<!cc.state>
+    %1 = call @__nvqpp_cudaq_state_numberOfQubits(%0) : (!cc.ptr<!cc.state>) -> i64
+    %2 = quake.alloca !quake.veq<?>[%1 : i64]
+    %3 = quake.init_state %2, %0 : (!quake.veq<?>, !cc.ptr<!cc.state>) -> !quake.veq<?>
+return
   }
-
   func.func private @__nvqpp_cudaq_state_numberOfQubits(!cc.ptr<!cc.state>) -> i64
-  func.func private @__nvqpp_cudaq_state_get(!cc.ptr<i8>) -> !cc.ptr<!cc.state>
-
   func.func private @callee.modified_0() -> !quake.veq<?> attributes {"cudaq-entrypoint", "cudaq-kernel"} {
     %cst = arith.constant 1.5707963267948966 : f64
     %0 = quake.alloca !quake.veq<2>

From fe6d409ec21b0f72016690213dd5a3781d9c53cc Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Tue, 22 Oct 2024 14:47:59 -0700
Subject: [PATCH 15/18] Remove debug dump() call from ReplaceStateWithKernel

Signed-off-by: Anna Gringauze <agringauze@nvidia.com>
---
 lib/Optimizer/Transforms/ReplaceStateWithKernel.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/lib/Optimizer/Transforms/ReplaceStateWithKernel.cpp b/lib/Optimizer/Transforms/ReplaceStateWithKernel.cpp
index 80907bfec1..bdc1898284 100644
--- a/lib/Optimizer/Transforms/ReplaceStateWithKernel.cpp
+++ b/lib/Optimizer/Transforms/ReplaceStateWithKernel.cpp
@@ -74,7 +74,6 @@ class ReplaceStateWithKernelPattern
     if (auto ptrTy = dyn_cast<cudaq::cc::PointerType>(stateOp.getType())) {
       if (isa<cudaq::cc::StateType>(ptrTy.getElementType())) {
         auto *numOfQubits = alloca->getOperand(0).getDefiningOp();
-        stateOp.getDefiningOp()->dump();
 
         if (auto getState = stateOp.getDefiningOp<cudaq::cc::GetStateOp>()) {
           auto calleeName = getState.getCalleeName();

From 48704e3bcb648043ba9c1ccd7ecd056d620e88e6 Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Tue, 22 Oct 2024 14:50:08 -0700
Subject: [PATCH 16/18] Remove getCudaqState references

Signed-off-by: Anna Gringauze <agringauze@nvidia.com>
---
 lib/Optimizer/CodeGen/VerifyNVQIRCalls.cpp | 3 +--
 runtime/common/BaseRemoteRESTQPU.h         | 3 +--
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/lib/Optimizer/CodeGen/VerifyNVQIRCalls.cpp b/lib/Optimizer/CodeGen/VerifyNVQIRCalls.cpp
index 04eac5b06f..4de20fd7be 100644
--- a/lib/Optimizer/CodeGen/VerifyNVQIRCalls.cpp
+++ b/lib/Optimizer/CodeGen/VerifyNVQIRCalls.cpp
@@ -49,8 +49,7 @@ struct VerifyNVQIRCallOpsPass
           cudaq::getNumQubitsFromCudaqState,
           cudaq::createCudaqStateFromDataFP32,
           cudaq::createCudaqStateFromDataFP64,
-          cudaq::deleteCudaqState,
-          cudaq::getCudaqState};
+          cudaq::deleteCudaqState};
       // It must be either NVQIR extension functions or in the allowed list.
       return std::find(NVQIR_FUNCS.begin(), NVQIR_FUNCS.end(), functionName) !=
                  NVQIR_FUNCS.end() ||
diff --git a/runtime/common/BaseRemoteRESTQPU.h b/runtime/common/BaseRemoteRESTQPU.h
index 0421cde877..0d9a5ddbc9 100644
--- a/runtime/common/BaseRemoteRESTQPU.h
+++ b/runtime/common/BaseRemoteRESTQPU.h
@@ -408,8 +408,7 @@ class BaseRemoteRESTQPU : public cudaq::QPU {
             (funcOp.getName().equals(cudaq::getNumQubitsFromCudaqState) ||
              funcOp.getName().equals(cudaq::createCudaqStateFromDataFP64) ||
              funcOp.getName().equals(cudaq::createCudaqStateFromDataFP32) ||
-             funcOp.getName().equals(cudaq::deleteCudaqState) ||
-             funcOp.getName().equals(cudaq::getCudaqState)))
+             funcOp.getName().equals(cudaq::deleteCudaqState)))
           moduleOp.push_back(funcOp.clone());
       }
       // Add any global symbols, including global constant arrays.

From 137f621febc0c607dbea69d25eba70e7bcb696ca Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Tue, 22 Oct 2024 15:01:25 -0700
Subject: [PATCH 17/18] Look up state runtime functions via a static list; fix comment wording

Signed-off-by: Anna Gringauze <agringauze@nvidia.com>
---
 runtime/common/BaseRemoteRESTQPU.h | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/runtime/common/BaseRemoteRESTQPU.h b/runtime/common/BaseRemoteRESTQPU.h
index 0d9a5ddbc9..5cf89c0332 100644
--- a/runtime/common/BaseRemoteRESTQPU.h
+++ b/runtime/common/BaseRemoteRESTQPU.h
@@ -403,18 +403,21 @@ class BaseRemoteRESTQPU : public cudaq::QPU {
     for (auto &op : m_module.getOps()) {
       if (auto funcOp = dyn_cast<mlir::func::FuncOp>(op)) {
         // Add function definitions for runtime functions that must
-        // be removed after synthesis in cleanup ops.
+        // be removed after synthesis in cleanup passes.
+        static const std::vector<llvm::StringRef> stateFuncs = {
+            cudaq::getNumQubitsFromCudaqState,
+            cudaq::createCudaqStateFromDataFP32,
+            cudaq::createCudaqStateFromDataFP64};
+
         if (funcOp.getBody().empty() &&
-            (funcOp.getName().equals(cudaq::getNumQubitsFromCudaqState) ||
-             funcOp.getName().equals(cudaq::createCudaqStateFromDataFP64) ||
-             funcOp.getName().equals(cudaq::createCudaqStateFromDataFP32) ||
-             funcOp.getName().equals(cudaq::deleteCudaqState)))
+            std::find(stateFuncs.begin(), stateFuncs.end(), funcOp.getName()) !=
+                stateFuncs.end())
           moduleOp.push_back(funcOp.clone());
       }
       // Add any global symbols, including global constant arrays.
       // Global constant arrays can be created during compilation,
       // `lift-array-alloc`, `argument-synthesis`, `quake-synthesizer`,
-      // and `get-concrete-matrix`passes.
+      // and `get-concrete-matrix` passes.
       if (auto globalOp = dyn_cast<cudaq::cc::GlobalOp>(op))
         moduleOp.push_back(globalOp.clone());
     }

From ad7c6bcd26a521f4401e4b46e97e09795a4f6333 Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Tue, 22 Oct 2024 18:05:49 -0700
Subject: [PATCH 18/18] Fix callee name in replace_state_with_kernel quake test

Signed-off-by: Anna Gringauze <agringauze@nvidia.com>
---
 test/Quake/replace_state_with_kernel.qke | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/Quake/replace_state_with_kernel.qke b/test/Quake/replace_state_with_kernel.qke
index 751e29775a..09570c6290 100644
--- a/test/Quake/replace_state_with_kernel.qke
+++ b/test/Quake/replace_state_with_kernel.qke
@@ -10,7 +10,7 @@
 
 module {
   func.func @foo() attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {
-    %0 = cc.get_state "__nvqpp__mlirgen__test_init_state.modified_0" : !cc.ptr<!cc.state>
+    %0 = cc.get_state "callee.modified_0" : !cc.ptr<!cc.state>
     %1 = call @__nvqpp_cudaq_state_numberOfQubits(%0) : (!cc.ptr<!cc.state>) -> i64
     %2 = quake.alloca !quake.veq<?>[%1 : i64]
     %3 = quake.init_state %2, %0 : (!quake.veq<?>, !cc.ptr<!cc.state>) -> !quake.veq<?>