diff --git a/python/runtime/cudaq/algorithms/py_state.cpp b/python/runtime/cudaq/algorithms/py_state.cpp
index 00cc3c9c523..56e4b1d898b 100644
--- a/python/runtime/cudaq/algorithms/py_state.cpp
+++ b/python/runtime/cudaq/algorithms/py_state.cpp
@@ -75,6 +75,7 @@ class PyRemoteSimulationState : public RemoteSimulationState {
                           std::size_t size, std::size_t returnOffset)
       : argsData(argsDataToOwn), kernelMod(args.mod) {
     this->kernelName = in_kernelName;
+    this->args = argsData->getArgs();
   }
 
   void execute() const override {
@@ -98,11 +99,6 @@ class PyRemoteSimulationState : public RemoteSimulationState {
     }
   }
 
-  std::optional<std::pair<std::string, std::vector<void *>>>
-  getKernelInfo() const override {
-    return std::make_pair(kernelName, argsData->getArgs());
-  }
-
   std::complex<double> overlap(const cudaq::SimulationState &other) override {
     const auto &otherState =
         dynamic_cast<const PyRemoteSimulationState &>(other);
@@ -121,7 +117,7 @@ class PyRemoteSimulationState : public RemoteSimulationState {
     return context.overlapResult.value();
   }
 
-  ~PyRemoteSimulationState() { delete argsData; }
+  virtual ~PyRemoteSimulationState() override { delete argsData; }
 };
 
 /// @brief Run `cudaq::get_state` for remote execution targets on the provided
@@ -140,6 +136,55 @@ state pyGetStateRemote(py::object kernel, py::args args) {
                                            size, returnOffset));
 }
 
+/// @brief Python implementation of the `QPUState`.
+///
+/// Owns the `OpaqueArguments` it is given and publishes the pointers returned
+/// by `getArgs()` through the inherited `args` member.
+// Note: Python kernel arguments are wrapped, hence they need to be unwrapped
+// accordingly.
+class PyQPUState : public QPUState {
+  // Holder of args data for clean-up.
+  cudaq::OpaqueArguments *argsData;
+
+public:
+  /// @param in_kernelName Name of the kernel this state refers to.
+  /// @param argsDataToOwn Packed kernel arguments; deleted by the destructor.
+  PyQPUState(const std::string &in_kernelName,
+             cudaq::OpaqueArguments *argsDataToOwn)
+      : argsData(argsDataToOwn) {
+    this->kernelName = in_kernelName;
+    this->args = argsData->getArgs();
+  }
+
+  // `argsData` has a single owner: an implicit copy would duplicate the raw
+  // pointer and delete it twice, so copying is disabled.
+  PyQPUState(const PyQPUState &) = delete;
+  PyQPUState &operator=(const PyQPUState &) = delete;
+
+  virtual ~PyQPUState() override { delete argsData; }
+};
+
+/// @brief Run `cudaq::get_state` for QPU targets on the provided
+/// kernel and args.
+///
+/// Compiles the kernel when it exposes `compile`, packs the validated
+/// arguments and returns a `state` backed by a `PyQPUState` that takes
+/// ownership of the packed arguments.
+state pyGetStateQPU(py::object kernel, py::args args) {
+  if (py::hasattr(kernel, "compile"))
+    kernel.attr("compile")();
+
+  auto kernelName = kernel.attr("name").cast<std::string>();
+  args = simplifiedValidateInputArguments(args);
+  auto kernelMod = kernel.attr("module").cast<MlirModule>();
+  auto *argData = toOpaqueArgs(args, kernelMod, kernelName);
+  // NOTE(review): none of the returned values are used here; presumably the
+  // call is kept for its side effects - confirm nothing needs releasing.
+  auto [argWrapper, size, returnOffset] =
+      pyCreateNativeKernel(kernelName, kernelMod, *argData);
+  return state(new PyQPUState(kernelName, argData));
+}
+
 state pyGetStateLibraryMode(py::object kernel, py::args args) {
   return details::extractState([&]() mutable {
     if (0 == args.size())
@@ -671,6 +700,8 @@ index pair.
           return pyGetStateRemote(kernel, args);
         if (holder.getTarget().name == "orca-photonics")
           return pyGetStateLibraryMode(kernel, args);
+        if (holder.getTarget().is_remote() || holder.getTarget().is_emulated())
+          return pyGetStateQPU(kernel, args);
         return pyGetState(kernel, args);
       },
       R"#(Return the :class:`State` of the system after execution of the provided `kernel`.
diff --git a/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp b/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp
index 687886cdffb..9baaabf9a4b 100644
--- a/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp
+++ b/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp
@@ -337,7 +337,15 @@ pyAltLaunchKernelBase(const std::string &name, MlirModule module,
   if (launch) {
     auto &platform = cudaq::get_platform();
     auto uReturnOffset = static_cast<std::uint64_t>(returnOffset);
-    if (platform.is_remote() || platform.is_emulated()) {
+    auto isRemoteSimulator =
+        platform.get_remote_capabilities().isRemoteSimulator;
+    auto isQuantumDevice =
+        !isRemoteSimulator && (platform.is_remote() || platform.is_emulated());
+
+    if (isRemoteSimulator) {
+      // Remote simulator - use altLaunchKernel to support returning values.
+      // TODO: once cudaq::run is supported, this should be merged with the
+      // quantum device case.
       auto *wrapper = new cudaq::ArgWrapper{mod, names, rawArgs};
       auto dynamicResult = cudaq::altLaunchKernel(
           name.c_str(), thunk, reinterpret_cast<void *>(wrapper), size,
@@ -345,7 +353,15 @@ pyAltLaunchKernelBase(const std::string &name, MlirModule module,
       if (dynamicResult.data_buffer || dynamicResult.size)
         throw std::runtime_error("not implemented: support dynamic results");
       delete wrapper;
+    } else if (isQuantumDevice) {
+      // Quantum devices or their emulation - we can use streamlinedLaunchKernel
+      // as quantum platforms do not support direct returns.
+      auto dynamicResult =
+          cudaq::streamlinedLaunchKernel(name.c_str(), runtimeArgs.getArgs());
+      if (dynamicResult.data_buffer || dynamicResult.size)
+        throw std::runtime_error("not implemented: support dynamic results");
     } else {
+      // Local simulator - use altLaunchKernel with the thunk function.
       auto dynamicResult = cudaq::altLaunchKernel(name.c_str(), thunk, rawArgs,
                                                   size, uReturnOffset);
       if (dynamicResult.data_buffer || dynamicResult.size)
diff --git a/python/runtime/utils/PyFermioniqRESTQPU.cpp b/python/runtime/utils/PyFermioniqRESTQPU.cpp
index 17ac448ae4c..d54f4a2b567 100644
--- a/python/runtime/utils/PyFermioniqRESTQPU.cpp
+++ b/python/runtime/utils/PyFermioniqRESTQPU.cpp
@@ -7,6 +7,7 @@
  ******************************************************************************/
 
 #include "common/ArgumentWrapper.h"
+#include "cudaq/Optimizer/InitAllDialects.h"
 #include "cudaq/platform/fermioniq/FermioniqBaseQPU.h"
 
 #include "mlir/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.h"
@@ -27,41 +28,69 @@ void registerLLVMDialectTranslation(MLIRContext *context);
 namespace cudaq {
 
 class PyFermioniqRESTQPU : public cudaq::FermioniqBaseQPU {
+private:
+  /// Creates a new context without MLIR initialization; the caller owns it.
+  MLIRContext *createContext() {
+    DialectRegistry registry;
+    cudaq::opt::registerCodeGenDialect(registry);
+    cudaq::registerAllDialects(registry);
+    auto context = new MLIRContext(registry); // owning; see cleanupContext
+    context->loadAllAvailableDialects();
+    registerLLVMDialectTranslation(*context);
+    return context;
+  }
+
 protected:
   std::tuple<ModuleOp, MLIRContext *, void *>
   extractQuakeCodeAndContext(const std::string &kernelName,
                              void *data) override {
+    // The module is rebuilt from the registered quake code, so `data` is only
+    // consulted for the `rawArgs` of the wrapper it points to.
+    auto [kernelModule, ctx] = extractQuakeCodeAndContextImpl(kernelName);
+    if (!data)
+      return {kernelModule, ctx, nullptr};
+    auto *argWrapper = static_cast<cudaq::ArgWrapper *>(data);
+    return {kernelModule, ctx, argWrapper->rawArgs};
+  }
+
+  std::tuple<ModuleOp, MLIRContext *>
+  extractQuakeCodeAndContextImpl(const std::string &kernelName) {
 
-    auto *wrapper = reinterpret_cast<cudaq::ArgWrapper *>(data);
-    auto m_module = wrapper->mod;
-    auto callableNames = wrapper->callableNames;
+    cudaq::info("extract quake code\n");
+
+    MLIRContext *context = createContext();
 
-    auto *context = m_module->getContext();
     static bool initOnce = [&] {
       registerToQIRTranslation();
       registerToOpenQASMTranslation();
       registerToIQMJsonTranslation();
-      registerLLVMDialectTranslation(*context);
       return true;
     }();
     (void)initOnce;
 
+    // Get the quake representation of the kernel
+    auto quakeCode = cudaq::get_quake_by_name(kernelName);
+    auto m_module = parseSourceString<ModuleOp>(quakeCode, context);
+    if (!m_module)
+      throw std::runtime_error("module cannot be parsed");
+
     // Here we have an opportunity to run any passes that are
     // specific to python before the rest of the RemoteRESTQPU workflow
-    auto cloned = m_module.clone();
+    auto cloned = m_module->clone();
     PassManager pm(cloned.getContext());
-    pm.addNestedPass<func::FuncOp>(cudaq::opt::createPySynthCallableBlockArgs(
-        SmallVector<StringRef>(callableNames.begin(), callableNames.end())));
+
+    pm.addPass(cudaq::opt::createLambdaLiftingPass());
     cudaq::opt::addAggressiveEarlyInlining(pm);
-    pm.addPass(mlir::createCanonicalizerPass());
+    pm.addNestedPass<func::FuncOp>(cudaq::opt::createClassicalMemToReg());
+    pm.addNestedPass<func::FuncOp>(createCanonicalizerPass());
     pm.addNestedPass<mlir::func::FuncOp>(
         cudaq::opt::createUnwindLoweringPass());
-    pm.addPass(mlir::createCanonicalizerPass());
+    pm.addNestedPass<func::FuncOp>(createCanonicalizerPass());
     pm.addPass(cudaq::opt::createApplyOpSpecializationPass());
     pm.addPass(createInlinerPass());
     pm.addPass(cudaq::opt::createExpandMeasurementsPass());
-    pm.addPass(createCanonicalizerPass());
-    pm.addPass(createCSEPass());
+    pm.addNestedPass<func::FuncOp>(createCanonicalizerPass());
+    pm.addNestedPass<func::FuncOp>(createCSEPass());
     if (failed(pm.run(cloned)))
       throw std::runtime_error(
           "Failure to synthesize callable block arguments in PyRemoteRESTQPU ");
@@ -75,8 +104,10 @@ class PyFermioniqRESTQPU : public cudaq::FermioniqBaseQPU {
     // The remote rest qpu workflow will need the module string in
     // the internal registry.
     __cudaq_deviceCodeHolderAdd(kernelName.c_str(), moduleStr.c_str());
-    return std::make_tuple(cloned, context, wrapper->rawArgs);
+    return std::make_tuple(cloned, context);
   }
+  /// Deletes `context`, e.g. one allocated with `new` by `createContext`.
+  void cleanupContext(MLIRContext *context) override { delete context; }
 };
 } // namespace cudaq
 
diff --git a/python/runtime/utils/PyRemoteRESTQPU.cpp b/python/runtime/utils/PyRemoteRESTQPU.cpp
index d1596c688ab..5edafbedbe2 100644
--- a/python/runtime/utils/PyRemoteRESTQPU.cpp
+++ b/python/runtime/utils/PyRemoteRESTQPU.cpp
@@ -9,6 +9,7 @@
 #include "common/ArgumentWrapper.h"
 #include "common/BaseRemoteRESTQPU.h"
 #include "common/RuntimeMLIRCommonImpl.h"
+#include "cudaq/Optimizer/InitAllDialects.h"
 
 // [RFC]:
 // The RemoteRESTQPU implementation that is now split across several files needs
@@ -57,40 +58,65 @@ TranslateFromMLIRRegistration::TranslateFromMLIRRegistration(
 // implement some core functionality here in PyRemoteRESTQPU so we don't load
 // twice
 class PyRemoteRESTQPU : public cudaq::BaseRemoteRESTQPU {
+private:
+  /// Creates a new context without MLIR initialization; the caller owns it.
+  MLIRContext *createContext() {
+    DialectRegistry registry;
+    cudaq::opt::registerCodeGenDialect(registry);
+    cudaq::registerAllDialects(registry);
+    auto context = new MLIRContext(registry); // owning; see cleanupContext
+    context->loadAllAvailableDialects();
+    registerLLVMDialectTranslation(*context);
+    return context;
+  }
+
 protected:
   std::tuple<ModuleOp, MLIRContext *, void *>
   extractQuakeCodeAndContext(const std::string &kernelName,
                              void *data) override {
+    // Only `rawArgs` is taken from the wrapper behind `data`; the module comes
+    // from the quake code registered under `kernelName`.
+    auto [kernelModule, ctx] = extractQuakeCodeAndContextImpl(kernelName);
+    auto *argWrapper = static_cast<cudaq::ArgWrapper *>(data);
+    // Without a wrapper a null argument pointer is returned.
+    void *forwardedArgs = argWrapper ? argWrapper->rawArgs : nullptr;
+    return {kernelModule, ctx, forwardedArgs};
+  }
 
-    auto *wrapper = reinterpret_cast<cudaq::ArgWrapper *>(data);
-    auto m_module = wrapper->mod;
-    auto callableNames = wrapper->callableNames;
+  std::tuple<ModuleOp, MLIRContext *>
+  extractQuakeCodeAndContextImpl(const std::string &kernelName) {
+
+    MLIRContext *context = createContext();
 
-    auto *context = m_module->getContext();
     static bool initOnce = [&] {
       registerToQIRTranslation();
       registerToOpenQASMTranslation();
       registerToIQMJsonTranslation();
-      registerLLVMDialectTranslation(*context);
       return true;
     }();
     (void)initOnce;
 
+    // Get the quake representation of the kernel
+    auto quakeCode = cudaq::get_quake_by_name(kernelName);
+    auto m_module = parseSourceString<ModuleOp>(quakeCode, context);
+    if (!m_module)
+      throw std::runtime_error("module cannot be parsed");
+
     // Here we have an opportunity to run any passes that are
     // specific to python before the rest of the RemoteRESTQPU workflow
-    auto cloned = m_module.clone();
+    auto cloned = m_module->clone();
     PassManager pm(cloned.getContext());
-    pm.addNestedPass<func::FuncOp>(cudaq::opt::createPySynthCallableBlockArgs(
-        SmallVector<StringRef>(callableNames.begin(), callableNames.end())));
+
+    pm.addPass(cudaq::opt::createLambdaLiftingPass());
     cudaq::opt::addAggressiveEarlyInlining(pm);
-    pm.addPass(mlir::createCanonicalizerPass());
-    pm.addNestedPass<mlir::func::FuncOp>(
-        cudaq::opt::createUnwindLoweringPass());
-    pm.addPass(mlir::createCanonicalizerPass());
+    pm.addNestedPass<func::FuncOp>(cudaq::opt::createClassicalMemToReg());
+    pm.addNestedPass<func::FuncOp>(createCanonicalizerPass());
+    pm.addNestedPass<func::FuncOp>(cudaq::opt::createUnwindLoweringPass());
+    pm.addNestedPass<func::FuncOp>(createCanonicalizerPass());
     pm.addPass(cudaq::opt::createApplyOpSpecializationPass());
     pm.addPass(createInlinerPass());
-    pm.addPass(createCanonicalizerPass());
-    pm.addPass(createCSEPass());
+    pm.addNestedPass<func::FuncOp>(createCanonicalizerPass());
+    pm.addNestedPass<func::FuncOp>(createCSEPass());
     if (failed(pm.run(cloned)))
       throw std::runtime_error(
           "Failure to synthesize callable block arguments in PyRemoteRESTQPU ");
@@ -103,8 +129,10 @@ class PyRemoteRESTQPU : public cudaq::BaseRemoteRESTQPU {
     // The remote rest qpu workflow will need the module string in
     // the internal registry.
     __cudaq_deviceCodeHolderAdd(kernelName.c_str(), moduleStr.c_str());
-    return std::make_tuple(cloned, context, wrapper->rawArgs);
+    return std::make_tuple(cloned, context);
   }
+  /// Deletes `context`, e.g. one allocated with `new` by `createContext`.
+  void cleanupContext(MLIRContext *context) override { delete context; }
 };
 } // namespace cudaq
 
diff --git a/python/tests/backends/test_IQM.py b/python/tests/backends/test_IQM.py
index 6c246e9e205..8fd894d4c2f 100644
--- a/python/tests/backends/test_IQM.py
+++ b/python/tests/backends/test_IQM.py
@@ -201,6 +201,86 @@ def test_IQM_state_preparation_builder():
     assert assert_close(counts["11"], 0., 2)
 
 
+def test_IQM_state_synthesis_from_simulator():
+
+    @cudaq.kernel
+    def kernel(state: cudaq.State):
+        qubits = cudaq.qvector(state)
+
+    state = cudaq.State.from_data(
+        np.array([1. / np.sqrt(2.), 1. / np.sqrt(2.), 0., 0.], dtype=complex))
+
+    counts = cudaq.sample(kernel, state)
+    print(counts)
+    assert "00" in counts
+    assert "10" in counts
+    assert assert_close(counts["01"], 0., 2)
+    assert assert_close(counts["11"], 0., 2)
+
+    synthesized = cudaq.synthesize(kernel, state)
+    counts = cudaq.sample(synthesized)
+    assert '00' in counts
+    assert '10' in counts
+    assert assert_close(counts["01"], 0., 2)
+    assert assert_close(counts["11"], 0., 2)
+
+
+def test_IQM_state_synthesis_from_simulator_builder():
+
+    kernel, state = cudaq.make_kernel(cudaq.State)
+    qubits = kernel.qalloc(state)
+
+    state = cudaq.State.from_data(
+        np.array([1. / np.sqrt(2.), 1. / np.sqrt(2.), 0., 0.], dtype=complex))
+
+    counts = cudaq.sample(kernel, state)
+    assert "00" in counts
+    assert "10" in counts
+    assert assert_close(counts["01"], 0., 2)
+    assert assert_close(counts["11"], 0., 2)
+
+
+def test_IQM_state_synthesis():
+
+    @cudaq.kernel
+    def init(n: int):
+        q = cudaq.qvector(n)
+        x(q[0])
+
+    @cudaq.kernel
+    def kernel(s: cudaq.State):
+        q = cudaq.qvector(s)
+        x(q[1])
+
+    s = cudaq.get_state(init, 2)
+    s = cudaq.get_state(kernel, s)
+    counts = cudaq.sample(kernel, s)
+    assert '10' in counts
+    assert assert_close(counts["00"], 0., 2)
+    assert assert_close(counts["01"], 0., 2)
+    assert assert_close(counts["11"], 0., 2)
+
+
+def test_IQM_state_synthesis_builder():
+
+    init, n = cudaq.make_kernel(int)
+    qubits = init.qalloc(n)
+    init.x(qubits[0])
+
+    s = cudaq.get_state(init, 2)
+
+    kernel, state = cudaq.make_kernel(cudaq.State)
+    qubits = kernel.qalloc(state)
+    kernel.x(qubits[1])
+
+    s = cudaq.get_state(kernel, s)
+    counts = cudaq.sample(kernel, s)
+    assert '10' in counts
+    assert assert_close(counts["00"], 0., 2)
+    assert assert_close(counts["01"], 0., 2)
+    assert assert_close(counts["11"], 0., 2)
+
+
 def test_exp_pauli():
 
     @cudaq.kernel
diff --git a/python/tests/backends/test_Infleqtion.py b/python/tests/backends/test_Infleqtion.py
index de26b5fbd0c..3edead9ecef 100644
--- a/python/tests/backends/test_Infleqtion.py
+++ b/python/tests/backends/test_Infleqtion.py
@@ -153,6 +153,47 @@ def ansatz(theta: float):
     print(res.expectation())
 
 
+def test_state_synthesis_from_simulator():
+
+    @cudaq.kernel
+    def kernel(state: cudaq.State):
+        qubits = cudaq.qvector(state)
+        mz(qubits)
+
+    state = cudaq.State.from_data(
+        np.array([1. / np.sqrt(2.), 1. / np.sqrt(2.), 0., 0.], dtype=complex))
+
+    counts = cudaq.sample(kernel, state)
+    assert "00" in counts
+    assert "10" in counts
+    assert len(counts) == 2
+
+
+def test_state_synthesis():
+
+    @cudaq.kernel
+    def init(n: int):
+        q = cudaq.qvector(n)
+        x(q[0])
+
+    @cudaq.kernel
+    def kernel1(s: cudaq.State):
+        q = cudaq.qvector(s)
+        x(q[1])
+
+    @cudaq.kernel
+    def kernel2(s: cudaq.State):
+        q = cudaq.qvector(s)
+        x(q[1])
+        mz(q)
+
+    s = cudaq.get_state(init, 2)
+    s = cudaq.get_state(kernel1, s)
+    counts = cudaq.sample(kernel2, s)
+    assert '10' in counts
+    assert len(counts) == 1
+
+
 # leave for gdb debugging
 if __name__ == "__main__":
     loc = os.path.abspath(__file__)
diff --git a/python/tests/backends/test_IonQ.py b/python/tests/backends/test_IonQ.py
index be4aae3193b..146ead19618 100644
--- a/python/tests/backends/test_IonQ.py
+++ b/python/tests/backends/test_IonQ.py
@@ -189,6 +189,78 @@ def test_ionq_state_preparation_builder():
     assert not '11' in counts
 
 
+def test_ionq_state_synthesis_from_simulator():
+
+    @cudaq.kernel
+    def kernel(state: cudaq.State):
+        qubits = cudaq.qvector(state)
+
+    state = cudaq.State.from_data(
+        np.array([1. / np.sqrt(2.), 1. / np.sqrt(2.), 0., 0.], dtype=complex))
+
+    counts = cudaq.sample(kernel, state)
+    assert "00" in counts
+    assert "10" in counts
+    assert len(counts) == 2
+
+    synthesized = cudaq.synthesize(kernel, state)
+    counts = cudaq.sample(synthesized)
+    assert '00' in counts
+    assert '10' in counts
+    assert len(counts) == 2
+
+
+def test_ionq_state_synthesis_from_simulator_builder():
+
+    kernel, state = cudaq.make_kernel(cudaq.State)
+    qubits = kernel.qalloc(state)
+
+    state = cudaq.State.from_data(
+        np.array([1. / np.sqrt(2.), 1. / np.sqrt(2.), 0., 0.], dtype=complex))
+
+    counts = cudaq.sample(kernel, state)
+    assert "00" in counts
+    assert "10" in counts
+    assert len(counts) == 2
+
+
+def test_Ionq_state_synthesis():
+
+    @cudaq.kernel
+    def init(n: int):
+        q = cudaq.qvector(n)
+        x(q[0])
+
+    @cudaq.kernel
+    def kernel(s: cudaq.State):
+        q = cudaq.qvector(s)
+        x(q[1])
+
+    s = cudaq.get_state(init, 2)
+    s = cudaq.get_state(kernel, s)
+    counts = cudaq.sample(kernel, s)
+    assert '10' in counts
+    assert len(counts) == 1
+
+
+def test_Ionq_state_synthesis_builder():
+
+    init, n = cudaq.make_kernel(int)
+    qubits = init.qalloc(n)
+    init.x(qubits[0])
+
+    s = cudaq.get_state(init, 2)
+
+    kernel, state = cudaq.make_kernel(cudaq.State)
+    qubits = kernel.qalloc(state)
+    kernel.x(qubits[1])
+
+    s = cudaq.get_state(kernel, s)
+    counts = cudaq.sample(kernel, s)
+    assert '10' in counts
+    assert len(counts) == 1
+
+
 def test_exp_pauli():
 
     @cudaq.kernel
diff --git a/python/tests/backends/test_Ionq_LocalEmulation_kernel.py b/python/tests/backends/test_Ionq_LocalEmulation_kernel.py
index 726daefa4d3..59996f43212 100644
--- a/python/tests/backends/test_Ionq_LocalEmulation_kernel.py
+++ b/python/tests/backends/test_Ionq_LocalEmulation_kernel.py
@@ -12,6 +12,7 @@
 import pytest
 import os
 from typing import List
+import numpy as np
 
 
 def assert_close(want, got, tolerance=1.0e-1) -> bool:
@@ -93,6 +94,46 @@ def kernel():
     assert '11000000' in counts
 
 
+def test_Ionq_state_synthesis_from_simulator():
+
+    @cudaq.kernel
+    def kernel(state: cudaq.State):
+        qubits = cudaq.qvector(state)
+
+    state = cudaq.State.from_data(
+        np.array([1. / np.sqrt(2.), 1. / np.sqrt(2.), 0., 0.], dtype=complex))
+
+    counts = cudaq.sample(kernel, state)
+    assert "00" in counts
+    assert "10" in counts
+    assert len(counts) == 2
+
+    synthesized = cudaq.synthesize(kernel, state)
+    counts = cudaq.sample(synthesized)
+    assert '00' in counts
+    assert '10' in counts
+    assert len(counts) == 2
+
+
+def test_Ionq_state_synthesis():
+
+    @cudaq.kernel
+    def init(n: int):
+        q = cudaq.qvector(n)
+        x(q[0])
+
+    @cudaq.kernel
+    def kernel(s: cudaq.State):
+        q = cudaq.qvector(s)
+        x(q[1])
+
+    s = cudaq.get_state(init, 2)
+    s = cudaq.get_state(kernel, s)
+    counts = cudaq.sample(kernel, s)
+    assert '10' in counts
+    assert len(counts) == 1
+
+
 # leave for gdb debugging
 if __name__ == "__main__":
     loc = os.path.abspath(__file__)
diff --git a/python/tests/backends/test_OQC.py b/python/tests/backends/test_OQC.py
index f0002f3e189..3b0cd47fbc5 100644
--- a/python/tests/backends/test_OQC.py
+++ b/python/tests/backends/test_OQC.py
@@ -190,6 +190,78 @@ def test_OQC_state_preparation_builder():
     assert not '11' in counts
 
 
+def test_OQC_state_synthesis_from_simulator():
+
+    @cudaq.kernel
+    def kernel(state: cudaq.State):
+        qubits = cudaq.qvector(state)
+
+    state = cudaq.State.from_data(
+        np.array([1. / np.sqrt(2.), 1. / np.sqrt(2.), 0., 0.], dtype=complex))
+
+    counts = cudaq.sample(kernel, state)
+    assert "00" in counts
+    assert "10" in counts
+    assert len(counts) == 2
+
+    synthesized = cudaq.synthesize(kernel, state)
+    counts = cudaq.sample(synthesized)
+    assert '00' in counts
+    assert '10' in counts
+    assert len(counts) == 2
+
+
+def test_OQC_state_synthesis_from_simulator_builder():
+
+    kernel, state = cudaq.make_kernel(cudaq.State)
+    qubits = kernel.qalloc(state)
+
+    state = cudaq.State.from_data(
+        np.array([1. / np.sqrt(2.), 1. / np.sqrt(2.), 0., 0.], dtype=complex))
+
+    counts = cudaq.sample(kernel, state)
+    assert "00" in counts
+    assert "10" in counts
+    assert len(counts) == 2
+
+
+def test_OQC_state_synthesis():
+
+    @cudaq.kernel
+    def init(n: int):
+        q = cudaq.qvector(n)
+        x(q[0])
+
+    @cudaq.kernel
+    def kernel(s: cudaq.State):
+        q = cudaq.qvector(s)
+        x(q[1])
+
+    s = cudaq.get_state(init, 2)
+    s = cudaq.get_state(kernel, s)
+    counts = cudaq.sample(kernel, s)
+    assert '10' in counts
+    assert len(counts) == 1
+
+
+def test_OQC_state_synthesis_builder():
+
+    init, n = cudaq.make_kernel(int)
+    qubits = init.qalloc(n)
+    init.x(qubits[0])
+
+    s = cudaq.get_state(init, 2)
+
+    kernel, state = cudaq.make_kernel(cudaq.State)
+    qubits = kernel.qalloc(state)
+    kernel.x(qubits[1])
+
+    s = cudaq.get_state(kernel, s)
+    counts = cudaq.sample(kernel, s)
+    assert '10' in counts
+    assert len(counts) == 1
+
+
 def test_exp_pauli():
 
     @cudaq.kernel
diff --git a/python/tests/backends/test_Quantinuum_LocalEmulation_builder.py b/python/tests/backends/test_Quantinuum_LocalEmulation_builder.py
index e7d6dfcb6e0..541146c5ca5 100644
--- a/python/tests/backends/test_Quantinuum_LocalEmulation_builder.py
+++ b/python/tests/backends/test_Quantinuum_LocalEmulation_builder.py
@@ -137,16 +137,35 @@ def test_quantinuum_state_preparation():
     assert not '111' in counts
 
 
-def test_quantinuum_state_synthesis():
+def test_quantinuum_state_synthesis_from_simulator():
     kernel, state = cudaq.make_kernel(cudaq.State)
     qubits = kernel.qalloc(state)
 
     state = cudaq.State.from_data(
         np.array([1. / np.sqrt(2.), 1. / np.sqrt(2.), 0., 0.], dtype=complex))
 
-    with pytest.raises(RuntimeError) as e:
-        counts = cudaq.sample(kernel, state)
-    assert 'Could not successfully apply quake-synth.' in repr(e)
+    counts = cudaq.sample(kernel, state)
+    assert "00" in counts
+    assert "10" in counts
+    assert len(counts) == 2
+
+
+def test_quantinuum_state_synthesis():
+
+    init, n = cudaq.make_kernel(int)
+    qubits = init.qalloc(n)
+    init.x(qubits[0])
+
+    s = cudaq.get_state(init, 2)
+
+    kernel, state = cudaq.make_kernel(cudaq.State)
+    qubits = kernel.qalloc(state)
+    kernel.x(qubits[1])
+
+    s = cudaq.get_state(kernel, s)
+    counts = cudaq.sample(kernel, s)
+    assert '10' in counts
+    assert len(counts) == 1
 
 
 def test_exp_pauli():
diff --git a/python/tests/backends/test_Quantinuum_LocalEmulation_kernel.py b/python/tests/backends/test_Quantinuum_LocalEmulation_kernel.py
index 93ba682d651..905b62974ea 100644
--- a/python/tests/backends/test_Quantinuum_LocalEmulation_kernel.py
+++ b/python/tests/backends/test_Quantinuum_LocalEmulation_kernel.py
@@ -189,7 +189,7 @@ def kernel(vec: List[complex]):
     assert not '111' in counts
 
 
-def test_quantinuum_state_synthesis():
+def test_quantinuum_state_synthesis_from_simulator():
 
     @cudaq.kernel
     def kernel(state: cudaq.State):
@@ -198,9 +198,35 @@ def kernel(state: cudaq.State):
     state = cudaq.State.from_data(
         np.array([1. / np.sqrt(2.), 1. / np.sqrt(2.), 0., 0.], dtype=complex))
 
-    with pytest.raises(RuntimeError) as e:
-        counts = cudaq.sample(kernel, state)
-    assert 'Could not successfully apply quake-synth.' in repr(e)
+    counts = cudaq.sample(kernel, state)
+    assert "00" in counts
+    assert "10" in counts
+    assert len(counts) == 2
+
+    synthesized = cudaq.synthesize(kernel, state)
+    counts = cudaq.sample(synthesized)
+    assert '00' in counts
+    assert '10' in counts
+    assert len(counts) == 2
+
+
+def test_quantinuum_state_synthesis():
+
+    @cudaq.kernel
+    def init(n: int):
+        q = cudaq.qvector(n)
+        x(q[0])
+
+    @cudaq.kernel
+    def kernel(s: cudaq.State):
+        q = cudaq.qvector(s)
+        x(q[1])
+
+    s = cudaq.get_state(init, 2)
+    s = cudaq.get_state(kernel, s)
+    counts = cudaq.sample(kernel, s)
+    assert '10' in counts
+    assert len(counts) == 1
 
 
 def test_exp_pauli():
diff --git a/python/tests/backends/test_Quantinuum_builder.py b/python/tests/backends/test_Quantinuum_builder.py
index be65851ead3..17d550442ec 100644
--- a/python/tests/backends/test_Quantinuum_builder.py
+++ b/python/tests/backends/test_Quantinuum_builder.py
@@ -164,6 +164,37 @@ def test_quantinuum_state_preparation():
     assert not '11' in counts
 
 
+def test_quantinuum_state_synthesis_from_simulator():
+    kernel, state = cudaq.make_kernel(cudaq.State)
+    qubits = kernel.qalloc(state)
+
+    state = cudaq.State.from_data(
+        np.array([1. / np.sqrt(2.), 1. / np.sqrt(2.), 0., 0.], dtype=complex))
+
+    counts = cudaq.sample(kernel, state)
+    assert "00" in counts
+    assert "10" in counts
+    assert len(counts) == 2
+
+
+def test_quantinuum_state_synthesis():
+
+    init, n = cudaq.make_kernel(int)
+    qubits = init.qalloc(n)
+    init.x(qubits[0])
+
+    s = cudaq.get_state(init, 2)
+
+    kernel, state = cudaq.make_kernel(cudaq.State)
+    qubits = kernel.qalloc(state)
+    kernel.x(qubits[1])
+
+    s = cudaq.get_state(kernel, s)
+    counts = cudaq.sample(kernel, s)
+    assert '10' in counts
+    assert len(counts) == 1
+
+
 def test_exp_pauli():
     test = cudaq.make_kernel()
     q = test.qalloc(2)
diff --git a/python/tests/backends/test_Quantinuum_kernel.py b/python/tests/backends/test_Quantinuum_kernel.py
index 33c3a1e2726..ce2e5be64cd 100644
--- a/python/tests/backends/test_Quantinuum_kernel.py
+++ b/python/tests/backends/test_Quantinuum_kernel.py
@@ -190,6 +190,46 @@ def kernel(vec: List[complex]):
     assert not '11' in counts
 
 
+def test_quantinuum_state_synthesis_from_simulator():
+
+    @cudaq.kernel
+    def kernel(state: cudaq.State):
+        qubits = cudaq.qvector(state)
+
+    state = cudaq.State.from_data(
+        np.array([1. / np.sqrt(2.), 1. / np.sqrt(2.), 0., 0.], dtype=complex))
+
+    counts = cudaq.sample(kernel, state)
+    assert "00" in counts
+    assert "10" in counts
+    assert len(counts) == 2
+
+    synthesized = cudaq.synthesize(kernel, state)
+    counts = cudaq.sample(synthesized)
+    assert '00' in counts
+    assert '10' in counts
+    assert len(counts) == 2
+
+
+def test_quantinuum_state_synthesis():
+
+    @cudaq.kernel
+    def init(n: int):
+        q = cudaq.qvector(n)
+        x(q[0])
+
+    @cudaq.kernel
+    def kernel(s: cudaq.State):
+        q = cudaq.qvector(s)
+        x(q[1])
+
+    s = cudaq.get_state(init, 2)
+    s = cudaq.get_state(kernel, s)
+    counts = cudaq.sample(kernel, s)
+    assert '10' in counts
+    assert len(counts) == 1
+
+
 def test_exp_pauli():
 
     @cudaq.kernel
diff --git a/python/tests/backends/test_braket.py b/python/tests/backends/test_braket.py
index 351ec9bc7b5..03832adb172 100644
--- a/python/tests/backends/test_braket.py
+++ b/python/tests/backends/test_braket.py
@@ -422,6 +422,47 @@ def kernel():
     assert '00' in counts
 
 
+def test_state_synthesis_from_simulator():
+
+    @cudaq.kernel
+    def kernel(state: cudaq.State):
+        qubits = cudaq.qvector(state)
+        mz(qubits)
+
+    state = cudaq.State.from_data(
+        np.array([1. / np.sqrt(2.), 1. / np.sqrt(2.), 0., 0.], dtype=complex))
+
+    counts = cudaq.sample(kernel, state)
+    assert "00" in counts
+    assert "10" in counts
+    assert len(counts) == 2
+
+
+def test_state_synthesis():
+
+    @cudaq.kernel
+    def init(n: int):
+        q = cudaq.qvector(n)
+        x(q[0])
+
+    @cudaq.kernel
+    def kernel1(s: cudaq.State):
+        q = cudaq.qvector(s)
+        x(q[1])
+
+    @cudaq.kernel
+    def kernel2(s: cudaq.State):
+        q = cudaq.qvector(s)
+        x(q[1])
+        mz(q)
+
+    s = cudaq.get_state(init, 2)
+    s = cudaq.get_state(kernel1, s)
+    counts = cudaq.sample(kernel2, s)
+    assert '10' in counts
+    assert len(counts) == 1
+
+
 @pytest.mark.parametrize("device_arn", [
     "arn:aws:braket:::device/quantum-simulator/amazon/dm1",
     "arn:aws:braket:::device/quantum-simulator/amazon/tn1"
diff --git a/python/tests/kernel/test_trotter.py b/python/tests/kernel/test_trotter.py
index 1995093f22e..c103ef36bc4 100644
--- a/python/tests/kernel/test_trotter.py
+++ b/python/tests/kernel/test_trotter.py
@@ -67,7 +67,7 @@ def termCoefficients(op: cudaq.SpinOperator) -> list[complex]:
         def termWords(op: cudaq.SpinOperator) -> list[str]:
             result = []
             for term in op:
-                # The way the trotter kernel is written, it 
+                # The way the trotter kernel is written, it
                 # wants exp pauli to act on the entire state.
                 # That means we need to make it explicit that each term
                 # in this Hamiltonian indeed is supposed to act on each qubit.
diff --git a/python/utils/OpaqueArguments.h b/python/utils/OpaqueArguments.h
index d1f914608c6..24d7598aba8 100644
--- a/python/utils/OpaqueArguments.h
+++ b/python/utils/OpaqueArguments.h
@@ -336,7 +336,7 @@ inline void packArgs(OpaqueArguments &argData, py::args args,
         })
         .Case([&](cudaq::cc::PointerType ty) {
           if (isa<quake::StateType>(ty.getElementType())) {
-            valueArgument(argData, arg.cast<cudaq::state *>());
+            addArgument(argData, cudaq::state(*arg.cast<cudaq::state *>()));
           } else {
             throw std::runtime_error("Invalid pointer type argument: " +
                                      py::str(arg).cast<std::string>() +
diff --git a/runtime/cudaq/algorithms/get_state.h b/runtime/cudaq/algorithms/get_state.h
index 093ae36dcff..c3ceef1e196 100644
--- a/runtime/cudaq/algorithms/get_state.h
+++ b/runtime/cudaq/algorithms/get_state.h
@@ -132,7 +132,6 @@ auto get_state(QuantumKernel &&kernel, Args &&...args) {
   if constexpr (has_name<QuantumKernel>::value)
     return state(new QPUState(std::forward<QuantumKernel>(kernel),
                               std::forward<Args>(args)...));
-
   throw std::runtime_error(
       "cudaq::state* argument synthesis is not supported for quantum hardware"
       " for c-like functions in library mode");
diff --git a/runtime/cudaq/platform/qpu_state.cpp b/runtime/cudaq/platform/qpu_state.cpp
index 24ce4c412c9..c730859a13c 100644
--- a/runtime/cudaq/platform/qpu_state.cpp
+++ b/runtime/cudaq/platform/qpu_state.cpp
@@ -11,8 +11,9 @@
 namespace cudaq {
 
 QPUState::~QPUState() {
-  for (std::size_t counter = 0; auto &ptr : args)
-    deleters[counter++](ptr);
+  if (!deleters.empty())
+    for (std::size_t counter = 0; auto &ptr : args)
+      deleters[counter++](ptr);
 
   args.clear();
   deleters.clear();
diff --git a/runtime/cudaq/platform/qpu_state.h b/runtime/cudaq/platform/qpu_state.h
index a04120b3728..0f50b05161b 100644
--- a/runtime/cudaq/platform/qpu_state.h
+++ b/runtime/cudaq/platform/qpu_state.h
@@ -70,7 +70,7 @@ class QPUState : public cudaq::SimulationState {
   QPUState() = default;
   QPUState(const QPUState &other)
       : kernelName(other.kernelName), args(other.args), deleters() {}
-  virtual ~QPUState();
+  virtual ~QPUState() override;
 
   /// @brief True if the state has amplitudes or density matrix available.
   virtual bool hasData() const override { return false; }
diff --git a/runtime/cudaq/qis/remote_state.cpp b/runtime/cudaq/qis/remote_state.cpp
index 772b54f3b91..0a7949c12c1 100644
--- a/runtime/cudaq/qis/remote_state.cpp
+++ b/runtime/cudaq/qis/remote_state.cpp
@@ -43,8 +43,9 @@ RemoteSimulationState::~RemoteSimulationState() {
     platformExecutionLog.clear();
   }
 
-  for (std::size_t counter = 0; auto &ptr : args)
-    deleters[counter++](ptr);
+  if (!deleters.empty())
+    for (std::size_t counter = 0; auto &ptr : args)
+      deleters[counter++](ptr);
 
   args.clear();
   deleters.clear();
diff --git a/runtime/cudaq/qis/remote_state.h b/runtime/cudaq/qis/remote_state.h
index 7a70cb5c276..c85c2ecd9c0 100644
--- a/runtime/cudaq/qis/remote_state.h
+++ b/runtime/cudaq/qis/remote_state.h
@@ -78,7 +78,7 @@ class RemoteSimulationState : public cudaq::SimulationState {
     (addArgument(args), ...);
   }
   RemoteSimulationState() = default;
-  virtual ~RemoteSimulationState();
+  virtual ~RemoteSimulationState() override;
   /// @brief Triggers remote execution to resolve the state data.
   virtual void execute() const;