diff --git a/kmath-torch/README.md b/kmath-torch/README.md
index 7d5c76833..0cc12914d 100644
--- a/kmath-torch/README.md
+++ b/kmath-torch/README.md
@@ -81,15 +81,25 @@ Tensors implement the buffer protocol over `MutableNDStructure`. They can only b
 memScoped {
     val intTensor: TorchTensorInt = TorchTensor.copyFromIntArray(
         scope = this,
-        array = intArrayOf(7,8,9,2,6,5),
-        shape = intArrayOf(3,2))
+        array = (1..24).toList().toIntArray(),
+        shape = intArrayOf(3, 2, 4)
+    )
     println(intTensor)
-
+
     val floatTensor: TorchTensorFloat = TorchTensor.copyFromFloatArray(
         scope = this,
-        array = floatArrayOf(7f,8.9f,2.6f,5.6f),
-        shape = intArrayOf(4))
+        array = (1..10).map { it + 50f }.toList().toFloatArray(),
+        shape = intArrayOf(10)
+    )
     println(floatTensor)
+
+    val gpuFloatTensor: TorchTensorFloatGPU = TorchTensor.copyFromFloatArrayToGPU(
+        scope = this,
+        array = (1..8).map { it * 2f }.toList().toFloatArray(),
+        shape = intArrayOf(2, 2, 2),
+        device = 0
+    )
+    println(gpuFloatTensor)
 }
 ```
diff --git a/kmath-torch/build.gradle.kts b/kmath-torch/build.gradle.kts
index 5db0af2f7..7ddbda497 100644
--- a/kmath-torch/build.gradle.kts
+++ b/kmath-torch/build.gradle.kts
@@ -133,6 +133,7 @@ kotlin {
 
     val test by nativeTarget.compilations.getting
 
+
     sourceSets {
         val nativeMain by creating {
             dependencies {
@@ -142,12 +143,20 @@ kotlin {
         val nativeTest by creating {
             dependsOn(nativeMain)
         }
+        val nativeGPUTest by creating {
+            dependsOn(nativeMain)
+        }
+
         main.defaultSourceSet.dependsOn(nativeMain)
         test.defaultSourceSet.dependsOn(nativeTest)
+        if(cudaFound) {
+            test.defaultSourceSet.dependsOn(nativeGPUTest)
+        }
+
     }
 }
 
 val torch: KotlinNativeTarget by kotlin.targets
 tasks[torch.compilations["main"].cinterops["libctorch"].interopProcessingTaskName]
-    .dependsOn(buildCpp)
\ No newline at end of file
+    .dependsOn(buildCpp)
diff --git a/kmath-torch/ctorch/include/ctorch.h b/kmath-torch/ctorch/include/ctorch.h
index 36ec82d10..80a53e8ed 100644
--- a/kmath-torch/ctorch/include/ctorch.h
+++ b/kmath-torch/ctorch/include/ctorch.h
@@ -22,6 +22,10 @@ extern "C"
     TorchTensorHandle copy_from_blob_float(float *data, int *shape, int dim);
     TorchTensorHandle copy_from_blob_long(long *data, int *shape, int dim);
     TorchTensorHandle copy_from_blob_int(int *data, int *shape, int dim);
+    TorchTensorHandle copy_from_blob_to_gpu_double(double *data, int *shape, int dim, int device);
+    TorchTensorHandle copy_from_blob_to_gpu_float(float *data, int *shape, int dim, int device);
+    TorchTensorHandle copy_from_blob_to_gpu_long(long *data, int *shape, int dim, int device);
+    TorchTensorHandle copy_from_blob_to_gpu_int(int *data, int *shape, int dim, int device);
 
     TorchTensorHandle copy_tensor(TorchTensorHandle tensor_handle);
 
@@ -41,6 +45,19 @@ extern "C"
     void dispose_char(char *ptr);
     void dispose_tensor(TorchTensorHandle tensor_handle);
 
+    // Workaround for GPU tensors
+    double get_at_offset_double(TorchTensorHandle tensor_handle, int offset);
+    float get_at_offset_float(TorchTensorHandle tensor_handle, int offset);
+    long get_at_offset_long(TorchTensorHandle tensor_handle, int offset);
+    int get_at_offset_int(TorchTensorHandle tensor_handle, int offset);
+    void set_at_offset_double(TorchTensorHandle tensor_handle, int offset, double value);
+    void set_at_offset_float(TorchTensorHandle tensor_handle, int offset, float value);
+    void set_at_offset_long(TorchTensorHandle tensor_handle, int offset, long value);
+    void set_at_offset_int(TorchTensorHandle tensor_handle, int offset, int value);
+
+    TorchTensorHandle copy_to_cpu(TorchTensorHandle tensor_handle);
+    TorchTensorHandle copy_to_gpu(TorchTensorHandle tensor_handle, int device);
+
 #ifdef __cplusplus
 }
 #endif
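The offset accessors declared above are needed because CUDA tensor storage cannot be dereferenced from host code the way the `get_data_float`-style pointers are for CPU tensors, so per-element reads and writes must round-trip through libtorch itself. A minimal sketch of driving these shims directly from Kotlin/Native, assuming the generated `ctorch` cinterop bindings mirror this header (in the patch they are normally wrapped by the buffer classes further down):

```kotlin
import kotlinx.cinterop.toCValues
import ctorch.*

fun main() {
    // Allocate a 2x2 float tensor directly on GPU 0 through the new C shim.
    val gpu = copy_from_blob_to_gpu_float(
        floatArrayOf(1f, 2f, 3f, 4f).toCValues(),
        intArrayOf(2, 2).toCValues(),
        2, 0
    )!!
    set_at_offset_float(gpu, 3, 42f)     // element write resolved on the device
    println(get_at_offset_float(gpu, 3)) // 42.0
    dispose_tensor(gpu)                  // every handle is owned by the caller
}
```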
diff --git a/kmath-torch/ctorch/include/utils.hh b/kmath-torch/ctorch/include/utils.hh
index e58981960..86331339f 100644
--- a/kmath-torch/ctorch/include/utils.hh
+++ b/kmath-torch/ctorch/include/utils.hh
@@ -28,21 +28,21 @@ namespace ctorch
         return torch::kInt32;
     }
 
-    inline torch::Tensor &cast(TorchTensorHandle tensor_handle)
+    inline torch::Tensor &cast(const TorchTensorHandle &tensor_handle)
     {
         return *static_cast<torch::Tensor *>(tensor_handle);
    }
 
     template <typename Dtype>
-    inline torch::Tensor copy_from_blob(Dtype *data, int *shape, int dim)
+    inline torch::Tensor copy_from_blob(Dtype *data, int *shape, int dim, torch::Device device)
     {
         auto shape_vec = std::vector<int64_t>(dim);
         shape_vec.assign(shape, shape + dim);
-        return torch::from_blob(data, shape_vec, dtype<Dtype>()).clone();
+        return torch::from_blob(data, shape_vec, dtype<Dtype>()).to(
+            torch::TensorOptions().layout(torch::kStrided).device(device), false, true);
     }
 
-    template <typename IntArray>
-    inline int *to_dynamic_ints(IntArray arr)
+    inline int *to_dynamic_ints(const c10::IntArrayRef &arr)
     {
         size_t n = arr.size();
         int *res = (int *)malloc(sizeof(int) * n);
@@ -53,4 +53,29 @@ namespace ctorch
         return res;
     }
 
+    inline std::vector<at::indexing::TensorIndex> offset_to_index(int offset, const c10::IntArrayRef &strides)
+    {
+        std::vector<at::indexing::TensorIndex> index;
+        for (const auto &stride : strides)
+        {
+            index.emplace_back(offset / stride);
+            offset %= stride;
+        }
+        return index;
+    }
+
+    template <typename NumType>
+    inline NumType get_at_offset(const TorchTensorHandle &tensor_handle, int offset)
+    {
+        auto ten = ctorch::cast(tensor_handle);
+        return ten.index(ctorch::offset_to_index(offset, ten.strides())).item<NumType>();
+    }
+
+    template <typename NumType>
+    inline void set_at_offset(TorchTensorHandle &tensor_handle, int offset, NumType value)
+    {
+        auto ten = ctorch::cast(tensor_handle);
+        ten.index(offset_to_index(offset, ten.strides())) = value;
+    }
+
 } // namespace ctorch
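`offset_to_index` turns a flat buffer offset into a per-dimension `TensorIndex` list by successive division by the strides reported by `tensor.strides()`, which is what lets the CPU-style integer offsets of `MutableBuffer` address a device tensor through `Tensor::index`. The same arithmetic in Kotlin, as an illustrative stand-alone helper (names hypothetical):

```kotlin
// Illustrative Kotlin mirror of ctorch::offset_to_index.
fun offsetToIndex(offset: Int, strides: IntArray): IntArray {
    val index = IntArray(strides.size)
    var rest = offset
    for (i in strides.indices) {
        index[i] = rest / strides[i] // coordinate along dimension i
        rest %= strides[i]           // remainder addresses the remaining dims
    }
    return index
}

fun main() {
    // A contiguous tensor of shape (3, 2, 4) has strides (8, 4, 1),
    // so flat offset 13 maps to the multi-index [1, 1, 1].
    println(offsetToIndex(13, intArrayOf(8, 4, 1)).toList()) // [1, 1, 1]
}
```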
diff --git a/kmath-torch/ctorch/src/ctorch.cc b/kmath-torch/ctorch/src/ctorch.cc
index 457c2d8c3..747fd4703 100644
--- a/kmath-torch/ctorch/src/ctorch.cc
+++ b/kmath-torch/ctorch/src/ctorch.cc
@@ -27,19 +27,36 @@ void set_seed(int seed)
 
 TorchTensorHandle copy_from_blob_double(double *data, int *shape, int dim)
 {
-    return new torch::Tensor(ctorch::copy_from_blob<double>(data, shape, dim));
+    return new torch::Tensor(ctorch::copy_from_blob<double>(data, shape, dim, torch::kCPU));
 }
 TorchTensorHandle copy_from_blob_float(float *data, int *shape, int dim)
 {
-    return new torch::Tensor(ctorch::copy_from_blob<float>(data, shape, dim));
+    return new torch::Tensor(ctorch::copy_from_blob<float>(data, shape, dim, torch::kCPU));
 }
 TorchTensorHandle copy_from_blob_long(long *data, int *shape, int dim)
 {
-    return new torch::Tensor(ctorch::copy_from_blob<long>(data, shape, dim));
+    return new torch::Tensor(ctorch::copy_from_blob<long>(data, shape, dim, torch::kCPU));
 }
 TorchTensorHandle copy_from_blob_int(int *data, int *shape, int dim)
 {
-    return new torch::Tensor(ctorch::copy_from_blob<int>(data, shape, dim));
+    return new torch::Tensor(ctorch::copy_from_blob<int>(data, shape, dim, torch::kCPU));
+}
+
+TorchTensorHandle copy_from_blob_to_gpu_double(double *data, int *shape, int dim, int device)
+{
+    return new torch::Tensor(ctorch::copy_from_blob<double>(data, shape, dim, torch::Device(torch::kCUDA, device)));
+}
+TorchTensorHandle copy_from_blob_to_gpu_float(float *data, int *shape, int dim, int device)
+{
+    return new torch::Tensor(ctorch::copy_from_blob<float>(data, shape, dim, torch::Device(torch::kCUDA, device)));
+}
+TorchTensorHandle copy_from_blob_to_gpu_long(long *data, int *shape, int dim, int device)
+{
+    return new torch::Tensor(ctorch::copy_from_blob<long>(data, shape, dim, torch::Device(torch::kCUDA, device)));
+}
+TorchTensorHandle copy_from_blob_to_gpu_int(int *data, int *shape, int dim, int device)
+{
+    return new torch::Tensor(ctorch::copy_from_blob<int>(data, shape, dim, torch::Device(torch::kCUDA, device)));
 }
 
 TorchTensorHandle copy_tensor(TorchTensorHandle tensor_handle)
@@ -107,4 +124,46 @@ void dispose_char(char *ptr)
 void dispose_tensor(TorchTensorHandle tensor_handle)
 {
     delete static_cast<torch::Tensor *>(tensor_handle);
-}
\ No newline at end of file
+}
+
+double get_at_offset_double(TorchTensorHandle tensor_handle, int offset)
+{
+    return ctorch::get_at_offset<double>(tensor_handle, offset);
+}
+float get_at_offset_float(TorchTensorHandle tensor_handle, int offset)
+{
+    return ctorch::get_at_offset<float>(tensor_handle, offset);
+}
+long get_at_offset_long(TorchTensorHandle tensor_handle, int offset)
+{
+    return ctorch::get_at_offset<long>(tensor_handle, offset);
+}
+int get_at_offset_int(TorchTensorHandle tensor_handle, int offset)
+{
+    return ctorch::get_at_offset<int>(tensor_handle, offset);
+}
+void set_at_offset_double(TorchTensorHandle tensor_handle, int offset, double value)
+{
+    ctorch::set_at_offset(tensor_handle, offset, value);
+}
+void set_at_offset_float(TorchTensorHandle tensor_handle, int offset, float value)
+{
+    ctorch::set_at_offset(tensor_handle, offset, value);
+}
+void set_at_offset_long(TorchTensorHandle tensor_handle, int offset, long value)
+{
+    ctorch::set_at_offset(tensor_handle, offset, value);
+}
+void set_at_offset_int(TorchTensorHandle tensor_handle, int offset, int value)
+{
+    ctorch::set_at_offset(tensor_handle, offset, value);
+}
+
+TorchTensorHandle copy_to_cpu(TorchTensorHandle tensor_handle)
+{
+    return new torch::Tensor(ctorch::cast(tensor_handle).to(torch::kCPU, false, true));
+}
+TorchTensorHandle copy_to_gpu(TorchTensorHandle tensor_handle, int device)
+{
+    return new torch::Tensor(ctorch::cast(tensor_handle).to(torch::Device(torch::kCUDA, device), false, true));
+}
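For reference, the two boolean arguments threaded through `Tensor::to` in `copy_to_cpu` and `copy_to_gpu` are `non_blocking` and `copy`; passing `copy = true` forces a fresh allocation even when the tensor already resides on the target device, so each returned handle owns independent storage and can be disposed on its own. A sketch of the resulting host-to-device round trip through the generated `ctorch` bindings (binding names assumed to match the header):

```kotlin
import kotlinx.cinterop.toCValues
import ctorch.*

fun main() {
    // Host -> GPU 0 -> host; each call returns a handle owning fresh storage.
    val host = copy_from_blob_float(
        floatArrayOf(1f, 2f, 3f, 4f).toCValues(), intArrayOf(4).toCValues(), 1
    )!!
    val onGpu = copy_to_gpu(host, 0)!!
    val back = copy_to_cpu(onGpu)!!
    listOf(host, onGpu, back).forEach { dispose_tensor(it) }
}
```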
diff --git a/kmath-torch/src/nativeGPUTest/kotlin/kscience/kmath/torch/TestTorchTensorGPU.kt b/kmath-torch/src/nativeGPUTest/kotlin/kscience/kmath/torch/TestTorchTensorGPU.kt
new file mode 100644
index 000000000..e5e448459
--- /dev/null
+++ b/kmath-torch/src/nativeGPUTest/kotlin/kscience/kmath/torch/TestTorchTensorGPU.kt
@@ -0,0 +1,25 @@
+package kscience.kmath.torch
+
+import kscience.kmath.structures.asBuffer
+
+import kotlinx.cinterop.memScoped
+import kotlin.test.*
+
+class TestTorchTensorGPU {
+
+    @Test
+    fun cudaAvailability() {
+        assertTrue(cudaAvailable())
+    }
+
+    @Test
+    fun floatGPUTensorLayout() = memScoped {
+        val array = (1..8).map { it * 2f }.toList().toFloatArray()
+        val shape = intArrayOf(2, 2, 2)
+        val tensor = TorchTensor.copyFromFloatArrayToGPU(this, array, shape, 0)
+        tensor.elements().forEach {
+            assertEquals(tensor[it.first], it.second)
+        }
+        assertTrue(tensor.buffer.contentEquals(array.asBuffer()))
+    }
+}
\ No newline at end of file
diff --git a/kmath-torch/src/nativeMain/kotlin/kscience/kmath/torch/TorchTensor.kt b/kmath-torch/src/nativeMain/kotlin/kscience/kmath/torch/TorchTensor.kt
index 71570a184..73ead8d84 100644
--- a/kmath-torch/src/nativeMain/kotlin/kscience/kmath/torch/TorchTensor.kt
+++ b/kmath-torch/src/nativeMain/kotlin/kscience/kmath/torch/TorchTensor.kt
@@ -5,9 +5,7 @@ import kscience.kmath.structures.*
 import kotlinx.cinterop.*
 import ctorch.*
 
-public abstract class TorchTensor<T,
-        TorchTensorBufferImpl : TorchTensorBuffer<T>> :
+public abstract class TorchTensor<T, TorchTensorBufferImpl : TorchTensorBuffer<T>> :
     MutableNDBufferTrait<T, TorchTensorBufferImpl, TorchTensorStrides>() {
 
     public companion object {
@@ -23,6 +21,12 @@ public abstract class TorchTensor<T, TorchTensorBufferImpl : TorchTensorBuffer<
         }
+
+        public fun copyFromFloatArrayToGPU(scope: DeferScope, array: FloatArray, shape: IntArray, device: Int): TorchTensorFloatGPU {
+            val tensorHandle = copy_from_blob_to_gpu_float(array.toCValues(), shape.toCValues(), shape.size, device)!!
+            return TorchTensorFloatGPU(populateStridesFromNative(tensorHandle, rawShape = shape), scope, tensorHandle)
+        }
     }
 }
 
 public class TorchTensorFloat internal constructor(
     override val strides: TorchTensorStrides,
     scope: DeferScope,
     tensorHandle: COpaquePointer
-): TorchTensor<Float,
-        TorchTensorBufferFloat>() {
+): TorchTensor<Float, TorchTensorBufferFloat>() {
     override val buffer: TorchTensorBufferFloat = TorchTensorBufferFloat(scope, tensorHandle)
 }
 
@@ -46,7 +50,15 @@ public class TorchTensorInt internal constructor(
     override val strides: TorchTensorStrides,
     scope: DeferScope,
     tensorHandle: COpaquePointer
-): TorchTensor<Int,
-        TorchTensorBufferInt>() {
+): TorchTensor<Int, TorchTensorBufferInt>() {
     override val buffer: TorchTensorBufferInt = TorchTensorBufferInt(scope, tensorHandle)
 }
 
+public class TorchTensorFloatGPU internal constructor(
+    override val strides: TorchTensorStrides,
+    scope: DeferScope,
+    tensorHandle: COpaquePointer
+): TorchTensor<Float, TorchTensorBufferFloatGPU>() {
+    override val buffer: TorchTensorBufferFloatGPU = TorchTensorBufferFloatGPU(scope, tensorHandle)
+}
+
diff --git a/kmath-torch/src/nativeMain/kotlin/kscience/kmath/torch/TorchTensorBuffer.kt b/kmath-torch/src/nativeMain/kotlin/kscience/kmath/torch/TorchTensorBuffer.kt
index c162ef451..83213b0ca 100644
--- a/kmath-torch/src/nativeMain/kotlin/kscience/kmath/torch/TorchTensorBuffer.kt
+++ b/kmath-torch/src/nativeMain/kotlin/kscience/kmath/torch/TorchTensorBuffer.kt
@@ -5,31 +5,35 @@ import kscience.kmath.structures.MutableBuffer
 import kotlinx.cinterop.*
 import ctorch.*
 
-public abstract class TorchTensorBuffer<T, TorchTensorDataType : CPointed> internal constructor(
+public abstract class TorchTensorBuffer<T> internal constructor(
     internal val scope: DeferScope,
     internal val tensorHandle: COpaquePointer
 ) : MutableBuffer<T> {
+
+    override val size: Int = get_numel(tensorHandle)
+
     init {
         scope.defer(::close)
     }
 
-    internal fun close() {
+    protected fun close() {
         dispose_tensor(tensorHandle)
     }
 
-    protected abstract val tensorData: CPointer<TorchTensorDataType>
-
-    override val size: Int
-        get() = get_numel(tensorHandle)
+    internal abstract fun wrap(outScope: DeferScope, outTensorHandle: COpaquePointer): TorchTensorBuffer<T>
 
+    override fun copy(): TorchTensorBuffer<T> = wrap(
+        outScope = scope,
+        outTensorHandle = copy_tensor(tensorHandle)!!
+    )
 }
 
-
 public class TorchTensorBufferFloat internal constructor(
     scope: DeferScope,
     tensorHandle: COpaquePointer
-) : TorchTensorBuffer<Float, FloatVar>(scope, tensorHandle) {
-    override val tensorData: CPointer<FloatVar> = get_data_float(tensorHandle)!!
+) : TorchTensorBuffer<Float>(scope, tensorHandle) {
+
+    private val tensorData: CPointer<FloatVar> = get_data_float(tensorHandle)!!
 
     override operator fun get(index: Int): Float = tensorData[index]
 
@@ -39,17 +43,19 @@ public class TorchTensorBufferFloat internal constructor(
     override operator fun iterator(): Iterator<Float> = (1..size).map { tensorData[it - 1] }.iterator()
 
-    override fun copy(): TorchTensorBufferFloat = TorchTensorBufferFloat(
-        scope = scope,
-        tensorHandle = copy_tensor(tensorHandle)!!
+    override fun wrap(outScope: DeferScope, outTensorHandle: COpaquePointer) = TorchTensorBufferFloat(
+        scope = outScope,
+        tensorHandle = outTensorHandle
     )
 }
 
+
 public class TorchTensorBufferInt internal constructor(
     scope: DeferScope,
     tensorHandle: COpaquePointer
-) : TorchTensorBuffer<Int, IntVar>(scope, tensorHandle) {
-    override val tensorData: CPointer<IntVar> = get_data_int(tensorHandle)!!
+) : TorchTensorBuffer<Int>(scope, tensorHandle) {
+
+    private val tensorData: CPointer<IntVar> = get_data_int(tensorHandle)!!
 
     override operator fun get(index: Int): Int = tensorData[index]
 
@@ -59,9 +65,33 @@ public class TorchTensorBufferInt internal constructor(
     override operator fun iterator(): Iterator<Int> = (1..size).map { tensorData[it - 1] }.iterator()
 
-    override fun copy(): TorchTensorBufferInt = TorchTensorBufferInt(
-        scope = scope,
-        tensorHandle = copy_tensor(tensorHandle)!!
+    override fun wrap(outScope: DeferScope, outTensorHandle: COpaquePointer) = TorchTensorBufferInt(
+        scope = outScope,
+        tensorHandle = outTensorHandle
     )
 }
 
+public class TorchTensorBufferFloatGPU internal constructor(
+    scope: DeferScope,
+    tensorHandle: COpaquePointer
+) : TorchTensorBuffer<Float>(scope, tensorHandle) {
+
+    override operator fun get(index: Int): Float = get_at_offset_float(tensorHandle, index)
+
+    override operator fun set(index: Int, value: Float) {
+        set_at_offset_float(tensorHandle, index, value)
+    }
+
+    override operator fun iterator(): Iterator<Float> {
+        val cpuCopy = copy_to_cpu(tensorHandle)!!
+        val tensorCpuData = get_data_float(cpuCopy)!!
+        val iteratorResult = (1..size).map { tensorCpuData[it - 1] }.iterator()
+        dispose_tensor(cpuCopy)
+        return iteratorResult
+    }
+
+    override fun wrap(outScope: DeferScope, outTensorHandle: COpaquePointer) = TorchTensorBufferFloatGPU(
+        scope = outScope,
+        tensorHandle = outTensorHandle
+    )
+}
diff --git a/kmath-torch/src/nativeTest/kotlin/kscience/kmath/torch/TestTorchTensor.kt b/kmath-torch/src/nativeTest/kotlin/kscience/kmath/torch/TestTorchTensor.kt
index 06a2f6d4c..2805a376c 100644
--- a/kmath-torch/src/nativeTest/kotlin/kscience/kmath/torch/TestTorchTensor.kt
+++ b/kmath-torch/src/nativeTest/kotlin/kscience/kmath/torch/TestTorchTensor.kt
@@ -10,9 +10,9 @@ internal class TestTorchTensor {
 
     @Test
     fun intTensorLayout() = memScoped {
-        val array = intArrayOf(7,8,9,2,6,5)
-        val shape = intArrayOf(3,2)
-        val tensor = TorchTensor.copyFromIntArray(scope=this, array=array, shape=shape)
+        val array = (1..24).toList().toIntArray()
+        val shape = intArrayOf(3, 2, 4)
+        val tensor = TorchTensor.copyFromIntArray(scope = this, array = array, shape = shape)
         tensor.elements().forEach {
             assertEquals(tensor[it.first], it.second)
         }
@@ -21,8 +21,8 @@ internal class TestTorchTensor {
 
     @Test
     fun floatTensorLayout() = memScoped {
-        val array = floatArrayOf(7.5f,8.2f,9f,2.58f,6.5f,5f)
-        val shape = intArrayOf(2,3)
+        val array = (1..10).map { it + 50f }.toList().toFloatArray()
+        val shape = intArrayOf(10)
         val tensor = TorchTensor.copyFromFloatArray(this, array, shape)
         tensor.elements().forEach {
             assertEquals(tensor[it.first], it.second)
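Taken together: on `TorchTensorBufferFloatGPU` every `get`/`set` pays an FFI call plus a device access, while bulk traversal (`iterator()`) performs a single `copy_to_cpu` and reads the staging tensor once. A usage sketch of the public API introduced by this patch, assuming a CUDA-enabled build and device 0:

```kotlin
import kotlinx.cinterop.memScoped
import kscience.kmath.torch.*

fun main(): Unit = memScoped {
    if (!cudaAvailable()) return@memScoped // GPU path needs a CUDA build
    val tensor = TorchTensor.copyFromFloatArrayToGPU(
        scope = this,
        array = FloatArray(8) { (it + 1) * 2f },
        shape = intArrayOf(2, 2, 2),
        device = 0
    )
    tensor.buffer[7] = 99f    // routed through set_at_offset_float on the device
    println(tensor.buffer[7]) // routed through get_at_offset_float => 99.0
    println(tensor)           // prints the GPU tensor, as in the README example
}
```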