forked from kscience/kmath
Documentation for nd-performance
This commit is contained in:
parent
9da1a8c3e3
commit
c161ef0b57
@ -8,4 +8,10 @@ dependencies {
|
|||||||
compile project(":kmath-core")
|
compile project(":kmath-core")
|
||||||
compile project(":kmath-coroutines")
|
compile project(":kmath-coroutines")
|
||||||
//jmh project(':kmath-core')
|
//jmh project(':kmath-core')
|
||||||
}
|
}
|
||||||
|
|
||||||
|
jmh{
|
||||||
|
warmupIterations = 1
|
||||||
|
}
|
||||||
|
|
||||||
|
jmhClasses.dependsOn(compileKotlin)
|
@ -1,49 +1,48 @@
|
|||||||
package scientifik.kmath.structures
|
package scientifik.kmath.structures
|
||||||
|
|
||||||
import org.openjdk.jmh.annotations.*
|
import org.openjdk.jmh.annotations.Benchmark
|
||||||
|
import org.openjdk.jmh.annotations.Scope
|
||||||
|
import org.openjdk.jmh.annotations.State
|
||||||
import java.nio.IntBuffer
|
import java.nio.IntBuffer
|
||||||
|
|
||||||
|
|
||||||
@Warmup(iterations = 1)
|
|
||||||
@Measurement(iterations = 5)
|
|
||||||
@State(Scope.Benchmark)
|
@State(Scope.Benchmark)
|
||||||
open class ArrayBenchmark {
|
open class ArrayBenchmark {
|
||||||
|
|
||||||
lateinit var array: IntArray
|
|
||||||
lateinit var arrayBuffer: IntBuffer
|
|
||||||
lateinit var nativeBuffer: IntBuffer
|
|
||||||
|
|
||||||
@Setup
|
|
||||||
fun setup() {
|
|
||||||
array = IntArray(10000) { it }
|
|
||||||
arrayBuffer = IntBuffer.wrap(array)
|
|
||||||
nativeBuffer = IntBuffer.allocate(10000)
|
|
||||||
for (i in 0 until 10000) {
|
|
||||||
nativeBuffer.put(i, i)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
@Benchmark
|
@Benchmark
|
||||||
fun benchmarkArrayRead() {
|
fun benchmarkArrayRead() {
|
||||||
var res = 0
|
var res = 0
|
||||||
for (i in 1..10000) {
|
for (i in 1..size) {
|
||||||
res += array[10000 - i]
|
res += array[size - i]
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@Benchmark
|
@Benchmark
|
||||||
fun benchmarkBufferRead() {
|
fun benchmarkBufferRead() {
|
||||||
var res = 0
|
var res = 0
|
||||||
for (i in 1..10000) {
|
for (i in 1..size) {
|
||||||
res += arrayBuffer.get(10000 - i)
|
res += arrayBuffer.get(size - i)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@Benchmark
|
@Benchmark
|
||||||
fun nativeBufferRead() {
|
fun nativeBufferRead() {
|
||||||
var res = 0
|
var res = 0
|
||||||
for (i in 1..10000) {
|
for (i in 1..size) {
|
||||||
res += nativeBuffer.get(10000 - i)
|
res += nativeBuffer.get(size - i)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
companion object {
|
||||||
|
val size = 1000
|
||||||
|
|
||||||
|
val array = IntArray(size) { it }
|
||||||
|
val arrayBuffer = IntBuffer.wrap(array)
|
||||||
|
val nativeBuffer = IntBuffer.allocate(size).also {
|
||||||
|
for (i in 0 until size) {
|
||||||
|
it.put(i, i)
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
@ -1,10 +1,10 @@
|
|||||||
package scientifik.kmath.structures
|
package scientifik.kmath.structures
|
||||||
|
|
||||||
import org.openjdk.jmh.annotations.*
|
import org.openjdk.jmh.annotations.Benchmark
|
||||||
|
import org.openjdk.jmh.annotations.Scope
|
||||||
|
import org.openjdk.jmh.annotations.State
|
||||||
import scientifik.kmath.operations.Complex
|
import scientifik.kmath.operations.Complex
|
||||||
|
|
||||||
@Warmup(iterations = 1)
|
|
||||||
@Measurement(iterations = 5)
|
|
||||||
@State(Scope.Benchmark)
|
@State(Scope.Benchmark)
|
||||||
open class BufferBenchmark {
|
open class BufferBenchmark {
|
||||||
|
|
||||||
@ -22,7 +22,7 @@ open class BufferBenchmark {
|
|||||||
|
|
||||||
@Benchmark
|
@Benchmark
|
||||||
fun complexBufferReadWrite() {
|
fun complexBufferReadWrite() {
|
||||||
val buffer = Complex.createBuffer(size / 2)
|
val buffer = MutableBuffer.complex(size / 2)
|
||||||
(0 until size / 2).forEach {
|
(0 until size / 2).forEach {
|
||||||
buffer[it] = Complex(it.toDouble(), -it.toDouble())
|
buffer[it] = Complex(it.toDouble(), -it.toDouble())
|
||||||
}
|
}
|
||||||
@ -33,6 +33,6 @@ open class BufferBenchmark {
|
|||||||
}
|
}
|
||||||
|
|
||||||
companion object {
|
companion object {
|
||||||
const val size = 1000
|
const val size = 100
|
||||||
}
|
}
|
||||||
}
|
}
|
@ -0,0 +1,69 @@
|
|||||||
|
package scientifik.kmath.structures
|
||||||
|
|
||||||
|
import org.openjdk.jmh.annotations.Benchmark
|
||||||
|
import scientifik.kmath.operations.RealField
|
||||||
|
|
||||||
|
open class NDFieldBenchmark {
|
||||||
|
|
||||||
|
@Benchmark
|
||||||
|
fun autoFieldAdd() {
|
||||||
|
bufferedField.run {
|
||||||
|
var res: NDBuffer<Double> = one
|
||||||
|
repeat(n) {
|
||||||
|
res += one
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Benchmark
|
||||||
|
fun autoElementAdd() {
|
||||||
|
var res = bufferedField.run { one.toElement() }
|
||||||
|
repeat(n) {
|
||||||
|
res += 1.0
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Benchmark
|
||||||
|
fun specializedFieldAdd() {
|
||||||
|
specializedField.run {
|
||||||
|
var res: NDBuffer<Double> = one
|
||||||
|
repeat(n) {
|
||||||
|
res += 1.0
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Benchmark
|
||||||
|
fun lazyFieldAdd() {
|
||||||
|
lazyNDField.run {
|
||||||
|
var res = one
|
||||||
|
repeat(n) {
|
||||||
|
res += one
|
||||||
|
}
|
||||||
|
|
||||||
|
res.elements().sumByDouble { it.second }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Benchmark
|
||||||
|
fun boxingFieldAdd() {
|
||||||
|
genericField.run {
|
||||||
|
var res: NDBuffer<Double> = one
|
||||||
|
repeat(n) {
|
||||||
|
res += one
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
companion object {
|
||||||
|
val dim = 1000
|
||||||
|
val n = 100
|
||||||
|
|
||||||
|
val bufferedField = NDField.auto(intArrayOf(dim, dim), RealField)
|
||||||
|
val specializedField = NDField.real(intArrayOf(dim, dim))
|
||||||
|
val genericField = NDField.buffered(intArrayOf(dim, dim), RealField)
|
||||||
|
val lazyNDField = NDField.lazy(intArrayOf(dim, dim), RealField)
|
||||||
|
}
|
||||||
|
}
|
@ -7,34 +7,29 @@ fun main(args: Array<String>) {
|
|||||||
val dim = 1000
|
val dim = 1000
|
||||||
val n = 1000
|
val n = 1000
|
||||||
|
|
||||||
val bufferedField = NDField.auto(intArrayOf(dim, dim), RealField)
|
// automatically build context most suited for given type.
|
||||||
|
val autoField = NDField.auto(intArrayOf(dim, dim), RealField)
|
||||||
|
// specialized nd-field for Double. It works as generic Double field as well
|
||||||
val specializedField = NDField.real(intArrayOf(dim, dim))
|
val specializedField = NDField.real(intArrayOf(dim, dim))
|
||||||
|
//A field implementing lazy computations. All elements are computed on-demand
|
||||||
|
val lazyField = NDField.lazy(intArrayOf(dim, dim), RealField)
|
||||||
|
//A generic boxing field. It should be used for objects, not primitives.
|
||||||
val genericField = NDField.buffered(intArrayOf(dim, dim), RealField)
|
val genericField = NDField.buffered(intArrayOf(dim, dim), RealField)
|
||||||
val lazyNDField = NDField.lazy(intArrayOf(dim, dim), RealField)
|
|
||||||
|
|
||||||
// val action: NDField<Double, DoubleField, NDStructure<Double>>.() -> Unit = {
|
|
||||||
// var res = one
|
|
||||||
// repeat(n) {
|
|
||||||
// res += 1.0
|
|
||||||
// }
|
|
||||||
// }
|
|
||||||
|
|
||||||
|
|
||||||
val doubleTime = measureTimeMillis {
|
val autoTime = measureTimeMillis {
|
||||||
|
autoField.run {
|
||||||
bufferedField.run {
|
var res = one
|
||||||
var res: NDBuffer<Double> = one
|
|
||||||
repeat(n) {
|
repeat(n) {
|
||||||
res += one
|
res += 1.0
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
println("Buffered addition completed in $doubleTime millis")
|
println("Buffered addition completed in $autoTime millis")
|
||||||
|
|
||||||
|
|
||||||
val elementTime = measureTimeMillis {
|
val elementTime = measureTimeMillis {
|
||||||
var res = bufferedField.run{one.toElement()}
|
var res = genericField.one
|
||||||
repeat(n) {
|
repeat(n) {
|
||||||
res += 1.0
|
res += 1.0
|
||||||
}
|
}
|
||||||
@ -43,9 +38,8 @@ fun main(args: Array<String>) {
|
|||||||
println("Element addition completed in $elementTime millis")
|
println("Element addition completed in $elementTime millis")
|
||||||
|
|
||||||
val specializedTime = measureTimeMillis {
|
val specializedTime = measureTimeMillis {
|
||||||
//specializedField.run(action)
|
|
||||||
specializedField.run {
|
specializedField.run {
|
||||||
var res: NDBuffer<Double> = one
|
var res = one
|
||||||
repeat(n) {
|
repeat(n) {
|
||||||
res += 1.0
|
res += 1.0
|
||||||
}
|
}
|
||||||
@ -56,17 +50,16 @@ fun main(args: Array<String>) {
|
|||||||
|
|
||||||
|
|
||||||
val lazyTime = measureTimeMillis {
|
val lazyTime = measureTimeMillis {
|
||||||
val tr : RealField.(Double)->Double = {arg->
|
lazyField.run {
|
||||||
var r = arg
|
val res = one.map {
|
||||||
repeat(n) {
|
var c = 0.0
|
||||||
r += 1.0
|
repeat(n) {
|
||||||
|
c += 1.0
|
||||||
|
}
|
||||||
|
c
|
||||||
}
|
}
|
||||||
r
|
|
||||||
}
|
|
||||||
lazyNDField.run {
|
|
||||||
val res = one.map(tr)
|
|
||||||
|
|
||||||
res.elements().sumByDouble { it.second }
|
res.elements().forEach { it.second }
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -77,10 +70,11 @@ fun main(args: Array<String>) {
|
|||||||
genericField.run {
|
genericField.run {
|
||||||
var res: NDBuffer<Double> = one
|
var res: NDBuffer<Double> = one
|
||||||
repeat(n) {
|
repeat(n) {
|
||||||
res += one
|
res += 1.0
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
println("Generic addition completed in $genericTime millis")
|
println("Generic addition completed in $genericTime millis")
|
||||||
|
|
||||||
}
|
}
|
@ -28,7 +28,7 @@ allprojects {
|
|||||||
apply(plugin = "com.jfrog.artifactory")
|
apply(plugin = "com.jfrog.artifactory")
|
||||||
|
|
||||||
group = "scientifik"
|
group = "scientifik"
|
||||||
version = "0.0.3-dev-1"
|
version = "0.0.3-dev-2"
|
||||||
|
|
||||||
repositories {
|
repositories {
|
||||||
maven("https://dl.bintray.com/kotlin/kotlin-eap")
|
maven("https://dl.bintray.com/kotlin/kotlin-eap")
|
||||||
|
@ -5,9 +5,123 @@ structures. In `kmath` performance depends on which particular context was used
|
|||||||
|
|
||||||
Let us consider following contexts:
|
Let us consider following contexts:
|
||||||
```kotlin
|
```kotlin
|
||||||
// automatically build context
|
// specialized nd-field for Double. It works as generic Double field as well
|
||||||
val bufferedField = NDField.auto(intArrayOf(dim, dim), RealField)
|
|
||||||
val specializedField = NDField.real(intArrayOf(dim, dim))
|
val specializedField = NDField.real(intArrayOf(dim, dim))
|
||||||
|
|
||||||
|
// automatically build context most suited for given type.
|
||||||
|
val autoField = NDField.auto(intArrayOf(dim, dim), RealField)
|
||||||
|
|
||||||
|
//A field implementing lazy computations. All elements are computed on-demand
|
||||||
|
val lazyField = NDField.lazy(intArrayOf(dim, dim), RealField)
|
||||||
|
|
||||||
|
//A generic boxing field. It should be used for objects, not primitives.
|
||||||
val genericField = NDField.buffered(intArrayOf(dim, dim), RealField)
|
val genericField = NDField.buffered(intArrayOf(dim, dim), RealField)
|
||||||
val lazyNDField = NDField.lazy(intArrayOf(dim, dim), RealField)
|
```
|
||||||
```
|
Now let us perform several tests and see which implementation is best suited for each case:
|
||||||
|
|
||||||
|
## Test case
|
||||||
|
|
||||||
|
In order to test performance we will take 2d-structures with `dim = 1000` and add a structure filled with `1.0`
|
||||||
|
to it `n = 1000` times.
|
||||||
|
|
||||||
|
## Specialized
|
||||||
|
The code to run this looks like:
|
||||||
|
```kotlin
|
||||||
|
specializedField.run {
|
||||||
|
var res = one
|
||||||
|
repeat(n) {
|
||||||
|
res += 1.0
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
The performance of this code is the best of all tests since it inlines all operations and is specialized for operation
|
||||||
|
with doubles. We will measure everything else relative to this one, so time for this test will be `1x` (real time
|
||||||
|
on my computer is about 4.5 seconds). The only problem with this approach is that it requires specifying the type
|
||||||
|
from the beginning. Everyone does so anyway, so it is the recommended approach.
|
||||||
|
|
||||||
|
## Automatic
|
||||||
|
Let's do the same with automatic field inference:
|
||||||
|
```kotlin
|
||||||
|
autoField.run {
|
||||||
|
var res = one
|
||||||
|
repeat(n) {
|
||||||
|
res += 1.0
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
The speed of this operation is approximately the same as for the specialized case since `NDField.auto` just
|
||||||
|
returns the same `RealNDField` in this case. Of course, it is usually better to use the specialized method to be sure.
|
||||||
|
|
||||||
|
## Lazy
|
||||||
|
The lazy field does not produce a structure when asked; instead it generates an empty structure and fills it on demand
|
||||||
|
using coroutines to parallelize computations.
|
||||||
|
When one calls
|
||||||
|
```kotlin
|
||||||
|
lazyField.run {
|
||||||
|
var res = one
|
||||||
|
repeat(n) {
|
||||||
|
res += 1.0
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
This code completes almost immediately, but the resulting structure will be empty. In order to get the full result
|
||||||
|
structure one needs to access all its elements. In this case computation overhead will be huge. So this field should
|
||||||
|
never be used if one expects to use the full result structure. Though if one wants only a small fraction, it could
|
||||||
|
save a lot of time.
|
||||||
|
|
||||||
|
This field can still be used with reasonable performance if the calling code is changed:
|
||||||
|
```kotlin
|
||||||
|
lazyField.run {
|
||||||
|
val res = one.map {
|
||||||
|
var c = 0.0
|
||||||
|
repeat(n) {
|
||||||
|
c += 1.0
|
||||||
|
}
|
||||||
|
c
|
||||||
|
}
|
||||||
|
|
||||||
|
res.elements().forEach { it.second }
|
||||||
|
}
|
||||||
|
```
|
||||||
|
In this case it completes in about `4x-5x` time due to boxing.
|
||||||
|
|
||||||
|
## Boxing
|
||||||
|
The boxing field produced by
|
||||||
|
```kotlin
|
||||||
|
genericField.run {
|
||||||
|
var res = one
|
||||||
|
repeat(n) {
|
||||||
|
res += 1.0
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
is obviously the slowest one, because it has to box and unbox the `double` on each operation. It takes about
|
||||||
|
`15x` time (**TODO: there seems to be a problem here, it should be slow, but not that slow**). This field should
|
||||||
|
never be used for primitives.
|
||||||
|
|
||||||
|
## Element operation
|
||||||
|
Let us also check the speed for direct operations on elements:
|
||||||
|
```kotlin
|
||||||
|
var res = genericField.one
|
||||||
|
repeat(n) {
|
||||||
|
res += 1.0
|
||||||
|
}
|
||||||
|
```
|
||||||
|
One would expect it to be at least as slow as the field operation, but in fact this one takes only `2x` time to complete.
|
||||||
|
This happens because in this particular case it does not use the actual `NDField` but is instead calculated directly
|
||||||
|
via extension function.
|
||||||
|
|
||||||
|
## What about python?
|
||||||
|
|
||||||
|
Usually it is a bad idea to compare the direct numerical operation performance in different languages, but it is hard to
|
||||||
|
work completely without a frame of reference. In this case, simple numpy code:
|
||||||
|
```python
|
||||||
|
res = np.ones((1000,1000))
|
||||||
|
for i in range(1000):
|
||||||
|
res = res + 1.0
|
||||||
|
```
|
||||||
|
gives the completion time of about `1.1x`, which means that specialized kotlin code in fact is working faster (I think it is
|
||||||
|
because of better memory management). Of course if one writes `res += 1.0`, the performance will be different,
|
||||||
|
but it would be a different case, because numpy overrides `+=` with in-place operations. In-place operations are
|
||||||
|
available in `kmath` with `MutableNDStructure` but there is no field for it (one can still work with mapping
|
||||||
|
functions).
|
@ -69,11 +69,11 @@ interface Ring<T> : Space<T> {
|
|||||||
|
|
||||||
operator fun T.times(b: T): T = multiply(this, b)
|
operator fun T.times(b: T): T = multiply(this, b)
|
||||||
|
|
||||||
operator fun T.plus(b: Number) = this.plus(b * one)
|
// operator fun T.plus(b: Number) = this.plus(b * one)
|
||||||
operator fun Number.plus(b: T) = b + this
|
// operator fun Number.plus(b: T) = b + this
|
||||||
|
//
|
||||||
operator fun T.minus(b: Number) = this.minus(b * one)
|
// operator fun T.minus(b: Number) = this.minus(b * one)
|
||||||
operator fun Number.minus(b: T) = -b + this
|
// operator fun Number.minus(b: T) = -b + this
|
||||||
}
|
}
|
||||||
|
|
||||||
abstract class AbstractRing<T : Any> : AbstractSpace<T>(), Ring<T> {
|
abstract class AbstractRing<T : Any> : AbstractSpace<T>(), Ring<T> {
|
||||||
|
@ -79,7 +79,7 @@ class RealNDField(shape: IntArray) :
|
|||||||
* Fast element production using function inlining
|
* Fast element production using function inlining
|
||||||
*/
|
*/
|
||||||
inline fun StridedNDField<Double, RealField>.produceInline(crossinline initializer: RealField.(Int) -> Double): RealNDElement {
|
inline fun StridedNDField<Double, RealField>.produceInline(crossinline initializer: RealField.(Int) -> Double): RealNDElement {
|
||||||
val array = DoubleArray(strides.linearSize) { offset -> elementField.initializer(offset) }
|
val array = DoubleArray(strides.linearSize) { offset -> RealField.initializer(offset) }
|
||||||
return StridedNDElement(this, DoubleBuffer(array))
|
return StridedNDElement(this, DoubleBuffer(array))
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -102,4 +102,4 @@ operator fun RealNDElement.plus(arg: Double) =
|
|||||||
* Subtraction operation between [StridedNDElement] and single element
|
* Subtraction operation between [StridedNDElement] and single element
|
||||||
*/
|
*/
|
||||||
operator fun RealNDElement.minus(arg: Double) =
|
operator fun RealNDElement.minus(arg: Double) =
|
||||||
context.produceInline { i -> buffer[i] - arg }
|
context.produceInline { i -> buffer[i] - arg }
|
||||||
|
@ -21,11 +21,13 @@ class LazyNDField<T, F : Field<T>>(shape: IntArray, field: F, val scope: Corouti
|
|||||||
check(arg)
|
check(arg)
|
||||||
return if (arg is LazyNDStructure<T, *>) {
|
return if (arg is LazyNDStructure<T, *>) {
|
||||||
LazyNDStructure(this) { index ->
|
LazyNDStructure(this) { index ->
|
||||||
this.elementField.transform(index, arg.function(index))
|
//FIXME if value of arg is already calculated, it should be used
|
||||||
|
elementField.transform(index, arg.function(index))
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
LazyNDStructure(this) { elementField.transform(it, arg.await(it)) }
|
LazyNDStructure(this) { elementField.transform(it, arg.await(it)) }
|
||||||
}
|
}
|
||||||
|
// return LazyNDStructure(this) { elementField.transform(it, arg.await(it)) }
|
||||||
}
|
}
|
||||||
|
|
||||||
override fun map(arg: NDStructure<T>, transform: F.(T) -> T) =
|
override fun map(arg: NDStructure<T>, transform: F.(T) -> T) =
|
||||||
@ -43,6 +45,7 @@ class LazyNDField<T, F : Field<T>>(shape: IntArray, field: F, val scope: Corouti
|
|||||||
} else {
|
} else {
|
||||||
LazyNDStructure(this@LazyNDField) { elementField.transform(a.await(it), b.await(it)) }
|
LazyNDStructure(this@LazyNDField) { elementField.transform(a.await(it), b.await(it)) }
|
||||||
}
|
}
|
||||||
|
// return LazyNDStructure(this) { elementField.transform(a.await(it), b.await(it)) }
|
||||||
}
|
}
|
||||||
|
|
||||||
fun NDStructure<T>.lazy(): LazyNDStructure<T, F> {
|
fun NDStructure<T>.lazy(): LazyNDStructure<T, F> {
|
||||||
|
Loading…
Reference in New Issue
Block a user