forked from kscience/kmath
Documentation for nd-performance
This commit is contained in:
parent
9da1a8c3e3
commit
c161ef0b57
@ -9,3 +9,9 @@ dependencies {
|
||||
compile project(":kmath-coroutines")
|
||||
//jmh project(':kmath-core')
|
||||
}
|
||||
|
||||
jmh{
|
||||
warmupIterations = 1
|
||||
}
|
||||
|
||||
jmhClasses.dependsOn(compileKotlin)
|
@ -1,49 +1,48 @@
|
||||
package scientifik.kmath.structures
|
||||
|
||||
import org.openjdk.jmh.annotations.*
|
||||
import org.openjdk.jmh.annotations.Benchmark
|
||||
import org.openjdk.jmh.annotations.Scope
|
||||
import org.openjdk.jmh.annotations.State
|
||||
import java.nio.IntBuffer
|
||||
|
||||
|
||||
@Warmup(iterations = 1)
|
||||
@Measurement(iterations = 5)
|
||||
@State(Scope.Benchmark)
|
||||
open class ArrayBenchmark {
|
||||
|
||||
lateinit var array: IntArray
|
||||
lateinit var arrayBuffer: IntBuffer
|
||||
lateinit var nativeBuffer: IntBuffer
|
||||
|
||||
@Setup
|
||||
fun setup() {
|
||||
array = IntArray(10000) { it }
|
||||
arrayBuffer = IntBuffer.wrap(array)
|
||||
nativeBuffer = IntBuffer.allocate(10000)
|
||||
for (i in 0 until 10000) {
|
||||
nativeBuffer.put(i, i)
|
||||
}
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
fun benchmarkArrayRead() {
|
||||
var res = 0
|
||||
for (i in 1..10000) {
|
||||
res += array[10000 - i]
|
||||
for (i in 1..size) {
|
||||
res += array[size - i]
|
||||
}
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
fun benchmarkBufferRead() {
|
||||
var res = 0
|
||||
for (i in 1..10000) {
|
||||
res += arrayBuffer.get(10000 - i)
|
||||
for (i in 1..size) {
|
||||
res += arrayBuffer.get(size - i)
|
||||
}
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
fun nativeBufferRead() {
|
||||
var res = 0
|
||||
for (i in 1..10000) {
|
||||
res += nativeBuffer.get(10000 - i)
|
||||
for (i in 1..size) {
|
||||
res += nativeBuffer.get(size - i)
|
||||
}
|
||||
}
|
||||
|
||||
companion object {
|
||||
val size = 1000
|
||||
|
||||
val array = IntArray(size) { it }
|
||||
val arrayBuffer = IntBuffer.wrap(array)
|
||||
val nativeBuffer = IntBuffer.allocate(size).also {
|
||||
for (i in 0 until size) {
|
||||
it.put(i, i)
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
}
|
@ -1,10 +1,10 @@
|
||||
package scientifik.kmath.structures
|
||||
|
||||
import org.openjdk.jmh.annotations.*
|
||||
import org.openjdk.jmh.annotations.Benchmark
|
||||
import org.openjdk.jmh.annotations.Scope
|
||||
import org.openjdk.jmh.annotations.State
|
||||
import scientifik.kmath.operations.Complex
|
||||
|
||||
@Warmup(iterations = 1)
|
||||
@Measurement(iterations = 5)
|
||||
@State(Scope.Benchmark)
|
||||
open class BufferBenchmark {
|
||||
|
||||
@ -22,7 +22,7 @@ open class BufferBenchmark {
|
||||
|
||||
@Benchmark
|
||||
fun complexBufferReadWrite() {
|
||||
val buffer = Complex.createBuffer(size / 2)
|
||||
val buffer = MutableBuffer.complex(size / 2)
|
||||
(0 until size / 2).forEach {
|
||||
buffer[it] = Complex(it.toDouble(), -it.toDouble())
|
||||
}
|
||||
@ -33,6 +33,6 @@ open class BufferBenchmark {
|
||||
}
|
||||
|
||||
companion object {
|
||||
const val size = 1000
|
||||
const val size = 100
|
||||
}
|
||||
}
|
@ -0,0 +1,69 @@
|
||||
package scientifik.kmath.structures
|
||||
|
||||
import org.openjdk.jmh.annotations.Benchmark
|
||||
import scientifik.kmath.operations.RealField
|
||||
|
||||
// JMH benchmarks that repeatedly add to an ND structure under each of the NDField
// contexts declared in the companion object (auto, real, buffered, lazy).
// NOTE(review): none of the benchmark methods returns `res` or passes it to a consumer,
// so the JIT may be free to drop part of the work — confirm whether results should be returned.
open class NDFieldBenchmark {
|
||||
|
||||
// Adds the field's `one` structure to an accumulator `n` times inside the `NDField.auto` context.
@Benchmark
|
||||
fun autoFieldAdd() {
|
||||
bufferedField.run {
|
||||
var res: NDBuffer<Double> = one
|
||||
repeat(n) {
|
||||
res += one
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Converts `one` to an element inside the `NDField.auto` context, then adds the scalar 1.0
// to it `n` times outside of any field `run` block.
@Benchmark
|
||||
fun autoElementAdd() {
|
||||
var res = bufferedField.run { one.toElement() }
|
||||
repeat(n) {
|
||||
res += 1.0
|
||||
}
|
||||
}
|
||||
|
||||
// Adds the scalar 1.0 to an accumulator `n` times inside the `NDField.real` context.
@Benchmark
|
||||
fun specializedFieldAdd() {
|
||||
specializedField.run {
|
||||
var res: NDBuffer<Double> = one
|
||||
repeat(n) {
|
||||
res += 1.0
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Adds `one` to an accumulator `n` times inside the `NDField.lazy` context, then sums
// the second component of every entry returned by `elements()`.
// NOTE(review): the final sum presumably exists to force evaluation of the lazy structure — confirm.
@Benchmark
|
||||
fun lazyFieldAdd() {
|
||||
lazyNDField.run {
|
||||
var res = one
|
||||
repeat(n) {
|
||||
res += one
|
||||
}
|
||||
|
||||
res.elements().sumByDouble { it.second }
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Adds `one` to an accumulator `n` times inside the `NDField.buffered` context.
@Benchmark
|
||||
fun boxingFieldAdd() {
|
||||
genericField.run {
|
||||
var res: NDBuffer<Double> = one
|
||||
repeat(n) {
|
||||
res += one
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Shared fixtures: every field below is built for the shape (dim, dim) over RealField.
companion object {
|
||||
// Extent of each of the two dimensions.
val dim = 1000
|
||||
// Number of additions performed per benchmark invocation.
val n = 100
|
||||
|
||||
val bufferedField = NDField.auto(intArrayOf(dim, dim), RealField)
|
||||
val specializedField = NDField.real(intArrayOf(dim, dim))
|
||||
val genericField = NDField.buffered(intArrayOf(dim, dim), RealField)
|
||||
val lazyNDField = NDField.lazy(intArrayOf(dim, dim), RealField)
|
||||
}
|
||||
}
|
@ -7,34 +7,29 @@ fun main(args: Array<String>) {
|
||||
val dim = 1000
|
||||
val n = 1000
|
||||
|
||||
val bufferedField = NDField.auto(intArrayOf(dim, dim), RealField)
|
||||
// automatically build context most suited for given type.
|
||||
val autoField = NDField.auto(intArrayOf(dim, dim), RealField)
|
||||
// specialized nd-field for Double. It works as generic Double field as well
|
||||
val specializedField = NDField.real(intArrayOf(dim, dim))
|
||||
//A field implementing lazy computations. All elements are computed on-demand
|
||||
val lazyField = NDField.lazy(intArrayOf(dim, dim), RealField)
|
||||
//A generic boxing field. It should be used for objects, not primitives.
|
||||
val genericField = NDField.buffered(intArrayOf(dim, dim), RealField)
|
||||
val lazyNDField = NDField.lazy(intArrayOf(dim, dim), RealField)
|
||||
|
||||
// val action: NDField<Double, DoubleField, NDStructure<Double>>.() -> Unit = {
|
||||
// var res = one
|
||||
// repeat(n) {
|
||||
// res += 1.0
|
||||
// }
|
||||
// }
|
||||
|
||||
|
||||
val doubleTime = measureTimeMillis {
|
||||
|
||||
bufferedField.run {
|
||||
var res: NDBuffer<Double> = one
|
||||
val autoTime = measureTimeMillis {
|
||||
autoField.run {
|
||||
var res = one
|
||||
repeat(n) {
|
||||
res += one
|
||||
res += 1.0
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
println("Buffered addition completed in $doubleTime millis")
|
||||
|
||||
println("Buffered addition completed in $autoTime millis")
|
||||
|
||||
val elementTime = measureTimeMillis {
|
||||
var res = bufferedField.run{one.toElement()}
|
||||
var res = genericField.one
|
||||
repeat(n) {
|
||||
res += 1.0
|
||||
}
|
||||
@ -43,9 +38,8 @@ fun main(args: Array<String>) {
|
||||
println("Element addition completed in $elementTime millis")
|
||||
|
||||
val specializedTime = measureTimeMillis {
|
||||
//specializedField.run(action)
|
||||
specializedField.run {
|
||||
var res: NDBuffer<Double> = one
|
||||
var res = one
|
||||
repeat(n) {
|
||||
res += 1.0
|
||||
}
|
||||
@ -56,17 +50,16 @@ fun main(args: Array<String>) {
|
||||
|
||||
|
||||
val lazyTime = measureTimeMillis {
|
||||
val tr : RealField.(Double)->Double = {arg->
|
||||
var r = arg
|
||||
lazyField.run {
|
||||
val res = one.map {
|
||||
var c = 0.0
|
||||
repeat(n) {
|
||||
r += 1.0
|
||||
c += 1.0
|
||||
}
|
||||
r
|
||||
c
|
||||
}
|
||||
lazyNDField.run {
|
||||
val res = one.map(tr)
|
||||
|
||||
res.elements().sumByDouble { it.second }
|
||||
res.elements().forEach { it.second }
|
||||
}
|
||||
}
|
||||
|
||||
@ -77,10 +70,11 @@ fun main(args: Array<String>) {
|
||||
genericField.run {
|
||||
var res: NDBuffer<Double> = one
|
||||
repeat(n) {
|
||||
res += one
|
||||
res += 1.0
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
println("Generic addition completed in $genericTime millis")
|
||||
|
||||
}
|
@ -28,7 +28,7 @@ allprojects {
|
||||
apply(plugin = "com.jfrog.artifactory")
|
||||
|
||||
group = "scientifik"
|
||||
version = "0.0.3-dev-1"
|
||||
version = "0.0.3-dev-2"
|
||||
|
||||
repositories {
|
||||
maven("https://dl.bintray.com/kotlin/kotlin-eap")
|
||||
|
@ -5,9 +5,123 @@ structures. In `kmath` performance depends on which particular context was used
|
||||
|
||||
Let us consider the following contexts:
|
||||
```kotlin
|
||||
// automatically build context
|
||||
val bufferedField = NDField.auto(intArrayOf(dim, dim), RealField)
|
||||
// specialized nd-field for Double. It works as generic Double field as well
|
||||
val specializedField = NDField.real(intArrayOf(dim, dim))
|
||||
|
||||
// automatically build context most suited for given type.
|
||||
val autoField = NDField.auto(intArrayOf(dim, dim), RealField)
|
||||
|
||||
//A field implementing lazy computations. All elements are computed on-demand
|
||||
val lazyField = NDField.lazy(intArrayOf(dim, dim), RealField)
|
||||
|
||||
//A generic boxing field. It should be used for objects, not primitives.
|
||||
val genericField = NDField.buffered(intArrayOf(dim, dim), RealField)
|
||||
val lazyNDField = NDField.lazy(intArrayOf(dim, dim), RealField)
|
||||
```
|
||||
Now let us perform several tests and see which implementation is best suited for each case:
|
||||
|
||||
## Test case
|
||||
|
||||
In order to test performance we will take 2d-structures with `dim = 1000` and add a structure filled with `1.0`
|
||||
to it `n = 1000` times.
|
||||
|
||||
## Specialized
|
||||
The code to run this looks like:
|
||||
```kotlin
|
||||
specializedField.run {
|
||||
var res = one
|
||||
repeat(n) {
|
||||
res += 1.0
|
||||
}
|
||||
}
|
||||
```
|
||||
The performance of this code is the best of all tests since it inlines all operations and is specialized for operation
|
||||
with doubles. We will measure everything else relative to this one, so time for this test will be `1x` (real time
|
||||
on my computer is about 4.5 seconds). The only problem with this approach is that it requires specifying the type
|
||||
from the beginning. Everyone does so anyway, so it is the recommended approach.
|
||||
|
||||
## Automatic
|
||||
Let's do the same with automatic field inference:
|
||||
```kotlin
|
||||
autoField.run {
|
||||
var res = one
|
||||
repeat(n) {
|
||||
res += 1.0
|
||||
}
|
||||
}
|
||||
```
|
||||
The speed of this operation is approximately the same as for the specialized case since `NDField.auto` just
|
||||
returns the same `RealNDField` in this case. Of course, it is usually better to use the specialized method to be sure.
|
||||
|
||||
## Lazy
|
||||
A lazy field does not produce a structure when asked; instead it generates an empty structure and fills it on demand
|
||||
using coroutines to parallelize computations.
|
||||
When one calls
|
||||
```kotlin
|
||||
lazyField.run {
|
||||
var res = one
|
||||
repeat(n) {
|
||||
res += 1.0
|
||||
}
|
||||
}
|
||||
```
|
||||
This call returns almost immediately, but the resulting structure is still empty. In order to get the full result
|
||||
structure one needs to request all of its elements. In this case the computation overhead will be huge. So this field should
|
||||
never be used if one expects to use the full result structure. However, if one wants only a small fraction of it, it could
|
||||
save a lot of time.
|
||||
|
||||
This field can still be used with reasonable performance if the calling code is changed:
|
||||
```kotlin
|
||||
lazyField.run {
|
||||
val res = one.map {
|
||||
var c = 0.0
|
||||
repeat(n) {
|
||||
c += 1.0
|
||||
}
|
||||
c
|
||||
}
|
||||
|
||||
res.elements().forEach { it.second }
|
||||
}
|
||||
```
|
||||
In this case it completes in about `4x-5x` time due to boxing.
|
||||
|
||||
## Boxing
|
||||
The boxing field produced by
|
||||
```kotlin
|
||||
genericField.run {
|
||||
var res = one
|
||||
repeat(n) {
|
||||
res += 1.0
|
||||
}
|
||||
}
|
||||
```
|
||||
is obviously the slowest one, because it has to box and unbox the `double` on each operation. It takes about
|
||||
`15x` time (**TODO: there seems to be a problem here, it should be slow, but not that slow**). This field should
|
||||
never be used for primitives.
|
||||
|
||||
## Element operation
|
||||
Let us also check the speed for direct operations on elements:
|
||||
```kotlin
|
||||
var res = genericField.one
|
||||
repeat(n) {
|
||||
res += 1.0
|
||||
}
|
||||
```
|
||||
One would expect it to be at least as slow as the field operation, but in fact it takes only `2x` time to complete.
|
||||
This happens because in this particular case it does not use the actual `NDField`, but is instead calculated directly
|
||||
via extension function.
|
||||
|
||||
## What about python?
|
||||
|
||||
Usually it is a bad idea to compare the performance of direct numerical operations across languages, but it is hard to
|
||||
work completely without a frame of reference. In this case, simple numpy code:
|
||||
```python
|
||||
res = np.ones((1000,1000))
|
||||
for i in range(1000):
|
||||
res = res + 1.0
|
||||
```
|
||||
gives a completion time of about `1.1x`, which means that the specialized Kotlin code is in fact faster (I think it is
|
||||
because of better memory management). Of course, if one writes `res += 1.0`, the performance will be different,
|
||||
but that would be a different case, because numpy overrides `+=` with in-place operations. In-place operations are
|
||||
available in `kmath` with `MutableNDStructure` but there is no field for it (one can still work with mapping
|
||||
functions).
|
@ -69,11 +69,11 @@ interface Ring<T> : Space<T> {
|
||||
|
||||
operator fun T.times(b: T): T = multiply(this, b)
|
||||
|
||||
operator fun T.plus(b: Number) = this.plus(b * one)
|
||||
operator fun Number.plus(b: T) = b + this
|
||||
|
||||
operator fun T.minus(b: Number) = this.minus(b * one)
|
||||
operator fun Number.minus(b: T) = -b + this
|
||||
// operator fun T.plus(b: Number) = this.plus(b * one)
|
||||
// operator fun Number.plus(b: T) = b + this
|
||||
//
|
||||
// operator fun T.minus(b: Number) = this.minus(b * one)
|
||||
// operator fun Number.minus(b: T) = -b + this
|
||||
}
|
||||
|
||||
abstract class AbstractRing<T : Any> : AbstractSpace<T>(), Ring<T> {
|
||||
|
@ -79,7 +79,7 @@ class RealNDField(shape: IntArray) :
|
||||
* Fast element production using function inlining
|
||||
*/
|
||||
inline fun StridedNDField<Double, RealField>.produceInline(crossinline initializer: RealField.(Int) -> Double): RealNDElement {
|
||||
val array = DoubleArray(strides.linearSize) { offset -> elementField.initializer(offset) }
|
||||
val array = DoubleArray(strides.linearSize) { offset -> RealField.initializer(offset) }
|
||||
return StridedNDElement(this, DoubleBuffer(array))
|
||||
}
|
||||
|
||||
|
@ -21,11 +21,13 @@ class LazyNDField<T, F : Field<T>>(shape: IntArray, field: F, val scope: Corouti
|
||||
check(arg)
|
||||
return if (arg is LazyNDStructure<T, *>) {
|
||||
LazyNDStructure(this) { index ->
|
||||
this.elementField.transform(index, arg.function(index))
|
||||
//FIXME if value of arg is already calculated, it should be used
|
||||
elementField.transform(index, arg.function(index))
|
||||
}
|
||||
} else {
|
||||
LazyNDStructure(this) { elementField.transform(it, arg.await(it)) }
|
||||
}
|
||||
// return LazyNDStructure(this) { elementField.transform(it, arg.await(it)) }
|
||||
}
|
||||
|
||||
override fun map(arg: NDStructure<T>, transform: F.(T) -> T) =
|
||||
@ -43,6 +45,7 @@ class LazyNDField<T, F : Field<T>>(shape: IntArray, field: F, val scope: Corouti
|
||||
} else {
|
||||
LazyNDStructure(this@LazyNDField) { elementField.transform(a.await(it), b.await(it)) }
|
||||
}
|
||||
// return LazyNDStructure(this) { elementField.transform(a.await(it), b.await(it)) }
|
||||
}
|
||||
|
||||
fun NDStructure<T>.lazy(): LazyNDStructure<T, F> {
|
||||
|
Loading…
Reference in New Issue
Block a user