kmath/examples/notebooks/Naive classifier.ipynb

419 lines
9.7 KiB
Plaintext
Raw Normal View History

2022-12-10 12:21:56 +03:00
{
"cells": [
{
"cell_type": "code",
"source": [
"%use kmath(0.3.1-dev-5)\n",
"%use plotly(0.5.0)\n",
"@file:DependsOn(\"space.kscience:kmath-commons:0.3.1-dev-5\")"
],
"execution_count": null,
"outputs": [],
"metadata": {
"datalore": {
"node_id": "lQbSB87rNAn9lV6poArVWW",
"type": "CODE",
"hide_input_from_viewers": false,
"hide_output_from_viewers": false
}
}
},
{
"cell_type": "code",
"source": [
"//Uncomment to work in Jupyter classic or DataLore\n",
"//Plotly.jupyter.notebook()"
],
"execution_count": null,
"outputs": [],
"metadata": {
"datalore": {
"node_id": "0UP158hfccGgjQtHz0wAi6",
"type": "CODE",
"hide_input_from_viewers": false,
"hide_output_from_viewers": false
}
}
},
{
"cell_type": "markdown",
"source": [
"# The model\n",
"\n",
"Defining the input data format, the statistic abstraction and the statistic implementation based on a weighted sum of elements."
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"source": [
"class XYValues(val xValues: DoubleArray, val yValues: DoubleArray) {\n",
" init {\n",
" require(xValues.size == yValues.size)\n",
" }\n",
"}\n",
"\n",
"fun interface XYStatistic {\n",
" operator fun invoke(values: XYValues): Double\n",
"}\n",
"\n",
"class ConvolutionalXYStatistic(val weights: DoubleArray) : XYStatistic {\n",
" override fun invoke(values: XYValues): Double {\n",
" require(weights.size == values.yValues.size)\n",
" val norm = values.yValues.sum()\n",
" return values.yValues.zip(weights) { value, weight -> value * weight }.sum()/norm\n",
" }\n",
"}"
],
"execution_count": null,
"outputs": [],
"metadata": {
"datalore": {
"node_id": "Zhgz1Ui91PWz0meJiQpHol",
"type": "CODE",
"hide_input_from_viewers": false,
"hide_output_from_viewers": false
}
}
},
{
"cell_type": "markdown",
"source": [
"# Generator\n",
"Generate sample data for parabolas and hyperbolas"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"fun generateParabolas(xValues: DoubleArray, a: Double, b: Double, c: Double): XYValues {\n",
" val yValues = xValues.map { x -> a * x * x + b * x + c }.toDoubleArray()\n",
" return XYValues(xValues, yValues)\n",
"}\n",
"\n",
"fun generateHyperbols(xValues: DoubleArray, gamma: Double, x0: Double, y0: Double): XYValues {\n",
" val yValues = xValues.map { x -> y0 + gamma / (x - x0) }.toDoubleArray()\n",
" return XYValues(xValues, yValues)\n",
"}"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"source": [
"val xValues = (1.0..10.0).step(1.0).toDoubleArray()\n",
"\n",
"val xy = generateHyperbols(xValues, 1.0, 0.0, 0.0)\n",
"\n",
"Plotly.plot {\n",
" scatter {\n",
" this.x.doubles = xValues\n",
" this.y.doubles = xy.yValues\n",
" }\n",
"}"
],
"execution_count": null,
"outputs": [],
"metadata": {
"datalore": {
"node_id": "ZE2atNvFzQsCvpAF8KK4ch",
"type": "CODE",
"hide_input_from_viewers": false,
"hide_output_from_viewers": false
}
}
},
{
"cell_type": "markdown",
"source": [
"Create a default statistic with uniform weights"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"source": [
"val statistic = ConvolutionalXYStatistic(DoubleArray(xValues.size){1.0})\n",
"statistic(xy)"
],
"execution_count": null,
"outputs": [],
"metadata": {
"datalore": {
"node_id": "EA5HaydTddRKYrtAUwd29h",
"type": "CODE",
"hide_input_from_viewers": false,
"hide_output_from_viewers": false
}
}
},
{
"cell_type": "code",
"source": [
"import kotlin.random.Random\n",
"\n",
"val random = Random(1288)\n",
"\n",
"val parabolas = buildList{\n",
" repeat(500){\n",
" add(\n",
" generateParabolas(\n",
" xValues, \n",
" random.nextDouble(), \n",
" random.nextDouble(), \n",
" random.nextDouble()\n",
" )\n",
" )\n",
" }\n",
"}\n",
"\n",
"val hyperbolas: List<XYValues> = buildList{\n",
" repeat(500){\n",
" add(\n",
" generateHyperbols(\n",
" xValues, \n",
" random.nextDouble()*10, \n",
" random.nextDouble(), \n",
" random.nextDouble()\n",
" )\n",
" )\n",
" }\n",
"}"
],
"execution_count": null,
"outputs": [],
"metadata": {
"datalore": {
"node_id": "t5t6IYmD7Q1ykeo9uijFfQ",
"type": "CODE",
"hide_input_from_viewers": false,
"hide_output_from_viewers": false
}
}
},
{
"cell_type": "code",
"source": [
"Plotly.plot { \n",
" scatter { \n",
" x.doubles = xValues\n",
" y.doubles = parabolas[257].yValues\n",
" }\n",
" scatter { \n",
" x.doubles = xValues\n",
" y.doubles = hyperbolas[252].yValues\n",
" }\n",
" }"
],
"execution_count": null,
"outputs": [],
"metadata": {
"datalore": {
"node_id": "oXB8lmju7YVYjMRXITKnhO",
"type": "CODE",
"hide_input_from_viewers": false,
"hide_output_from_viewers": false
}
}
},
{
"cell_type": "code",
"source": [
"Plotly.plot { \n",
" histogram { \n",
" name = \"parabolae\"\n",
" x.numbers = parabolas.map { statistic(it) }\n",
" }\n",
" histogram { \n",
" name = \"hyperbolae\"\n",
" x.numbers = hyperbolas.map { statistic(it) }\n",
" }\n",
"}"
],
"execution_count": null,
"outputs": [],
"metadata": {
"datalore": {
"node_id": "8EIIecUZrt2NNrOkhxG5P0",
"type": "CODE",
"hide_input_from_viewers": false,
"hide_output_from_viewers": false
}
}
},
{
"cell_type": "code",
"source": [
"val lossFunction: (XYStatistic) -> Double = { statistic ->\n",
" - abs(parabolas.sumOf { statistic(it) } - hyperbolas.sumOf { statistic(it) })\n",
"}"
],
"execution_count": null,
"outputs": [],
"metadata": {
"datalore": {
"node_id": "h7UmglJW5zXkAfKHK40oIL",
"type": "CODE",
"hide_input_from_viewers": false,
"hide_output_from_viewers": false
}
}
},
{
"cell_type": "markdown",
"source": [
"Using commons-math optimizer to optimize weights"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"source": [
"import org.apache.commons.math3.optim.*\n",
"import org.apache.commons.math3.optim.nonlinear.scalar.*\n",
"import org.apache.commons.math3.optim.nonlinear.scalar.noderiv.*\n",
"\n",
"val optimizer = SimplexOptimizer(1e-1, Double.MAX_VALUE)\n",
"\n",
"val result = optimizer.optimize(\n",
" ObjectiveFunction { point ->\n",
" lossFunction(ConvolutionalXYStatistic(point))\n",
" },\n",
" NelderMeadSimplex(xValues.size),\n",
" InitialGuess(DoubleArray(xValues.size){ 1.0 }),\n",
" GoalType.MINIMIZE,\n",
" MaxEval(100000)\n",
")"
],
"execution_count": null,
"outputs": [],
"metadata": {
"datalore": {
"node_id": "0EG3K4aCUciMlgGQKPvJ57",
"type": "CODE",
"hide_input_from_viewers": false,
"hide_output_from_viewers": false
}
}
},
{
"cell_type": "markdown",
"source": [
"Print resulting weights of optimization"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"source": [
"result.point"
],
"execution_count": null,
"outputs": [],
"metadata": {
"datalore": {
"node_id": "LelUlY0ZSlJEO9yC6SLk5B",
"type": "CODE",
"hide_input_from_viewers": false,
"hide_output_from_viewers": false
}
}
},
{
"cell_type": "code",
"source": [
"Plotly.plot { \n",
" scatter { \n",
" y.doubles = result.point\n",
" }\n",
"}"
],
"execution_count": null,
"outputs": [],
"metadata": {
"datalore": {
"node_id": "AuFOq5t9KpOIkGrOLsVXNf",
"type": "CODE",
"hide_input_from_viewers": false,
"hide_output_from_viewers": false
}
}
},
{
"cell_type": "markdown",
"source": [
"# The resulting statistic distribution"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"source": [
"val resultStatistic = ConvolutionalXYStatistic(result.point)\n",
"Plotly.plot { \n",
" histogram { \n",
" name = \"parabolae\"\n",
" x.numbers = parabolas.map { resultStatistic(it) }\n",
" }\n",
" histogram { \n",
" name = \"hyperbolae\"\n",
" x.numbers = hyperbolas.map { resultStatistic(it) }\n",
" }\n",
"}"
],
"execution_count": null,
"outputs": [],
"metadata": {
"datalore": {
"node_id": "zvmq42DRdM5mZ3SpzviHwI",
"type": "CODE",
"hide_input_from_viewers": false,
"hide_output_from_viewers": false
}
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [],
"metadata": {
"collapsed": false
}
}
],
"metadata": {
"kernelspec": {
"display_name": "Kotlin",
"language": "kotlin",
"name": "kotlin"
},
"datalore": {
"version": 1,
"computation_mode": "JUPYTER",
"package_manager": "pip",
"base_environment": "default",
"packages": []
}
},
"nbformat": 4,
"nbformat_minor": 4
}