From 29977650f10d0d1168a4f218b2b71e371b567653 Mon Sep 17 00:00:00 2001 From: Alexander Nozik Date: Sat, 10 Dec 2022 12:21:56 +0300 Subject: [PATCH] Naive classifier notebook --- .gitignore | 3 +- examples/notebooks/Naive classifier.ipynb | 418 ++++++++++++++++++++++ 2 files changed, 419 insertions(+), 2 deletions(-) create mode 100644 examples/notebooks/Naive classifier.ipynb diff --git a/.gitignore b/.gitignore index 5ddd846a8..34ddf3fd9 100644 --- a/.gitignore +++ b/.gitignore @@ -3,10 +3,9 @@ build/ out/ .idea/ - - .vscode/ + # Avoid ignoring Gradle wrapper jar file (.jar files are usually ignored) !gradle-wrapper.jar diff --git a/examples/notebooks/Naive classifier.ipynb b/examples/notebooks/Naive classifier.ipynb new file mode 100644 index 000000000..937f5b6c6 --- /dev/null +++ b/examples/notebooks/Naive classifier.ipynb @@ -0,0 +1,418 @@ +{ + "cells": [ + { + "cell_type": "code", + "source": [ + "%use kmath(0.3.1-dev-5)\n", + "%use plotly(0.5.0)\n", + "@file:DependsOn(\"space.kscience:kmath-commons:0.3.1-dev-5\")" + ], + "execution_count": null, + "outputs": [], + "metadata": { + "datalore": { + "node_id": "lQbSB87rNAn9lV6poArVWW", + "type": "CODE", + "hide_input_from_viewers": false, + "hide_output_from_viewers": false + } + } + }, + { + "cell_type": "code", + "source": [ + "//Uncomment to work in Jupyter classic or DataLore\n", + "//Plotly.jupyter.notebook()" + ], + "execution_count": null, + "outputs": [], + "metadata": { + "datalore": { + "node_id": "0UP158hfccGgjQtHz0wAi6", + "type": "CODE", + "hide_input_from_viewers": false, + "hide_output_from_viewers": false + } + } + }, + { + "cell_type": "markdown", + "source": [ + "# The model\n", + "\n", + "Defining the input data format, the statistic abstraction and the statistic implementation based on a weighted sum of elements." 
+ ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "source": [ + "class XYValues(val xValues: DoubleArray, val yValues: DoubleArray) {\n", + " init {\n", + " require(xValues.size == yValues.size)\n", + " }\n", + "}\n", + "\n", + "fun interface XYStatistic {\n", + " operator fun invoke(values: XYValues): Double\n", + "}\n", + "\n", + "class ConvolutionalXYStatistic(val weights: DoubleArray) : XYStatistic {\n", + " override fun invoke(values: XYValues): Double {\n", + " require(weights.size == values.yValues.size)\n", + " val norm = values.yValues.sum()\n", + " return values.yValues.zip(weights) { value, weight -> value * weight }.sum()/norm\n", + " }\n", + "}" + ], + "execution_count": null, + "outputs": [], + "metadata": { + "datalore": { + "node_id": "Zhgz1Ui91PWz0meJiQpHol", + "type": "CODE", + "hide_input_from_viewers": false, + "hide_output_from_viewers": false + } + } + }, + { + "cell_type": "markdown", + "source": [ + "# Generator\n", + "Generate sample data for parabolas and hyperbolas" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "fun generateParabolas(xValues: DoubleArray, a: Double, b: Double, c: Double): XYValues {\n", + " val yValues = xValues.map { x -> a * x * x + b * x + c }.toDoubleArray()\n", + " return XYValues(xValues, yValues)\n", + "}\n", + "\n", + "fun generateHyperbols(xValues: DoubleArray, gamma: Double, x0: Double, y0: Double): XYValues {\n", + " val yValues = xValues.map { x -> y0 + gamma / (x - x0) }.toDoubleArray()\n", + " return XYValues(xValues, yValues)\n", + "}" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "source": [ + "val xValues = (1.0..10.0).step(1.0).toDoubleArray()\n", + "\n", + "val xy = generateHyperbols(xValues, 1.0, 0.0, 0.0)\n", + "\n", + "Plotly.plot {\n", + " scatter {\n", + " this.x.doubles = xValues\n", + " this.y.doubles = xy.yValues\n", + " }\n", + "}" + ], + 
"execution_count": null, + "outputs": [], + "metadata": { + "datalore": { + "node_id": "ZE2atNvFzQsCvpAF8KK4ch", + "type": "CODE", + "hide_input_from_viewers": false, + "hide_output_from_viewers": false + } + } + }, + { + "cell_type": "markdown", + "source": [ + "Create a default statistic with uniform weights" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "source": [ + "val statistic = ConvolutionalXYStatistic(DoubleArray(xValues.size){1.0})\n", + "statistic(xy)" + ], + "execution_count": null, + "outputs": [], + "metadata": { + "datalore": { + "node_id": "EA5HaydTddRKYrtAUwd29h", + "type": "CODE", + "hide_input_from_viewers": false, + "hide_output_from_viewers": false + } + } + }, + { + "cell_type": "code", + "source": [ + "import kotlin.random.Random\n", + "\n", + "val random = Random(1288)\n", + "\n", + "val parabolas = buildList{\n", + " repeat(500){\n", + " add(\n", + " generateParabolas(\n", + " xValues, \n", + " random.nextDouble(), \n", + " random.nextDouble(), \n", + " random.nextDouble()\n", + " )\n", + " )\n", + " }\n", + "}\n", + "\n", + "val hyperbolas: List<XYValues> = buildList{\n", + " repeat(500){\n", + " add(\n", + " generateHyperbols(\n", + " xValues, \n", + " random.nextDouble()*10, \n", + " random.nextDouble(), \n", + " random.nextDouble()\n", + " )\n", + " )\n", + " }\n", + "}" + ], + "execution_count": null, + "outputs": [], + "metadata": { + "datalore": { + "node_id": "t5t6IYmD7Q1ykeo9uijFfQ", + "type": "CODE", + "hide_input_from_viewers": false, + "hide_output_from_viewers": false + } + } + }, + { + "cell_type": "code", + "source": [ + "Plotly.plot { \n", + " scatter { \n", + " x.doubles = xValues\n", + " y.doubles = parabolas[257].yValues\n", + " }\n", + " scatter { \n", + " x.doubles = xValues\n", + " y.doubles = hyperbolas[252].yValues\n", + " }\n", + " }" + ], + "execution_count": null, + "outputs": [], + "metadata": { + "datalore": { + "node_id": "oXB8lmju7YVYjMRXITKnhO", + "type": "CODE", + 
"hide_input_from_viewers": false, + "hide_output_from_viewers": false + } + } + }, + { + "cell_type": "code", + "source": [ + "Plotly.plot { \n", + " histogram { \n", + " name = \"parabolae\"\n", + " x.numbers = parabolas.map { statistic(it) }\n", + " }\n", + " histogram { \n", + " name = \"hyperbolae\"\n", + " x.numbers = hyperbolas.map { statistic(it) }\n", + " }\n", + "}" + ], + "execution_count": null, + "outputs": [], + "metadata": { + "datalore": { + "node_id": "8EIIecUZrt2NNrOkhxG5P0", + "type": "CODE", + "hide_input_from_viewers": false, + "hide_output_from_viewers": false + } + } + }, + { + "cell_type": "code", + "source": [ + "val lossFunction: (XYStatistic) -> Double = { statistic ->\n", + " - abs(parabolas.sumOf { statistic(it) } - hyperbolas.sumOf { statistic(it) })\n", + "}" + ], + "execution_count": null, + "outputs": [], + "metadata": { + "datalore": { + "node_id": "h7UmglJW5zXkAfKHK40oIL", + "type": "CODE", + "hide_input_from_viewers": false, + "hide_output_from_viewers": false + } + } + }, + { + "cell_type": "markdown", + "source": [ + "Using commons-math optimizer to optimize weights" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "source": [ + "import org.apache.commons.math3.optim.*\n", + "import org.apache.commons.math3.optim.nonlinear.scalar.*\n", + "import org.apache.commons.math3.optim.nonlinear.scalar.noderiv.*\n", + "\n", + "val optimizer = SimplexOptimizer(1e-1, Double.MAX_VALUE)\n", + "\n", + "val result = optimizer.optimize(\n", + " ObjectiveFunction { point ->\n", + " lossFunction(ConvolutionalXYStatistic(point))\n", + " },\n", + " NelderMeadSimplex(xValues.size),\n", + " InitialGuess(DoubleArray(xValues.size){ 1.0 }),\n", + " GoalType.MINIMIZE,\n", + " MaxEval(100000)\n", + ")" + ], + "execution_count": null, + "outputs": [], + "metadata": { + "datalore": { + "node_id": "0EG3K4aCUciMlgGQKPvJ57", + "type": "CODE", + "hide_input_from_viewers": false, + "hide_output_from_viewers": false + } + } + }, + { 
+ "cell_type": "markdown", + "source": [ + "Print resulting weights of optimization" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "source": [ + "result.point" + ], + "execution_count": null, + "outputs": [], + "metadata": { + "datalore": { + "node_id": "LelUlY0ZSlJEO9yC6SLk5B", + "type": "CODE", + "hide_input_from_viewers": false, + "hide_output_from_viewers": false + } + } + }, + { + "cell_type": "code", + "source": [ + "Plotly.plot { \n", + " scatter { \n", + " y.doubles = result.point\n", + " }\n", + "}" + ], + "execution_count": null, + "outputs": [], + "metadata": { + "datalore": { + "node_id": "AuFOq5t9KpOIkGrOLsVXNf", + "type": "CODE", + "hide_input_from_viewers": false, + "hide_output_from_viewers": false + } + } + }, + { + "cell_type": "markdown", + "source": [ + "# The resulting statistic distribution" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "source": [ + "val resultStatistic = ConvolutionalXYStatistic(result.point)\n", + "Plotly.plot { \n", + " histogram { \n", + " name = \"parabolae\"\n", + " x.numbers = parabolas.map { resultStatistic(it) }\n", + " }\n", + " histogram { \n", + " name = \"hyperbolae\"\n", + " x.numbers = hyperbolas.map { resultStatistic(it) }\n", + " }\n", + "}" + ], + "execution_count": null, + "outputs": [], + "metadata": { + "datalore": { + "node_id": "zvmq42DRdM5mZ3SpzviHwI", + "type": "CODE", + "hide_input_from_viewers": false, + "hide_output_from_viewers": false + } + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [], + "metadata": { + "collapsed": false + } + } + ], + "metadata": { + "kernelspec": { + "display_name": "Kotlin", + "language": "kotlin", + "name": "kotlin" + }, + "datalore": { + "version": 1, + "computation_mode": "JUPYTER", + "package_manager": "pip", + "base_environment": "default", + "packages": [] + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}