{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Lesson 21: Introduction to NumPy and SciPy\n", "\n", "
" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "
\n", " \n", " Loading BokehJS ...\n", "
" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/javascript": [ "\n", "(function(root) {\n", " function now() {\n", " return new Date();\n", " }\n", "\n", " const force = true;\n", "\n", " if (typeof root._bokeh_onload_callbacks === \"undefined\" || force === true) {\n", " root._bokeh_onload_callbacks = [];\n", " root._bokeh_is_loading = undefined;\n", " }\n", "\n", " const JS_MIME_TYPE = 'application/javascript';\n", " const HTML_MIME_TYPE = 'text/html';\n", " const EXEC_MIME_TYPE = 'application/vnd.bokehjs_exec.v0+json';\n", " const CLASS_NAME = 'output_bokeh rendered_html';\n", "\n", " /**\n", " * Render data to the DOM node\n", " */\n", " function render(props, node) {\n", " const script = document.createElement(\"script\");\n", " node.appendChild(script);\n", " }\n", "\n", " /**\n", " * Handle when an output is cleared or removed\n", " */\n", " function handleClearOutput(event, handle) {\n", " const cell = handle.cell;\n", "\n", " const id = cell.output_area._bokeh_element_id;\n", " const server_id = cell.output_area._bokeh_server_id;\n", " // Clean up Bokeh references\n", " if (id != null && id in Bokeh.index) {\n", " Bokeh.index[id].model.document.clear();\n", " delete Bokeh.index[id];\n", " }\n", "\n", " if (server_id !== undefined) {\n", " // Clean up Bokeh references\n", " const cmd_clean = \"from bokeh.io.state import curstate; print(curstate().uuid_to_server['\" + server_id + \"'].get_sessions()[0].document.roots[0]._id)\";\n", " cell.notebook.kernel.execute(cmd_clean, {\n", " iopub: {\n", " output: function(msg) {\n", " const id = msg.content.text.trim();\n", " if (id in Bokeh.index) {\n", " Bokeh.index[id].model.document.clear();\n", " delete Bokeh.index[id];\n", " }\n", " }\n", " }\n", " });\n", " // Destroy server and session\n", " const cmd_destroy = \"import bokeh.io.notebook as ion; ion.destroy_server('\" + server_id + \"')\";\n", " cell.notebook.kernel.execute(cmd_destroy);\n", " }\n", " }\n", "\n", " /**\n", " * 
Handle when a new output is added\n", " */\n", " function handleAddOutput(event, handle) {\n", " const output_area = handle.output_area;\n", " const output = handle.output;\n", "\n", " // limit handleAddOutput to display_data with EXEC_MIME_TYPE content only\n", " if ((output.output_type != \"display_data\") || (!Object.prototype.hasOwnProperty.call(output.data, EXEC_MIME_TYPE))) {\n", " return\n", " }\n", "\n", " const toinsert = output_area.element.find(\".\" + CLASS_NAME.split(' ')[0]);\n", "\n", " if (output.metadata[EXEC_MIME_TYPE][\"id\"] !== undefined) {\n", " toinsert[toinsert.length - 1].firstChild.textContent = output.data[JS_MIME_TYPE];\n", " // store reference to embed id on output_area\n", " output_area._bokeh_element_id = output.metadata[EXEC_MIME_TYPE][\"id\"];\n", " }\n", " if (output.metadata[EXEC_MIME_TYPE][\"server_id\"] !== undefined) {\n", " const bk_div = document.createElement(\"div\");\n", " bk_div.innerHTML = output.data[HTML_MIME_TYPE];\n", " const script_attrs = bk_div.children[0].attributes;\n", " for (let i = 0; i < script_attrs.length; i++) {\n", " toinsert[toinsert.length - 1].firstChild.setAttribute(script_attrs[i].name, script_attrs[i].value);\n", " toinsert[toinsert.length - 1].firstChild.textContent = bk_div.children[0].textContent\n", " }\n", " // store reference to server id on output_area\n", " output_area._bokeh_server_id = output.metadata[EXEC_MIME_TYPE][\"server_id\"];\n", " }\n", " }\n", "\n", " function register_renderer(events, OutputArea) {\n", "\n", " function append_mime(data, metadata, element) {\n", " // create a DOM node to render to\n", " const toinsert = this.create_output_subarea(\n", " metadata,\n", " CLASS_NAME,\n", " EXEC_MIME_TYPE\n", " );\n", " this.keyboard_manager.register_events(toinsert);\n", " // Render to node\n", " const props = {data: data, metadata: metadata[EXEC_MIME_TYPE]};\n", " render(props, toinsert[toinsert.length - 1]);\n", " element.append(toinsert);\n", " return toinsert\n", " }\n", "\n", " 
/* Handle when an output is cleared or removed */\n", " events.on('clear_output.CodeCell', handleClearOutput);\n", " events.on('delete.Cell', handleClearOutput);\n", "\n", " /* Handle when a new output is added */\n", " events.on('output_added.OutputArea', handleAddOutput);\n", "\n", " /**\n", " * Register the mime type and append_mime function with output_area\n", " */\n", " OutputArea.prototype.register_mime_type(EXEC_MIME_TYPE, append_mime, {\n", " /* Is output safe? */\n", " safe: true,\n", " /* Index of renderer in `output_area.display_order` */\n", " index: 0\n", " });\n", " }\n", "\n", " // register the mime type if in Jupyter Notebook environment and previously unregistered\n", " if (root.Jupyter !== undefined) {\n", " const events = require('base/js/events');\n", " const OutputArea = require('notebook/js/outputarea').OutputArea;\n", "\n", " if (OutputArea.prototype.mime_types().indexOf(EXEC_MIME_TYPE) == -1) {\n", " register_renderer(events, OutputArea);\n", " }\n", " }\n", "\n", " \n", " if (typeof (root._bokeh_timeout) === \"undefined\" || force === true) {\n", " root._bokeh_timeout = Date.now() + 5000;\n", " root._bokeh_failed_load = false;\n", " }\n", "\n", " const NB_LOAD_WARNING = {'data': {'text/html':\n", " \"
\\n\"+\n", " \"

\\n\"+\n", " \"BokehJS does not appear to have successfully loaded. If loading BokehJS from CDN, this \\n\"+\n", " \"may be due to a slow or bad network connection. Possible fixes:\\n\"+\n", " \"

\\n\"+\n", " \"\\n\"+\n", " \"\\n\"+\n", " \"from bokeh.resources import INLINE\\n\"+\n", " \"output_notebook(resources=INLINE)\\n\"+\n", " \"\\n\"+\n", " \"
\"}};\n", "\n", " function display_loaded() {\n", " const el = document.getElementById(\"1002\");\n", " if (el != null) {\n", " el.textContent = \"BokehJS is loading...\";\n", " }\n", " if (root.Bokeh !== undefined) {\n", " if (el != null) {\n", " el.textContent = \"BokehJS \" + root.Bokeh.version + \" successfully loaded.\";\n", " }\n", " } else if (Date.now() < root._bokeh_timeout) {\n", " setTimeout(display_loaded, 100)\n", " }\n", " }\n", "\n", "\n", " function run_callbacks() {\n", " try {\n", " root._bokeh_onload_callbacks.forEach(function(callback) {\n", " if (callback != null)\n", " callback();\n", " });\n", " } finally {\n", " delete root._bokeh_onload_callbacks\n", " }\n", " console.debug(\"Bokeh: all callbacks have finished\");\n", " }\n", "\n", " function load_libs(css_urls, js_urls, callback) {\n", " if (css_urls == null) css_urls = [];\n", " if (js_urls == null) js_urls = [];\n", "\n", " root._bokeh_onload_callbacks.push(callback);\n", " if (root._bokeh_is_loading > 0) {\n", " console.debug(\"Bokeh: BokehJS is being loaded, scheduling callback at\", now());\n", " return null;\n", " }\n", " if (js_urls == null || js_urls.length === 0) {\n", " run_callbacks();\n", " return null;\n", " }\n", " console.debug(\"Bokeh: BokehJS not loaded, scheduling load and callback at\", now());\n", " root._bokeh_is_loading = css_urls.length + js_urls.length;\n", "\n", " function on_load() {\n", " root._bokeh_is_loading--;\n", " if (root._bokeh_is_loading === 0) {\n", " console.debug(\"Bokeh: all BokehJS libraries/stylesheets loaded\");\n", " run_callbacks()\n", " }\n", " }\n", "\n", " function on_error(url) {\n", " console.error(\"failed to load \" + url);\n", " }\n", "\n", " for (let i = 0; i < css_urls.length; i++) {\n", " const url = css_urls[i];\n", " const element = document.createElement(\"link\");\n", " element.onload = on_load;\n", " element.onerror = on_error.bind(null, url);\n", " element.rel = \"stylesheet\";\n", " element.type = \"text/css\";\n", " 
element.href = url;\n", " console.debug(\"Bokeh: injecting link tag for BokehJS stylesheet: \", url);\n", " document.body.appendChild(element);\n", " }\n", "\n", " for (let i = 0; i < js_urls.length; i++) {\n", " const url = js_urls[i];\n", " const element = document.createElement('script');\n", " element.onload = on_load;\n", " element.onerror = on_error.bind(null, url);\n", " element.async = false;\n", " element.src = url;\n", " console.debug(\"Bokeh: injecting script tag for BokehJS library: \", url);\n", " document.head.appendChild(element);\n", " }\n", " };\n", "\n", " function inject_raw_css(css) {\n", " const element = document.createElement(\"style\");\n", " element.appendChild(document.createTextNode(css));\n", " document.body.appendChild(element);\n", " }\n", "\n", " \n", " const js_urls = [\"https://cdn.bokeh.org/bokeh/release/bokeh-2.4.2.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-gl-2.4.2.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-widgets-2.4.2.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-tables-2.4.2.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-mathjax-2.4.2.min.js\"];\n", " const css_urls = [];\n", " \n", "\n", " const inline_js = [\n", " function(Bokeh) {\n", " Bokeh.set_log_level(\"info\");\n", " },\n", " function(Bokeh) {\n", " \n", " \n", " }\n", " ];\n", "\n", " function run_inline_js() {\n", " \n", " if (root.Bokeh !== undefined || force === true) {\n", " \n", " for (let i = 0; i < inline_js.length; i++) {\n", " inline_js[i].call(root, root.Bokeh);\n", " }\n", " if (force === true) {\n", " display_loaded();\n", " }} else if (Date.now() < root._bokeh_timeout) {\n", " setTimeout(run_inline_js, 100);\n", " } else if (!root._bokeh_failed_load) {\n", " console.log(\"Bokeh: BokehJS failed to load within specified timeout.\");\n", " root._bokeh_failed_load = true;\n", " } else if (force !== true) {\n", " const cell = $(document.getElementById(\"1002\")).parents('.cell').data().cell;\n", " 
cell.output_area.append_execute_result(NB_LOAD_WARNING)\n", " }\n", "\n", " }\n", "\n", " if (root._bokeh_is_loading === 0) {\n", " console.debug(\"Bokeh: BokehJS loaded, going straight to plotting\");\n", " run_inline_js();\n", " } else {\n", " load_libs(css_urls, js_urls, function() {\n", " console.debug(\"Bokeh: BokehJS plotting callback run at\", now());\n", " run_inline_js();\n", " });\n", " }\n", "}(window));" ], "application/vnd.bokehjs_load.v0+json": "\n(function(root) {\n function now() {\n return new Date();\n }\n\n const force = true;\n\n if (typeof root._bokeh_onload_callbacks === \"undefined\" || force === true) {\n root._bokeh_onload_callbacks = [];\n root._bokeh_is_loading = undefined;\n }\n\n \n\n \n if (typeof (root._bokeh_timeout) === \"undefined\" || force === true) {\n root._bokeh_timeout = Date.now() + 5000;\n root._bokeh_failed_load = false;\n }\n\n const NB_LOAD_WARNING = {'data': {'text/html':\n \"
\\n\"+\n \"

\\n\"+\n \"BokehJS does not appear to have successfully loaded. If loading BokehJS from CDN, this \\n\"+\n \"may be due to a slow or bad network connection. Possible fixes:\\n\"+\n \"

\\n\"+\n \"\\n\"+\n \"\\n\"+\n \"from bokeh.resources import INLINE\\n\"+\n \"output_notebook(resources=INLINE)\\n\"+\n \"\\n\"+\n \"
\"}};\n\n function display_loaded() {\n const el = document.getElementById(\"1002\");\n if (el != null) {\n el.textContent = \"BokehJS is loading...\";\n }\n if (root.Bokeh !== undefined) {\n if (el != null) {\n el.textContent = \"BokehJS \" + root.Bokeh.version + \" successfully loaded.\";\n }\n } else if (Date.now() < root._bokeh_timeout) {\n setTimeout(display_loaded, 100)\n }\n }\n\n\n function run_callbacks() {\n try {\n root._bokeh_onload_callbacks.forEach(function(callback) {\n if (callback != null)\n callback();\n });\n } finally {\n delete root._bokeh_onload_callbacks\n }\n console.debug(\"Bokeh: all callbacks have finished\");\n }\n\n function load_libs(css_urls, js_urls, callback) {\n if (css_urls == null) css_urls = [];\n if (js_urls == null) js_urls = [];\n\n root._bokeh_onload_callbacks.push(callback);\n if (root._bokeh_is_loading > 0) {\n console.debug(\"Bokeh: BokehJS is being loaded, scheduling callback at\", now());\n return null;\n }\n if (js_urls == null || js_urls.length === 0) {\n run_callbacks();\n return null;\n }\n console.debug(\"Bokeh: BokehJS not loaded, scheduling load and callback at\", now());\n root._bokeh_is_loading = css_urls.length + js_urls.length;\n\n function on_load() {\n root._bokeh_is_loading--;\n if (root._bokeh_is_loading === 0) {\n console.debug(\"Bokeh: all BokehJS libraries/stylesheets loaded\");\n run_callbacks()\n }\n }\n\n function on_error(url) {\n console.error(\"failed to load \" + url);\n }\n\n for (let i = 0; i < css_urls.length; i++) {\n const url = css_urls[i];\n const element = document.createElement(\"link\");\n element.onload = on_load;\n element.onerror = on_error.bind(null, url);\n element.rel = \"stylesheet\";\n element.type = \"text/css\";\n element.href = url;\n console.debug(\"Bokeh: injecting link tag for BokehJS stylesheet: \", url);\n document.body.appendChild(element);\n }\n\n for (let i = 0; i < js_urls.length; i++) {\n const url = js_urls[i];\n const element = document.createElement('script');\n 
element.onload = on_load;\n element.onerror = on_error.bind(null, url);\n element.async = false;\n element.src = url;\n console.debug(\"Bokeh: injecting script tag for BokehJS library: \", url);\n document.head.appendChild(element);\n }\n };\n\n function inject_raw_css(css) {\n const element = document.createElement(\"style\");\n element.appendChild(document.createTextNode(css));\n document.body.appendChild(element);\n }\n\n \n const js_urls = [\"https://cdn.bokeh.org/bokeh/release/bokeh-2.4.2.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-gl-2.4.2.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-widgets-2.4.2.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-tables-2.4.2.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-mathjax-2.4.2.min.js\"];\n const css_urls = [];\n \n\n const inline_js = [\n function(Bokeh) {\n Bokeh.set_log_level(\"info\");\n },\n function(Bokeh) {\n \n \n }\n ];\n\n function run_inline_js() {\n \n if (root.Bokeh !== undefined || force === true) {\n \n for (let i = 0; i < inline_js.length; i++) {\n inline_js[i].call(root, root.Bokeh);\n }\n if (force === true) {\n display_loaded();\n }} else if (Date.now() < root._bokeh_timeout) {\n setTimeout(run_inline_js, 100);\n } else if (!root._bokeh_failed_load) {\n console.log(\"Bokeh: BokehJS failed to load within specified timeout.\");\n root._bokeh_failed_load = true;\n } else if (force !== true) {\n const cell = $(document.getElementById(\"1002\")).parents('.cell').data().cell;\n cell.output_area.append_execute_result(NB_LOAD_WARNING)\n }\n\n }\n\n if (root._bokeh_is_loading === 0) {\n console.debug(\"Bokeh: BokehJS loaded, going straight to plotting\");\n run_inline_js();\n } else {\n load_libs(css_urls, js_urls, function() {\n console.debug(\"Bokeh: BokehJS plotting callback run at\", now());\n run_inline_js();\n });\n }\n}(window));" }, "metadata": {}, "output_type": "display_data" } ], "source": [ "import numpy as np\n", "import pandas as pd\n", "\n", "# We'll demo a bit of 
Scipy\n", "import scipy.special\n", "\n", "import iqplot\n", "\n", "import bokeh.io\n", "import bokeh.plotting\n", "\n", "bokeh.io.output_notebook()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "
\n", "\n", "In this lesson, you will learn about [NumPy](http://www.numpy.org), arguably *the* most important package for scientific computing, and SciPy, a package containing lots of goodies for scientific computing, like special functions and numerical integrators. " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## A very brief introduction to NumPy arrays\n", "\n", "The central object for NumPy and SciPy is the `ndarray`, commonly referred to as a \"NumPy array.\" This is an array object that is convenient for scientific computing. We will go over it in depth in the next lesson, but for now, let's just create some NumPy arrays and see how operators work on them.\n", "\n", "Just like with type conversions with lists, tuples, and other data types we've looked at, we can convert a list to a NumPy array using\n", "\n", " np.array()\n", " \n", "Note that above we imported the NumPy package with the `np` alias. This is for convenience; it allows us to use `np` as a prefix instead of `numpy`. NumPy is in *very* widespread use, and the convention is to use the `np` abbreviation." ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([1, 2, 3, 4])" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Create a NumPy array from a list\n", "my_ar = np.array([1, 2, 3, 4])\n", "\n", "# Look at it\n", "my_ar" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We see that the list has been converted, and it is explicitly shown as an array. It has several attributes and lots of methods. The most important attributes are probably the data type of its elements and the shape of the array." 
] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "dtype('int64')" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# The data type of stored entries\n", "my_ar.dtype" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(4,)" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# The shape of the array\n", "my_ar.shape" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "There are also lots of methods. The one I use most often is `astype()`, which converts the data type of the array." ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([1., 2., 3., 4.])" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "my_ar.astype(float)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "There are many others. For example, we can compute summary statistics about the entries in the array, very similar to what we have seen with Pandas." ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "4\n", "1\n", "10\n", "2.5\n", "1.118033988749895\n" ] } ], "source": [ "print(my_ar.max())\n", "print(my_ar.min())\n", "print(my_ar.sum())\n", "print(my_ar.mean())\n", "print(my_ar.std())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Importantly, NumPy arrays can be arguments to NumPy functions. In this case, these functions do the same operations as the methods we just looked at." 
] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "4\n", "1\n", "10\n", "2.5\n", "1.118033988749895\n" ] } ], "source": [ "print(np.max(my_ar))\n", "print(np.min(my_ar))\n", "print(np.sum(my_ar))\n", "print(np.mean(my_ar))\n", "print(np.std(my_ar))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Other ways to make NumPy arrays\n", "\n", "There are many other ways to make NumPy arrays besides just converting lists or tuples. Below are some examples." ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# How long our arrays will be\n", "n = 10\n", "\n", "# Make a NumPy array of length n filled with zeros\n", "np.zeros(n)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Make a NumPy array of length n filled with ones\n", "np.ones(n)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Make an empty NumPy array of length n without initializing entries\n", "# (it holds whatever values were already in the memory locations assigned,\n", "# so its contents are arbitrary; the ones displayed here are not guaranteed)\n", "np.empty(n)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[0, 0],\n", " [0, 0]])" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Make a NumPy array filled with zeros the same 
shape as another NumPy array\n", "my_ar = np.array([[1, 2], [3, 4]])\n", "np.zeros_like(my_ar)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Extracting NumPy arrays from Pandas data frames\n", "\n", "NumPy has a primitive function for loading in data from text files, `np.loadtxt()`, but with Pandas's `read_csv()`, there is really no reason to ever use it. Instead, we may sometimes wish to extract NumPy arrays out of Pandas data frames. This is almost always for speed reasons, which we will see when we do hacker stats. NumPy arrays are highly optimized for computing speed.\n", "\n", "As it is always more fun to work with a real biological application, we will populate our NumPy arrays with data. In their 2011 [paper in PLoS ONE](https://doi.org/10.1371/journal.pone.0025840), Harvey and Orbidans measured the cross-sectional area of *C. elegans* eggs that came from mothers who had a high concentration of food and from mothers who had a low concentration of food. I digitized the data from their plots, and they are available in the file `~/git/bootcamp/data/c_elegans_egg_xa.csv` in the bootcamp repository." ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
foodarea (sq. um)
0high1683
1high2061
2high1792
3high1852
4high2091
\n", "
" ], "text/plain": [ " food area (sq. um)\n", "0 high 1683\n", "1 high 2061\n", "2 high 1792\n", "3 high 1852\n", "4 high 2091" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.read_csv(\"data/c_elegans_egg_xa.csv\", comment='#')\n", "\n", "# Take a look\n", "df.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "It's always a good idea to do a quick look at a data set by making a plot." ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\n", "\n", "\n", "
\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/javascript": [ "(function(root) {\n", " function embed_document(root) {\n", " \n", " const docs_json = {\"359c0415-bf03-43ef-b275-ebfd406129a3\":{\"defs\":[],\"roots\":{\"references\":[{\"attributes\":{\"below\":[{\"id\":\"1012\"}],\"center\":[{\"id\":\"1015\"},{\"id\":\"1018\"}],\"frame_height\":200,\"frame_width\":375,\"left\":[{\"id\":\"1016\"}],\"renderers\":[{\"id\":\"1040\"}],\"title\":{\"id\":\"1042\"},\"toolbar\":{\"id\":\"1026\"},\"toolbar_location\":\"above\",\"x_range\":{\"id\":\"1005\"},\"x_scale\":{\"id\":\"1008\"},\"y_range\":{\"id\":\"1003\"},\"y_scale\":{\"id\":\"1010\"}},\"id\":\"1004\",\"subtype\":\"Figure\",\"type\":\"Plot\"},{\"attributes\":{},\"id\":\"1013\",\"type\":\"BasicTicker\"},{\"attributes\":{},\"id\":\"1049\",\"type\":\"AllLabels\"},{\"attributes\":{},\"id\":\"1019\",\"type\":\"PanTool\"},{\"attributes\":{},\"id\":\"1020\",\"type\":\"WheelZoomTool\"},{\"attributes\":{},\"id\":\"1017\",\"type\":\"CategoricalTicker\"},{\"attributes\":{},\"id\":\"1024\",\"type\":\"HelpTool\"},{\"attributes\":{\"axis_label\":\"area (sq. 
um)\",\"coordinates\":null,\"formatter\":{\"id\":\"1048\"},\"group\":null,\"major_label_policy\":{\"id\":\"1049\"},\"ticker\":{\"id\":\"1013\"}},\"id\":\"1012\",\"type\":\"LinearAxis\"},{\"attributes\":{\"overlay\":{\"id\":\"1025\"}},\"id\":\"1021\",\"type\":\"BoxZoomTool\"},{\"attributes\":{},\"id\":\"1022\",\"type\":\"SaveTool\"},{\"attributes\":{},\"id\":\"1023\",\"type\":\"ResetTool\"},{\"attributes\":{\"axis\":{\"id\":\"1012\"},\"coordinates\":null,\"group\":null,\"ticker\":null},\"id\":\"1015\",\"type\":\"Grid\"},{\"attributes\":{\"bottom_units\":\"screen\",\"coordinates\":null,\"fill_alpha\":0.5,\"fill_color\":\"lightgrey\",\"group\":null,\"left_units\":\"screen\",\"level\":\"overlay\",\"line_alpha\":1.0,\"line_color\":\"black\",\"line_dash\":[4,4],\"line_width\":2,\"right_units\":\"screen\",\"syncable\":false,\"top_units\":\"screen\"},\"id\":\"1025\",\"type\":\"BoxAnnotation\"},{\"attributes\":{\"factors\":[\"high\",\"low\"]},\"id\":\"1003\",\"type\":\"FactorRange\"},{\"attributes\":{},\"id\":\"1045\",\"type\":\"CategoricalTickFormatter\"},{\"attributes\":{\"distribution\":\"normal\",\"range\":{\"id\":\"1003\"},\"width\":0.1},\"id\":\"1035\",\"type\":\"Jitter\"},{\"attributes\":{\"fill_color\":{\"field\":\"cat\",\"transform\":{\"id\":\"1033\"}},\"hatch_color\":{\"field\":\"cat\",\"transform\":{\"id\":\"1033\"}},\"line_color\":{\"field\":\"cat\",\"transform\":{\"id\":\"1033\"}},\"x\":{\"field\":\"area (sq. 
um)\"},\"y\":{\"field\":\"cat\",\"transform\":{\"id\":\"1035\"}}},\"id\":\"1037\",\"type\":\"Circle\"},{\"attributes\":{},\"id\":\"1046\",\"type\":\"AllLabels\"},{\"attributes\":{},\"id\":\"1010\",\"type\":\"CategoricalScale\"},{\"attributes\":{\"factors\":[\"low\",\"high\"],\"palette\":[\"#1f77b3\",\"#ff7e0e\",\"#2ba02b\",\"#d62628\",\"#9367bc\",\"#8c564b\",\"#e277c1\",\"#7e7e7e\",\"#bcbc21\",\"#16bdcf\",\"#3a0182\",\"#004201\",\"#0fffa8\",\"#5d003f\",\"#bcbcff\",\"#d8afa1\",\"#b80080\",\"#004d52\",\"#6b6400\",\"#7c0100\",\"#6026ff\",\"#ffff9a\",\"#564964\",\"#8cb893\",\"#93fbff\",\"#018267\",\"#90ff00\",\"#8200a0\",\"#ac8944\",\"#5b3400\",\"#ffbff2\",\"#ff6e75\",\"#798cff\",\"#dd00ff\",\"#505646\",\"#004489\",\"#ffbf60\",\"#ff018c\",\"#bdc8cf\",\"#af97b5\",\"#b65600\",\"#017000\",\"#cd87ff\",\"#1cd646\",\"#bfebc3\",\"#7997b5\",\"#a56089\",\"#6e8956\",\"#bc7c75\",\"#8a2844\",\"#00acff\",\"#8ed4ff\",\"#4b6d77\",\"#00d4b1\",\"#9300f2\",\"#8a9500\",\"#5d5b9e\",\"#fddfba\",\"#00939e\",\"#ffdb00\",\"#00aa79\",\"#520067\",\"#000091\",\"#0a5d3d\",\"#a5e275\",\"#623b41\",\"#c6c689\",\"#ff9eb5\",\"#cd4f6b\",\"#ff07d6\",\"#8a3a05\",\"#7e3d70\",\"#ff4901\",\"#602ba5\",\"#1c00ff\",\"#e6dfff\",\"#aa3baf\",\"#d89c00\",\"#a3a39e\",\"#3f69ff\",\"#46490c\",\"#7b6985\",\"#6b978c\",\"#ff9a75\",\"#835bff\",\"#7c6b46\",\"#80b654\",\"#bc0049\",\"#fd93ff\",\"#5d0018\",\"#89d1d1\",\"#9c8cd3\",\"#da6d42\",\"#8a5700\",\"#3b5069\",\"#4b6b3b\",\"#edcfd8\",\"#cfedff\",\"#aa1500\",\"#dfff4f\",\"#ff2a56\",\"#d1499e\",\"#707cb8\",\"#598000\",\"#00e4fd\",\"#774b95\",\"#67d48c\",\"#3d3a72\",\"#ac413f\",\"#d6a166\",\"#c169cd\",\"#69595d\",\"#87aced\",\"#a0a569\",\"#d1aae6\",\"#870062\",\"#00fddb\",\"#672818\",\"#b342ff\",\"#0e59c4\",\"#168742\",\"#90d300\",\"#cd7900\",\"#f959ff\",\"#5b7466\",\"#8eaeb3\",\"#9c7c8c\",\"#4600c6\",\"#6b4d2d\",\"#a56d46\",\"#9e8972\",\"#a8afca\",\"#cd8ca7\",\"#00fd64\",\"#917900\",\"#ff62a1\",\"#f4ffd8\",\"#018cf0\",\"#13aca0\",\"#5b2d59\",\"#89859e\",\"#cfccba\",\"#d4af
c4\",\"#dbdd6d\",\"#cffff4\",\"#006485\",\"#006962\",\"#a84167\",\"#2d97c4\",\"#a874ff\",\"#26ba5d\",\"#57b600\",\"#caffa7\",\"#a379aa\",\"#ffbc93\",\"#89e2c1\",\"#0fc8ff\",\"#d400c4\",\"#626d89\",\"#69858e\",\"#4b4d52\",\"#aa6067\",\"#79b5d4\",\"#2b5916\",\"#9a0024\",\"#bdd1f2\",\"#896e67\",\"#69a56b\",\"#855467\",\"#aecdba\",\"#87997e\",\"#cadb00\",\"#9a0390\",\"#ebbc1a\",\"#eb9cd1\",\"#70006e\",\"#b1a131\",\"#ca6b93\",\"#4146a3\",\"#e48c89\",\"#d44400\",\"#c68aca\",\"#b69597\",\"#d41f75\",\"#724bcc\",\"#674d00\",\"#672138\",\"#38564f\",\"#6ebaaa\",\"#853a31\",\"#a5d397\",\"#b8af8e\",\"#d8e4df\",\"#aa00df\",\"#cac1db\",\"#ffdf8c\",\"#e2524d\",\"#66696e\",\"#ff001c\",\"#522d72\",\"#4d906b\",\"#a86d11\",\"#ff9e26\",\"#5ea3af\",\"#c88556\",\"#915997\",\"#a3a1ff\",\"#fdbaba\",\"#242a87\",\"#dbe6a8\",\"#97f2a7\",\"#6793d6\",\"#ba5b3f\",\"#3a5d91\",\"#364f2f\",\"#267c95\",\"#89959a\",\"#cfb356\",\"#004664\",\"#5e5d2f\",\"#8e8e41\",\"#ac3f13\",\"#69953b\",\"#a13d85\",\"#bfb6ba\",\"#acc667\",\"#6469cf\",\"#91af00\",\"#2be2da\",\"#016e36\",\"#ff7952\",\"#42807e\",\"#4fe800\",\"#995428\",\"#5d0a00\",\"#a30057\",\"#0c8700\",\"#5982a7\",\"#ffebfb\",\"#4b6901\",\"#8775d4\",\"#e6c6ff\",\"#a5ffda\",\"#d86e77\",\"#df014b\",\"#69675b\",\"#776ba1\",\"#7e8067\",\"#594685\",\"#0000ca\",\"#7c002a\",\"#97ff72\",\"#b5e2e1\",\"#db52c8\",\"#777734\",\"#57bd8e\"]},\"id\":\"1033\",\"type\":\"CategoricalColorMapper\"},{\"attributes\":{},\"id\":\"1005\",\"type\":\"DataRange1d\"},{\"attributes\":{\"tools\":[{\"id\":\"1019\"},{\"id\":\"1020\"},{\"id\":\"1021\"},{\"id\":\"1022\"},{\"id\":\"1023\"},{\"id\":\"1024\"}]},\"id\":\"1026\",\"type\":\"Toolbar\"},{\"attributes\":{\"coordinates\":null,\"group\":null},\"id\":\"1042\",\"type\":\"Title\"},{\"attributes\":{\"data\":{\"__label\":[\"high\",\"high\",\"high\",\"high\",\"high\",\"high\",\"high\",\"high\",\"high\",\"high\",\"high\",\"high\",\"high\",\"high\",\"high\",\"high\",\"high\",\"high\",\"high\",\"high\",\"high\",\"high\",\"high\",\"high\",\
"high\",\"high\",\"high\",\"high\",\"high\",\"high\",\"high\",\"high\",\"high\",\"high\",\"high\",\"high\",\"high\",\"high\",\"high\",\"high\",\"high\",\"high\",\"high\",\"high\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\"],\"area (sq. um)\":[1683,2061,1792,1852,2091,1781,1912,1802,1751,1731,1892,1951,1809,1683,1787,1840,1821,1910,1930,1800,1833,1683,1671,1680,1692,1800,1821,1882,1642,1749,1712,1661,1701,2141,1863,1752,1740,1721,1660,1930,2030,1851,2131,1828,1840,2090,2169,1988,2212,2339,1989,2144,2290,1920,2280,1809,2158,1800,2133,2060,2160,2001,2030,2088,1951,2460,2021,2010,2139,2160,2106,2171,2113,2179,1890,2179,2021,1969,2150,1900,2267,1711,1901,2114,2112,2361,2130,2061,2121,1832,2210,2130,2153,2009,2100,2252,2143,2252,2222,2121,2409],\"cat\":[\"high\",\"high\",\"high\",\"high\",\"high\",\"high\",\"high\",\"high\",\"high\",\"high\",\"high\",\"high\",\"high\",\"high\",\"high\",\"high\",\"high\",\"high\",\"high\",\"high\",\"high\",\"high\",\"high\",\"high\",\"high\",\"high\",\"high\",\"high\",\"high\",\"high\",\"high\",\"high\",\"high\",\"high\",\"high\",\"high\",\"high\",\"high\",\"high\",\"high\",\"high\",\"high\",\"high\",\"high\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"
low\",\"low\",\"low\"],\"food\":[\"high\",\"high\",\"high\",\"high\",\"high\",\"high\",\"high\",\"high\",\"high\",\"high\",\"high\",\"high\",\"high\",\"high\",\"high\",\"high\",\"high\",\"high\",\"high\",\"high\",\"high\",\"high\",\"high\",\"high\",\"high\",\"high\",\"high\",\"high\",\"high\",\"high\",\"high\",\"high\",\"high\",\"high\",\"high\",\"high\",\"high\",\"high\",\"high\",\"high\",\"high\",\"high\",\"high\",\"high\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\",\"low\"]},\"selected\":{\"id\":\"1051\"},\"selection_policy\":{\"id\":\"1050\"}},\"id\":\"1034\",\"type\":\"ColumnDataSource\"},{\"attributes\":{},\"id\":\"1008\",\"type\":\"LinearScale\"},{\"attributes\":{},\"id\":\"1050\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"axis_label\":\"amount of 
food\",\"coordinates\":null,\"formatter\":{\"id\":\"1045\"},\"group\":null,\"major_label_policy\":{\"id\":\"1046\"},\"ticker\":{\"id\":\"1017\"}},\"id\":\"1016\",\"type\":\"CategoricalAxis\"},{\"attributes\":{},\"id\":\"1051\",\"type\":\"Selection\"},{\"attributes\":{},\"id\":\"1048\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{\"coordinates\":null,\"data_source\":{\"id\":\"1034\"},\"glyph\":{\"id\":\"1037\"},\"group\":null,\"hover_glyph\":null,\"muted_glyph\":{\"id\":\"1039\"},\"nonselection_glyph\":{\"id\":\"1038\"},\"view\":{\"id\":\"1041\"}},\"id\":\"1040\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"axis\":{\"id\":\"1016\"},\"coordinates\":null,\"dimension\":1,\"grid_line_color\":null,\"group\":null,\"ticker\":null},\"id\":\"1018\",\"type\":\"Grid\"},{\"attributes\":{\"source\":{\"id\":\"1034\"}},\"id\":\"1041\",\"type\":\"CDSView\"},{\"attributes\":{\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"field\":\"cat\",\"transform\":{\"id\":\"1033\"}},\"hatch_alpha\":{\"value\":0.1},\"hatch_color\":{\"field\":\"cat\",\"transform\":{\"id\":\"1033\"}},\"line_alpha\":{\"value\":0.1},\"line_color\":{\"field\":\"cat\",\"transform\":{\"id\":\"1033\"}},\"x\":{\"field\":\"area (sq. um)\"},\"y\":{\"field\":\"cat\",\"transform\":{\"id\":\"1035\"}}},\"id\":\"1038\",\"type\":\"Circle\"},{\"attributes\":{\"fill_alpha\":{\"value\":0.2},\"fill_color\":{\"field\":\"cat\",\"transform\":{\"id\":\"1033\"}},\"hatch_alpha\":{\"value\":0.2},\"hatch_color\":{\"field\":\"cat\",\"transform\":{\"id\":\"1033\"}},\"line_alpha\":{\"value\":0.2},\"line_color\":{\"field\":\"cat\",\"transform\":{\"id\":\"1033\"}},\"x\":{\"field\":\"area (sq. 
um)\"},\"y\":{\"field\":\"cat\",\"transform\":{\"id\":\"1035\"}}},\"id\":\"1039\",\"type\":\"Circle\"}],\"root_ids\":[\"1004\"]},\"title\":\"Bokeh Application\",\"version\":\"2.4.2\"}};\n", " const render_items = [{\"docid\":\"359c0415-bf03-43ef-b275-ebfd406129a3\",\"root_ids\":[\"1004\"],\"roots\":{\"1004\":\"16576e1a-d1d8-44ab-8c77-ecbc249ec629\"}}];\n", " root.Bokeh.embed.embed_items_notebook(docs_json, render_items);\n", "\n", " }\n", " if (root.Bokeh !== undefined) {\n", " embed_document(root);\n", " } else {\n", " let attempts = 0;\n", " const timer = setInterval(function(root) {\n", " if (root.Bokeh !== undefined) {\n", " clearInterval(timer);\n", " embed_document(root);\n", " } else {\n", " attempts++;\n", " if (attempts > 100) {\n", " clearInterval(timer);\n", " console.log(\"Bokeh: ERROR: Unable to run BokehJS code because BokehJS library is missing\");\n", " }\n", " }\n", " }, 10, root)\n", " }\n", "})(window);" ], "application/vnd.bokehjs_exec.v0+json": "" }, "metadata": { "application/vnd.bokehjs_exec.v0+json": { "id": "1004" } }, "output_type": "display_data" } ], "source": [ "p = iqplot.strip(\n", " data=df,\n", " q=\"area (sq. um)\",\n", " cats=\"food\",\n", " order=[\"low\", \"high\"],\n", " jitter=True,\n", " y_axis_label=\"amount of food\",\n", " frame_height=200,\n", ")\n", "\n", "bokeh.io.show(p)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "It looks like worms that eat more food have smaller eggs.\n", "\n", "If we wanted to extract the measurements for worms with high food, we can do so using Boolean indexing in Pandas." ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "pandas.core.series.Series" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "xa_high = df.loc[df[\"food\"]==\"high\", \"area (sq. 
um)\"]\n", "\n", "# Take a look at the data type\n", "type(xa_high)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The result is a Pandas `Series`, which is kind of like a single-column `DataFrame`. If we want to convert this to a Numpy array, we use the `.values` attribute." ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "numpy.ndarray" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "xa_high = df.loc[df[\"food\"]==\"high\", \"area (sq. um)\"].values\n", "\n", "type(xa_high)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now we have a Numpy array! Let's pull out the low food cross sectional areas as well." ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "xa_low = df.loc[df['food']=='low', 'area (sq. um)'].values" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "And now let's take a look at these arrays." 
] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([1683, 2061, 1792, 1852, 2091, 1781, 1912, 1802, 1751, 1731, 1892,\n", " 1951, 1809, 1683, 1787, 1840, 1821, 1910, 1930, 1800, 1833, 1683,\n", " 1671, 1680, 1692, 1800, 1821, 1882, 1642, 1749, 1712, 1661, 1701,\n", " 2141, 1863, 1752, 1740, 1721, 1660, 1930, 2030, 1851, 2131, 1828])" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "xa_high" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([1840, 2090, 2169, 1988, 2212, 2339, 1989, 2144, 2290, 1920, 2280,\n", " 1809, 2158, 1800, 2133, 2060, 2160, 2001, 2030, 2088, 1951, 2460,\n", " 2021, 2010, 2139, 2160, 2106, 2171, 2113, 2179, 1890, 2179, 2021,\n", " 1969, 2150, 1900, 2267, 1711, 1901, 2114, 2112, 2361, 2130, 2061,\n", " 2121, 1832, 2210, 2130, 2153, 2009, 2100, 2252, 2143, 2252, 2222,\n", " 2121, 2409])" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "xa_low" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We will use these arrays as examples to learn about NumPy arrays." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Slicing NumPy arrays\n", "\n", "We can slice NumPy arrays like lists and tuples. Here are a few examples." 
] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([1828, 2131, 1851, 2030, 1930, 1660, 1721, 1740, 1752, 1863, 2141,\n", " 1701, 1661, 1712, 1749, 1642, 1882, 1821, 1800, 1692, 1680, 1671,\n", " 1683, 1833, 1800, 1930, 1910, 1821, 1840, 1787, 1683, 1809, 1951,\n", " 1892, 1731, 1751, 1802, 1912, 1781, 2091, 1852, 1792, 2061, 1683])" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Reversed array\n", "xa_high[::-1]" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([1852, 1751, 1683, 1930, 1680, 1642, 2141, 1660, 1828])" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Every 5th element, starting at index 3\n", "xa_high[3::5]" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([1892, 1951, 1809, 1683, 1787, 1840, 1821, 1910, 1930, 1800, 1833])" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Entries 10 to 20\n", "xa_high[10:21]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Fancy indexing\n", "\n", "NumPy arrays also allow **fancy indexing**, where we can slice out specific values. For example, say we wanted indices 1, 19, and 6 (in that order) from `xa_high`. We just index with a list of the indices we want." ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([2061, 1800, 1912])" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "xa_high[[1, 19, 6]]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Instead of a list, we could also use a NumPy array." 
] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([2061, 1800, 1912])" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "xa_high[np.array([1, 19, 6])]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "As a very nice feature, we can use Boolean indexing with Numpy arrays, just like with Pandas using `.loc`. Say we only want the egg cross sectional areas that are greater than 2000 µm²." ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([2061, 2091, 2141, 2030, 2131])" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Just slice out the big ones\n", "xa_high[xa_high > 2000]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "If we want to know the indices where the values are high, we can use the `np.where()` function." ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(array([ 1, 4, 33, 40, 42]),)" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "np.where(xa_high > 2000)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## NumPy arrays are mutable\n", "\n", "Yes, NumPy arrays are mutable. Let's look at some consequences." ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([1, 2, 6, 4])" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Make an array\n", "my_ar = np.array([1, 2, 3, 4])\n", "\n", "# Change an element\n", "my_ar[2] = 6\n", "\n", "# See the result\n", "my_ar" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now, let's try attaching another variable to the NumPy array." 
] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([1, 2, 6, 9])" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Attach a new variable\n", "my_ar2 = my_ar\n", "\n", "# Set an entry using the new variable\n", "my_ar2[3] = 9\n", "\n", "# Does the original change? (yes.)\n", "my_ar" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let's see how messing with NumPy in functions affects things." ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([0.1, 0.2, 0.3, 0.4])" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Re-instantiate my_ar\n", "my_ar = np.array([1, 2, 3, 4]).astype(float)\n", "\n", "# Function to normalize x (note that /= works with mutable objects)\n", "def normalize(x):\n", " x /= np.sum(x)\n", "\n", "# Pass it through a function\n", "normalize(my_ar)\n", "\n", "# Is it normalized even though we didn't return anything? (Yes.)\n", "my_ar" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "So, be careful when writing functions. What you do to your NumPy array inside the function will happen outside of the function as well. Always remember that:\n", "\n", "
\n", " \n", "NumPy arrays are mutable.\n", " \n", "
\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Slices of NumPy arrays are **views**, not copies\n", "\n", "A very important distinction between NumPy arrays and lists is that slices of NumPy arrays are **views** into the original NumPy array, NOT copies." ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[1, 2, 3, 4]\n", "[1 9 3 4]\n" ] } ], "source": [ "# Make list and array\n", "my_list = [1, 2, 3, 4]\n", "my_ar = np.array(my_list)\n", "\n", "# Slice out of each\n", "my_list_slice = my_list[1:-1]\n", "my_ar_slice = my_ar[1:-1]\n", "\n", "# Mess with the slices\n", "my_list_slice[0] = 9\n", "my_ar_slice[0] = 9\n", "\n", "# Look at originals\n", "print(my_list)\n", "print(my_ar)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Messing with an element of a slice of a NumPy array messes with that element in the original! This is not the case with lists. Let's issue a warning.\n", "\n", "
\n", " \n", "Slices of NumPy arrays are views, not copies.\n", " \n", "
\n", "\n", "Fortunately, you can make a copy of an array using the `np.copy()` function." ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "False" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Make a copy\n", "xa_high_copy = np.copy(xa_high)\n", "\n", "# Mess with an entry\n", "xa_high_copy[10] = 2000\n", "\n", "# Check equality\n", "np.allclose(xa_high, xa_high_copy)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "So, messing with an entry in the copy did not affect the original." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Mathematical operations with arrays\n", "\n", "Mathematical operations on arrays are done elementwise to all elements, as we saw with Pandas data frames." ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([5. , 3. , 2.33333333, 2. ])" ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Divide one array by another\n", "np.array([5, 6, 7, 8]) / np.array([1, 2, 3, 4])" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([-6732, -8244, -7168, -7408, -8364, -7124, -7648, -7208, -7004,\n", " -6924, -7568, -7804, -7236, -6732, -7148, -7360, -7284, -7640,\n", " -7720, -7200, -7332, -6732, -6684, -6720, -6768, -7200, -7284,\n", " -7528, -6568, -6996, -6848, -6644, -6804, -8564, -7452, -7008,\n", " -6960, -6884, -6640, -7720, -8120, -7404, -8524, -7312])" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Multiply by scalar\n", "-4 * xa_high" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([2832489, 4247721, 3211264, 3429904, 4372281, 3171961, 3655744,\n", " 3247204, 3066001, 2996361, 3579664, 3806401, 3272481, 2832489,\n", " 3193369, 
3385600, 3316041, 3648100, 3724900, 3240000, 3359889,\n", " 2832489, 2792241, 2822400, 2862864, 3240000, 3316041, 3541924,\n", " 2696164, 3059001, 2930944, 2758921, 2893401, 4583881, 3470769,\n", " 3069504, 3027600, 2961841, 2755600, 3724900, 4120900, 3426201,\n", " 4541161, 3341584])" ] }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Raise to power\n", "xa_high**2" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Indexing 2D NumPy arrays\n", "\n", "NumPy arrays need not be one-dimensional. We'll create a two-dimensional NumPy array by reshaping our `xa_high` array from having shape `(44,)` to having shape `(11, 4)`. That is, it will become an array with 11 rows and 4 columns. (The 2D nature of this array has no meaning in this case; it's just meant for demonstration.)" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[1683, 2061, 1792, 1852],\n", " [2091, 1781, 1912, 1802],\n", " [1751, 1731, 1892, 1951],\n", " [1809, 1683, 1787, 1840],\n", " [1821, 1910, 1930, 1800],\n", " [1833, 1683, 1671, 1680],\n", " [1692, 1800, 1821, 1882],\n", " [1642, 1749, 1712, 1661],\n", " [1701, 2141, 1863, 1752],\n", " [1740, 1721, 1660, 1930],\n", " [2030, 1851, 2131, 1828]])" ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# New 2D array using the reshape() method\n", "my_ar = xa_high.reshape((11, 4))\n", "\n", "# Look at it\n", "my_ar" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Notice that it is represented as an array made out of a list of lists. 
If we had a list of lists, we would index it like this:\n", "\n", " list_of_lists[i][j]" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "2" ] }, "execution_count": 35, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Make list of lists\n", "list_of_lists = [[1, 2], [3, 4]]\n", "\n", "# Pull out value in first row, second column\n", "list_of_lists[0][1]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Though this will work with NumPy arrays, this is *not* how NumPy arrays are indexed. They are indexed much more conveniently." ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "2061" ] }, "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [ "my_ar[0, 1]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We essentially have a tuple in the indexing brackets. Now, say we wanted the row with index 2 (the third row, since indexing starts at 0)." ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([1751, 1731, 1892, 1951])" ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "my_ar[2, :]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We can use Boolean indexing as before." ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([2061, 2091, 2141, 2030, 2131])" ] }, "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ "my_ar[my_ar > 2000]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Note that this gives a one-dimensional array of the entries greater than 2000. If we wanted indices where this is the case, we can again use `np.where()`." 
] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(array([ 0, 1, 8, 10, 10]), array([1, 0, 1, 0, 2]))" ] }, "execution_count": 39, "metadata": {}, "output_type": "execute_result" } ], "source": [ "np.where(my_ar > 2000)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "This tuple of NumPy arrays is exactly what we would use with fancy indexing to pull those values out." ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([2061, 2091, 2141, 2030, 2131])" ] }, "execution_count": 40, "metadata": {}, "output_type": "execute_result" } ], "source": [ "my_ar[(np.array([ 0, 1, 8, 10, 10]), np.array([1, 0, 1, 0, 2]))]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "NumPy arrays can be of arbitrary integer dimension, and these principles extrapolate to 3D, 4D, etc., arrays." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Concatenating arrays\n", "\n", "Let's say we want to study all cross sectional areas and don't care if the mother was well-fed or not. We would want to concatenate our arrays. The `np.concatenate()` function accomplishes this. We simply have to pass it a tuple containing the NumPy arrays we want to concatenate." 
] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([1683, 2061, 1792, 1852, 2091, 1781, 1912, 1802, 1751, 1731, 1892,\n", " 1951, 1809, 1683, 1787, 1840, 1821, 1910, 1930, 1800, 1833, 1683,\n", " 1671, 1680, 1692, 1800, 1821, 1882, 1642, 1749, 1712, 1661, 1701,\n", " 2141, 1863, 1752, 1740, 1721, 1660, 1930, 2030, 1851, 2131, 1828,\n", " 1840, 2090, 2169, 1988, 2212, 2339, 1989, 2144, 2290, 1920, 2280,\n", " 1809, 2158, 1800, 2133, 2060, 2160, 2001, 2030, 2088, 1951, 2460,\n", " 2021, 2010, 2139, 2160, 2106, 2171, 2113, 2179, 1890, 2179, 2021,\n", " 1969, 2150, 1900, 2267, 1711, 1901, 2114, 2112, 2361, 2130, 2061,\n", " 2121, 1832, 2210, 2130, 2153, 2009, 2100, 2252, 2143, 2252, 2222,\n", " 2121, 2409])" ] }, "execution_count": 41, "metadata": {}, "output_type": "execute_result" } ], "source": [ "combined = np.concatenate((xa_high, xa_low))\n", "\n", "# Look at it\n", "combined" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## NumPy has useful mathematical functions\n", "\n", "So far, we have not done much mathematics with Python. We have done some adding and division, but nothing like computing a logarithm or cosine. The NumPy functions also work elementwise on the arrays when it is intuitive to do so (just like we have seen in Pandas `DataFrame`s). That is, they apply the function to each entry in the array. Check it out (even though exponentiating cross sectional areas is nonsensical)." 
] }, { "cell_type": "code", "execution_count": 42, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([5.38167681, 7.8538197 , 6.00144336, 6.37255189, 8.09300412,\n", " 5.93578924, 6.76660849, 6.06175887, 5.76036016, 5.64629738,\n", " 6.63262067, 7.03571978, 6.10434004, 5.38167681, 5.97151103,\n", " 6.29653826, 6.1780334 , 6.7530888 , 6.88951024, 6.04964746,\n", " 6.2526164 , 5.38167681, 5.31748262, 5.36555597, 5.43033051,\n", " 6.04964746, 6.1780334 , 6.56662499, 5.16549017, 5.74885095,\n", " 5.54003047, 5.26457279, 5.47942408, 8.50794132, 6.44303692,\n", " 5.7661234 , 5.69734342, 5.59011579, 5.25931084, 6.88951024,\n", " 7.61408636, 6.36618252, 8.42328589, 6.22143134])" ] }, "execution_count": 42, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Exponential\n", "np.exp(xa_high / 1000)" ] }, { "cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([ 0.62656192, 0.9933696 , 0.27501843, 0.03112568, 0.26681725,\n", " -0.96021239, -0.33430744, 0.29228295, -0.42404251, -0.99984597,\n", " 0.72399324, -0.99748325, 0.84865001, 0.62656192, -0.84393482,\n", " 0.56257847, 0.43231386, 0.99610114, 0.48702972, -0.99122275,\n", " -0.11903049, 0.62656192, 0.94691648, -0.73027654, -0.24968607,\n", " -0.99122275, 0.43231386, -0.98275172, -0.49500319, -0.64703425,\n", " -0.98592179, -0.61963892, -0.17156886, 0.00460656, -0.99936794,\n", " 0.53296056, 0.90375673, 0.82939405, 0.3256673 , 0.48702972,\n", " 0.86222727, -0.824246 , 0.5401501 , 0.91834245])" ] }, "execution_count": 43, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Cosine\n", "np.cos(xa_high)" ] }, { "cell_type": "code", "execution_count": 44, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([41.02438299, 45.39823785, 42.33202098, 43.03486958, 45.72745346,\n", " 42.20189569, 43.72642222, 42.44997055, 41.84495191, 41.60528813,\n", " 43.49712634, 44.17012565, 42.53234064, 41.02438299, 42.27292278,\n", " 
42.89522118, 42.67317659, 43.70354677, 43.93176527, 42.42640687,\n", " 42.81354926, 41.02438299, 40.87786687, 40.98780306, 41.1339276 ,\n", " 42.42640687, 42.67317659, 43.38202393, 40.52159918, 41.82104733,\n", " 41.37632173, 40.75536774, 41.24318125, 46.27094121, 43.16248371,\n", " 41.85689907, 41.71330723, 41.48493703, 40.74309757, 43.93176527,\n", " 45.0555213 , 43.02324953, 46.16275555, 42.75511665])" ] }, "execution_count": 44, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Square root\n", "np.sqrt(xa_high)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We can even do some matrix operations (which are obviously not done elementwise), like dot products." ] }, { "cell_type": "code", "execution_count": 45, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "146.360195" ] }, "execution_count": 45, "metadata": {}, "output_type": "execute_result" } ], "source": [ "np.dot(xa_high/1000, xa_high/1000)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "NumPy also has useful attributes, like `np.pi`." ] }, { "cell_type": "code", "execution_count": 46, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "3.141592653589793" ] }, "execution_count": 46, "metadata": {}, "output_type": "execute_result" } ], "source": [ "np.pi" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## SciPy has even more useful functions (in modules)\n", "\n", "SciPy actually began life as a library of special functions that operate on NumPy arrays. For example, we can compute an error function using the `scipy.special` module, which contains lots of special functions. 
Note that you often have to individually import the SciPy module you want to use, for example with\n", " \n", "```python\n", "import scipy.special\n", "```" ] }, { "cell_type": "code", "execution_count": 47, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([0.76597747, 0.8549794 , 0.7948931 , 0.80965587, 0.86074212,\n", " 0.79209865, 0.8236209 , 0.79740973, 0.78433732, 0.77904847,\n", " 0.81905337, 0.83227948, 0.79915793, 0.76597747, 0.7936263 ,\n", " 0.80676772, 0.8021292 , 0.82316805, 0.8276577 , 0.79690821,\n", " 0.80506817, 0.76597747, 0.76262579, 0.76514271, 0.76846912,\n", " 0.79690821, 0.8021292 , 0.81673693, 0.7543863 , 0.78381257,\n", " 0.77393853, 0.75980693, 0.77094188, 0.86995276, 0.81227529,\n", " 0.78459935, 0.78143985, 0.77636944, 0.75952376, 0.8276577 ,\n", " 0.84883448, 0.80941641, 0.86814949, 0.80384751])" ] }, "execution_count": 47, "metadata": {}, "output_type": "execute_result" } ], "source": [ "scipy.special.erf(xa_high / 2000)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "There are many SciPy submodules which give plenty of rich functionality for scientific computing. You can check out the [SciPy docs](https://docs.scipy.org/doc/scipy/reference/) to learn about all of the functionality. In my own work, I use the following extensively.\n", "\n", "- `scipy.special`: Special functions.\n", "- `scipy.stats`: Functions for statistical analysis.\n", "- `scipy.optimize`: Numerical optimization.\n", "- `scipy.integrate`: Numerical solutions to differential equations.\n", "- `scipy.interpolate`: Smooth interpolation of functions." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Vectorization and Numpy arrays\n", "\n", "Like operations on Pandas `Series` that we have already seen, operations on Numpy arrays are also vectorized and binary operations typically work elementwise." 
] }, { "cell_type": "code", "execution_count": 48, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([3366, 4122, 3584, 3704, 4182, 3562, 3824, 3604, 3502, 3462, 3784,\n", " 3902, 3618, 3366, 3574, 3680, 3642, 3820, 3860, 3600, 3666, 3366,\n", " 3342, 3360, 3384, 3600, 3642, 3764, 3284, 3498, 3424, 3322, 3402,\n", " 4282, 3726, 3504, 3480, 3442, 3320, 3860, 4060, 3702, 4262, 3656])" ] }, "execution_count": 48, "metadata": {}, "output_type": "execute_result" } ], "source": [ "xa_high + xa_high" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Slicing Numpy arrays\n", "\n", "Numpy arrays are ordered and strictly indexed by integers. They are therefore sliced just like lists, tuples, and strings as we have seen before. Conveniently, we can also use Boolean indexing on Numpy arrays, just like with Pandas." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## NumPy and SciPy are highly optimized\n", "\n", "Importantly, NumPy and SciPy routines are often *fast*. To understand why, we need to think a bit about how your computer actually runs code you write." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Interpreted and compiled languages\n", "\n", "We have touched on the fact that Python is an **interpreted language**. This means that the Python interpreter reads through your code, line by line, translates the commands into instructions that your computer's processor can execute, and then these are executed. It also does [**garbage collection**](https://en.wikipedia.org/wiki/Garbage_collection_(computer_science)), which manages memory usage in your programs for you. In an interpreted language, code is often much easier to write, and development time is much shorter. It is often easier to debug. By contrast, with **compiled languages** (the dominant ones being Fortran, C, and C++), your entire source code is translated into machine code before you ever run it. 
When you execute your program, it is already in machine code. As a result, compiled code is often much faster than interpreted code. The speed difference depends largely on the task at hand, but there is often over a 100-fold difference.\n", "\n", "First, we'll demonstrate the difference between compiled and interpreted languages by looking at a function to sum the elements of an array. Note that Python is [dynamically typed](http://stackoverflow.com/a/34004445/2320823), so the function below works for multiple data types, but the C function works only for [double precision floating point](https://en.wikipedia.org/wiki/Double-precision_floating-point_format) numbers." ] }, { "cell_type": "code", "execution_count": 49, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[19793 20111 20171 19978]\n" ] } ], "source": [ "# Python code to sum an array and print the result to the screen\n", "print(sum(my_ar))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "```C\n", "/* C code to sum an array and print the result to the screen */\n", "\n", "#include <stdio.h>\n", "\n", "void sum_array(double a[], int n);\n", "\n", "void sum_array(double a[], int n) {\n", " int i; \n", " double sum=0;\n", " for (i = 0; i < n; i++){\n", " sum += a[i];\n", " }\n", " printf(\"%g\\n\", sum);\n", "}\n", "```" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The C code won't even execute without another function called `main` to call it. You should notice the difference in complexity of the code. Interpreted code is very often much easier to write!" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### NumPy and SciPy use compiled code!\n", "\n", "Under the hood, when you call a NumPy or SciPy function, or use one of the methods, the Python interpreter passes the arrays into pre-compiled functions. (They are usually C or Fortran functions.) That means that you get to use an interpreted language with near-compiled speed! 
We can demonstrate the speed by comparing an explicit sum of elements of an array using a Python `for` loop versus NumPy. We will use the `np.random` module to generate a large array of random numbers (we will visit random number generation in a coming lesson). We then use the `%timeit` magic function of IPython to time the execution of the sum of the elements of the array." ] }, { "cell_type": "code", "execution_count": 50, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "909 µs ± 2.6 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)\n" ] } ], "source": [ "# Make array of 10,000 random numbers\n", "x = np.random.random(10000)\n", "\n", "# Sum with Python for loop\n", "def python_sum(x):\n", " x_sum = 0.0\n", " for y in x:\n", " x_sum += y\n", " return x_sum\n", "\n", "# Test speed\n", "%timeit python_sum(x)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now we'll do the same test with the NumPy implementation." ] }, { "cell_type": "code", "execution_count": 51, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "8.05 µs ± 75.6 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)\n" ] } ], "source": [ "%timeit np.sum(x)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Wow! We went from a millisecond to *micro*seconds!" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Word of advice: use NumPy and SciPy\n", "\n", "If you are writing code and you think to yourself, \"This seems like a pretty common thing to do,\" there is a good chance that someone really smart has written code to do it. If it's something numerical, there is a good chance it is in NumPy or SciPy. **Use these packages.** Do not reinvent the wheel. It is very rare you can beat them for performance, error checking, etc.\n", "\n", "Furthermore, NumPy and SciPy are very well tested. In general, you do not need to write unit tests for well-established packages. 
Obviously, if you use NumPy or SciPy within your own functions, you still need to test what you wrote." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Computing environment" ] }, { "cell_type": "code", "execution_count": 52, "metadata": { "tags": [ "hide-input" ] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Python implementation: CPython\n", "Python version : 3.9.12\n", "IPython version : 8.3.0\n", "\n", "numpy : 1.21.5\n", "scipy : 1.7.3\n", "pandas : 1.4.2\n", "bokeh : 2.4.2\n", "iqplot : 0.2.4\n", "jupyterlab: 3.3.2\n", "\n" ] } ], "source": [ "%load_ext watermark\n", "%watermark -v -p numpy,scipy,pandas,bokeh,iqplot,jupyterlab" ] } ], "metadata": { "anaconda-cloud": {}, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.12" } }, "nbformat": 4, "nbformat_minor": 4 }