{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from IPython.core.interactiveshell import InteractiveShell\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt # for plotting \n",
    "import seaborn as sns # for plotting\n",
    "from sklearn import datasets\n",
    "from sklearn import preprocessing\n",
    "from sklearn import linear_model\n",
    "from sklearn import model_selection as ms\n",
    "from sklearn.svm import SVC\n",
    "from sklearn.metrics import accuracy_score\n",
    "import timeit\n",
    "import scipy.sparse as sparse"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# set up some notebook display defaults\n",
    "InteractiveShell.ast_node_interactivity = \"all\"\n",
    "%matplotlib inline\n",
    "plt.style.use('default')\n",
    "sns.set()\n",
    "pd.options.display.float_format = '{:,.2f}'.format"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "csv_path = './data/mnist_digits/mnist_784.csv'\n",
    "df = pd.read_csv(csv_path)\n",
    "data = df.values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(data.shape)\n",
    "print(data[:4,780:])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# now to make training and test sets\n",
    "# as I don't plan to run model, will make life easy, just take the first 60,000 rows as the training set\n",
    "X_trn = data[:60000, :-1]\n",
    "y_trn = data[:60000, -1]\n",
    "print(X_trn.shape, y_trn.shape)\n",
    "X_tst = data[60000: , :-1]\n",
    "y_tst = data[60000:, -1]\n",
    "print(X_tst.shape, y_tst.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# probability function\n",
    "def compute_probabilities(X, theta, temp_parameter):\n",
    "  \"\"\"\n",
    "  Computes, for each datapoint X[i], the probability that X[i] is labeled as j\n",
    "  for j = 0, 1, ..., k-1\n",
    "\n",
    "  Args:\n",
    "      X - (n, d) NumPy array (n datapoints each with d features)\n",
    "      theta - (k, d) NumPy array, where row j represents the parameters of our model for label j\n",
    "      temp_parameter - the temperature parameter of softmax function (scalar)\n",
    "  Returns:\n",
    "      H - (k, n) NumPy array, where each entry H[j][i] is the probability that X[i] is labeled as j\n",
    "  \"\"\"\n",
    "  n_lbls = theta.shape[0]\n",
    "  n_d_pts = X.shape[0]\n",
    "  probs = np.zeros(n_lbls)\n",
    "  bases = np.array([np.e for _ in range(n_lbls)])\n",
    "\n",
    "  r = 0\n",
    "  for row in X:\n",
    "    tmp_p = np.zeros(n_lbls)\n",
    "    poss_c = [theta[i].dot(row) / temp_parameter for i in range(n_lbls)]\n",
    "    c_use = max(poss_c)\n",
    "    \n",
    "    for j in range(n_lbls):\n",
    "      tmp_p[j] = (theta[j].dot(row) / temp_parameter) - c_use\n",
    "    tmp_p = np.power(bases, tmp_p)\n",
    "    avg_div = tmp_p.sum()\n",
    "    tmp_p = tmp_p / avg_div\n",
    "\n",
    "    probs = np.column_stack((probs, tmp_p))      \n",
    "    r += 1\n",
    "  return probs[:,1:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# let's make ourselves a test theta array and a couple of result arrays\n",
    "theta = np.zeros((10, 784))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%timeit -n 1 -r 1\n",
    "# let's time it for one iteration\n",
    "if True:\n",
    "  p_60 = compute_probabilities(X_trn, theta, temp_parameter=1)\n",
    "# result: 1min 42s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "That's a fair bit of time. I think I will do my timing using the test dataset."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%timeit\n",
    "p_10 = compute_probabilities(X_tst, theta, temp_parameter=1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Using the iPython magic command `%%timeit` didn't allow me to access the value `p_60` or `p_10`. So, did some digging and found the following approach. You will see why I want those values."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "result_1 = []\n",
    "t_stmt = f'result_1.append(compute_probabilities(X_trn, theta, temp_parameter=1))'\n",
    "print(timeit.timeit(stmt=t_stmt, setup='from __main__ import result_1, compute_probabilities, X_trn, theta', number=1))\n",
    "p_60 = result_1[0] \n",
    "print(p_60)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "result_2 = []\n",
    "t_stmt = f'result_2.append(compute_probabilities(X_tst, theta, temp_parameter=1))'\n",
    "print(timeit.timeit(stmt=t_stmt, setup='from __main__ import result_2, compute_probabilities, X_tst, theta', number=5))\n",
    "p_10 = result_2[0] \n",
    "print(p_10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# a different approach to timing the execution speed\n",
    "result_3 = []\n",
    "t_stmt = f'result_3.append(compute_probabilities(X_tst, theta, temp_parameter=1))'\n",
    "print(timeit.repeat(stmt=t_stmt, setup='from __main__ import result_3, compute_probabilities, X_tst, theta', repeat=7, number=1))\n",
    "p_10_2 = result_3[0]\n",
    "print(p_10 == p_10_2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# modified so as to not call probability function, added new parameter 'probs'\n",
    "def compute_cost_function(X, Y, theta, probs, lambda_factor, temp_parameter):\n",
    "    \"\"\"\n",
    "    Computes the total cost over every datapoint.\n",
    "\n",
    "    Args:\n",
    "        X - (n, d) NumPy array (n datapoints each with d features)\n",
    "        Y - (n, ) NumPy array containing the labels (a number from 0-9) for each\n",
    "            data point\n",
    "        theta - (k, d) NumPy array, where row j represents the parameters of our\n",
    "                model for label j\n",
    "        lambda_factor - the regularization constant (scalar)\n",
    "        temp_parameter - the temperature parameter of softmax function (scalar)\n",
    "\n",
    "    Returns\n",
    "        c - the cost value (scalar)\n",
    "    \"\"\"\n",
    "    #YOUR CODE HERE\n",
    "    #probs = compute_probabilities(X, theta, temp_parameter)\n",
    "    l_sum = 0\n",
    "    r_sum = (lambda_factor / 2) * np.sum(theta*theta)\n",
    "\n",
    "    for i in range(X.shape[0]):\n",
    "      l_sum += np.log(probs[Y[i], i])\n",
    "    l_sum = -(l_sum / X.shape[0])\n",
    "\n",
    "    return l_sum + r_sum\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "cost_1 = []\n",
    "t_stmt = f'cost_1.append(compute_cost_function(X_trn, y_trn, theta, p_60, lambda_factor=1.0e-4, temp_parameter=1))'\n",
    "print(timeit.timeit(stmt=t_stmt, setup='from __main__ import cost_1, compute_cost_function, X_trn, y_trn, theta, p_60', number=1))\n",
    "c_60 = cost_1[0] \n",
    "print(c_60)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "cost_2 = []\n",
    "t_stmt = f'cost_2.append(compute_cost_function(X_trn, y_trn, theta, p_60, lambda_factor=1.0e-4, temp_parameter=1))'\n",
    "print(timeit.repeat(stmt=t_stmt, setup='from __main__ import cost_2, compute_cost_function, X_trn, y_trn, theta, p_60', repeat=7, number=3))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def make_onehot(v_col, n_smp, n_cat):\n",
    "  return sparse.coo_matrix((np.ones(n_smp), (v_col, range(n_smp))), shape=(n_cat,n_smp)).toarray()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# modified so as to not call probability function, added new parameter 'probs'\n",
    "# also in course code, didn't have function for creating onehot encoding of target data\n",
    "def run_gradient_descent_iteration(X, Y, theta, probs, alpha, lambda_factor, temp_parameter):\n",
    "    \"\"\"\n",
    "    Runs one step of batch gradient descent\n",
    "\n",
    "    Args:\n",
    "        X - (n, d) NumPy array (n datapoints each with d features)\n",
    "        Y - (n, ) NumPy array containing the labels (a number from 0-9) for each\n",
    "            data point\n",
    "        theta - (k, d) NumPy array, where row j represents the parameters of our\n",
    "                model for label j\n",
    "        alpha - the learning rate (scalar)\n",
    "        lambda_factor - the regularization constant (scalar)\n",
    "        temp_parameter - the temperature parameter of softmax function (scalar)\n",
    "\n",
    "    Returns:\n",
    "        theta - (k, d) NumPy array that is the final value of parameters theta\n",
    "    \"\"\"\n",
    "    #YOUR CODE HERE\n",
    "    n_smp = X.shape[0]\n",
    "    n_cat = theta.shape[0]\n",
    "\n",
    "    # y_ohe = sparse.coo_matrix((np.ones(n_smp), (Y, range(n_smp))), shape=(n_cat,n_smp)).toarray()\n",
    "    y_ohe = make_onehot(Y, n_smp, n_cat)\n",
    "\n",
    "    # probs = compute_probabilities(X, theta, temp_parameter)\n",
    "\n",
    "    tmp_dot = np.dot((y_ohe - probs),X)\n",
    "\n",
    "    grad = (-1 / (n_smp * temp_parameter)) * tmp_dot + lambda_factor*theta\n",
    "    theta = theta - (alpha * grad)\n",
    "\n",
    "    return theta"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "gdi_1 = []\n",
    "t_stmt = f'gdi_1.append(run_gradient_descent_iteration(X_trn, y_trn, theta, p_60, alpha=0.3, lambda_factor=1.0e-4, temp_parameter=1))'\n",
    "t_out = timeit.repeat(stmt=t_stmt, setup='from __main__ import gdi_1, run_gradient_descent_iteration, X_trn, y_trn, theta, p_60', repeat=7, number=3)\n",
    "print([tm / 3 for tm in t_out])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# eliminate loops as much as possible\n",
    "def compute_probabilities_2(X, theta, temp_parameter):\n",
    "    \"\"\"\n",
    "    Computes, for each datapoint X[i], the probability that X[i] is labeled as j\n",
    "    for j = 0, 1, ..., k-1\n",
    "\n",
    "    Args:\n",
    "        X - (n, d) NumPy array (n datapoints each with d features)\n",
    "        theta - (k, d) NumPy array, where row j represents the parameters of our model for label j\n",
    "        temp_parameter - the temperature parameter of softmax function (scalar)\n",
    "    Returns:\n",
    "        H - (k, n) NumPy array, where each entry H[j][i] is the probability that X[i] is labeled as j\n",
    "    \"\"\"\n",
    "    bases = np.full((theta.shape[0], X.shape[0]), fill_value=np.e)\n",
    "    base_dot = np.matmul(theta, X.T) / temp_parameter\n",
    "    base_dot -= np.amax(base_dot, axis=0)\n",
    "    probs = np.power(bases, base_dot)\n",
    "    div_v = np.sum(np.power(bases, base_dot), axis=0)\n",
    "    probs /= div_v\n",
    "    return probs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "prob_rf_1 = []\n",
    "t_stmt = f'prob_rf_1.append(compute_probabilities_2(X_trn, theta, temp_parameter=1))'\n",
    "t_out_prf_1 = timeit.repeat(stmt=t_stmt, setup='from __main__ import prob_rf_1, compute_probabilities_2, X_trn, theta', repeat=7, number=3)\n",
    "p_60_rf = prob_rf_1[0]\n",
    "print([tm / 3 for tm in t_out_prf_1])\n",
    "print(p_60_rf)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# once again, modify to accept probabilities, to limit timing to this functions actual code\n",
    "# with the exception of the call to make_onehot()\n",
    "def compute_cost_function_2(X, Y, theta, probs, lambda_factor, temp_parameter):\n",
    "    \"\"\"\n",
    "    Computes the total cost over every datapoint.\n",
    "\n",
    "    Args:\n",
    "        X - (n, d) NumPy array (n datapoints each with d features)\n",
    "        Y - (n, ) NumPy array containing the labels (a number from 0-9) for each\n",
    "            data point\n",
    "        theta - (k, d) NumPy array, where row j represents the parameters of our\n",
    "                model for label j\n",
    "        lambda_factor - the regularization constant (scalar)\n",
    "        temp_parameter - the temperature parameter of softmax function (scalar)\n",
    "\n",
    "    Returns\n",
    "        c - the cost value (scalar)\n",
    "    \"\"\"\n",
    "    n_smp = X.shape[0]\n",
    "    n_cat = theta.shape[0]\n",
    "    probs = compute_probabilities(X, theta, temp_parameter)\n",
    "    l_sum = 0\n",
    "    r_sum = (lambda_factor / 2) * np.sum(theta*theta)\n",
    "    # y_ohe = sparse.coo_matrix((np.ones(n_smp), (Y, range(n_smp))), shape=(n_cat,n_smp)).toarray()\n",
    "    y_ohe = make_onehot(Y, n_smp, n_cat)\n",
    "    valid_probs = np.choose(Y, np.log(probs[np.arange(theta.shape[0])], y_ohe))\n",
    "    l_sum = - np.sum(valid_probs) / X.shape[0]\n",
    "    return l_sum + r_sum"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "prob_cf_1 = []\n",
    "t_stmt = f'prob_cf_1.append(compute_cost_function_2(X_trn, y_trn, theta, p_60, lambda_factor=1.0e-4, temp_parameter=1))'\n",
    "t_out_crf_1 = timeit.repeat(stmt=t_stmt, setup='from __main__ import prob_cf_1, compute_cost_function_2, X_trn, y_trn, theta, p_60', repeat=7, number=3)\n",
    "c_60_rf = prob_cf_1[0]\n",
    "print([tm / 3 for tm in t_out_crf_1])\n",
    "print(c_60_rf)"
   ]
  }
 ],
 "metadata": {
  "interpreter": {
   "hash": "a27d3f2bf68df5402465348834a2195030d3fc5bfc8e594e2a17c8c7e2447c85"
  },
  "kernelspec": {
   "display_name": "Python 3.9.2 64-bit ('ds-3.9': conda)",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.2"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}