{
 "metadata": {
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.2"
  },
  "orig_nbformat": 2,
  "kernelspec": {
   "name": "python392jvsc74a57bd0a27d3f2bf68df5402465348834a2195030d3fc5bfc8e594e2a17c8c7e2447c85",
   "display_name": "Python 3.9.2 64-bit ('ds-3.9': conda)"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2,
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "from pathlib import Path\n",
    "import re\n",
    "import golf_stats_lib as gs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "text": [
      "\u001b[1;31mType:\u001b[0m        module\n",
      "\u001b[1;31mString form:\u001b[0m <module 'golf_stats_lib' from 'r:\\\\learn\\\\ds_intro\\\\golf_stats_lib.py'>\n",
      "\u001b[1;31mFile:\u001b[0m        r:\\learn\\ds_intro\\golf_stats_lib.py\n",
      "\u001b[1;31mDocstring:\u001b[0m  \n",
      "Module to provide functions related to obtaining statistics from PGA web site and saving to local CSV files.\n",
      "   As well as functions to retrieve the data in the CSVs and build DataFrames to hold a specified dataset.\n",
      "\n",
      "File: ./ds_intro/golf_stats_lib.py\n",
      "\n",
      "  - tied to various Too Old To Code blog posts\n",
      "  - function(s) to obtain data from PGA web site and save to local CSV files\n",
      "  - functions(s) to process the CSV files into DataFrames in various ways\n",
      "  - built on Jypyter notebook pandas_play_8.ipynb\n",
      "  \n",
      "Module level and/or global variables:\n",
      "----\n",
      "\n",
      "These may require updating if you add events or stat types.\n",
      "\n",
      "events - dictionary of currently available events providing PGA Tour ids\n",
      "stats - dictionary of currently available stat types providing PGA Tour ids\n",
      "st_cols - for each stat type, a list of table columns to save to csv\n",
      "d_dir - directory to which to save stat csv files\n",
      "\n",
      "Functions:\n",
      "\n",
      "  get_csv_nm(t_yr, t_id, p_st)\n",
      "  stats_2_csv(tyrs, tids, psts)\n",
      "  csv_2_df_base(t_yr, t_id, p_st)\n",
      "  csv_2_df(t_yr, t_id, p_st)\n",
      "  tourney_2_df(t_yr, t_id, psts)\n",
      "  year_2_df(t_yr, tids, psts)\n",
      "  golf_stats_2_df(tyrs, tids, psts)\n",
      "  sort_player_name(df_in)\n"
     ],
     "name": "stdout"
    }
   ],
   "source": [
    "gs?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "text": [
      "\u001b[1;31mSignature:\u001b[0m \u001b[0mgs\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtourney_2_df\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mt_yr\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mt_id\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mpsts\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;31mDocstring:\u001b[0m\n",
      "Combine all requested stats for a given tournament and year into a single DataFrame. \n",
      "Return DataFrame.\n",
      "Useage: tourney_2_df(t_yr, t_id, p_sts)\n",
      "  where t_yr = tournament year\n",
      "        t_id = tournament id (e.g. 'pga')\n",
      "        psts = list of player stats (e.g. ['drv', 'gir'])\n",
      "\u001b[1;31mFile:\u001b[0m      r:\\learn\\ds_intro\\golf_stats_lib.py\n",
      "\u001b[1;31mType:\u001b[0m      function\n"
     ],
     "name": "stdout"
    }
   ],
   "source": [
    "gs.tourney_2_df?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "output_type": "display_data",
     "data": {
      "text/plain": "                     drv\nPLAYER NAME             \nDustin Johnson     335.6\nLuke List          319.0\nPhil Mickelson     318.6\nRory McIlroy       317.4\nLucas Bjerregaard  316.6\n...                  ...\nMarty Jertson      281.9\nRich Beem          281.9\nHenrik Stenson     281.0\nBrandt Snedeker    279.8\nAndrew Putnam      274.1\n\n[82 rows x 1 columns]",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>drv</th>\n    </tr>\n    <tr>\n      <th>PLAYER NAME</th>\n      <th></th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>Dustin Johnson</th>\n      <td>335.6</td>\n    </tr>\n    <tr>\n      <th>Luke List</th>\n      <td>319.0</td>\n    </tr>\n    <tr>\n      <th>Phil Mickelson</th>\n      <td>318.6</td>\n    </tr>\n    <tr>\n      <th>Rory McIlroy</th>\n      <td>317.4</td>\n    </tr>\n    <tr>\n      <th>Lucas Bjerregaard</th>\n      <td>316.6</td>\n    </tr>\n    <tr>\n      <th>...</th>\n      <td>...</td>\n    </tr>\n    <tr>\n      <th>Marty Jertson</th>\n      <td>281.9</td>\n    </tr>\n    <tr>\n      <th>Rich Beem</th>\n      <td>281.9</td>\n    </tr>\n    <tr>\n      <th>Henrik Stenson</th>\n      <td>281.0</td>\n    </tr>\n    <tr>\n      <th>Brandt Snedeker</th>\n      <td>279.8</td>\n    </tr>\n    <tr>\n      <th>Andrew Putnam</th>\n      <td>274.1</td>\n    </tr>\n  </tbody>\n</table>\n<p>82 rows × 1 columns</p>\n</div>"
     },
     "metadata": {}
    }
   ],
   "source": [
    "df1 = gs.csv_2_df_base('2019', 'pga', 'drv')\n",
    "display(df1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "output_type": "display_data",
     "data": {
      "text/plain": "                      drv    gir\nPLAYER NAME                     \nDustin Johnson      335.6    NaN\nLuke List           319.0    NaN\nPhil Mickelson      318.6    NaN\nRory McIlroy        317.4    NaN\nLucas Bjerregaard   316.6    NaN\n...                   ...    ...\nRafa Cabrera Bello    NaN  47.22\nRyan Vermeer          NaN  47.22\nPat Perez             NaN  47.22\nRich Beem             NaN  47.22\nMarty Jertson         NaN  41.67\n\n[164 rows x 2 columns]",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>drv</th>\n      <th>gir</th>\n    </tr>\n    <tr>\n      <th>PLAYER NAME</th>\n      <th></th>\n      <th></th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>Dustin Johnson</th>\n      <td>335.6</td>\n      <td>NaN</td>\n    </tr>\n    <tr>\n      <th>Luke List</th>\n      <td>319.0</td>\n      <td>NaN</td>\n    </tr>\n    <tr>\n      <th>Phil Mickelson</th>\n      <td>318.6</td>\n      <td>NaN</td>\n    </tr>\n    <tr>\n      <th>Rory McIlroy</th>\n      <td>317.4</td>\n      <td>NaN</td>\n    </tr>\n    <tr>\n      <th>Lucas Bjerregaard</th>\n      <td>316.6</td>\n      <td>NaN</td>\n    </tr>\n    <tr>\n      <th>...</th>\n      <td>...</td>\n      <td>...</td>\n    </tr>\n    <tr>\n      <th>Rafa Cabrera Bello</th>\n      <td>NaN</td>\n      <td>47.22</td>\n    </tr>\n    <tr>\n      <th>Ryan Vermeer</th>\n      <td>NaN</td>\n      <td>47.22</td>\n    </tr>\n    <tr>\n      <th>Pat Perez</th>\n      <td>NaN</td>\n      <td>47.22</td>\n    </tr>\n    <tr>\n      <th>Rich Beem</th>\n      <td>NaN</td>\n      <td>47.22</td>\n    </tr>\n    <tr>\n      <th>Marty Jertson</th>\n      <td>NaN</td>\n      <td>41.67</td>\n    </tr>\n  </tbody>\n</table>\n<p>164 rows × 2 columns</p>\n</div>"
     },
     "metadata": {}
    }
   ],
   "source": [
    "df2 = gs.csv_2_df_base('2019', 'pga', 'gir')\n",
    "df3 = pd.concat([df1, df2])\n",
    "display(df3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "output_type": "error",
     "ename": "ValueError",
     "evalue": "cannot reindex from a duplicate axis",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mValueError\u001b[0m                                Traceback (most recent call last)",
      "\u001b[1;32m<ipython-input-6-2b21254127e8>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m      3\u001b[0m \u001b[1;31m#print(type(df3.index[0]))\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m      4\u001b[0m \u001b[1;31m#print(df3.index)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 5\u001b[1;33m \u001b[0mdf_sort\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mgs\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msort_player_name\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdf3\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m      6\u001b[0m \u001b[0mdisplay\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdf_sort\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m      7\u001b[0m \u001b[1;31m# OOPS! don't currently have a fix\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mr:\\learn\\ds_intro\\golf_stats_lib.py\u001b[0m in \u001b[0;36msort_player_name\u001b[1;34m(df_in)\u001b[0m\n\u001b[0;32m    240\u001b[0m       \u001b[0mndx_sort2\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0msorted\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdf_in\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mindex\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mkey\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mlambda\u001b[0m \u001b[0mx\u001b[0m\u001b[1;33m:\u001b[0m \u001b[0mre\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msplit\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34mr'\\W+'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mx\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;33m-\u001b[0m\u001b[1;36m1\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    241\u001b[0m       \u001b[1;31m#print(f'sorted: {ndx_sort2[:5]}')\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 242\u001b[1;33m     \u001b[0mdf_sort\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mdf_in\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mreindex\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mndx_sort2\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    243\u001b[0m     \u001b[1;32mreturn\u001b[0m \u001b[0mdf_sort\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mE:\\appDev\\Miniconda3\\envs\\ds-3.9\\lib\\site-packages\\pandas\\util\\_decorators.py\u001b[0m in \u001b[0;36mwrapper\u001b[1;34m(*args, **kwargs)\u001b[0m\n\u001b[0;32m    310\u001b[0m         \u001b[1;33m@\u001b[0m\u001b[0mwraps\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mfunc\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    311\u001b[0m         \u001b[1;32mdef\u001b[0m \u001b[0mwrapper\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;33m->\u001b[0m \u001b[0mCallable\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;33m...\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mAny\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 312\u001b[1;33m             \u001b[1;32mreturn\u001b[0m \u001b[0mfunc\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    313\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    314\u001b[0m         \u001b[0mkind\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0minspect\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mParameter\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mPOSITIONAL_OR_KEYWORD\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mE:\\appDev\\Miniconda3\\envs\\ds-3.9\\lib\\site-packages\\pandas\\core\\frame.py\u001b[0m in \u001b[0;36mreindex\u001b[1;34m(self, *args, **kwargs)\u001b[0m\n\u001b[0;32m   4174\u001b[0m         \u001b[0mkwargs\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mpop\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"axis\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   4175\u001b[0m         \u001b[0mkwargs\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mpop\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"labels\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 4176\u001b[1;33m         \u001b[1;32mreturn\u001b[0m \u001b[0msuper\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mreindex\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m   4177\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   4178\u001b[0m     def drop(\n",
      "\u001b[1;32mE:\\appDev\\Miniconda3\\envs\\ds-3.9\\lib\\site-packages\\pandas\\core\\generic.py\u001b[0m in \u001b[0;36mreindex\u001b[1;34m(self, *args, **kwargs)\u001b[0m\n\u001b[0;32m   4809\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   4810\u001b[0m         \u001b[1;31m# perform the reindex on the axes\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 4811\u001b[1;33m         return self._reindex_axes(\n\u001b[0m\u001b[0;32m   4812\u001b[0m             \u001b[0maxes\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mlevel\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mlimit\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mtolerance\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mmethod\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mfill_value\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcopy\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   4813\u001b[0m         ).__finalize__(self, method=\"reindex\")\n",
      "\u001b[1;32mE:\\appDev\\Miniconda3\\envs\\ds-3.9\\lib\\site-packages\\pandas\\core\\frame.py\u001b[0m in \u001b[0;36m_reindex_axes\u001b[1;34m(self, axes, level, limit, tolerance, method, fill_value, copy)\u001b[0m\n\u001b[0;32m   4020\u001b[0m         \u001b[0mindex\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0maxes\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m\"index\"\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   4021\u001b[0m         \u001b[1;32mif\u001b[0m \u001b[0mindex\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 4022\u001b[1;33m             frame = frame._reindex_index(\n\u001b[0m\u001b[0;32m   4023\u001b[0m                 \u001b[0mindex\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mmethod\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcopy\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mlevel\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mfill_value\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mlimit\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mtolerance\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   4024\u001b[0m             )\n",
      "\u001b[1;32mE:\\appDev\\Miniconda3\\envs\\ds-3.9\\lib\\site-packages\\pandas\\core\\frame.py\u001b[0m in \u001b[0;36m_reindex_index\u001b[1;34m(self, new_index, method, copy, level, fill_value, limit, tolerance)\u001b[0m\n\u001b[0;32m   4039\u001b[0m             \u001b[0mnew_index\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mmethod\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mmethod\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mlevel\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mlevel\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mlimit\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mlimit\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mtolerance\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mtolerance\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   4040\u001b[0m         )\n\u001b[1;32m-> 4041\u001b[1;33m         return self._reindex_with_indexers(\n\u001b[0m\u001b[0;32m   4042\u001b[0m             \u001b[1;33m{\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m:\u001b[0m \u001b[1;33m[\u001b[0m\u001b[0mnew_index\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mindexer\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m}\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   4043\u001b[0m             \u001b[0mcopy\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mcopy\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mE:\\appDev\\Miniconda3\\envs\\ds-3.9\\lib\\site-packages\\pandas\\core\\generic.py\u001b[0m in \u001b[0;36m_reindex_with_indexers\u001b[1;34m(self, reindexers, fill_value, copy, allow_dups)\u001b[0m\n\u001b[0;32m   4875\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   4876\u001b[0m             \u001b[1;31m# TODO: speed up on homogeneous DataFrame objects\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 4877\u001b[1;33m             new_data = new_data.reindex_indexer(\n\u001b[0m\u001b[0;32m   4878\u001b[0m                 \u001b[0mindex\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   4879\u001b[0m                 \u001b[0mindexer\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mE:\\appDev\\Miniconda3\\envs\\ds-3.9\\lib\\site-packages\\pandas\\core\\internals\\managers.py\u001b[0m in \u001b[0;36mreindex_indexer\u001b[1;34m(self, new_axis, indexer, axis, fill_value, allow_dups, copy, consolidate, only_slice)\u001b[0m\n\u001b[0;32m   1299\u001b[0m         \u001b[1;31m# some axes don't allow reindexing with dups\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   1300\u001b[0m         \u001b[1;32mif\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[0mallow_dups\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1301\u001b[1;33m             \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0maxes\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0maxis\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_can_reindex\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mindexer\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m   1302\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   1303\u001b[0m         \u001b[1;32mif\u001b[0m \u001b[0maxis\u001b[0m \u001b[1;33m>=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mndim\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mE:\\appDev\\Miniconda3\\envs\\ds-3.9\\lib\\site-packages\\pandas\\core\\indexes\\base.py\u001b[0m in \u001b[0;36m_can_reindex\u001b[1;34m(self, indexer)\u001b[0m\n\u001b[0;32m   3474\u001b[0m         \u001b[1;31m# trying to reindex on an axis with duplicates\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   3475\u001b[0m         \u001b[1;32mif\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_index_as_unique\u001b[0m \u001b[1;32mand\u001b[0m \u001b[0mlen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mindexer\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 3476\u001b[1;33m             \u001b[1;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"cannot reindex from a duplicate axis\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m   3477\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   3478\u001b[0m     \u001b[1;32mdef\u001b[0m \u001b[0mreindex\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mtarget\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mmethod\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mNone\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mlevel\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mNone\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mlimit\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mNone\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mtolerance\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mNone\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;31mValueError\u001b[0m: cannot reindex from a duplicate axis"
     ]
    }
   ],
   "source": [
    "# not what I would expect\n",
    "# let's sort the dataframe on the player index\n",
    "#print(type(df3.index[0]))\n",
    "#print(df3.index)\n",
    "df_sort = gs.sort_player_name(df3)\n",
    "display(df_sort)\n",
    "# OOPS! don't currently have a fix"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "output_type": "display_data",
     "data": {
      "text/plain": "                        drv    gir\nPLAYER NAME                       \nAbraham Ancer         285.6  56.94\nKiradech Aphibarnrat  284.0  54.17\nRich Beem             281.9  47.22\nRafa Cabrera Bello    292.3  47.22\nDaniel Berger         301.5  55.56\n...                     ...    ...\nJimmy Walker          304.6  69.44\nMatt Wallace          300.3  56.94\nDanny Willett         307.0  69.44\nAaron Wise            301.0  59.72\nGary Woodland         311.4  70.83\n\n[82 rows x 2 columns]",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>drv</th>\n      <th>gir</th>\n    </tr>\n    <tr>\n      <th>PLAYER NAME</th>\n      <th></th>\n      <th></th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>Abraham Ancer</th>\n      <td>285.6</td>\n      <td>56.94</td>\n    </tr>\n    <tr>\n      <th>Kiradech Aphibarnrat</th>\n      <td>284.0</td>\n      <td>54.17</td>\n    </tr>\n    <tr>\n      <th>Rich Beem</th>\n      <td>281.9</td>\n      <td>47.22</td>\n    </tr>\n    <tr>\n      <th>Rafa Cabrera Bello</th>\n      <td>292.3</td>\n      <td>47.22</td>\n    </tr>\n    <tr>\n      <th>Daniel Berger</th>\n      <td>301.5</td>\n      <td>55.56</td>\n    </tr>\n    <tr>\n      <th>...</th>\n      <td>...</td>\n      <td>...</td>\n    </tr>\n    <tr>\n      <th>Jimmy Walker</th>\n      <td>304.6</td>\n      <td>69.44</td>\n    </tr>\n    <tr>\n      <th>Matt Wallace</th>\n      <td>300.3</td>\n      <td>56.94</td>\n    </tr>\n    <tr>\n      <th>Danny Willett</th>\n      <td>307.0</td>\n      <td>69.44</td>\n    </tr>\n    <tr>\n      <th>Aaron Wise</th>\n      <td>301.0</td>\n      <td>59.72</td>\n    </tr>\n    <tr>\n      <th>Gary Woodland</th>\n      <td>311.4</td>\n      <td>70.83</td>\n    </tr>\n  </tbody>\n</table>\n<p>82 rows × 2 columns</p>\n</div>"
     },
     "metadata": {}
    }
   ],
   "source": [
    "df3 = pd.concat([df1, df2], axis=1)\n",
    "df_sort = gs.sort_player_name(df3)\n",
    "display(df_sort)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "output_type": "display_data",
     "data": {
      "text/plain": "                        drv    gir    gir\nPLAYER NAME                              \nAbraham Ancer         285.6  56.94    NaN\nKiradech Aphibarnrat  284.0  54.17    NaN\nRyan Armour             NaN    NaN  55.56\nRich Beem             281.9  47.22    NaN\nRafa Cabrera Bello    292.3  47.22  58.33\n...                     ...    ...    ...\nBoo Weekley             NaN    NaN  59.72\nRichy Werenski          NaN    NaN  47.22\nDanny Willett         307.0  69.44    NaN\nAaron Wise            301.0  59.72    NaN\nGary Woodland         311.4  70.83    NaN\n\n[124 rows x 3 columns]",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>drv</th>\n      <th>gir</th>\n      <th>gir</th>\n    </tr>\n    <tr>\n      <th>PLAYER NAME</th>\n      <th></th>\n      <th></th>\n      <th></th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>Abraham Ancer</th>\n      <td>285.6</td>\n      <td>56.94</td>\n      <td>NaN</td>\n    </tr>\n    <tr>\n      <th>Kiradech Aphibarnrat</th>\n      <td>284.0</td>\n      <td>54.17</td>\n      <td>NaN</td>\n    </tr>\n    <tr>\n      <th>Ryan Armour</th>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>55.56</td>\n    </tr>\n    <tr>\n      <th>Rich Beem</th>\n      <td>281.9</td>\n      <td>47.22</td>\n      <td>NaN</td>\n    </tr>\n    <tr>\n      <th>Rafa Cabrera Bello</th>\n      <td>292.3</td>\n      <td>47.22</td>\n      <td>58.33</td>\n    </tr>\n    <tr>\n      <th>...</th>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n    </tr>\n    <tr>\n      <th>Boo Weekley</th>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>59.72</td>\n    </tr>\n    <tr>\n      <th>Richy Werenski</th>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>47.22</td>\n    </tr>\n    <tr>\n      <th>Danny Willett</th>\n      <td>307.0</td>\n      <td>69.44</td>\n      <td>NaN</td>\n    </tr>\n    <tr>\n      <th>Aaron Wise</th>\n      <td>301.0</td>\n      <td>59.72</td>\n      <td>NaN</td>\n    </tr>\n    <tr>\n      <th>Gary Woodland</th>\n      <td>311.4</td>\n      <td>70.83</td>\n      <td>NaN</td>\n    </tr>\n  </tbody>\n</table>\n<p>124 rows × 3 columns</p>\n</div>"
     },
     "metadata": {}
    }
   ],
   "source": [
    "df4 = gs.csv_2_df_base('2019','rbch','gir')\n",
    "#display(df4)\n",
    "df5 = pd.concat([df3, df4], axis=1)\n",
    "#display(df5)\n",
    "df_sort = gs.sort_player_name(df5)\n",
    "display(df_sort)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "output_type": "display_data",
     "data": {
      "text/plain": "                       2019\n                        pga\n                        drv\nPLAYER NAME                \nAbraham Ancer         285.6\nKiradech Aphibarnrat  284.0\nRich Beem             281.9\nRafa Cabrera Bello    292.3\nDaniel Berger         301.5\n...                     ...\nJimmy Walker          304.6\nMatt Wallace          300.3\nDanny Willett         307.0\nAaron Wise            301.0\nGary Woodland         311.4\n\n[82 rows x 1 columns]",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead tr th {\n        text-align: left;\n    }\n\n    .dataframe thead tr:last-of-type th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr>\n      <th></th>\n      <th>2019</th>\n    </tr>\n    <tr>\n      <th></th>\n      <th>pga</th>\n    </tr>\n    <tr>\n      <th></th>\n      <th>drv</th>\n    </tr>\n    <tr>\n      <th>PLAYER NAME</th>\n      <th></th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>Abraham Ancer</th>\n      <td>285.6</td>\n    </tr>\n    <tr>\n      <th>Kiradech Aphibarnrat</th>\n      <td>284.0</td>\n    </tr>\n    <tr>\n      <th>Rich Beem</th>\n      <td>281.9</td>\n    </tr>\n    <tr>\n      <th>Rafa Cabrera Bello</th>\n      <td>292.3</td>\n    </tr>\n    <tr>\n      <th>Daniel Berger</th>\n      <td>301.5</td>\n    </tr>\n    <tr>\n      <th>...</th>\n      <td>...</td>\n    </tr>\n    <tr>\n      <th>Jimmy Walker</th>\n      <td>304.6</td>\n    </tr>\n    <tr>\n      <th>Matt Wallace</th>\n      <td>300.3</td>\n    </tr>\n    <tr>\n      <th>Danny Willett</th>\n      <td>307.0</td>\n    </tr>\n    <tr>\n      <th>Aaron Wise</th>\n      <td>301.0</td>\n    </tr>\n    <tr>\n      <th>Gary Woodland</th>\n      <td>311.4</td>\n    </tr>\n  </tbody>\n</table>\n<p>82 rows × 1 columns</p>\n</div>"
     },
     "metadata": {}
    }
   ],
   "source": [
    "# unfortunately the above does not tell us to which tournament and year the stat belongs\n",
    "# and likely not something we want to keep track of separately\n",
    "# So let's try adding that info to each csv dataframe\n",
    "df6 = gs.csv_2_df_base('2019', 'pga', 'drv')\n",
    "cols = pd.MultiIndex.from_tuples([('2019', 'pga', 'drv')])\n",
    "#midx = pd.MultiIndex(levels=[['zero', 'one'], ['x','y']], labels=[[1,1,0,],[1,0,1,]])\n",
    "df6.columns = cols\n",
    "df6 = gs.sort_player_name(df6)\n",
    "display(df6)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "output_type": "display_data",
     "data": {
      "text/plain": "                       2019       \n                        pga       \n                        drv    gir\nPLAYER NAME                       \nAbraham Ancer         285.6  56.94\nKiradech Aphibarnrat  284.0  54.17\nRich Beem             281.9  47.22\nRafa Cabrera Bello    292.3  47.22\nDaniel Berger         301.5  55.56",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead tr th {\n        text-align: left;\n    }\n\n    .dataframe thead tr:last-of-type th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr>\n      <th></th>\n      <th colspan=\"2\" halign=\"left\">2019</th>\n    </tr>\n    <tr>\n      <th></th>\n      <th colspan=\"2\" halign=\"left\">pga</th>\n    </tr>\n    <tr>\n      <th></th>\n      <th>drv</th>\n      <th>gir</th>\n    </tr>\n    <tr>\n      <th>PLAYER NAME</th>\n      <th></th>\n      <th></th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>Abraham Ancer</th>\n      <td>285.6</td>\n      <td>56.94</td>\n    </tr>\n    <tr>\n      <th>Kiradech Aphibarnrat</th>\n      <td>284.0</td>\n      <td>54.17</td>\n    </tr>\n    <tr>\n      <th>Rich Beem</th>\n      <td>281.9</td>\n      <td>47.22</td>\n    </tr>\n    <tr>\n      <th>Rafa Cabrera Bello</th>\n      <td>292.3</td>\n      <td>47.22</td>\n    </tr>\n    <tr>\n      <th>Daniel Berger</th>\n      <td>301.5</td>\n      <td>55.56</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
     },
     "metadata": {}
    },
    {
     "output_type": "display_data",
     "data": {
      "text/plain": "2019  pga  drv    317.40\n           gir     65.28\nName: Rory McIlroy, dtype: float64"
     },
     "metadata": {}
    }
   ],
   "source": [
    "\n",
    "# now another stat same year and tourney\n",
    "df7 = gs.csv_2_df_base('2019', 'pga', 'gir')\n",
    "cols = pd.MultiIndex.from_tuples([('2019', 'pga', 'gir')])\n",
    "df7.columns = cols\n",
    "df7 = gs.sort_player_name(df7)\n",
    "df_comb1 = pd.concat([df6, df7], axis=1)\n",
    "display(df_comb1.head())\n",
    "display(df_comb1.loc['Rory McIlroy'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "output_type": "display_data",
     "data": {
      "text/plain": "                       2019              \n                        pga          rbch\n                        drv    gir    gir\nPLAYER NAME                              \nAbraham Ancer         285.6  56.94    NaN\nKiradech Aphibarnrat  284.0  54.17    NaN\nRyan Armour             NaN    NaN  55.56\nRich Beem             281.9  47.22    NaN\nRafa Cabrera Bello    292.3  47.22  58.33\n...                     ...    ...    ...\nBoo Weekley             NaN    NaN  59.72\nRichy Werenski          NaN    NaN  47.22\nDanny Willett         307.0  69.44    NaN\nAaron Wise            301.0  59.72    NaN\nGary Woodland         311.4  70.83    NaN\n\n[124 rows x 3 columns]",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead tr th {\n        text-align: left;\n    }\n\n    .dataframe thead tr:last-of-type th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr>\n      <th></th>\n      <th colspan=\"3\" halign=\"left\">2019</th>\n    </tr>\n    <tr>\n      <th></th>\n      <th colspan=\"2\" halign=\"left\">pga</th>\n      <th>rbch</th>\n    </tr>\n    <tr>\n      <th></th>\n      <th>drv</th>\n      <th>gir</th>\n      <th>gir</th>\n    </tr>\n    <tr>\n      <th>PLAYER NAME</th>\n      <th></th>\n      <th></th>\n      <th></th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>Abraham Ancer</th>\n      <td>285.6</td>\n      <td>56.94</td>\n      <td>NaN</td>\n    </tr>\n    <tr>\n      <th>Kiradech Aphibarnrat</th>\n      <td>284.0</td>\n      <td>54.17</td>\n      <td>NaN</td>\n    </tr>\n    <tr>\n      <th>Ryan Armour</th>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>55.56</td>\n    </tr>\n    <tr>\n      <th>Rich Beem</th>\n      <td>281.9</td>\n      <td>47.22</td>\n      <td>NaN</td>\n    </tr>\n    <tr>\n      <th>Rafa Cabrera Bello</th>\n      <td>292.3</td>\n      <td>47.22</td>\n      <td>58.33</td>\n    </tr>\n    <tr>\n      <th>...</th>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n    </tr>\n    <tr>\n      <th>Boo Weekley</th>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>59.72</td>\n    </tr>\n    <tr>\n      <th>Richy Werenski</th>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>47.22</td>\n    </tr>\n    <tr>\n      <th>Danny Willett</th>\n      <td>307.0</td>\n      <td>69.44</td>\n      <td>NaN</td>\n    </tr>\n    <tr>\n      <th>Aaron Wise</th>\n      <td>301.0</td>\n      <td>59.72</td>\n      <td>NaN</td>\n    </tr>\n    <tr>\n      <th>Gary Woodland</th>\n      <td>311.4</td>\n      <td>70.83</td>\n      <td>NaN</td>\n    </tr>\n  </tbody>\n</table>\n<p>124 rows × 3 columns</p>\n</div>"
     },
     "metadata": {}
    }
   ],
   "source": [
    "# and a different tournament\n",
    "df8 = gs.csv_2_df_base('2019', 'rbch', 'gir')\n",
    "cols = pd.MultiIndex.from_tuples([('2019', 'rbch', 'gir')])\n",
    "df8.columns = cols\n",
    "df8 = gs.sort_player_name(df8)\n",
    "df_comb2 = pd.concat([df6, df7, df8], axis=1)\n",
    "df_comb2 = gs.sort_player_name(df_comb2)\n",
    "display(df_comb2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "year                   2019       \n",
       "event                   pga   rbch\n",
       "stat                    gir    gir\n",
       "PLAYER NAME                       \n",
       "Abraham Ancer         56.94    NaN\n",
       "Kiradech Aphibarnrat  54.17    NaN\n",
       "Ryan Armour             NaN  55.56\n",
       "Rich Beem             47.22    NaN\n",
       "Rafa Cabrera Bello    47.22  58.33\n",
       "...                     ...    ...\n",
       "Boo Weekley             NaN  59.72\n",
       "Richy Werenski          NaN  47.22\n",
       "Danny Willett         69.44    NaN\n",
       "Aaron Wise            59.72    NaN\n",
       "Gary Woodland         70.83    NaN\n",
       "\n",
       "[124 rows x 2 columns]"
      ],
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead tr th {\n        text-align: left;\n    }\n\n    .dataframe thead tr:last-of-type th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr>\n      <th>year</th>\n      <th colspan=\"2\" halign=\"left\">2019</th>\n    </tr>\n    <tr>\n      <th>event</th>\n      <th>pga</th>\n      <th>rbch</th>\n    </tr>\n    <tr>\n      <th>stat</th>\n      <th>gir</th>\n      <th>gir</th>\n    </tr>\n    <tr>\n      <th>PLAYER NAME</th>\n      <th></th>\n      <th></th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>Abraham Ancer</th>\n      <td>56.94</td>\n      <td>NaN</td>\n    </tr>\n    <tr>\n      <th>Kiradech Aphibarnrat</th>\n      <td>54.17</td>\n      <td>NaN</td>\n    </tr>\n    <tr>\n      <th>Ryan Armour</th>\n      <td>NaN</td>\n      <td>55.56</td>\n    </tr>\n    <tr>\n      <th>Rich Beem</th>\n      <td>47.22</td>\n      <td>NaN</td>\n    </tr>\n    <tr>\n      <th>Rafa Cabrera Bello</th>\n      <td>47.22</td>\n      <td>58.33</td>\n    </tr>\n    <tr>\n      <th>...</th>\n      <td>...</td>\n      <td>...</td>\n    </tr>\n    <tr>\n      <th>Boo Weekley</th>\n      <td>NaN</td>\n      <td>59.72</td>\n    </tr>\n    <tr>\n      <th>Richy Werenski</th>\n      <td>NaN</td>\n      <td>47.22</td>\n    </tr>\n    <tr>\n      <th>Danny Willett</th>\n      <td>69.44</td>\n      <td>NaN</td>\n    </tr>\n    <tr>\n      <th>Aaron Wise</th>\n      <td>59.72</td>\n      <td>NaN</td>\n    </tr>\n    <tr>\n      <th>Gary Woodland</th>\n      <td>70.83</td>\n      <td>NaN</td>\n    </tr>\n  </tbody>\n</table>\n<p>124 rows × 2 columns</p>\n</div>"
     },
     "metadata": {},
     "execution_count": 12
    }
   ],
   "source": [
    "#df_comb2.columns\n",
    "df_comb2.columns.names = ['year', 'event', 'stat']\n",
    "filter = df_comb2.columns.get_level_values('stat') == 'gir'\n",
    "df_comb2.iloc[:, filter].dropna(how='all')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "output_type": "display_data",
     "data": {
      "text/plain": "year                   2019                 2020\nevent                   pga          rbch    pga\nstat                    drv    gir    gir    drv\nPLAYER NAME                                     \nByeong Hun An           NaN    NaN    NaN  286.6\nAbraham Ancer         285.6  56.94    NaN  295.6\nKiradech Aphibarnrat  284.0  54.17    NaN    NaN\nRyan Armour             NaN    NaN  55.56    NaN\nRich Beem             281.9  47.22    NaN    NaN\n...                     ...    ...    ...    ...\nDanny Willett         307.0  69.44    NaN    NaN\nAaron Wise            301.0  59.72    NaN    NaN\nMatthew Wolff           NaN    NaN    NaN  303.8\nGary Woodland         311.4  70.83    NaN  293.0\nTiger Woods             NaN    NaN    NaN  304.0\n\n[153 rows x 4 columns]",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead tr th {\n        text-align: left;\n    }\n\n    .dataframe thead tr:last-of-type th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr>\n      <th>year</th>\n      <th colspan=\"3\" halign=\"left\">2019</th>\n      <th>2020</th>\n    </tr>\n    <tr>\n      <th>event</th>\n      <th colspan=\"2\" halign=\"left\">pga</th>\n      <th>rbch</th>\n      <th>pga</th>\n    </tr>\n    <tr>\n      <th>stat</th>\n      <th>drv</th>\n      <th>gir</th>\n      <th>gir</th>\n      <th>drv</th>\n    </tr>\n    <tr>\n      <th>PLAYER NAME</th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>Byeong Hun An</th>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>286.6</td>\n    </tr>\n    <tr>\n      <th>Abraham Ancer</th>\n      <td>285.6</td>\n      <td>56.94</td>\n      <td>NaN</td>\n      <td>295.6</td>\n    </tr>\n    <tr>\n      <th>Kiradech Aphibarnrat</th>\n      <td>284.0</td>\n      <td>54.17</td>\n      <td>NaN</td>\n      <td>NaN</td>\n    </tr>\n    <tr>\n      <th>Ryan Armour</th>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>55.56</td>\n      <td>NaN</td>\n    </tr>\n    <tr>\n      <th>Rich Beem</th>\n      <td>281.9</td>\n      <td>47.22</td>\n      <td>NaN</td>\n      <td>NaN</td>\n    </tr>\n    <tr>\n      <th>...</th>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n    </tr>\n    <tr>\n      <th>Danny Willett</th>\n      <td>307.0</td>\n      <td>69.44</td>\n      <td>NaN</td>\n      <td>NaN</td>\n    </tr>\n    <tr>\n      <th>Aaron Wise</th>\n      <td>301.0</td>\n      <td>59.72</td>\n      <td>NaN</td>\n      <td>NaN</td>\n    </tr>\n    <tr>\n      <th>Matthew Wolff</th>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>303.8</td>\n    </tr>\n    <tr>\n      <th>Gary Woodland</th>\n      <td>311.4</td>\n      <td>70.83</td>\n      <td>NaN</td>\n      <td>293.0</td>\n    </tr>\n    <tr>\n      <th>Tiger Woods</th>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>304.0</td>\n    </tr>\n  </tbody>\n</table>\n<p>153 rows × 4 columns</p>\n</div>"
     },
     "metadata": {}
    }
   ],
   "source": [
    "# add another year\n",
    "# was going to try the append() method, but...\n",
    "df9 = gs.csv_2_df_base('2020', 'pga', 'drv')\n",
    "cols = pd.MultiIndex.from_tuples([('2020', 'pga', 'drv')])\n",
    "#midx = pd.MultiIndex(levels=[['zero', 'one'], ['x','y']], labels=[[1,1,0,],[1,0,1,]])\n",
    "df9.columns = cols\n",
    "df_comb2 = pd.concat([df6, df7, df8, df9], axis=1)\n",
    "df_comb2.columns.names = ['year', 'event', 'stat']\n",
    "df_comb2 = gs.sort_player_name(df_comb2)\n",
    "display(df_comb2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "year                   2019   2020\n",
       "event                   pga    pga\n",
       "stat                    drv    drv\n",
       "PLAYER NAME                       \n",
       "Abraham Ancer         285.6  295.6\n",
       "Daniel Berger         301.5  291.9\n",
       "Patrick Cantlay       314.6  295.3\n",
       "Paul Casey            305.6  295.8\n",
       "Cameron Champ         313.8  321.1\n",
       "Joel Dahmen           295.8  289.5\n",
       "Jason Day             310.0  295.0\n",
       "Tony Finau            314.3  305.3\n",
       "Tommy Fleetwood       301.3  305.5\n",
       "Emiliano Grillo       302.9  287.4\n",
       "Adam Hadwin           285.8  282.9\n",
       "Billy Horschel        296.1  284.4\n",
       "Harold Varner III     308.3  286.8\n",
       "Dustin Johnson        335.6  305.0\n",
       "Sung Kang             308.1  292.9\n",
       "Kurt Kitayama         315.3  302.9\n",
       "Brooks Koepka         313.0  295.8\n",
       "Danny Lee             306.5  291.0\n",
       "Haotong Li            293.9  304.1\n",
       "Luke List             319.0  299.6\n",
       "Adam Long             291.5  285.4\n",
       "Shane Lowry           300.0  287.6\n",
       "Joost Luiten          284.9  286.0\n",
       "Hideki Matsuyama      303.5  285.3\n",
       "Rory McIlroy          317.4  312.5\n",
       "Phil Mickelson        318.6  293.1\n",
       "Alex Noren            298.0  289.0\n",
       "Louis Oosthuizen      306.0  284.6\n",
       "J.T. Poston           291.5  291.6\n",
       "Chez Reavie           289.1  274.5\n",
       "Erik van Rooyen       305.3  295.3\n",
       "Justin Rose           309.6  294.4\n",
       "Xander Schauffele     307.0  302.3\n",
       "Adam Scott            301.5  299.1\n",
       "Webb Simpson          295.3  281.1\n",
       "Cameron Smith         302.5  286.3\n",
       "Brandt Snedeker       279.8  280.9\n",
       "Jordan Spieth         303.3  295.9\n",
       "Michael Lorenzo-Vera  302.0  287.6\n",
       "Matt Wallace          300.3  286.6\n",
       "Gary Woodland         311.4  293.0"
      ],
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead tr th {\n        text-align: left;\n    }\n\n    .dataframe thead tr:last-of-type th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr>\n      <th>year</th>\n      <th>2019</th>\n      <th>2020</th>\n    </tr>\n    <tr>\n      <th>event</th>\n      <th>pga</th>\n      <th>pga</th>\n    </tr>\n    <tr>\n      <th>stat</th>\n      <th>drv</th>\n      <th>drv</th>\n    </tr>\n    <tr>\n      <th>PLAYER NAME</th>\n      <th></th>\n      <th></th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>Abraham Ancer</th>\n      <td>285.6</td>\n      <td>295.6</td>\n    </tr>\n    <tr>\n      <th>Daniel Berger</th>\n      <td>301.5</td>\n      <td>291.9</td>\n    </tr>\n    <tr>\n      <th>Patrick Cantlay</th>\n      <td>314.6</td>\n      <td>295.3</td>\n    </tr>\n    <tr>\n      <th>Paul Casey</th>\n      <td>305.6</td>\n      <td>295.8</td>\n    </tr>\n    <tr>\n      <th>Cameron Champ</th>\n      <td>313.8</td>\n      <td>321.1</td>\n    </tr>\n    <tr>\n      <th>Joel Dahmen</th>\n      <td>295.8</td>\n      <td>289.5</td>\n    </tr>\n    <tr>\n      <th>Jason Day</th>\n      <td>310.0</td>\n      <td>295.0</td>\n    </tr>\n    <tr>\n      <th>Tony Finau</th>\n      <td>314.3</td>\n      <td>305.3</td>\n    </tr>\n    <tr>\n      <th>Tommy Fleetwood</th>\n      <td>301.3</td>\n      <td>305.5</td>\n    </tr>\n    <tr>\n      <th>Emiliano Grillo</th>\n      <td>302.9</td>\n      <td>287.4</td>\n    </tr>\n    <tr>\n      <th>Adam Hadwin</th>\n      <td>285.8</td>\n      <td>282.9</td>\n    </tr>\n    <tr>\n      <th>Billy Horschel</th>\n      <td>296.1</td>\n      <td>284.4</td>\n    </tr>\n    <tr>\n      <th>Harold Varner III</th>\n      <td>308.3</td>\n      <td>286.8</td>\n    </tr>\n    <tr>\n      <th>Dustin Johnson</th>\n      <td>335.6</td>\n      <td>305.0</td>\n    </tr>\n    <tr>\n      <th>Sung Kang</th>\n      <td>308.1</td>\n      <td>292.9</td>\n    </tr>\n    <tr>\n      <th>Kurt Kitayama</th>\n      <td>315.3</td>\n      <td>302.9</td>\n    </tr>\n    <tr>\n      <th>Brooks Koepka</th>\n      <td>313.0</td>\n      <td>295.8</td>\n    </tr>\n    <tr>\n      <th>Danny Lee</th>\n      <td>306.5</td>\n      <td>291.0</td>\n    </tr>\n    <tr>\n      <th>Haotong Li</th>\n      <td>293.9</td>\n      <td>304.1</td>\n    </tr>\n    <tr>\n      <th>Luke List</th>\n      <td>319.0</td>\n      <td>299.6</td>\n    </tr>\n    <tr>\n      <th>Adam Long</th>\n      <td>291.5</td>\n      <td>285.4</td>\n    </tr>\n    <tr>\n      <th>Shane Lowry</th>\n      <td>300.0</td>\n      <td>287.6</td>\n    </tr>\n    <tr>\n      <th>Joost Luiten</th>\n      <td>284.9</td>\n      <td>286.0</td>\n    </tr>\n    <tr>\n      <th>Hideki Matsuyama</th>\n      <td>303.5</td>\n      <td>285.3</td>\n    </tr>\n    <tr>\n      <th>Rory McIlroy</th>\n      <td>317.4</td>\n      <td>312.5</td>\n    </tr>\n    <tr>\n      <th>Phil Mickelson</th>\n      <td>318.6</td>\n      <td>293.1</td>\n    </tr>\n    <tr>\n      <th>Alex Noren</th>\n      <td>298.0</td>\n      <td>289.0</td>\n    </tr>\n    <tr>\n      <th>Louis Oosthuizen</th>\n      <td>306.0</td>\n      <td>284.6</td>\n    </tr>\n    <tr>\n      <th>J.T. Poston</th>\n      <td>291.5</td>\n      <td>291.6</td>\n    </tr>\n    <tr>\n      <th>Chez Reavie</th>\n      <td>289.1</td>\n      <td>274.5</td>\n    </tr>\n    <tr>\n      <th>Erik van Rooyen</th>\n      <td>305.3</td>\n      <td>295.3</td>\n    </tr>\n    <tr>\n      <th>Justin Rose</th>\n      <td>309.6</td>\n      <td>294.4</td>\n    </tr>\n    <tr>\n      <th>Xander Schauffele</th>\n      <td>307.0</td>\n      <td>302.3</td>\n    </tr>\n    <tr>\n      <th>Adam Scott</th>\n      <td>301.5</td>\n      <td>299.1</td>\n    </tr>\n    <tr>\n      <th>Webb Simpson</th>\n      <td>295.3</td>\n      <td>281.1</td>\n    </tr>\n    <tr>\n      <th>Cameron Smith</th>\n      <td>302.5</td>\n      <td>286.3</td>\n    </tr>\n    <tr>\n      <th>Brandt Snedeker</th>\n      <td>279.8</td>\n      <td>280.9</td>\n    </tr>\n    <tr>\n      <th>Jordan Spieth</th>\n      <td>303.3</td>\n      <td>295.9</td>\n    </tr>\n    <tr>\n      <th>Michael Lorenzo-Vera</th>\n      <td>302.0</td>\n      <td>287.6</td>\n    </tr>\n    <tr>\n      <th>Matt Wallace</th>\n      <td>300.3</td>\n      <td>286.6</td>\n    </tr>\n    <tr>\n      <th>Gary Woodland</th>\n      <td>311.4</td>\n      <td>293.0</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
     },
     "metadata": {},
     "execution_count": 14
    }
   ],
   "source": [
    "filter = df_comb2.columns.get_level_values('stat') == 'drv'\n",
    "df_comb2.iloc[:, filter].dropna(how='any')"
   ]
  },
  {
   "source": [
    "Let's have a look at pd.merge() and joins."
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "from IPython.display import display_html\n",
    "def display_side_by_side(*args):\n",
    "    html_str=''\n",
    "    for df in args:\n",
    "        html_str += df.to_html()\n",
    "    display_html(html_str.replace('table','table style=\"display:inline; margin:4px; border:none\"'),raw=True)\n",
    "    \n",
    "    #https://stackoverflow.com/questions/38783027/jupyter-notebook-display-two-pandas-tables-side-by-side"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_small_df(t_yr, t_id, p_st):\n",
    "    col_nms = {'drv': {'AVG.': 'drv'},\n",
    "              'gir': {'%': 'gir'},\n",
    "              'scramble': {'%': 'scramble'}\n",
    "    }\n",
    "    csv_fl = gs.get_csv_nm(t_yr, t_id, p_st)\n",
    "    get_col = list(col_nms[p_st].keys())[0]\n",
    "    df_t = pd.read_csv(csv_fl, usecols=[get_col, 'PLAYER NAME'])\n",
    "    df_t.rename(columns=col_nms[p_st], inplace=True)\n",
    "    #df1 = gs.sort_player_name(df1)\n",
    "    #df1.index = sorted(df1['PLAYER NAME'],key=lambda x: re.split(r'\\W+', x)[-1])\n",
    "    #print(df1['PLAYER NAME'].str.extractall(r'(\\W+)$'))\n",
    "    df_t['surnm'] = df_t['PLAYER NAME'].apply(lambda x: re.split(r'\\W+', x)[-1])\n",
    "    df_t = df_t.sort_values(by='surnm').drop(['surnm'], axis=1)\n",
    "    df_t = df_t.iloc[8:13, :]\n",
    "    return df_t\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "output_type": "display_data",
     "data": {
      "text/html": "<table style=\"display:inline; margin:4px; border:none\" border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>PLAYER NAME</th>\n      <th>drv</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>28</th>\n      <td>Jason Day</td>\n      <td>295.0</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>Bryson DeChambeau</td>\n      <td>318.1</td>\n    </tr>\n    <tr>\n      <th>19</th>\n      <td>Harris English</td>\n      <td>297.1</td>\n    </tr>\n    <tr>\n      <th>5</th>\n      <td>Tony Finau</td>\n      <td>305.3</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>Tommy Fleetwood</td>\n      <td>305.5</td>\n    </tr>\n  </tbody>\n</table style=\"display:inline; margin:4px; border:none\"><table style=\"display:inline; margin:4px; border:none\" border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>PLAYER NAME</th>\n      <th>gir</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>2</th>\n      <td>Jason Day</td>\n      <td>76.39</td>\n    </tr>\n    <tr>\n      <th>23</th>\n      <td>Bryson DeChambeau</td>\n      <td>66.67</td>\n    </tr>\n    <tr>\n      <th>75</th>\n      <td>Harris English</td>\n      <td>52.78</td>\n    </tr>\n    <tr>\n      <th>5</th>\n      <td>Tony Finau</td>\n      <td>72.22</td>\n    </tr>\n    <tr>\n      <th>40</th>\n      <td>Tommy Fleetwood</td>\n      <td>63.89</td>\n    </tr>\n  </tbody>\n</table style=\"display:inline; margin:4px; border:none\"><table style=\"display:inline; margin:4px; border:none\" border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>PLAYER NAME</th>\n      <th>drv</th>\n      <th>gir</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>Jason Day</td>\n      <td>295.0</td>\n      <td>76.39</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>Bryson DeChambeau</td>\n      <td>318.1</td>\n      <td>66.67</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>Harris English</td>\n      <td>297.1</td>\n      <td>52.78</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>Tony Finau</td>\n      <td>305.3</td>\n      <td>72.22</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>Tommy Fleetwood</td>\n      <td>305.5</td>\n      <td>63.89</td>\n    </tr>\n  </tbody>\n</table style=\"display:inline; margin:4px; border:none\">"
     },
     "metadata": {}
    }
   ],
   "source": [
    "df1 = get_small_df('2020', 'pga', 'drv')\n",
    "df2 = get_small_df('2020', 'pga', 'gir')\n",
    "df3 = pd.merge(df1, df2)\n",
    "display_side_by_side(df1, df2, df3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "# couldn't get this to work\n",
    "def disp_sd_by_sd(*dfs, caps=None):\n",
    "    stylers = []\n",
    "    space = \"\\xa0\" * 10\n",
    "    space = \"\"\n",
    "    cnt = 0\n",
    "    for df in dfs:\n",
    "        if caps:\n",
    "            stylers.append(df.style.set_table_attributes(\"style='display:inline'\").set_caption(caps[cnt]))\n",
    "        else:\n",
    "            stylers.append(df.style.set_table_attributes(\"style='display:inline'\"))\n",
    "        cnt += 1\n",
    "    str_html = \"\"\n",
    "    for sts in stylers:\n",
    "        if str_html:\n",
    "            str_html += space\n",
    "        str_html += sts._repr_html_()\n",
    "    display(str_html)\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "output_type": "display_data",
     "data": {
      "text/html": "<table style=\"display:inline; margin:4px; border:none\" border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>PLAYER NAME</th>\n      <th>drv</th>\n      <th>gir</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>Jason Day</td>\n      <td>295.0</td>\n      <td>76.39</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>Bryson DeChambeau</td>\n      <td>318.1</td>\n      <td>66.67</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>Harris English</td>\n      <td>297.1</td>\n      <td>52.78</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>Tony Finau</td>\n      <td>305.3</td>\n      <td>72.22</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>Tommy Fleetwood</td>\n      <td>305.5</td>\n      <td>63.89</td>\n    </tr>\n  </tbody>\n</table style=\"display:inline; margin:4px; border:none\"><table style=\"display:inline; margin:4px; border:none\" border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>PLAYER NAME</th>\n      <th>drv</th>\n      <th>gir</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>Jason Day</td>\n      <td>295.0</td>\n      <td>76.39</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>Bryson DeChambeau</td>\n      <td>318.1</td>\n      <td>66.67</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>Harris English</td>\n      <td>297.1</td>\n      <td>52.78</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>Tony Finau</td>\n      <td>305.3</td>\n      <td>72.22</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>Tommy Fleetwood</td>\n      <td>305.5</td>\n      <td>63.89</td>\n    </tr>\n  </tbody>\n</table style=\"display:inline; margin:4px; border:none\">"
     },
     "metadata": {}
    }
   ],
   "source": [
    "# can specify column(s) to use if desired\n",
    "df4 = pd.merge(df1, df2, on=['PLAYER NAME'], sort=False)\n",
    "display_side_by_side(df3, df4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "output_type": "display_data",
     "data": {
      "text/html": "<table style=\"display:inline; margin:4px; border:none\" border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>firstname</th>\n      <th>surname</th>\n      <th>drv1</th>\n      <th>gir1</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>John</td>\n      <td>Smith</td>\n      <td>295.0</td>\n      <td>76.39</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>Frank</td>\n      <td>Smith</td>\n      <td>318.1</td>\n      <td>66.67</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>Harry</td>\n      <td>Brown</td>\n      <td>297.1</td>\n      <td>52.78</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>Morris</td>\n      <td>White</td>\n      <td>305.3</td>\n      <td>72.22</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>Joseph</td>\n      <td>Black</td>\n      <td>305.5</td>\n      <td>63.89</td>\n    </tr>\n  </tbody>\n</table style=\"display:inline; margin:4px; border:none\"><table style=\"display:inline; margin:4px; border:none\" border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>firstname</th>\n      <th>surname</th>\n      <th>drv2</th>\n      <th>gir2</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>John</td>\n      <td>Smith</td>\n      <td>295.0</td>\n      <td>76.39</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>Harry</td>\n      <td>Brown</td>\n      <td>318.1</td>\n      <td>66.67</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>Joseph</td>\n      <td>Black</td>\n      <td>297.1</td>\n      <td>52.78</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>Morris</td>\n      <td>Brown</td>\n      <td>305.3</td>\n      <td>72.22</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>Arthur</td>\n      <td>White</td>\n      <td>305.5</td>\n      <td>63.89</td>\n    </tr>\n  </tbody>\n</table style=\"display:inline; margin:4px; border:none\">"
     },
     "metadata": {}
    },
    {
     "output_type": "display_data",
     "data": {
      "text/plain": "  firstname surname   drv1   gir1   drv2   gir2\n0      John   Smith  295.0  76.39  295.0  76.39\n1     Harry   Brown  297.1  52.78  318.1  66.67\n2    Joseph   Black  305.5  63.89  297.1  52.78",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>firstname</th>\n      <th>surname</th>\n      <th>drv1</th>\n      <th>gir1</th>\n      <th>drv2</th>\n      <th>gir2</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>John</td>\n      <td>Smith</td>\n      <td>295.0</td>\n      <td>76.39</td>\n      <td>295.0</td>\n      <td>76.39</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>Harry</td>\n      <td>Brown</td>\n      <td>297.1</td>\n      <td>52.78</td>\n      <td>318.1</td>\n      <td>66.67</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>Joseph</td>\n      <td>Black</td>\n      <td>305.5</td>\n      <td>63.89</td>\n      <td>297.1</td>\n      <td>52.78</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
     },
     "metadata": {}
    }
   ],
   "source": [
    "# let's try using multiple columns\n",
    "left = pd.DataFrame(\n",
    "    {\n",
    "        'firstname': ['John', 'Frank', 'Harry', 'Morris', 'Joseph'],\n",
    "        'surname': ['Smith', 'Smith', 'Brown', 'White', 'Black'],\n",
    "        'drv1': df1.drv.to_list(),\n",
    "        'gir1': df2.gir.to_list()\n",
    "    }\n",
    ")\n",
    "right = pd.DataFrame(\n",
    "    {\n",
    "        'firstname': ['John', 'Harry', 'Joseph', 'Morris', 'Arthur'],\n",
    "        'surname': ['Smith', 'Brown', 'Black', 'Brown', 'White'],\n",
    "        'drv2': df1.drv.to_list(),\n",
    "        'gir2': df2.gir.to_list()\n",
    "    }\n",
    ")\n",
    "df_comb = pd.merge(left, right, on=['firstname', 'surname'])\n",
    "# default join is \"inner\", i.e. intersection\n",
    "display_side_by_side(left, right)\n",
    "display(df_comb)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "output_type": "display_data",
     "data": {
      "text/html": "<table style=\"display:inline; margin:4px; border:none\" border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>firstname</th>\n      <th>surname</th>\n      <th>drv1</th>\n      <th>gir1</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>John</td>\n      <td>Smith</td>\n      <td>295.0</td>\n      <td>76.39</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>Frank</td>\n      <td>Smith</td>\n      <td>318.1</td>\n      <td>66.67</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>Harry</td>\n      <td>Brown</td>\n      <td>297.1</td>\n      <td>52.78</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>Morris</td>\n      <td>White</td>\n      <td>305.3</td>\n      <td>72.22</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>Joseph</td>\n      <td>Black</td>\n      <td>305.5</td>\n      <td>63.89</td>\n    </tr>\n  </tbody>\n</table style=\"display:inline; margin:4px; border:none\"><table style=\"display:inline; margin:4px; border:none\" border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>firstname</th>\n      <th>surname</th>\n      <th>drv2</th>\n      <th>gir2</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>John</td>\n      <td>Smith</td>\n      <td>295.0</td>\n      <td>76.39</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>Harry</td>\n      <td>Brown</td>\n      <td>318.1</td>\n      <td>66.67</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>Joseph</td>\n      <td>Black</td>\n      <td>297.1</td>\n      <td>52.78</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>Morris</td>\n      <td>Brown</td>\n      <td>305.3</td>\n      <td>72.22</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>Arthur</td>\n      <td>White</td>\n      <td>305.5</td>\n      <td>63.89</td>\n    </tr>\n  </tbody>\n</table style=\"display:inline; margin:4px; border:none\">"
     },
     "metadata": {}
    },
    {
     "output_type": "display_data",
     "data": {
      "text/plain": "  firstname surname   drv1   gir1   drv2   gir2\n0      John   Smith  295.0  76.39  295.0  76.39\n1     Frank   Smith  318.1  66.67    NaN    NaN\n2     Harry   Brown  297.1  52.78  318.1  66.67\n3    Morris   White  305.3  72.22    NaN    NaN\n4    Joseph   Black  305.5  63.89  297.1  52.78\n5    Morris   Brown    NaN    NaN  305.3  72.22\n6    Arthur   White    NaN    NaN  305.5  63.89",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>firstname</th>\n      <th>surname</th>\n      <th>drv1</th>\n      <th>gir1</th>\n      <th>drv2</th>\n      <th>gir2</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>John</td>\n      <td>Smith</td>\n      <td>295.0</td>\n      <td>76.39</td>\n      <td>295.0</td>\n      <td>76.39</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>Frank</td>\n      <td>Smith</td>\n      <td>318.1</td>\n      <td>66.67</td>\n      <td>NaN</td>\n      <td>NaN</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>Harry</td>\n      <td>Brown</td>\n      <td>297.1</td>\n      <td>52.78</td>\n      <td>318.1</td>\n      <td>66.67</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>Morris</td>\n      <td>White</td>\n      <td>305.3</td>\n      <td>72.22</td>\n      <td>NaN</td>\n      <td>NaN</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>Joseph</td>\n      <td>Black</td>\n      <td>305.5</td>\n      <td>63.89</td>\n      <td>297.1</td>\n      <td>52.78</td>\n    </tr>\n    <tr>\n      <th>5</th>\n      <td>Morris</td>\n      <td>Brown</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>305.3</td>\n      <td>72.22</td>\n    </tr>\n    <tr>\n      <th>6</th>\n      <td>Arthur</td>\n      <td>White</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>305.5</td>\n      <td>63.89</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
     },
     "metadata": {}
    }
   ],
   "source": [
    "# specify 'outer' to get a union\n",
    "df_union = df_comb = pd.merge(left, right, on=['firstname', 'surname'], sort=False, how='outer')\n",
    "display_side_by_side(left, right)\n",
    "display(df_union)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "output_type": "display_data",
     "data": {
      "text/html": "<table style=\"display:inline; margin:4px; border:none\" border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>firstname</th>\n      <th>surname</th>\n      <th>drv1</th>\n      <th>gir1</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>John</td>\n      <td>Smith</td>\n      <td>295.0</td>\n      <td>76.39</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>Frank</td>\n      <td>Smith</td>\n      <td>318.1</td>\n      <td>66.67</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>Harry</td>\n      <td>Brown</td>\n      <td>297.1</td>\n      <td>52.78</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>Morris</td>\n      <td>White</td>\n      <td>305.3</td>\n      <td>72.22</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>Joseph</td>\n      <td>Black</td>\n      <td>305.5</td>\n      <td>63.89</td>\n    </tr>\n  </tbody>\n</table style=\"display:inline; margin:4px; border:none\"><table style=\"display:inline; margin:4px; border:none\" border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>firstname</th>\n      <th>surname</th>\n      <th>drv2</th>\n      <th>gir2</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>John</td>\n      <td>Smith</td>\n      <td>295.0</td>\n      <td>76.39</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>Harry</td>\n      <td>Brown</td>\n      <td>318.1</td>\n      <td>66.67</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>Joseph</td>\n      <td>Black</td>\n      <td>297.1</td>\n      <td>52.78</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>Morris</td>\n      <td>Brown</td>\n      <td>305.3</td>\n      <td>72.22</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>Arthur</td>\n      <td>White</td>\n      <td>305.5</td>\n      <td>63.89</td>\n    </tr>\n  </tbody>\n</table style=\"display:inline; margin:4px; border:none\">"
     },
     "metadata": {}
    },
    {
     "output_type": "display_data",
     "data": {
      "text/plain": "  firstname surname   drv1   gir1   drv2   gir2\n0      John   Smith  295.0  76.39  295.0  76.39\n1     Frank   Smith  318.1  66.67    NaN    NaN\n2     Harry   Brown  297.1  52.78  318.1  66.67\n3    Morris   White  305.3  72.22    NaN    NaN\n4    Joseph   Black  305.5  63.89  297.1  52.78",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>firstname</th>\n      <th>surname</th>\n      <th>drv1</th>\n      <th>gir1</th>\n      <th>drv2</th>\n      <th>gir2</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>John</td>\n      <td>Smith</td>\n      <td>295.0</td>\n      <td>76.39</td>\n      <td>295.0</td>\n      <td>76.39</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>Frank</td>\n      <td>Smith</td>\n      <td>318.1</td>\n      <td>66.67</td>\n      <td>NaN</td>\n      <td>NaN</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>Harry</td>\n      <td>Brown</td>\n      <td>297.1</td>\n      <td>52.78</td>\n      <td>318.1</td>\n      <td>66.67</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>Morris</td>\n      <td>White</td>\n      <td>305.3</td>\n      <td>72.22</td>\n      <td>NaN</td>\n      <td>NaN</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>Joseph</td>\n      <td>Black</td>\n      <td>305.5</td>\n      <td>63.89</td>\n      <td>297.1</td>\n      <td>52.78</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
     },
     "metadata": {}
    }
   ],
   "source": [
    "# or we can use the keys from on frame or the other\n",
    "# let's try the left\n",
    "df_left_join = df_comb = pd.merge(left, right, on=['firstname', 'surname'], sort=False, how='left')\n",
    "display_side_by_side(left, right)\n",
    "display(df_left_join)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "output_type": "display_data",
     "data": {
      "text/html": "<table style=\"display:inline; margin:4px; border:none\" border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>name</th>\n      <th>department</th>\n      <th>location</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>John</td>\n      <td>finance</td>\n      <td>Vancouver</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>Frank</td>\n      <td>sales</td>\n      <td>Vancouver</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>Harry</td>\n      <td>engineering</td>\n      <td>Vancouver</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>Morris</td>\n      <td>sales</td>\n      <td>Calgary</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>Joseph</td>\n      <td>engineering</td>\n      <td>Calgary</td>\n    </tr>\n  </tbody>\n</table style=\"display:inline; margin:4px; border:none\"><table style=\"display:inline; margin:4px; border:none\" border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>dept</th>\n      <th>supervisor</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>engineering</td>\n      <td>George</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>finance</td>\n      <td>Helen</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>sales</td>\n      <td>Barbara</td>\n    </tr>\n  </tbody>\n</table style=\"display:inline; margin:4px; border:none\">"
     },
     "metadata": {}
    },
    {
     "output_type": "display_data",
     "data": {
      "text/plain": "     name   department   location         dept supervisor\n0    John      finance  Vancouver      finance      Helen\n1   Frank        sales  Vancouver        sales    Barbara\n2  Morris        sales    Calgary        sales    Barbara\n3   Harry  engineering  Vancouver  engineering     George\n4  Joseph  engineering    Calgary  engineering     George",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>name</th>\n      <th>department</th>\n      <th>location</th>\n      <th>dept</th>\n      <th>supervisor</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>John</td>\n      <td>finance</td>\n      <td>Vancouver</td>\n      <td>finance</td>\n      <td>Helen</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>Frank</td>\n      <td>sales</td>\n      <td>Vancouver</td>\n      <td>sales</td>\n      <td>Barbara</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>Morris</td>\n      <td>sales</td>\n      <td>Calgary</td>\n      <td>sales</td>\n      <td>Barbara</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>Harry</td>\n      <td>engineering</td>\n      <td>Vancouver</td>\n      <td>engineering</td>\n      <td>George</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>Joseph</td>\n      <td>engineering</td>\n      <td>Calgary</td>\n      <td>engineering</td>\n      <td>George</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
     },
     "metadata": {}
    }
   ],
   "source": [
    "left = pd.DataFrame(\n",
    "    {\n",
    "        \n",
    "        'name': ['John', 'Frank', 'Harry', 'Morris', 'Joseph'],\n",
    "        'department': ['finance', 'sales', 'engineering', 'sales', 'engineering'],\n",
    "        'location': ['Vancouver', 'Vancouver', 'Vancouver', 'Calgary', 'Calgary'],\n",
    "    }\n",
    ")\n",
    "right = pd.DataFrame(\n",
    "    {\n",
    "        'dept': ['engineering', 'finance', 'sales'],\n",
    "        'supervisor': ['George', 'Helen', 'Barbara']\n",
    "    }\n",
    ")\n",
    "display_side_by_side(left, right)\n",
    "display(pd.merge(left, right, left_on='department', right_on='dept'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ]
}