{ "metadata": { "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.2" }, "orig_nbformat": 2, "kernelspec": { "name": "python392jvsc74a57bd0a27d3f2bf68df5402465348834a2195030d3fc5bfc8e594e2a17c8c7e2447c85", "display_name": "Python 3.9.2 64-bit ('ds-3.9': conda)" } }, "nbformat": 4, "nbformat_minor": 2, "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "from pathlib import Path\n", "import re\n", "import golf_stats_lib as gs" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "output_type": "stream", "text": [ "\u001b[1;31mType:\u001b[0m module\n", "\u001b[1;31mString form:\u001b[0m \n", "\u001b[1;31mFile:\u001b[0m r:\\learn\\ds_intro\\golf_stats_lib.py\n", "\u001b[1;31mDocstring:\u001b[0m \n", "Module to provide functions related to obtaining statistics from PGA web site and saving to local CSV files.\n", " As well as functions to retrieve the data in the CSVs and build DataFrames to hold a specified dataset.\n", "\n", "File: ./ds_intro/golf_stats_lib.py\n", "\n", " - tied to various Too Old To Code blog posts\n", " - function(s) to obtain data from PGA web site and save to local CSV files\n", " - functions(s) to process the CSV files into DataFrames in various ways\n", " - built on Jypyter notebook pandas_play_8.ipynb\n", " \n", "Module level and/or global variables:\n", "----\n", "\n", "These may require updating if you add events or stat types.\n", "\n", "events - dictionary of currently available events providing PGA Tour ids\n", "stats - dictionary of currently available stat types providing PGA Tour ids\n", "st_cols - for each stat type, a list of table columns to save to csv\n", "d_dir - directory to which to save stat csv files\n", "\n", "Functions:\n", "\n", " get_csv_nm(t_yr, t_id, p_st)\n", " stats_2_csv(tyrs, tids, psts)\n", " csv_2_df_base(t_yr, t_id, p_st)\n", " csv_2_df(t_yr, t_id, p_st)\n", " tourney_2_df(t_yr, t_id, psts)\n", " year_2_df(t_yr, tids, psts)\n", " golf_stats_2_df(tyrs, tids, psts)\n", " sort_player_name(df_in)\n" ], "name": "stdout" } ], "source": [ "gs?" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "output_type": "stream", "text": [ "\u001b[1;31mSignature:\u001b[0m \u001b[0mgs\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtourney_2_df\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mt_yr\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mt_id\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mpsts\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;31mDocstring:\u001b[0m\n", "Combine all requested stats for a given tournament and year into a single DataFrame. \n", "Return DataFrame.\n", "Useage: tourney_2_df(t_yr, t_id, p_sts)\n", " where t_yr = tournament year\n", " t_id = tournament id (e.g. 'pga')\n", " psts = list of player stats (e.g. ['drv', 'gir'])\n", "\u001b[1;31mFile:\u001b[0m r:\\learn\\ds_intro\\golf_stats_lib.py\n", "\u001b[1;31mType:\u001b[0m function\n" ], "name": "stdout" } ], "source": [ "gs.tourney_2_df?" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "output_type": "display_data", "data": { "text/plain": " drv\nPLAYER NAME \nDustin Johnson 335.6\nLuke List 319.0\nPhil Mickelson 318.6\nRory McIlroy 317.4\nLucas Bjerregaard 316.6\n... ...\nMarty Jertson 281.9\nRich Beem 281.9\nHenrik Stenson 281.0\nBrandt Snedeker 279.8\nAndrew Putnam 274.1\n\n[82 rows x 1 columns]", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
drv
PLAYER NAME
Dustin Johnson335.6
Luke List319.0
Phil Mickelson318.6
Rory McIlroy317.4
Lucas Bjerregaard316.6
......
Marty Jertson281.9
Rich Beem281.9
Henrik Stenson281.0
Brandt Snedeker279.8
Andrew Putnam274.1
\n

82 rows × 1 columns

\n
" }, "metadata": {} } ], "source": [ "df1 = gs.csv_2_df_base('2019', 'pga', 'drv')\n", "display(df1)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "output_type": "display_data", "data": { "text/plain": " drv gir\nPLAYER NAME \nDustin Johnson 335.6 NaN\nLuke List 319.0 NaN\nPhil Mickelson 318.6 NaN\nRory McIlroy 317.4 NaN\nLucas Bjerregaard 316.6 NaN\n... ... ...\nRafa Cabrera Bello NaN 47.22\nRyan Vermeer NaN 47.22\nPat Perez NaN 47.22\nRich Beem NaN 47.22\nMarty Jertson NaN 41.67\n\n[164 rows x 2 columns]", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
drvgir
PLAYER NAME
Dustin Johnson335.6NaN
Luke List319.0NaN
Phil Mickelson318.6NaN
Rory McIlroy317.4NaN
Lucas Bjerregaard316.6NaN
.........
Rafa Cabrera BelloNaN47.22
Ryan VermeerNaN47.22
Pat PerezNaN47.22
Rich BeemNaN47.22
Marty JertsonNaN41.67
\n

164 rows × 2 columns

\n
" }, "metadata": {} } ], "source": [ "df2 = gs.csv_2_df_base('2019', 'pga', 'gir')\n", "df3 = pd.concat([df1, df2])\n", "display(df3)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "output_type": "error", "ename": "ValueError", "evalue": "cannot reindex from a duplicate axis", "traceback": [ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)", "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[0;32m 3\u001b[0m \u001b[1;31m#print(type(df3.index[0]))\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 4\u001b[0m \u001b[1;31m#print(df3.index)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 5\u001b[1;33m \u001b[0mdf_sort\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mgs\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msort_player_name\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdf3\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 6\u001b[0m \u001b[0mdisplay\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdf_sort\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 7\u001b[0m \u001b[1;31m# OOPS! don't currently have a fix\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;32mr:\\learn\\ds_intro\\golf_stats_lib.py\u001b[0m in \u001b[0;36msort_player_name\u001b[1;34m(df_in)\u001b[0m\n\u001b[0;32m 240\u001b[0m \u001b[0mndx_sort2\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0msorted\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdf_in\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mindex\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mkey\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mlambda\u001b[0m \u001b[0mx\u001b[0m\u001b[1;33m:\u001b[0m \u001b[0mre\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msplit\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34mr'\\W+'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mx\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;33m-\u001b[0m\u001b[1;36m1\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 241\u001b[0m \u001b[1;31m#print(f'sorted: {ndx_sort2[:5]}')\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 242\u001b[1;33m \u001b[0mdf_sort\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mdf_in\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mreindex\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mndx_sort2\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 243\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mdf_sort\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;32mE:\\appDev\\Miniconda3\\envs\\ds-3.9\\lib\\site-packages\\pandas\\util\\_decorators.py\u001b[0m in \u001b[0;36mwrapper\u001b[1;34m(*args, **kwargs)\u001b[0m\n\u001b[0;32m 310\u001b[0m \u001b[1;33m@\u001b[0m\u001b[0mwraps\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mfunc\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 311\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mwrapper\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;33m->\u001b[0m \u001b[0mCallable\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;33m...\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mAny\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 312\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mfunc\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 313\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 314\u001b[0m \u001b[0mkind\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0minspect\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mParameter\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mPOSITIONAL_OR_KEYWORD\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;32mE:\\appDev\\Miniconda3\\envs\\ds-3.9\\lib\\site-packages\\pandas\\core\\frame.py\u001b[0m in \u001b[0;36mreindex\u001b[1;34m(self, *args, **kwargs)\u001b[0m\n\u001b[0;32m 4174\u001b[0m \u001b[0mkwargs\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mpop\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"axis\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 4175\u001b[0m \u001b[0mkwargs\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mpop\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"labels\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 4176\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0msuper\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mreindex\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 4177\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 4178\u001b[0m def drop(\n", "\u001b[1;32mE:\\appDev\\Miniconda3\\envs\\ds-3.9\\lib\\site-packages\\pandas\\core\\generic.py\u001b[0m in \u001b[0;36mreindex\u001b[1;34m(self, *args, **kwargs)\u001b[0m\n\u001b[0;32m 4809\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 4810\u001b[0m \u001b[1;31m# perform the reindex on the axes\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 4811\u001b[1;33m return self._reindex_axes(\n\u001b[0m\u001b[0;32m 4812\u001b[0m \u001b[0maxes\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mlevel\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mlimit\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mtolerance\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mmethod\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mfill_value\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcopy\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 4813\u001b[0m ).__finalize__(self, method=\"reindex\")\n", "\u001b[1;32mE:\\appDev\\Miniconda3\\envs\\ds-3.9\\lib\\site-packages\\pandas\\core\\frame.py\u001b[0m in \u001b[0;36m_reindex_axes\u001b[1;34m(self, axes, level, limit, tolerance, method, fill_value, copy)\u001b[0m\n\u001b[0;32m 4020\u001b[0m \u001b[0mindex\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0maxes\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m\"index\"\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 4021\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mindex\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 4022\u001b[1;33m frame = frame._reindex_index(\n\u001b[0m\u001b[0;32m 4023\u001b[0m \u001b[0mindex\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mmethod\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcopy\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mlevel\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mfill_value\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mlimit\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mtolerance\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 4024\u001b[0m )\n", "\u001b[1;32mE:\\appDev\\Miniconda3\\envs\\ds-3.9\\lib\\site-packages\\pandas\\core\\frame.py\u001b[0m in \u001b[0;36m_reindex_index\u001b[1;34m(self, new_index, method, copy, level, fill_value, limit, tolerance)\u001b[0m\n\u001b[0;32m 4039\u001b[0m \u001b[0mnew_index\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mmethod\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mmethod\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mlevel\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mlevel\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mlimit\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mlimit\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mtolerance\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mtolerance\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 4040\u001b[0m )\n\u001b[1;32m-> 4041\u001b[1;33m return self._reindex_with_indexers(\n\u001b[0m\u001b[0;32m 4042\u001b[0m \u001b[1;33m{\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m:\u001b[0m \u001b[1;33m[\u001b[0m\u001b[0mnew_index\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mindexer\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m}\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 4043\u001b[0m \u001b[0mcopy\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mcopy\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;32mE:\\appDev\\Miniconda3\\envs\\ds-3.9\\lib\\site-packages\\pandas\\core\\generic.py\u001b[0m in \u001b[0;36m_reindex_with_indexers\u001b[1;34m(self, reindexers, fill_value, copy, allow_dups)\u001b[0m\n\u001b[0;32m 4875\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 4876\u001b[0m \u001b[1;31m# TODO: speed up on homogeneous DataFrame objects\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 4877\u001b[1;33m new_data = new_data.reindex_indexer(\n\u001b[0m\u001b[0;32m 4878\u001b[0m \u001b[0mindex\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 4879\u001b[0m \u001b[0mindexer\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;32mE:\\appDev\\Miniconda3\\envs\\ds-3.9\\lib\\site-packages\\pandas\\core\\internals\\managers.py\u001b[0m in \u001b[0;36mreindex_indexer\u001b[1;34m(self, new_axis, indexer, axis, fill_value, allow_dups, copy, consolidate, only_slice)\u001b[0m\n\u001b[0;32m 1299\u001b[0m \u001b[1;31m# some axes don't allow reindexing with dups\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1300\u001b[0m \u001b[1;32mif\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[0mallow_dups\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1301\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0maxes\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0maxis\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_can_reindex\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mindexer\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1302\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1303\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0maxis\u001b[0m \u001b[1;33m>=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mndim\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;32mE:\\appDev\\Miniconda3\\envs\\ds-3.9\\lib\\site-packages\\pandas\\core\\indexes\\base.py\u001b[0m in \u001b[0;36m_can_reindex\u001b[1;34m(self, indexer)\u001b[0m\n\u001b[0;32m 3474\u001b[0m \u001b[1;31m# trying to reindex on an axis with duplicates\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 3475\u001b[0m \u001b[1;32mif\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_index_as_unique\u001b[0m \u001b[1;32mand\u001b[0m \u001b[0mlen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mindexer\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 3476\u001b[1;33m \u001b[1;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"cannot reindex from a duplicate axis\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 3477\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 3478\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mreindex\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mtarget\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mmethod\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mNone\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mlevel\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mNone\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mlimit\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mNone\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mtolerance\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mNone\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;31mValueError\u001b[0m: cannot reindex from a duplicate axis" ] } ], "source": [ "# not what I would expect\n", "# let's sort the dataframe on the player index\n", "#print(type(df3.index[0]))\n", "#print(df3.index)\n", "df_sort = gs.sort_player_name(df3)\n", "display(df_sort)\n", "# OOPS! don't currently have a fix" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "output_type": "display_data", "data": { "text/plain": " drv gir\nPLAYER NAME \nAbraham Ancer 285.6 56.94\nKiradech Aphibarnrat 284.0 54.17\nRich Beem 281.9 47.22\nRafa Cabrera Bello 292.3 47.22\nDaniel Berger 301.5 55.56\n... ... ...\nJimmy Walker 304.6 69.44\nMatt Wallace 300.3 56.94\nDanny Willett 307.0 69.44\nAaron Wise 301.0 59.72\nGary Woodland 311.4 70.83\n\n[82 rows x 2 columns]", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
drvgir
PLAYER NAME
Abraham Ancer285.656.94
Kiradech Aphibarnrat284.054.17
Rich Beem281.947.22
Rafa Cabrera Bello292.347.22
Daniel Berger301.555.56
.........
Jimmy Walker304.669.44
Matt Wallace300.356.94
Danny Willett307.069.44
Aaron Wise301.059.72
Gary Woodland311.470.83
\n

82 rows × 2 columns

\n
" }, "metadata": {} } ], "source": [ "df3 = pd.concat([df1, df2], axis=1)\n", "df_sort = gs.sort_player_name(df3)\n", "display(df_sort)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "output_type": "display_data", "data": { "text/plain": " drv gir gir\nPLAYER NAME \nAbraham Ancer 285.6 56.94 NaN\nKiradech Aphibarnrat 284.0 54.17 NaN\nRyan Armour NaN NaN 55.56\nRich Beem 281.9 47.22 NaN\nRafa Cabrera Bello 292.3 47.22 58.33\n... ... ... ...\nBoo Weekley NaN NaN 59.72\nRichy Werenski NaN NaN 47.22\nDanny Willett 307.0 69.44 NaN\nAaron Wise 301.0 59.72 NaN\nGary Woodland 311.4 70.83 NaN\n\n[124 rows x 3 columns]", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
drvgirgir
PLAYER NAME
Abraham Ancer285.656.94NaN
Kiradech Aphibarnrat284.054.17NaN
Ryan ArmourNaNNaN55.56
Rich Beem281.947.22NaN
Rafa Cabrera Bello292.347.2258.33
............
Boo WeekleyNaNNaN59.72
Richy WerenskiNaNNaN47.22
Danny Willett307.069.44NaN
Aaron Wise301.059.72NaN
Gary Woodland311.470.83NaN
\n

124 rows × 3 columns

\n
" }, "metadata": {} } ], "source": [ "df4 = gs.csv_2_df_base('2019','rbch','gir')\n", "#display(df4)\n", "df5 = pd.concat([df3, df4], axis=1)\n", "#display(df5)\n", "df_sort = gs.sort_player_name(df5)\n", "display(df_sort)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "output_type": "display_data", "data": { "text/plain": " 2019\n pga\n drv\nPLAYER NAME \nAbraham Ancer 285.6\nKiradech Aphibarnrat 284.0\nRich Beem 281.9\nRafa Cabrera Bello 292.3\nDaniel Berger 301.5\n... ...\nJimmy Walker 304.6\nMatt Wallace 300.3\nDanny Willett 307.0\nAaron Wise 301.0\nGary Woodland 311.4\n\n[82 rows x 1 columns]", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
2019
pga
drv
PLAYER NAME
Abraham Ancer285.6
Kiradech Aphibarnrat284.0
Rich Beem281.9
Rafa Cabrera Bello292.3
Daniel Berger301.5
......
Jimmy Walker304.6
Matt Wallace300.3
Danny Willett307.0
Aaron Wise301.0
Gary Woodland311.4
\n

82 rows × 1 columns

\n
" }, "metadata": {} } ], "source": [ "# unfortunately the above does not tell us to which tournament and year the stat belongs\n", "# and likely not something we want to keep track of separately\n", "# So let's try adding that info to each csv dataframe\n", "df6 = gs.csv_2_df_base('2019', 'pga', 'drv')\n", "cols = pd.MultiIndex.from_tuples([('2019', 'pga', 'drv')])\n", "#midx = pd.MultiIndex(levels=[['zero', 'one'], ['x','y']], labels=[[1,1,0,],[1,0,1,]])\n", "df6.columns = cols\n", "df6 = gs.sort_player_name(df6)\n", "display(df6)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "output_type": "display_data", "data": { "text/plain": " 2019 \n pga \n drv gir\nPLAYER NAME \nAbraham Ancer 285.6 56.94\nKiradech Aphibarnrat 284.0 54.17\nRich Beem 281.9 47.22\nRafa Cabrera Bello 292.3 47.22\nDaniel Berger 301.5 55.56", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
2019
pga
drvgir
PLAYER NAME
Abraham Ancer285.656.94
Kiradech Aphibarnrat284.054.17
Rich Beem281.947.22
Rafa Cabrera Bello292.347.22
Daniel Berger301.555.56
\n
" }, "metadata": {} }, { "output_type": "display_data", "data": { "text/plain": "2019 pga drv 317.40\n gir 65.28\nName: Rory McIlroy, dtype: float64" }, "metadata": {} } ], "source": [ "\n", "# now another stat same year and tourney\n", "df7 = gs.csv_2_df_base('2019', 'pga', 'gir')\n", "cols = pd.MultiIndex.from_tuples([('2019', 'pga', 'gir')])\n", "df7.columns = cols\n", "df7 = gs.sort_player_name(df7)\n", "df_comb1 = pd.concat([df6, df7], axis=1)\n", "display(df_comb1.head())\n", "display(df_comb1.loc['Rory McIlroy'])" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "output_type": "display_data", "data": { "text/plain": " 2019 \n pga rbch\n drv gir gir\nPLAYER NAME \nAbraham Ancer 285.6 56.94 NaN\nKiradech Aphibarnrat 284.0 54.17 NaN\nRyan Armour NaN NaN 55.56\nRich Beem 281.9 47.22 NaN\nRafa Cabrera Bello 292.3 47.22 58.33\n... ... ... ...\nBoo Weekley NaN NaN 59.72\nRichy Werenski NaN NaN 47.22\nDanny Willett 307.0 69.44 NaN\nAaron Wise 301.0 59.72 NaN\nGary Woodland 311.4 70.83 NaN\n\n[124 rows x 3 columns]", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
2019
pgarbch
drvgirgir
PLAYER NAME
Abraham Ancer285.656.94NaN
Kiradech Aphibarnrat284.054.17NaN
Ryan ArmourNaNNaN55.56
Rich Beem281.947.22NaN
Rafa Cabrera Bello292.347.2258.33
............
Boo WeekleyNaNNaN59.72
Richy WerenskiNaNNaN47.22
Danny Willett307.069.44NaN
Aaron Wise301.059.72NaN
Gary Woodland311.470.83NaN
\n

124 rows × 3 columns

\n
" }, "metadata": {} } ], "source": [ "# and a different tournament\n", "df8 = gs.csv_2_df_base('2019', 'rbch', 'gir')\n", "cols = pd.MultiIndex.from_tuples([('2019', 'rbch', 'gir')])\n", "df8.columns = cols\n", "df8 = gs.sort_player_name(df8)\n", "df_comb2 = pd.concat([df6, df7, df8], axis=1)\n", "df_comb2 = gs.sort_player_name(df_comb2)\n", "display(df_comb2)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "year 2019 \n", "event pga rbch\n", "stat gir gir\n", "PLAYER NAME \n", "Abraham Ancer 56.94 NaN\n", "Kiradech Aphibarnrat 54.17 NaN\n", "Ryan Armour NaN 55.56\n", "Rich Beem 47.22 NaN\n", "Rafa Cabrera Bello 47.22 58.33\n", "... ... ...\n", "Boo Weekley NaN 59.72\n", "Richy Werenski NaN 47.22\n", "Danny Willett 69.44 NaN\n", "Aaron Wise 59.72 NaN\n", "Gary Woodland 70.83 NaN\n", "\n", "[124 rows x 2 columns]" ], "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
year2019
eventpgarbch
statgirgir
PLAYER NAME
Abraham Ancer56.94NaN
Kiradech Aphibarnrat54.17NaN
Ryan ArmourNaN55.56
Rich Beem47.22NaN
Rafa Cabrera Bello47.2258.33
.........
Boo WeekleyNaN59.72
Richy WerenskiNaN47.22
Danny Willett69.44NaN
Aaron Wise59.72NaN
Gary Woodland70.83NaN
\n

124 rows × 2 columns

\n
" }, "metadata": {}, "execution_count": 12 } ], "source": [ "#df_comb2.columns\n", "df_comb2.columns.names = ['year', 'event', 'stat']\n", "filter = df_comb2.columns.get_level_values('stat') == 'gir'\n", "df_comb2.iloc[:, filter].dropna(how='all')" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "output_type": "display_data", "data": { "text/plain": "year 2019 2020\nevent pga rbch pga\nstat drv gir gir drv\nPLAYER NAME \nByeong Hun An NaN NaN NaN 286.6\nAbraham Ancer 285.6 56.94 NaN 295.6\nKiradech Aphibarnrat 284.0 54.17 NaN NaN\nRyan Armour NaN NaN 55.56 NaN\nRich Beem 281.9 47.22 NaN NaN\n... ... ... ... ...\nDanny Willett 307.0 69.44 NaN NaN\nAaron Wise 301.0 59.72 NaN NaN\nMatthew Wolff NaN NaN NaN 303.8\nGary Woodland 311.4 70.83 NaN 293.0\nTiger Woods NaN NaN NaN 304.0\n\n[153 rows x 4 columns]", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
year20192020
eventpgarbchpga
statdrvgirgirdrv
PLAYER NAME
Byeong Hun AnNaNNaNNaN286.6
Abraham Ancer285.656.94NaN295.6
Kiradech Aphibarnrat284.054.17NaNNaN
Ryan ArmourNaNNaN55.56NaN
Rich Beem281.947.22NaNNaN
...............
Danny Willett307.069.44NaNNaN
Aaron Wise301.059.72NaNNaN
Matthew WolffNaNNaNNaN303.8
Gary Woodland311.470.83NaN293.0
Tiger WoodsNaNNaNNaN304.0
\n

153 rows × 4 columns

\n
" }, "metadata": {} } ], "source": [ "# add another year\n", "# was going to try the append() method, but...\n", "df9 = gs.csv_2_df_base('2020', 'pga', 'drv')\n", "cols = pd.MultiIndex.from_tuples([('2020', 'pga', 'drv')])\n", "#midx = pd.MultiIndex(levels=[['zero', 'one'], ['x','y']], labels=[[1,1,0,],[1,0,1,]])\n", "df9.columns = cols\n", "df_comb2 = pd.concat([df6, df7, df8, df9], axis=1)\n", "df_comb2.columns.names = ['year', 'event', 'stat']\n", "df_comb2 = gs.sort_player_name(df_comb2)\n", "display(df_comb2)" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "year 2019 2020\n", "event pga pga\n", "stat drv drv\n", "PLAYER NAME \n", "Abraham Ancer 285.6 295.6\n", "Daniel Berger 301.5 291.9\n", "Patrick Cantlay 314.6 295.3\n", "Paul Casey 305.6 295.8\n", "Cameron Champ 313.8 321.1\n", "Joel Dahmen 295.8 289.5\n", "Jason Day 310.0 295.0\n", "Tony Finau 314.3 305.3\n", "Tommy Fleetwood 301.3 305.5\n", "Emiliano Grillo 302.9 287.4\n", "Adam Hadwin 285.8 282.9\n", "Billy Horschel 296.1 284.4\n", "Harold Varner III 308.3 286.8\n", "Dustin Johnson 335.6 305.0\n", "Sung Kang 308.1 292.9\n", "Kurt Kitayama 315.3 302.9\n", "Brooks Koepka 313.0 295.8\n", "Danny Lee 306.5 291.0\n", "Haotong Li 293.9 304.1\n", "Luke List 319.0 299.6\n", "Adam Long 291.5 285.4\n", "Shane Lowry 300.0 287.6\n", "Joost Luiten 284.9 286.0\n", "Hideki Matsuyama 303.5 285.3\n", "Rory McIlroy 317.4 312.5\n", "Phil Mickelson 318.6 293.1\n", "Alex Noren 298.0 289.0\n", "Louis Oosthuizen 306.0 284.6\n", "J.T. Poston 291.5 291.6\n", "Chez Reavie 289.1 274.5\n", "Erik van Rooyen 305.3 295.3\n", "Justin Rose 309.6 294.4\n", "Xander Schauffele 307.0 302.3\n", "Adam Scott 301.5 299.1\n", "Webb Simpson 295.3 281.1\n", "Cameron Smith 302.5 286.3\n", "Brandt Snedeker 279.8 280.9\n", "Jordan Spieth 303.3 295.9\n", "Michael Lorenzo-Vera 302.0 287.6\n", "Matt Wallace 300.3 286.6\n", "Gary Woodland 311.4 293.0" ], "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
year20192020
eventpgapga
statdrvdrv
PLAYER NAME
Abraham Ancer285.6295.6
Daniel Berger301.5291.9
Patrick Cantlay314.6295.3
Paul Casey305.6295.8
Cameron Champ313.8321.1
Joel Dahmen295.8289.5
Jason Day310.0295.0
Tony Finau314.3305.3
Tommy Fleetwood301.3305.5
Emiliano Grillo302.9287.4
Adam Hadwin285.8282.9
Billy Horschel296.1284.4
Harold Varner III308.3286.8
Dustin Johnson335.6305.0
Sung Kang308.1292.9
Kurt Kitayama315.3302.9
Brooks Koepka313.0295.8
Danny Lee306.5291.0
Haotong Li293.9304.1
Luke List319.0299.6
Adam Long291.5285.4
Shane Lowry300.0287.6
Joost Luiten284.9286.0
Hideki Matsuyama303.5285.3
Rory McIlroy317.4312.5
Phil Mickelson318.6293.1
Alex Noren298.0289.0
Louis Oosthuizen306.0284.6
J.T. Poston291.5291.6
Chez Reavie289.1274.5
Erik van Rooyen305.3295.3
Justin Rose309.6294.4
Xander Schauffele307.0302.3
Adam Scott301.5299.1
Webb Simpson295.3281.1
Cameron Smith302.5286.3
Brandt Snedeker279.8280.9
Jordan Spieth303.3295.9
Michael Lorenzo-Vera302.0287.6
Matt Wallace300.3286.6
Gary Woodland311.4293.0
\n
" }, "metadata": {}, "execution_count": 14 } ], "source": [ "filter = df_comb2.columns.get_level_values('stat') == 'drv'\n", "df_comb2.iloc[:, filter].dropna(how='any')" ] }, { "source": [ "Let's have a look at pd.merge() and joins." ], "cell_type": "markdown", "metadata": {} }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "from IPython.display import display_html\n", "def display_side_by_side(*args):\n", " html_str=''\n", " for df in args:\n", " html_str += df.to_html()\n", " display_html(html_str.replace('table','table style=\"display:inline; margin:4px; border:none\"'),raw=True)\n", " \n", " #https://stackoverflow.com/questions/38783027/jupyter-notebook-display-two-pandas-tables-side-by-side" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "def get_small_df(t_yr, t_id, p_st):\n", " col_nms = {'drv': {'AVG.': 'drv'},\n", " 'gir': {'%': 'gir'},\n", " 'scramble': {'%': 'scramble'}\n", " }\n", " csv_fl = gs.get_csv_nm(t_yr, t_id, p_st)\n", " get_col = list(col_nms[p_st].keys())[0]\n", " df_t = pd.read_csv(csv_fl, usecols=[get_col, 'PLAYER NAME'])\n", " df_t.rename(columns=col_nms[p_st], inplace=True)\n", " #df1 = gs.sort_player_name(df1)\n", " #df1.index = sorted(df1['PLAYER NAME'],key=lambda x: re.split(r'\\W+', x)[-1])\n", " #print(df1['PLAYER NAME'].str.extractall(r'(\\W+)$'))\n", " df_t['surnm'] = df_t['PLAYER NAME'].apply(lambda x: re.split(r'\\W+', x)[-1])\n", " df_t = df_t.sort_values(by='surnm').drop(['surnm'], axis=1)\n", " df_t = df_t.iloc[8:13, :]\n", " return df_t\n" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "output_type": "display_data", "data": { "text/html": "\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
PLAYER NAMEdrv
28Jason Day295.0
1Bryson DeChambeau318.1
19Harris English297.1
5Tony Finau305.3
4Tommy Fleetwood305.5
\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
PLAYER NAMEgir
2Jason Day76.39
23Bryson DeChambeau66.67
75Harris English52.78
5Tony Finau72.22
40Tommy Fleetwood63.89
\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
PLAYER NAMEdrvgir
0Jason Day295.076.39
1Bryson DeChambeau318.166.67
2Harris English297.152.78
3Tony Finau305.372.22
4Tommy Fleetwood305.563.89
" }, "metadata": {} } ], "source": [ "df1 = get_small_df('2020', 'pga', 'drv')\n", "df2 = get_small_df('2020', 'pga', 'gir')\n", "df3 = pd.merge(df1, df2)\n", "display_side_by_side(df1, df2, df3)" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "# couldn't get this to work\n", "def disp_sd_by_sd(*dfs, caps=None):\n", " stylers = []\n", " space = \"\\xa0\" * 10\n", " space = \"\"\n", " cnt = 0\n", " for df in dfs:\n", " if caps:\n", " stylers.append(df.style.set_table_attributes(\"style='display:inline'\").set_caption(caps[cnt]))\n", " else:\n", " stylers.append(df.style.set_table_attributes(\"style='display:inline'\"))\n", " cnt += 1\n", " str_html = \"\"\n", " for sts in stylers:\n", " if str_html:\n", " str_html += space\n", " str_html += sts._repr_html_()\n", " display(str_html)\n", " " ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "output_type": "display_data", "data": { "text/html": "\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
PLAYER NAMEdrvgir
0Jason Day295.076.39
1Bryson DeChambeau318.166.67
2Harris English297.152.78
3Tony Finau305.372.22
4Tommy Fleetwood305.563.89
\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
PLAYER NAMEdrvgir
0Jason Day295.076.39
1Bryson DeChambeau318.166.67
2Harris English297.152.78
3Tony Finau305.372.22
4Tommy Fleetwood305.563.89
" }, "metadata": {} } ], "source": [ "# can specify column(s) to use if desired\n", "df4 = pd.merge(df1, df2, on=['PLAYER NAME'], sort=False)\n", "display_side_by_side(df3, df4)" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "output_type": "display_data", "data": { "text/html": "\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
firstnamesurnamedrv1gir1
0JohnSmith295.076.39
1FrankSmith318.166.67
2HarryBrown297.152.78
3MorrisWhite305.372.22
4JosephBlack305.563.89
\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
firstnamesurnamedrv2gir2
0JohnSmith295.076.39
1HarryBrown318.166.67
2JosephBlack297.152.78
3MorrisBrown305.372.22
4ArthurWhite305.563.89
" }, "metadata": {} }, { "output_type": "display_data", "data": { "text/plain": " firstname surname drv1 gir1 drv2 gir2\n0 John Smith 295.0 76.39 295.0 76.39\n1 Harry Brown 297.1 52.78 318.1 66.67\n2 Joseph Black 305.5 63.89 297.1 52.78", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
firstnamesurnamedrv1gir1drv2gir2
0JohnSmith295.076.39295.076.39
1HarryBrown297.152.78318.166.67
2JosephBlack305.563.89297.152.78
\n
" }, "metadata": {} } ], "source": [ "# let's try using multiple columns\n", "left = pd.DataFrame(\n", " {\n", " 'firstname': ['John', 'Frank', 'Harry', 'Morris', 'Joseph'],\n", " 'surname': ['Smith', 'Smith', 'Brown', 'White', 'Black'],\n", " 'drv1': df1.drv.to_list(),\n", " 'gir1': df2.gir.to_list()\n", " }\n", ")\n", "right = pd.DataFrame(\n", " {\n", " 'firstname': ['John', 'Harry', 'Joseph', 'Morris', 'Arthur'],\n", " 'surname': ['Smith', 'Brown', 'Black', 'Brown', 'White'],\n", " 'drv2': df1.drv.to_list(),\n", " 'gir2': df2.gir.to_list()\n", " }\n", ")\n", "df_comb = pd.merge(left, right, on=['firstname', 'surname'])\n", "# default join is \"inner\", i.e. intersection\n", "display_side_by_side(left, right)\n", "display(df_comb)" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "output_type": "display_data", "data": { "text/html": "\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
firstnamesurnamedrv1gir1
0JohnSmith295.076.39
1FrankSmith318.166.67
2HarryBrown297.152.78
3MorrisWhite305.372.22
4JosephBlack305.563.89
\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
firstnamesurnamedrv2gir2
0JohnSmith295.076.39
1HarryBrown318.166.67
2JosephBlack297.152.78
3MorrisBrown305.372.22
4ArthurWhite305.563.89
" }, "metadata": {} }, { "output_type": "display_data", "data": { "text/plain": " firstname surname drv1 gir1 drv2 gir2\n0 John Smith 295.0 76.39 295.0 76.39\n1 Frank Smith 318.1 66.67 NaN NaN\n2 Harry Brown 297.1 52.78 318.1 66.67\n3 Morris White 305.3 72.22 NaN NaN\n4 Joseph Black 305.5 63.89 297.1 52.78\n5 Morris Brown NaN NaN 305.3 72.22\n6 Arthur White NaN NaN 305.5 63.89", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
firstnamesurnamedrv1gir1drv2gir2
0JohnSmith295.076.39295.076.39
1FrankSmith318.166.67NaNNaN
2HarryBrown297.152.78318.166.67
3MorrisWhite305.372.22NaNNaN
4JosephBlack305.563.89297.152.78
5MorrisBrownNaNNaN305.372.22
6ArthurWhiteNaNNaN305.563.89
\n
" }, "metadata": {} } ], "source": [ "# specify 'outer' to get a union\n", "df_union = df_comb = pd.merge(left, right, on=['firstname', 'surname'], sort=False, how='outer')\n", "display_side_by_side(left, right)\n", "display(df_union)" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "output_type": "display_data", "data": { "text/html": "\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
firstnamesurnamedrv1gir1
0JohnSmith295.076.39
1FrankSmith318.166.67
2HarryBrown297.152.78
3MorrisWhite305.372.22
4JosephBlack305.563.89
\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
firstnamesurnamedrv2gir2
0JohnSmith295.076.39
1HarryBrown318.166.67
2JosephBlack297.152.78
3MorrisBrown305.372.22
4ArthurWhite305.563.89
" }, "metadata": {} }, { "output_type": "display_data", "data": { "text/plain": " firstname surname drv1 gir1 drv2 gir2\n0 John Smith 295.0 76.39 295.0 76.39\n1 Frank Smith 318.1 66.67 NaN NaN\n2 Harry Brown 297.1 52.78 318.1 66.67\n3 Morris White 305.3 72.22 NaN NaN\n4 Joseph Black 305.5 63.89 297.1 52.78", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
firstnamesurnamedrv1gir1drv2gir2
0JohnSmith295.076.39295.076.39
1FrankSmith318.166.67NaNNaN
2HarryBrown297.152.78318.166.67
3MorrisWhite305.372.22NaNNaN
4JosephBlack305.563.89297.152.78
\n
" }, "metadata": {} } ], "source": [ "# or we can use the keys from on frame or the other\n", "# let's try the left\n", "df_left_join = df_comb = pd.merge(left, right, on=['firstname', 'surname'], sort=False, how='left')\n", "display_side_by_side(left, right)\n", "display(df_left_join)" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "output_type": "display_data", "data": { "text/html": "\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
namedepartmentlocation
0JohnfinanceVancouver
1FranksalesVancouver
2HarryengineeringVancouver
3MorrissalesCalgary
4JosephengineeringCalgary
\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
deptsupervisor
0engineeringGeorge
1financeHelen
2salesBarbara
" }, "metadata": {} }, { "output_type": "display_data", "data": { "text/plain": " name department location dept supervisor\n0 John finance Vancouver finance Helen\n1 Frank sales Vancouver sales Barbara\n2 Morris sales Calgary sales Barbara\n3 Harry engineering Vancouver engineering George\n4 Joseph engineering Calgary engineering George", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
namedepartmentlocationdeptsupervisor
0JohnfinanceVancouverfinanceHelen
1FranksalesVancouversalesBarbara
2MorrissalesCalgarysalesBarbara
3HarryengineeringVancouverengineeringGeorge
4JosephengineeringCalgaryengineeringGeorge
\n
" }, "metadata": {} } ], "source": [ "left = pd.DataFrame(\n", " {\n", " \n", " 'name': ['John', 'Frank', 'Harry', 'Morris', 'Joseph'],\n", " 'department': ['finance', 'sales', 'engineering', 'sales', 'engineering'],\n", " 'location': ['Vancouver', 'Vancouver', 'Vancouver', 'Calgary', 'Calgary'],\n", " }\n", ")\n", "right = pd.DataFrame(\n", " {\n", " 'dept': ['engineering', 'finance', 'sales'],\n", " 'supervisor': ['George', 'Helen', 'Barbara']\n", " }\n", ")\n", "display_side_by_side(left, right)\n", "display(pd.merge(left, right, left_on='department', right_on='dept'))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ] }