{ "metadata": { "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.2" }, "orig_nbformat": 2, "kernelspec": { "name": "python392jvsc74a57bd0a27d3f2bf68df5402465348834a2195030d3fc5bfc8e594e2a17c8c7e2447c85", "display_name": "Python 3.9.2 64-bit" } }, "nbformat": 4, "nbformat_minor": 2, "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "import re" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "events = {'wmpo': 't003', 'tpc': 'to11', 'rbch': 't012', 'masters': 't014', 'pga': 't033'}\n", "stats = {'drv': '101', 'gir': '103', 't2g': '02674'}\n", "st_cols = {'drv': ['PLAYER NAME', 'AVG.', 'TOTAL DISTANCE', 'TOTAL DRIVES'],\n", " 'gir': ['PLAYER NAME', '%', 'GREENS HIT', '# HOLES', 'RELATIVE/PAR'],\n", " 't2g': ['PLAYER NAME', 'AVERAGE', 'SG:OTT', 'SG:APR', 'SG:ARG']\n", "}" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "df1: 79, df2: 79\n\n AVG.\nPLAYER NAME \nCameron Champ 321.1\nBryson DeChambeau 318.1\nRory McIlroy 312.5\nSepp Straka 305.8\nTommy Fleetwood 305.5\n\n %\nPLAYER NAME \nMatthew Wolff 77.78\nPaul Casey 76.39\nJason Day 76.39\nLouis Oosthuizen 73.61\nCameron Champ 73.61\n\n\n" ] } ], "source": [ "d_dir = \"./data/\"\n", "csvs = ['pga_2020_drv_2.test.csv', 'pga_2020_gir_2.test.csv',\n", " 'pga_2021_drv_2.test.csv', 'pga_2021_gir_2.test.csv',\n", " 'rbch_2020_drv_2.test.csv', 'rbch_2020_gir_2.test.csv',\n", " 'rbch_2021_drv_2.test.csv', 'rbch_2021_gir_2.test.csv'\n", "]\n", "df1 = pd.read_csv(f\"{d_dir}{csvs[0]}\", index_col=['PLAYER NAME'], usecols=['AVG.', 'PLAYER NAME'])\n", "df2 = pd.read_csv(f\"{d_dir}{csvs[1]}\", index_col=['PLAYER NAME'], usecols=['%', 'PLAYER NAME'])\n", "print(f\"df1: {len(df1)}, df2: {len(df2)}\\n\")\n", "print(df1.head())\n", "print()\n", "print(df2.head())\n", "print()\n", "print(type(df2))\n" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ " AVG. %\nPLAYER NAME \nCameron Champ 321.1 NaN\nBryson DeChambeau 318.1 NaN\nRory McIlroy 312.5 NaN\nSepp Straka 305.8 NaN\nTommy Fleetwood 305.5 NaN\n\n AVG. %\nPLAYER NAME \nDenny McCarthy NaN 54.17\nHarris English NaN 52.78\nBrian Harman NaN 51.39\nBrandt Snedeker NaN 51.39\nJ.T. Poston NaN 50.00\n\n158\n" ] } ], "source": [ "combined = pd.concat([df1, df2])\n", "print(combined.head())\n", "print()\n", "print(combined.tail())\n", "print()\n", "print(len(combined))" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "PLAYER NAME\n", "Cameron Champ True\n", "Bryson DeChambeau True\n", "Rory McIlroy True\n", "Sepp Straka True\n", "Tommy Fleetwood True\n", " ... \n", "Denny McCarthy False\n", "Harris English False\n", "Brian Harman False\n", "Brandt Snedeker False\n", "J.T. Poston False\n", "Name: AVG., Length: 158, dtype: bool" ] }, "metadata": {}, "execution_count": 5 } ], "source": [ "pd.notna(combined[\"AVG.\"])" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ " AVG. %\nPLAYER NAME \nCameron Champ 321.1 73.61\nBryson DeChambeau 318.1 66.67\nRory McIlroy 312.5 65.28\nSepp Straka 305.8 58.33\nTommy Fleetwood 305.5 63.89\n\n AVG. %\nPLAYER NAME \nCharl Schwartzel 276.0 59.72\nChez Reavie 274.5 55.56\nBrendon Todd 272.3 62.50\nPatrick Reed 271.6 68.06\nMark Hubbard 268.8 61.11\n" ] } ], "source": [ "combined = pd.concat([df1, df2], axis=1)\n", "print(combined.head())\n", "print()\n", "print(combined.tail())" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "\n", "df1a ->\n" ] }, { "output_type": "display_data", "data": { "text/plain": " 2020\n pga\nPLAYER NAME \nCameron Champ AVG. 321.1\nBryson DeChambeau AVG. 318.1\nRory McIlroy AVG. 312.5\nSepp Straka AVG. 305.8\nTommy Fleetwood AVG. 305.5\n... ...\nCharl Schwartzel AVG. 276.0\nChez Reavie AVG. 274.5\nBrendon Todd AVG. 272.3\nPatrick Reed AVG. 271.6\nMark Hubbard AVG. 268.8\n\n[79 rows x 1 columns]", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
2020
pga
PLAYER NAME
Cameron ChampAVG.321.1
Bryson DeChambeauAVG.318.1
Rory McIlroyAVG.312.5
Sepp StrakaAVG.305.8
Tommy FleetwoodAVG.305.5
.........
Charl SchwartzelAVG.276.0
Chez ReavieAVG.274.5
Brendon ToddAVG.272.3
Patrick ReedAVG.271.6
Mark HubbardAVG.268.8
\n

79 rows × 1 columns

\n
" }, "metadata": {} }, { "output_type": "stream", "name": "stdout", "text": [ "\ndf_tst.head(10) ->\n" ] }, { "output_type": "display_data", "data": { "text/plain": " 2020\n pga\nPLAYER NAME \nCameron Champ AVG. 321.1\nBryson DeChambeau AVG. 318.1\nRory McIlroy AVG. 312.5\nSepp Straka AVG. 305.8\nTommy Fleetwood AVG. 305.5\nTony Finau AVG. 305.3\nDustin Johnson AVG. 305.0\nVictor Perez AVG. 304.4\nHaotong Li AVG. 304.1\nTiger Woods AVG. 304.0", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
2020
pga
PLAYER NAME
Cameron ChampAVG.321.1
Bryson DeChambeauAVG.318.1
Rory McIlroyAVG.312.5
Sepp StrakaAVG.305.8
Tommy FleetwoodAVG.305.5
Tony FinauAVG.305.3
Dustin JohnsonAVG.305.0
Victor PerezAVG.304.4
Haotong LiAVG.304.1
Tiger WoodsAVG.304.0
\n
" }, "metadata": {} }, { "output_type": "stream", "name": "stdout", "text": [ "\ndf_tst, after sorting index column ->\n" ] }, { "output_type": "display_data", "data": { "text/plain": " 2020\n pga\nPLAYER NAME \nAbraham Ancer % 63.89\n AVG. 295.60\nAdam Hadwin % 54.17\n AVG. 282.90\nAdam Long % 59.72\n... ...\nViktor Hovland AVG. 281.90\nWebb Simpson % 62.50\n AVG. 281.10\nXander Schauffele % 68.06\n AVG. 302.30\n\n[158 rows x 1 columns]", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
2020
pga
PLAYER NAME
Abraham Ancer%63.89
AVG.295.60
Adam Hadwin%54.17
AVG.282.90
Adam Long%59.72
.........
Viktor HovlandAVG.281.90
Webb Simpson%62.50
AVG.281.10
Xander Schauffele%68.06
AVG.302.30
\n

158 rows × 1 columns

\n
" }, "metadata": {} }, { "output_type": "stream", "name": "stdout", "text": [ "\ndf_tst.loc['Bryson DeChambeau', :] ->\n 2020\n pga\n% 66.67\nAVG. 318.10\n" ] } ], "source": [ "s1 = df1.stack()\n", "cols1 = pd.MultiIndex.from_tuples([('2020', 'pga')])\n", "#print(s1.index)\n", "#print(s1.columns)\n", "df1a = pd.DataFrame(s1, columns=cols1)\n", "print('\\ndf1a ->')\n", "display(df1a)\n", "s2 = df2.stack()\n", "df2a = pd.DataFrame(s2, columns=cols1)\n", "df_tst = pd.concat([df1a, df2a])\n", "print('\\ndf_tst.head(10) ->')\n", "display(df_tst.head(10))\n", "df_tst = df_tst.sort_index()\n", "#print(f\"\\ntype(df_tst): {type(df_tst)}\\n\")\n", "print('\\ndf_tst, after sorting index column ->')\n", "display(df_tst)\n", "print(\"\\ndf_tst.loc['Bryson DeChambeau', :] ->\")\n", "print(df_tst.loc['Bryson DeChambeau', :])" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "\n", "df1 with columns renamed ->\n", "\n" ] }, { "output_type": "display_data", "data": { "text/plain": " drv\nplayer \nCameron Champ 321.1\nBryson DeChambeau 318.1\nRory McIlroy 312.5\nSepp Straka 305.8\nTommy Fleetwood 305.5", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
drv
player
Cameron Champ321.1
Bryson DeChambeau318.1
Rory McIlroy312.5
Sepp Straka305.8
Tommy Fleetwood305.5
\n
" }, "metadata": {} }, { "output_type": "stream", "name": "stdout", "text": [ "\ndf1a (multi-indexed) ->\n\n" ] }, { "output_type": "display_data", "data": { "text/plain": " 2020\n pga\nplayer \nCameron Champ drv 321.1\nBryson DeChambeau drv 318.1\nRory McIlroy drv 312.5\nSepp Straka drv 305.8\nTommy Fleetwood drv 305.5\n... ...\nCharl Schwartzel drv 276.0\nChez Reavie drv 274.5\nBrendon Todd drv 272.3\nPatrick Reed drv 271.6\nMark Hubbard drv 268.8\n\n[79 rows x 1 columns]", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
2020
pga
player
Cameron Champdrv321.1
Bryson DeChambeaudrv318.1
Rory McIlroydrv312.5
Sepp Strakadrv305.8
Tommy Fleetwooddrv305.5
.........
Charl Schwartzeldrv276.0
Chez Reaviedrv274.5
Brendon Todddrv272.3
Patrick Reeddrv271.6
Mark Hubbarddrv268.8
\n

79 rows × 1 columns

\n
" }, "metadata": {} }, { "output_type": "stream", "name": "stdout", "text": [ "\ndf_tst (concat of df1a / df2a, sorted?) ->\n\n" ] }, { "output_type": "display_data", "data": { "text/plain": " 2020\n pga\nplayer \nAbraham Ancer drv 295.60\n gir 63.89\nAdam Hadwin drv 282.90\n gir 54.17\nAdam Long drv 285.40\n... ...\nViktor Hovland gir 65.28\nWebb Simpson drv 281.10\n gir 62.50\nXander Schauffele drv 302.30\n gir 68.06\n\n[158 rows x 1 columns]", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
2020
pga
player
Abraham Ancerdrv295.60
gir63.89
Adam Hadwindrv282.90
gir54.17
Adam Longdrv285.40
.........
Viktor Hovlandgir65.28
Webb Simpsondrv281.10
gir62.50
Xander Schauffeledrv302.30
gir68.06
\n

158 rows × 1 columns

\n
" }, "metadata": {} } ], "source": [ "df1.rename(columns={'AVG.': 'drv', '%': 'gir'}, inplace=True)\n", "df1.rename_axis(index={'PLAYER NAME': 'player'}, inplace=True)\n", "print('\\ndf1 with columns renamed ->\\n')\n", "display(df1.head())\n", "#cols = pd.MultiIndex.from_tuples([(\"lang\",), (\"2018\",\"height\"), (\"2018\",\"weight\"), (\"2019\",\"height\"), (\"2019\",\"weight\")])\n", "cols1 = pd.MultiIndex.from_tuples([('2020', 'pga')])\n", "s1 = df1.stack()\n", "#print(s1.index)\n", "#print(s1.columns)\n", "df1a = pd.DataFrame(s1, columns=cols1)\n", "print('\\ndf1a (multi-indexed) ->\\n')\n", "display(df1a)\n", "df2.rename(columns={'AVG.': 'drv', '%': 'gir'}, inplace=True)\n", "df2.rename_axis(index={'PLAYER NAME': 'player'}, inplace=True)\n", "s2 = df2.stack()\n", "df2a = pd.DataFrame(s2, columns=cols1)\n", "df_tst = pd.concat([df1a, df2a])\n", "df_tst = df_tst.sort_index()\n", "#print(f\"\\ntype(df_tst): {type(df_tst)}\\n\")\n", "print('\\ndf_tst (concat of df1a / df2a, sorted?) ->\\n')\n", "display(df_tst)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "\ndf_tst (added index column name) ->\n\n" ] }, { "output_type": "display_data", "data": { "text/plain": " 2020\n pga\nplayer stat \nAbraham Ancer drv 295.60\n gir 63.89\nAdam Hadwin drv 282.90\n gir 54.17\nAdam Long drv 285.40\n... ...\nViktor Hovland gir 65.28\nWebb Simpson drv 281.10\n gir 62.50\nXander Schauffele drv 302.30\n gir 68.06\n\n[158 rows x 1 columns]", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
2020
pga
playerstat
Abraham Ancerdrv295.60
gir63.89
Adam Hadwindrv282.90
gir54.17
Adam Longdrv285.40
.........
Viktor Hovlandgir65.28
Webb Simpsondrv281.10
gir62.50
Xander Schauffeledrv302.30
gir68.06
\n

158 rows × 1 columns

\n
" }, "metadata": {} }, { "output_type": "stream", "name": "stdout", "text": [ "\ndf_tst.loc['Bryson DeChambeau', :] ->\n\n" ] }, { "output_type": "display_data", "data": { "text/plain": " 2020\n pga\nstat \ndrv 318.10\ngir 66.67", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
2020
pga
stat
drv318.10
gir66.67
\n
" }, "metadata": {} } ], "source": [ "# Add label for stat column of multi-index\n", "df_tst.rename_axis(['player', 'stat'], inplace=True)\n", "print('\\ndf_tst (added index column name) ->\\n')\n", "display(df_tst)\n", "print(\"\\ndf_tst.loc['Bryson DeChambeau', :] ->\\n\")\n", "display(df_tst.loc['Bryson DeChambeau', :])" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "\ndf_tst (concat of df1a / df2a, index sorted on player last name) ->\n\n" ] }, { "output_type": "display_data", "data": { "text/plain": " 2020\n pga\nplayer stat \nByeong Hun An drv 286.60\n gir 62.50\nAbraham Ancer drv 295.60\n gir 63.89\nDaniel Berger drv 291.90\n... ...\nMatthew Wolff gir 77.78\nGary Woodland drv 293.00\n gir 65.28\nTiger Woods drv 304.00\n gir 62.50\n\n[158 rows x 1 columns]", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
2020
pga
playerstat
Byeong Hun Andrv286.60
gir62.50
Abraham Ancerdrv295.60
gir63.89
Daniel Bergerdrv291.90
.........
Matthew Wolffgir77.78
Gary Woodlanddrv293.00
gir65.28
Tiger Woodsdrv304.00
gir62.50
\n

158 rows × 1 columns

\n
" }, "metadata": {} } ], "source": [ "#print(df_tst.index)\n", "#print(re.split(r'\\W+', df_tst.index[0][0])[-1])\n", "ndx_sort = sorted(df_tst.index,key=lambda x: re.split(r'\\W+', x[0])[-1])\n", "#print(ndx_sort)\n", "#df_tst = df_tst.sort_index()\n", "df_tst = df_tst.reindex(ndx_sort)\n", "#print(f\"\\ntype(df_tst): {type(df_tst)}\\n\")\n", "print('\\ndf_tst (concat of df1a / df2a, index sorted on player last name) ->\\n')\n", "display(df_tst)\n" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "output_type": "display_data", "data": { "text/plain": " 2020\n rbch\nplayer stat \nAbraham Ancer drv 278.10\n gir 90.28\nRyan Armour drv 275.00\n gir 56.94\nDaniel Berger drv 273.60\n... ...\nMatt Wallace gir 65.28\nBubba Watson drv 277.50\n gir 65.28\nGary Woodland drv 287.00\n gir 73.61\n\n[150 rows x 1 columns]", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
2020
rbch
playerstat
Abraham Ancerdrv278.10
gir90.28
Ryan Armourdrv275.00
gir56.94
Daniel Bergerdrv273.60
.........
Matt Wallacegir65.28
Bubba Watsondrv277.50
gir65.28
Gary Woodlanddrv287.00
gir73.61
\n

150 rows × 1 columns

\n
" }, "metadata": {} } ], "source": [ "# change order of csv files to reflect order I want in dataframe\n", "csvs = ['pga_2020_drv_2.test.csv', 'pga_2020_gir_2.test.csv',\n", " 'rbch_2020_drv_2.test.csv', 'rbch_2020_gir_2.test.csv',\n", " 'pga_2021_drv_2.test.csv', 'pga_2021_gir_2.test.csv',\n", " 'rbch_2021_drv_2.test.csv', 'rbch_2021_gir_2.test.csv'\n", "]\n", "df3 = pd.read_csv(f\"{d_dir}{csvs[2]}\", index_col=['PLAYER NAME'], usecols=['AVG.', 'PLAYER NAME'])\n", "df4 = pd.read_csv(f\"{d_dir}{csvs[3]}\", index_col=['PLAYER NAME'], usecols=['%', 'PLAYER NAME'])\n", "cols1 = pd.MultiIndex.from_tuples([('2020', 'rbch')])\n", "\n", "df3.rename(columns={'AVG.': 'drv', '%': 'gir'}, inplace=True)\n", "df3.rename_axis(index={'PLAYER NAME': 'player'}, inplace=True)\n", "s3 = df3.stack()\n", "df3a = pd.DataFrame(s3, columns=cols1)\n", "\n", "df4.rename(columns={'AVG.': 'drv', '%': 'gir'}, inplace=True)\n", "df4.rename_axis(index={'PLAYER NAME': 'player'}, inplace=True)\n", "s4 = df4.stack()\n", "df4a = pd.DataFrame(s4, columns=cols1)\n", "\n", "df_tst2 = pd.concat([df3a, df4a])\n", "df_tst2.rename_axis(['player', 'stat'], inplace=True)\n", "ndx_sort2 = sorted(df_tst2.index,key=lambda x: re.split(r'\\W+', x[0])[-1])\n", "df_tst2 = df_tst2.reindex(ndx_sort2)\n", "display(df_tst2)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "\ndf_comb.head(10) ->\n\n" ] }, { "output_type": "display_data", "data": { "text/plain": " 2020 \n pga rbch\nplayer stat \nByeong Hun An drv 286.60 NaN\n gir 62.50 NaN\nAbraham Ancer drv 295.60 NaN\n gir 63.89 NaN\nDaniel Berger drv 291.90 NaN\n gir 65.28 NaN\nPatrick Cantlay drv 295.30 NaN\n gir 59.72 NaN\nPaul Casey drv 295.80 NaN\n gir 76.39 NaN", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
2020
pgarbch
playerstat
Byeong Hun Andrv286.60NaN
gir62.50NaN
Abraham Ancerdrv295.60NaN
gir63.89NaN
Daniel Bergerdrv291.90NaN
gir65.28NaN
Patrick Cantlaydrv295.30NaN
gir59.72NaN
Paul Caseydrv295.80NaN
gir76.39NaN
\n
" }, "metadata": {} }, { "output_type": "stream", "name": "stdout", "text": [ "\nlen(df_comb): 308\n\ndf_comb.dropna() ->\n\nEmpty DataFrame\nColumns: [(2020, pga), (2020, rbch)]\nIndex: []\n" ] } ], "source": [ "# now lets concatenate our two stats dfs\n", "df_comb = pd.concat([df_tst, df_tst2])\n", "print('\\ndf_comb.head(10) ->\\n')\n", "display(df_comb.head(10))\n", "print(f\"\\nlen(df_comb): {len(df_comb)}\")\n", "print('\\ndf_comb.dropna() ->\\n')\n", "print(df_comb.dropna())" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "\ndf_comb.dropna(subset=[('2020', 'pga')]) ->\n\n" ] }, { "output_type": "display_data", "data": { "text/plain": " 2020 \n pga rbch\nplayer stat \nByeong Hun An drv 286.60 NaN\n gir 62.50 NaN\nAbraham Ancer drv 295.60 NaN\n gir 63.89 NaN\nDaniel Berger drv 291.90 NaN\n... ... ...\nMatthew Wolff gir 77.78 NaN\nGary Woodland drv 293.00 NaN\n gir 65.28 NaN\nTiger Woods drv 304.00 NaN\n gir 62.50 NaN\n\n[158 rows x 2 columns]", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
2020
pgarbch
playerstat
Byeong Hun Andrv286.60NaN
gir62.50NaN
Abraham Ancerdrv295.60NaN
gir63.89NaN
Daniel Bergerdrv291.90NaN
............
Matthew Wolffgir77.78NaN
Gary Woodlanddrv293.00NaN
gir65.28NaN
Tiger Woodsdrv304.00NaN
gir62.50NaN
\n

158 rows × 2 columns

\n
" }, "metadata": {} }, { "output_type": "stream", "name": "stdout", "text": [ "\ndf_comb.dropna(subset=[('2020', 'rbch')]) ->\n\n" ] }, { "output_type": "display_data", "data": { "text/plain": " 2020 \n pga rbch\nplayer stat \nAbraham Ancer drv NaN 278.10\n gir NaN 90.28\nRyan Armour drv NaN 275.00\n gir NaN 56.94\nDaniel Berger drv NaN 273.60\n... ... ...\nMatt Wallace gir NaN 65.28\nBubba Watson drv NaN 277.50\n gir NaN 65.28\nGary Woodland drv NaN 287.00\n gir NaN 73.61\n\n[150 rows x 2 columns]", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
2020
pgarbch
playerstat
Abraham AncerdrvNaN278.10
girNaN90.28
Ryan ArmourdrvNaN275.00
girNaN56.94
Daniel BergerdrvNaN273.60
............
Matt WallacegirNaN65.28
Bubba WatsondrvNaN277.50
girNaN65.28
Gary WoodlanddrvNaN287.00
girNaN73.61
\n

150 rows × 2 columns

\n
" }, "metadata": {} }, { "output_type": "stream", "name": "stdout", "text": [ "\ndf_comb.loc['Abraham Ancer'] ->\n\n 2020 \n pga rbch\nstat \ndrv 295.60 NaN\ngir 63.89 NaN\ndrv NaN 278.10\ngir NaN 90.28\n" ] } ], "source": [ "print(\"\\ndf_comb.dropna(subset=[('2020', 'pga')]) ->\\n\")\n", "display(df_comb.dropna(how='any', subset=[('2020', 'pga')]))\n", "\n", "print(\"\\ndf_comb.dropna(subset=[('2020', 'rbch')]) ->\\n\")\n", "cols= [(c0, c1) for (c0, c1) in df_comb.columns if c1 in ['rbch']]\n", "display(df_comb.dropna(how='any', subset=cols))\n", "# what's going on\n", "print(\"\\ndf_comb.loc['Abraham Ancer'] ->\\n\")\n", "print(df_comb.loc['Abraham Ancer'])" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "output_type": "display_data", "data": { "text/plain": " 2020 \n pga rbch\nplayer stat \nByeong Hun An drv 286.60 NaN\n gir 62.50 NaN\nAbraham Ancer drv 295.60 278.10\n gir 63.89 90.28\nRyan Armour drv NaN 275.00\n... ... ...\nMatthew Wolff gir 77.78 NaN\nGary Woodland drv 293.00 287.00\n gir 65.28 73.61\nTiger Woods drv 304.00 NaN\n gir 62.50 NaN\n\n[234 rows x 2 columns]", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
2020
pgarbch
playerstat
Byeong Hun Andrv286.60NaN
gir62.50NaN
Abraham Ancerdrv295.60278.10
gir63.8990.28
Ryan ArmourdrvNaN275.00
............
Matthew Wolffgir77.78NaN
Gary Woodlanddrv293.00287.00
gir65.2873.61
Tiger Woodsdrv304.00NaN
gir62.50NaN
\n

234 rows × 2 columns

\n
" }, "metadata": {} } ], "source": [ "df_comb = pd.merge(df_tst, df_tst2, how='outer', on=['player', 'stat'])\n", "ndx_sort3 = sorted(df_comb.index, key=lambda x: re.split(r'\\W+', x[0])[-1])\n", "df_comb = df_comb.reindex(ndx_sort3)\n", "\n", "display(df_comb)" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "output_type": "display_data", "data": { "text/plain": " 2020 \n pga rbch\nplayer stat \nAbraham Ancer drv 295.60 278.10\n gir 63.89 90.28\nDaniel Berger drv 291.90 273.60\n gir 65.28 69.44\nJoel Dahmen drv 289.50 270.30\n... ... ...\nMatt Wallace gir 65.28 65.28\nBubba Watson drv 290.10 277.50\n gir 70.83 65.28\nGary Woodland drv 293.00 287.00\n gir 65.28 73.61\n\n[74 rows x 2 columns]", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
2020
pgarbch
playerstat
Abraham Ancerdrv295.60278.10
gir63.8990.28
Daniel Bergerdrv291.90273.60
gir65.2869.44
Joel Dahmendrv289.50270.30
............
Matt Wallacegir65.2865.28
Bubba Watsondrv290.10277.50
gir70.8365.28
Gary Woodlanddrv293.00287.00
gir65.2873.61
\n

74 rows × 2 columns

\n
" }, "metadata": {} } ], "source": [ "display(df_comb.dropna())" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ] }