{ "metadata": { "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.2" }, "orig_nbformat": 2, "kernelspec": { "name": "python392jvsc74a57bd0a27d3f2bf68df5402465348834a2195030d3fc5bfc8e594e2a17c8c7e2447c85", "display_name": "Python 3.9.2 64-bit ('ds-3.9': conda)" } }, "nbformat": 4, "nbformat_minor": 2, "cells": [ { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "from pathlib import Path\n", "import re" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "# define some needed variables, lists, and the like\n", "events = {'wmpo': 't003', 'api': 't009', 'tpc': 'to11', 'rbch': 't012', 'masters': 't014', 'pga': 't033'}\n", "stats = {'drv': '101', 'gir': '103', 't2g': '02674', 'scramble': '130'}\n", "st_cols = {'drv': ['PLAYER NAME', 'AVG.', 'TOTAL DISTANCE', 'TOTAL DRIVES'],\n", " 'gir': ['PLAYER NAME', '%', 'GREENS HIT', '# HOLES', 'RELATIVE/PAR'],\n", " 't2g': ['PLAYER NAME', 'AVERAGE', 'SG:OTT', 'SG:APR', 'SG:ARG'],\n", " 'scramble': ['PLAYER NAME', '%', 'PAR OR BETTER', 'MISSED GIR']\n", "}\n", "# data directory and stats, tournaments and years to process\n", "d_dir = \"./data/\"\n", "p_sids = ['drv', 'gir']\n", "p_tids = ['pga', 'rbch']\n", "p_yrs = ['2020', '2021']" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "# function: take year, tounament id and stat id, return CSV file path (relative) \n", "def get_csv_nm(t_yr, t_id, p_st):\n", " global d_dir\n", " return f\"{d_dir}{t_id}_{t_yr}_{p_st}_2.test.csv\"\n", "\n", "# function: will take a year, tournament and stat, read csv, return suitable DataFrame\n", "def csv_2_df(t_yr, t_id, p_st):\n", " \"\"\" Read appropriate CSV file into DataFrame. Modify Dataframe to employ multi-indices.\n", " Return modified DataFrame.\n", " Useage: csv_2_df(t_yr, t_id, p_st)\n", " where t_yr = tournament year\n", " t_id = tournament id (e.g. 'pga')\n", " p_st = player stat (e.g. 'drv', 'gir')\n", " \"\"\"\n", " global st_cols\n", " col_nms = {'drv': {'AVG.': 'drv'},\n", " 'gir': {'%': 'gir'},\n", " 'scramble': {'%': 'scramble'}\n", " }\n", "\n", " csv_fl = get_csv_nm(t_yr, t_id, p_st)\n", " ty_m_idx = pd.MultiIndex.from_tuples([(t_yr, t_id)])\n", " s_col = st_cols[p_st][1]\n", " df_stats = pd.read_csv(csv_fl, index_col=['PLAYER NAME'], usecols=[s_col, 'PLAYER NAME'])\n", " df_stats.rename(columns=col_nms[p_st], inplace=True)\n", " s_tmp = df_stats.stack()\n", " ts_df = pd.DataFrame(s_tmp, columns=ty_m_idx)\n", " ts_df.rename_axis(['player', 'stat'], inplace=True)\n", " return ts_df" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "output_type": "display_data", "data": { "text/plain": " 2020\n pga\nplayer stat \nCameron Champ drv 321.1\nBryson DeChambeau drv 318.1\nRory McIlroy drv 312.5\nSepp Straka drv 305.8\nTommy Fleetwood drv 305.5\n... ...\nCharl Schwartzel drv 276.0\nChez Reavie drv 274.5\nBrendon Todd drv 272.3\nPatrick Reed drv 271.6\nMark Hubbard drv 268.8\n\n[79 rows x 1 columns]", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
2020
pga
playerstat
Cameron Champdrv321.1
Bryson DeChambeaudrv318.1
Rory McIlroydrv312.5
Sepp Strakadrv305.8
Tommy Fleetwooddrv305.5
.........
Charl Schwartzeldrv276.0
Chez Reaviedrv274.5
Brendon Todddrv272.3
Patrick Reeddrv271.6
Mark Hubbarddrv268.8
\n

79 rows × 1 columns

\n
" }, "metadata": {} }, { "output_type": "display_data", "data": { "text/plain": " 2020\n pga\nplayer stat \nMatthew Wolff gir 77.78\nPaul Casey gir 76.39\nJason Day gir 76.39\nLouis Oosthuizen gir 73.61\nCameron Champ gir 73.61\n... ...\nDenny McCarthy gir 54.17\nHarris English gir 52.78\nBrian Harman gir 51.39\nBrandt Snedeker gir 51.39\nJ.T. Poston gir 50.00\n\n[79 rows x 1 columns]", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
2020
pga
playerstat
Matthew Wolffgir77.78
Paul Caseygir76.39
Jason Daygir76.39
Louis Oosthuizengir73.61
Cameron Champgir73.61
.........
Denny McCarthygir54.17
Harris Englishgir52.78
Brian Harmangir51.39
Brandt Snedekergir51.39
J.T. Postongir50.00
\n

79 rows × 1 columns

\n
" }, "metadata": {} } ], "source": [ "# quick test\n", "df1 = csv_2_df('2020', 'pga', 'drv')\n", "display(df1)\n", "df2 = csv_2_df('2020', 'pga', 'gir')\n", "display(df2)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "\n", "# function: will take a year, a tournament and a list of stats, generate a DataFrame for that tournament and year\n", "def tourney_2_df(t_yr, t_id, p_sts):\n", " \"\"\" Combine all requested stats for a given tournament and year into a single DataFrame. \n", " Return DataFrame.\n", " Useage: tourney_2_df(t_yr, t_id, p_sts)\n", " where t_yr = tournament year\n", " t_id = tournament id (e.g. 'pga')\n", " p_sts = list of player stat (e.g. ['drv', 'gir'])\n", " \"\"\"\n", " df1 = csv_2_df(t_yr, t_id, p_sts[0])\n", " if len(p_sts) > 1:\n", " df2 = csv_2_df(t_yr, t_id, p_sts[1])\n", " df_tourney = pd.concat([df1, df2])\n", " else:\n", " return df1\n", " if len(p_sts) > 2:\n", " passs\n", " \n", " ndx_sort2 = sorted(df_tourney.index,key=lambda x: re.split(r'\\W+', x[0])[-1])\n", " df_tourney = df_tourney.reindex(ndx_sort2)\n", " return df_tourney\n" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "output_type": "display_data", "data": { "text/plain": " 2020\n pga\nplayer stat \nByeong Hun An drv 286.60\n gir 62.50\nAbraham Ancer drv 295.60\n gir 63.89\nDaniel Berger drv 291.90\n... ...\nMatthew Wolff gir 77.78\nGary Woodland drv 293.00\n gir 65.28\nTiger Woods drv 304.00\n gir 62.50\n\n[158 rows x 1 columns]", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
2020
pga
playerstat
Byeong Hun Andrv286.60
gir62.50
Abraham Ancerdrv295.60
gir63.89
Daniel Bergerdrv291.90
.........
Matthew Wolffgir77.78
Gary Woodlanddrv293.00
gir65.28
Tiger Woodsdrv304.00
gir62.50
\n

158 rows × 1 columns

\n
" }, "metadata": {} } ], "source": [ "# test\n", "pga_2020 = tourney_2_df('2020', 'pga', ['drv', 'gir'])\n", "display(pga_2020)" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "# function: will take a year, list of tournaments and list of stats and generate a DataFrame for that year\n", "def year_2_df(t_yr, t_ids, p_sts):\n", " df1 = tourney_2_df(t_yr, t_ids[0], p_sts)\n", " if len(t_ids) == 1:\n", " return df1\n", " df2 = tourney_2_df(t_yr, t_ids[1], p_sts)\n", " df_comb = pd.merge(df1, df2, how='outer', on=['player', 'stat'])\n", " if len(t_ids) > 2:\n", " pass\n", " ndx_sort2 = sorted(df_comb.index,key=lambda x: re.split(r'\\W+', x[0])[-1])\n", " df_comb = df_comb.reindex(ndx_sort2)\n", "\n", " return df_comb" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [ { "output_type": "display_data", "data": { "text/plain": " 2020 \n pga rbch\nplayer stat \nByeong Hun An drv 286.60 NaN\n gir 62.50 NaN\nAbraham Ancer drv 295.60 278.10\n gir 63.89 90.28\nRyan Armour drv NaN 275.00\n... ... ...\nMatthew Wolff gir 77.78 NaN\nGary Woodland drv 293.00 287.00\n gir 65.28 73.61\nTiger Woods drv 304.00 NaN\n gir 62.50 NaN\n\n[234 rows x 2 columns]", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
2020
pgarbch
playerstat
Byeong Hun Andrv286.60NaN
gir62.50NaN
Abraham Ancerdrv295.60278.10
gir63.8990.28
Ryan ArmourdrvNaN275.00
............
Matthew Wolffgir77.78NaN
Gary Woodlanddrv293.00287.00
gir65.2873.61
Tiger Woodsdrv304.00NaN
gir62.50NaN
\n

234 rows × 2 columns

\n
" }, "metadata": {} } ], "source": [ "# test year_2_df\n", "df_2020 = year_2_df('2020', ['pga', 'rbch'], ['drv', 'gir'])\n", "display(df_2020)\n", "# want to see what the csv looks like\n", "golf_csv = f'{d_dir}golf_play_8.test.csv'\n", "df_2020.to_csv(golf_csv)" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "output_type": "display_data", "data": { "text/plain": " 2021 \n pga rbch\nplayer stat \nByeong Hun An drv 306.50 NaN\n gir 48.61 NaN\nAbraham Ancer drv 301.80 299.90\n gir 61.11 70.83\nDaniel Berger drv 298.40 299.60\n... ... ...\nAaron Wise gir 54.17 NaN\nGary Woodland drv 311.10 NaN\n gir 58.33 NaN\nWill Zalatoris drv 306.50 312.00\n gir 61.11 58.33\n\n[226 rows x 2 columns]", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
2021
pgarbch
playerstat
Byeong Hun Andrv306.50NaN
gir48.61NaN
Abraham Ancerdrv301.80299.90
gir61.1170.83
Daniel Bergerdrv298.40299.60
............
Aaron Wisegir54.17NaN
Gary Woodlanddrv311.10NaN
gir58.33NaN
Will Zalatorisdrv306.50312.00
gir61.1158.33
\n

226 rows × 2 columns

\n
" }, "metadata": {} }, { "output_type": "display_data", "data": { "text/plain": " 2020 2021 \n pga rbch pga rbch\nplayer stat \nByeong Hun An drv 286.60 NaN 306.50 NaN\n gir 62.50 NaN 48.61 NaN\nAbraham Ancer drv 295.60 278.10 301.80 299.90\n gir 63.89 90.28 61.11 70.83\nRyan Armour drv NaN 275.00 NaN NaN\n... ... ... ... ...\nGary Woodland gir 65.28 73.61 58.33 NaN\nTiger Woods drv 304.00 NaN NaN NaN\n gir 62.50 NaN NaN NaN\nWill Zalatoris drv NaN NaN 306.50 312.00\n gir NaN NaN 61.11 58.33\n\n[318 rows x 4 columns]", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
20202021
pgarbchpgarbch
playerstat
Byeong Hun Andrv286.60NaN306.50NaN
gir62.50NaN48.61NaN
Abraham Ancerdrv295.60278.10301.80299.90
gir63.8990.2861.1170.83
Ryan ArmourdrvNaN275.00NaNNaN
..................
Gary Woodlandgir65.2873.6158.33NaN
Tiger Woodsdrv304.00NaNNaNNaN
gir62.50NaNNaNNaN
Will ZalatorisdrvNaNNaN306.50312.00
girNaNNaN61.1158.33
\n

318 rows × 4 columns

\n
" }, "metadata": {} } ], "source": [ "# test merging two years\n", "df_2021 = year_2_df('2021', ['pga', 'rbch'], ['drv', 'gir'])\n", "display(df_2021)\n", "df_comb = pd.merge(df_2020, df_2021, how='outer', on=['player', 'stat'])\n", "ndx_sort2 = sorted(df_comb.index,key=lambda x: re.split(r'\\W+', x[0])[-1])\n", "df_comb = df_comb.reindex(ndx_sort2)\n", "display(df_comb)" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "# let's try that with 3 different stats\n", "# will need to redefine tourney_2_df\n", "# you may have noticed the extra condition with only a 'pass'\n", "def tourney_2_df(t_yr, t_id, p_sts):\n", " \"\"\" Combine all requested stats for a given tournament and year into a single DataFrame. \n", " Return DataFrame.\n", " Useage: tourney_2_df(t_yr, t_id, p_sts)\n", " where t_yr = tournament year\n", " t_id = tournament id (e.g. 'pga')\n", " p_sts = list of player stat (e.g. ['drv', 'gir'])\n", " \"\"\"\n", " df1 = csv_2_df(t_yr, t_id, p_sts[0])\n", " if len(p_sts) > 1:\n", " df2 = csv_2_df(t_yr, t_id, p_sts[1])\n", " df_tourney = pd.concat([df1, df2])\n", " else:\n", " return df1\n", " if len(p_sts) > 2:\n", " all_dfs = [df_tourney]\n", " for p_st in p_sts[2:]:\n", " df_tmp = csv_2_df(t_yr, t_id, p_st)\n", " all_dfs.append(df_tmp)\n", " df_tourney = pd.concat(all_dfs)\n", " \n", " ndx_sort2 = sorted(df_tourney.index,key=lambda x: re.split(r'\\W+', x[0])[-1])\n", " df_tourney = df_tourney.reindex(ndx_sort2)\n", " return df_tourney" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "output_type": "display_data", "data": { "text/plain": " 2020\n pga\nplayer stat \nByeong Hun An drv 286.60\n gir 62.50\n scramble 62.96\nAbraham Ancer drv 295.60\n gir 63.89\n... ...\nGary Woodland gir 65.28\n scramble 64.00\nTiger Woods drv 304.00\n gir 62.50\n scramble 59.26\n\n[237 rows x 1 columns]", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
2020
pga
playerstat
Byeong Hun Andrv286.60
gir62.50
scramble62.96
Abraham Ancerdrv295.60
gir63.89
.........
Gary Woodlandgir65.28
scramble64.00
Tiger Woodsdrv304.00
gir62.50
scramble59.26
\n

237 rows × 1 columns

\n
" }, "metadata": {} } ], "source": [ "# test time\n", "p_sts = ['drv', 'gir', 'scramble']\n", "pga_2020 = tourney_2_df('2020', 'pga', p_sts)\n", "display(pga_2020)" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "output_type": "display_data", "data": { "text/plain": " 2020 \n pga rbch\nplayer stat \nByeong Hun An drv 286.60 NaN\n gir 62.50 NaN\n scramble 62.96 NaN\nAbraham Ancer drv 295.60 278.10\n gir 63.89 90.28\n... ... ...\nGary Woodland gir 65.28 73.61\n scramble 64.00 42.11\nTiger Woods drv 304.00 NaN\n gir 62.50 NaN\n scramble 59.26 NaN\n\n[351 rows x 2 columns]", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
2020
pgarbch
playerstat
Byeong Hun Andrv286.60NaN
gir62.50NaN
scramble62.96NaN
Abraham Ancerdrv295.60278.10
gir63.8990.28
............
Gary Woodlandgir65.2873.61
scramble64.0042.11
Tiger Woodsdrv304.00NaN
gir62.50NaN
scramble59.26NaN
\n

351 rows × 2 columns

\n
" }, "metadata": {} } ], "source": [ "# that seemed to work, so let's do a second tourney and merge\n", "df_2020 = year_2_df('2020', ['pga', 'rbch'], ['drv', 'gir', 'scramble'])\n", "display(df_2020)" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "output_type": "display_data", "data": { "text/plain": " 2021 \n pga rbch\nplayer stat \nByeong Hun An drv 306.50 NaN\n gir 48.61 NaN\n scramble 67.57 NaN\nAbraham Ancer drv 301.80 299.90\n gir 61.11 70.83\n... ... ...\nGary Woodland gir 58.33 NaN\n scramble 60.00 NaN\nWill Zalatoris drv 306.50 312.00\n gir 61.11 58.33\n scramble 64.29 56.67\n\n[339 rows x 2 columns]", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
2021
pgarbch
playerstat
Byeong Hun Andrv306.50NaN
gir48.61NaN
scramble67.57NaN
Abraham Ancerdrv301.80299.90
gir61.1170.83
............
Gary Woodlandgir58.33NaN
scramble60.00NaN
Will Zalatorisdrv306.50312.00
gir61.1158.33
scramble64.2956.67
\n

339 rows × 2 columns

\n
" }, "metadata": {} }, { "output_type": "display_data", "data": { "text/plain": " 2020 2021 \n pga rbch pga rbch\nplayer stat \nByeong Hun An drv 286.60 NaN 306.50 NaN\n gir 62.50 NaN 48.61 NaN\n scramble 62.96 NaN 67.57 NaN\nAbraham Ancer drv 295.60 278.10 301.80 299.90\n gir 63.89 90.28 61.11 70.83\n... ... ... ... ...\nTiger Woods gir 62.50 NaN NaN NaN\n scramble 59.26 NaN NaN NaN\nWill Zalatoris drv NaN NaN 306.50 312.00\n gir NaN NaN 61.11 58.33\n scramble NaN NaN 64.29 56.67\n\n[477 rows x 4 columns]", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
20202021
pgarbchpgarbch
playerstat
Byeong Hun Andrv286.60NaN306.50NaN
gir62.50NaN48.61NaN
scramble62.96NaN67.57NaN
Abraham Ancerdrv295.60278.10301.80299.90
gir63.8990.2861.1170.83
..................
Tiger Woodsgir62.50NaNNaNNaN
scramble59.26NaNNaNNaN
Will ZalatorisdrvNaNNaN306.50312.00
girNaNNaN61.1158.33
scrambleNaNNaN64.2956.67
\n

477 rows × 4 columns

\n
" }, "metadata": {} } ], "source": [ "# okay, and now a 2nd year\n", "df_2021 = year_2_df('2021', ['pga', 'rbch'], ['drv', 'gir', 'scramble'])\n", "display(df_2021)\n", "df_comb = pd.merge(df_2020, df_2021, how='outer', on=['player', 'stat'])\n", "ndx_sort2 = sorted(df_comb.index,key=lambda x: re.split(r'\\W+', x[0])[-1])\n", "df_comb = df_comb.reindex(ndx_sort2)\n", "display(df_comb)" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [], "source": [ "# now let's add a 3rd tournament\n", "# and of course we need to redefine year_2_df\n", "def year_2_df(t_yr, t_ids, p_sts):\n", " df1 = tourney_2_df(t_yr, t_ids[0], p_sts)\n", " if len(t_ids) == 1:\n", " return df1\n", " df2 = tourney_2_df(t_yr, t_ids[1], p_sts)\n", " df_comb = pd.merge(df1, df2, how='outer', on=['player', 'stat'])\n", " if len(t_ids) > 2:\n", " for t_id in t_ids[2:]:\n", " df_tmp = tourney_2_df(t_yr, t_id, p_sts)\n", " df_comb = pd.merge(df_comb, df_tmp, how='outer', on=['player', 'stat'])\n", "\n", " ndx_sort2 = sorted(df_comb.index,key=lambda x: re.split(r'\\W+', x[0])[-1])\n", " df_comb = df_comb.reindex(ndx_sort2)\n", "\n", " return df_comb" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "output_type": "display_data", "data": { "text/plain": " 2020 \n api pga rbch\nplayer stat \nByeong Hun An drv 291.80 286.60 NaN\n gir 66.67 62.50 NaN\n scramble 54.17 62.96 NaN\nAbraham Ancer drv 281.90 295.60 278.10\n gir 48.61 63.89 90.28\n... ... ... ...\nTiger Woods gir NaN 62.50 NaN\n scramble NaN 59.26 NaN\nXinjun Zhang drv 288.80 NaN NaN\n gir 50.00 NaN NaN\n scramble 52.78 NaN NaN\n\n[441 rows x 3 columns]", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
2020
apipgarbch
playerstat
Byeong Hun Andrv291.80286.60NaN
gir66.6762.50NaN
scramble54.1762.96NaN
Abraham Ancerdrv281.90295.60278.10
gir48.6163.8990.28
...............
Tiger WoodsgirNaN62.50NaN
scrambleNaN59.26NaN
Xinjun Zhangdrv288.80NaNNaN
gir50.00NaNNaN
scramble52.78NaNNaN
\n

441 rows × 3 columns

\n
" }, "metadata": {} } ], "source": [ "# testing 1 2 3\n", "df_2020 = year_2_df('2020', ['api', 'pga', 'rbch'], ['drv', 'gir', 'scramble'])\n", "display(df_2020)\n" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "output_type": "display_data", "data": { "text/plain": " 2020 2021 \n api pga rbch api pga rbch\nplayer stat \nByeong Hun An drv 291.80 286.60 NaN 290.60 306.50 NaN\n gir 66.67 62.50 NaN 56.94 48.61 NaN\n scramble 54.17 62.96 NaN 64.52 67.57 NaN\nAbraham Ancer drv 281.90 295.60 278.10 NaN 301.80 299.90\n gir 48.61 63.89 90.28 NaN 61.11 70.83\n... ... ... ... ... ... ...\nWill Zalatoris gir NaN NaN NaN 69.44 61.11 58.33\n scramble NaN NaN NaN 59.09 64.29 56.67\nXinjun Zhang drv 288.80 NaN NaN NaN NaN NaN\n gir 50.00 NaN NaN NaN NaN NaN\n scramble 52.78 NaN NaN NaN NaN NaN\n\n[558 rows x 6 columns]", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
20202021
apipgarbchapipgarbch
playerstat
Byeong Hun Andrv291.80286.60NaN290.60306.50NaN
gir66.6762.50NaN56.9448.61NaN
scramble54.1762.96NaN64.5267.57NaN
Abraham Ancerdrv281.90295.60278.10NaN301.80299.90
gir48.6163.8990.28NaN61.1170.83
........................
Will ZalatorisgirNaNNaNNaN69.4461.1158.33
scrambleNaNNaNNaN59.0964.2956.67
Xinjun Zhangdrv288.80NaNNaNNaNNaNNaN
gir50.00NaNNaNNaNNaNNaN
scramble52.78NaNNaNNaNNaNNaN
\n

558 rows × 6 columns

\n
" }, "metadata": {} } ], "source": [ "# add the second year\n", "df_2021 = year_2_df('2021', ['api', 'pga', 'rbch'], ['drv', 'gir', 'scramble'])\n", "df_comb = pd.merge(df_2020, df_2021, how='outer', on=['player', 'stat'])\n", "ndx_sort2 = sorted(df_comb.index,key=lambda x: re.split(r'\\W+', x[0])[-1])\n", "df_comb = df_comb.reindex(ndx_sort2)\n", "display(df_comb)" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [], "source": [ "# decided to test using an additional year, 2019\n", "# so need to get some more stats into csv files\n", "def stats_2_csv(tyrs, tids, psts):\n", " global d_dir, events, stats, st_cols\n", " for tid in tids:\n", " eid = events[tid]\n", " for tyr in tyrs:\n", " for pst in psts:\n", " print(f\"\\n{tid}, {tyr}, {pst} ->\", end='')\n", " stid = stats[pst]\n", " tlnk = f'https://www.pgatour.com/content/pgatour/stats/stat.{stid}.y{tyr}.eon.{eid}.html'\n", " f_out = get_csv_nm(tyr, tid, pst)\n", " if Path(f_out).is_file():\n", " print(\" already exists, not downloaded again\")\n", " else:\n", " tmp_stats = pd.read_html(tlnk)\n", " if len(tmp_stats) <= 1:\n", " print(\" not found on site\")\n", " break\n", " df_stats = tmp_stats[1][st_cols[pst]]\n", " df_stats.to_csv(f_out, index=False)\n", " print(\" downloaded and saved to CSV\")" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "\napi, 2019, drv -> already exists, not downloaded again\n\napi, 2019, gir -> already exists, not downloaded again\n\napi, 2019, scramble -> already exists, not downloaded again\n\npga, 2019, drv -> already exists, not downloaded again\n\npga, 2019, gir -> already exists, not downloaded again\n\npga, 2019, scramble -> already exists, not downloaded again\n\nrbch, 2019, drv -> already exists, not downloaded again\n\nrbch, 2019, gir -> already exists, not downloaded again\n\nrbch, 2019, scramble -> already exists, not downloaded again\n" ] } ], "source": [ "p_sts = ['drv', 'gir', 'scramble']\n", "t_ids = ['api', 'pga', 'rbch']\n", "t_yrs = ['2019']\n", "stats_2_csv(t_yrs, t_ids, p_sts)" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [], "source": [ "# now a new function to build the final df by combining the ones for each year\n", "def golf_stats_2_df(tyrs, tids, psts):\n", " if len(tyrs) < 1:\n", " return None\n", " else: \n", " df1 = year_2_df(tyrs[0], tids, psts)\n", " if len(tyrs) == 1:\n", " return df1\n", " else:\n", " df2 = year_2_df(tyrs[1], tids, psts)\n", " df_comb = pd.merge(df1, df2, how='outer', on=['player', 'stat'])\n", " if len(tyrs) > 2:\n", " for tyr in tyrs[2:]:\n", " df_tmp = year_2_df(tyr, tids, psts)\n", " df_comb = pd.merge(df_comb, df_tmp, how='outer', on=['player', 'stat'])\n", "\n", " ndx_sort2 = sorted(df_comb.index,key=lambda x: re.split(r'\\W+', x[0])[-1])\n", " df_comb = df_comb.reindex(ndx_sort2)\n", "\n", " return df_comb" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "output_type": "display_data", "data": { "text/plain": " 2019 2020 2021 \\\n api pga rbch api pga rbch api \nplayer stat \nByeong Hun An drv 312.90 NaN NaN 291.80 286.60 NaN 290.60 \n gir 63.89 NaN NaN 66.67 62.50 NaN 56.94 \n scramble 69.23 NaN NaN 54.17 62.96 NaN 64.52 \nAbraham Ancer drv NaN 285.60 NaN 281.90 295.60 278.10 NaN \n gir NaN 56.94 NaN 48.61 63.89 90.28 NaN \n... ... ... ... ... ... ... ... \nWill Zalatoris gir NaN NaN NaN NaN NaN NaN 69.44 \n scramble NaN NaN NaN NaN NaN NaN 59.09 \nXinjun Zhang drv NaN NaN NaN 288.80 NaN NaN NaN \n gir NaN NaN NaN 50.00 NaN NaN NaN \n scramble NaN NaN NaN 52.78 NaN NaN NaN \n\n \n pga rbch \nplayer stat \nByeong Hun An drv 306.50 NaN \n gir 48.61 NaN \n scramble 67.57 NaN \nAbraham Ancer drv 301.80 299.90 \n gir 61.11 70.83 \n... ... ... \nWill Zalatoris gir 61.11 58.33 \n scramble 64.29 56.67 \nXinjun Zhang drv NaN NaN \n gir NaN NaN \n scramble NaN NaN \n\n[678 rows x 9 columns]", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
201920202021
apipgarbchapipgarbchapipgarbch
playerstat
Byeong Hun Andrv312.90NaNNaN291.80286.60NaN290.60306.50NaN
gir63.89NaNNaN66.6762.50NaN56.9448.61NaN
scramble69.23NaNNaN54.1762.96NaN64.5267.57NaN
Abraham AncerdrvNaN285.60NaN281.90295.60278.10NaN301.80299.90
girNaN56.94NaN48.6163.8990.28NaN61.1170.83
.................................
Will ZalatorisgirNaNNaNNaNNaNNaNNaN69.4461.1158.33
scrambleNaNNaNNaNNaNNaNNaN59.0964.2956.67
Xinjun ZhangdrvNaNNaNNaN288.80NaNNaNNaNNaNNaN
girNaNNaNNaN50.00NaNNaNNaNNaNNaN
scrambleNaNNaNNaN52.78NaNNaNNaNNaNNaN
\n

678 rows × 9 columns

\n
" }, "metadata": {} } ], "source": [ "# test time\n", "# manually edidted the appropriate csv files and removed the ', Jr.' for the Ted Potter row\n", "all_yrs = ['2019', '2020', '2021']\n", "golf_stats = golf_stats_2_df(all_yrs, t_ids, p_sts)\n", "display(golf_stats)\n", "# want to see what the csv looks like\n", "golf_csv = f'{d_dir}golf_stats.test.csv'\n", "golf_stats.to_csv(golf_csv)" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [], "source": [ "# all that's left is to figure out how to use this data set in some meaningful fashion" ] } ] }