{ "metadata": { "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.2" }, "orig_nbformat": 2, "kernelspec": { "name": "python392jvsc74a57bd0a27d3f2bf68df5402465348834a2195030d3fc5bfc8e594e2a17c8c7e2447c85", "display_name": "Python 3.9.2 64-bit" } }, "nbformat": 4, "nbformat_minor": 2, "cells": [
{ "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "import time" ] },
{ "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [
"# tournament key -> event id; fills the eon.ID segment of the stats URL\n",
"# (every id is 't' followed by three digits)\n",
"events = {'wmpo': 't003', 'tpc': 't011', 'rbch': 't012', 'masters': 't014', 'pga': 't033'}\n",
"# stat key -> stat id; fills the stat.ID segment of the stats URL\n",
"stats = {'drv': '101', 'gir': '103', 't2g': '02674'}\n",
"# stat key -> columns kept from the scraped table\n",
"st_cols = {\n",
"    'drv': ['PLAYER NAME', 'AVG.', 'TOTAL DISTANCE', 'TOTAL DRIVES'],\n",
"    'gir': ['PLAYER NAME', '%', 'GREENS HIT', '# HOLES', 'RELATIVE/PAR'],\n",
"    't2g': ['PLAYER NAME', 'AVERAGE', 'SG:OTT', 'SG:APR', 'SG:ARG'],\n",
"}" ] },
{ "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "# define the tournament, stats and years to collect\n", "t_ids = ['pga', 'rbch']\n", "t_yrs = ['2020', '2021']\n", "p_sts = ['drv', 'gir']" ] },
{ "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "# define the actual stats we will be getting\n", "t_ids = ['pga', 'rbch']\n", "t_yrs = ['2020', '2021']\n", "p_sts = ['drv', 'gir']" ] },
{ "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "\n", "pga, 2020, drv ->\n", " PLAYER NAME AVG. 
TOTAL DISTANCE TOTAL DRIVES\n", "0 Cameron Champ 321.1 2569 8\n", "1 Bryson DeChambeau 318.1 2545 8\n", "2 Rory McIlroy 312.5 2500 8\n", "3 Sepp Straka 305.8 2446 8\n", "4 Tommy Fleetwood 305.5 2444 8\n", "\n", "pga, 2020, gir ->\n", " PLAYER NAME % GREENS HIT # HOLES RELATIVE/PAR\n", "0 Matthew Wolff 77.78 56 72 -0.25\n", "1 Paul Casey 76.39 55 72 -0.27\n", "2 Jason Day 76.39 55 72 -0.27\n", "3 Louis Oosthuizen 73.61 53 72 -0.26\n", "4 Cameron Champ 73.61 53 72 -0.30\n", "\n", "pga, 2021, drv ->\n", " PLAYER NAME AVG. TOTAL DISTANCE TOTAL DRIVES\n", "0 Bryson DeChambeau 327.6 2621 8\n", "1 Dean Burmester 324.1 2593 8\n", "2 Rory McIlroy 320.8 2566 8\n", "3 Joaquin Niemann 320.1 2561 8\n", "4 Garrick Higgo 318.6 2549 8\n", "\n", "pga, 2021, gir ->\n", " PLAYER NAME % GREENS HIT # HOLES RELATIVE/PAR\n", "0 Charley Hoffman 70.83 51 72 -0.27\n", "1 Louis Oosthuizen 69.44 50 72 -0.28\n", "2 Martin Laird 69.44 50 72 -0.22\n", "3 Keegan Bradley 68.06 49 72 -0.20\n", "4 Paul Casey 68.06 49 72 -0.27\n", "\n", "rbch, 2020, drv ->\n", " PLAYER NAME AVG. TOTAL DISTANCE TOTAL DRIVES\n", "0 Erik van Rooyen 306.1 2449 8\n", "1 Justin Thomas 304.8 2438 8\n", "2 Brooks Koepka 303.4 2427 8\n", "3 Sepp Straka 300.5 2404 8\n", "4 Xander Schauffele 297.8 2382 8\n", "\n", "rbch, 2020, gir ->\n", " PLAYER NAME % GREENS HIT # HOLES RELATIVE/PAR\n", "0 Abraham Ancer 90.28 65 72 -0.34\n", "1 Brice Garnett 80.56 58 72 -0.34\n", "2 Corey Conners 79.17 57 72 -0.37\n", "3 Joaquin Niemann 79.17 57 72 -0.37\n", "4 Bryson DeChambeau 79.17 57 72 -0.37\n", "\n", "rbch, 2021, drv ->\n", " PLAYER NAME AVG. 
TOTAL DISTANCE TOTAL DRIVES\n", "0 Wyndham Clark 312.3 2498 8\n", "1 Stewart Cink 312.0 2496 8\n", "2 Will Zalatoris 312.0 2496 8\n", "3 Luke List 311.9 2495 8\n", "4 Sam Burns 310.9 2487 8\n", "\n", "rbch, 2021, gir ->\n", " PLAYER NAME % GREENS HIT # HOLES RELATIVE/PAR\n", "0 Stewart Cink 77.78 56 72 -0.39\n", "1 Collin Morikawa 77.78 56 72 -0.29\n", "2 Matthew NeSmith 76.39 55 72 -0.27\n", "3 Corey Conners 76.39 55 72 -0.33\n", "4 Emiliano Grillo 75.00 54 72 -0.30\n", "\n", "time diff: 1622147466.4122176 - 1622147435.031193 = 31.3810246\n" ] } ],
"source": [
"from pathlib import Path\n",
"\n",
"# Download one stats table per (tournament, year, stat) combination,\n",
"# show its first rows and save it as a CSV file under ./data.\n",
"\n",
"# to_csv does not create missing folders, so make sure the output folder exists\n",
"Path('./data').mkdir(parents=True, exist_ok=True)\n",
"\n",
"s_tm = time.time()\n",
"for t_id in t_ids:\n",
"    e_id = events[t_id]\n",
"    for t_yr in t_yrs:\n",
"        for p_st in p_sts:\n",
"            st_id = stats[p_st]\n",
"            t_lnk = f'https://www.pgatour.com/content/pgatour/stats/stat.{st_id}.y{t_yr}.eon.{e_id}.html'\n",
"            try:\n",
"                stat_1 = pd.read_html(t_lnk)\n",
"            except ValueError:\n",
"                # read_html raises ValueError when the page holds no tables at all\n",
"                stat_1 = []\n",
"            if len(stat_1) <= 1:\n",
"                # skip just this stat; a missing table for one stat must not drop the others\n",
"                print(f\"\\n{t_id}, {t_yr}, {p_st} -> no stats table found, skipped\")\n",
"                continue\n",
"            print(f\"\\n{t_id}, {t_yr}, {p_st} ->\")\n",
"            # the table at index 1 of the parsed page is the one that gets saved\n",
"            df_stats = stat_1[1][st_cols[p_st]]\n",
"            print(df_stats.head())\n",
"            f_out = f'./data/{t_id}_{t_yr}_{p_st}_2.test.csv'\n",
"            df_stats.to_csv(f_out, index=False)\n",
"f_tm = time.time()\n",
"t_diff = f_tm - s_tm\n",
"print(f\"\\ntime diff: {f_tm} - {s_tm} = {t_diff:.7f}\")"
] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ] }