{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from IPython.core.interactiveshell import InteractiveShell\n", "import numpy as np\n", "import pandas as pd\n", "import matplotlib.pyplot as plt # for plotting \n", "import seaborn as sns # for plotting\n", "import re" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# set up some notebook display defaults\n", "InteractiveShell.ast_node_interactivity = \"all\"\n", "%matplotlib inline\n", "plt.style.use('default')\n", "sns.set()\n", "pd.options.display.float_format = '{:,.2f}'.format" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# paths to datasets, current and future\n", "kaggle_trn = \"./data/titanic/train.csv\"\n", "kaggle_tst = \"./data/titanic/test.csv\"\n", "kaggle_tst_2 = \"./data/titanic/test_2.csv\"\n", "rek_k_tst = \"./data/titanic/rek_test.csv\"\n", "rek_k_tst2 = \"./data/titanic/rek_test_2.csv\"\n", "kaggle_trg = \"./data/titanic/target.csv\"\n", "kaggle_trg_2 = \"./data/titanic/target_2.csv\"\n", "osf_full = \"./data/titanic/osf_titanic.csv\"\n", "MYEkMl_full = \"./data/titanic/phpMYEkMl.csv\"" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# load the datasets of interest\n", "k_tst = pd.read_csv(kaggle_tst)\n", "ekml_f = pd.read_csv(MYEkMl_full)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# some more testing\n", "# previous time: ekml: Assaf Khalil, Mrs. Mariana (Miriam) not found!\n", "\n", "k_pass = k_tst[k_tst[\"Name\"] == \"Assaf Khalil, Mrs. Mariana (Miriam)\"]\n", "print(k_pass)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "k_pass_2 = k_tst[k_tst[\"Name\"].str.startswith(\"Assaf Khalil, Mrs. Mariana\")]\n", "print(k_pass_2)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Turns out the name in k_tst looks like: `Assaf Khalil, Mrs. Mariana (Miriam\"\")\"\"`. There were ~30 other instances of the unnecessary double quotes in the file. So, used text editor to remove them (i.e. replaced with empty string). In a new file, *test_2.csv*." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# okay, let's load the revised test dataset\n", "k_tst2 = pd.read_csv(kaggle_tst_2)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "k_pass_3 = k_tst2[k_tst2[\"Name\"] == \"Assaf Khalil, Mrs. Mariana (Miriam)\"]\n", "print(k_pass_3)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# can we find it in ekml\n", "k_pass_4 = ekml_f[ekml_f[\"name\"] == \"Assaf Khalil, Mrs. Mariana (Miriam)\"]\n", "print(k_pass_4)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# okay why not\n", "k_pass_5 = ekml_f[ekml_f[\"name\"].str.startswith(\"Assaf Khalil, Mrs. Mariana\")]\n", "print(k_pass_5)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# okay let's look at the next one not found last time\n", "# Johnston, Mrs. Andrew G (Elizabeth Lily Watson)\n", "k_pass_6 = ekml_f[ekml_f[\"name\"].str.startswith(\"Johnston, Mrs. Andrew G\")]\n", "print(k_pass_6)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# and the third\n", "# Katavelas, Mr. Vassilios (Catavelas Vassilios)\n", "k_pass_7 = ekml_f[ekml_f[\"name\"].str.startswith(\"Katavelas, Mr. Vassilios\")]\n", "print(k_pass_7)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# and the fourth\n", "# Coutts, Mrs. William (Winnie Minnie Treanor)\n", "k_pass_8 = ekml_f[ekml_f[\"name\"].str.startswith(\"Coutts, Mrs. William\")]\n", "print(k_pass_8)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "I think I am seeing a trend. Let's try coding for this situation. Then see if any other issues crop up." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# going to store passenger name along with survival status so can check I got things right\n", "# we will start slow\n", "i = 0 # for testing only\n", "with open(kaggle_trg_2, 'w') as trg_fh:\n", " t_out = trg_fh.write(\"Survived,TName\\n\")\n", " for _, rw in k_tst2.iterrows():\n", " tst_nm = rw.Name\n", " ekml_rw = ekml_f[ekml_f[\"name\"] == tst_nm]\n", " if len(ekml_rw) > 1:\n", " print(f\"\\n{tst_nm} -> found: {len(ekml_rw)} entry/entries\")\n", " print(ekml_rw)\n", " elif len(ekml_rw == 1):\n", " pass\n", " else:\n", " print(f\"\\n{tst_nm} -> not found\")\n", "\n", " i += 1\n", " if i >= 25:\n", " break" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# okay, let's deal with more than 1 name first\n", "# we'll try using the ticket feature to select from the set of rows\n", "i = 0 # for testing only\n", "with open(kaggle_trg_2, 'w') as trg_fh:\n", " t_out = trg_fh.write(\"Survived,TName\\n\")\n", " for _, rw in k_tst2.iterrows():\n", " tst_nm = rw.Name\n", " ekml_rw = ekml_f[ekml_f[\"name\"] == tst_nm]\n", " if len(ekml_rw) > 1:\n", " # print(f\"\\n{tst_nm} -> found: {len(ekml_rw)} entry/entries\")\n", " # print(ekml_rw)\n", " tkt = rw.Ticket\n", " ekml_tkt = ekml_rw[ekml_rw[\"ticket\"] == tkt]\n", " if len(ekml_tkt) == 1:\n", " # print(f\"want ticket {tkt} -> found:\\n{ekml_tkt}\")\n", " ekml_srvv = ekml_tkt.survived.item()\n", " ekml_nm = ekml_tkt.name.item()\n", " t_out = trg_fh.write(f'{int(ekml_srvv)},\"{ekml_nm}\"\\n')\n", " else:\n", " print(f\"ekml: {tst_nm} w/ticket {tkt} not found\")\n", " t_out = trg_fh.write(f'?,\"{ekml_nm} w/TICKET {tkt} NOT FOUND\"\\n')\n", " elif len(ekml_rw == 1):\n", " ekml_nm = ekml_rw.name.item()\n", " ekml_srvv = ekml_rw.survived.item()\n", " t_out = trg_fh.write(f'{int(ekml_srvv)},\"{ekml_nm}\"\\n')\n", " elif len(ekml_rw) == 0:\n", " print(f\"\\n{tst_nm} -> not found\")\n", " t_out = trg_fh.write(f'?,\"{tst_nm} NOT FOUND\"\\n')\n", " else:\n", " print(f\"\\n{tst_nm} -> ! else: !, else what\")\n", " t_out = trg_fh.write(f'?,\"{tst_nm} HIT ELSE: IN LOOP?\"\\n')\n", "\n", " i += 1\n", " if i >= 25:\n", " break" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# now let's deal with those name fields with '()' in them\n", "i = 0 # for testing only\n", "with open(kaggle_trg_2, 'w') as trg_fh:\n", " t_out = trg_fh.write(\"Survived,TName\\n\")\n", " for _, rw in k_tst2.iterrows():\n", " tst_nm = rw.Name\n", " ekml_rw = ekml_f[ekml_f[\"name\"] == tst_nm]\n", " if len(ekml_rw) > 1:\n", " tkt = rw.Ticket\n", " ekml_tkt = ekml_rw[ekml_rw[\"ticket\"] == tkt]\n", " if len(ekml_tkt) == 1:\n", " ekml_srvv = ekml_tkt.survived.item()\n", " ekml_nm = ekml_tkt.name.item()\n", " t_out = trg_fh.write(f'{int(ekml_srvv)},\"{ekml_nm}\"\\n')\n", " else:\n", " print(f\"ekml: {tst_nm} w/ticket {tkt} not found\")\n", " t_out = trg_fh.write(f'?,\"{ekml_nm} w/TICKET {tkt} NOT FOUND\"\\n')\n", " elif len(ekml_rw == 1):\n", " ekml_nm = ekml_rw.name.item()\n", " ekml_srvv = ekml_rw.survived.item()\n", " t_out = trg_fh.write(f'{int(ekml_srvv)},\"{ekml_nm}\"\\n')\n", " elif len(ekml_rw) == 0:\n", " if \"(\" in tst_nm:\n", " print(f\"\\n{tst_nm} contains brackets, let's remove that section and try again\")\n", " t_nm2 = re.match(\"(.*?)\\(\",tst_nm).group(1)\n", " print(f\"\\ttry {t_nm2}\")\n", " ekml_re = ekml_f[ekml_f[\"name\"].str.contains(t_nm2)]\n", " if len(ekml_re) == 1:\n", " ekml_nm = ekml_re.name.item()\n", " ekml_srvv = ekml_re.survived.item()\n", " t_out = trg_fh.write(f'{int(ekml_srvv)},\"{ekml_nm}\"\\n')\n", " else:\n", " print(f\"{tst_nm} -> {t_nm2} found {ekml_re}\")\n", " t_out = trg_fh.write(f'?,\"{tst_nm} NOT FOUND\"\\n')\n", " else:\n", " print(f\"\\n{tst_nm} -> not found\")\n", " t_out = trg_fh.write(f'?,\"{tst_nm} NOT FOUND\"\\n')\n", " else:\n", " print(f\"\\n{tst_nm} -> ! else: !, else what\")\n", " t_out = trg_fh.write(f'?,\"{tst_nm} HIT ELSE: IN LOOP?\"\\n')\n", "\n", " i += 1\n", " if i >= 25:\n", " break" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# now let's go for it\n", "with open(kaggle_trg_2, 'w') as trg_fh:\n", " t_out = trg_fh.write(\"Survived,TName\\n\")\n", " for _, rw in k_tst2.iterrows():\n", " tst_nm = rw.Name\n", " ekml_rw = ekml_f[ekml_f[\"name\"] == tst_nm]\n", " if len(ekml_rw) > 1:\n", " tkt = rw.Ticket\n", " ekml_tkt = ekml_rw[ekml_rw[\"ticket\"] == tkt]\n", " if len(ekml_tkt) == 1:\n", " ekml_srvv = ekml_tkt.survived.item()\n", " ekml_nm = ekml_tkt.name.item()\n", " t_out = trg_fh.write(f'{int(ekml_srvv)},\"{ekml_nm}\"\\n')\n", " else:\n", " print(f\"ekml: {tst_nm} w/ticket {tkt} not found\")\n", " t_out = trg_fh.write(f'?,\"{ekml_nm} w/TICKET {tkt} NOT FOUND\"\\n')\n", " elif len(ekml_rw == 1):\n", " ekml_nm = ekml_rw.name.item()\n", " ekml_srvv = ekml_rw.survived.item()\n", " t_out = trg_fh.write(f'{int(ekml_srvv)},\"{ekml_nm}\"\\n')\n", " elif len(ekml_rw) == 0:\n", " if \"(\" in tst_nm:\n", " # print(f\"\\n{tst_nm} contains brackets, let's remove that section and try again\")\n", " t_nm2 = re.match(\"(.*?)\\(\",tst_nm).group(1)\n", " # print(f\"\\ttry {t_nm2}\")\n", " ekml_re = ekml_f[ekml_f[\"name\"].str.contains(t_nm2)]\n", " if len(ekml_re) == 1:\n", " ekml_nm = ekml_re.name.item()\n", " ekml_srvv = ekml_re.survived.item()\n", " t_out = trg_fh.write(f'{int(ekml_srvv)},\"{ekml_nm}\"\\n')\n", " else:\n", " print(f\"{tst_nm} -> {t_nm2} found {ekml_re}\")\n", " t_out = trg_fh.write(f'?,\"{tst_nm} -> {t_nm2} NOT FOUND\"\\n')\n", " else:\n", " print(f\"\\n{tst_nm} -> not found\")\n", " t_out = trg_fh.write(f'?,\"{tst_nm} NOT FOUND\"\\n')\n", " else:\n", " print(f\"\\n{tst_nm} -> ! else: !, else what\")\n", " t_out = trg_fh.write(f'?,\"{tst_nm} HIT ELSE: IN LOOP?\"\\n')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# another fix, but how\n", "with open(kaggle_trg_2, 'w') as trg_fh:\n", " t_out = trg_fh.write(\"Survived,TName\\n\")\n", " for _, rw in k_tst2.iterrows():\n", " tst_nm = rw.Name\n", " ekml_rw = ekml_f[ekml_f[\"name\"] == tst_nm]\n", " if len(ekml_rw) > 1:\n", " tkt = rw.Ticket\n", " ekml_tkt = ekml_rw[ekml_rw[\"ticket\"] == tkt]\n", " if len(ekml_tkt) == 1:\n", " ekml_srvv = ekml_tkt.survived.item()\n", " ekml_nm = ekml_tkt.name.item()\n", " t_out = trg_fh.write(f'{int(ekml_srvv)},\"{ekml_nm}\"\\n')\n", " else:\n", " print(f\"ekml: {tst_nm} w/ticket {tkt} not found\")\n", " t_out = trg_fh.write(f'?,\"{ekml_nm} w/TICKET {tkt} NOT FOUND\"\\n')\n", " elif len(ekml_rw == 1):\n", " ekml_nm = ekml_rw.name.item()\n", " ekml_srvv = ekml_rw.survived.item()\n", " t_out = trg_fh.write(f'{int(ekml_srvv)},\"{ekml_nm}\"\\n')\n", " elif len(ekml_rw) == 0:\n", " if \"(\" in tst_nm:\n", " # print(f\"\\n{tst_nm} contains brackets, let's remove that section and try again\")\n", " t_nm2 = re.match(\"(.*?)\\(\",tst_nm).group(1)\n", " # print(f\"\\ttry {t_nm2}\")\n", " ekml_re = ekml_f[ekml_f[\"name\"].str.contains(t_nm2)]\n", " if len(ekml_re) == 1:\n", " ekml_nm = ekml_re.name.item()\n", " ekml_srvv = ekml_re.survived.item()\n", " t_out = trg_fh.write(f'{int(ekml_srvv)},\"{ekml_nm}\"\\n')\n", " else:\n", " print(f\"{tst_nm} -> {t_nm2} found {ekml_re}\")\n", " t_out = trg_fh.write(f'?,\"{tst_nm} -> {t_nm2} NOT FOUND\"\\n')\n", " else:\n", " # let's try to deal with those single quotes in the ekml file\n", " nm_b4ls = tst_nm.rpartition(' ')[0]\n", " ekml_b4ls = ekml_f[ekml_f[\"name\"].str.contains(nm_b4ls)]\n", " if len(ekml_b4ls) == 1:\n", " ekml_nm = ekml_b4ls.name.item()\n", " ekml_srvv = ekml_b4ls.survived.item()\n", " t_out = trg_fh.write(f'{int(ekml_srvv)},\"{ekml_nm}\"\\n')\n", " else:\n", " print(f\"\\n{tst_nm} -> not found\")\n", " t_out = trg_fh.write(f'?,\"{tst_nm} NOT FOUND\"\\n')\n", " else:\n", " print(f\"\\n{tst_nm} -> ! else: !, else what\")\n", " t_out = trg_fh.write(f'?,\"{tst_nm} HIT ELSE: IN LOOP?\"\\n')\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# now a little checking\n", "targ_2 = pd.read_csv(kaggle_trg_2)\n", "targ_2.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "rek_tst = pd.concat([k_tst2, targ_2], axis=1)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "rek_tst.head(10)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "rek_tst.tail(10)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "rek_tst.to_csv(rek_k_tst2, index=False)" ] } ], "metadata": { "interpreter": { "hash": "a27d3f2bf68df5402465348834a2195030d3fc5bfc8e594e2a17c8c7e2447c85" }, "kernelspec": { "display_name": "Python 3.9.2 64-bit ('ds-3.9': conda)", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.2" }, "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 2 }