{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from IPython.core.interactiveshell import InteractiveShell\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt # for plotting \n",
    "import seaborn as sns # for plotting\n",
    "import re"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# set up some notebook display defaults\n",
    "InteractiveShell.ast_node_interactivity = \"all\"\n",
    "%matplotlib inline\n",
    "plt.style.use('default')\n",
    "sns.set()\n",
    "pd.options.display.float_format = '{:,.2f}'.format"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# paths to datasets, current and future\n",
    "kaggle_trn = \"./data/titanic/train.csv\"\n",
    "kaggle_tst = \"./data/titanic/test.csv\"\n",
    "kaggle_tst_2 = \"./data/titanic/test_2.csv\"\n",
    "rek_k_tst = \"./data/titanic/rek_test.csv\"\n",
    "rek_k_tst2 = \"./data/titanic/rek_test_2.csv\"\n",
    "kaggle_trg = \"./data/titanic/target.csv\"\n",
    "kaggle_trg_2 = \"./data/titanic/target_2.csv\"\n",
    "osf_full = \"./data/titanic/osf_titanic.csv\"\n",
    "MYEkMl_full = \"./data/titanic/phpMYEkMl.csv\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# load the datasets of interest\n",
    "k_tst = pd.read_csv(kaggle_tst)\n",
    "ekml_f = pd.read_csv(MYEkMl_full)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# some more testing\n",
    "# previous time: ekml: Assaf Khalil, Mrs. Mariana (Miriam) not found!\n",
    "\n",
    "k_pass = k_tst[k_tst[\"Name\"] == \"Assaf Khalil, Mrs. Mariana (Miriam)\"]\n",
    "print(k_pass)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "k_pass_2 = k_tst[k_tst[\"Name\"].str.startswith(\"Assaf Khalil, Mrs. Mariana\")]\n",
    "print(k_pass_2)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Turns out the name in k_tst looks like: `Assaf Khalil, Mrs. Mariana (Miriam\"\")\"\"`. There were ~30 other instances of the unnecessary double quotes in the file. So, used text editor to remove them (i.e. replaced with empty string). In a new file, *test_2.csv*."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# okay, let's load the revised test dataset\n",
    "k_tst2 = pd.read_csv(kaggle_tst_2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "k_pass_3 = k_tst2[k_tst2[\"Name\"] == \"Assaf Khalil, Mrs. Mariana (Miriam)\"]\n",
    "print(k_pass_3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# can we find it in ekml\n",
    "k_pass_4 = ekml_f[ekml_f[\"name\"] == \"Assaf Khalil, Mrs. Mariana (Miriam)\"]\n",
    "print(k_pass_4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# okay why not\n",
    "k_pass_5 = ekml_f[ekml_f[\"name\"].str.startswith(\"Assaf Khalil, Mrs. Mariana\")]\n",
    "print(k_pass_5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# okay let's look at the next one not found last time\n",
    "# Johnston, Mrs. Andrew G (Elizabeth Lily Watson)\n",
    "k_pass_6 = ekml_f[ekml_f[\"name\"].str.startswith(\"Johnston, Mrs. Andrew G\")]\n",
    "print(k_pass_6)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# and the third\n",
    "# Katavelas, Mr. Vassilios (Catavelas Vassilios)\n",
    "k_pass_7 = ekml_f[ekml_f[\"name\"].str.startswith(\"Katavelas, Mr. Vassilios\")]\n",
    "print(k_pass_7)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# and the fourth\n",
    "# Coutts, Mrs. William (Winnie Minnie Treanor)\n",
    "k_pass_8 = ekml_f[ekml_f[\"name\"].str.startswith(\"Coutts, Mrs. William\")]\n",
    "print(k_pass_8)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "I think I am seeing a trend. Let's try coding for this situation. Then see if any other issues crop up."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# going to store passenger name along with survival status so can check I got things right\n",
    "# we will start slow\n",
    "i = 0 # for testing only\n",
    "with open(kaggle_trg_2, 'w') as trg_fh:\n",
    "  t_out = trg_fh.write(\"Survived,TName\\n\")\n",
    "  for _, rw in k_tst2.iterrows():\n",
    "    tst_nm = rw.Name\n",
    "    ekml_rw = ekml_f[ekml_f[\"name\"] == tst_nm]\n",
    "    if len(ekml_rw) > 1:\n",
    "      print(f\"\\n{tst_nm} -> found: {len(ekml_rw)} entry/entries\")\n",
    "      print(ekml_rw)\n",
    "    elif len(ekml_rw == 1):\n",
    "      pass\n",
    "    else:\n",
    "      print(f\"\\n{tst_nm} -> not found\")\n",
    "\n",
    "    i += 1\n",
    "    if i >= 25:\n",
    "      break"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# okay, let's deal with more than 1 name first\n",
    "# we'll try using the ticket feature to select from the set of rows\n",
    "i = 0 # for testing only\n",
    "with open(kaggle_trg_2, 'w') as trg_fh:\n",
    "  t_out = trg_fh.write(\"Survived,TName\\n\")\n",
    "  for _, rw in k_tst2.iterrows():\n",
    "    tst_nm = rw.Name\n",
    "    ekml_rw = ekml_f[ekml_f[\"name\"] == tst_nm]\n",
    "    if len(ekml_rw) > 1:\n",
    "      # print(f\"\\n{tst_nm} -> found: {len(ekml_rw)} entry/entries\")\n",
    "      # print(ekml_rw)\n",
    "      tkt = rw.Ticket\n",
    "      ekml_tkt = ekml_rw[ekml_rw[\"ticket\"] == tkt]\n",
    "      if len(ekml_tkt) == 1:\n",
    "        # print(f\"want ticket {tkt} -> found:\\n{ekml_tkt}\")\n",
    "        ekml_srvv = ekml_tkt.survived.item()\n",
    "        ekml_nm = ekml_tkt.name.item()\n",
    "        t_out = trg_fh.write(f'{int(ekml_srvv)},\"{ekml_nm}\"\\n')\n",
    "      else:\n",
    "        print(f\"ekml: {tst_nm} w/ticket {tkt} not found\")\n",
    "        t_out = trg_fh.write(f'?,\"{ekml_nm} w/TICKET {tkt} NOT FOUND\"\\n')\n",
    "    elif len(ekml_rw == 1):\n",
    "      ekml_nm = ekml_rw.name.item()\n",
    "      ekml_srvv = ekml_rw.survived.item()\n",
    "      t_out = trg_fh.write(f'{int(ekml_srvv)},\"{ekml_nm}\"\\n')\n",
    "    elif len(ekml_rw) == 0:\n",
    "      print(f\"\\n{tst_nm} -> not found\")\n",
    "      t_out = trg_fh.write(f'?,\"{tst_nm} NOT FOUND\"\\n')\n",
    "    else:\n",
    "      print(f\"\\n{tst_nm} -> ! else: !, else what\")\n",
    "      t_out = trg_fh.write(f'?,\"{tst_nm} HIT ELSE: IN LOOP?\"\\n')\n",
    "\n",
    "    i += 1\n",
    "    if i >= 25:\n",
    "      break"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# now let's deal with those name fields with '()' in them\n",
    "i = 0 # for testing only\n",
    "with open(kaggle_trg_2, 'w') as trg_fh:\n",
    "  t_out = trg_fh.write(\"Survived,TName\\n\")\n",
    "  for _, rw in k_tst2.iterrows():\n",
    "    tst_nm = rw.Name\n",
    "    ekml_rw = ekml_f[ekml_f[\"name\"] == tst_nm]\n",
    "    if len(ekml_rw) > 1:\n",
    "      tkt = rw.Ticket\n",
    "      ekml_tkt = ekml_rw[ekml_rw[\"ticket\"] == tkt]\n",
    "      if len(ekml_tkt) == 1:\n",
    "        ekml_srvv = ekml_tkt.survived.item()\n",
    "        ekml_nm = ekml_tkt.name.item()\n",
    "        t_out = trg_fh.write(f'{int(ekml_srvv)},\"{ekml_nm}\"\\n')\n",
    "      else:\n",
    "        print(f\"ekml: {tst_nm} w/ticket {tkt} not found\")\n",
    "        t_out = trg_fh.write(f'?,\"{ekml_nm} w/TICKET {tkt} NOT FOUND\"\\n')\n",
    "    elif len(ekml_rw == 1):\n",
    "      ekml_nm = ekml_rw.name.item()\n",
    "      ekml_srvv = ekml_rw.survived.item()\n",
    "      t_out = trg_fh.write(f'{int(ekml_srvv)},\"{ekml_nm}\"\\n')\n",
    "    elif len(ekml_rw) == 0:\n",
    "      if \"(\" in tst_nm:\n",
    "        print(f\"\\n{tst_nm} contains brackets, let's remove that section and try again\")\n",
    "        t_nm2 = re.match(\"(.*?)\\(\",tst_nm).group(1)\n",
    "        print(f\"\\ttry {t_nm2}\")\n",
    "        ekml_re = ekml_f[ekml_f[\"name\"].str.contains(t_nm2)]\n",
    "        if len(ekml_re) == 1:\n",
    "          ekml_nm = ekml_re.name.item()\n",
    "          ekml_srvv = ekml_re.survived.item()\n",
    "          t_out = trg_fh.write(f'{int(ekml_srvv)},\"{ekml_nm}\"\\n')\n",
    "        else:\n",
    "          print(f\"{tst_nm} -> {t_nm2} found {ekml_re}\")\n",
    "          t_out = trg_fh.write(f'?,\"{tst_nm} NOT FOUND\"\\n')\n",
    "      else:\n",
    "        print(f\"\\n{tst_nm} -> not found\")\n",
    "        t_out = trg_fh.write(f'?,\"{tst_nm} NOT FOUND\"\\n')\n",
    "    else:\n",
    "      print(f\"\\n{tst_nm} -> ! else: !, else what\")\n",
    "      t_out = trg_fh.write(f'?,\"{tst_nm} HIT ELSE: IN LOOP?\"\\n')\n",
    "\n",
    "    i += 1\n",
    "    if i >= 25:\n",
    "      break"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# now let's go for it\n",
    "with open(kaggle_trg_2, 'w') as trg_fh:\n",
    "  t_out = trg_fh.write(\"Survived,TName\\n\")\n",
    "  for _, rw in k_tst2.iterrows():\n",
    "    tst_nm = rw.Name\n",
    "    ekml_rw = ekml_f[ekml_f[\"name\"] == tst_nm]\n",
    "    if len(ekml_rw) > 1:\n",
    "      tkt = rw.Ticket\n",
    "      ekml_tkt = ekml_rw[ekml_rw[\"ticket\"] == tkt]\n",
    "      if len(ekml_tkt) == 1:\n",
    "        ekml_srvv = ekml_tkt.survived.item()\n",
    "        ekml_nm = ekml_tkt.name.item()\n",
    "        t_out = trg_fh.write(f'{int(ekml_srvv)},\"{ekml_nm}\"\\n')\n",
    "      else:\n",
    "        print(f\"ekml: {tst_nm} w/ticket {tkt} not found\")\n",
    "        t_out = trg_fh.write(f'?,\"{ekml_nm} w/TICKET {tkt} NOT FOUND\"\\n')\n",
    "    elif len(ekml_rw == 1):\n",
    "      ekml_nm = ekml_rw.name.item()\n",
    "      ekml_srvv = ekml_rw.survived.item()\n",
    "      t_out = trg_fh.write(f'{int(ekml_srvv)},\"{ekml_nm}\"\\n')\n",
    "    elif len(ekml_rw) == 0:\n",
    "      if \"(\" in tst_nm:\n",
    "        # print(f\"\\n{tst_nm} contains brackets, let's remove that section and try again\")\n",
    "        t_nm2 = re.match(\"(.*?)\\(\",tst_nm).group(1)\n",
    "        # print(f\"\\ttry {t_nm2}\")\n",
    "        ekml_re = ekml_f[ekml_f[\"name\"].str.contains(t_nm2)]\n",
    "        if len(ekml_re) == 1:\n",
    "          ekml_nm = ekml_re.name.item()\n",
    "          ekml_srvv = ekml_re.survived.item()\n",
    "          t_out = trg_fh.write(f'{int(ekml_srvv)},\"{ekml_nm}\"\\n')\n",
    "        else:\n",
    "          print(f\"{tst_nm} -> {t_nm2} found {ekml_re}\")\n",
    "          t_out = trg_fh.write(f'?,\"{tst_nm} -> {t_nm2} NOT FOUND\"\\n')\n",
    "      else:\n",
    "        print(f\"\\n{tst_nm} -> not found\")\n",
    "        t_out = trg_fh.write(f'?,\"{tst_nm} NOT FOUND\"\\n')\n",
    "    else:\n",
    "      print(f\"\\n{tst_nm} -> ! else: !, else what\")\n",
    "      t_out = trg_fh.write(f'?,\"{tst_nm} HIT ELSE: IN LOOP?\"\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# another fix, but how\n",
    "with open(kaggle_trg_2, 'w') as trg_fh:\n",
    "  t_out = trg_fh.write(\"Survived,TName\\n\")\n",
    "  for _, rw in k_tst2.iterrows():\n",
    "    tst_nm = rw.Name\n",
    "    ekml_rw = ekml_f[ekml_f[\"name\"] == tst_nm]\n",
    "    if len(ekml_rw) > 1:\n",
    "      tkt = rw.Ticket\n",
    "      ekml_tkt = ekml_rw[ekml_rw[\"ticket\"] == tkt]\n",
    "      if len(ekml_tkt) == 1:\n",
    "        ekml_srvv = ekml_tkt.survived.item()\n",
    "        ekml_nm = ekml_tkt.name.item()\n",
    "        t_out = trg_fh.write(f'{int(ekml_srvv)},\"{ekml_nm}\"\\n')\n",
    "      else:\n",
    "        print(f\"ekml: {tst_nm} w/ticket {tkt} not found\")\n",
    "        t_out = trg_fh.write(f'?,\"{ekml_nm} w/TICKET {tkt} NOT FOUND\"\\n')\n",
    "    elif len(ekml_rw == 1):\n",
    "      ekml_nm = ekml_rw.name.item()\n",
    "      ekml_srvv = ekml_rw.survived.item()\n",
    "      t_out = trg_fh.write(f'{int(ekml_srvv)},\"{ekml_nm}\"\\n')\n",
    "    elif len(ekml_rw) == 0:\n",
    "      if \"(\" in tst_nm:\n",
    "        # print(f\"\\n{tst_nm} contains brackets, let's remove that section and try again\")\n",
    "        t_nm2 = re.match(\"(.*?)\\(\",tst_nm).group(1)\n",
    "        # print(f\"\\ttry {t_nm2}\")\n",
    "        ekml_re = ekml_f[ekml_f[\"name\"].str.contains(t_nm2)]\n",
    "        if len(ekml_re) == 1:\n",
    "          ekml_nm = ekml_re.name.item()\n",
    "          ekml_srvv = ekml_re.survived.item()\n",
    "          t_out = trg_fh.write(f'{int(ekml_srvv)},\"{ekml_nm}\"\\n')\n",
    "        else:\n",
    "          print(f\"{tst_nm} -> {t_nm2} found {ekml_re}\")\n",
    "          t_out = trg_fh.write(f'?,\"{tst_nm} -> {t_nm2} NOT FOUND\"\\n')\n",
    "      else:\n",
    "        # let's try to deal with those single quotes in the ekml file\n",
    "        nm_b4ls = tst_nm.rpartition(' ')[0]\n",
    "        ekml_b4ls = ekml_f[ekml_f[\"name\"].str.contains(nm_b4ls)]\n",
    "        if len(ekml_b4ls) == 1:\n",
    "          ekml_nm = ekml_b4ls.name.item()\n",
    "          ekml_srvv = ekml_b4ls.survived.item()\n",
    "          t_out = trg_fh.write(f'{int(ekml_srvv)},\"{ekml_nm}\"\\n')\n",
    "        else:\n",
    "          print(f\"\\n{tst_nm} -> not found\")\n",
    "          t_out = trg_fh.write(f'?,\"{tst_nm} NOT FOUND\"\\n')\n",
    "    else:\n",
    "      print(f\"\\n{tst_nm} -> ! else: !, else what\")\n",
    "      t_out = trg_fh.write(f'?,\"{tst_nm} HIT ELSE: IN LOOP?\"\\n')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# now a little checking\n",
    "targ_2 = pd.read_csv(kaggle_trg_2)\n",
    "targ_2.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "rek_tst = pd.concat([k_tst2, targ_2], axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "rek_tst.head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "rek_tst.tail(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "rek_tst.to_csv(rek_k_tst2, index=False)"
   ]
  }
 ],
 "metadata": {
  "interpreter": {
   "hash": "a27d3f2bf68df5402465348834a2195030d3fc5bfc8e594e2a17c8c7e2447c85"
  },
  "kernelspec": {
   "display_name": "Python 3.9.2 64-bit ('ds-3.9': conda)",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.2"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}