In [None]:
from IPython.core.interactiveshell import InteractiveShell
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt # for plotting 
import seaborn as sns # for plotting
import re

In [None]:
# set up some notebook display defaults
# echo the value of every top-level expression in a cell, not only the last one
InteractiveShell.ast_node_interactivity = "all"
# render matplotlib figures inline in the cell output
%matplotlib inline
# reset matplotlib to its default style first, then apply seaborn's defaults on top
plt.style.use('default')
sns.set()
# show floats with a thousands separator and two decimal places
pd.options.display.float_format = '{:,.2f}'.format

In [None]:
# paths to datasets, current and future
# every file lives in the same directory, so each path is built from that one prefix
_titanic_dir = "./data/titanic"

# Kaggle competition files (the *_2 variants are the hand-cleaned copies)
kaggle_trn = f"{_titanic_dir}/train.csv"
kaggle_tst = f"{_titanic_dir}/test.csv"
kaggle_tst_2 = f"{_titanic_dir}/test_2.csv"

# test sets with the recovered target attached
rek_k_tst = f"{_titanic_dir}/rek_test.csv"
rek_k_tst2 = f"{_titanic_dir}/rek_test_2.csv"

# recovered target files
kaggle_trg = f"{_titanic_dir}/target.csv"
kaggle_trg_2 = f"{_titanic_dir}/target_2.csv"

# full passenger lists from other sources
osf_full = f"{_titanic_dir}/osf_titanic.csv"
MYEkMl_full = f"{_titanic_dir}/phpMYEkMl.csv"

In [None]:
# load the datasets of interest
# k_tst:  the Kaggle test set as downloaded (passenger names are in column "Name")
# ekml_f: the phpMYEkMl.csv passenger list; the cells below look passengers up in it
#         through its "name", "ticket" and "survived" columns
k_tst = pd.read_csv(kaggle_tst)
ekml_f = pd.read_csv(MYEkMl_full)

In [None]:
# Some more testing on a passenger reported missing by the previous run
# ("ekml: Assaf Khalil, Mrs. Mariana (Miriam) not found!"):
# look for an exact match on the name in the Kaggle test set.
k_pass = k_tst.loc[k_tst["Name"].eq("Assaf Khalil, Mrs. Mariana (Miriam)")]
print(k_pass)

In [None]:
# Same passenger, but matching only on the start of the name
# (everything before the parenthesised part).
k_pass_2 = k_tst.loc[k_tst["Name"].str.startswith("Assaf Khalil, Mrs. Mariana")]
print(k_pass_2)

Turns out the name in k_tst looks like: `Assaf Khalil, Mrs. Mariana (Miriam"")""`. There were ~30 other instances of these unnecessary double quotes in the file, so I used a text editor to remove them (i.e. replaced them with an empty string) and saved the result as a new file, *test_2.csv*.

In [None]:
# okay, let's load the revised test dataset
# (test_2.csv: the Kaggle test file with the stray double quotes removed by hand)
k_tst2 = pd.read_csv(kaggle_tst_2)

In [None]:
# Retry the exact-name lookup, this time against the cleaned test set.
k_pass_3 = k_tst2.loc[k_tst2["Name"].eq("Assaf Khalil, Mrs. Mariana (Miriam)")]
print(k_pass_3)

In [None]:
# Can we find the same passenger, under exactly this name, in ekml?
k_pass_4 = ekml_f.loc[ekml_f["name"].eq("Assaf Khalil, Mrs. Mariana (Miriam)")]
print(k_pass_4)

In [None]:
# Okay, why not? Match only on the part of the name before the parenthesis
# to see how ekml actually spells this passenger.
k_pass_5 = ekml_f.loc[ekml_f["name"].str.startswith("Assaf Khalil, Mrs. Mariana")]
print(k_pass_5)

In [None]:
# Next passenger that was not found last time:
#   Johnston, Mrs. Andrew G (Elizabeth Lily Watson)
# Again match on the name prefix before the parenthesis.
k_pass_6 = ekml_f.loc[ekml_f["name"].str.startswith("Johnston, Mrs. Andrew G")]
print(k_pass_6)

In [None]:
# Third passenger that was not found last time:
#   Katavelas, Mr. Vassilios (Catavelas Vassilios)
k_pass_7 = ekml_f.loc[ekml_f["name"].str.startswith("Katavelas, Mr. Vassilios")]
print(k_pass_7)

In [None]:
# Fourth passenger that was not found last time:
#   Coutts, Mrs. William (Winnie Minnie Treanor)
k_pass_8 = ekml_f.loc[ekml_f["name"].str.startswith("Coutts, Mrs. William")]
print(k_pass_8)

I think I am seeing a trend: every name that failed the exact match has a parenthesised section. Let's try coding for this situation, then see if any other issues crop up.

In [None]:
# going to store passenger name along with survival status so can check I got things right
# we will start slow: for the first few test passengers, only report how many rows of the
# ekml data carry exactly the same name. The target file gets its header line and nothing
# else at this stage.
n_rows = 25 # for testing only: size of the trial run
with open(kaggle_trg_2, 'w', encoding='utf-8') as trg_fh:
  trg_fh.write("Survived,TName\n")
  for _, rw in k_tst2.head(n_rows).iterrows():
    tst_nm = rw["Name"]
    ekml_rw = ekml_f[ekml_f["name"] == tst_nm]
    # count the matching rows once; note it is len(frame), not len(frame == 1)
    n_found = len(ekml_rw)
    if n_found > 1:
      print(f"\n{tst_nm} -> found: {n_found} entry/entries")
      print(ekml_rw)
    elif n_found == 0:
      print(f"\n{tst_nm} -> not found")
    # exactly one match is the expected case: nothing to report

In [None]:
# okay, let's deal with more than 1 name first
# we'll try using the ticket feature to select from the set of rows
#
# Every processed passenger produces one line in the target file:
#   <survived>,"<name as spelled in ekml>"   when exactly one ekml row is selected
#   ?,"<what was searched for> ... NOT FOUND"  otherwise
n_rows = 25 # for testing only: size of the trial run
with open(kaggle_trg_2, 'w', encoding='utf-8') as trg_fh:
  trg_fh.write("Survived,TName\n")
  for _, rw in k_tst2.head(n_rows).iterrows():
    tst_nm = rw["Name"]
    ekml_rw = ekml_f[ekml_f["name"] == tst_nm]
    n_found = len(ekml_rw)
    ekml_hit = None  # the single ekml row selected for this passenger, if any
    fail_txt = None  # text written next to '?' when no single row is selected

    if n_found == 1:
      ekml_hit = ekml_rw
    elif n_found > 1:
      # several passengers share this exact name: narrow down by ticket
      tkt = rw["Ticket"]
      ekml_tkt = ekml_rw[ekml_rw["ticket"] == tkt]
      if len(ekml_tkt) == 1:
        ekml_hit = ekml_tkt
      else:
        print(f"ekml: {tst_nm} w/ticket {tkt} not found")
        # no row was selected, so report the name we searched for
        fail_txt = f"{tst_nm} w/TICKET {tkt} NOT FOUND"
    else:
      print(f"\n{tst_nm} -> not found")
      fail_txt = f"{tst_nm} NOT FOUND"

    if ekml_hit is not None:
      ekml_nm = ekml_hit["name"].item()
      ekml_srvv = ekml_hit["survived"].item()
      trg_fh.write(f'{int(ekml_srvv)},"{ekml_nm}"\n')
    else:
      trg_fh.write(f'?,"{fail_txt}"\n')

In [None]:
# now let's deal with those name fields with '()' in them
#
# Lookup order for each passenger of the trial run:
#   1. exact name match in ekml
#   2. if several rows share the name, keep the one with the same ticket
#   3. if no row has the name and it contains '(', search ekml for the part of the
#      name that precedes the first '('
# Every processed passenger produces one line in the target file:
#   <survived>,"<name as spelled in ekml>"   or   ?,"<name> ... NOT FOUND"
n_rows = 25 # for testing only: size of the trial run
with open(kaggle_trg_2, 'w', encoding='utf-8') as trg_fh:
  trg_fh.write("Survived,TName\n")
  for _, rw in k_tst2.head(n_rows).iterrows():
    tst_nm = rw["Name"]
    ekml_rw = ekml_f[ekml_f["name"] == tst_nm]
    n_found = len(ekml_rw)
    ekml_hit = None  # the single ekml row selected for this passenger, if any
    fail_txt = None  # text written next to '?' when no single row is selected

    if n_found == 1:
      ekml_hit = ekml_rw
    elif n_found > 1:
      tkt = rw["Ticket"]
      ekml_tkt = ekml_rw[ekml_rw["ticket"] == tkt]
      if len(ekml_tkt) == 1:
        ekml_hit = ekml_tkt
      else:
        print(f"ekml: {tst_nm} w/ticket {tkt} not found")
        # no row was selected, so report the name we searched for
        fail_txt = f"{tst_nm} w/TICKET {tkt} NOT FOUND"
    elif "(" in tst_nm:
      print(f"\n{tst_nm} contains brackets, let's remove that section and try again")
      # keep everything before the first '(' (the trailing space is kept too)
      t_nm2 = tst_nm.partition("(")[0]
      print(f"\ttry {t_nm2}")
      # literal substring search: the '.' in "Mr."/"Mrs." must not act as a regex wildcard
      ekml_re = ekml_f[ekml_f["name"].str.contains(t_nm2, regex=False, na=False)]
      if len(ekml_re) == 1:
        ekml_hit = ekml_re
      else:
        print(f"{tst_nm} -> {t_nm2} found {ekml_re}")
        fail_txt = f"{tst_nm} NOT FOUND"
    else:
      print(f"\n{tst_nm} -> not found")
      fail_txt = f"{tst_nm} NOT FOUND"

    if ekml_hit is not None:
      ekml_nm = ekml_hit["name"].item()
      ekml_srvv = ekml_hit["survived"].item()
      trg_fh.write(f'{int(ekml_srvv)},"{ekml_nm}"\n')
    else:
      trg_fh.write(f'?,"{fail_txt}"\n')

In [None]:
# now let's go for it: same lookup as the trial run, over the whole test set
#
# Lookup order for each passenger:
#   1. exact name match in ekml
#   2. if several rows share the name, keep the one with the same ticket
#   3. if no row has the name and it contains '(', search ekml for the part of the
#      name that precedes the first '('
# Every passenger produces one line in the target file:
#   <survived>,"<name as spelled in ekml>"   or   ?,"<what was searched for> NOT FOUND"
with open(kaggle_trg_2, 'w', encoding='utf-8') as trg_fh:
  trg_fh.write("Survived,TName\n")
  for _, rw in k_tst2.iterrows():
    tst_nm = rw["Name"]
    ekml_rw = ekml_f[ekml_f["name"] == tst_nm]
    n_found = len(ekml_rw)
    ekml_hit = None  # the single ekml row selected for this passenger, if any
    fail_txt = None  # text written next to '?' when no single row is selected

    if n_found == 1:
      ekml_hit = ekml_rw
    elif n_found > 1:
      tkt = rw["Ticket"]
      ekml_tkt = ekml_rw[ekml_rw["ticket"] == tkt]
      if len(ekml_tkt) == 1:
        ekml_hit = ekml_tkt
      else:
        print(f"ekml: {tst_nm} w/ticket {tkt} not found")
        # no row was selected, so report the name we searched for
        fail_txt = f"{tst_nm} w/TICKET {tkt} NOT FOUND"
    elif "(" in tst_nm:
      # keep everything before the first '(' (the trailing space is kept too)
      t_nm2 = tst_nm.partition("(")[0]
      # literal substring search: the '.' in "Mr."/"Mrs." must not act as a regex wildcard
      ekml_re = ekml_f[ekml_f["name"].str.contains(t_nm2, regex=False, na=False)]
      if len(ekml_re) == 1:
        ekml_hit = ekml_re
      else:
        print(f"{tst_nm} -> {t_nm2} found {ekml_re}")
        fail_txt = f"{tst_nm} -> {t_nm2} NOT FOUND"
    else:
      print(f"\n{tst_nm} -> not found")
      fail_txt = f"{tst_nm} NOT FOUND"

    if ekml_hit is not None:
      ekml_nm = ekml_hit["name"].item()
      ekml_srvv = ekml_hit["survived"].item()
      trg_fh.write(f'{int(ekml_srvv)},"{ekml_nm}"\n')
    else:
      trg_fh.write(f'?,"{fail_txt}"\n')

In [None]:
# another fix, but how: add a last-resort lookup for names without brackets
#
# Lookup order for each passenger of the test set:
#   1. exact name match in ekml
#   2. if several rows share the name, keep the one with the same ticket
#   3. if no row has the name and it contains '(', search ekml for the part of the
#      name that precedes the first '('
#   4. otherwise drop the last word of the name and search ekml for what is left
# Every passenger produces one line in the target file:
#   <survived>,"<name as spelled in ekml>"   or   ?,"<what was searched for> NOT FOUND"
with open(kaggle_trg_2, 'w', encoding='utf-8') as trg_fh:
  trg_fh.write("Survived,TName\n")
  for _, rw in k_tst2.iterrows():
    tst_nm = rw["Name"]
    ekml_rw = ekml_f[ekml_f["name"] == tst_nm]
    n_found = len(ekml_rw)
    ekml_hit = None  # the single ekml row selected for this passenger, if any
    fail_txt = None  # text written next to '?' when no single row is selected

    if n_found == 1:
      ekml_hit = ekml_rw
    elif n_found > 1:
      tkt = rw["Ticket"]
      ekml_tkt = ekml_rw[ekml_rw["ticket"] == tkt]
      if len(ekml_tkt) == 1:
        ekml_hit = ekml_tkt
      else:
        print(f"ekml: {tst_nm} w/ticket {tkt} not found")
        # no row was selected, so report the name we searched for
        fail_txt = f"{tst_nm} w/TICKET {tkt} NOT FOUND"
    elif "(" in tst_nm:
      # keep everything before the first '(' (the trailing space is kept too)
      t_nm2 = tst_nm.partition("(")[0]
      # literal substring search: the '.' in "Mr."/"Mrs." must not act as a regex wildcard
      ekml_re = ekml_f[ekml_f["name"].str.contains(t_nm2, regex=False, na=False)]
      if len(ekml_re) == 1:
        ekml_hit = ekml_re
      else:
        print(f"{tst_nm} -> {t_nm2} found {ekml_re}")
        fail_txt = f"{tst_nm} -> {t_nm2} NOT FOUND"
    else:
      # let's try to deal with those single quotes in the ekml file
      # (presumably they sit around the last word of the name, so search without it -- TODO confirm)
      nm_b4ls = tst_nm.rpartition(' ')[0]
      ekml_b4ls = ekml_f[ekml_f["name"].str.contains(nm_b4ls, regex=False, na=False)]
      if len(ekml_b4ls) == 1:
        ekml_hit = ekml_b4ls
      else:
        print(f"\n{tst_nm} -> not found")
        fail_txt = f"{tst_nm} NOT FOUND"

    if ekml_hit is not None:
      ekml_nm = ekml_hit["name"].item()
      ekml_srvv = ekml_hit["survived"].item()
      trg_fh.write(f'{int(ekml_srvv)},"{ekml_nm}"\n')
    else:
      trg_fh.write(f'?,"{fail_txt}"\n')


In [None]:
# now a little checking
# read back the target file written above (columns: Survived, TName) and peek at the first rows
targ_2 = pd.read_csv(kaggle_trg_2)
targ_2.head()

In [None]:
# Put the recovered targets next to the test rows they were looked up for.
# The frames are joined column-wise on their index, so they need one row per
# passenger each; with different lengths concat would pad the shorter frame
# with NaN instead of failing, so check that first.
assert len(targ_2) == len(k_tst2), (
  f"target file has {len(targ_2)} rows but the test set has {len(k_tst2)}"
)
rek_tst = pd.concat([k_tst2, targ_2], axis=1)

In [None]:
# eyeball the first rows: "Name" (test set) and "TName" (name found in ekml) sit in the same row
rek_tst.head(10)

In [None]:
# same check on the last rows of the combined frame
rek_tst.tail(10)

In [None]:
# save the test set with the recovered target columns attached (row index not written)
rek_tst.to_csv(rek_k_tst2, index=False)