Created
October 23, 2014 15:03
-
-
Save kanhua/bf4a75884f3cb2934364 to your computer and use it in GitHub Desktop.
Submit Kaggle Titanic results (not optimised!)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"metadata": { | |
"language": "Julia", | |
"name": "", | |
"signature": "sha256:999e48e1cfeb072e56d4089a89783a9483f6c014acc704990d74ba7079a57434" | |
}, | |
"nbformat": 3, | |
"nbformat_minor": 0, | |
"worksheets": [ | |
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"using Gadfly\n", | |
"using DataFrames\n", | |
"using DecisionTree" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 1 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# Read a Titanic CSV and prepare it for modelling.\n", | |
"# Sex, Pclass and Embarked are pooled. NA ages (and NA fares, when present) are\n", | |
"# replaced by the mean of the observed values in the same file; NA Embarked becomes \"S\".\n", | |
"# Returns (features, ids): the seven feature columns and a one-column PassengerId table.\n", | |
"function cleanData(filename)\n", | |
"    passengers=readtable(filename)\n", | |
"    pool!(passengers,[:Sex])\n", | |
"    pool!(passengers,[:Pclass])\n", | |
"\n", | |
"    # Age: always imputed with the mean of the non-NA ages\n", | |
"    hasAge=!isna(passengers[:Age])\n", | |
"    meanAge=mean(passengers[hasAge,:Age])\n", | |
"    passengers[:Age]=array(passengers[:Age],meanAge)\n", | |
"\n", | |
"    # Fare: imputed the same way, but only when at least one value is missing\n", | |
"    if any(isna(passengers[:Fare]))\n", | |
"        hasFare=!isna(passengers[:Fare])\n", | |
"        meanFare=mean(passengers[hasFare,:Fare])\n", | |
"        passengers[:Fare]=array(passengers[:Fare],meanFare)\n", | |
"    end\n", | |
"\n", | |
"    # Embarked: fill NA with \"S\" first, then pool the completed column\n", | |
"    passengers[:Embarked]=array(passengers[:Embarked],utf8(\"S\"))\n", | |
"    pool!(passengers,[:Embarked])\n", | |
"\n", | |
"    features=passengers[:,[:Pclass,:Age,:Sex,:SibSp,:Parch,:Fare,:Embarked]]\n", | |
"    ids=passengers[:,[:PassengerId]]\n", | |
"    return features,ids\n", | |
"end" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 2, | |
"text": [ | |
"cleanData (generic function with 1 method)" | |
] | |
} | |
], | |
"prompt_number": 2 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# Read the CSV at `filename` and return its Survived column after pooling it.\n", | |
"function readylabel(filename)\n", | |
"    labelled=readtable(filename)\n", | |
"    pool!(labelled,[:Survived])\n", | |
"    return labelled[:Survived]\n", | |
"end" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 3, | |
"text": [ | |
"readylabel (generic function with 1 method)" | |
] | |
} | |
], | |
"prompt_number": 3 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"xTrain,idTrain=cleanData(\"train.csv\")\n", | |
"xTest,idTest=cleanData(\"test.csv\")\n", | |
"yTrain=readylabel(\"train.csv\")" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 4, | |
"text": [ | |
"891-element PooledDataArray{Int64,Uint8,1}:\n", | |
" 0\n", | |
" 1\n", | |
" 1\n", | |
" 1\n", | |
" 0\n", | |
" 0\n", | |
" 0\n", | |
" 0\n", | |
" 1\n", | |
" 1\n", | |
" 1\n", | |
" 1\n", | |
" 0\n", | |
" \u22ee\n", | |
" 1\n", | |
" 1\n", | |
" 0\n", | |
" 0\n", | |
" 0\n", | |
" 0\n", | |
" 0\n", | |
" 0\n", | |
" 1\n", | |
" 0\n", | |
" 1\n", | |
" 0" | |
] | |
} | |
], | |
"prompt_number": 4 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"describe(xTest)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"Pclass\n" | |
] | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"Min 1.0" | |
] | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"\n", | |
"1st Qu. 1.0\n", | |
"Median 3.0\n", | |
"Mean 2.2655502392344498\n", | |
"3rd Qu. 3.0\n", | |
"Max 3.0\n", | |
"NAs 0\n", | |
"NA% 0.0%\n", | |
"\n", | |
"Age\n", | |
"Min 0.17\n", | |
"1st Qu. 23.0\n", | |
"Median 30.272590361445783\n", | |
"Mean 30.272590361445793\n", | |
"3rd Qu. 35.75\n", | |
"Max 76.0\n", | |
"NAs 0\n", | |
"NA% 0.0%\n", | |
"\n", | |
"Sex\n", | |
"Length 418" | |
] | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"\n", | |
"Type Pooled UTF8String\n", | |
"NAs 0\n", | |
"NA% 0.0%\n", | |
"Unique 2\n", | |
"\n", | |
"SibSp\n", | |
"Min 0.0\n", | |
"1st Qu. 0.0\n", | |
"Median 0.0\n", | |
"Mean 0.4473684210526316\n", | |
"3rd Qu. 1.0\n", | |
"Max 8.0\n", | |
"NAs 0\n", | |
"NA% 0.0%\n", | |
"\n", | |
"Parch\n", | |
"Min 0.0\n", | |
"1st Qu. 0.0\n", | |
"Median 0.0\n", | |
"Mean 0.3923444976076555\n", | |
"3rd Qu. 0.0\n", | |
"Max 9.0\n", | |
"NAs 0\n", | |
"NA% 0.0%\n", | |
"\n", | |
"Fare\n", | |
"Min 0.0\n", | |
"1st Qu. 7.8958\n", | |
"Median 14.4542\n", | |
"Mean 35.627188489208635\n", | |
"3rd Qu. 31.5\n", | |
"Max 512.3292\n", | |
"NAs 0\n", | |
"NA% 0.0%\n", | |
"\n", | |
"Embarked\n", | |
"Length 418\n", | |
"Type Pooled UTF8String\n", | |
"NAs 0\n", | |
"NA% 0.0%\n", | |
"Unique 3\n", | |
"\n" | |
] | |
} | |
], | |
"prompt_number": 5 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"yTrain=array(yTrain)\n", | |
"xTrain=array(xTrain)\n", | |
"xTest=array(xTest)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 6, | |
"text": [ | |
"418x7 Array{Any,2}:\n", | |
" 3 34.5 \"male\" 0 0 7.8292 \"Q\"\n", | |
" 3 47.0 \"female\" 1 0 7.0 \"S\"\n", | |
" 2 62.0 \"male\" 0 0 9.6875 \"Q\"\n", | |
" 3 27.0 \"male\" 0 0 8.6625 \"S\"\n", | |
" 3 22.0 \"female\" 1 1 12.2875 \"S\"\n", | |
" 3 14.0 \"male\" 0 0 9.225 \"S\"\n", | |
" 3 30.0 \"female\" 0 0 7.6292 \"Q\"\n", | |
" 2 26.0 \"male\" 1 1 29.0 \"S\"\n", | |
" 3 18.0 \"female\" 0 0 7.2292 \"C\"\n", | |
" 3 21.0 \"male\" 2 0 24.15 \"S\"\n", | |
" 3 30.2726 \"male\" 0 0 7.8958 \"S\"\n", | |
" 1 46.0 \"male\" 0 0 26.0 \"S\"\n", | |
" 1 23.0 \"female\" 1 0 82.2667 \"S\"\n", | |
" \u22ee \u22ee \n", | |
" 2 23.0 \"male\" 1 0 10.5 \"S\"\n", | |
" 1 50.0 \"male\" 1 1 211.5 \"C\"\n", | |
" 3 30.2726 \"female\" 0 0 7.7208 \"Q\"\n", | |
" 3 3.0 \"female\" 1 1 13.775 \"S\"\n", | |
" 3 30.2726 \"female\" 0 0 7.75 \"Q\"\n", | |
" 1 37.0 \"female\" 1 0 90.0 \"Q\"\n", | |
" 3 28.0 \"female\" 0 0 7.775 \"S\"\n", | |
" 3 30.2726 \"male\" 0 0 8.05 \"S\"\n", | |
" 1 39.0 \"female\" 0 0 108.9 \"C\"\n", | |
" 3 38.5 \"male\" 0 0 7.25 \"S\"\n", | |
" 3 30.2726 \"male\" 0 0 8.05 \"S\"\n", | |
" 3 30.2726 \"male\" 1 1 22.3583 \"C\"" | |
] | |
} | |
], | |
"prompt_number": 6 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"accuracy = nfoldCV_forest(yTrain, xTrain, 5, 20, 4, 0.7);" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"\n", | |
"Fold " | |
] | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"1\n", | |
"Classes: {0,1}\n", | |
"Matrix: \n", | |
"[" | |
] | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"115 14\n", | |
" 27 66]\n", | |
"Accuracy: 0.8153153153153153\n", | |
"Kappa: 0.6131089007906146\n", | |
"\n", | |
"Fold " | |
] | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"2\n", | |
"Classes: {0,1}\n", | |
"Matrix: \n", | |
"[122 17\n", | |
" 26 57]\n", | |
"Accuracy: 0.8063063063063063\n", | |
"Kappa: 0.5770491803278688\n", | |
"\n", | |
"Fold " | |
] | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"3\n", | |
"Classes: {0,1}\n", | |
"Matrix: \n", | |
"[115 18\n", | |
" 24 65]\n", | |
"Accuracy: 0.8108108108108109\n", | |
"Kappa: 0.6017086715079027\n", | |
"\n", | |
"Fold " | |
] | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"4\n", | |
"Classes: {0,1}\n", | |
"Matrix: \n", | |
"[125 21\n", | |
" 20 56]\n", | |
"Accuracy: 0.8153153153153153\n", | |
"Kappa: 0.591141856077621\n", | |
"\n", | |
"Mean Accuracy: 0.8119369369369369\n" | |
] | |
} | |
], | |
"prompt_number": 7 | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 2, | |
"metadata": {}, | |
"source": [ | |
"Build random forest model" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"model = build_forest(yTrain, xTrain, 5, 20, 0.7)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 8, | |
"text": [ | |
"Ensemble of Decision Trees\n", | |
"Trees: 20\n", | |
"Avg Leaves: 108.9\n", | |
"Avg Depth: 18.75" | |
] | |
} | |
], | |
"prompt_number": 8 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"predy=apply_forest(model,xTest)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 9, | |
"text": [ | |
"418-element Array{Any,1}:\n", | |
" 0\n", | |
" 0\n", | |
" 0\n", | |
" 0\n", | |
" 0\n", | |
" 0\n", | |
" 1\n", | |
" 0\n", | |
" 1\n", | |
" 0\n", | |
" 0\n", | |
" 0\n", | |
" 1\n", | |
" \u22ee\n", | |
" 0\n", | |
" 0\n", | |
" 1\n", | |
" 1\n", | |
" 1\n", | |
" 1\n", | |
" 1\n", | |
" 0\n", | |
" 1\n", | |
" 0\n", | |
" 0\n", | |
" 1" | |
] | |
} | |
], | |
"prompt_number": 9 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# DataFrame (not DataArray) is the constructor that takes column-name keywords;\n", | |
"# int() converts the Any-typed forest predictions to integers before storing them.\n", | |
"predydf=DataFrame(Survived=int(predy))" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 10 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# Put the PassengerId column and the predicted Survived column side by side\n", | |
"resultdf=hcat(idTest,predydf)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 11 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# Kaggle expects a header row naming the two columns; writedlm alone emits only the data,\n", | |
"# so the header is written first and the rows are appended through the same handle.\n", | |
"open(\"output.csv\",\"w\") do io\n", | |
"    println(io,\"PassengerId,Survived\")\n", | |
"    writedlm(io,[array(idTest) predy],',')\n", | |
"end" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 12 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"convert(Int64,\"3\")" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"ename": "LoadError", | |
"evalue": "`convert` has no method matching convert(::Type{Int64}, ::ASCIIString)\nwhile loading In[14], in expression starting on line 1", | |
"output_type": "pyerr", | |
"traceback": [ | |
"`convert` has no method matching convert(::Type{Int64}, ::ASCIIString)\nwhile loading In[14], in expression starting on line 1", | |
"", | |
" in convert at base.jl:13" | |
] | |
} | |
], | |
"prompt_number": 14 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"int(\"3\")" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 15, | |
"text": [ | |
"3" | |
] | |
} | |
], | |
"prompt_number": 15 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"typeof(\"3\")==ASCIIString" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 18, | |
"text": [ | |
"true" | |
] | |
} | |
], | |
"prompt_number": 18 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [] | |
} | |
], | |
"metadata": {} | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment