Last active
August 29, 2015 14:22
-
-
Save jphall663/661334961ca41b29adfb to your computer and use it in GitHub Desktop.
PROC DISCRIM vs. the MBR node in Enterprise Miner
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
*** start from a fresh ODS HTML destination so earlier results are discarded;
ods html close;
ods html;

*** log settings - show how macro variables resolve and report detailed
    performance statistics for every step;
options symbolgen fullstimer;

*** number of points generated for each cluster;
%let N=5000;

*** multiplier on the spread of each cluster, larger values cause overlap;
%let SPREAD=1.5;

*** tuning parameters;
%let K=11;         /* for all */
%let BUCKETS=100;  /* for PROC PMBR RDTREE method, higher for less approximation */
%let EPSILON=10;   /* for PROC PMBR RDTREE method, lower for less approximation */
*** generate sample data with obvious classes ********************************;
*** seven clusters are simulated, one per class id 0 to 6, each with N points
    centred at the offsets held in the temporary arrays below;
*** even-numbered draws are written to clusters_train and odd-numbered draws
    to clusters_test, so every cluster is split between the two data sets;
data clusters_train clusters_test;

  /* cluster centres: element c holds the centre of class id c-1 */
  array cx[7] _temporary_ (20 15 10 5 0 15  0);
  array cy[7] _temporary_ (20 15 10 5 0  0 15);

  /* loop counters are bookkeeping only; the output keeps id, x and y */
  drop c n;

  do c=1 to dim(cx);
    id=c-1;
    do n=1 to &N;
      /* x is drawn before y for every point, so the single RANNOR stream
         is consumed in the same order as in a cluster-by-cluster listing */
      x=2*rannor(12345)*&SPREAD.+cx[c];
      y=2*rannor(12345)*&SPREAD.+cy[c];
      if mod(n, 2)=0 then output clusters_train;
      else output clusters_test;
    end;
  end;
run;
*** scatter plot of the generated test points, grouped by their class id;
title 'Generated Test Data';
proc sgplot data=clusters_test;
  scatter x=x y=y / group=id;
run;
*** classify with PROC DISCRIM **********************************************;
*** nonparametric method, K taken from the K macro variable, trained on
    clusters_train and scored on clusters_test into score_discrim;
proc discrim data=clusters_train testdata=clusters_test testout=score_discrim
             method=npar k=&K;
  class id;
  var x y;
run;

*** order the scored rows by predicted class for plot color consistency;
proc sort data=score_discrim;
  by _INTO_;
run;

*** clear the predicted class wherever it disagrees with the true class id;
data score_discrim;
  set score_discrim;
  if not (_INTO_ = id) then call missing(_INTO_);
run;

*** count of misclassified test points, that is rows whose label was cleared;
title 'PROC DISCRIM Misclassification';
proc sql;
  select count(*)
    from score_discrim
    where _INTO_ = .;
quit;

*** plot the test data grouped by the class assigned by PROC DISCRIM;
title 'Test Data Classified with PROC DISCRIM';
proc sgplot data=score_discrim;
  scatter x=x y=y / group=_INTO_;
run;
*** PROC PMBR needs a DMDB catalog, so build work._cat from the training data;
proc dmdb data=clusters_train dmdbcat=work._cat;
  class id;
  var x y;
  target id;
run;
*** classify with PROC PMBR SCAN method **************************************;
*** results should be similar to those of PROC DISCRIM;
proc pmbr data=clusters_train dmdbcat=_cat method=scan k=&K;
  score data=clusters_test out=score_pmbr_scan;
  target id;
  var x y;
run;

*** order the scored rows by predicted class for plot color consistency;
proc sort data=score_pmbr_scan;
  by I_id;
run;

*** clear the predicted class wherever it disagrees with the true class id,
    I_id is character so id is converted to text before comparing;
data score_pmbr_scan;
  set score_pmbr_scan;
  if not (strip(I_id) = strip(put(id, best.))) then call missing(I_id);
run;

*** count of misclassified test points, that is rows whose label was cleared;
title 'PROC PMBR SCAN Misclassification';
proc sql;
  select count(*)
    from score_pmbr_scan
    where I_id = '';
quit;

*** plot the test data grouped by the class assigned by the SCAN method;
title 'Test Data Classified with PROC PMBR SCAN method';
proc sgplot data=score_pmbr_scan;
  scatter x=x y=y / group=I_id;
run;
*** PROC PMBR RDTREE METHOD **************************************************;
*** results can differ from PROC DISCRIM because this method is more
    efficient but calculates distances approximately;
*** the BUCKETS and EPSILON macro variables are the tuning parameters to change;
proc pmbr data=clusters_train dmdbcat=_cat method=rdtree k=&K
          epsilon=&EPSILON buckets=&BUCKETS;
  score data=clusters_test out=score_pmbr_rdtree;
  target id;
  var x y;
run;

*** order the scored rows by predicted class for plot color consistency;
proc sort data=score_pmbr_rdtree;
  by I_id;
run;

*** clear the predicted class wherever it disagrees with the true class id,
    I_id is character so id is converted to text before comparing;
data score_pmbr_rdtree;
  set score_pmbr_rdtree;
  if not (strip(I_id) = strip(put(id, best.))) then call missing(I_id);
run;

*** count of misclassified test points, that is rows whose label was cleared;
title 'PROC PMBR RDTREE Misclassification';
proc sql;
  select count(*)
    from score_pmbr_rdtree
    where I_id = '';
quit;

*** plot the test data grouped by the class assigned by the RDTREE method,
    the second title line records the tuning values that were used;
title 'Test Data Classified with PROC PMBR RDTREE method';
title2 "BUCKETS=&BUCKETS EPSILON=&EPSILON";
proc sgplot data=score_pmbr_rdtree;
  scatter x=x y=y / group=I_id;
run;

*** reset both title lines so they do not carry over to later output;
title;
title2;
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment