jphall663 · August 29, 2015 14:22
diff --git a/discrim_v_pmbr.sas b/discrim_v_pmbr.sas
 *** close and reopen the ODS HTML destination to clear any previous results;
 ods html close;
 ods html;

 *** show macro variable resolution in the log and request detailed performance info;
 options symbolgen fullstimer;

 *** simulation settings;
 %let N=5000;      /* number of points per cluster */
 %let SPREAD=1.5;  /* spread of each cluster, causes overlap */

 *** tuning parameters;
 %let K=11;        /* for all */
 %let BUCKETS=100; /* for PROC PMBR RDTREE method, higher for less approximation */
 %let EPSILON=10;  /* for PROC PMBR RDTREE method, lower for less approximation */
 
 *** generate sample data with obvious classes ********************************;
 *** seven clusters, id 0 through 6, each centered at the point cx, cy listed below;
 *** even-numbered draws are written to clusters_train, odd-numbered draws to clusters_test;
 data clusters_train clusters_test;
      /* cluster centers indexed by id+1; temporary arrays add no variables to the output */
      array cx[7] _temporary_ (20 15 10 5 0 15 0);
      array cy[7] _temporary_ (20 15 10 5 0 0 15);
      drop n;
      do id=0 to 6;
            do n=1 to &N;
                  /* x is drawn before y for every point; RANNOR function calls in one */
                  /* DATA step share a single stream that the first seed initializes   */
                  x=2*rannor(12345)*&SPREAD.+cx[id+1];
                  y=2*rannor(12345)*&SPREAD.+cy[id+1];
                  if mod(n, 2)=0 then output clusters_train;
                  else output clusters_test;
            end;
      end;
 run;
 
 *** scatter plot of clusters_test, points grouped by their true cluster id;
 title 'Generated Test Data';
 proc sgplot data=clusters_test;
 	scatter x=x y=y / group=id;
 run;
 
 *** classify with PROC DISCRIM ***********************************************;
 *** nonparametric method with K neighbors, fit on clusters_train;
 *** clusters_test is scored into score_discrim;
 proc discrim
 	method=npar k=&K
 	data=clusters_train
 	testdata=clusters_test testout=score_discrim;
 	var x y;
 	class id;
 run;

 *** sort by assigned class for plot color consistency;
 proc sort data=score_discrim;
 	by _INTO_;
 run;

 *** keep the assigned class only where it equals the true id, otherwise set it to missing;
 data score_discrim;
 	set score_discrim;
 	_INTO_=ifn(_INTO_=id, _INTO_, .);
 run;

 *** misclassification, reported as the number of rows whose assigned class is now missing;
 title 'PROC DISCRIM Misclassification';
 proc sql;
 	select count(*) from score_discrim where _INTO_=.;
 quit;

 *** plot test data classified with PROC DISCRIM;
 title 'Test Data Classified with PROC DISCRIM';
 proc sgplot data=score_discrim;
 	scatter x=x y=y / group=_INTO_;
 run;
 
 *** build the DMDB catalog work._cat from the training data, mandatory for PROC PMBR;
 *** id is declared both as a class variable and as the target, x and y are the inputs;
 proc dmdb dmdbcat=work._cat data=clusters_train;
 	class id;
 	var x y;
 	target id;
 run;
 
 *** classify with PROC PMBR SCAN method **************************************;
 *** should be similar to PROC DISCRIM;
 *** the catalog is named with its WORK libref so that it is the catalog written by PROC DMDB;
 *** a one-level name would point elsewhere if a USER libref were assigned;
 proc pmbr
 	data=clusters_train
 	dmdbcat=work._cat
 	method=scan
 	k=&K
 	;
 	score data=clusters_test out=score_pmbr_scan;
 	target id;
 	var x y;
 run;

 *** sort by predicted class for plot color consistency;
 proc sort data=score_pmbr_scan;
 	by I_id;
 run;

 *** remove label from misclassified points;
 *** I_id is character, so the numeric id is formatted and both sides are stripped before comparing;
 data score_pmbr_scan;
 	set score_pmbr_scan;
 	if strip(I_id) ne strip(put(id, best.)) then I_id='';
 run;

 *** misclassification, reported as the number of rows whose predicted class is now blank;
 title 'PROC PMBR SCAN Misclassification';
 proc sql;
 	select count(*)
 	from score_pmbr_scan
 	where I_id='';
 quit;

 *** plot test data classified with PROC PMBR SCAN method;
 title 'Test Data Classified with PROC PMBR SCAN method';
 proc sgplot data=score_pmbr_scan;
 	scatter y=y x=x / group=I_id;
 run;
 
 *** PROC PMBR RDTREE METHOD **************************************************;
 *** can be different from PROC DISCRIM;
 *** more efficient, but calculates distances approximately;
 *** you may change the BUCKETS and EPSILON tuning parameters;
 *** the catalog is named with its WORK libref so that it is the catalog written by PROC DMDB;
 *** a one-level name would point elsewhere if a USER libref were assigned;
 proc pmbr
 	data=clusters_train
 	dmdbcat=work._cat
 	method=rdtree
 	k=&K
 	epsilon=&EPSILON
 	buckets=&BUCKETS
 	;
 	score data=clusters_test out=score_pmbr_rdtree;
 	target id;
 	var x y;
 run;

 *** sort by predicted class for plot color consistency;
 proc sort data=score_pmbr_rdtree;
 	by I_id;
 run;

 *** remove label from misclassified points;
 *** I_id is character, so the numeric id is formatted and both sides are stripped before comparing;
 data score_pmbr_rdtree;
 	set score_pmbr_rdtree;
 	if strip(I_id) ne strip(put(id, best.)) then I_id='';
 run;

 *** misclassification, reported as the number of rows whose predicted class is now blank;
 title 'PROC PMBR RDTREE Misclassification';
 proc sql;
 	select count(*)
 	from score_pmbr_rdtree
 	where I_id='';
 quit;

 *** plot test data classified with PROC PMBR RDTREE method;
 *** the second title line records the tuning parameters used for this run;
 title 'Test Data Classified with PROC PMBR RDTREE method';
 title2 "BUCKETS=&BUCKETS EPSILON=&EPSILON";
 proc sgplot data=score_pmbr_rdtree;
 	scatter y=y x=x / group=I_id;
 run;

 *** clear the titles so they do not carry over to later output;
 title;
 title2;
	*** close and reopen the ODS HTML destination to clear any previous results;
	ods html close;
	ods html;

	*** show macro variable resolution in the log and request detailed performance info;
	options symbolgen fullstimer;

	*** simulation settings;
	%let N=5000;      /* number of points per cluster */
	%let SPREAD=1.5;  /* spread of each cluster, causes overlap */

	*** tuning parameters;
	%let K=11;        /* for all */
	%let BUCKETS=100; /* for PROC PMBR RDTREE method, higher for less approximation */
	%let EPSILON=10;  /* for PROC PMBR RDTREE method, lower for less approximation */

	*** generate sample data with obvious classes ********************************;
	*** seven clusters, id 0 through 6, each centered at the point cx, cy listed below;
	*** even-numbered draws are written to clusters_train, odd-numbered draws to clusters_test;
	data clusters_train clusters_test;
		/* cluster centers indexed by id+1; temporary arrays add no variables to the output */
		array cx[7] _temporary_ (20 15 10 5 0 15 0);
		array cy[7] _temporary_ (20 15 10 5 0 0 15);
		drop n;
		do id=0 to 6;
			do n=1 to &N;
				/* coordinate = 2 * standard normal draw * SPREAD + center;       */
				/* the multiplication operators must be written explicitly in SAS */
				x=2*rannor(12345)*&SPREAD.+cx[id+1];
				y=2*rannor(12345)*&SPREAD.+cy[id+1];
				if mod(n, 2)=0 then output clusters_train;
				else output clusters_test;
			end;
		end;
	run;

	*** scatter plot of clusters_test, points grouped by their true cluster id;
	title 'Generated Test Data';
	proc sgplot data=clusters_test;
		scatter x=x y=y / group=id;
	run;

	*** classify with PROC DISCRIM ***********************************************;
	*** nonparametric method with K neighbors, fit on clusters_train;
	*** clusters_test is scored into score_discrim;
	proc discrim
		method=npar k=&K
		data=clusters_train
		testdata=clusters_test testout=score_discrim;
		var x y;
		class id;
	run;

	*** sort by assigned class for plot color consistency;
	proc sort data=score_discrim;
		by _INTO_;
	run;

	*** keep the assigned class only where it equals the true id, otherwise set it to missing;
	data score_discrim;
		set score_discrim;
		_INTO_=ifn(_INTO_=id, _INTO_, .);
	run;

	*** misclassification, reported as the number of rows whose assigned class is now missing;
	title 'PROC DISCRIM Misclassification';
	proc sql;
		select count(*) from score_discrim where _INTO_=.;
	quit;

	*** plot test data classified with PROC DISCRIM;
	title 'Test Data Classified with PROC DISCRIM';
	proc sgplot data=score_discrim;
		scatter x=x y=y / group=_INTO_;
	run;

	*** build the DMDB catalog work._cat from the training data, mandatory for PROC PMBR;
	*** id is declared both as a class variable and as the target, x and y are the inputs;
	proc dmdb dmdbcat=work._cat data=clusters_train;
		class id;
		var x y;
		target id;
	run;

	*** classify with PROC PMBR SCAN method **************************************;
	*** should be similar to PROC DISCRIM;
	*** the catalog is named with its WORK libref so that it is the catalog written by PROC DMDB;
	*** a one-level name would point elsewhere if a USER libref were assigned;
	proc pmbr
		data=clusters_train
		dmdbcat=work._cat
		method=scan
		k=&K
		;
		score data=clusters_test out=score_pmbr_scan;
		target id;
		var x y;
	run;

	*** sort by predicted class for plot color consistency;
	proc sort data=score_pmbr_scan;
		by I_id;
	run;

	*** remove label from misclassified points;
	*** I_id is character, so the numeric id is formatted and both sides are stripped before comparing;
	data score_pmbr_scan;
		set score_pmbr_scan;
		if strip(I_id) ne strip(put(id, best.)) then I_id='';
	run;

	*** misclassification, reported as the number of rows whose predicted class is now blank;
	title 'PROC PMBR SCAN Misclassification';
	proc sql;
		select count(*)
		from score_pmbr_scan
		where I_id='';
	quit;

	*** plot test data classified with PROC PMBR SCAN method;
	title 'Test Data Classified with PROC PMBR SCAN method';
	proc sgplot data=score_pmbr_scan;
		scatter y=y x=x / group=I_id;
	run;

	*** PROC PMBR RDTREE METHOD **************************************************;
	*** can be different from PROC DISCRIM;
	*** more efficient, but calculates distances approximately;
	*** you may change the BUCKETS and EPSILON tuning parameters;
	*** the catalog is named with its WORK libref so that it is the catalog written by PROC DMDB;
	*** a one-level name would point elsewhere if a USER libref were assigned;
	proc pmbr
		data=clusters_train
		dmdbcat=work._cat
		method=rdtree
		k=&K
		epsilon=&EPSILON
		buckets=&BUCKETS
		;
		score data=clusters_test out=score_pmbr_rdtree;
		target id;
		var x y;
	run;

	*** sort by predicted class for plot color consistency;
	proc sort data=score_pmbr_rdtree;
		by I_id;
	run;

	*** remove label from misclassified points;
	*** I_id is character, so the numeric id is formatted and both sides are stripped before comparing;
	data score_pmbr_rdtree;
		set score_pmbr_rdtree;
		if strip(I_id) ne strip(put(id, best.)) then I_id='';
	run;

	*** misclassification, reported as the number of rows whose predicted class is now blank;
	title 'PROC PMBR RDTREE Misclassification';
	proc sql;
		select count(*)
		from score_pmbr_rdtree
		where I_id='';
	quit;

	*** plot test data classified with PROC PMBR RDTREE method;
	*** the second title line records the tuning parameters used for this run;
	title 'Test Data Classified with PROC PMBR RDTREE method';
	title2 "BUCKETS=&BUCKETS EPSILON=&EPSILON";
	proc sgplot data=score_pmbr_rdtree;
		scatter y=y x=x / group=I_id;
	run;

	*** clear the titles so they do not carry over to later output;
	title;
	title2;