/* Optimal coarse classing of a categorical variable based on the Gini coefficient. */
options mlogic mprint symbolgen; /* verbose macro tracing in the log */

/*
 * gini_fz
 *   indata  = input dataset; it is copied to dataset temp before any processing.
 *   var     = grouping variable whose min/max drive the per-group loop.
 *   tar_var, outdata, g_value = not referenced in the part of the macro visible
 *             here -- NOTE(review): confirm their use against the full macro body.
 */
%macro gini_fz(indata=,var=,tar_var=,outdata=,g_value=);
libname data "/folders/myfolders/data/";

/* Work on a private copy so the caller's dataset is never modified. */
data temp;
    set &indata.;
run;

/* Keep the loop index and the summary statistics local so that a call to this
   macro cannot overwrite same-named macro variables in the calling scope. */
%local m n total a T_1 T_0;
/* Load range, size and default / non-default counts into macro variables. */
proc sql noprint;
    /* m = largest and n = smallest value of the grouping variable,
       total = number of non-missing values of the grouping variable. */
    select max(&var.), min(&var.), count(&var.)
        into :m, :n, :total
        from temp;

    /* Overall defaulting (T_1) and non-defaulting (T_0) customer counts.
       These do not depend on the loop index, so they are computed once here
       and are available even when the loop below runs zero times. */
    select sum(n_b), sum(n_g)
        into :T_1, :T_0
        from temp;

    /* Per group a: T_a_1 = defaulting, T_a_0 = non-defaulting, T_a_s = all customers. */
    %do a = &n. %to &m.;
        select sum(n_b), sum(n_g), sum(total)
            into :T_&a._1, :T_&a._0, :T_&a._s
            from temp
            where &var.=&a.;
    %end;
quit;
%put &m. &n. &total. &T_1.&T_0.;
/* Compute the share of the sample falling in each group. */
/* NOTE(review): only the head of this loop is visible -- it declares one local
   macro variable g_<a> per loop index and sets it to 0. The remainder of the
   loop body (and its %end) appears to be missing from this file. */
%do a=&n. %to &m.;
%local g_&a.;
%let g_&a.=0;
/* Mark the cut point: rows whose rank i is at or below &best_i. get split=1,
   all others split=0. The rank column i is dropped afterwards. */
data temp_b_&i.;
    set temp_b_&i.;
    split = (i <= &best_i.);
    drop i;
run;

proc sort data=temp_b_&i.;
    by split b_pct;
run;

/* Candidate rows: the split=1 side is moved to bin number &bin. + 1. */
data temp_b_try&i.;
    set temp_b_&i.;
    if split = 1 then bin = %eval(&bin. + 1);
run;
/* Start from the current partition with bin &i. removed ... */
data temp_b_in&i.;
    set temp_b;
    if bin ne &i.;
run;

/* ... then append the re-binned candidate rows to form the trial partition. */
data temp_b_in&i.;
    set temp_b_in&i. temp_b_try&i.;
run;
/* Score the trial partition with gini_fz, then record the result.
   NOTE(review): presumably gini_fz stores its result in the macro variable
   named by g_value= (here: value), which the INSERT below reads -- confirm. */
%gini_fz(indata=temp_b_in&i.,var=bin,tar_var=default,outdata=g_out,g_value=value);

/* One row per candidate: bin index, name of the trial dataset, score.
   The table name and the VALUES keyword must be separate tokens. */
proc sql noprint;
    insert into temp_b_value
        values(&i.,"temp_b_in&i.",&value.);
quit;
%end;
%end;
/* Choose the split with the smallest Gini index after splitting, i.e. the
   largest value: sort descending so the best candidate is the first row.
   DESCENDING is a BY-statement keyword and must be separated from the variable.
   NOTE(review): assumes temp_b_value has a column named value -- its
   definition is not visible here. */
proc sort data=temp_b_value;
    by descending value;
run;

/* Store the winning bin index (first row after the sort) in macro variable n. */
data _null_;
    set temp_b_value;
    if _N_=1 then call symput ("n",compress(BinToSplit));
run;
%put &n.;
/* Adopt the winning trial partition as the current partition; the old
   within-bin rank i is discarded and rebuilt below. */
data temp_b;
    set temp_b_in&n.;
    drop i;
run;

proc sort data=temp_b;
    by bin b_pct;
run;

/* Re-create i as a 1-based running counter inside each bin, ready for the
   next splitting pass. */
data temp_b;
    set temp_b;
    by bin b_pct;
    if first.bin then i = 0;
    i + 1;
run;
%end;
/* Snapshot of the final partition. */
data temp_final;
    set temp_b;
run;

/* Output dataset: RETAIN ahead of SET fixes the column order, KEEP limits the
   columns to the original group, the counts and the new bin. */
data &out_data.;
    retain &var. n_b n_g total bin;
    set temp_final;
    keep &var. n_b n_g total bin;
    label &var.="原分组"
          n_b="违约客户量"
          n_g="正常客户量"
          total="总客户量"
          bin="新分组";
run;

proc sort data=&out_data.;
    by bin;
run;
/********************************** END: optimal classing of the groups *************************************************/
/* Keep only the output dataset: SAVE deletes every other member of WORK. */
proc datasets lib=work ;
    save &out_data.;
run;
quit; /* PROC DATASETS stays active after RUN; QUIT ends the procedure */
%mend;
/* Run the classing on data.xinyong: variable employ, target default, b_max=4. */
%gini_c(
    indata=data.xinyong,
    var=employ,
    tar_var=default,
    b_max=4,
    out_data=g_best_out
);