420948492 发表于 2011-3-31 11:01 
我有两个表,是两个月份的顾客信息,每个表有近两百万顾客,每个顾客有唯一的ID,现在我想计算每个顾客某个指标的平均值,运行比较慢,有什么优化的算法吗
我现在用的是proc SQl ,没有创建索引,
According to my test, it should be reasonable fast with sql (cpu=5 secons).
286 proc sql;
287 create table mean_stat as
288 select t1.id as id ,
289 mean( x1, x2, x3, x4, x5, x6, x7, x8, x9, x10 ) as meanx,
290 mean( y1, y2, y3, y4, y5, y6, y7, y8, y9, y10 ) as meany
291 from t1 as a,
292 t2 as b
293 where t1.id=t2.id
294
295 ;
NOTE: Table WORK.MEAN_STAT created, with 2000000 rows and 3 columns.
296 quit;
NOTE: PROCEDURE SQL used (Total process time):
real time 10.18 seconds
cpu time 5.10 seconds
***************pgm*****;
data t1 t2(rename=(x1-x10 = y1-y10));
array x(10) ;
do id=1 to 2e6;
do j=1 to dim (x);
x[j]=ranuni(123);
end;
output;
end;
run;
proc sql;
create table mean_stat as
select t1.id as id ,
mean( x1, x2, x3, x4, x5, x6, x7, x8, x9, x10 ) as meanx,
mean( y1, y2, y3, y4, y5, y6, y7, y8, y9, y10 ) as meany
from t1 as a,
t2 as b
where t1.id=t2.id
;
quit;