filename dt 'D:\Myex\Bisai';
libname rt 'D:\Myex\Bisai';

/*
 * Byte-level character split of Bisai.txt.
 *
 * The file is read one byte per DATA step iteration (the trailing @@ holds the
 * current input line across iterations).  A byte whose $hex2. value is below
 * 'A0' is written out as a one-byte character.  Any other byte is remembered
 * as a lead byte and is written together with the following byte as one
 * two-byte character.
 *
 * Output Rt.BChar has one row per character: Char and its $hex4. image Hex.
 *   pending - '1' while a lead byte is waiting for its second byte, else '0'
 *   lead    - the stored lead byte
 */
data Rt.BChar(keep=Char Hex);
    infile dt(Bisai.txt) lrecl=3000;
    length Char $2 Hex $4 pending $1 lead $1;
    retain pending '0' lead '';

    input Byte $char1. @@;

    select;
        /* Second byte of a two-byte character: emit the pair, clear state. */
        when (pending ne '0') do;
            Char = lead || Byte;
            Hex = put(Char, $Hex4.);
            output;
            lead = '';
            pending = '0';
        end;

        /* Byte below A0: a complete one-byte character. */
        when (put(Byte, $hex2.) < 'A0') do;
            Char = Byte;
            Hex = put(Char, $Hex4.);
            output;
        end;

        /* Anything else starts a two-byte character; wait for the next byte. */
        otherwise do;
            pending = '1';
            lead = Byte;
        end;
    end;
run;
/* Count every Char*Hex pair found by the byte-level split; result goes to
   rt.Bchfr, nothing is printed. */
proc freq data=rt.Bchar;
    tables Char*Hex / noprint out=rt.Bchfr;
run;

/* Rt.Fr_ch is the frequency table written by the PROC FREQ step on rt.Char
   further down in this file, so that step must already have been run.
   The leading rows are skipped (two in Fr_ch, one in Bchfr) and the PERCENT
   column is dropped before comparing.
   NOTE(review): presumably the skipped rows are ones expected to differ
   between the two methods (e.g. the line-end marker row) - confirm. */
data C1;
    set Rt.Fr_ch(drop=percent firstobs=3);
run;

data C2;
    set Rt.Bchfr(drop=percent firstobs=2);
run;

/* Row-by-row comparison of the two trimmed frequency tables. */
proc compare
    base=C1
    compare=C2;
run;
/*-------------------------------------------------------------
TXT2CH.SAS
-------------------------------------------------------------*/
filename dt 'D:\Myex\Bisai';
libname rt 'D:\Myex\Bisai';

/*
 * TXT2CH: split Bisai.txt into lines, characters and words in one DATA step.
 *
 *   rt.Line - one row per input line: LineNum, the raw Line, its length in
 *             bytes (Len_byte) and in characters (Len_char).
 *   rt.Char - one row per character of the line (j = character position,
 *             Hex = $hex4. image of Char), followed by one '0A0D'x marker
 *             row per line.
 *   rt.Word - one row per word (k = character position where it starts).
 *
 * IC is the scanner state, reset at the start of every line:
 *   '0' nothing seen yet
 *   '1' inside a run of alphabetic characters, accumulated in z
 *   '2' previous character was written as a one-character word
 *   '3' previous character was non-alphabetic and not written
 */
data rt.Line(keep = LineNum Line Len_byte Len_char)
rt.Char(keep = LineNum j Char Hex)
rt.Word(keep = LineNum k Word);
    length LineNum 8 Line $2400 Len_byte Len_char 8
           j 8 Char $2 Hex $4
           IC $1 k 8 word $20 z $20;
    infile dt(Bisai.txt) lrecl=2400 truncover;
    input Line $char2400.;

    LineNum = _n_;
    Len_byte = lengthn(Line);
    /* min() keeps Len_char at 0 when lengthn reports an empty line. */
    Len_char = min(Len_byte, klength(Line));
    output rt.Line;

    /* j = 1 is what the line-end marker row carries for an empty line. */
    IC = '0'; j = 1;
    if Len_char ne 0 then do j = 1 to Len_char;
        Char = ksubstr(Line, j, 1);
        Hex = put(Char, $hex4.);
        output rt.Char;

        if notalpha(char, 1) = 1 then do;
            /* A non-alphabetic character ends any alphabetic run in progress. */
            if IC = '1' then do;
                word = z;
                output Rt.Word;
            end;
            /* Characters whose hex image is B040 or above are written as
               one-character words.
               NOTE(review): presumably the double-byte Chinese character
               range of the session encoding - confirm. */
            if put(char, $hex4.) ge 'B040' then do;
                k = j;
                word = char;
                IC = '2';
                output Rt.Word;
            end;
            else IC = '3';
        end;
        else do;
            /* Alphabetic character: start a new run or extend the current one. */
            if IC ne '1' then do;
                z = char;
                k = j;
                IC = '1';
            end;
            else z = trim(z)||Char;
            /* Flush the run when this is the last character of the line.
               The loop bound Len_char is tested directly, so klength() is
               not re-evaluated on the 2400-byte Line for every alphabetic
               character. */
            if j = Len_char then do;
                word = z;
                output Rt.Word;
            end;
        end;
    end;

    /* One marker row per line, so line ends are counted in rt.Char. */
    Char = '0A0D'x;
    Hex = put(Char, $hex4.);
    output rt.Char;
run;
/* Write the word table to out.txt, replacing any existing file.
   No DBMS= is given, so PROC EXPORT picks the format from the file extension. */
proc export
    data=rt.Word
    outfile='D:\Myex\Bisai\out.txt'
    replace;
run;
/*------------------------------------------------------------------------------------
TXT2CH3
------------------------------------------------------------------------------------*/
filename dt 'D:\Myex\Bisai';
libname rt 'D:\Myex\Bisai';

/*
 * TXT2CH3, step 1 of 3: read Bisai.txt line by line.
 * Each row holds the line number, the raw line text, its length in bytes
 * and its length in characters (0 for a line whose lengthn is 0).
 */
data rt.Line1;
    length LineNum 8 Line $2400 Len_byte Len_char 8;
    infile dt(Bisai.txt) truncover lrecl=2400;
    input line $char2400.;

    Len_byte = lengthn(line);
    Len_char = min(klength(line), Len_byte);
    LineNum  = _n_;
run;
/*
 * TXT2CH3, step 2 of 3: explode every line of rt.line1 into characters.
 * One row per character (j = character position within the line), then one
 * '0A0D'x marker row per line.  For an empty line only the marker row is
 * written, with j = 1.
 */
data rt.char1(drop=Line Len_byte Len_char);
    set rt.line1;
    length j 8 char $2;

    j = 1;
    if Len_char ne 0 then
        do j = 1 to Len_char;
            Char = ksubstr(Line, j, 1);
            Hex  = put(Char, $hex4.);
            output;
        end;

    /* Line-end marker row. */
    Char = '0A0D'x;
    Hex  = put(Char, $hex4.);
    output;
run;
/*
 * TXT2CH3, step 3 of 3: assemble words from the character rows of Rt.Char1.
 *
 * state, buf and k are retained from row to row:
 *   state '0' nothing seen yet
 *         '1' inside a run of alphabetic characters, accumulated in buf
 *         '2' previous character was written as a one-character word
 *         '3' previous character was non-alphabetic and not written
 *   k     character position (j) at which the current word started
 */
data Rt.Word1(keep=LineNum k word);
    set Rt.Char1 end=eof;
    length state $1 k 8 word $20 buf $20;
    retain state '0' buf '' k 0;

    if notalpha(char, 1) ne 1 then do;
        /* Alphabetic character: extend the current run or start a new one. */
        if state = '1' then buf = trim(buf)||Char;
        else do;
            buf = char;
            k = j;
            state = '1';
        end;

        /* Last input row: flush whatever has been collected. */
        if eof then do;
            word = buf;
            output;
        end;
    end;
    else do;
        /* Non-alphabetic character: first close a pending alphabetic run. */
        if state = '1' then do;
            word = buf;
            output;
        end;

        /* Hex image B040 or above: written as a one-character word. */
        if put(char, $hex4.) ge 'B040' then do;
            k = j;
            word = char;
            state = '2';
            output;
        end;
        else state = '3';
    end;
run;
/* Write the three-step word table to out1.txt, replacing any existing file.
   No DBMS= is given, so PROC EXPORT picks the format from the file extension. */
proc export
    data=rt.Word1
    outfile='D:\Myex\Bisai\out1.txt'
    replace;
run;
/* The steps below are character-frequency statistics and other helper
   programs used for self-checking. */

/* Check that the data sets produced by the one-step program (Line, Char,
   Word) match those produced by the three-step program (Line1, Char1, Word1). */
proc compare
    base=Rt.Line
    compare=Rt.Line1;
run;

proc compare
    base=Rt.Char
    compare=Rt.Char1;
run;

proc compare
    base=Rt.Word
    compare=Rt.Word1;
run;
/* Line-length statistics: maximum and total of the per-line byte count
   and character count. */
proc means data=rt.Line max sum;
    var Len_byte
        Len_char;
run;
/* Frequency of every Char*Hex pair in rt.Char, saved to rt.Fr_ch without
   printed output.  The step is run once only: repeating it would merely
   regenerate the same rt.Fr_ch. */
proc freq data=rt.Char;
    tables Char*Hex / out=rt.Fr_ch noprint;
run;

/* Same table ordered from the most frequent character to the least. */
proc sort data=rt.Fr_ch out=rt.Fr_chst;
    by descending count;
run;
/* Compare the number of characters SAS and R read on each line. */
proc import
    datafile='D:\Myex\Bisai\Rll.txt'
    out=Rt.RLenLine
    replace;
run;

/* The two tables are merged without a BY statement, i.e. row n of Rt.Line is
   paired with row n of the R table.
   NOTE(review): assumes both tables have one row per line in the same
   order - confirm against how Rll.txt is produced.
   Only rows where the two character counts differ are kept. */
data rt.line_dif;
    merge Rt.Line
          Rt.Rlenline(rename=(Lenchar=RLenchar));
    Linedif = RLenchar - Len_char;
    if Linedif = 0 then delete;
run;
/* Load the character-frequency table exported from R. */
proc import
    datafile='D:\Myex\Bisai\Rfrst1.csv'
    out=Rt.Rfrst1
    replace;
run;

/* Compare the sorted SAS frequency table with the R one:
   char is matched with char, and count with Freq. */
proc compare
    base=rt.Fr_chst
    compare=Rt.Rfrst1;
    var  char count;
    with char Freq;
run;
/*
------------------------------------------------------------------------------------
版权归属:Jia-gang Wang, E-mail: jiagangwang@yahoo.com.cn
评论: 完成了比赛的目标,同时处理行,从而证明了中文是完全可以处理的。
程序可读性好,有注释文档。
参赛热情高。 一等奖+热心分数4+学术分数4+经验100
------------------------------------------------------------------------------------
上面是暂定的。最终结果将结合论坛会员的评论做最后的结论。
*/