(* Multivariate Analysis Package - Kmeans Clustering Module Copyright 1986 Douglas L. Anderton. This program may be freely circulated so long as it is not sold for profit and any charge does not exceed costs of reproduction *) Program Cluster(Input,Output); Const N=10; { number of vars in clustering } C=25; { max-min number of clusters to consider, e.g. between 50 and 20 (or 100 and 70, etc.) clusters should be examined } NBYC=250; WORK=3500; Type SUBS=1..N; NCLS=1..C; NSAV=1..NBYC; RVEC = Array [SUBS] Of Real; RVC2 = Array [NCLS] Of Real; RVC3 = Array [NSAV] Of Real; LVEC = Array [1..WORK] Of Real; { limited only by memory } IVEC = Array [SUBS] Of Integer; S8 = Array [SUBS] Of String[8]; Var dfile, ofile, tfile : Text; sel : IVEC; varn : S8; vars, miss : RVEC; i, j, k, l, nc, n1, n2, nv : Integer; dv, ot, f : Byte; x : LVEC; s : RVC3; e : RVC2; d : Real; df : String[1]; tfl : String[12]; {$I TRANSBUF.LIB} Procedure openfiles(Var dfile, ofile:Text; Var ot:Byte); Var dfl, ofl:String[12]; Begin ClrScr; Writeln(' *** CLUSTER: KMEANS CLUSTERING ***'); Writeln; Write('Name of the data file? '); Readln(dfl); Assign(dfile,dfl); Reset(dfile); Write('Name of the output file? '); Readln(ofl); Assign(ofile,ofl); Rewrite(ofile); ot:= 0; If (ofl='CON:') Or (ofl='con:') Then ot:=1; If (ofl='LST:') Or (ofl='lst:') Then ot:=2; If (ot=2) Then Begin Writeln(ofile,'Multivariate Analysis Package (1.6) - ', 'Program: CLUSTER, Datafile: ',dfl); Writeln(ofile); End; End; (* Of openfiles *) Procedure wait(ot:Byte); Begin If ot=1 Then Begin Write('- Press any key to continue -'); While Not KeyPressed Do; ClrScr; End; End; (* of wait *) Procedure selectvar(Var sel:IVEC; Var varn:S8; Var miss:RVEC; Var nv:Integer; Var dv:Byte); Var cfile:Text; cfl:String[12]; i,j,f:Integer; mis:Real; van:String[8]; Begin Write('Name of the codebook file (or NONE)? '); Readln(cfl); If (cfl<>'NONE') And (cfl<>'none') Then f:=1 Else f:=0; If f=1 Then Begin Assign(cfile,cfl); Reset(cfile); End; Writeln; Write('How many variables in data file? '); Readln(nv); Write('Number of variables to use in CLUSTER? '); Readln(dv); Writeln('If multiple variables specified all should be comparable scale.'); For i := 1 To dv Do Begin Write('Column number for variable ',i,'? '); Readln(sel[i]); Str(sel[i]:3,varn[i]); miss[i]:=-1E37; End; If f=1 Then Begin For j:=1 to nv Do Begin mis:=-1E37; Readln(cfile,f,van,mis); For i:=1 to dv Do If f=sel[i] Then Begin varn[i]:=van; miss[i]:=mis; Writeln('Col: ',sel[i],' Name: ',varn[i],' Missing: ',miss[i]:6); End; End; Close(cfile); wait(1); End; End; (* Of selectvar *) Procedure getcase(Var vars:RVEC; sel:IVEC; nv, dv:Integer; Var dfile:Text); Var i, j:Integer; x:Real; Begin For i:=1 To nv Do Begin Read(dfile,x); For j:=1 To dv Do If (sel[j]=i) Then vars[j]:=x; End; End; (* Of getcase *) Procedure kmeans(Var x:LVEC; Var s:RVC3; Var e:RVC2; Var l, nc:Integer; Var dv:Byte; Var d:Real); Var q: Array [NCLS] Of Integer; i, it, j, k, r, v, w: Integer; a, b, f, h, g, t: Real; Begin For i:=1 To l Do Begin q[i]:=0; e[i]:=0.0; For j:=1 To dv Do s[(i-1)*dv+j]:=0.0; End; For i:=1 To nc Do Begin k:=Trunc(x[(i-1)*(dv+1)+dv+1]+0.1); q[k]:=q[k]+1; For j:=1 To dv Do s[(k-1)*dv+j]:=s[(k-1)*dv+j]+x[(i-1)*(dv+1)+j]; End; For i:=1 To l Do For j:=1 To dv Do s[(i-1)*dv+j]:=s[(i-1)*dv+j]/q[i]; d:=0.; For i:=1 To nc Do Begin f:=0; k:=Trunc(x[(i-1)*(dv+1)+dv+1]+0.1); For j:=1 To dv Do f:=f+Sqr(s[(k-1)*dv+j]-x[(i-1)*(dv+1)+j]); e[k]:=e[k]+f; d:=d+f; End; i:=0; it:=0; While itnc Then i:=i-nc; r:=Trunc(x[(i-1)*(dv+1)+dv+1]+0.1); If q[r]>1 Then Begin h:=q[r]; h:=h/(h-1.0); f:=0; For j:=1 To dv Do f:=f+Sqr(s[(r-1)*dv+j]-x[(i-1)*(dv+1)+j]); a:=h*f; b:=1E30; For j:=1 To l Do Begin If j<>r Then Begin h:=q[j]; h:=h/(h+1.0); f:=0; For k:=1 To dv Do f:=f+Sqr(s[(j-1)*dv+k]-x[(i-1)*(dv+1)+k]); f:=h*f; If f<=b Then Begin b:=f; v:=j; w:=q[j]; End; End; End; If b>=a Then Begin it:=it+1; Write(' ',it); End Else Begin it:=0; e[r]:=e[r]-a; e[v]:=e[v]+b; d:=d-a+b; h:=q[r]; g:=w; a:=1./(h-1.0); b:=1./(g+1.0); For k:=1 To dv Do Begin s[(r-1)*dv+k]:=a*(h*s[(r-1)*dv+k]-x[(i-1)*(dv+1)+k]); s[(v-1)*dv+k]:=b*(g*s[(v-1)*dv+k]+x[(i-1)*(dv+1)+k]); End; x[(i-1)*(dv+1)+dv+1]:=v; q[r]:=q[r]-1; q[v]:=q[v]+1; End; End; End; End; (* Of kmeans *) Begin (* main *) openfiles(dfile, ofile, ot); selectvar(sel, varn, miss, nv, dv); (* initialize *) ClrScr; Writeln; n1:=0; n2:=C+1; While (n2-n1)>C Do Begin Write('Minimum number of Clusters to investigate? ');Read(n1); Write(' Maximum number? ');Readln(n2); If(n2-n1)>C Then Writeln('Too many, specify smaller range.'); End; (* read in matrix *) Writeln; nc:=0; While Not EOF(dfile) Do Begin getcase(vars, sel, nv, dv, dfile); If Frac(nc/10)=0.0 Then Write('+'); If Not EOF(dfile) Then Begin f:=0; For j:=1 To dv Do If vars[j]=miss[j] Then f:=1; If f=0 Then Begin nc:=nc+1; For j:=1 To dv Do Begin l:=(nc-1)*(dv+1)+j; If l>WORK Then Begin Writeln('*** STORAGE EXCEEDED: Try Fewer (MAX-MIN) Clusters ***'); bdos(0); End; x[l]:=vars[j]; End; End; End; End; Reset(dfile); (* compute And Output *) For l:=n1 To n2 Do Begin k:=0; For j:=1 To nc Do x[(j-1)*(dv+1)+dv+1]:=j-(j div l)*l+1; ClrScr; Writeln('Iterating on ',l:2,' clusters.'); kmeans(x,s,e,l,nc,dv,d); ClrScr; Writeln(ofile); Writeln(ofile,'Total SSQ Distances Within ',l:2,' Clusters: ',d:10); If ot<>1 Then Writeln('Total SSQ Distances Within ',l:2,' Clusters: ',d:10); wait(ot); Writeln(ofile); For i:=1 To l Do Begin Writeln(ofile,'Cluster: ',i:2,' Within SSQ: ',e[i]:10); Writeln(ofile,' Centroids:'); For j:=1 To dv Do Begin Write(ofile,' ',sel[j]:2,': ',s[(i-1)*dv+j]:10); If (j=5) And (dv>5) Then Writeln(ofile); End; Writeln(ofile); If ((Frac(i/5)=0.0) Or (i=l)) Then wait(ot); End; Writeln('Would you like data and cluster membership for this level'); Write('written to a merged file (y/n)? '); Readln(df); If (df='y') Or (df='Y') Then Begin Writeln; Write('Name of new output file? '); Readln(tfl); initwrite(tfl); k:=0; While Not EOF(dfile) Do Begin f:=0; For i:=1 To nv Do Read(dfile,s[i]); If Frac(k/10)=0.0 Then Write('+'); If Not EOF(dfile) Then Begin For i:=1 To nv Do For j:=1 To dv Do If (sel[j]=i) And (s[i]=miss[j]) Then f:=1; s[nv+1]:=-1E37; If f=0 Then Begin k:=k+1; s[nv+1]:=x[(k-1)*(dv+1)+dv+1]; End; For i:=1 To nv+1 Do Write(Usr,s[i]:9,' '); Writeln(Usr); End; End; Close(dfile); endwrite; Assign(dfile,tfl); Reset(dfile); nv:=nv+1; End; ClrScr; End; Close(dfile); Close(ofile); Assign(dfile,'MAPSTAT.COM'); Execute(dfile); End.