{$A-} (* allow recursive code for CP/M-80 *) PROGRAM Squeezer; CONST version = '1.9 last update 01-27-85'; (* version modified to run under CP/M-80. Changes include adding compiler directive to allow recursive code under CP/M-80, and changing the file types to 'file' and using blockread/write for file I/O. Jeff Duncan *) { CP/M compatible file squeezer utility. This translation uses the Huffman algorithm to develop a binary tree representing the decoding information for a variable length bit string code for each input value. Each string's length is in inverse proportion to its frequency of appearance in the incoming data stream. The encoding table is derived from the decoding table. The range of valid values into the Huffman algorithm are the values of a byte stored in an integer plus the special endfile value chosen to be an adjacent value. Overall, 0-SPEOF. The algorithm develops the single element trees into a single binary tree by forming subtrees rooted in interior nodes having weights equal to the sum of weights of all their descendents and having depth counts indicating the depth of their longest paths. When all trees have been formed into a single tree satisfying the heap property (on weight, with depth as a tie breaker) then the binary code assigned to a leaf (value to be encoded) is then the series of left (0) and right (1) paths leading from the root to the leaf. Note that trees are removed from the heaped list by moving the last element over the top element and reheaping the shorter list. To further compress the output code stream, all bytes pass directly through except for: 1) DLE is encoded as (DLE, zero). 2) repeated byte values (count >= 3) are encoded as (value, DLE, count). In the original design it was believed that a Huffman code would fit in the same number of bits that will hold the sum of all the counts. That was disproven by a user's file and was a rare but infamous bug. This version attempts to choose among equally weighted subtrees according to their maximum depths to avoid unnecessarily long codes. In case that is not sufficient to guarantee codes <= 16 bits long, we initially scale the counts so the total fits in an unsigned integer, but if codes longer than 16 bits are generated the counts are rescaled to a lower ceiling and code generation is retried. The "node" array of structures contains the nodes of the binary tree. The first NUMVALS nodes are the leaves of the tree and represent the values of the data bytes being encoded and the special endfile, SPEOF. The remaining nodes become the internal nodes of the tree. Program states: NoHist don't consider previous input SentChar lastchar set, no lookahead yet SendNewC newchar set, previous sequence done SendCnt newchar set, DLE sent, send count next } CONST space = ' '; Error = -1; Null = -2; Recognize = $FF76; { unlikely pattern } DLE = #$90; SPEOF = 256; { special endfile token } NumVals = 257; { 256 data values plus SPEOF } NumNodes = 513; { = 2 * NUMVALS - 1 = number of nodes } NoChild = -1; { indicates end of path through tree } maxcount = MAXINT; { biggest UNSIGNED integer } buffersize = 128; (* 128 byte buffer for output *) TYPE FileName = STRING[30]; tfile = FILE; ValType = ARRAY[0..numvals] OF integer; StateTypes = (NoHist,SentChar,SendNewC,SendCnt,EndFile); NodeType = RECORD weight: real; { number of appearances } tdepth: integer; { length on longest path in tree } lchild, rchild: integer; { indices to next level } END; VAR InFileName, OutFileName: FileName; in_file, out_file: FILE; infilebuffer : ARRAY[1..2048] OF byte; (* allow 2k input buffer *) infilepointer, infilesize, inbuffersize : integer; infilebytes : real; outfilebuffer : ARRAY[1..128] OF byte; outfilepointer, outfilesize, outbuffer_num : integer; start, finish, i: integer; crc: integer; { Cyclic Redundancy Check code } likect: integer; { count of consecutive identical chars } lastchar, newchar: char; State: StateTypes; EOFlag, done: boolean; node: ARRAY[0..NUMNODES] OF NodeType; dctreehd: integer; { index to head node of final tree } { This is the encoding table: The bit strings have first bit in = low bit. Note that counts were scaled so code fits UNSIGNED integer } codelen, code: ARRAY[0..numvals] OF integer; { number of bits in code & code itself, right adjusted } tcode: integer; { temporary code value } curin: integer; { Value currently being encoded } cbitsrem: integer; { Number of code string bits remaining } ccode: integer; { Current code shifted so next code bit is at right } answer : STRING[10]; (* debug mode vars *) debug : boolean; {$I sq1.pas} PROCEDURE Initialize_Huffman; { Initialize the Huffman translation. This requires reading the input file through any preceding translation functions to get the frequency distribution of the various values. } VAR c, i: integer; btlist: ValType; { list of intermediate binary trees } listlen: integer; { length of btlist } ceiling: integer; { limit for scaling } { Heap and Adjust maintain a list of binary trees as a heap with the top indexing the binary tree on the list which has the least weight or, in case of equal weights, least depth in its longest path. The depth part is not strictly necessary, but tends to avoid long codes which might provoke rescaling. } PROCEDURE Heap(VAR list: ValType; l: integer); VAR i, len: integer; BEGIN (* heap *) len := (l - 2) DIV 2; FOR i:=len DOWNTO 0 DO adjust(i, l - 1, list); END; (* heap *) PROCEDURE PrintFrequency; VAR i, j: integer; BEGIN (* printfrequency *) j := 0; FOR i:=0 TO numvals-1 DO IF node[i].weight>0 THEN BEGIN j := j + 1; writeln('node ',i:3,' weight is ',node[i].weight:4:0); END; writeln; writeln('Total node count is ',j); END; (* print frequency *) PROCEDURE PrintList; VAR i: integer; str: STRING[10]; BEGIN (* print list *) writeln(', waiting'); readln(str); FOR i:=0 TO numvals-1 DO BEGIN write('number ',i:3,' length ',codelen[i]:2); write(' weight ',node[i].weight:4:0); IF codelen[i]>0 THEN PrintBits(codelen[i], code[i]) ELSE writeln; END; END; (* print list *) BEGIN (* init huffman *) writeln('Pass 1: Analysis'); crc := 0; zero_tree; state := NoHist; EOFlag := false; REPEAT { Build frequency info in tree } c := ord(getcnr); IF EOFlag THEN c := SPEOF; WITH node[c] DO IF weight < maxcount THEN weight := weight + 1; IF EOFlag THEN writeln('End of file found.'); UNTIL (EOFlag); IF debug THEN PrintFrequency; ceiling := MAXCOUNT; { Try to build encoding table. Fail if any code is > 16 bits long. } REPEAT IF (ceiling <> MAXCOUNT) THEN write('*** rescaling ***, '); scale(ceiling); ceiling := ceiling DIV 2; {in case we rescale again} listlen := 0; {find length of list and build single nodes} FOR i:=0 TO numvals-1 DO BEGIN IF node[i].weight > 0 THEN BEGIN node[i].tdepth := 0; btlist[listlen] := i; listlen := listlen + 1; END; END; heap(btlist, listlen-1); { *** changed from listlen } Build_Tree(btlist, listlen); FOR i := 0 TO NUMVALS-1 DO codelen[i] := 0; UNTIL (buildenc(0,dctreehd) <> ERROR); IF debug THEN PrintList; { Initialize encoding variables } cbitsrem := 0; curin := 0; END; (* init huffman *) FUNCTION gethuff: char; {returns byte values except for EOF} { Get an encoded byte or EOF. Reads from specified stream AS NEEDED. There are two unsynchronized bit-byte relationships here: The input stream bytes are converted to bit strings of various lengths via the static variables named Cxxxxx. These bit strings are concatenated without padding to become the stream of encoded result bytes, which this function returns one at a time. The EOF (end of file) is converted to SPEOF for convenience and encoded like any other input value. True EOF is returned after that. } VAR rbyte: integer; {Result byte value} need, take: integer; {numbers of bits} return: integer; BEGIN rbyte := 0; need := 8; {build one byte per call} return := ERROR; {start off with an error} {Loop to build a byte of encoded data. Initialization forces read the first time} WHILE return = ERROR DO BEGIN IF cbitsrem >= need THEN BEGIN {Current code fullfills our needs} IF need = 0 THEN return := rbyte AND $00FF ELSE BEGIN rbyte := rbyte OR (ccode shl (8 - need)); {take what we need} ccode := ccode shr need; {and leave the rest} cbitsrem := cbitsrem - need; return := rbyte AND $00FF; END; END ELSE BEGIN IF cbitsrem > 0 THEN BEGIN {We need more than current code} rbyte := rbyte OR (ccode shl (8 - need)); {take what there is} need := need - cbitsrem; END; IF curin = SPEOF THEN BEGIN cbitsrem := 0; IF need=8 THEN BEGIN {end of file} done := true; return := 0; {any valid char value} END ELSE return := rbyte AND $00FF; {data first} END ELSE BEGIN curin := ord(getcnr); IF EOFlag THEN curin := SPEOF; ccode := code[curin]; cbitsrem := codelen[curin]; END; END; END; (* while *) gethuff := chr(return); END; PROCEDURE squeeze; VAR c : char; char_count : integer; BEGIN writeln; write('Pass 2: Squeezing'); seek(in_file, 0); inbuffersize := 0; infilepointer := 1; infilesize := filesize(in_file); infilebytes := infilesize * 128.; rewrite(out_file); outfilepointer := 1; (* good idea to start at beginning of buffer *) EOFlag := false; write(', header'); Write_Header; write(', body'); state := NoHist; done := false; c := gethuff; {prime while loop} char_count := 0; WHILE NOT done DO BEGIN putc(out_file, c); c := gethuff; END; END; BEGIN { Main } lowvideo; clrscr; gotoxy(1,5); writeln('File squeezer version ',version); writeln; debug := false; { get filename to process & convert to upper case} write('Enter file to squeeze: '); readln(InFileName); writeln; FOR i:=1 TO length(InFileName) DO InFileName[i] := upcase(InFileName[i]); { Find and change output file type } start := 1; { skip leading blanks } WHILE (InFileName[start]=space) AND (start <= length(InFileName)) DO start := start + 1; InFileName := copy(InFileName, start, length(InFileName)-start+1); finish := pos('.',InFileName); IF finish=0 THEN OutFileName := InFileName + '.QQQ' ELSE BEGIN OutFileName := InFileName; OutFileName[finish+2] := 'Q'; END; { open source file and check for existence } assign(in_file,InFileName); assign(out_file,OutFileName); {$I-} reset(In_File); {$I+} IF IOresult=0 THEN BEGIN inbuffersize := 0; infilepointer := 1; infilesize := filesize(in_file); infilebytes := infilesize * 128.; write('The file ',InFileName,' (',infilebytes:6:0); writeln(' bytes) is being squeezed to ',OutFilename); Initialize_Huffman; squeeze; writeln(', Done.'); IF outfilepointer > 1 THEN blockwrite(out_file, outfilebuffer, 1); close(in_file); close(out_file); END ELSE writeln('Error -- input file doesn''t exist'); END.