{$A-}  (* allow recursive code for CP/M-80 *)

PROGRAM Squeezer;

  CONST version   = '1.9  last update 01-27-85';
    (* version modified to run under CP/M-80.  Changes include
       adding compiler directive to allow recursive code under
       CP/M-80, and changing the file types to 'file' and using
       blockread/write for file I/O.   Jeff Duncan
    *)

{ CP/M compatible file squeezer utility.

This  translation  uses the Huffman algorithm to develop a binary tree
representing the decoding information for a variable length bit string
code for each  input  value.   Each  string's  length  is  in  inverse
proportion to its frequency of appearance in the incoming data stream.
The encoding table is derived from the decoding table.

  The  range of valid values into the Huffman algorithm are the values
of a byte stored in an integer plus the special endfile  value  chosen
to be an adjacent value.  Overall, 0-SPEOF.

  The algorithm develops the single element trees into a single binary
tree by forming subtrees rooted in interior nodes having weights equal
to the sum of weights of all their descendents and having depth counts
indicating the depth of their longest paths.

  When  all  trees  have been formed into a single tree satisfying the
heap property (on weight, with depth as a tie breaker) then the binary
code assigned to a leaf (value to be encoded) is then  the  series  of
left  (0) and right (1) paths leading from the root to the leaf.  Note
that trees are removed from the heaped list by moving the last element
over the top element and reheaping the shorter list.

  To  further compress the output code stream, all bytes pass directly
through except for:
        1) DLE is encoded as (DLE, zero).
        2) repeated byte values (count >= 3) are encoded as
           (value, DLE, count).

  In the original design it was believed that a Huffman code would fit
in  the  same number of bits that will hold the sum of all the counts.
That was disproven by a user's file and was a rare but  infamous  bug.
This  version  attempts  to  choose  among  equally  weighted subtrees
according to their maximum depths to avoid unnecessarily  long  codes.
In  case that is not sufficient to guarantee codes <= 16 bits long, we
initially scale the counts so the total fits in an  unsigned  integer,
but if codes longer than 16 bits are generated the counts are rescaled
to a lower ceiling and code generation is retried.

  The  "node"  array  of  structures  contains the nodes of the binary
tree.  The first  NUMVALS  nodes  are  the  leaves  of  the  tree  and
represent  the  values of the data bytes being encoded and the special
endfile, SPEOF.  The remaining nodes become the internal nodes of  the
tree.

  Program states:
     NoHist    don't consider previous input
     SentChar  lastchar set, no lookahead yet
     SendNewC  newchar set, previous sequence done
     SendCnt   newchar set, DLE sent, send count next
}

    CONST space     = ' ';
        Error     = -1;
        Null      = -2;
        Recognize = $FF76;  { unlikely pattern }
        DLE       = #$90;
        SPEOF     = 256;    { special endfile token }
        NumVals   = 257;    { 256 data values plus SPEOF }
        NumNodes  = 513;    { = 2 * NUMVALS - 1 = number of nodes }
        NoChild   = -1;     { indicates end of path through tree }
        maxcount  = MAXINT; { biggest UNSIGNED integer }
        buffersize = 128; (* 128 byte buffer for output *)

   TYPE FileName = STRING[30];
        tfile = FILE;
        ValType  = ARRAY[0..numvals] OF integer;
        StateTypes = (NoHist,SentChar,SendNewC,SendCnt,EndFile);
        NodeType = RECORD
                     weight: real;       { number of appearances }
                     tdepth: integer;    { length on longest path in tree }
                     lchild, rchild: integer;  { indices to next level }
                   END;

    VAR InFileName, OutFileName: FileName;
        in_file, out_file: FILE;
        infilebuffer : ARRAY[1..2048] OF byte; (* allow 2k input buffer *)
        infilepointer, infilesize, inbuffersize : integer;
        infilebytes : real;
        outfilebuffer : ARRAY[1..128] OF byte;
        outfilepointer, outfilesize, outbuffer_num : integer;
        start, finish, i: integer;
        crc: integer;      { Cyclic Redundancy Check code }

        likect: integer;   { count of consecutive identical chars }
        lastchar, newchar: char;

        State: StateTypes;
        EOFlag, done: boolean;

        node: ARRAY[0..NUMNODES] OF NodeType;
        dctreehd: integer;        { index to head node of final tree }

{ This is the encoding table:  The bit strings have first bit in = low bit.
  Note that counts were scaled so code fits UNSIGNED integer }

        codelen, code: ARRAY[0..numvals] OF integer;
          { number of bits in code & code itself, right adjusted }
        tcode: integer;     { temporary code value }

        curin: integer;     { Value currently being encoded }
        cbitsrem: integer;  { Number of code string bits remaining }
        ccode: integer;   { Current code shifted so next code bit is at right }

        answer : STRING[10]; (* debug mode vars *)
        debug : boolean;

{$I sq1.pas}

  PROCEDURE Initialize_Huffman;

  {  Initialize  the  Huffman  translation.  This requires reading the
  input file through any preceding translation functions  to  get  the
  frequency distribution of the various values.  }

    VAR c, i: integer;
        btlist: ValType;   { list of intermediate binary trees }
        listlen: integer;  { length of btlist }
        ceiling: integer;  { limit for scaling }

{  Heap  and Adjust maintain a list of binary trees as a heap with the
top indexing the binary tree on the list which has  the  least  weight
or,  in  case  of equal weights, least depth in its longest path.  The
depth part is not strictly necessary, but tends to  avoid  long  codes
which might provoke rescaling.  }

  PROCEDURE Heap(VAR list: ValType; l: integer);

    VAR i, len: integer;

      BEGIN (* heap *)
        len := (l - 2) DIV 2;
        FOR i:=len DOWNTO 0 DO
          adjust(i, l - 1, list);
      END;  (* heap *)

    PROCEDURE PrintFrequency;

      VAR i, j: integer;

      BEGIN (* printfrequency *)
        j := 0;
        FOR i:=0 TO numvals-1 DO
          IF node[i].weight>0
            THEN
              BEGIN
                j := j + 1;
                writeln('node ',i:3,'  weight is ',node[i].weight:4:0);
              END;
        writeln;
        writeln('Total node count is ',j);
      END; (* print frequency *)

    PROCEDURE PrintList;

      VAR i: integer;
          str: STRING[10];

      BEGIN (* print list *)
        writeln(', waiting');
        readln(str);
        FOR i:=0 TO numvals-1 DO
          BEGIN
            write('number ',i:3,'  length ',codelen[i]:2);
            write('  weight ',node[i].weight:4:0);
            IF codelen[i]>0
              THEN PrintBits(codelen[i], code[i])
              ELSE writeln;
          END;
      END; (* print list *)

    BEGIN (* init huffman *)
      writeln('Pass 1: Analysis');
      crc := 0;
      zero_tree;
      state := NoHist;
      EOFlag := false;
      REPEAT    { Build frequency info in tree }
        c := ord(getcnr);
        IF EOFlag
          THEN
            c := SPEOF;
          WITH node[c] DO
            IF weight < maxcount
              THEN
                weight := weight + 1;
          IF EOFlag
            THEN writeln('End of file found.');
      UNTIL (EOFlag);
      IF debug
        THEN
          PrintFrequency;
      ceiling := MAXCOUNT;

      { Try to build encoding table. Fail if any code is > 16 bits long. }
      REPEAT
        IF (ceiling <> MAXCOUNT)
          THEN
            write('*** rescaling ***, ');
        scale(ceiling);
        ceiling := ceiling DIV 2;  {in case we rescale again}
        listlen := 0;   {find length of list and build single nodes}
        FOR i:=0 TO numvals-1 DO
          BEGIN
            IF node[i].weight > 0
              THEN
                BEGIN
                  node[i].tdepth := 0;
                  btlist[listlen] := i;
                  listlen := listlen + 1;
                END;
          END;
        heap(btlist, listlen-1);  { *** changed from listlen }
        Build_Tree(btlist, listlen);
        FOR i := 0 TO NUMVALS-1 DO
          codelen[i] := 0;
      UNTIL (buildenc(0,dctreehd) <> ERROR);
      IF debug
        THEN
          PrintList;
      { Initialize encoding variables }
      cbitsrem := 0;
      curin := 0;
    END; (* init huffman *)

  FUNCTION gethuff: char; {returns byte values except for EOF}

  { Get an encoded byte or EOF. Reads from specified stream AS NEEDED.

    There  are  two  unsynchronized  bit-byte  relationships here: The
  input stream bytes are converted to bit strings of  various  lengths
  via  the  static  variables  named  Cxxxxx.   These  bit strings are
  concatenated without padding to become the stream of encoded  result
  bytes,  which  this function returns one at a time.  The EOF (end of
  file) is converted to SPEOF for convenience  and  encoded  like  any
  other input value.  True EOF is returned after that.  }

    VAR rbyte: integer;       {Result byte value}
        need, take: integer;  {numbers of bits}
        return: integer;

    BEGIN
      rbyte := 0;
      need := 8;        {build one byte per call}
      return := ERROR;  {start off with an error}

      {Loop  to  build  a byte of encoded data.  Initialization forces
      read the first time}

      WHILE return = ERROR DO
        BEGIN
          IF cbitsrem >= need
            THEN
              BEGIN {Current code fullfills our needs}
                IF need = 0
                  THEN
                    return := rbyte AND $00FF
                  ELSE
                    BEGIN
                      rbyte := rbyte OR (ccode shl (8 - need));
                    {take what we need}
                      ccode := ccode shr need;   {and leave the rest}
                      cbitsrem := cbitsrem - need;
                      return := rbyte AND $00FF;
                    END;
              END
            ELSE
              BEGIN
                IF cbitsrem > 0
                  THEN
                    BEGIN  {We need more than current code}
                      rbyte := rbyte OR (ccode shl (8 - need));
                    {take what there is}
                      need := need - cbitsrem;
                    END;
                IF curin = SPEOF
                  THEN
                    BEGIN
                      cbitsrem := 0;
                      IF need=8
                        THEN
                          BEGIN  {end of file}
                            done := true;
                            return := 0; {any valid char value}
                          END
                        ELSE
                          return := rbyte AND $00FF;  {data first}
                    END
                  ELSE
                    BEGIN
                      curin := ord(getcnr);
                      IF EOFlag
                        THEN
                          curin := SPEOF;
                      ccode := code[curin];
                      cbitsrem := codelen[curin];
                    END;
              END;
        END; (* while *)
      gethuff := chr(return);
    END;

  PROCEDURE squeeze;

    VAR 
      c : char;
      char_count : integer;

    BEGIN
      writeln;
      write('Pass 2: Squeezing');
      seek(in_file, 0);
      inbuffersize := 0;
      infilepointer := 1;
      infilesize := filesize(in_file);
      infilebytes := infilesize * 128.;
      rewrite(out_file);
      outfilepointer := 1; (* good idea to start at beginning of buffer *)
      EOFlag := false;
      write(', header');
      Write_Header;
      write(', body');
      state := NoHist;
      done := false;
      c := gethuff;  {prime while loop}
      char_count := 0;
      WHILE NOT done DO
        BEGIN
          putc(out_file, c);
          c := gethuff;
        END;
    END;

BEGIN { Main }

  lowvideo;
  clrscr;
  gotoxy(1,5);
  writeln('File squeezer version ',version);
  writeln;
  debug := false;
  { get filename to process & convert to upper case}
  write('Enter file to squeeze: ');
  readln(InFileName);
  writeln;
  FOR i:=1 TO length(InFileName) DO
    InFileName[i] := upcase(InFileName[i]);
  { Find and change output file type }
  start := 1; { skip leading blanks }
  WHILE (InFileName[start]=space) AND (start <= length(InFileName)) DO
    start := start + 1;
  InFileName := copy(InFileName, start, length(InFileName)-start+1);
  finish := pos('.',InFileName);
  IF finish=0
    THEN OutFileName := InFileName + '.QQQ'
    ELSE BEGIN
           OutFileName := InFileName;
           OutFileName[finish+2] := 'Q';
         END;

  { open source file and check for existence }
  assign(in_file,InFileName);
  assign(out_file,OutFileName);
  {$I-}
  reset(In_File);
  {$I+}
  IF IOresult=0
    THEN
      BEGIN
        inbuffersize := 0;
        infilepointer := 1;
        infilesize := filesize(in_file);
        infilebytes := infilesize * 128.;
        write('The file ',InFileName,' (',infilebytes:6:0);
        writeln(' bytes) is being squeezed to ',OutFilename);
        Initialize_Huffman;
        squeeze;
        writeln(', Done.');
        IF outfilepointer > 1
          THEN
            blockwrite(out_file, outfilebuffer, 1);
        close(in_file);
        close(out_file);
      END
    ELSE
      writeln('Error -- input file doesn''t exist');
END.