hts-specs/CRAMcodecs.tex at master · samtools/hts-specs · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
%&pdfLaTeX
% !TEX encoding = UTF-8 Unicode
\documentclass[a4paper]{article}
\usepackage{ifxetex}
\ifxetex
\usepackage{fontspec}
\setmainfont[Mapping=tex-text]{STIXGeneral}
\else
\usepackage[T1]{fontenc}
\usepackage[latin1]{inputenc}
\fi
\usepackage{textcomp}

\usepackage{graphicx}
\usepackage{array}
\usepackage{fixltx2e}
\usepackage{amssymb}
\usepackage{fancyhdr}
\usepackage{amsmath}
\usepackage{float}
\usepackage{algpseudocode}
\usepackage{threeparttable}
\usepackage{xfrac}
\usepackage{tikz}
\usetikzlibrary{decorations.pathreplacing}
\renewcommand{\headrulewidth}{0pt}
\renewcommand{\footrulewidth}{0pt}

\setlength{\parindent}{0cm}
\setlength{\parskip}{0.18cm}
\usepackage[hmargin=2cm,vmargin=2.5cm,bindingoffset=0.0cm]{geometry}
\usepackage[pdfborder={0 0 0}]{hyperref}
\begin{document}

% \floatstyle{boxed}
\newfloat{program}{thHp}{prg}
\floatname{program}{Program}

\input{CRAMcodecs.ver}
\title{CRAM codec specification (version 3.1)}
\author{samtools-devel@lists.sourceforge.net}
\date{\headdate}
\maketitle


\begin{quote}\small
The master version of this document can be found at
\url{https://github.com/samtools/hts-specs}.\\
This printing is version~\commitdesc\ from that repository,
last modified on the date shown above.
\end{quote}

\begin{center}
\textit{license: Apache 2.0}
\end{center}
\vspace*{1em}

\newlength{\maxwidth}
\algnewcommand\algorithmicto{\text{ \textbf{to} }}
\algnewcommand\algorithmicin{\text{ \textbf{in} }}
\newcommand{\algalign}[2] % #1 = text to left, #2 = text to right
{\makebox[\maxwidth][l]{$#1{}$}${}#2$}
\algblockdefx[Foreach]{Foreach}{EndForeach}[1]{\textbf{foreach} #1 \textbf{do}}{\textbf{end foreach}}

\makeatletter
\newcommand*{\bdiv}{%
  \nonscript\mskip-\medmuskip\mkern5mu%
  \mathbin{\operator@font div}\penalty900\mkern5mu%
  \nonscript\mskip-\medmuskip
}
\newcommand*{\bitand}{%
  \nonscript\mskip-\medmuskip\mkern5mu%
  \mathbin{\operator@font AND}\penalty900\mkern5mu%
  \nonscript\mskip-\medmuskip
}
\newcommand*{\logand}{% keyword rather than mathematical operator
  \nonscript\mskip-\medmuskip\mkern5mu%
  \mathbin{\operator@font \textbf{and}}\penalty900\mkern5mu%
  \nonscript\mskip-\medmuskip
}
\newcommand*{\logor}{% keyword rather than mathematical operator
  \nonscript\mskip-\medmuskip\mkern5mu%
  \mathbin{\operator@font \textbf{or}}\penalty900\mkern5mu%
  \nonscript\mskip-\medmuskip
}
\newcommand*{\bitor}{%
  \nonscript\mskip-\medmuskip\mkern5mu%
  \mathbin{\operator@font OR}\penalty900\mkern5mu%
  \nonscript\mskip-\medmuskip
}
\newcommand*{\bitxor}{%
  \nonscript\mskip-\medmuskip\mkern5mu%
  \mathbin{\operator@font XOR}\penalty900\mkern5mu%
  \nonscript\mskip-\medmuskip
}
\newcommand\concat{\mathbin{+\mkern-10mu+\,}}
% \newcommand\concat{\mathbin{\oplus\,}}
% \newcommand\concat{\mathbin{\widetilde{+}\,}}
\newcommand\shiftl{\mathbin{<\mkern-3mu<\,}}
\newcommand\shiftr{\mathbin{>\mkern-3mu>\,}}
%\newcommand{\concat}{\ensuremath{+\!\!\!\!+\,}}
\makeatother

\section{Introduction}

This document covers the compression and decompression algorithms
(codecs) specific to the CRAM format.  All bar the first of these were
introduced in CRAM v3.1.

It does not cover the CRAM format itself.  For that see \url{http://samtools.github.io/hts-specs/}.

\subsection{Pseudocode introduction}

Various parts of this specification are written in a simplistic
pseudocode.  This intentionally does not make explicit use of data
types and has minimal error checking.  The number of operators is kept
to a minimum, but some are necessary and may be language specific.
Due to the lack of explicit data types, we also have different
division operators to symbolise floating point and integer divisions.

The pseudocode doesn't prescribe any particular programming paradigm -
functional, procedural or object oriented - but it does have a few
implicit assumptions.  Variables are considered to be passed between
functions via unspecified means.  For example the Range Coder sets
$range$ and $code$ during creation and these are used during the
decoding steps, but are not explicitly passed in as variables.  We
make the implicit assumption that they are simply local variables of the
particular usage of this range coder.  Other than ephemeral loop
counters, we do not reuse variable names so the intention should be
clear.

The exception to the above is occasionally we need to have multiple
instances of a particular data type, such as Order-1 decoding will
have many models.  Here we use an object oriented way of describing
the problem with $instance$.\textsc{Function} notation.

Note some functions may return multiple items, such as
\texttt{return (}\textit{value, length}\texttt{)}, but the calling
code may assign a single variable to this result.  In this case the first
value \textit{value} will be used and \textit{length} will be discarded.

\subsection{Mathematical operators}

\begin{tabular}{rl}
\hline
\textbf{Operator} & \textbf{Description}\\
\hline
$a + b$       & Addition \\
$a - b$       & Subtraction \\
$a \times b$  & Multiplication \\
$a\ /\ b$     & Floating point division $a/b$\\
$a \bdiv b$   & Integer division $a/b$, equivalent to $\lfloor{a/b}\rfloor$\\
$a \bmod b$   & Integer modulo (remainder) $a - b\times(a \bdiv b)$\\
$a = b$       & Compares $a$ and $b$ variables, yielding true if they match, false otherwise\\
$a \gets b$   & Assigns value of $b$ to variable $a$\\
$a \shiftl b$ & Bit-wise left shift $a$ by $b$ bits, shifting in zeros \\
$a \shiftr b$ & Bit-wise right shift $a$ by $b$ bits, shifting in zeros \\
$a \bitand b$ & Bit-wise AND operator, joining values $a$, $b$\\
$a \bitor b$  & Bit-wise OR operator, joining values $a$, $b$\\
$a \logor b$  & Logical OR operator, joining expressions $a$, $b$\\
$a \logand b$ & Logical AND operator, joining expressions $a$, $b$\\
$a \concat b$ & String concatenation of $a$ and $b$: $ab$\\
$V_i$         & Element $i$ of vector $V$\\
              & The entire vector $V$ may be passed into a function\\
$W_{i,j}$     & Element $i,j$ of two-dimensional vector $W$.\\
              & The entire vector $W$ or a one dimensional slice $W_i$ (of size $j$) may be passed into a function.\\
\hline
\end{tabular}

Note that string concatenation with the $\concat$ operator
assumes the left and right values are converted to string form.  For
example ``level'' $\concat 42$ will convert the integer 42 to ``42''
and produce the string ``level42''.

\subsection{Implicit functions}

\begin{tabular}{rl}
\hline
\textbf{Operator} & \textbf{Description}\\
\hline
\textsc{Min}$(a,\ b)$ & Smaller of $a$ and $b$\\
\textsc{Max}$(a,\ b)$ & Larger of $a$ and $b$\\
\textsc{ReadUint8} & Read an 8-bit unsigned integer (1 byte) from unspecified input source\\
\textsc{ReadUint32} & Read a 32-bit unsigned little-endian integer from unspecified input source\\
\textsc{ReadUint8}$(src)$ & Read an 8-bit unsigned integer (1 byte) from input $src$\\
\textsc{ReadUint32}$(src)$ & Read a 32-bit unsigned little-endian integer from input $src$\\
\textsc{ReadData}$(len)$ & Read $len$ bytes (8-bit unsigned) from an unspecified input source\\
\textsc{ReadData}$(len, src)$ & Read $len$ bytes (8-bit unsigned) from input $src$\\
\textsc{ReadChar}$(src)$ & Read a single character from input $src$\\
\textsc{ReadString}$(src)$ & Read a nul-terminated string from input $src$\\
\textsc{EOF} & Returns true if the input source is exhausted.\\
\textsc{Char}$(a)$ & Converts integer $a$ to a single character of appropriate ASCII value\\
\textsc{Length}$(a)$ & Returns length of string $a$ excluding any nul-termination bytes\\
\textsc{Swap}$(a,\ b)$ & Swaps the contents of $a$ and $b$ variables\\
\hline
\end{tabular}

Many of the input functions here and below are defined to read either
from an unspecified input source (such as the input file descriptor,
or a global buffer that has not been explicitly stated in the
pseudocode), but also have forms that may decode from specified inputs
/ buffers.  They both consume their input sources in the same manner,
using an implicit offset of how many bytes so far have been read.

\subsection{Other basic functions}

7-bit integer encoding stores values 7-bits at a time with the top bit
set if further bytes are required.

\begin{algorithmic}[1]
\Statex
\Statex (Read a variable sized unsigned integer 7-bits at a time.  Returns the value.)
\Function{ReadUint7}{$source$} \Comment{If $source$ is unspecified then it is the default input stream}
  \State $value \gets 0$
  \State $length \gets 0$
  \Repeat
    \State $c \gets$ \Call{ReadUint8}{}
    \State $value \gets (value \shiftl 7) + (c \bitand 127)$
    \State $length \gets length + 1$
  \Until{$c < 128$}
  \State \Return $value$
  \EndFunction
\end{algorithmic}

ITF8 integer encoding stores the additional number of bytes needed in
the count of the top bits set in the initial byte (ending with a zero
bit), followed by any subsequent whole bytes.  See the main CRAM
specification for more details.

\begin{algorithmic}[1]
\Statex
\Statex (Read a variable sized unsigned integer with ITF8 encoding.  Returns the value.)
\Function{ReadITF8}{$source$} \Comment{If $source$ is unspecified then it is the default input stream}
  \State $v \gets$ \Call{ReadUint8}{}
  \If{$i >= \mathtt{0xf0}$}\Comment{1111xxxx => +4 bytes}
    \State $v \gets (v\ \bitand \mathtt{0x0f}) \shiftl 28$
    \State $v \gets v + ($ \Call{ReadUint8}{} $\shiftl 20)$
    \State $v \gets v + ($ \Call{ReadUint8}{} $\shiftl 12)$
    \State $v \gets v + ($ \Call{ReadUint8}{} $\shiftl  4)$
    \State $v \gets v + ($ \Call{ReadUint8}{} $\shiftr  4)$
  \ElsIf{$i >= \mathtt{0xe0}$}\Comment{1110xxxx => +3 bytes}
    \State $v \gets (v\ \bitand \mathtt{0x0f}) \shiftl 24$
    \State $v \gets v + ($ \Call{ReadUint8}{} $\shiftl 16)$
    \State $v \gets v + ($ \Call{ReadUint8}{} $\shiftl 8)$
    \State $v \gets v + $ \Call{ReadUint8}{}
  \ElsIf{$i >= \mathtt{0xc0}$}\Comment{110xxxxx => +2 bytes}
    \State $v \gets (v\ \bitand \mathtt{0x1f}) \shiftl 16$
    \State $v \gets v + ($ \Call{ReadUint8}{} $\shiftl 8)$
    \State $v \gets v + $ \Call{ReadUint8}{}
  \ElsIf{$i >= \mathtt{0x80}$}\Comment{10xxxxxx => +1 bytes}
    \State $v \gets (v\ \bitand \mathtt{0x3f}) \shiftl 8$
    \State $v \gets v + $ \Call{ReadUint8}{}
  \EndIf
  \State \Return $v$
  \EndFunction
\end{algorithmic}

\section{rANS 4x8 - Asymmetric Numeral Systems}

% Lifted over from CRAMv3.tex

This is the rANS format first defined in CRAM v3.0.

rANS is the range-coder variant of the Asymmetric Numerical
Systems\footnote{J. Duda, \textit{Asymmetric numeral systems: entropy
    coding combining speed of Huffman coding with compression rate of
    arithmetic coding}, \url{http://arxiv.org/abs/1311.2540}}.

The structure of the external rANS codec consists of several
components: meta-data consisting of compression-order, and compressed
and uncompressed sizes; normalised frequencies of the alphabet systems
to be encoded, either in Order-0 or Order-1 context; and the rANS
encoded byte stream itself.

Here "Order" refers to the number of bytes of context used in
computing the frequencies. It will be 0 or 1.  Ignoring punctuation
and space, an Order-0 analysis of English text may observe that `e' is
the most common letter (12-13\%), and that `u' occurs only around 2.5\%
of the time.  If instead we consider the frequency of a letter in the
context of one previous letter (Order-1) then these statistics change
considerably;  we know that if the previous letter was `q' then `e'
becomes a rare letter while `u' is the most likely.

These observed frequencies are inversely related to the amount of
storage required to encode a symbol (e.g. an alphabet
letter)\footnote{ C.E. Shannon, \textit{A Mathematical Theory of
    Communication}, Bell System Technical Journal, vol. 27,
    pp. 379-423, 623-656, July, October, 1948}.


\subsubsection*{\textbf{rANS 4x8 compressed data structure}}
A compressed data block consists of the following logical parts:

\begin{tabular}{|l|l|>{\raggedright}p{100pt}|>{\raggedright}p{220pt}|}
\hline
\textbf{Value data type} & \textbf{Name} & \textbf{Description}\tabularnewline
\hline
byte & order & the order of the codec, either 0 or 1\tabularnewline
\hline
uint32 & compressed size & the size in bytes of frequency table and compressed blob\tabularnewline
\hline
uint32 & data size & raw or uncompressed data size in bytes\tabularnewline
\hline
byte[] & frequency table & byte frequencies of input data written using RLE\tabularnewline
\hline
byte[] & compressed blob & compressed data\tabularnewline
\hline
\end{tabular}

\subsection{\textbf{Frequency table}}

The alphabet used here has a maximum of 256 possible symbols (all byte
values), but alphabets where fewer symbols are permitted too.

The symbol frequency table indicates which symbols are present and
what their relative frequencies are.  The total sum of symbol
frequencies are normalised to add up to 4095\footnote{While the maths
  work fine up to 4096, for historical reasons this has always been
  documented as having a limit of 4095.  Implementations may wish to
  validate decoding on $<= 4096$, but we recommend they use a limit of
  4095 in their encoding output.}.  Given rounding differences when
renormalising to a fixed sum, it is up to the encoder to decide how to
distribute any remainder or remove excess frequencies.  The normalised
frequency tables below are examples and not prescriptive of a specific
normalisation strategy.

Formally, this is an ordered alphabet $\mathbb{A}$ containing symbols $s$ where
$s_{i}$ with the $i$-th symbol in $\mathbb{A}$, occurring with the frequency $freq_{i}$.

\subsubsection*{Order-0 encoding}

The normalised symbol frequencies are then written out as \{symbol,
frequency\} pairs in ascending order of symbol (0 to 255 inclusive).
If a symbol has a frequency of 0 then it is omitted.

To avoid storing long consecutive runs of symbols if all are present
(eg a-z in a long piece of English text) we use run-length-encoding on
the alphabet symbols.  If two consecutive symbols have non-zero
frequencies then a counter of how many other non-zero frequency
consecutive symbols is output directly after the second consecutive
symbol, with that many symbols being subsequently omitted.

For example for non-zero frequency symbols `a', `b', `c', `d' and `e'
we would write out symbol `a', `b' and the value 3 (to indicate `c',
`d' and `e' are also present).

The frequency is output after every symbol (whether explicit or
implicit) using ITF8 encoding. This means that frequencies 0-127 are
encoded in 1 byte while frequencies 128-4095 are encoded in 2 bytes.

Finally the symbol 0 is written out to indicate the end of the
symbol-frequency table.

As an example, take the string \texttt{abracadabra}.

\begin{minipage}[t]{0.5\textwidth}
Symbol frequency:
\\[8pt]
\begin{tabular}{ |r|r| }
\hline
Symbol & Frequency\\
\hline
a & 5 \\
b & 2 \\
c & 1 \\
d & 1 \\
r & 2 \\
\hline
\end{tabular}
\end{minipage}
\begin{minipage}[t]{0.5\textwidth}
Normalised to sum to 4095:
\\[8pt]
\begin{tabular}{ |r|r|}
\hline
Symbol & Frequency\\
\hline
a & 1863 \\
b &  744 \\
c &  372 \\
d &  372 \\
r &  744 \\
\hline
\end{tabular}
\end{minipage}

Encoded as:
\begin{verbatim}
0x61      0x87 0x47      # `a'           <1863>
0x62 0x02 0x82 0xe8      # `b' <+2: c,d>  <744>
          0x81 0x74      # `c' (implicit) <372>
          0x81 0x74      # `d' (implicit) <372>
0x72      0x82 0xe8      # `r'            <744>
0x00                     # <0>
\end{verbatim}


\subsubsection*{Order-1 encoding}

To encode Order-1 statistics typically requires a larger table as for
an $N$ sized alphabet we need to potentially store an $N$x$N$ matrix.
We store these as a series of Order-0 tables.

We start with the outer context byte, emitting the symbol if it is
non-zero frequency.  We perform the same run-length-encoding as we
use for the Order-0 table and end the contexts with a nul byte.  After
each context byte we emit the Order-0 table relating to that context.

One last caveat is that we have no context for the first byte in the
data stream (in fact for 4 equally spaced starting points, see
``interleaving" below).  We use the ASCII value (`\textbackslash0') as
the starting context for each interleaved rANS state and so need to
consider this in our frequency table.

Consider \texttt{abracadabraabracadabraabracadabraabracadabr} as
example input.  Note for the last ``a'' was omitted from this string
in order to demonstrate how the method works when the data is not a
multiple of 4 long.  This can be broken into 4 approximate equal
portions \texttt{abracadabra abracadabra abracadabra abracadabr}.  We
operate one independent rANS stream per portion, providing us the
opportunity to exploit CPU data parallelism.

\begin{minipage}[t]{0.5\textwidth}
Naively observed Order-1 frequencies:
\\[8pt]
\begin{tabular}{ |r|r|r| }
\hline
Context & Symbol & Frequency\\
\hline
\textbackslash0 & a & 4 \\
\hline
a & a & 3 \\
  & b & 8 \\
  & c & 4 \\
  & d & 4 \\
\hline
b & r & 8 \\
\hline
c & a & 4 \\
\hline
d & a & 4 \\
\hline
r & a & 7 \\
\hline
\end{tabular}
\end{minipage}
\begin{minipage}[t]{0.5\textwidth}
Normalised (per Order-0 statistics):
\\[8pt]
\begin{tabular}{ |r|r|r|}
\hline
Context & Symbol & Frequency\\
\hline
\textbackslash0 & a & 4095 \\
\hline
a & a &  646 \\
  & b & 1725 \\
  & c &  862 \\
  & d &  862 \\
\hline
b & r & 4095 \\
\hline
c & a & 4095 \\
\hline
d & a & 4095 \\
\hline
r & a & 4095 \\
\hline
\end{tabular}
\end{minipage}

Note that the above table has redundant entries.  While our complete
string had three cases of two consecutive ``a'' characters
(``...cadabr\textbf{aa}braca...''), these spanned the junction of our
split streams and each rANS state is operating independently, starting
with the same last character of nul (0).  Hence during decode we will
not need to access the table for the frequency of ``a'' in the context
of a previous ``a''.  A similar issue occurs for the very last byte
used for each rANS state, which will not be used as a context.  In
extreme cases this may even be the only time that symbols occurs
anywhere.  While these scenarios represent unnecessary data to store,
and these frequency entries can be safely omitted, their presence does
not invalidate the data format and it may be simpler to use a more
naive algorithm when producing the frequency tables.


The above tables are encoded as:
\begin{verbatim}
0x00                 # `\0' context
0x61      0x8f 0xff  # a  <4095>
0x00                 # end of Order-0 table

0x61                 # `a' context
0x61      0x82 0x86  # a            <646>
0x62 0x02 0x86 0xbd  # b <+2: c,d> <1725>
          0x83 0x5e  # c (implicit) <862>
          0x83 0x5e  # d (implicit) <862>
0x00                 # end of Order-0 table

0x62 0x02            # `b' context, <+2: c, d>
0x72      0x8f 0xff  # r <4095>
0x00                 # end of Order-0 table

                     # `c' context (implicit)
0x61      0x8f 0xff  # a <4095>
0x00                 # end of Order-0 table

                     # `d' context (implicit)
0x61      0x8f 0xff  # a <4095>
0x00                 # end of Order-0 table

0x72                 # `r' context
0x61      0x8f 0xff  # a <4095>
0x00                 # end of Order-0 table

0x00                 # end of contexts
\end{verbatim}

\subsection{rANS entropy encoding}

The encoder takes a symbol $s$ and a current state $x$ (initially $L$ below) to
produce a new state $x'$ with function $C$.

{
\setlength{\parindent}{1cm}
\indent $x' = C(s,x)$
}

The decoding function $D$ is the inverse of $C$ such that $C(D(x)) = x$.

{
\setlength{\parindent}{1cm}
\indent $D(x') = (s,x)$
}

The entire encoded message can be viewed as a series of nested $C$
operations, with decoding yielding the symbols in reverse order, much
like popping items off a stack.  This is where the asymmetric part of
ANS comes from.

As we encode into $x$ the value will grow, so for efficiency we ensure
that it always fits within known bounds. This is governed by

{
\setlength{\parindent}{1cm}
\indent $L \leq x < bL-1$
}


where $b$ is the base and $L$ is the lower-bound.

We ensure this property is true before every use of $C$ and after every
use of $D$.  Finally to end the stream we flush any remaining data out
by storing the end state of $x$.


\textbf{Implementation specifics}

We use an unsigned 32-bit integer to hold $x$. In encoding it is
initialised to $L$. For decoding it is read little-endian from the
input stream.

Recall $freq_{i}$ is the frequency of the $i$-th symbol $s_{i}$ in alphabet
$\mathbb{A}$.  We define $cfreq_i$ to be cumulative frequency of all symbols
up to but not including $s_{i}$:

{
\setlength{\parindent}{1cm}
$ cfreq_{i} = \left\{
\begin{array}{l l}
0 & \quad \textrm{if $i < 1$} \\
cfreq_{i-1} + freq_{i-1} & \quad \textrm{if $i \geq 1$}
\end{array}
\right. $
% \\*[8pt]
% \indent $cfreq_{i} = \displaystyle \sum_{j=0}^{i-1} freq_j$
}


We have a reverse lookup table $cfreq\_to\_sym_c$ from 0 to 4095
(0xfff) that maps a cumulative frequency $c$ to a symbol $s$.

{
\setlength{\parindent}{1cm}
\indent   $cfreq\_to\_sym_c = s_{i} \quad where \quad c: \enskip cfreq_i \leq c <
cfreq_i + freq_i$
% \\*[8pt]
% \indent   $cfreq\_to\_sym_c = s_{i} ,
% \quad \forall c \in [cfreq_i, cfreq_i + freq_i)$
}


The $x' = C(s,x)$ function used for the $i$-th symbol $s$ is:

{
\setlength{\parindent}{1cm}
\indent    $x' = (x/freq_i) \times \mathtt{0x1000} + cfreq_i + (x \bmod freq_i)$
}

The $D(x') = (s,x)$ function used to produce the $i$-th symbol $s$ and
a new state $x$ is:

{
\setlength{\parindent}{1cm}
\indent    $c = x' \bitand \mathtt{0xfff}$\\*
\indent    $s_{i} = cfreq\_to\_sym_{c}$\\*
\indent    $x = freq_{i} (x' / \mathtt{0x1000}) + c - cfreq_{i}$
}

Most of these operations can be implemented as bit-shifts and bit-AND,
with the encoder modulus being implemented as a multiplication by the
reciprocal, computed once only per alphabet symbol.

We use $L = \mathtt{0x800000}$ and $b = 256$, permitting us to flush out one byte
at a time (encoded and decoded in reverse order).

Before every encode $C(s,x)$ we renormalise $x$, shifting out the bottom 8
bits of $x$ until $x < \mathtt{0x80000} \times freq_i$.  After
finishing all encoding we flush 4 more bytes (lowest 8-bits first) from $x$.

After every decoded $D(x')$ we renormalise $x'$, shifting in the bottom 8
bits until $x \geq \mathtt{0x800000}$.


\subsubsection*{Interleaving}

For efficiency, we interleave 4 separate rANS codecs at the same
time\footnote{F. Giesen, \textit{Interleaved entropy coders},
  \url{http://arxiv.org/abs/1402.3392}}.  For the Order-0 codecs these
simply encode or decode the 4 neighbouring bytes in cyclic fashion
using interleaved codec 0, 1, 2, and 3, sharing the same output buffer
(so the output bytes get interleaved).

For the Order-1 codec we cannot do this as we need to know the
previous byte value as the context for the next byte.  We therefore split
the input data into 4 approximately equal sized
fragments\footnote{This was why the `\textbackslash0' $\to$ `a'
  context in the example above had a frequency of 4 instead of 1.}
starting at $0$, $\lfloor{}len/4\rfloor{}$,
$\lfloor{}len/4\rfloor{}\times2$ and $\lfloor{}len/4\rfloor{}\times 3$.  Each
Order-1 codec operates in a cyclic fashion as with Order-0, all
starting with 0 as their state and sharing the same compressed output
buffer. Any remainder, when the input buffer is not divisible by 4, is
processed at the end by the 4th rANS state.

We do not permit Order-1 encoding of data streams smaller than 4
bytes.

\subsection{rANS decode pseudocode}

A na\"ive implementation of a rANS decoder follows.
This pseudocode is for clarity only and is not expected to be performant and we would normally rewrite this to use lookup tables for maximum efficiency.
The function \textsc{ReadUint8} fetches the next single unsigned byte from an unspecified input source.  Similarly for \textsc{ReadITF8} (variable size integer) and \textsc{ReadUint32} (32-bit unsigned integer in little endian format).

\vskip 0.5cm

\begin{algorithmic}[1]
\Procedure{RansDecode}{$input,\ output$}
\settowidth{\maxwidth}{$n\_out$}
\State \algalign{order}{\gets} \Call{ReadUint8}{}\Comment{Implicit read from $input$}
\State \algalign{n\_in}{\gets} \Call{ReadUint32}{}
\State \algalign{n\_out}{\gets} \Call{ReadUint32}{}
\State \If{$order = 0$}
  \State \Call{RansDecode0}{$output,\ n\_out$}
\Else
  \State \Call{RansDecode1}{$output,\ n\_out$}
\EndIf
\EndProcedure
\end{algorithmic}


\subsubsection*{rANS order-0}

The Order-0 code is the simplest variant.
Here we also define some of the functions for manipulating the rANS state, which are shared between Order-0 and Order-1 decoders.

\vskip 0.5cm

\begin{algorithmic}[1]
\Statex (Reads a table of Order-0 symbol frequencies $F_i$)
\Statex (and sets the cumulative frequency table $C_{i+1} = C_i+F_i$)
\Procedure{ReadFrequencies0}{$F,\ C$}
\State $s \gets$ \Call{ReadUint8}{}\Comment{Next alphabet symbol}
\State $last\_sym \gets s$
\State $rle \gets 0$
\Repeat
  \settowidth{\maxwidth}{$C_s$}
  \State \algalign{F_s}{\gets} \Call{ReadITF8}{}
  \If{$rle > 0$}
    \settowidth{\maxwidth}{rle\ }
    \State \algalign{rle}{\gets} $rle-1$
    \State \algalign{s}{\gets} $s+1$
  \Else
    \State $s \gets$ \Call{ReadUint8}{}
    \If{$s = last\_sym+1$}
      \State $rle \gets$ \Call{ReadUint8}{}
    \EndIf
  \EndIf
  \State $last\_sym \gets s$
\Until {$s = 0$}
\Statex \quad\ (Compute cumulative frequencies $C_i$ from $F_i$)
\State $C_0 \gets 0$
\For{$s\gets 0 \algorithmicto 255$}
  \State $C_{s+1} \gets C_s + F_s$
\EndFor
\EndProcedure

\Statex
\Statex (Bottom 12 bits of our rANS state $R$ are our frequency)
\Function{RansGetCumulativeFreq}{$R$}
  \State \Return $R\ \bitand$ 0xfff
\EndFunction
\Statex
\Statex (Convert frequency to a symbol. Find $s$ such that $C_s \le f < C_{s+1}$)
\Statex (We would normally implement this via a lookup table)
\Function{RansGetSymbolFromFreq}{$C, f$}
  \State $s \gets 0$
  \While{$f >= C_{s+1}$}
    \State $s \gets s+1$
  \EndWhile
  \State \Return $s$
\EndFunction
\Statex
\Statex (Compute the next rANS state $R$ given frequency $f$ and cumulative freq $c$)
\Function{RansAdvanceStep}{$R, c, f$}
  \State \Return $f \times (R \shiftr 12) + (R\ \bitand$ 0xfff$) - c$
\EndFunction
\Statex
\Statex (If too small, feed in more bytes to the rANS state $R$)
\Function{RansRenorm}{$R$}
  \While{$R < (1 \shiftl 23)$}
    \State $R \gets (R \shiftl 8) +$\ \Call{ReadUint8}{}
  \EndWhile
  \State \Return $R$
\EndFunction
\Statex
\Procedure{RansDecode0}{$output$, $nbytes$}
  \State \Call{ReadFrequencies0}{$F, C$}
  \For{$j\gets 0 \algorithmicto 3$}\Comment{Initialise the 4 interleaved streams}
    \State $R_j \gets$ \Call{ReadUint32}{}\Comment{Unsigned 32-bit little endian}
  \EndFor
  \For{$i\gets 0 \algorithmicto nbytes-1$}
    \State $j \gets i \bmod 4$
    \State $f \gets$ \Call{RansGetCumulativeFreq}{$R_j$}
    \State $s \gets$ \Call{RansGetSymbolFromFreq}{$C,\ f$}
    \State $output_i \gets s$
    \State $R_j \gets$\ \Call{RansAdvanceStep}{$R_j,\ C_s,\ F_s$}
    \State $R_j \gets$\ \Call{RansRenorm}{$R_j$}
  \EndFor
\EndProcedure
\end{algorithmic}

\subsubsection*{rANS order-1}

As described above, the decode logic is very similar to rANS Order-0 except we have a two dimensional array of frequencies to read and the decode uses the last character as the context for decoding the next one.
In the pseudocode we illustrate this by using two dimensional vectors $C_{i,j}$ and $F_{i,j}$.
For simplicity, we reuse the Order-0 code by referring to $C_i$ and $F_i$ of the 2D vectors to get a single dimensional vector that operates in the same manner as the Order-0 code.
This is not necessarily the most efficient implementation.

Note the code for dealing with the remaining bytes when an output buffer is not an exact multiple of 4 is less elegant in the Order-1 code.
This is correct, but it is unfortunately a design oversight.

\vskip 0.5cm

\begin{algorithmic}[1]
\Statex (Reads a table of Order-1 symbol frequencies $F_{i,j}$)
\Statex (and sets the cumulative frequency table $C_{i,j+1} = C_{i,j}+F_{i,j}$)
\Procedure{ReadFrequencies1}{$F,\ C$}
\State $sym \gets$ \Call{ReadUint8}{}\Comment{Next alphabet symbol}
\State $last\_sym \gets sym$
\State $rle \gets 0$
\Repeat
  \State \Call{ReadFrequencies0}{$F_i, C_i$}
  \If{$rle > 0$}
    \settowidth{\maxwidth}{sym}
    \State \algalign{rle}{\gets} $rle-1$
    \State \algalign{sym}{\gets} $sym+1$
  \Else
    \State $sym \gets$ \Call{ReadUint8}{}
    \If{$sym = last\_sym+1$}
      \State $rle \gets$ \Call{ReadUint8}{}
    \EndIf
  \EndIf
  \State $last\_sym \gets sym$
\Until {$sym = 0$}
\EndProcedure

\Statex
\Procedure{RansDecode1}{$output$, $nbytes$}
  \State \Call{ReadFrequencies1}{$F, C$}
  \For{$j\gets 0 \algorithmicto 3$}\Comment{Initialise 4 interleaved streams}
    \State $R_j \gets$ \Call{ReadUint32}{}\Comment{Unsigned 32-bit little endian}
    \State $L_j \gets 0$\Comment{Last symbol}
  \EndFor
  \State $i \gets 0$
  \While{$i < \lfloor nbytes/4 \rfloor$}
    \For{$j\gets 0 \algorithmicto 3$}
      \State $f \gets$ \Call{RansGetCumulativeFreq}{$R_j$}
      \State $s \gets$ \Call{RansGetSymbolFromFreq}{$C_{L_j},\ f$}
      \State $output_{i + j \times \lfloor nbytes/4 \rfloor} \gets s$
      \State $R_j \gets$\ \Call{RansAdvanceStep}{$R_j,\ C_{L_j,s},\ F_{L_j,s}$}
      \State $R_j \gets$\ \Call{RansRenorm}{$R_j$}
      \State $L_j \gets s$
    \EndFor
    \State $i \gets i+1$
  \EndWhile
  \Statex \ \ \ \ (Now deal with the remainder if buffer size is not a multiple of 4,)
  \Statex \ \ \ \ (using rANS state 3 exclusively.)
  \For{$i \gets i \times 4 \algorithmicto len-1$}
    \State $f \gets$ \Call{RansGetCumulativeFreq}{$R_3$}
    \State $s \gets$ \Call{RansGetSymbolFromFreq}{$C_{L_3},\ f$}
    \State $output_i \gets s$
    \State $R_3 \gets$\ \Call{RansAdvanceStep}{$R_3,\ C_{L_3,s},\ F_{L_3,s}$}
    \State $R_3 \gets$\ \Call{RansRenorm}{$R_3$}
    \State $L_3 \gets s$
  \EndFor
\EndProcedure
\end{algorithmic}

\section{rANS Nx16}

CRAM version 3.1 defines an additional rANS entropy encoder, using
16-bit renormalisation instead of the 8-bit used in CRAM 3.0 and with
inbuilt bit-packing and run-length encoding.  The lower-bound and
initial encoder state $L$ is also changed to 0x8000. The Order-1 rANS
Nx16 encoder has also been modified to permit a maximum frequency of 1024
instead of 4096.  This offers better cache performance for the large
Order-1 tables and usually has minimal impact on compression ratio.

Additionally it permits adjustment of the number of interleaved rANS
states from the fixed 4 used in rANS 4x8 to either 4 or 32 states.
The benefit of the 32-way interleaving is in enabling efficient use of
SIMD instructions for faster encoding and decoding speeds.  However it
has a small cost in size and initialisation times so it is not
recommended on smaller blocks of data.

Frequencies are now stored using uint7 format instead of ITF8.  The
tables are also stored differently, separating the list of symbols
present in the alphabet (those with frequency greater than zero) from
the frequencies themselves.

Finally transformations may be applied to the data prior to
compression (or after decompression).  These consist of stripe, for
structured data where every Nth byte is sent to one of N separate
compression streams, Run Length Encoding replacing repeated strings of
symbols with a symbol and count, and bit-packing where reduced
alphabets can combine multiple symbols into a byte prior to entropy
encoding.

The initial ``Order'' byte is expanded with additional bits to list
the transformations to be applied.  The specifics of each sub-format
are listed below, in the order they are applied.

\begin{itemize}
\item{\textbf{\textsc{Stripe}}:}
rANS Nx16 with multi-way interleaving (see Section~\ref{sec:ransstripe}).

\item{\textbf{\textsc{NoSize}}:}
Do not store the size of the uncompressed data stream.
This information is not required when the data stream is one of the four sub-streams in the \textsc{Stripe} format.

\item{\textbf{\textsc{Cat}}:}
If present, the order bit flag is ignored.

The uncompressed data stream is the same as the compressed stream.
This is useful for very short data where the overheads of compressing are too high.

\item{\textbf{\textsc{N32}}:}
Flag indicating whether to interleave 4 or 32 rANS states.

\item{\textbf{\textsc{Order}}:}
Bit field defining order-0 (unset) or order-1 (set) entropy encoding, as described above by the \textsc{RansDecodeNx16\_0} and \textsc{RansDecodeNx16\_1} functions.

\item{\textbf{\textsc{RLE}}:}
Bit field defining whether Run Length Encoding has been applied to the data.  If set, the reverse transorm will be applied using \textsc{DecodeRLE} after Order-0 or Order-1 uncompression (see Section~\ref{sec:ransRLE}).

\item{\textbf{\textsc{Pack}}:}
Bit field indicating the data was packed prior to compression (see Section~\ref{sec:ranspack}).  If set, unpack the bits after any RLE decoding has been applied (if required) using the \textsc{DecodePack} function.
\end{itemize}

\subsection{Frequency tables}

Frequency tables in rANS Nx16 separate the list of symbols from their
frequencies.  The symbol list must be stored in ascending ASCII order,
with their frequency values in the same ordering as their
corresponding symbols.  For the Order-1 frequency table this list of
symbols is those used in any context, thus we only have one alphabet
recorded for all contexts.  This means in some contexts some
(potentially many) symbols will have zero frequency.  To reduce the
Order-1 table size an additional zero run-length encoding step is
used.  Finally the Order-1 frequencies may optionally be compressed
using the Order-0 rANS Nx16 codec.

Frequencies must always add up to a power of 2, but do not necessarily
have to match the final power of two used in the Order-0 (4096) and
Order-1 (1024, 4096) entropy decoder algorithm.  A normalisation step is
applied after reading the frequencies to scale them appropriately.
This is required as the Order-1 frequencies may be scaled differently
for each context.

\begin{algorithmic}[1]
\Statex (Reads a set of symbols $A$ used in our alphabet)
\Function{ReadAlphabet}{}
\State $s \gets$ \Call{ReadUint8}{}
\State $last\_sym \gets s$
\State $rle \gets 0$
\Repeat
  \State $A \gets (A,s)$ \Comment{Append $s$ to the symbol set $A$}
  \If{$rle > 0$}
    \settowidth{\maxwidth}{rle\ }
    \State \algalign{rle}{\gets} $rle-1$
    \State \algalign{s}{\gets} $s+1$
  \Else
    \State $s \gets$ \Call{ReadUint8}{}
    \If{$s = last\_sym+1$}
      \State $rle \gets$ \Call{ReadUint8}{}
    \EndIf
  \EndIf
  \State $last\_sym \gets s$
\Until {$s = 0$}
\State \Return $A$
\EndFunction
\end{algorithmic}

\vskip 0.5cm

\begin{algorithmic}[1]
\Statex (Reads a table of Order-0 symbol frequencies $F_i$)
\Statex (and sets the cumulative frequency table $C_{i+1} = C_i+F_i$)
\Procedure{ReadFrequenciesNx16\_0}{$F,\ C$}
\State $F \gets (0,\ ...)$ \Comment(Set to zero for all $i \in \{0, 1,
  ..., 255\}$)
\State $A \gets$ \Call{ReadAlphabet}{}
\Foreach{$i \algorithmicin A$}
  \State $F_i \gets$ \Call{ReadUint7}{}
\EndForeach
\State
\State \Call{NormaliseFrequenciesNx16\_0}{$F,\ 12$}
\State
\State $C_0 \gets 0$
\For{$s\gets 0 \algorithmicto 255$}\Comment{All 256 possible byte values}
  \State $C_{s+1} \gets C_s + F_s$
\EndFor
\EndProcedure
\end{algorithmic}

\begin{algorithmic}[1]
\Statex (Normalises a table of frequencies $F_i$ to sum to a specified power of 2.)
\Procedure{NormaliseFrequenciesNx16\_0}{$F,\ bits$}
\State $tot \gets 0$
\For{$i\gets 0 \algorithmicto 255$}
  \State $tot \gets tot + F_i$
\EndFor
\If{$tot = 0 \logor tot = (1 \shiftl bits)$}
  \State \Return
\EndIf
\State
\State $shift \gets 0$
\While{$tot < (1 \shiftl bits)$}
  \State $tot \gets tot*2$
  \State $shift \gets shift+1$
\EndWhile
\State
\For{$i\gets 0 \algorithmicto 255$}
  \State $F_i \gets F_i \shiftl shift$
\EndFor
\EndProcedure
\end{algorithmic}

The Order-1 frequencies also store the complete alphabet of
observed symbols (ignoring context) followed by a table of frequencies for
each symbol in the alphabet.  Given many frequencies will be zero
where a symbol is present in one context but not in others, all zero
frequencies are followed by a run length to omit adjacent zeros.

The order-1 frequency table itself may still be quite large, so is
optionally compressed using the order-0 rANS Nx16 codec with a fixed
4-way interleaving.
This is specified in the bottom bit of the first byte.  If this is 1, it is
followed by 7-bit encoded uncompressed and compressed lengths and then
the compressed frequency data.  The pseudocode here differs slightly
to elsewhere as it indicates the input sources, which are either the
uncompressed frequency buffer or the default (unspecified) source.
The top 4 bits of the first byte indicate the number of bits used for
the frequency tables.  Permitted values are 10 and 12.

% Should we just have a generic INPUT buffer so we can use either
% unspecified inputs or specified ones?

\begin{algorithmic}[1]
\Statex (Reads a table of Order-1 symbol frequencies $F_{i,j}$)
\Statex (and sets the cumulative frequency table $C_{i,j+1} = C_{i,j}+F_{i,j}$)
\Procedure{ReadFrequenciesNx16\_1}{$F,\ C,\ bits$}
\State $comp \gets$ \Call{ReadUint8}{}
\State $bits \gets comp \shiftr 4$
\If{$(comp \logand 1) \ne 0$}
  \State $u\_size \gets$ \Call{ReadUint7}{} \Comment{Uncompressed size}
  \State $c\_size \gets$ \Call{ReadUint7}{} \Comment{Compressed size}
  \State $c\_data \gets$ \Call{ReadData}{$c\_size$}
  \State $source \gets$ \Call{RansDecodeNx16\_0}{$c\_data, 4$} \Comment{Create $u\_size$ bytes of $source$ from $c\_data$}