minesweeper

A minewseeper implementation to play around with Hare and Raylib
git clone https://git.tronto.net/minesweeper
Download | Log | Files | Refs | README | LICENSE

qoa.h (23885B)


      1 /*
      2 
      3 Copyright (c) 2023, Dominic Szablewski - https://phoboslab.org
      4 SPDX-License-Identifier: MIT
      5 
      6 QOA - The "Quite OK Audio" format for fast, lossy audio compression
      7 
      8 
      9 -- Data Format
     10 
     11 QOA encodes pulse-code modulated (PCM) audio data with up to 255 channels, 
     12 sample rates from 1 up to 16777215 hertz and a bit depth of 16 bits.
     13 
     14 The compression method employed in QOA is lossy; it discards some information
     15 from the uncompressed PCM data. For many types of audio signals this compression
     16 is "transparent", i.e. the difference from the original file is often not
     17 audible.
     18 
     19 QOA encodes 20 samples of 16 bit PCM data into slices of 64 bits. A single
     20 sample therefore requires 3.2 bits of storage space, resulting in a 5x
     21 compression (16 / 3.2).
     22 
     23 A QOA file consists of an 8 byte file header, followed by a number of frames.
     24 Each frame contains an 8 byte frame header, the current 16 byte en-/decoder
     25 state per channel and 256 slices per channel. Each slice is 8 bytes wide and
     26 encodes 20 samples of audio data.
     27 
     28 All values, including the slices, are big endian. The file layout is as follows:
     29 
     30 struct {
     31 	struct {
     32 		char     magic[4];         // magic bytes "qoaf"
     33 		uint32_t samples;          // samples per channel in this file
     34 	} file_header;             
     35 
     36 	struct {
     37 		struct {
     38 			uint8_t  num_channels; // no. of channels
     39 			uint24_t samplerate;   // samplerate in hz
     40 			uint16_t fsamples;     // samples per channel in this frame
     41 			uint16_t fsize;        // frame size (includes this header)
     42 		} frame_header;          
     43 
     44 		struct {
     45 			int16_t history[4];    // most recent last
     46 			int16_t weights[4];    // most recent last
     47 		} lms_state[num_channels]; 
     48 
     49 		qoa_slice_t slices[256][num_channels];
     50 
     51 	} frames[ceil(samples / (256 * 20))];
     52 } qoa_file_t;
     53 
     54 Each `qoa_slice_t` contains a quantized scalefactor `sf_quant` and 20 quantized
     55 residuals `qrNN`:
     56 
     57 .- QOA_SLICE -- 64 bits, 20 samples --------------------------/  /------------.
     58 |        Byte[0]         |        Byte[1]         |  Byte[2]  \  \  Byte[7]   |
     59 | 7  6  5  4  3  2  1  0 | 7  6  5  4  3  2  1  0 | 7  6  5   /  /    2  1  0 |
     60 |------------+--------+--------+--------+---------+---------+-\  \--+---------|
     61 |  sf_quant  |  qr00  |  qr01  |  qr02  |  qr03   |  qr04   | /  /  |  qr19   |
     62 `-------------------------------------------------------------\  \------------`
     63 
     64 Each frame except the last must contain exactly 256 slices per channel. The last
     65 frame may contain between 1 .. 256 (inclusive) slices per channel. The last
     66 slice (for each channel) in the last frame may contain less than 20 samples; the
     67 slice still must be 8 bytes wide, with the unused samples zeroed out.
     68 
     69 Channels are interleaved per slice. E.g. for 2 channel stereo: 
     70 slice[0] = L, slice[1] = R, slice[2] = L, slice[3] = R ...
     71 
     72 A valid QOA file or stream must have at least one frame. Each frame must contain
     73 at least one channel and one sample with a samplerate between 1 .. 16777215
     74 (inclusive).
     75 
     76 If the total number of samples is not known by the encoder, the samples in the
     77 file header may be set to 0x00000000 to indicate that the encoder is 
     78 "streaming". In a streaming context, the samplerate and number of channels may
     79 differ from frame to frame. For static files (those with samples set to a
     80 non-zero value), each frame must have the same number of channels and same
     81 samplerate.
     82 
     83 Note that this implementation of QOA only handles files with a known total
     84 number of samples.
     85 
     86 A decoder should support at least 8 channels. The channel layout for channel
     87 counts 1 .. 8 is:
     88 
     89 	1. Mono
     90 	2. L, R
     91 	3. L, R, C 
     92 	4. FL, FR, B/SL, B/SR 
     93 	5. FL, FR, C, B/SL, B/SR 
     94 	6. FL, FR, C, LFE, B/SL, B/SR
     95 	7. FL, FR, C, LFE, B, SL, SR 
     96 	8. FL, FR, C, LFE, BL, BR, SL, SR
     97 
     98 QOA predicts each audio sample based on the previously decoded ones using a
     99 "Sign-Sign Least Mean Squares Filter" (LMS). This prediction plus the 
    100 dequantized residual forms the final output sample.
    101 
    102 */
    103 
    104 
    105 
    106 /* -----------------------------------------------------------------------------
    107 	Header - Public functions */
    108 
    109 #ifndef QOA_H
    110 #define QOA_H
    111 
    112 #ifdef __cplusplus
    113 extern "C" {
    114 #endif
    115 
    116 #define QOA_MIN_FILESIZE 16
    117 #define QOA_MAX_CHANNELS 8
    118 
    119 #define QOA_SLICE_LEN 20
    120 #define QOA_SLICES_PER_FRAME 256
    121 #define QOA_FRAME_LEN (QOA_SLICES_PER_FRAME * QOA_SLICE_LEN)
    122 #define QOA_LMS_LEN 4
    123 #define QOA_MAGIC 0x716f6166 /* 'qoaf' */
    124 
    125 #define QOA_FRAME_SIZE(channels, slices) \
    126 	(8 + QOA_LMS_LEN * 4 * channels + 8 * slices * channels)
    127 
    128 typedef struct {
    129 	int history[QOA_LMS_LEN];
    130 	int weights[QOA_LMS_LEN];
    131 } qoa_lms_t;
    132 
    133 typedef struct {
    134 	unsigned int channels;
    135 	unsigned int samplerate;
    136 	unsigned int samples;
    137 	qoa_lms_t lms[QOA_MAX_CHANNELS];
    138 	#ifdef QOA_RECORD_TOTAL_ERROR
    139 		double error;
    140 	#endif
    141 } qoa_desc;
    142 
    143 unsigned int qoa_encode_header(qoa_desc *qoa, unsigned char *bytes);
    144 unsigned int qoa_encode_frame(const short *sample_data, qoa_desc *qoa, unsigned int frame_len, unsigned char *bytes);
    145 void *qoa_encode(const short *sample_data, qoa_desc *qoa, unsigned int *out_len);
    146 
    147 unsigned int qoa_max_frame_size(qoa_desc *qoa);
    148 unsigned int qoa_decode_header(const unsigned char *bytes, int size, qoa_desc *qoa);
    149 unsigned int qoa_decode_frame(const unsigned char *bytes, unsigned int size, qoa_desc *qoa, short *sample_data, unsigned int *frame_len);
    150 short *qoa_decode(const unsigned char *bytes, int size, qoa_desc *file);
    151 
    152 #ifndef QOA_NO_STDIO
    153 
    154 int qoa_write(const char *filename, const short *sample_data, qoa_desc *qoa);
    155 void *qoa_read(const char *filename, qoa_desc *qoa);
    156 
    157 #endif /* QOA_NO_STDIO */
    158 
    159 
    160 #ifdef __cplusplus
    161 }
    162 #endif
    163 #endif /* QOA_H */
    164 
    165 
    166 /* -----------------------------------------------------------------------------
    167 	Implementation */
    168 
    169 #ifdef QOA_IMPLEMENTATION
    170 #include <stdlib.h>
    171 
    172 #ifndef QOA_MALLOC
    173 	#define QOA_MALLOC(sz) malloc(sz)
    174 	#define QOA_FREE(p) free(p)
    175 #endif
    176 
    177 typedef unsigned long long qoa_uint64_t;
    178 
    179 
    180 /* The quant_tab provides an index into the dequant_tab for residuals in the
    181 range of -8 .. 8. It maps this range to just 3bits and becomes less accurate at 
    182 the higher end. Note that the residual zero is identical to the lowest positive 
    183 value. This is mostly fine, since the qoa_div() function always rounds away 
    184 from zero. */
    185 
    186 static const int qoa_quant_tab[17] = {
    187 	7, 7, 7, 5, 5, 3, 3, 1, /* -8..-1 */
    188 	0,                      /*  0     */
    189 	0, 2, 2, 4, 4, 6, 6, 6  /*  1.. 8 */
    190 };
    191 
    192 
    193 /* We have 16 different scalefactors. Like the quantized residuals these become
    194 less accurate at the higher end. In theory, the highest scalefactor that we
    195 would need to encode the highest 16bit residual is (2**16)/8 = 8192. However we
    196 rely on the LMS filter to predict samples accurately enough that a maximum 
    197 residual of one quarter of the 16 bit range is sufficient. I.e. with the 
    198 scalefactor 2048 times the quant range of 8 we can encode residuals up to 2**14.
    199 
    200 The scalefactor values are computed as:
    201 scalefactor_tab[s] <- round(pow(s + 1, 2.75)) */
    202 
    203 static const int qoa_scalefactor_tab[16] = {
    204 	1, 7, 21, 45, 84, 138, 211, 304, 421, 562, 731, 928, 1157, 1419, 1715, 2048
    205 };
    206 
    207 
    208 /* The reciprocal_tab maps each of the 16 scalefactors to their rounded 
    209 reciprocals 1/scalefactor. This allows us to calculate the scaled residuals in 
    210 the encoder with just one multiplication instead of an expensive division. We 
    211 do this in .16 fixed point with integers, instead of floats.
    212 
    213 The reciprocal_tab is computed as:
    214 reciprocal_tab[s] <- ((1<<16) + scalefactor_tab[s] - 1) / scalefactor_tab[s] */
    215 
    216 static const int qoa_reciprocal_tab[16] = {
    217 	65536, 9363, 3121, 1457, 781, 475, 311, 216, 156, 117, 90, 71, 57, 47, 39, 32
    218 };
    219 
    220 
    221 /* The dequant_tab maps each of the scalefactors and quantized residuals to 
    222 their unscaled & dequantized version.
    223 
    224 Since qoa_div rounds away from the zero, the smallest entries are mapped to 3/4
    225 instead of 1. The dequant_tab assumes the following dequantized values for each 
    226 of the quant_tab indices and is computed as:
    227 float dqt[8] = {0.75, -0.75, 2.5, -2.5, 4.5, -4.5, 7, -7};
    228 dequant_tab[s][q] <- round_ties_away_from_zero(scalefactor_tab[s] * dqt[q])
    229 
    230 The rounding employed here is "to nearest, ties away from zero",  i.e. positive
    231 and negative values are treated symmetrically.
    232 */
    233 
    234 static const int qoa_dequant_tab[16][8] = {
    235 	{   1,    -1,    3,    -3,    5,    -5,     7,     -7},
    236 	{   5,    -5,   18,   -18,   32,   -32,    49,    -49},
    237 	{  16,   -16,   53,   -53,   95,   -95,   147,   -147},
    238 	{  34,   -34,  113,  -113,  203,  -203,   315,   -315},
    239 	{  63,   -63,  210,  -210,  378,  -378,   588,   -588},
    240 	{ 104,  -104,  345,  -345,  621,  -621,   966,   -966},
    241 	{ 158,  -158,  528,  -528,  950,  -950,  1477,  -1477},
    242 	{ 228,  -228,  760,  -760, 1368, -1368,  2128,  -2128},
    243 	{ 316,  -316, 1053, -1053, 1895, -1895,  2947,  -2947},
    244 	{ 422,  -422, 1405, -1405, 2529, -2529,  3934,  -3934},
    245 	{ 548,  -548, 1828, -1828, 3290, -3290,  5117,  -5117},
    246 	{ 696,  -696, 2320, -2320, 4176, -4176,  6496,  -6496},
    247 	{ 868,  -868, 2893, -2893, 5207, -5207,  8099,  -8099},
    248 	{1064, -1064, 3548, -3548, 6386, -6386,  9933,  -9933},
    249 	{1286, -1286, 4288, -4288, 7718, -7718, 12005, -12005},
    250 	{1536, -1536, 5120, -5120, 9216, -9216, 14336, -14336},
    251 };
    252 
    253 
    254 /* The Least Mean Squares Filter is the heart of QOA. It predicts the next
    255 sample based on the previous 4 reconstructed samples. It does so by continuously
    256 adjusting 4 weights based on the residual of the previous prediction.
    257 
    258 The next sample is predicted as the sum of (weight[i] * history[i]).
    259 
    260 The adjustment of the weights is done with a "Sign-Sign-LMS" that adds or
    261 subtracts the residual to each weight, based on the corresponding sample from 
    262 the history. This, surprisingly, is sufficient to get worthwhile predictions.
    263 
    264 This is all done with fixed point integers. Hence the right-shifts when updating
    265 the weights and calculating the prediction. */
    266 
    267 static int qoa_lms_predict(qoa_lms_t *lms) {
    268 	int prediction = 0;
    269 	for (int i = 0; i < QOA_LMS_LEN; i++) {
    270 		prediction += lms->weights[i] * lms->history[i];
    271 	}
    272 	return prediction >> 13;
    273 }
    274 
    275 static void qoa_lms_update(qoa_lms_t *lms, int sample, int residual) {
    276 	int delta = residual >> 4;
    277 	for (int i = 0; i < QOA_LMS_LEN; i++) {
    278 		lms->weights[i] += lms->history[i] < 0 ? -delta : delta;
    279 	}
    280 
    281 	for (int i = 0; i < QOA_LMS_LEN-1; i++) {
    282 		lms->history[i] = lms->history[i+1];
    283 	}
    284 	lms->history[QOA_LMS_LEN-1] = sample;
    285 }
    286 
    287 
    288 /* qoa_div() implements a rounding division, but avoids rounding to zero for 
    289 small numbers. E.g. 0.1 will be rounded to 1. Note that 0 itself still 
    290 returns as 0, which is handled in the qoa_quant_tab[].
    291 qoa_div() takes an index into the .16 fixed point qoa_reciprocal_tab as an
    292 argument, so it can do the division with a cheaper integer multiplication. */
    293 
    294 static inline int qoa_div(int v, int scalefactor) {
    295 	int reciprocal = qoa_reciprocal_tab[scalefactor];
    296 	int n = (v * reciprocal + (1 << 15)) >> 16;
    297 	n = n + ((v > 0) - (v < 0)) - ((n > 0) - (n < 0)); /* round away from 0 */
    298 	return n;
    299 }
    300 
    301 static inline int qoa_clamp(int v, int min, int max) {
    302 	if (v < min) { return min; }
    303 	if (v > max) { return max; }
    304 	return v;
    305 }
    306 
    307 /* This specialized clamp function for the signed 16 bit range improves decode
    308 performance quite a bit. The extra if() statement works nicely with the CPUs
    309 branch prediction as this branch is rarely taken. */
    310 
    311 static inline int qoa_clamp_s16(int v) {
    312 	if ((unsigned int)(v + 32768) > 65535) {
    313 		if (v < -32768) { return -32768; }
    314 		if (v >  32767) { return  32767; }
    315 	}
    316 	return v;
    317 }
    318 
    319 static inline qoa_uint64_t qoa_read_u64(const unsigned char *bytes, unsigned int *p) {
    320 	bytes += *p;
    321 	*p += 8;
    322 	return 
    323 		((qoa_uint64_t)(bytes[0]) << 56) | ((qoa_uint64_t)(bytes[1]) << 48) |
    324 		((qoa_uint64_t)(bytes[2]) << 40) | ((qoa_uint64_t)(bytes[3]) << 32) |
    325 		((qoa_uint64_t)(bytes[4]) << 24) | ((qoa_uint64_t)(bytes[5]) << 16) |
    326 		((qoa_uint64_t)(bytes[6]) <<  8) | ((qoa_uint64_t)(bytes[7]) <<  0);
    327 }
    328 
    329 static inline void qoa_write_u64(qoa_uint64_t v, unsigned char *bytes, unsigned int *p) {
    330 	bytes += *p;
    331 	*p += 8;
    332 	bytes[0] = (v >> 56) & 0xff;
    333 	bytes[1] = (v >> 48) & 0xff;
    334 	bytes[2] = (v >> 40) & 0xff;
    335 	bytes[3] = (v >> 32) & 0xff;
    336 	bytes[4] = (v >> 24) & 0xff;
    337 	bytes[5] = (v >> 16) & 0xff;
    338 	bytes[6] = (v >>  8) & 0xff;
    339 	bytes[7] = (v >>  0) & 0xff;
    340 }
    341 
    342 
    343 /* -----------------------------------------------------------------------------
    344 	Encoder */
    345 
    346 unsigned int qoa_encode_header(qoa_desc *qoa, unsigned char *bytes) {
    347 	unsigned int p = 0;
    348 	qoa_write_u64(((qoa_uint64_t)QOA_MAGIC << 32) | qoa->samples, bytes, &p);
    349 	return p;
    350 }
    351 
    352 unsigned int qoa_encode_frame(const short *sample_data, qoa_desc *qoa, unsigned int frame_len, unsigned char *bytes) {
    353 	unsigned int channels = qoa->channels;
    354 
    355 	unsigned int p = 0;
    356 	unsigned int slices = (frame_len + QOA_SLICE_LEN - 1) / QOA_SLICE_LEN;
    357 	unsigned int frame_size = QOA_FRAME_SIZE(channels, slices);
    358 	int prev_scalefactor[QOA_MAX_CHANNELS] = {0};
    359 
    360 	/* Write the frame header */
    361 	qoa_write_u64((
    362 		(qoa_uint64_t)qoa->channels   << 56 |
    363 		(qoa_uint64_t)qoa->samplerate << 32 |
    364 		(qoa_uint64_t)frame_len       << 16 |
    365 		(qoa_uint64_t)frame_size
    366 	), bytes, &p);
    367 
    368 	
    369 	for (unsigned int c = 0; c < channels; c++) {
    370 		/* Write the current LMS state */
    371 		qoa_uint64_t weights = 0;
    372 		qoa_uint64_t history = 0;
    373 		for (int i = 0; i < QOA_LMS_LEN; i++) {
    374 			history = (history << 16) | (qoa->lms[c].history[i] & 0xffff);
    375 			weights = (weights << 16) | (qoa->lms[c].weights[i] & 0xffff);
    376 		}
    377 		qoa_write_u64(history, bytes, &p);
    378 		qoa_write_u64(weights, bytes, &p);
    379 	}
    380 
    381 	/* We encode all samples with the channels interleaved on a slice level.
    382 	E.g. for stereo: (ch-0, slice 0), (ch 1, slice 0), (ch 0, slice 1), ...*/
    383 	for (unsigned int sample_index = 0; sample_index < frame_len; sample_index += QOA_SLICE_LEN) {
    384 
    385 		for (unsigned int c = 0; c < channels; c++) {
    386 			int slice_len = qoa_clamp(QOA_SLICE_LEN, 0, frame_len - sample_index);
    387 			int slice_start = sample_index * channels + c;
    388 			int slice_end = (sample_index + slice_len) * channels + c;			
    389 
    390 			/* Brute for search for the best scalefactor. Just go through all
    391 			16 scalefactors, encode all samples for the current slice and 
    392 			meassure the total squared error. */
    393 			qoa_uint64_t best_rank = -1;
    394 			#ifdef QOA_RECORD_TOTAL_ERROR
    395 				qoa_uint64_t best_error = -1;
    396 			#endif
    397 			qoa_uint64_t best_slice = 0;
    398 			qoa_lms_t best_lms;
    399 			int best_scalefactor = 0;
    400 
    401 			for (int sfi = 0; sfi < 16; sfi++) {
    402 				/* There is a strong correlation between the scalefactors of
    403 				neighboring slices. As an optimization, start testing
    404 				the best scalefactor of the previous slice first. */
    405 				int scalefactor = (sfi + prev_scalefactor[c]) % 16;
    406 
    407 				/* We have to reset the LMS state to the last known good one
    408 				before trying each scalefactor, as each pass updates the LMS
    409 				state when encoding. */
    410 				qoa_lms_t lms = qoa->lms[c];
    411 				qoa_uint64_t slice = scalefactor;
    412 				qoa_uint64_t current_rank = 0;
    413 				#ifdef QOA_RECORD_TOTAL_ERROR
    414 					qoa_uint64_t current_error = 0;
    415 				#endif
    416 
    417 				for (int si = slice_start; si < slice_end; si += channels) {
    418 					int sample = sample_data[si];
    419 					int predicted = qoa_lms_predict(&lms);
    420 
    421 					int residual = sample - predicted;
    422 					int scaled = qoa_div(residual, scalefactor);
    423 					int clamped = qoa_clamp(scaled, -8, 8);
    424 					int quantized = qoa_quant_tab[clamped + 8];
    425 					int dequantized = qoa_dequant_tab[scalefactor][quantized];
    426 					int reconstructed = qoa_clamp_s16(predicted + dequantized);
    427 
    428 
    429 					/* If the weights have grown too large, we introduce a penalty
    430 					here. This prevents pops/clicks in certain problem cases */
    431 					int weights_penalty = ((
    432 						lms.weights[0] * lms.weights[0] + 
    433 						lms.weights[1] * lms.weights[1] + 
    434 						lms.weights[2] * lms.weights[2] + 
    435 						lms.weights[3] * lms.weights[3]
    436 					) >> 18) - 0x8ff;
    437 					if (weights_penalty < 0) {
    438 						weights_penalty = 0;
    439 					}
    440 
    441 					long long error = (sample - reconstructed);
    442 					qoa_uint64_t error_sq = error * error;
    443 
    444 					current_rank += error_sq + weights_penalty * weights_penalty;
    445 					#ifdef QOA_RECORD_TOTAL_ERROR
    446 						current_error += error_sq;
    447 					#endif
    448 					if (current_rank > best_rank) {
    449 						break;
    450 					}
    451 
    452 					qoa_lms_update(&lms, reconstructed, dequantized);
    453 					slice = (slice << 3) | quantized;
    454 				}
    455 
    456 				if (current_rank < best_rank) {
    457 					best_rank = current_rank;
    458 					#ifdef QOA_RECORD_TOTAL_ERROR
    459 						best_error = current_error;
    460 					#endif
    461 					best_slice = slice;
    462 					best_lms = lms;
    463 					best_scalefactor = scalefactor;
    464 				}
    465 			}
    466 
    467 			prev_scalefactor[c] = best_scalefactor;
    468 
    469 			qoa->lms[c] = best_lms;
    470 			#ifdef QOA_RECORD_TOTAL_ERROR
    471 				qoa->error += best_error;
    472 			#endif
    473 
    474 			/* If this slice was shorter than QOA_SLICE_LEN, we have to left-
    475 			shift all encoded data, to ensure the rightmost bits are the empty
    476 			ones. This should only happen in the last frame of a file as all
    477 			slices are completely filled otherwise. */
    478 			best_slice <<= (QOA_SLICE_LEN - slice_len) * 3;
    479 			qoa_write_u64(best_slice, bytes, &p);
    480 		}
    481 	}
    482 	
    483 	return p;
    484 }
    485 
    486 void *qoa_encode(const short *sample_data, qoa_desc *qoa, unsigned int *out_len) {
    487 	if (
    488 		qoa->samples == 0 || 
    489 		qoa->samplerate == 0 || qoa->samplerate > 0xffffff ||
    490 		qoa->channels == 0 || qoa->channels > QOA_MAX_CHANNELS
    491 	) {
    492 		return NULL;
    493 	}
    494 
    495 	/* Calculate the encoded size and allocate */
    496 	unsigned int num_frames = (qoa->samples + QOA_FRAME_LEN-1) / QOA_FRAME_LEN;
    497 	unsigned int num_slices = (qoa->samples + QOA_SLICE_LEN-1) / QOA_SLICE_LEN;
    498 	unsigned int encoded_size = 8 +                    /* 8 byte file header */
    499 		num_frames * 8 +                               /* 8 byte frame headers */
    500 		num_frames * QOA_LMS_LEN * 4 * qoa->channels + /* 4 * 4 bytes lms state per channel */
    501 		num_slices * 8 * qoa->channels;                /* 8 byte slices */
    502 
    503 	unsigned char *bytes = QOA_MALLOC(encoded_size);
    504 
    505 	for (unsigned int c = 0; c < qoa->channels; c++) {
    506 		/* Set the initial LMS weights to {0, 0, -1, 2}. This helps with the 
    507 		prediction of the first few ms of a file. */
    508 		qoa->lms[c].weights[0] = 0;
    509 		qoa->lms[c].weights[1] = 0;
    510 		qoa->lms[c].weights[2] = -(1<<13);
    511 		qoa->lms[c].weights[3] =  (1<<14);
    512 
    513 		/* Explicitly set the history samples to 0, as we might have some
    514 		garbage in there. */
    515 		for (int i = 0; i < QOA_LMS_LEN; i++) {
    516 			qoa->lms[c].history[i] = 0;
    517 		}
    518 	}
    519 
    520 
    521 	/* Encode the header and go through all frames */
    522 	unsigned int p = qoa_encode_header(qoa, bytes);
    523 	#ifdef QOA_RECORD_TOTAL_ERROR
    524 		qoa->error = 0;
    525 	#endif
    526 
    527 	int frame_len = QOA_FRAME_LEN;
    528 	for (unsigned int sample_index = 0; sample_index < qoa->samples; sample_index += frame_len) {
    529 		frame_len = qoa_clamp(QOA_FRAME_LEN, 0, qoa->samples - sample_index);		
    530 		const short *frame_samples = sample_data + sample_index * qoa->channels;
    531 		unsigned int frame_size = qoa_encode_frame(frame_samples, qoa, frame_len, bytes + p);
    532 		p += frame_size;
    533 	}
    534 
    535 	*out_len = p;
    536 	return bytes;
    537 }
    538 
    539 
    540 
    541 /* -----------------------------------------------------------------------------
    542 	Decoder */
    543 
    544 unsigned int qoa_max_frame_size(qoa_desc *qoa) {
    545 	return QOA_FRAME_SIZE(qoa->channels, QOA_SLICES_PER_FRAME);
    546 }
    547 
    548 unsigned int qoa_decode_header(const unsigned char *bytes, int size, qoa_desc *qoa) {
    549 	unsigned int p = 0;
    550 	if (size < QOA_MIN_FILESIZE) {
    551 		return 0;
    552 	}
    553 
    554 
    555 	/* Read the file header, verify the magic number ('qoaf') and read the 
    556 	total number of samples. */
    557 	qoa_uint64_t file_header = qoa_read_u64(bytes, &p);
    558 
    559 	if ((file_header >> 32) != QOA_MAGIC) {
    560 		return 0;
    561 	}
    562 
    563 	qoa->samples = file_header & 0xffffffff;
    564 	if (!qoa->samples) {
    565 		return 0;
    566 	}
    567 
    568 	/* Peek into the first frame header to get the number of channels and
    569 	the samplerate. */
    570 	qoa_uint64_t frame_header = qoa_read_u64(bytes, &p);
    571 	qoa->channels   = (frame_header >> 56) & 0x0000ff;
    572 	qoa->samplerate = (frame_header >> 32) & 0xffffff;
    573 
    574 	if (qoa->channels == 0 || qoa->samples == 0 || qoa->samplerate == 0) {
    575 		return 0;
    576 	}
    577 
    578 	return 8;
    579 }
    580 
    581 unsigned int qoa_decode_frame(const unsigned char *bytes, unsigned int size, qoa_desc *qoa, short *sample_data, unsigned int *frame_len) {
    582 	unsigned int p = 0;
    583 	*frame_len = 0;
    584 
    585 	if (size < 8 + QOA_LMS_LEN * 4 * qoa->channels) {
    586 		return 0;
    587 	}
    588 
    589 	/* Read and verify the frame header */
    590 	qoa_uint64_t frame_header = qoa_read_u64(bytes, &p);
    591 	unsigned int channels   = (frame_header >> 56) & 0x0000ff;
    592 	unsigned int samplerate = (frame_header >> 32) & 0xffffff;
    593 	unsigned int samples    = (frame_header >> 16) & 0x00ffff;
    594 	unsigned int frame_size = (frame_header      ) & 0x00ffff;
    595 
    596 	unsigned int data_size = frame_size - 8 - QOA_LMS_LEN * 4 * channels;
    597 	unsigned int num_slices = data_size / 8;
    598 	unsigned int max_total_samples = num_slices * QOA_SLICE_LEN;
    599 
    600 	if (
    601 		channels != qoa->channels || 
    602 		samplerate != qoa->samplerate ||
    603 		frame_size > size ||
    604 		samples * channels > max_total_samples
    605 	) {
    606 		return 0;
    607 	}
    608 
    609 
    610 	/* Read the LMS state: 4 x 2 bytes history, 4 x 2 bytes weights per channel */
    611 	for (unsigned int c = 0; c < channels; c++) {
    612 		qoa_uint64_t history = qoa_read_u64(bytes, &p);
    613 		qoa_uint64_t weights = qoa_read_u64(bytes, &p);
    614 		for (int i = 0; i < QOA_LMS_LEN; i++) {
    615 			qoa->lms[c].history[i] = ((signed short)(history >> 48));
    616 			history <<= 16;
    617 			qoa->lms[c].weights[i] = ((signed short)(weights >> 48));
    618 			weights <<= 16;
    619 		}
    620 	}
    621 
    622 
    623 	/* Decode all slices for all channels in this frame */
    624 	for (unsigned int sample_index = 0; sample_index < samples; sample_index += QOA_SLICE_LEN) {
    625 		for (unsigned int c = 0; c < channels; c++) {
    626 			qoa_uint64_t slice = qoa_read_u64(bytes, &p);
    627 
    628 			int scalefactor = (slice >> 60) & 0xf;
    629 			slice <<= 4;
    630 
    631 			int slice_start = sample_index * channels + c;
    632 			int slice_end = qoa_clamp(sample_index + QOA_SLICE_LEN, 0, samples) * channels + c;
    633 
    634 			for (int si = slice_start; si < slice_end; si += channels) {
    635 				int predicted = qoa_lms_predict(&qoa->lms[c]);
    636 				int quantized = (slice >> 61) & 0x7;
    637 				int dequantized = qoa_dequant_tab[scalefactor][quantized];
    638 				int reconstructed = qoa_clamp_s16(predicted + dequantized);
    639 				
    640 				sample_data[si] = reconstructed;
    641 				slice <<= 3;
    642 
    643 				qoa_lms_update(&qoa->lms[c], reconstructed, dequantized);
    644 			}
    645 		}
    646 	}
    647 
    648 	*frame_len = samples;
    649 	return p;
    650 }
    651 
    652 short *qoa_decode(const unsigned char *bytes, int size, qoa_desc *qoa) {
    653 	unsigned int p = qoa_decode_header(bytes, size, qoa);
    654 	if (!p) {
    655 		return NULL;
    656 	}
    657 
    658 	/* Calculate the required size of the sample buffer and allocate */
    659 	int total_samples = qoa->samples * qoa->channels;
    660 	short *sample_data = QOA_MALLOC(total_samples * sizeof(short));
    661 
    662 	unsigned int sample_index = 0;
    663 	unsigned int frame_len;
    664 	unsigned int frame_size;
    665 
    666 	/* Decode all frames */
    667 	do {
    668 		short *sample_ptr = sample_data + sample_index * qoa->channels;
    669 		frame_size = qoa_decode_frame(bytes + p, size - p, qoa, sample_ptr, &frame_len);
    670 
    671 		p += frame_size;
    672 		sample_index += frame_len;
    673 	} while (frame_size && sample_index < qoa->samples);
    674 
    675 	qoa->samples = sample_index;
    676 	return sample_data;
    677 }
    678 
    679 
    680 
    681 /* -----------------------------------------------------------------------------
    682 	File read/write convenience functions */
    683 
    684 #ifndef QOA_NO_STDIO
    685 #include <stdio.h>
    686 
    687 int qoa_write(const char *filename, const short *sample_data, qoa_desc *qoa) {
    688 	FILE *f = fopen(filename, "wb");
    689 	unsigned int size;
    690 	void *encoded;
    691 
    692 	if (!f) {
    693 		return 0;
    694 	}
    695 
    696 	encoded = qoa_encode(sample_data, qoa, &size);
    697 	if (!encoded) {
    698 		fclose(f);
    699 		return 0;
    700 	}
    701 
    702 	fwrite(encoded, 1, size, f);
    703 	fclose(f);
    704 
    705 	QOA_FREE(encoded);
    706 	return size;
    707 }
    708 
    709 void *qoa_read(const char *filename, qoa_desc *qoa) {
    710 	FILE *f = fopen(filename, "rb");
    711 	int size, bytes_read;
    712 	void *data;
    713 	short *sample_data;
    714 
    715 	if (!f) {
    716 		return NULL;
    717 	}
    718 
    719 	fseek(f, 0, SEEK_END);
    720 	size = ftell(f);
    721 	if (size <= 0) {
    722 		fclose(f);
    723 		return NULL;
    724 	}
    725 	fseek(f, 0, SEEK_SET);
    726 
    727 	data = QOA_MALLOC(size);
    728 	if (!data) {
    729 		fclose(f);
    730 		return NULL;
    731 	}
    732 
    733 	bytes_read = fread(data, 1, size, f);
    734 	fclose(f);
    735 
    736 	sample_data = qoa_decode(data, bytes_read, qoa);
    737 	QOA_FREE(data);
    738 	return sample_data;
    739 }
    740 
    741 #endif /* QOA_NO_STDIO */
    742 #endif /* QOA_IMPLEMENTATION */