[205] | 1 | /*
|
---|
| 2 | ** 2002 April 25
|
---|
| 3 | **
|
---|
| 4 | ** The author disclaims copyright to this source code. In place of
|
---|
| 5 | ** a legal notice, here is a blessing:
|
---|
| 6 | **
|
---|
| 7 | ** May you do good and not evil.
|
---|
| 8 | ** May you find forgiveness for yourself and forgive others.
|
---|
| 9 | ** May you share freely, never taking more than you give.
|
---|
| 10 | **
|
---|
| 11 | *************************************************************************
|
---|
| 12 | ** This file contains helper routines used to translate binary data into
|
---|
| 13 | ** a null-terminated string (suitable for use in SQLite) and back again.
|
---|
| 14 | ** These are convenience routines for use by people who want to store binary
|
---|
| 15 | ** data in an SQLite database. The code in this file is not used by any other
|
---|
| 16 | ** part of the SQLite library.
|
---|
| 17 | **
|
---|
| 18 | ** $Id: encode.c,v 1.12 2004/03/17 18:44:46 drh Exp $
|
---|
| 19 | */
|
---|
| 20 | #include <string.h>
|
---|
| 21 | #include <assert.h>
|
---|
| 22 |
|
---|
| 23 | /*
|
---|
| 24 | ** How This Encoder Works
|
---|
| 25 | **
|
---|
| 26 | ** The output is allowed to contain any character except 0x27 (') and
|
---|
| 27 | ** 0x00. This is accomplished by using an escape character to encode
|
---|
| 28 | ** 0x27 and 0x00 as a two-byte sequence. The escape character is always
|
---|
| 29 | ** 0x01. An 0x00 is encoded as the two byte sequence 0x01 0x01. The
|
---|
| 30 | ** 0x27 character is encoded as the two byte sequence 0x01 0x28. Finally,
|
---|
| 31 | ** the escape character itself is encoded as the two-character sequence
|
---|
| 32 | ** 0x01 0x02.
|
---|
| 33 | **
|
---|
| 34 | ** To summarize, the encoder works by using an escape sequences as follows:
|
---|
| 35 | **
|
---|
| 36 | ** 0x00 -> 0x01 0x01
|
---|
| 37 | ** 0x01 -> 0x01 0x02
|
---|
| 38 | ** 0x27 -> 0x01 0x28
|
---|
| 39 | **
|
---|
| 40 | ** If that were all the encoder did, it would work, but in certain cases
|
---|
| 41 | ** it could double the size of the encoded string. For example, to
|
---|
| 42 | ** encode a string of 100 0x27 characters would require 100 instances of
|
---|
| 43 | ** the 0x01 0x03 escape sequence resulting in a 200-character output.
|
---|
| 44 | ** We would prefer to keep the size of the encoded string smaller than
|
---|
| 45 | ** this.
|
---|
| 46 | **
|
---|
| 47 | ** To minimize the encoding size, we first add a fixed offset value to each
|
---|
| 48 | ** byte in the sequence. The addition is modulo 256. (That is to say, if
|
---|
| 49 | ** the sum of the original character value and the offset exceeds 256, then
|
---|
| 50 | ** the higher order bits are truncated.) The offset is chosen to minimize
|
---|
| 51 | ** the number of characters in the string that need to be escaped. For
|
---|
| 52 | ** example, in the case above where the string was composed of 100 0x27
|
---|
| 53 | ** characters, the offset might be 0x01. Each of the 0x27 characters would
|
---|
| 54 | ** then be converted into an 0x28 character which would not need to be
|
---|
| 55 | ** escaped at all and so the 100 character input string would be converted
|
---|
| 56 | ** into just 100 characters of output. Actually 101 characters of output -
|
---|
| 57 | ** we have to record the offset used as the first byte in the sequence so
|
---|
| 58 | ** that the string can be decoded. Since the offset value is stored as
|
---|
| 59 | ** part of the output string and the output string is not allowed to contain
|
---|
| 60 | ** characters 0x00 or 0x27, the offset cannot be 0x00 or 0x27.
|
---|
| 61 | **
|
---|
| 62 | ** Here, then, are the encoding steps:
|
---|
| 63 | **
|
---|
| 64 | ** (1) Choose an offset value and make it the first character of
|
---|
| 65 | ** output.
|
---|
| 66 | **
|
---|
| 67 | ** (2) Copy each input character into the output buffer, one by
|
---|
| 68 | ** one, adding the offset value as you copy.
|
---|
| 69 | **
|
---|
| 70 | ** (3) If the value of an input character plus offset is 0x00, replace
|
---|
| 71 | ** that one character by the two-character sequence 0x01 0x01.
|
---|
| 72 | ** If the sum is 0x01, replace it with 0x01 0x02. If the sum
|
---|
| 73 | ** is 0x27, replace it with 0x01 0x03.
|
---|
| 74 | **
|
---|
| 75 | ** (4) Put a 0x00 terminator at the end of the output.
|
---|
| 76 | **
|
---|
| 77 | ** Decoding is obvious:
|
---|
| 78 | **
|
---|
| 79 | ** (5) Copy encoded characters except the first into the decode
|
---|
| 80 | ** buffer. Set the first encoded character aside for use as
|
---|
| 81 | ** the offset in step 7 below.
|
---|
| 82 | **
|
---|
| 83 | ** (6) Convert each 0x01 0x01 sequence into a single character 0x00.
|
---|
| 84 | ** Convert 0x01 0x02 into 0x01. Convert 0x01 0x28 into 0x27.
|
---|
| 85 | **
|
---|
| 86 | ** (7) Subtract the offset value that was the first character of
|
---|
| 87 | ** the encoded buffer from all characters in the output buffer.
|
---|
| 88 | **
|
---|
| 89 | ** The only tricky part is step (1) - how to compute an offset value to
|
---|
| 90 | ** minimize the size of the output buffer. This is accomplished by testing
|
---|
| 91 | ** all offset values and picking the one that results in the fewest number
|
---|
| 92 | ** of escapes. To do that, we first scan the entire input and count the
|
---|
| 93 | ** number of occurances of each character value in the input. Suppose
|
---|
| 94 | ** the number of 0x00 characters is N(0), the number of occurances of 0x01
|
---|
| 95 | ** is N(1), and so forth up to the number of occurances of 0xff is N(255).
|
---|
| 96 | ** An offset of 0 is not allowed so we don't have to test it. The number
|
---|
| 97 | ** of escapes required for an offset of 1 is N(1)+N(2)+N(40). The number
|
---|
| 98 | ** of escapes required for an offset of 2 is N(2)+N(3)+N(41). And so forth.
|
---|
| 99 | ** In this way we find the offset that gives the minimum number of escapes,
|
---|
| 100 | ** and thus minimizes the length of the output string.
|
---|
| 101 | */
|
---|
| 102 |
|
---|
| 103 | /*
|
---|
| 104 | ** Encode a binary buffer "in" of size n bytes so that it contains
|
---|
| 105 | ** no instances of characters '\'' or '\000'. The output is
|
---|
| 106 | ** null-terminated and can be used as a string value in an INSERT
|
---|
| 107 | ** or UPDATE statement. Use sqlite_decode_binary() to convert the
|
---|
| 108 | ** string back into its original binary.
|
---|
| 109 | **
|
---|
| 110 | ** The result is written into a preallocated output buffer "out".
|
---|
| 111 | ** "out" must be able to hold at least 2 +(257*n)/254 bytes.
|
---|
| 112 | ** In other words, the output will be expanded by as much as 3
|
---|
| 113 | ** bytes for every 254 bytes of input plus 2 bytes of fixed overhead.
|
---|
| 114 | ** (This is approximately 2 + 1.0118*n or about a 1.2% size increase.)
|
---|
| 115 | **
|
---|
| 116 | ** The return value is the number of characters in the encoded
|
---|
| 117 | ** string, excluding the "\000" terminator.
|
---|
| 118 | **
|
---|
| 119 | ** If out==NULL then no output is generated but the routine still returns
|
---|
| 120 | ** the number of characters that would have been generated if out had
|
---|
| 121 | ** not been NULL.
|
---|
| 122 | */
|
---|
| 123 | int sqlite_encode_binary(const unsigned char *in, int n, unsigned char *out){
|
---|
| 124 | int i, j, e, m;
|
---|
| 125 | unsigned char x;
|
---|
| 126 | int cnt[256];
|
---|
| 127 | if( n<=0 ){
|
---|
| 128 | if( out ){
|
---|
| 129 | out[0] = 'x';
|
---|
| 130 | out[1] = 0;
|
---|
| 131 | }
|
---|
| 132 | return 1;
|
---|
| 133 | }
|
---|
| 134 | memset(cnt, 0, sizeof(cnt));
|
---|
| 135 | for(i=n-1; i>=0; i--){ cnt[in[i]]++; }
|
---|
| 136 | m = n;
|
---|
| 137 | for(i=1; i<256; i++){
|
---|
| 138 | int sum;
|
---|
| 139 | if( i=='\'' ) continue;
|
---|
| 140 | sum = cnt[i] + cnt[(i+1)&0xff] + cnt[(i+'\'')&0xff];
|
---|
| 141 | if( sum<m ){
|
---|
| 142 | m = sum;
|
---|
| 143 | e = i;
|
---|
| 144 | if( m==0 ) break;
|
---|
| 145 | }
|
---|
| 146 | }
|
---|
| 147 | if( out==0 ){
|
---|
| 148 | return n+m+1;
|
---|
| 149 | }
|
---|
| 150 | out[0] = e;
|
---|
| 151 | j = 1;
|
---|
| 152 | for(i=0; i<n; i++){
|
---|
| 153 | x = in[i] - e;
|
---|
| 154 | if( x==0 || x==1 || x=='\''){
|
---|
| 155 | out[j++] = 1;
|
---|
| 156 | x++;
|
---|
| 157 | }
|
---|
| 158 | out[j++] = x;
|
---|
| 159 | }
|
---|
| 160 | out[j] = 0;
|
---|
| 161 | assert( j==n+m+1 );
|
---|
| 162 | return j;
|
---|
| 163 | }
|
---|
| 164 |
|
---|
| 165 | /*
|
---|
| 166 | ** Decode the string "in" into binary data and write it into "out".
|
---|
| 167 | ** This routine reverses the encoding created by sqlite_encode_binary().
|
---|
| 168 | ** The output will always be a few bytes less than the input. The number
|
---|
| 169 | ** of bytes of output is returned. If the input is not a well-formed
|
---|
| 170 | ** encoding, -1 is returned.
|
---|
| 171 | **
|
---|
| 172 | ** The "in" and "out" parameters may point to the same buffer in order
|
---|
| 173 | ** to decode a string in place.
|
---|
| 174 | */
|
---|
| 175 | int sqlite_decode_binary(const unsigned char *in, unsigned char *out){
|
---|
| 176 | int i, e;
|
---|
| 177 | unsigned char c;
|
---|
| 178 | e = *(in++);
|
---|
| 179 | i = 0;
|
---|
| 180 | while( (c = *(in++))!=0 ){
|
---|
| 181 | if( c==1 ){
|
---|
| 182 | c = *(in++) - 1;
|
---|
| 183 | }
|
---|
| 184 | out[i++] = c + e;
|
---|
| 185 | }
|
---|
| 186 | return i;
|
---|
| 187 | }
|
---|
| 188 |
|
---|
| 189 | #ifdef ENCODER_TEST
|
---|
| 190 | #include <stdio.h>
|
---|
| 191 | /*
|
---|
| 192 | ** The subroutines above are not tested by the usual test suite. To test
|
---|
| 193 | ** these routines, compile just this one file with a -DENCODER_TEST=1 option
|
---|
| 194 | ** and run the result.
|
---|
| 195 | */
|
---|
| 196 | int main(int argc, char **argv){
|
---|
| 197 | int i, j, n, m, nOut, nByteIn, nByteOut;
|
---|
| 198 | unsigned char in[30000];
|
---|
| 199 | unsigned char out[33000];
|
---|
| 200 |
|
---|
| 201 | nByteIn = nByteOut = 0;
|
---|
| 202 | for(i=0; i<sizeof(in); i++){
|
---|
| 203 | printf("Test %d: ", i+1);
|
---|
| 204 | n = rand() % (i+1);
|
---|
| 205 | if( i%100==0 ){
|
---|
| 206 | int k;
|
---|
| 207 | for(j=k=0; j<n; j++){
|
---|
| 208 | /* if( k==0 || k=='\'' ) k++; */
|
---|
| 209 | in[j] = k;
|
---|
| 210 | k = (k+1)&0xff;
|
---|
| 211 | }
|
---|
| 212 | }else{
|
---|
| 213 | for(j=0; j<n; j++) in[j] = rand() & 0xff;
|
---|
| 214 | }
|
---|
| 215 | nByteIn += n;
|
---|
| 216 | nOut = sqlite_encode_binary(in, n, out);
|
---|
| 217 | nByteOut += nOut;
|
---|
| 218 | if( nOut!=strlen(out) ){
|
---|
| 219 | printf(" ERROR return value is %d instead of %d\n", nOut, strlen(out));
|
---|
| 220 | exit(1);
|
---|
| 221 | }
|
---|
| 222 | if( nOut!=sqlite_encode_binary(in, n, 0) ){
|
---|
| 223 | printf(" ERROR actual output size disagrees with predicted size\n");
|
---|
| 224 | exit(1);
|
---|
| 225 | }
|
---|
| 226 | m = (256*n + 1262)/253;
|
---|
| 227 | printf("size %d->%d (max %d)", n, strlen(out)+1, m);
|
---|
| 228 | if( strlen(out)+1>m ){
|
---|
| 229 | printf(" ERROR output too big\n");
|
---|
| 230 | exit(1);
|
---|
| 231 | }
|
---|
| 232 | for(j=0; out[j]; j++){
|
---|
| 233 | if( out[j]=='\'' ){
|
---|
| 234 | printf(" ERROR contains (')\n");
|
---|
| 235 | exit(1);
|
---|
| 236 | }
|
---|
| 237 | }
|
---|
| 238 | j = sqlite_decode_binary(out, out);
|
---|
| 239 | if( j!=n ){
|
---|
| 240 | printf(" ERROR decode size %d\n", j);
|
---|
| 241 | exit(1);
|
---|
| 242 | }
|
---|
| 243 | if( memcmp(in, out, n)!=0 ){
|
---|
| 244 | printf(" ERROR decode mismatch\n");
|
---|
| 245 | exit(1);
|
---|
| 246 | }
|
---|
| 247 | printf(" OK\n");
|
---|
| 248 | }
|
---|
| 249 | fprintf(stderr,"Finished. Total encoding: %d->%d bytes\n",
|
---|
| 250 | nByteIn, nByteOut);
|
---|
| 251 | fprintf(stderr,"Avg size increase: %.3f%%\n",
|
---|
| 252 | (nByteOut-nByteIn)*100.0/(double)nByteIn);
|
---|
| 253 | }
|
---|
| 254 | #endif /* ENCODER_TEST */
|
---|