Created
September 12, 2017 10:25
-
-
Save horrorho/7837e9b83f2aa42d2781374c99fd0ba3 to your computer and use it in GitHub Desktop.
Archive of mattmahoney.net/dc/tcgen100.zip
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| /* tcgen v1.00 - Test file generator for testing compression programs. | |
| (C) 2011, Dell Inc. Written by Matt Mahoney | |
| LICENSE | |
| This program is free software; you can redistribute it and/or | |
| modify it under the terms of the GNU General Public License as | |
| published by the Free Software Foundation; either version 3 of | |
| the License, or (at your option) any later version. | |
| This program is distributed in the hope that it will be useful, but | |
| WITHOUT ANY WARRANTY; without even the implied warranty of | |
| MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
| General Public License for more details at | |
| Visit <http://www.gnu.org/copyleft/gpl.html>. | |
| This program generates random files with known entropy for testing | |
| data compression programs. A good compressor should compress a | |
| file to close to the displayed size. To create test file: | |
| tcgen output command | |
| If output is . then the output file name is the same as the command. | |
| If output is - then output to stdout in binary mode. | |
| The command is a string containing patterns of either hyphens | |
| or the letters r, n, w, b, c, v, d, followed by 1 or more digits, | |
| like "n20b5-v2". Scanning left to right, each letter sets a variable | |
| of the same name. A hyphen or the end of command causes R blocks of | |
| N words of length W bytes each in the range C..C+B-1 (mod 256) to be | |
| written. Words are chosen randomly from a vocabulary | |
| of size V, initialized at random without duplicates at the start | |
| of each block. If D > 0 then each output byte has added to it the | |
| D'th previous byte. The default pattern is r1n1w1b256c0v1d0. | |
| Any characters which do not fit a pattern will seed the random | |
| number generator differently in a predictable way, but are otherwise | |
| ignored. | |
| The entropy is calculated to be R * (log(B^W choose V) + N*log(V)), | |
| reducing V to only words which are output at least once, then | |
| displayed to stderr. It is required that V be in 1..B^W, W > 0, | |
| B in 1..256, and C in 0..255. | |
| The random number generator is a RC4 keystream seeded with the | |
| command string (including terminating NUL) as the key. When a | |
| random number in the range 0..n-1 is needed (where n is B or V), | |
| then enough bytes are generated in MSB to LSB order to cover the | |
| range, repeating until the result is in range. The exact | |
| algorithm is as follows: | |
| (return random number in range 0..n-1) | |
| random(n) | |
| max = 0 | |
| value = 0 | |
| do | |
| while max < n-1 do | |
| max = max * 256 + 255 | |
| value = value * 256 + RC4() (http://en.wikipedia.org/wiki/RC4) | |
| until value < n | |
| return value | |
| (generate output) | |
| repeat R times | |
| k = 0 | |
| buf[0..D-1] = 0 | |
| for i = 0..V-1 (initialize vocabulary) | |
| do | |
| for j = 0..W-1 | |
| dict[i][j] = random(B) + C (mod 256) | |
| while dict[i] = dict[j] for some j < i | |
| for i = 0..N-1 (output block) | |
| s = dict[random(V)] | |
| for j = 0..W-1 | |
| if D > 0 then | |
| buf[k] = buf[k] + s[j] (mod 256) | |
| output buf[k] | |
| k = (k + 1) mod D | |
| else | |
| output s[j] | |
| */ | |
| #include <stdio.h> | |
| #include <stdlib.h> | |
| #include <string.h> | |
| #include <ctype.h> | |
| #include <math.h> | |
| #include <fcntl.h> | |
| typedef unsigned int uint; | |
| typedef unsigned char byte; | |
| void usage() { | |
| fprintf(stderr, | |
| "tcgen v1.00 - File generator for testing data compressors.\n" | |
| "(C) 2011, Dell Inc. Written by Matt Mahoney %s.\n" | |
| "Licensed under GPL v3. http://www.gnu.org/copyleft/gpl.html\n" | |
| "\n" | |
| "To create a test file and show its entropy: tcgen file command\n" | |
| "If file is . then command is also the output file name.\n" | |
| "If file is - then write to standard output in binary mode.\n" | |
| "Command may contain the following patterns where # is a number:\n" | |
| " r# = set number R of blocks to write, default r1\n" | |
| " n# = set number N of words per block to write, default n1\n" | |
| " k# = set N = # * 1000\n" | |
| " m# = set N = # * 1000000\n" | |
| " w# = set word length W, default w1\n" | |
| " b# = set base (alpabet size) B, range 1..256, default b256\n" | |
| " c# = set first character of alphabet C, range 0..255, default c0\n" | |
| " v# = set vocabulary size V, range 1..B^W, default v1\n" | |
| " d# = set delta offset to D bytes, default d0 = no delta coding\n" | |
| " - or end of command = write R blocks of N words of W bytes each from\n" | |
| "a random vocabulary of V different words each block using bytes in the\n" | |
| "range C..C+B-1. If D > 0 then add the D'th previous output byte.\n" | |
| "Examples:\n" | |
| " tcgen - b1c72-c73 Write HI to stdout\n" | |
| " tcgen . n1000b1 Write 1000 zero bytes to file n1000b1\n" | |
| " tcgen x n1000v256 Write 1000 random bytes to file x\n" | |
| " tcgen - r100n4c48b2 Write 100 random 0000 or 1111 to stdout\n" | |
| " tcgen x n256b1c1d1 Write an ascending byte sequence to file x\n", | |
| __DATE__); | |
| exit(1); | |
| } | |
| // Allocate array of n bytes | |
| byte* allocate(uint n) { | |
| if (n<1) return 0; | |
| byte* p=(byte*)calloc(n, 1); | |
| if (!p) fprintf(stderr, "Out of memory\n"), exit(1); | |
| return p; | |
| } | |
| // RC4 keystream generator. | |
| // See http://en.wikipedia.org/wiki/RC4 | |
| class RC4 { | |
| public: | |
| void init(const char *key, int len); // key schedule | |
| int get(); // get 1 byte of pseudo random stream | |
| uint random(uint n); // random number in 0..n-1 | |
| private: | |
| byte s[256]; | |
| int i, j; | |
| void swap(byte& a, byte& b) { | |
| byte t=a; a=b; b=t; | |
| } | |
| } rc4; | |
| // Key schedule | |
| void RC4::init(const char *key, int len) { | |
| j=0; | |
| for (i=0; i<256; ++i) | |
| s[i]=i; | |
| for (i=0; i<256; ++i) { | |
| j=j+s[i]+key[i%len]&255; | |
| swap(s[i], s[j]); | |
| } | |
| i=j=0; | |
| } | |
| // Get 1 byte of keystream | |
| int RC4::get() { | |
| i=i+1&255; | |
| j=j+s[i]&255; | |
| swap(s[i], s[j]); | |
| return s[s[i]+s[j]&255]; | |
| } | |
| // Random number in range 0..n-1 | |
| uint RC4::random(uint n) { | |
| if (n<1) return 0; | |
| uint value, range; | |
| do { | |
| value=0; | |
| range=0; | |
| while (range<n-1) { | |
| value=value*256+get(); | |
| range=range*256+255; | |
| } | |
| } while (value>=n); | |
| return value; | |
| } | |
| int main(int argc, char** argv) { | |
| // Get args | |
| if (argc<3) | |
| usage(); | |
| // Seed PRNG | |
| RC4 rc4; | |
| rc4.init(argv[1], strlen(argv[2])+1); | |
| // Get output file name | |
| FILE* out=0; | |
| const char* filename=argv[1]; | |
| if (!strcmp(filename, ".")) filename=argv[2]; | |
| else if (!strcmp(filename, "-")) filename=0; // stdout | |
| // Scan commmand | |
| double entropy=0; | |
| double size=0; | |
| uint r=1, n=1, w=1, b=256, c=0, v=1, d=0; | |
| bool done=false; | |
| for (int i=0; !done; ++i) { | |
| uint x=0; | |
| if (argv[2][i]) x=atoi(argv[2]+i+1); | |
| if (argv[2][i]==0 || argv[2][i]=='-' || isdigit(argv[2][i+1])) { | |
| switch(argv[2][i]) { | |
| case 'r': r=x; break; | |
| case 'n': n=x; break; | |
| case 'k': n=x*1000; break; | |
| case 'm': n=x*1000000; break; | |
| case 'w': w=x; break; | |
| case 'b': b=x; break; | |
| case 'c': c=x; break; | |
| case 'v': v=x; break; | |
| case 'd': d=x; break; | |
| case 0: done=true; | |
| case '-': | |
| // Generate r*n*w output bytes | |
| { | |
| // Check for valid b, w, v, c | |
| if (b<1 || b>256 || w<1 || v<1 || c>255) usage(); | |
| const double wlogb=w*log(b); | |
| if (log(v)>wlogb+1e-14) | |
| fprintf(stderr, "v=%u cannot exceed b^w=%1.0f\n", | |
| v, exp(wlogb)), exit(1); | |
| // If all is OK then open output | |
| if (!out) { | |
| if (filename) { | |
| out=fopen(filename, "wb"); | |
| if (!out) perror(filename), exit(1); | |
| } | |
| else { | |
| #ifndef unix | |
| setmode(1, O_BINARY); | |
| #endif | |
| out=stdout; | |
| } | |
| } | |
| byte* buf=allocate(d); // rotating output buffer | |
| byte* dict=allocate(v*w); // vocabulary | |
| byte* used=allocate(v); // to marked used vocabulary | |
| for (uint ir=0; ir<r; ++ir) { | |
| if (d) memset(buf, 0, d); // clear output buffer for each block | |
| memset(used, 0, v); // mark vocabulary as unused | |
| uint nused=0; // number of used[] set | |
| // Initialize vocabulary to v different words of length w | |
| for (uint i=0; i<v; ++i) { | |
| bool match; | |
| do { | |
| for (uint j=0; j<w; ++j) | |
| dict[i*w+j]=rc4.random(b)+c; | |
| match=false; | |
| for (uint j=0; j<i && !match; ++j) | |
| if (!memcmp(dict+i*w, dict+j*w, w)) | |
| match=true; | |
| } while (match); | |
| } | |
| // Output n words of length w | |
| uint k=0; // buf index | |
| for (uint i=0; i<n; ++i) { | |
| // Pick a random word. If new, add it's entropy | |
| uint rw=rc4.random(v); | |
| if (!used[rw]) { | |
| if (wlogb>50) entropy+=wlogb; | |
| else entropy+=log(exp(wlogb)-nused); | |
| entropy-=log(++nused); | |
| used[rw]=1; | |
| } | |
| // Output word, possibly delta coded | |
| const byte* s=dict+rw*w; | |
| for (uint j=0; j<w; ++j) { | |
| if (d>0) { | |
| putc(buf[k]+=s[j], out); | |
| if (++k>=d) k=0; | |
| } | |
| else | |
| putc(s[j], out); | |
| } | |
| } | |
| // Calculate entropy | |
| entropy+=n*log(v); | |
| } | |
| size+=double(r)*double(n)*double(w); | |
| free(used); | |
| free(dict); | |
| if (buf) free(buf); | |
| break; | |
| } | |
| } | |
| } | |
| } | |
| // Close output and show entropy | |
| if (out) { | |
| if (out!=stdout) fclose(out); | |
| entropy/=log(256); // in bytes | |
| fprintf(stderr, | |
| "\nGenerated %1.0f bytes with %1.2f bytes entropy (%4.2f%%)\n", | |
| size, entropy, 100*entropy/(size+1e-20)); | |
| if (filename) | |
| fprintf(stderr, "to file %s\n", filename); | |
| } | |
| else | |
| fprintf(stderr, "Command should end with . or _ and a number.\n"); | |
| return 0; | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment