Thursday, September 20, 2007

Brainfuck Optimizer

So the improvement to make was turning consecutive p++;p++;p++; type things into p+=3;
The first step is separating the code generating parts from the bit which chooses which code to generate, like so:


void bf_emit_gt(FILE * output_file) { fputs("++p;", output_file); }
void bf_emit_lt(FILE * output_file) { fputs("--p;", output_file); }
void bf_emit_add(FILE * output_file) { fputs("++*p;", output_file); }
void bf_emit_sub(FILE * output_file) { fputs("--*p;", output_file); }
void bf_emit_dot(FILE * output_file) { fputs("putchar(*p);", output_file); }
void bf_emit_comma(FILE * output_file) { fputs("*p = getchar();", output_file); }
void bf_emit_left(FILE * output_file) { fputs("while (*p) {", output_file); }
void bf_emit_right(FILE * output_file) { fputs("}", output_file); }


and then updating the code switch/case stuff to use them:


while((ch = fgetc(input_file)) != EOF) {
switch(ch) {
case '>': bf_emit_gt(output_file); break;
case '<': bf_emit_lt(output_file); break;
case '+': bf_emit_add(output_file); break;
case '-': bf_emit_sub(output_file); break;
case '.': bf_emit_dot(output_file); break;
case ',': bf_emit_comma(output_file); break;
case '[': bf_emit_left(output_file); break;
case ']': bf_emit_right(output_file); break;
}
}


At this moment, the program should do the exact same thing as before and it would be a good plan to test it at this point.
Next up modify the functions so they can print out p+=4; and such.


void bf_emit_gt(FILE * output_file, int run_length) { fprintf(output_file, "p += %d;", run_length); }
void bf_emit_lt(FILE * output_file, int run_length) { fprintf(output_file, "p -= %d;", run_length); }
void bf_emit_add(FILE * output_file, int run_length) { fprintf(output_file, "*p+=%d;", run_length); }
void bf_emit_sub(FILE * output_file, int run_length) { fprintf(output_file, "*p-=%d;", run_length); }
void bf_emit_dot(FILE * output_file) { fputs("putchar(*p);", output_file); }
void bf_emit_comma(FILE * output_file) { fputs("*p = getchar();", output_file); }
void bf_emit_left(FILE * output_file) { fputs("while (*p) {", output_file); }
void bf_emit_right(FILE * output_file) { fputs("}", output_file); }


Now main can use them, counting up run length until the next character isn't of the same kind.


int main(int argc, char ** argv) {
int ch;
int run_length = 0;
int last_ch = 0;

if(!process_args(argc, argv)) {
display_usage(argv[0]); // ammendment, argv[0] is the program name
// we shall display this in usage
return EXIT_FAILURE;
}

filter_string = ".,+-<>[]";
fputs("#include <stdio.h>\n", output_file);
fputs("#include <stdlib.h>\n", output_file);
fprintf(output_file, "static int heap[%d];\n", heapsize);
fprintf(output_file, "int main(void) {int * p = heap+%d;", heapsize/2);
while((ch = fgetc(input_file)) != EOF) {
switch(ch) {
case '>':
if('>' == peekchar(input_file)) ++run_length;
else {
bf_emit_gt(output_file, ++run_length);
run_length = 0;
}
break;
case '<':
if('<' == peekchar(input_file)) ++run_length;
else {
bf_emit_lt(output_file, ++run_length);
run_length = 0;
}
break;
case '+':
if('+' == peekchar(input_file)) ++run_length;
else {
bf_emit_add(output_file, ++run_length);
run_length = 0;
}
break;
case '-':
if('-' == peekchar(input_file)) ++run_length;
else {
bf_emit_sub(output_file, ++run_length);
run_length = 0;
}
break;
case '.': bf_emit_dot(output_file); break;
case ',': bf_emit_comma(output_file); break;
case '[': bf_emit_left(output_file); break;
case ']': bf_emit_right(output_file); break;
}
}
fputs("return EXIT_SUCCESS;}\n", output_file);
fflush(output_file);
fclose(output_file);

return EXIT_SUCCESS;
}


The bottles.bf examples compiles into 12009 bytes with these optimizations where as it was 13551 bytes before!

A Simple Brinfuck to C Compiler!

This time we're going to look at a simple brainfuck compiler to C code, based on this: http://www.muppetlabs.com/~breadbox/bf/
Incase you didn't know, brainfuck is a really cool little language which has only got 8 instructions, the code looks like this:

++++++++++[>+++++++>++++++++++>+++>+<<<<-]>++.>+.+++++++..+++.>++.<<+++++++++++++++.>.+++.------.--------.>+.>.

This tutorial is based heavily on the code from the previous one, so make sure to have read that first!


// Most of this is heavily based on the previous tutorial
// So we've just copied the file and changed a couple of things
// This time, we filter like before but print out different text
// instead.

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int heapsize;
char * filter_string;
FILE * input_file, * output_file;

void display_usage(char * name) {
printf("usage: %s [-i input filename] [-o output filename] [-h heapsize, default 512]\n", name);
}

int process_args(int argc, char ** argv) {
int ch;

// Deal with the command line arguments here
// return 1 on success, 0 on failure
// (failure meaning malformed parameters or anything)

heapsize = 512;
input_file = stdin;
output_file = stdout;

while((ch = getopt(argc, argv, "i:o:h:")) != -1) { // look at all the args + -heap
// getopt returns -1 once it's finished
if(ch == '?') return 0;

switch(ch) {
case 'i':
input_file = fopen(optarg, "r");
if(!input_file) {
printf("Could not open input file \"%s\"\n", optarg);
return 0;
}
break;
case 'o':
output_file = fopen(optarg, "w");
if(!output_file) {
printf("Could not open output file \"%s\"\n", optarg);
return 0;
}
break;
case 'h':
heapsize = atoi(optarg);
break;
}
}

return 1;
}

// Heres the first version:
int main(int argc, char ** argv) {
int ch;

if(!process_args(argc, argv)) {
display_usage(argv[0]); // ammendment, argv[0] is the program name
// we shall display this in usage
return EXIT_FAILURE;
}

filter_string = ".,+-<>[]";
fputs("#include <stdio.h>\n", output_file);
fputs("#include <stdlib.h>\n", output_file);
fprintf(output_file, "static int heap[%d];\n", heapsize);
fprintf(output_file, "int main(void) {int * p = heap+%d;", heapsize/2);
while((ch = fgetc(input_file)) != EOF)
switch(ch) {
case '>': break;
case '<': fputs("--p;", output_file); break;
case '+': fputs("++*p;", output_file); break;
case '-': fputs("--*p;", output_file); break;
case '.': fputs("putchar(*p);", output_file); break;
case ',': fputs("*p = getchar();", output_file); break;
case '[': fputs("while (*p) {", output_file); break;
case ']': fputs("}", output_file); break;
}
fputs("return EXIT_SUCCESS;}\n", output_file);
fflush(output_file);
fclose(output_file);

return EXIT_SUCCESS;
}

// it seesm to work well but there's some improvements possible.
// Example outout: int main(void) {int * p = heap+256;putchar(*p);++p;++*p;++*p;++*p;++*p;++*p;++*p;++*p;++p;++*p;++*p;++*p;++*p;++*p;++*p;++*p;++*p;++*p;++*p;while (*p) {--p;++*p;++*p;++*p;++*p;++*p;++p;--*p;}++*p;++*p;++*p;++*p;++*p;++*p;++*p;++p;++*p;++*p;++*p;++*p;++*p;++*p;++*p;++*p;++*p;++*p;while (*p) {--p;++....
// all these ++p could be turned into a single p+=50;
// similarly with -- ++* and --*
// that'll be next!

Wednesday, September 19, 2007

GATTACA

As usual the source code is a walkthrough. This time we're preparing for a bit of data processing/compression. We're dealing with the 65.14 MB file of the X chromosome from the human genome project.

The warmup (this post) is to write a little preprocessing tool.


// http://www.gutenberg.org/etext/3501

// We got this big DNA file but it's got NNNNNNNNNNNN in it, as well as newlines
// so lets filter out everything that isnt A C T or G

// So for the first we're writing a "real" program which will take command line
// arguments and process files.

// Well take a input filename (if not supplied use stdin),
// output filename (if not supplied use stdout), and a filter string (if not
// supplied error)

// to parse command line arguments you could
// 1) Do what most people do and spend a few mins writing
// a silly watered down version of getopt (every time)
// 2) Spend a few mins using getopt then knowing how to do it for future.
// The choice is obvious, don't make the same mistake as others!

// The basic structure of our command line processing etc shalle be:

/*
* int main(int argc, char ** argv) {
* if(!process_args(argc, argv)) {
* display_usage();
* return EXIT_FAILURE;
* }
*
* // ... do the actual stuff ...
*
* return EXIT_SUCCESS;
* }
* * * * * * * * * * *
*/

// I don't bother declaring the input, output files and filter string
// then passing them around as paramters, because they can be declared
// at the toplevel instead.

// The next stage is to have a quick read through the getopt manpage
// in a console/terminal do: `man 3 getopt`

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

FILE * input_file, * output_file;
char * filter_string;

void display_usage(char * name) {
printf("usage: %s \"filter-string\" [-i input filename] [-o output filename]\n", name);
}

int process_args(int argc, char ** argv) {
int ch;

// Deal with the command line arguments here
// return 1 on success, 0 on failure
// (failure meaning malformed parameters or anything)

if(argc <= 1)
return 0; // we require a filter string

filter_string = argv[1];
input_file = stdin;
output_file = stdout;

while((ch = getopt(argc-1, argv+1, "i:o:")) != -1) {
// getopt returns -1 once it's finished
if(ch == '?') return 0;

switch(ch) {
case 'i':
input_file = fopen(optarg, "r");
if(!input_file) {
printf("Could not open input file \"%s\"\n", optarg);
return 0;
}
break;
case 'o':
output_file = fopen(optarg, "w");
if(!output_file) {
printf("Could not open output file \"%s\"\n", optarg);
return 0;
}
break;
}
}

return 1;
}

int main(int argc, char ** argv) {
int ch;

if(!process_args(argc, argv)) {
display_usage(argv[0]); // ammendment, argv[0] is the program name
// we shall display this in usage
return EXIT_FAILURE;
}

// Now that we've parsed the command line args and set things up
// we now have an open input and output file, and a filter string
// the task now is simply read each char and print it if its in the string
// once we hit EOF, flush the output and finish.

while((ch = fgetc(input_file)) != EOF)
if(strchr(filter_string, ch))
fputc(ch, output_file);
fputc('\n', output_file);
fflush(output_file);
fclose(output_file);

return EXIT_SUCCESS;
}

// It even seems to work!
// % ./filter '(){}' -i filter.c
// ()()())())(){(()){()}}(){()}(){()()((())){()(){()(){()}()(){()}}}}(){(()){()}((()))(())()()()()}

// so now we can process the DNA file, strip out anything thats not "ACTG"
// % time ./filter "ACTG" -i dna.txt -o dna_flat.txt
// ./filter "ACTG" -i dna.txt -o dna_flat.txt 1.74s user 0.21s system 98% cpu 1.985 total
// I was going to write about how to improve the speed of the program next,
// the problem is it only took 2 seconds to process 66.9 MB of text
// It should be noted that the naive approach to solving this problem was
// 1) The easiest to program
// 2) Got the job done quickly
// 3) Required no optimisation
//
// So in general it's good to do the easiest thing (be lazy! + stay simple)
// and optimize when you find out it's too slow/uses to much ram (if it ever is/does)

Code in motion

Here's a preview of an upcoming text:


(click to see it in motion!)

Anti Idoms

Great post here, http://www.matasano.com/log/182/four-c-programming-anti-idioms/
Raises some very good points which aren't mentioned often enough.

more here http://www.cpax.org.uk/prg/writings/casting.php

Hello World!

First post, Here goes!
Please comment :)


// This is a comment, everything in comments are ignored
// in C. More specifically the C preprocessor removes
// them before the compiler has a chance to ignore it.

#include <stdio.h>
#include <stdlib.h>

// stdio declares various standard input/output functions
// we need to include it for the prototype of printf
// #include is like a textual substitution of the files
// content with that line.

// stdlib defines EXIT_SUCCESS which any working program
// should return when it halts execution, unless of course
// somthing miserable happened.

int main(void) {
// main is the function which every standalone C program
// defines, The OS will load up the program and call main
// then the ball starts rolling and, then..!

printf("Hello world!\n");

// Well there many subtle dangers to be aware of with here..
// Why on *earth* are you printf?
// Lets see what printf actually does:
//
//>> `man printf`
//>>
//>> int
//>> printf(const char * restrict format, ...);
//
// First of all printf takes a 'format' *that* doesn't sound
// like what *we* give it.. What format is the format!
//
// Well there are several pages of very scrutinous detail about
// about this apparently simple function infact there's even
// 8 lines talking about its "BUGS"!!
// If you put down printf("%s") no-one knows what will happen
// You and I certainly don't (and can't), This is Unspecified
// Behavior (Dangerous, A major cause of BUGS!).
//
// All we wanted was a bit of text output, and
// there's a function that does just that:
//
//>> int
//>> puts(const char *str);
//
// This would suit our task, and print a newline to boot!
// (the reason for doing that is most terminals are
// line buffered so you wont see anything until you
// hit the next line, or flush stdout.)
//
// So one should have wrote puts("Hello world!") instead.
// Let's see how we can avoid using Dangrous and
// Unnecessary tools for simple tasks in future.

return EXIT_SUCCESS;

// 'return' is like jumping out of the function with,
// in this case the value of EXIT_SUCCESS. That means that
// nothing down here can ever be reached, we're in
// No-Mans-Land.
}