http://en.wikipedia.org/wiki/Context-free_grammar
Post parsers in your language!
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <limits.h>
/* CFG grammar */
typedef struct {
int c;
const char *str;
} cfg_t;
int eval(const char *buf, const cfg_t *, char *res, size_t sz);
int main(void)
{
char buf[BUFSIZ];
char res[BUFSIZ];
char *p, *s, *tmp;
int ret;
size_t n;
const cfg_t cfg[26] = {
{'A', "hello"},
{'B', "world"},
{'C', "A B"},
{'D', "A D"}
};
while (fgets(buf, sizeof buf, stdin) != NULL) {
if (*buf && buf[strlen(buf) - 1] == '\n')
buf[strlen(buf) - 1] = 0;
p = buf;
s = res;
n = 0;
while ((ret = eval(p, cfg, s, BUFSIZ)) == 1) {
printf("%lu ==> %s\n", (unsigned long) n, s);
tmp = p;
p = s;
s = tmp;
n++;
}
if (ret == -1)
printf("'%s': expansion too big to fit in %d buf\n",
p, BUFSIZ);
else
printf("result: '%s'\n", s);
}
if (ferror(stdin)) {
perror("stdin");
return EXIT_FAILURE;
}
return 0;
}
int eval(const char *buf, const cfg_t * cfg, char *res, size_t sz)
{
size_t i;
int done = 0;
while (*buf != 0) {
if (isupper((unsigned char) *buf)) {
for (i = 0; i < 26; i++)
if (*buf == cfg[i].c)
break;
if (i != 26) {
done = 1;
if (strlen(cfg[i].str) >= sz)
return -1;
strcpy(res, cfg[i].str);
res += strlen(cfg[i].str);
sz -= strlen(cfg[i].str);
}
} else {
if (sz <= 1)
return -1;
*res = *buf;
res++;
sz--;
}
buf++;
}
*res = 0;
return done;
}
example
$ ./cfg
A
0 ==> hello
result: 'hello'
A B
0 ==> hello world
result: 'hello world'
BB
0 ==> worldworld
result: 'worldworld'
C
0 ==> A B
1 ==> hello world
result: 'hello world'
^D
$
http://dis.4chan.org/read/prog/1208126223/
Copy threads from 4chan!
This was a quick hack to create an assembler for a toy bytecode interpreter, part of a demonstration I did. It started off as a recursive-descent parser, but ended up being complete overkill for something so simple.
#!/usr/bin/env python
import sys
import re
def wrong_number_of_operands( opcode_name ):
exit( 'Wrong number of operands given for %s on line %d.' % ( opcode_name, line_num ) )
def bad_label( label ):
exit( 'Attempt to call unknown label "%s" on line %d.' % ( label, line_num ) )
def label_too_high( label ):
exit( 'Label "%s" on line %d has an address beyond 256 bytes.' % ( label, line_num ) )
def overflow( args ):
for arg in args:
if arg:
opcode_name = sys._getframe(1).f_code.co_name
wrong_number_of_operands( opcode_name )
def num( string ):
try:
return chr( int( string ) )
except TypeError:
opcode_name = sys._getframe(1).f_code.co_name
wrong_number_of_operands( opcode_name )
except ValueError:
exit( string + ' is not valid operand.' )
def fixup_labels():
for deferred_label in deferred_labels.keys():
target_address = label( deferred_label, bad_label )
for address in deferred_labels[ deferred_label ]:
word = bytes[ address ]
bytes[ address ] = word[ 0:2 ] + target_address + word[ 3 ]
def defer_label( label ):
if label not in deferred_labels.keys():
deferred_labels[ label ] = []
current_word = len( bytes )
deferred_labels[ label ].append( current_word )
return chr( 255 )
def label( string, keyerror_handler ):
try:
return chr( labels[ string ] )
except KeyError:
return keyerror_handler( string )
except ValueError:
label_too_high( string )
def je( operand1, operand2 ):
return 'e' + num( operand1 ) + label( operand2, defer_label )
def jne( operand1, operand2 ):
return 'n' + num( operand1 ) + label( operand2, defer_label )
def jg( operand1, operand2 ):
return '>' + num( operand1 ) + label( operand2, defer_label )
def jl( operand1, operand2 ):
return '<' + num( operand1 ) + label( operand2, defer_label )
def jmp( operand1, *args ):
overflow( args );
return 'j' + chr(0) + label( operand1, defer_label )
def call( operand1, *args ):
overflow( args )
return 'c' + chr(0) + label( operand1, defer_label )
def ret( *args ):
overflow( args )
return 'r'
def add( *args ):
overflow( args )
return '+'
def sub( *args ):
overflow( args )
return '-'
def mul( *args ):
overflow( args )
return '*'
def idiv( *args ):
overflow( args )
return '/' + num( operand1 )
def push( operand1, *args ):
overflow( args )
return 'u' + num( operand1 )
def pop( *args ):
overflow( args )
return 'o'
def dup( *args ):
overflow( args )
return 'd'
def swap( *args ):
overflow( args )
return 's'
def get( *args ):
overflow( args )
return 'g'
def put( *args ):
overflow( args )
return 'p'
def quit( *args ):
overflow( args )
return 'q'
def pad( bytes, size ):
pad_size = size - len( bytes )
return bytes + chr(0) * pad_size
def bytes_for( opcode, operand1, operand2 ):
opcode_val = { 'je' : je,
'jne' : jne,
'jg' : jg,
'jl' : jl,
'jmp' : jmp,
'call' : call,
'ret' : ret,
'add' : add,
'sub' : sub,
'mul' : mul,
'idiv' : idiv,
'push' : push,
'pop' : pop,
'dup' : dup,
'swap' : swap,
'put' : put,
'get' : get,
'quit' : quit }
if opcode in opcode_val:
bytes = opcode_val[ opcode ]( operand1, operand2 )
return pad( bytes, WORD_SIZE )
else:
exit( '"%s" is not a valid opcode.' % opcode )
def compile( assembly ):
global line_num, deferred_labels, labels, bytes
line_num = 0
deferred_labels = {}
labels = {}
bytes = []
regex = re.compile( r"""
(?:(\w+)\s*:)? # label
\s*(\w+) # opcode
(?:\s+(\w+))? # operand1
(?:\s*,\s*(\w+))? # operand2
""", re.VERBOSE )
for line in assembly.split( "\n" ):
instruction = regex.match( line )
line_num += 1
if instruction:
label, opcode, operand1, operand2 = instruction.groups()
if label:
labels[ label ] = len( bytes )
bytes.append( bytes_for( opcode, operand1, operand2 ) )
fixup_labels()
return ''.join( bytes )
def read_data_in( file_name ):
file = open( file_name , "r" )
data = file.read()
file.close()
return data
def write_out( bytecode, file_name ):
file = open( file_name , "wb" )
file.write( bytecode )
file.close()
WORD_SIZE = 4
assembly = read_data_in( sys.argv[1] )
bytecode = compile( assembly )
write_out( bytecode, "a.bytecode" )
I'm waiting for WAHa to post Wakaba's parser here...
It's not a parser, it's a generator that runs the CFG in reverse!
Now we need a CFG that generates CFGs that generates ???.