Idea: "Dynamic LR Parsing"

Like many programmers, I write most of my programming language parsers using the straightforward Recursive Descent approach. This works nice for batch parsers, and can even be made suitable to work as a library (where we need more robust error handling -- we can't just die!) with some patience. But from time to time, I wish my parsers had more introspectability.

Here is a quick idea I've experimented with. I'm trying to do some kind of "dynamic LR parsing" where we're basically keeping a manual work stack/queue hybrid. The possible expansions are not precomputed. Instead, when interpreting a parse task state (as for example, "PARSE_EXPR"), the concrete expansion is decided (as much as possible) by looking at the current peek token. The expanded work is then inserted as childs of the current work item, and executed even before the current work item's following siblings.

The goal of this approach is to keep some of the readability and ease of writing of Recursive Descent Parsing, but get some of the introspectability and centralized control flow from LR parsing.

Here is the basic structure of such Parser and work queue. Following that will be a simple infix expression parser.

typedef struct _Parser Parser;
struct _Parser {
        // Input cursor
        const char *text;
        int pos;
        int token; // current token. For now: a character.

        // Parse state.
        int done;
        int work[64];  // current and future work (e.g. WORK_parse_program, WORK_parse_expr, WORK_make_binop_expr, etc)
        int parent[64];  // parent of each work
        int num_work;
        int current;  // index of currently interpreted work.
        int childs;  // index of first child of current work (if exists, will jump there at following call to next_work())
};

void push_work(Parser *p, int work)
{
        msg_f("push work %s", work_string[work]);
        p->work[p->num_work] = work;
        p->parent[p->num_work] = p->current;
        if (! p->childs)
                p->childs = p->num_work;
        ++ p->num_work;
}

void next_work(Parser *p)
{
        assert(p->num_work);
        assert(p->current /num_work);
        if (p->childs)
        {
                p->current = p->childs;
        }
        else
        {
                int i = p->current;
                for (;;)
                {
                        if (i == 0)
                        {
                                p->done = 1;
                                break;
                        }
                        if (i + 1 /num_work &&
                            p->parent[i + 1] == p->parent[i])
                        {
                                i = i + 1;
                                break;
                        }
                        else
                        {
                                i = p->parent[i];
                        }
                }
                p->current = i;
        }
        p->childs = 0;
}

void use_token(Parser *p)
{
        msg_f("Use '%c'", p->token);
        ++ p->pos;
        p->token = p->text[p->pos];
}

void init(Parser *p, int w, const char *text)
{
        p->work[p->num_work] = w;
        ++ p->num_work;
        p->text = text;
        p->token = p->text[p->pos];
}

The following code parses infix expressions -- no precedence, no parentheses since I'm only trying to get the idea across.

void parse(Parser *p)
{
        for (; ! p->done; next_work(p))
        {
                print_parser_state(p);

                int w = p->work[p->current];
                int t = p->token;

                if (w == W_prog)
                {
                        if (t)
                        {
                                push_work(p, W_expr);
                        }
                        else
                        {
                                fatal_f("Expected program.");
                        }
                }
                else if (w == W_expr)
                {
                        push_work(p, W_literal);
                        push_work(p, W_look_binop);
                }
                else if (w == W_literal)
                {
                        if (literal(t))
                        {
                                use_token(p);
                        }
                        else
                        {
                                fatal_f("Expected literal");
                        }
                }
                else if (w == W_look_binop)
                {
                        if (binop(t))
                        {
                                msg_f("Found trailing binop.");
                                push_work(p, W_binop_char);
                                push_work(p, W_expr);
                                push_work(p, W_make_binop);
                        }
                        else
                        {
                                msg_f("No trailing binop found.");
                        }
                }
                else if (w == W_binop_char)
                {
                        assert(binop(t));
                        use_token(p);
                }
        }
}

Here is a parse trace for the string '3+4'

	Input is '3+4'
		  ^
	Parse W_prog
	push work W_expr
	Input is '3+4'
		  ^
	Parse W_expr
	push work W_literal
	push work W_look_binop
	Input is '3+4'
		  ^
	Parse W_literal
	Use '3'
	Input is '3+4'
		   ^
	Parse W_look_binop
	Found trailing binop.
	push work W_binop_char
	push work W_expr
	push work W_make_binop
	Input is '3+4'
		   ^
	Parse W_binop_char
	Use '+'
	Input is '3+4'
		    ^
	Parse W_expr
	push work W_literal
	push work W_look_binop
	Input is '3+4'
		    ^
	Parse W_literal
	Use '4'
	Input is '3+4'
		     ^
	Parse W_look_binop
	No trailing binop found.
	Input is '3+4'
		     ^
	Parse W_make_binop