
/*
 * Copyright (C) 1999-2003, Ian Main <imain@stemwinder.org> and
 * Jim Meier <fatjim@home.com>
 *
 * All rights reserved.
 * 
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject
 * to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
 * CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
 * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 * 
 */


#include <roy.h>

typedef enum {
    RTHERML_STATE_START,
    RTHERML_NODE_OPENED,
    RTHERML_NODE_OPENED2,
    RTHERML_COMMENT,
    RTHERML_COMMENT2,
    RTHERML_COMMENT3,
    RTHERML_COMMENT_END,
    RTHERML_COMMENT_END2,
    RTHERML_CLOSING_NODE,
    RTHERML_CLOSING_NODE2,
    RTHERML_NODE_SELF_CLOSE,
    RTHERML_IN_NODE,
    RTHERML_NODE_DATA,
    RTHERML_ATTRIBUTE,
    RTHERML_FANCY_QUOTE,
    RTHERML_VALUE_LENGTH,
    RTHERML_VALUE_LENGTH2,
    RTHERML_VALUE_LENGTH3,
    RTHERML_VALUE_QUOTE,
    RTHERML_VALUE_QUOTE2,
    RTHERML_VALUE_QUOTE3,
    RTHERML_EQUAL_VALUE,
    RTHERML_QUOTE_VALUE,
    RTHERML_VALUE

} RThermlState;

/*
 * Push a new open element of the given type onto the parser's node stack.
 *
 * The node stores 'type' itself (no copy is made) and records the position
 * where the element name began: the current line, and the current column
 * minus the length of the name.
 */
static void
rtherml_node_push (RThermlParser *parser, RBuf *type)
{
    RThermlNode *top = rchunk_alloc0 (sizeof (RThermlNode));

    /* Link in front of the previous top of stack. */
    top->prev = parser->nodes;

    top->type = type;
    top->colnum = parser->colnum - rbuf_len (type);
    top->linenum = parser->linenum;

    parser->nodes = top;
}

/*
 * Detach and return the node on top of the parser's node stack.
 *
 * Returns NULL when the stack is empty.  The returned node is no longer
 * referenced by the parser.
 */
static RThermlNode *
rtherml_node_pop (RThermlParser *parser)
{
    RThermlNode *top = parser->nodes;

    if (top != NULL) {
        /* The node below becomes the new top of stack. */
        parser->nodes = top->prev;
    }

    return (top);
}

/*
 * Release a node together with its type buffer (when one is set).
 *
 * NOTE(review): tnode->data is not released here -- confirm who owns the
 * data buffer once it has been handed to the node-end handler.
 */
static void
rtherml_node_free (RThermlNode *tnode)
{
    RBuf *type = tnode->type;

    if (type != NULL) {
        rbuf_free (type);
    }

    rchunk_free (tnode, sizeof (RThermlNode));
}


/*
 * Allocate a zero-filled parser positioned at the start of a document
 * (start state, line 1).
 */
RThermlParser *
rtherml_parser_new (void)
{
    RThermlParser *fresh;

    fresh = rmem_alloc0 (sizeof (RThermlParser));
    fresh->linenum = 1;
    fresh->state = RTHERML_STATE_START;

    return (fresh);
}

/*
 * Drop everything accumulated while parsing and return the parser to its
 * initial state (start state, line 1).
 *
 * The whole structure is cleared with memset (), so registered handlers and
 * the user data pointer are wiped as well.
 */
static void
rtherml_parser_free_state (RThermlParser *parser)
{
    RThermlNode *open_node;

    rbuf_free (parser->prev_chunk);
    rbuf_free (parser->current_attrib);
    rbuf_free (parser->current_value);
    rbuf_free (parser->end_quote);

    /* rtherml_node_pop () returns NULL once the stack is empty. */
    for (open_node = rtherml_node_pop (parser);
         open_node != NULL;
         open_node = rtherml_node_pop (parser)) {
        rtherml_node_free (open_node);
    }

    memset (parser, 0, sizeof (*parser));

    parser->linenum = 1;
    parser->state = RTHERML_STATE_START;
}

/*
 * Destroy a parser: release all parse state it still holds, then the
 * parser structure itself.
 */
void
rtherml_parser_free (RThermlParser *parser)
{
    /* Buffers and open nodes first ... */
    rtherml_parser_free_state (parser);

    /* ... then the parser object. */
    rmem_free (parser);
}


/*
 * Attach an arbitrary caller-owned pointer to the parser.  It can be read
 * back with rtherml_get_user_data ().
 */
void
rtherml_set_user_data (RThermlParser *parser,
                       void *user_data)
{
    parser->user_data = user_data;
}

/*
 * Return the pointer stored in the parser's user_data field.
 */
void *
rtherml_get_user_data (RThermlParser *parser)
{
    return (parser->user_data);
}

/*
 * Register the callback that receives parse errors.  It is called with the
 * parser, the current line and column, and the formatted message.
 */
void
rtherml_set_error_handler (RThermlParser *parser,
                           RThermlErrorHandler handler)
{
    parser->error_handler = handler;
}

/*
 * Register the callback invoked for every attribute/value pair found in a
 * tag.  It is called with the parser, the attribute name and the value.
 */
void
rtherml_set_attrib_value_handler (RThermlParser *parser, RThermlAttribValueHandler handler)
{
    parser->attrib_value_handler = handler;
}


/*
 * Register the callback invoked when an element is opened.  It is called
 * with the parser and the element type.
 */
void
rtherml_set_node_start_handler (RThermlParser *parser, RThermlNodeStartHandler handler)
{
    parser->node_start_handler = handler;
}

/*
 * Register the callback invoked when an element is closed.  It is called
 * with the parser, the element type and the element's text data (NULL for
 * self-closing elements).
 */
void
rtherml_set_node_end_handler (RThermlParser *parser,
                              RThermlNodeEndHandler handler)
{
    parser->node_end_handler = handler;
}

/*
 * Format an error message and pass it to the registered error handler,
 * together with the parser's current line and column.
 *
 * Does nothing when no error handler is registered.
 */
static void
rtherml_err (RThermlParser *parser, char *fmt, ...)
{
    RBuf *msg;
    va_list ap;

    if (!parser->error_handler) {
        return;
    }

    msg = rbuf_new ();

    va_start (ap, fmt);
    rbuf_append_vsprintf (msg, fmt, ap);
    va_end (ap);

    parser->error_handler (parser, parser->linenum, parser->colnum, rbuf_str (msg));

    /* The message buffer is only valid for the duration of the callback. */
    rbuf_free (msg);
}

/*
 * Complete a token that may have started in an earlier chunk: if text was
 * carried over in parser->prev_chunk, prepend it to 'buf' and drop the
 * carry-over buffer.  In every case the token start marker 's' is cleared.
 *
 * Relies on the locals 'parser' and 's' of the calling function.
 */
#define CHECK_PREV_CHUNK(buf) \
    do { \
        s = NULL; \
        if (parser->prev_chunk != NULL) { \
            rbuf_prepend_rbuf ((buf), parser->prev_chunk); \
            rbuf_free (parser->prev_chunk); \
            parser->prev_chunk = NULL; \
        } \
    } while (0)


/*
 * Step the cursor 'c' to the next input character, keeping the parser's
 * line/column counters up to date, and jump to the 'eob' label when the end
 * of the buffer is reached.
 *
 * Relies on the local 'parser', the end pointer 'e' and the label 'eob' of
 * the calling function.
 */
#define ADVANCE(c) \
    do { \
        if (*(c) != '\n') { \
            parser->colnum++; \
        } else { \
            parser->linenum++; \
            parser->colnum = 1; \
        } \
        ++(c); \
        if ((c) == e) { \
            goto eob; \
        } \
    } while (0)

/*
 * Work out how much of the end-quote string is still matched after a
 * mismatch.
 *
 * The last 'matched' input characters were equal to quote[0 .. matched-1]
 * and the next input character 'ch' did not continue the match.  Returns the
 * length of the longest prefix of 'quote' that is a suffix of that input
 * (including 'ch'), so overlapping candidates such as end quote "ab" in the
 * text "aab" are not lost.  Only the end-quote itself is inspected, which
 * keeps this correct across chunk boundaries.
 */
static int
rtherml_end_quote_resume (const char *quote, int matched, char ch)
{
    int keep;

    for (keep = matched; keep > 0; keep--) {
        int j;

        if (quote[keep - 1] != ch) {
            continue;
        }

        /* quote[0 .. keep-2] must equal quote[matched-keep+1 .. matched-1] */
        for (j = 0; j < keep - 1; j++) {
            if (quote[j] != quote[matched - keep + 1 + j]) {
                break;
            }
        }

        if (j == keep - 1) {
            return (keep);
        }
    }

    return (0);
}

/*
 * Feed one chunk of a document to the parser.
 *
 * The document may be split at arbitrary positions.  The current state is
 * stored in the parser when the chunk runs out, and the text of a token that
 * is still incomplete is carried over in parser->prev_chunk.
 *
 * parser         the parser to drive
 * chunk          the input bytes; must not be NULL
 * len            number of bytes in 'chunk'; a value <= 0 means "no data"
 * is_last_chunk  non-zero when no further input follows
 *
 * Returns 0 when the chunk was consumed without error and 1 on error.
 * Errors are reported through the registered error handler.  After a parse
 * error the parser is reset with rtherml_parser_free_state ().  When the
 * last chunk ends while elements are still open, 1 is returned and the open
 * nodes are left on the parser.
 *
 * Registered handlers signal failure by returning a negative value.
 */
int
rtherml_parse_chunk (RThermlParser *parser, char *chunk, int len,
                    int is_last_chunk)
{
    RThermlState st = parser->state;
    /* Current character */ 
    char *c = NULL;
    /* End character (one past the last byte of the chunk) */
    char *e = NULL;
    /* Character marking the start of the current token; NULL when no token
     * text needs to be carried over to the next chunk */
    char *s = NULL;

    if (chunk == NULL) {
        rtherml_err (parser, "NULL buffer passed to rtherml_parse_chunk ().");
        goto err;
    }

    /* A negative length would make the end pointer unreachable. */
    if (len <= 0) {
        goto eob;
    }

    c = chunk;
    e = &(chunk[len]);
    
    while (TRUE) {
        
        switch (st) {

            case RTHERML_STATE_START:
                /* At the start of the document.. expecting a node to open with '<'.
                 * Everything else after that is the element type. */
                while (TRUE) {
                    if (c == e) {
                        goto eob;
                    }
                    
                    if (*c == '<') {
                        st = RTHERML_NODE_OPENED;
                        ADVANCE (c);
                        break;
                    } else if (!isspace ((unsigned char) *c)) {
                        rtherml_err (parser, "Expected opening node or comment at document root.");
                        goto err;
                    }

                    ADVANCE (c);
                }
                /* Fallthrough */

            case RTHERML_NODE_OPENED:
                /* A '<' has been found.. it could now be a comment or the start 
                 * of a element name, or closing element name */
                if (*c == '!') {
                    /* Comment */
                    st = RTHERML_COMMENT;
                    ADVANCE (c);
                    /* This breaks out of case */
                    break;

                } else if (*c == '/') {
                    
                    /* Closing element - eg </foo> */
                    st = RTHERML_CLOSING_NODE;
                    ADVANCE (c);
                    /* Break out of case */
                    break;

                } else if (isspace ((unsigned char) *c)) {
                    /* We have a "< ", which is invalid */
                    rtherml_err (parser, "Expecting element name after opening '<', found whitespace.");
                    goto err;
                }
                st = RTHERML_NODE_OPENED2;
                /* Fallthrough */

            case RTHERML_NODE_OPENED2:
                
                /* Read the element type up to whitespace, '/' or '>'. */
                s = c;

                while (TRUE) {
                    
                    if (isspace ((unsigned char) *c) || *c == '/' || *c =='>') {
                        RBuf *type;
                        type = rbuf_new_with_data (s, c - s);
                        CHECK_PREV_CHUNK(type);
                        /* The node stack now owns 'type'. */
                        rtherml_node_push (parser, type);
                        
                        if (parser->node_start_handler) {
                            if (parser->node_start_handler (parser, type) < 0) {
                                goto err;
                            }
                        }

                        st = RTHERML_IN_NODE;

                        break;
                    }
                    ADVANCE (c);
                }
                /* Fallthrough */

            case RTHERML_IN_NODE:
                /* We are inside a node.. could come up on a new attribute, or
                 * a closing '>' or "/>" */
                while (TRUE) {
                    if (!isspace ((unsigned char) *c)) {
                        if (*c == '>') {
                            st = RTHERML_NODE_DATA;
                            ADVANCE (c);
                            break;
                        } 
                        
                        if (*c == '/') {
                            st = RTHERML_NODE_SELF_CLOSE;
                            ADVANCE (c);
                            break;
                        } else {
                            /* We're starting a new attribute */
                            st = RTHERML_ATTRIBUTE;
                            break;
                        }
                    }
                    ADVANCE (c);
                }

                break;

            case RTHERML_NODE_SELF_CLOSE:
                
                if (*c == '>') {
                    RThermlNode *node;
                    
                    node = rtherml_node_pop (parser);
                    if (node == NULL) {
                        rtherml_err (parser, "Closure for which no node was opened (unbalanced tree).");
                        goto err;
                    }

                    if (parser->node_end_handler) {
                        if (parser->node_end_handler (parser, node->type, NULL) < 0) {
                            /* The node is already off the stack, so the
                             * error path would not release it. */
                            rtherml_node_free (node);
                            goto err;
                        }
                    }
                    rtherml_node_free (node);

                    st = RTHERML_NODE_DATA;
                    ADVANCE (c);

                    break;

                } else {
                    rtherml_err (parser, "Found '/' in node, expected '>' to follow.");
                    goto err;
                }

            case RTHERML_ATTRIBUTE:
                /* Read the attribute name.  Whitespace or '=' selects plain
                 * quoting, '(' introduces a length or end-quote
                 * specification. */
                s = c;
                while (TRUE) {
                    
                    if (isspace ((unsigned char) *c) || *c == '=') {
                        parser->current_attrib = rbuf_new_with_data (s, c - s);
                        CHECK_PREV_CHUNK (parser->current_attrib);
                        
                        st = RTHERML_EQUAL_VALUE;
                        break;
                    }
                    if (*c == '(') {
                        parser->current_attrib = rbuf_new_with_data (s, c - s);
                        CHECK_PREV_CHUNK (parser->current_attrib);
                        
                        st = RTHERML_FANCY_QUOTE;
                        ADVANCE (c);
                        break;
                    }

                    ADVANCE (c);
                } 
                break;

            case RTHERML_EQUAL_VALUE:
                /* Plain assignment: skip whitespace up to the '=' that must
                 * follow the attribute name. */
                while (isspace ((unsigned char) *c)) {
                    ADVANCE (c);
                }

                if (*c != '=') {
                    rtherml_err (parser, "Expected '=' character after attribute name.");
                    goto err;
                }
                
                st = RTHERML_QUOTE_VALUE;
                ADVANCE (c);
                /* Fallthrough */

            case RTHERML_QUOTE_VALUE:

                /* Find the next non-whitespace char to use as our quote character */
                while (isspace ((unsigned char) *c)) {
                    ADVANCE (c);
                }

                parser->quote_char = *c;

                st = RTHERML_VALUE;
                ADVANCE (c);
                /* Fallthrough */

            case RTHERML_VALUE:

                /* We are now positioned at the value, just start pushing data until we find the closing
                 * character. */
                s = c;
                while (*c != parser->quote_char) {
                    ADVANCE (c);
                }

                parser->current_value = rbuf_new_with_data (s, c - s);
                CHECK_PREV_CHUNK (parser->current_value);

                /* When a handler is registered the buffers are handed to it
                 * and not released here; otherwise they are dropped. */
                if (parser->attrib_value_handler) {
                    if (parser->attrib_value_handler (parser, parser->current_attrib, 
                                                      parser->current_value) < 0) {
                        goto err;
                    }
                } else {
                    rbuf_free (parser->current_attrib);
                    rbuf_free (parser->current_value);
                }

                parser->current_attrib = NULL;
                parser->current_value = NULL;

                st = RTHERML_IN_NODE;
                ADVANCE (c);
                break;

            case RTHERML_FANCY_QUOTE:

                /* Found a '(', eat characters until we find a 
                 * ')'. */
                s = c;
                while (TRUE) {
                    if (*c == ')') {
                        RBuf *buf;
                        
                        buf = rbuf_new_with_data (s, c - s);
                        CHECK_PREV_CHUNK (buf);
                        
                        if (rbuf_len (buf) == 0) {
                            rtherml_err (parser, "Empty length or end quote specification \"()\" not allowed.");
                            rbuf_free (buf);
                            goto err;
                        }

                        if (isdigit ((unsigned char) rbuf_str (buf)[0])) {
                            /* "(123)": the value has a fixed length. */
                            parser->value_length = (int) strtol (rbuf_str (buf), (char **) NULL, 10);
                            rbuf_free (buf);
                            st = RTHERML_VALUE_LENGTH;
                            ADVANCE (c);
                            break;
                        } else {
                            /* "(END)": the value runs until that string. */
                            parser->end_quote = buf;
                            st = RTHERML_VALUE_QUOTE;
                            ADVANCE (c);
                            break;
                        }
                    }
                    ADVANCE (c);
                }
                break;

            case RTHERML_VALUE_LENGTH:

                if (*c != '=') {
                    rtherml_err (parser, "Expecting '=' character after length specification.  eg attrib(5)=\"value\".");
                    goto err;
                }

                st = RTHERML_VALUE_LENGTH2;
                ADVANCE (c);
                /* Fallthrough */
            
            case RTHERML_VALUE_LENGTH2:

                if (*c != '\"') {
                    rtherml_err (parser, "Expecting quote '\"' character after length specification.  eg attrib(5)=\"value\".");
                    goto err;
                }

                st = RTHERML_VALUE_LENGTH3;
                ADVANCE (c);
                /* Fallthrough */

            case RTHERML_VALUE_LENGTH3:

                s = c;
                /* Chew up chars until our length is done.  parser->i keeps
                 * the count across chunk boundaries. */
                while (parser->i < parser->value_length) {
                    parser->i++;
                    ADVANCE (c);
                }

                if (*c != '\"') {
                    rtherml_err (parser, "Length specified values should end with a quote '\"' character.");
                    goto err;
                }

                parser->current_value = rbuf_new_with_data (s, c - s);
                CHECK_PREV_CHUNK (parser->current_value);

                if (parser->attrib_value_handler) {
                    if (parser->attrib_value_handler (parser, parser->current_attrib, 
                                                      parser->current_value) < 0) {
                        goto err;
                    }
                } else {
                    rbuf_free (parser->current_attrib);
                    rbuf_free (parser->current_value);
                }

                parser->current_attrib = NULL;
                parser->current_value = NULL;
                parser->i = 0;

                st = RTHERML_IN_NODE;
                ADVANCE (c);
                break;

            case RTHERML_VALUE_QUOTE:
                
                if (*c != '=') {
                    rtherml_err (parser, "Expecting '=' character after end quote specification.");
                    goto err;
                }

                st = RTHERML_VALUE_QUOTE2;
                ADVANCE (c);
                /* Fallthrough */


            case RTHERML_VALUE_QUOTE2:
                
                if (*c != '\"') {
                    rtherml_err (parser, "Expecting a quote '\"' character after end quote specification.");
                    goto err;
                }

                st = RTHERML_VALUE_QUOTE3;
                ADVANCE (c);
                /* Fallthrough */

            case RTHERML_VALUE_QUOTE3:

                s = c;

                /* parser->i counts how many characters of the end quote
                 * are currently matched; it survives chunk boundaries. */
                while (parser->i < rbuf_len (parser->end_quote)) {
                    if (*c == rbuf_str (parser->end_quote)[parser->i]) {
                        parser->i++;
                    } else {
                        /* Keep whatever part of the match is still valid
                         * instead of restarting from scratch. */
                        parser->i = rtherml_end_quote_resume (rbuf_str (parser->end_quote),
                                                              parser->i, *c);
                    }
                    ADVANCE (c);
                }
                
                /* It all checked out.. save our value */
                parser->current_value = rbuf_new_with_data (s, c - s);
                CHECK_PREV_CHUNK (parser->current_value);

                /* Now we have to truncate the end quote. */
                rbuf_truncate (parser->current_value, rbuf_len (parser->current_value) - rbuf_len (parser->end_quote));

                if (parser->attrib_value_handler) {
                    if (parser->attrib_value_handler (parser, parser->current_attrib, 
                                                      parser->current_value) < 0) {
                        goto err;
                    }
                } else {
                    rbuf_free (parser->current_attrib);
                    rbuf_free (parser->current_value);
                }

                parser->current_attrib = NULL;
                parser->current_value = NULL;

                rbuf_free (parser->end_quote);
                parser->end_quote = NULL;
                parser->i = 0;
   
                /* 'c' already points past the end quote; no ADVANCE here. */
                st = RTHERML_IN_NODE;
                break;
                        

            case RTHERML_CLOSING_NODE:
                /* Closing a node.. found a "</". */
                if (*c == '>' ||
                    *c == '/') {
                    rtherml_err (parser, "Element types may not start with a '>' or '/'.");
                    goto err;
                }

                if (isspace ((unsigned char) *c)) {
                    rtherml_err (parser, "Whitespace found after '</', expecting element name.");
                    goto err;
                }

                st = RTHERML_CLOSING_NODE2;
                /* Fallthrough */

            case RTHERML_CLOSING_NODE2:
                
                s = c;
                while (TRUE) {
                    if (isspace ((unsigned char) *c)) {
                        rtherml_err (parser, "Whitespace found in element closure.");
                        goto err;
                    } else if (*c == '>') {
                        RThermlNode *node;
                        RBuf *closure;

                        node = rtherml_node_pop (parser);
                        if (!node) {
                            rtherml_err (parser, "Closing node for which no node was opened (unbalanced tree).");
                            goto err;
                        }
                        closure = rbuf_new_with_data (s, c - s);
                        CHECK_PREV_CHUNK (closure);

                        if (!rbuf_equal_rbuf (node->type, closure)) {
                            rtherml_err (parser, "Closing node of wrong type (unbalanced tree).\nMatching node is of type '%b', line %d, column %d.",
                                        node->type, node->linenum, node->colnum);
                            rbuf_free (closure);
                            /* Popped nodes are not reachable from the
                             * parser any more; release it here. */
                            rtherml_node_free (node);
                            goto err;
                        }

                        rbuf_free (closure);
                        
                        if (parser->node_end_handler) {
                            if (parser->node_end_handler (parser, node->type, node->data) < 0) {
                                rtherml_node_free (node);
                                goto err;
                            }
                        }
      
                        rtherml_node_free (node);

                        st = RTHERML_NODE_DATA;
                        ADVANCE (c);
                        break;
                    }

                    ADVANCE (c);
                }

                break;

            case RTHERML_NODE_DATA:

                s = c;
                /* Scan the text between tags up to the next '<'.  The text
                 * is kept only for nodes in which a non-whitespace
                 * character has been seen. */
                while (TRUE) {
                    if (*c == '<') {
                        RThermlNode *node = parser->nodes;
                        if (node && node->has_data) {
                            if (!node->data) {
                                node->data = rbuf_new ();
                            }
                            /* Text carried over from the previous chunk
                             * belongs after what the node already holds and
                             * before the text of this chunk. */
                            if (parser->prev_chunk) {
                                rbuf_append_data (node->data, rbuf_str (parser->prev_chunk),
                                                  rbuf_len (parser->prev_chunk));
                                rbuf_free (parser->prev_chunk);
                                parser->prev_chunk = NULL;
                            }
                            rbuf_append_data (node->data, s, c - s);
                            s = NULL;
                        } else {
                            /* Be sure to reset 's' if there was no data */
                            s = NULL;
                            /* Also have to clean out any old data */
                            if (parser->prev_chunk) {
                                rbuf_free (parser->prev_chunk);
                                parser->prev_chunk = NULL;
                            }
                        }

                        st = RTHERML_NODE_OPENED;
                        ADVANCE (c);
                        break;
                    }
                    if (!isspace ((unsigned char) *c)) {
                        if (parser->nodes) {
                            parser->nodes->has_data = TRUE;
                        }
                    }
                    ADVANCE (c);
                }
                break;


            case RTHERML_COMMENT:
                if (*c != '-') {
                    /* Oops, we have '<!' but garbage after it */
                    rtherml_err (parser, "Expected '-' after '<!' to open a comment.");
                    goto err;
                }

                st = RTHERML_COMMENT2;
                ADVANCE (c);
                /* Fallthrough */

            case RTHERML_COMMENT2:
                if (*c != '-') {
                    /* Oops, we have '<!-' but not another - */
                    rtherml_err (parser, "Expected '-' after '<!-' to open a comment.");
                    goto err;
                }

                st = RTHERML_COMMENT3;
                ADVANCE (c);
                /* Fallthrough */

            case RTHERML_COMMENT3:

                /* Now we chew up stuff until we hit a "-->" */
                while (TRUE) {
                    if (*c == '-') {
                        st = RTHERML_COMMENT_END;
                        ADVANCE (c);
                        break;
                    }
                    ADVANCE (c);
                }
                /* Fallthrough */

            case RTHERML_COMMENT_END:

                /* One '-' seen; a second one may start the terminator. */
                if (*c != '-') {
                    st = RTHERML_COMMENT3;
                } else {
                    st = RTHERML_COMMENT_END2;
                }

                ADVANCE (c);
                break;

            case RTHERML_COMMENT_END2:

                /* "--" seen.  A further '-' keeps the last two characters
                 * at "--", so stay here; that way "--->" ends the comment. */
                if (*c == '>') {
                    st = RTHERML_NODE_DATA;
                } else if (*c != '-') {
                    st = RTHERML_COMMENT3;
                }
                
                ADVANCE (c);
                break;

            default:
                /* A corrupt state would otherwise spin here forever. */
                rtherml_err (parser, "Internal error: unknown parser state %d.", (int) st);
                goto err;
        }
    }

    /* End of buffer */
eob:
    if (is_last_chunk) {
        if (parser->nodes) {
            rtherml_err (parser, "Reached end of document but nodes are still left open.\nLast opened node was of type '%b', line %d, column %d",
                        parser->nodes->type, parser->nodes->linenum, parser->nodes->colnum);
            return (1);
        }

        rtherml_parser_free_state (parser);

        return (0);
    } else {
        /* Remember where we stopped, plus the text of any unfinished token. */
        parser->state = st;
        if (s) {
            if (!parser->prev_chunk) {
                parser->prev_chunk = rbuf_new ();
            }
            rbuf_append_data (parser->prev_chunk, s, e - s);
        }
        
        return (0);
    }

err:
    rtherml_parser_free_state (parser);
    return (1);

}



