
 /***************************************************************************/

/*
 * Portions Copyright (c) 1999 GMRS Software GmbH
 * Carl-von-Linde-Str. 38, D-85716 Unterschleissheim, http://www.gmrs.de
 * All rights reserved.
 *
 * Author: Arno Unkrig <arno@unkrig.de>
 *
 * All advertising materials mentioning features or use of this software
 * must display the following acknowledgement:
 * "This product includes software developed by GMRS Software GmbH."
 * The name of GMRS Software GmbH may not be used to endorse or promote
 * products derived from this software without specific prior written
 * permission.
 */
 
/* This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License in the file COPYING for more details.
 */

 /***************************************************************************/

/*
 * Changes to version 1.2.2 were made by Martin Bayer <mbayer@zedat.fu-berlin.de>
 * Dates and reasons of modifications:
 * Thu Oct  4 22:14:38 CEST 2001: included EURO-sign
 */
  
 /***************************************************************************/


#include <stdlib.h>
#include <string.h>
#include <ctype.h>

#include "html.h"
#include "sgml.h"

#ifndef nelems
#define nelems(array) (sizeof(array) / sizeof((array)[0]))
#endif

/* ------------------------------------------------------------------------- */

/*
 * Lookup table mapping SGML/HTML entity names (written without the leading
 * '&' and the trailing ';') to the character code that replaces them.
 *
 * Keep this array sorted alphabetically! More precisely, it must be sorted in
 * strcmp() order (so uppercase letters sort before lowercase ones), because
 * replace_sgml_entities() looks names up with bsearch() and compares them
 * with strcmp().
 */
static const struct TextToInt {
  char name[7];  /* Entity name; room for six characters plus the NUL. */
  int  code;     /* Replacement character code. */
} latin1_entities[] = {
  { "AElig",  LATIN1_AElig   },
  { "Aacute", LATIN1_Aacute  },
  { "Acirc",  LATIN1_Acirc   },
  { "Agrave", LATIN1_Agrave  },
  { "Aring",  LATIN1_Aring   },
  { "Atilde", LATIN1_Atilde  },
  { "Auml",   LATIN1_Auml    },
  { "Ccedil", LATIN1_Ccedil  },
  { "ETH",    LATIN1_ETH     },
  { "Eacute", LATIN1_Eacute  },
  { "Ecirc",  LATIN1_Ecirc   },
  { "Egrave", LATIN1_Egrave  },
  { "Euml",   LATIN1_Euml    },
  { "Iacute", LATIN1_Iacute  },
  { "Icirc",  LATIN1_Icirc   },
  { "Igrave", LATIN1_Igrave  },
  { "Iuml",   LATIN1_Iuml    },
  { "Ntilde", LATIN1_Ntilde  },
  { "Oacute", LATIN1_Oacute  },
  { "Ocirc",  LATIN1_Ocirc   },
  { "Ograve", LATIN1_Ograve  },
  { "Oslash", LATIN1_Oslash  },
  { "Otilde", LATIN1_Otilde  },
  { "Ouml",   LATIN1_Ouml    },
  { "THORN",  LATIN1_THORN   },
  { "Uacute", LATIN1_Uacute  },
  { "Ucirc",  LATIN1_Ucirc   },
  { "Ugrave", LATIN1_Ugrave  },
  { "Uuml",   LATIN1_Uuml    },
  { "Yacute", LATIN1_Yacute  },
  { "aacute", LATIN1_aacute  },
  { "acirc",  LATIN1_acirc   },
  { "acute",  LATIN1_acute   },
  { "aelig",  LATIN1_aelig   },
  { "agrave", LATIN1_agrave  },
  { "amp",    '&'            },
  { "aring",  LATIN1_aring   },
  { "atilde", LATIN1_atilde  },
  { "auml",   LATIN1_auml    },
  { "brvbar", LATIN1_brvbar  },
  { "ccedil", LATIN1_ccedil  },
  { "cedil",  LATIN1_cedil   },
  { "cent",   LATIN1_cent    },
  { "copy",   LATIN1_copy    },
  { "curren", LATIN1_curren  },
  { "deg",    LATIN1_deg     },
  { "divide", LATIN1_divide  },
  { "eacute", LATIN1_eacute  },
  { "ecirc",  LATIN1_ecirc   },
  { "egrave", LATIN1_egrave  },
  { "eth",    LATIN1_eth     },
  { "euml",   LATIN1_euml    },
  /* Latin-1 has no euro sign; the currency sign is used as a substitute. */
  { "euro",   LATIN1_curren  },
  { "frac12", LATIN1_frac12  },
  { "frac14", LATIN1_frac14  },
  { "frac34", LATIN1_frac34  },
  { "gt",     '>'            },
  { "iacute", LATIN1_iacute  },
  { "icirc",  LATIN1_icirc   },
  { "iexcl",  LATIN1_iexcl   },
  { "igrave", LATIN1_igrave  },
  { "iquest", LATIN1_iquest  },
  { "iuml",   LATIN1_iuml    },
  { "laquo",  LATIN1_laquo   },
  { "lt",     '<'            },
  { "macr",   LATIN1_macr    },
  { "micro",  LATIN1_micro   },
  { "middot", LATIN1_middot  },
  { "nbsp",   LATIN1_nbsp    },
  { "not",    LATIN1_not     },
  { "ntilde", LATIN1_ntilde  },
  { "oacute", LATIN1_oacute  },
  { "ocirc",  LATIN1_ocirc   },
  { "ograve", LATIN1_ograve  },
  { "ordf",   LATIN1_ordf    },
  { "ordm",   LATIN1_ordm    },
  { "oslash", LATIN1_oslash  },
  { "otilde", LATIN1_otilde  },
  { "ouml",   LATIN1_ouml    },
  { "para",   LATIN1_para    },
  { "plusmn", LATIN1_plusmn  },
  { "pound",  LATIN1_pound   },
  { "quot",   '"'            },
  { "raquo",  LATIN1_raquo   },
  { "reg",    LATIN1_reg     },
  { "sect",   LATIN1_sect    },
  { "shy",    LATIN1_shy     },
  { "sup1",   LATIN1_sup1    },
  { "sup2",   LATIN1_sup2    },
  { "sup3",   LATIN1_sup3    },
  { "szlig",  LATIN1_szlig   },
  { "thorn",  LATIN1_thorn   },
  { "times",  LATIN1_times   },
  { "uacute", LATIN1_uacute  },
  { "ucirc",  LATIN1_ucirc   },
  { "ugrave", LATIN1_ugrave  },
  { "uml",    LATIN1_uml     },
  { "uuml",   LATIN1_uuml    },
  { "yacute", LATIN1_yacute  },
  { "yen",    LATIN1_yen     },
  { "yuml",   LATIN1_yuml    },
};

/*
 * Well, &curren; is not really the same as &euro;, but Latin-1 does not have
 * the EURO-sign. So, this is probably the best we can do...
 */

/* ------------------------------------------------------------------------- */

void
replace_sgml_entities(string *s)
{
  string::size_type j = 0;
  
  for (;;) {
    string::size_type l = s->length();

    /*
     * Skip characters before ampersand.
     */
    while (j < l && s->at(j) != '&') ++j;
    if (j >= l) break;

    /*
     * So we have an ampersand...
     */

    /*
     * Don't process the last three characters; an SGML entity wouldn't fit
     * in anyway!
     */
    if (j + 3 >= l) break;          // Watch out! Unsigned arithmetics!

    string::size_type beg = j++;    // Skip the ampersand;

    /*
     * Look at the next character.
     */
    char c = s->at(j++);
    if (c == '#') {

      /*
       * Decode entities like "&#233;".
       * Some authors forget the ";", but we tolerate this.
       */
      c = s->at(j++);
      if (isdigit(c)) {
        int x = c - '0';
        for (; j < l; ++j) {
          c = s->at(j);
          if (c == ';') { ++j; break; }
          if (!isdigit(c)) break;
          x = 10 * x + c - '0';
        }
        s->replace(beg, j - beg, 1, (char) x);
        j = beg + 1;
      }
    } else

    if (isalpha(c)) {

      /*
       * Decode entities like "&nbsp;".
       * Some authors forget the ";", but we tolerate this.
       */
      char name[8];
      name[0] = c;
      size_t i = 1;
      for (; j < l; ++j) {
        c = s->at(j);
        if (c == ';') { ++j; break; }
        if (!isalnum(c)) break;
        if (i < sizeof(name) - 1) name[i++] = c;
      }
      name[i] = '\0';

      const TextToInt *entity = (const TextToInt *) bsearch(
        name,
        latin1_entities, nelems(latin1_entities), sizeof(TextToInt),
        (int (*)(const void *, const void *)) strcmp
      );
      if (entity != NULL) {
        s->replace(beg, j - beg, 1, (char) entity->code);
        j = beg + 1;
      }
    } else {
      ;                         /* EXTENSION: Allow literal '&' sometimes. */
    }
  }
}

/* ------------------------------------------------------------------------- */

