/* This is a full-featured entity fixer.
 * I have to process far too many defective XML files out there ..
 *
 * This does not fix problems in the XML file structure,
 * however it fixes any problems you might observe with unknown entities.
 * This knows all common HTML entities and transforms them into XML entities.
 * Additionally it knows about double escapes.
 * It also fixes defective escapes (or not escaped & signs).
 *
 * This assumes there are no entities defined in the XML file.
 * Additionally see latin1-utf8.c to fix lazy character encodings.
 *
 * This should be built into xml2sql, as we have access to the known entities there.
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>

#if 0
#define TINO_DEBUG
#endif

#include "tino-debug.c"

/* Table of the named entities known to this filter.
 *
 * utf8:   despite its name this is the Unicode code point which
 *         put_named() writes as a numeric reference (&#x..;), or
 *         one of two negative markers:
 *           -1  entity is predefined in XML, copied through as &name;
 *           -2  entity is the ampersand escape (&amp;), which makes
 *               put_named() return 1 so that multi-escapes are unrolled
 * entity: the entity name without '&' and ';' (compared case sensitive)
 *
 * The table is terminated by an entry with a NULL entity name.
 *
 * The names and code points are those of the HTML 4.01 character entity
 * references plus the XML predefined entities.
 */
static struct ent
  {
    int         utf8;
    const char  *entity;
  } entities[] =
  {
    /* XML predefined entities */
    { -2, "amp" },
    { -1, "lt" },
    { -1, "gt" },
    { -1, "apos" },

    { 34, "quot" },

    /* ISO 8859-1 characters */
    { 160, "nbsp" },
    { 161, "iexcl" },
    { 162, "cent" },
    { 163, "pound" },
    { 164, "curren" },
    { 165, "yen" },
    { 166, "brvbar" },
    { 167, "sect" },
    { 168, "uml" },
    { 169, "copy" },
    { 170, "ordf" },
    { 171, "laquo" },
    { 172, "not" },
    { 173, "shy" },
    { 174, "reg" },
    { 175, "macr" },
    { 176, "deg" },
    { 177, "plusmn" },
    { 178, "sup2" },
    { 179, "sup3" },
    { 180, "acute" },
    { 181, "micro" },
    { 182, "para" },
    { 183, "middot" },
    { 184, "cedil" },
    { 185, "sup1" },
    { 186, "ordm" },
    { 187, "raquo" },
    { 188, "frac14" },
    { 189, "frac12" },
    { 190, "frac34" },
    { 191, "iquest" },
    { 192, "Agrave" },
    { 193, "Aacute" },
    { 194, "Acirc" },
    { 195, "Atilde" },
    { 196, "Auml" },
    { 197, "Aring" },
    { 198, "AElig" },
    { 199, "Ccedil" },
    { 200, "Egrave" },
    { 201, "Eacute" },
    { 202, "Ecirc" },
    { 203, "Euml" },
    { 204, "Igrave" },
    { 205, "Iacute" },
    { 206, "Icirc" },
    { 207, "Iuml" },
    { 208, "ETH" },
    { 209, "Ntilde" },
    { 210, "Ograve" },
    { 211, "Oacute" },
    { 212, "Ocirc" },
    { 213, "Otilde" },
    { 214, "Ouml" },
    { 215, "times" },
    { 216, "Oslash" },
    { 217, "Ugrave" },
    { 218, "Uacute" },
    { 219, "Ucirc" },
    { 220, "Uuml" },
    { 221, "Yacute" },
    { 222, "THORN" },
    { 223, "szlig" },
    { 224, "agrave" },
    { 225, "aacute" },
    { 226, "acirc" },
    { 227, "atilde" },
    { 228, "auml" },
    { 229, "aring" },
    { 230, "aelig" },
    { 231, "ccedil" },
    { 232, "egrave" },
    { 233, "eacute" },
    { 234, "ecirc" },
    { 235, "euml" },
    { 236, "igrave" },
    { 237, "iacute" },
    { 238, "icirc" },
    { 239, "iuml" },
    { 240, "eth" },
    { 241, "ntilde" },
    { 242, "ograve" },
    { 243, "oacute" },
    { 244, "ocirc" },
    { 245, "otilde" },
    { 246, "ouml" },
    { 247, "divide" },
    { 248, "oslash" },
    { 249, "ugrave" },
    { 250, "uacute" },
    { 251, "ucirc" },
    { 252, "uuml" },
    { 253, "yacute" },
    { 254, "thorn" },
    { 255, "yuml" },

    /* Latin Extended and spacing modifier letters */
    { 338, "OElig" },
    { 339, "oelig" },
    { 352, "Scaron" },
    { 353, "scaron" },
    { 376, "Yuml" },
    { 402, "fnof" },
    { 710, "circ" },
    { 732, "tilde" },

    /* Greek */
    { 913, "Alpha" },
    { 914, "Beta" },
    { 915, "Gamma" },
    { 916, "Delta" },
    { 917, "Epsilon" },
    { 918, "Zeta" },
    { 919, "Eta" },
    { 920, "Theta" },
    { 921, "Iota" },
    { 922, "Kappa" },
    { 923, "Lambda" },
    { 924, "Mu" },
    { 925, "Nu" },
    { 926, "Xi" },
    { 927, "Omicron" },
    { 928, "Pi" },
    { 929, "Rho" },
    { 931, "Sigma" },
    { 932, "Tau" },
    { 933, "Upsilon" },
    { 934, "Phi" },
    { 935, "Chi" },
    { 936, "Psi" },
    { 937, "Omega" },
    { 945, "alpha" },
    { 946, "beta" },
    { 947, "gamma" },
    { 948, "delta" },
    { 949, "epsilon" },
    { 950, "zeta" },
    { 951, "eta" },
    { 952, "theta" },
    { 953, "iota" },
    { 954, "kappa" },
    { 955, "lambda" },
    { 956, "mu" },
    { 957, "nu" },
    { 958, "xi" },
    { 959, "omicron" },
    { 960, "pi" },
    { 961, "rho" },
    { 962, "sigmaf" },
    { 963, "sigma" },
    { 964, "tau" },
    { 965, "upsilon" },
    { 966, "phi" },
    { 967, "chi" },
    { 968, "psi" },
    { 969, "omega" },
    { 977, "thetasym" },
    { 978, "upsih" },
    { 982, "piv" },

    /* General punctuation, letterlike symbols, arrows */
    { 8194, "ensp" },
    { 8195, "emsp" },
    { 8201, "thinsp" },
    { 8204, "zwnj" },
    { 8205, "zwj" },
    { 8206, "lrm" },
    { 8207, "rlm" },
    { 8211, "ndash" },
    { 8212, "mdash" },
    { 8216, "lsquo" },
    { 8217, "rsquo" },
    { 8218, "sbquo" },
    { 8220, "ldquo" },
    { 8221, "rdquo" },
    { 8222, "bdquo" },
    { 8224, "dagger" },
    { 8225, "Dagger" },
    { 8226, "bull" },
    { 8230, "hellip" },
    { 8240, "permil" },
    { 8242, "prime" },
    { 8243, "Prime" },
    { 8249, "lsaquo" },
    { 8250, "rsaquo" },
    { 8254, "oline" },
    { 8260, "frasl" },
    { 8364, "euro" },
    { 8465, "image" },
    { 8472, "weierp" },
    { 8476, "real" },
    { 8482, "trade" },
    { 8501, "alefsym" },
    { 8592, "larr" },
    { 8593, "uarr" },
    { 8594, "rarr" },
    { 8595, "darr" },
    { 8596, "harr" },
    { 8629, "crarr" },
    { 8656, "lArr" },
    { 8657, "uArr" },
    { 8658, "rArr" },
    { 8659, "dArr" },
    { 8660, "hArr" },

    /* Mathematical operators */
    { 8704, "forall" },
    { 8706, "part" },
    { 8707, "exist" },
    { 8709, "empty" },
    { 8711, "nabla" },
    { 8712, "isin" },
    { 8713, "notin" },
    { 8715, "ni" },
    { 8719, "prod" },
    { 8721, "sum" },
    { 8722, "minus" },
    { 8727, "lowast" },
    { 8730, "radic" },
    { 8733, "prop" },
    { 8734, "infin" },
    { 8736, "ang" },
    { 8743, "and" },      /* U+2227 logical and (was wrongly 8869) */
    { 8744, "or" },       /* U+2228 logical or (was wrongly 8870) */
    { 8745, "cap" },
    { 8746, "cup" },
    { 8747, "int" },
    { 8756, "there4" },
    { 8764, "sim" },
    { 8773, "cong" },
    { 8776, "asymp" },
    { 8800, "ne" },
    { 8801, "equiv" },
    { 8804, "le" },
    { 8805, "ge" },
    { 8834, "sub" },
    { 8835, "sup" },
    { 8836, "nsub" },
    { 8838, "sube" },
    { 8839, "supe" },
    { 8853, "oplus" },
    { 8855, "otimes" },
    { 8869, "perp" },
    { 8901, "sdot" },

    /* Miscellaneous technical and symbols */
    { 8968, "lceil" },
    { 8969, "rceil" },
    { 8970, "lfloor" },
    { 8971, "rfloor" },
    { 9001, "lang" },
    { 9002, "rang" },
    { 9674, "loz" },
    { 9824, "spades" },
    { 9827, "clubs" },
    { 9829, "hearts" },
    { 9830, "diams" },

    /* End of table marker (entity==NULL) */
    { 0 }
  };

/* Handle a named entity.  The caller has consumed "&" and passes the
 * character following it in c (which is neither '#' nor EOF).
 *
 * The name is collected from c and stdin up to the terminating ';'
 * and then looked up in the entities table:
 *
 * - code -2 (the ampersand escape &amp;): nothing is written and
 *   1 is returned, so the caller loops again and thereby unrolls
 *   the often observed multi-escapes like &amp;amp;lt;
 * - code -1 (special meaning in XML): copied through as &name;
 * - any other code: written as hex numeric reference &#x..;
 * - name not in table: the ampersand is escaped, giving &amp;name;
 *
 * If the name is cut short by a non-alphanumeric character, by a
 * full buffer or by EOF, the ampersand is escaped (&amp;) followed
 * by what was collected so far.  The offending character (if any)
 * is pushed back to stdin.
 *
 * Returns 1 only in the -2 case above, else 0.
 */
static int
put_named(int c)
{
  char          name[BUFSIZ];
  struct ent    *ent;
  int           len;

  DP(("put_named(%c)", c));

  /* Collect the entity name up to (excluding) the ';' */
  name[0] = 0;
  len = 0;
  while (c!=';')
    {
      if (len>=sizeof name-1 || !isalnum(c))
        {
          /* Not an entity: let the caller see this character again */
          ungetc(c, stdin);
          printf("&amp;%s", name);
          DP(("put_named() &amp;%s<%02x>", name, c));
          return 0;
        }

      name[len++] = c;
      name[len] = 0;

      c = getchar();
      if (c==EOF)
        {
          printf("&amp;%s", name);
          DP(("put_named() &amp;%s<EOF>", name));
          return 0;
        }
    }
  DP(("put_named() buf=%s", name));

  /* Linear search, the table ends with a NULL name */
  for (ent=entities; ent->entity; ent++)
    {
      if (strcmp(ent->entity, name))
        continue;

      /* Saw an escape for ampersand, tell caller to re-loop */
      if (ent->utf8== -2)
        return 1;

      if (ent->utf8<0)
        printf("&%s;", name);
      else
        printf("&#x%02x;", ent->utf8);
      return 0;
    }

  /* Unknown entity name */
  printf("&amp;%s;", name);
  return 0;
}

/* Tell if a code point may be used in a character reference.
 *
 * This follows the Char production of XML 1.0:
 * #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
 *
 * Returns nonzero if l is allowed, 0 otherwise.
 */
static int
put_numeric_valid(unsigned long l)
{
  if (l<0x20ul)
    return l==0x9ul || l==0xaul || l==0xdul;
  if (l>=0xd800ul && l<=0xdffful)
    return 0;           /* surrogates */
  if (l==0xfffeul || l==0xfffful)
    return 0;           /* noncharacters excluded by XML */
  return l<=0x10fffful;
}

/* We have seen an &# so far.
 * Read the rest of a numeric entity from stdin and write it to stdout.
 *
 * Accepted are &#<decimal digits>; and &#x<hex digits>; (lowercase x).
 * If the number designates a character allowed in XML (see
 * put_numeric_valid) the entity is written in hex as &#x..;
 *
 * Everything else (no digits, missing ';', number too long for the
 * buffer, number out of range, EOF) is an invalid entity.  Then the
 * ampersand is escaped, so &#<something> becomes &amp;#<something>,
 * and the character which ended the sequence (if any) is pushed back
 * to stdin to be processed again by the caller.
 */
static void
put_numeric(void)
{
  char          buf[BUFSIZ];
  int           c;
  size_t        i, offs;
  unsigned long l;
  char          *end;

  DP(("put_numeric()"));
  if ((c=getchar())==EOF)
    {
      printf("&amp;#");
      return;
    }

  offs  = 0;
  if (c=='x')
    {
      /* Remember that we have to ignore the x in the buf */
      offs      = 1;
    }
  else if (!isdigit(c))
    {
      ungetc(c, stdin);
      printf("&amp;#");
      return;
    }

  for (i=0;;)
    {
      /* There always is room for c and the NUL, see check below */
      buf[i++]  = (char)c;
      buf[i]    = 0;

      if ((c=getchar())==EOF)
        break;
      if (c==';' && i>offs)
        {
          /* buf+offs only contains digits of the proper base here.
           * On overflow strtoul() returns ULONG_MAX, which is
           * rejected by the range check.
           */
          l     = strtoul(buf+offs, &end, (offs ? 16 : 10));
          if (end==buf+offs || *end || !put_numeric_valid(l))
            break;

          /* Return valid entity, rewrite it as hex always
           */
          printf("&#x%02lx;", l);
          return;
        }
      if (i>=sizeof buf-1 || (offs ? !isxdigit(c) : !isdigit(c)))
        break;
    }

  /* Illegal sequence
   */
  if (c!=EOF)
    ungetc(c, stdin);
  printf("&amp;#%s", buf);
}

/* We have seen an & so far.
 *
 * Look at the next character and dispatch:
 * - EOF: the lone ampersand is written escaped as &amp;
 * - '#': the rest is handled by put_numeric()
 * - else: the rest is handled by put_named().  As long as
 *   put_named() returns nonzero the next character is examined
 *   the same way again.
 */
static void
put_entity(void)
{
  DP(("put_entity()"));
  for (;;)
    {
      int       next;

      next = getchar();
      if (next==EOF)
        {
          printf("&amp;");
          return;
        }
      if (next=='#')
        {
          put_numeric();
          return;
        }
      if (!put_named(next))
        return;
    }
}

/* Copy stdin to stdout.  Each '&' is handed to put_entity(),
 * all other characters are copied unchanged.
 *
 * The commandline is not used.
 *
 * Returns EXIT_SUCCESS if everything was read and written,
 * EXIT_FAILURE on a read error on stdin or a write error on stdout.
 */
int
main(int argc, char **argv)
{
  int   c;

  (void)argc;
  (void)argv;

  /* XXX
   * Add further entities to ignore (copy transparently) on commandline?
   */
  while ((c=getchar())!=EOF)
    {
      if (c=='&')
        put_entity();
      else
        putchar(c);
    }

  if (ferror(stdin))
    return EXIT_FAILURE;

  /* Output is buffered, so flush before looking for write errors
   * (like disk full), else truncated output goes unnoticed.
   */
  if (fflush(stdout)==EOF || ferror(stdout))
    return EXIT_FAILURE;

  return EXIT_SUCCESS;
}
