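/* Convert UTF-8 into ISO-8859-1.
 *
 * Reads UTF-8 from stdin and writes each character in the range
 * 0..0xff as a single byte to stdout.  Everything else (characters
 * above 0xff and illegal sequences) is reported on stderr and dropped,
 * unless at least two args are given: then the first arg is used as a
 * printf format to output the value (for example as an XML escape code).
 */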
#include <stdio.h>

/* Decode the tail of a UTF-8 sequence that is three to six bytes long.
 *
 * fd	stream to read the remaining continuation bytes from
 * c	lead byte of the sequence, already consumed by the caller
 * accu	bits collected so far from the lead byte and the first
 *	continuation byte, as combined by getutf():
 *	((c&0x1f)<<6) | (second&0x3f)
 *
 * Returns the character 0-0x7fffffff
 * or high-bit set in case of illegal characters:
 * 0x8xxxxxxx	EOF within sequence
 * 0xAxxxxxxx	Sequence too short (Missing character)
 * 0xBxxxxxxx	Sequence too long (RFC violation)
 * (EOF and 0x9xxxxxxx, unknown encoding, are only produced by getutf().)
 */
long
getutf2(FILE *fd, int c, long accu)
{
  long	min;
  int	len, i, d;

  /* The lead byte tells the sequence length.  accu was built from the low
   * five bits of the lead byte, so the bits that are length marker rather
   * than payload are masked away here.  min is the smallest character
   * that really needs this length; anything below is an overlong encoding.
   */
  if (c<0xf0)
    {
      accu	&= 0x3ff;
      min	= 0x00000800;
      len	= 3;
    }
  else if (c<0xf8)
    {
      accu	&= 0x1ff;
      min	= 0x00010000;
      len	= 4;
    }
  else if (c<0xfc)
    {
      accu	&= 0xff;
      min	= 0x00200000;
      len	= 5;
    }
  else
    {
      accu	&= 0x7f;
      min	= 0x04000000;
      len	= 6;
    }

  /* Two bytes were already consumed by the caller, fetch bytes 3..len
   */
  for (i=3; i<=len; i++)
    {
      /* accu can be max. 0x01ffffff here
       */
      if ((d=getc(fd))==EOF)
        return 0x80000000|accu;
      /* A continuation byte has the form 10xxxxxx, so both top bits
       * must be tested (mask 0xc0), exactly as getutf() does for the
       * second byte.
       */
      if ((d&0xc0)!=0x80)
        {
          /* Not part of this sequence: leave it for the next read
           */
          ungetc(d, fd);
          return 0xA0000000|accu;
        }
      accu	<<= 6;
      accu	|=  d&0x3f;
    }
  if (accu<min)
    return 0xB0000000|accu;
  return accu;
}

/* Read one UTF-8 encoded character from fd.
 *
 * Returns EOF at end of input,
 * the character 0-0x7fffffff,
 * or high-bit set in case of illegal characters:
 * 0x8xxxxxxx	EOF within sequence
 * 0x9xxxxxxx	Unknown encoding (First char is not UTF-8)
 * 0xAxxxxxxx	Sequence too short (Missing character)
 * 0xBxxxxxxx	Sequence too long (RFC violation)
 *
 * Declared static: a plain `inline' definition provides no external
 * definition in C99 and later, so the call from main() fails to link
 * whenever the compiler chooses not to inline it.
 */
static inline long
getutf(FILE *fd)
{
  int	c, d;

  /* ASCII is passed through; so is EOF, which is negative
   */
  if ((c=getc(fd))<0x80)
    return c;
  /* 0x80..0xbf are continuation bytes and 0xfe/0xff are never used,
   * none of them can start a sequence
   */
  if (c<0xc0 || c>0xfd)
    return 0x90000000|c;

  if ((d=getc(fd))==EOF)
    return 0x80000000|c;
  if ((d&0xc0)!=0x80)
    {
      /* Not a continuation byte: leave it for the next call
       */
      ungetc(d, fd);
      return 0xA0000000|c;
    }

  /* Combine the payload of the second byte with the low five bits
   * of the lead byte
   */
  d	&= 0x3f;
  d	|= (c&0x1f)<<6;
  if (c<0xe0)
    {
      /* Two byte sequence; values below 0x80 fit into one byte,
       * so they are overlong encodings
       */
      if (d<0x80)
        return 0xB0000000|d;
      return d;
    }
  /* Higher level UTF characters are processed via a non inline subfunction
   * !this part is untested!
   */
  return getutf2(fd, c, d);
}

/* Filter stdin to stdout: decode UTF-8 with getutf() and write every
 * character in the range 0..0xff as a single byte.
 *
 * Everything else (characters above 0xff and the error values returned
 * by getutf()) is dropped.  The first 9 of them are reported on stderr,
 * the 10th produces a note that further warnings are suppressed.
 * pos in the warning counts decoded characters, not input bytes.
 *
 * If argc>2, argv[1] is used as printf format to output each dropped
 * value (passed as long).
 *
 * Returns nonzero if reading stdin or writing stdout failed, else 0.
 */
int
main(int argc, char **argv)
{
  long	c, pos;
  int	errs;

  errs	= 0;
  for (pos=0; (c=getutf(stdin))!=EOF; pos++)
    {
      if (c<0 || c>0xff)
        {
          /* errs stops at 10, so it cannot overflow on huge broken input
           */
          if (errs<10)
            {
              if (++errs<10)
                fprintf(stderr, "warning: ignoring UNICODE %lx at pos %ld\n", c, pos);
              else
                fprintf(stderr, "warning: suppressing further warnings\n");
            }
          /* NOTE(review): argv[1] is only used when a second arg is
           * present as well (argc>2) - confirm this is intended.
           */
          if (argc>2)
            {
              /* Be careful with this!
               * The format comes straight from the command line and is
               * not checked; it must consume exactly one long argument.
               */
              printf(argv[1], c);
            }
          continue;
        }
      putchar(c);
    }
  /* Report failed writes too, not only failed reads
   */
  if (fflush(stdout)==EOF || ferror(stdout))
    return 1;
  return ferror(stdin);
}
