/* $Header: /CVSROOT/tinohtmlparse/tinohtmlparse.c,v 1.9 2009-01-28 19:29:31 tino Exp $
*
* This Works is placed under the terms of the Copyright Less License,
* see file COPYRIGHT.CLL. USE AT OWN RISK, ABSOLUTELY NO WARRANTY.
*
* NOTE THAT ABOVE ONLY APPLIES TO THE CODE WITHIN THIS FILE, YOU MUST
* HONOR THE EKHTML LICENSE IF YOU USE EKHTML!
*
* Written from scratch while reading the demo code tester.c from
* ekhtml.
*
* $Log: tinohtmlparse.c,v $
* Revision 1.9 2009-01-28 19:29:31 tino
* Forgot TAB in meta
*
* Revision 1.8 2009-01-28 18:27:29 tino
* TAB for separator for last argument changed
*
* Revision 1.7 2007-12-30 17:57:03 tino
* Placed under the CLL, also one entity code was fixed (∧)
*
* Revision 1.6 2007-09-16 06:04:49 tino
* Percent escape handling corrected
*
* Revision 1.5 2007/02/12 07:00:02 tino
* Commit for dist, see ChangeLog
*
* Revision 1.4 2006/06/11 06:57:30 tino
* Mainly only documentation corrected
*
* Revision 1.3 2006/02/12 03:35:41 tino
* new dist for version with html entitiy parsing
*
* Revision 1.2 2005/02/06 00:17:06 tino
* Only full lines are fed to the parser to make output more easy to parse.
*
* Revision 1.1 2005/02/05 23:07:28 tino
* first commit, tinohtmlparse.c is missing "text" aggregation
*/
#include
#include
#include
#include
#include
#include "ekhtml/include/ekhtml.h"
#include "tinohtmlparse_version.h"
#include "tino_html_entities.h"
/* Set to 0 at the start of main(); presumably switched on by --raw
 * ("Do not interpret htmlentities") -- TODO confirm, no reader is
 * visible in this part of the file.
 */
static int raw_mode;
/* Non-zero: tab() writes a space instead of a TAB (toggled by -o/--old). */
static int old_mode;
/* Write a single space to stdout. */
static void
spc(void)
{
  fputc(' ', stdout);
}
/* Write the separator that precedes the last field of a record:
 * a space when old_mode is set, a TAB otherwise.
 */
static void
tab(void)
{
  if (old_mode)
    putchar(' ');
  else
    putchar('\t');
}
/* Write a newline to stdout. */
static void
lf(void)
{
  fputc('\n', stdout);
}
/* Write c to stdout.  A newline is additionally followed by '+',
 * so whatever comes next starts on a line that begins with '+'.
 */
static void
co(char c)
{
  putchar(c);
  if (c!='\n')
    return;
  putchar('+');
}
/* Write c to stdout if it is a printable, non-whitespace character;
 * otherwise write '_' in its place.
 */
static void
cp(char c)
{
  /* The <ctype.h> classifiers are only defined for values representable
   * as unsigned char (or EOF).  Plain char may be signed, so a byte
   * >= 0x80 would arrive as a negative int: convert first.
   */
  const unsigned char uc = (unsigned char)c;

  putchar(isprint(uc) && !isspace(uc) ? uc : '_');
}
/* Write the lowest four bits of c as one lowercase hex digit (via co). */
static void
cx(int c)
{
  static const char digits[] = "0123456789abcdef";

  co(digits[(unsigned)c & 0xfu]);
}
/* Write c in percent notation where needed.
 *
 * Values from 33 to 126 are written as they are (this includes '%'
 * itself, which is not escaped).  Everything else -- 32 and below,
 * 127 and above -- is written as '%' followed by two hex digits
 * taken from the low eight bits of c.
 */
static void
ce(int c)
{
  if (c>32 && c<=126)
    {
      co(c);
      return;
    }
  co('%');
  cx(c>>4);
  cx(c);
}
/**********************************************************************/
/* Formatted output to stdout: s is a printf() format string, the
 * remaining arguments are its parameters.
 */
static void
p_b(const char *s, ...)
{
  va_list args;

  va_start(args, s);
  vprintf(s, args);
  va_end(args);
}
/* Print all s->len characters of s through cp(), i.e. with every
 * non-printable or whitespace character replaced by '_'.
 */
static void
p_p(ekhtml_string_t *s)
{
  int i;

  /* The loop bound had been lost from this line ("ilen"); the string
   * is length-counted, so iterate over exactly s->len characters.
   */
  for (i=0; i<s->len; i++)
    cp(s->str[i]);
}
/* Print a space, then s in the sanitized form produced by p_p(). */
static void
p_s(ekhtml_string_t *s)
{
  putchar(' ');
  p_p(s);
}
/* End the current output record: write a newline. */
static void
p_e(void)
{
  putchar('\n');
}
/* Print a complete two-field record: the record type typ, the
 * separator from tab(), the tag name in sanitized form (p_p) and a
 * newline.
 */
static void
p_2(const char *typ, ekhtml_string_t *tag)
{
  /* typ is data, not a format: print it through "%s" so that a '%'
   * in it can never be interpreted as a conversion.
   */
  p_b("%s", typ);
  tab();
  p_p(tag);
  p_e();
}
/* Print the characters of s, leaving out one trailing '\n' if the
 * string ends with one.
 *
 * NOTE(review): the loop line below is damaged in this copy of the
 * file: the text between "i" and "str[i]" is missing, so neither the
 * loop bound (presumably i < m) nor the name of the per-character
 * output helper is visible.  TODO restore from version control.
 */
static void
p_t(ekhtml_string_t *s)
{
int i, m;
m = s->len;
/* Exclude a single trailing newline from the range to print. */
if (m && s->str[m-1]=='\n')
m--;
for (i=0; istr[i]);
}
/* Print string s, decoding HTML entities on the way.
 *
 * The value is assumed to be a link, so the output uses % notation:
 *   - a recognised entity whose code is above 0xff is written as "%u"
 *     followed by four hex digits (only the low 16 bits of the code
 *     are printed),
 *   - a recognised entity with a code up to 0xff is passed to ce(),
 *   - every other character, including a '&' that does not start a
 *     known entity, is passed to ce() as well.
 *
 * NOTE(review): the loop header below is damaged in this copy of the
 * file: the text between "i" and "str[i]" is missing, including the
 * opening brace of the loop body.  Presumably it loops while i < m
 * and tests s->str[i]=='&', possibly guarded by raw_mode.  TODO
 * restore from version control.
 */
static void
p_ent(ekhtml_string_t *s)
{
int i, m;
m = s->len;
for (i=0; istr[i]=='&')
{
int len;
int u;
/* In: number of characters left in s, counted from the '&'. */
len = m-i;
/* A result >= 0 is treated as the entity's code; presumably len is
 * updated to the length of the matched entity text -- TODO confirm
 * against tino_html_entities.h.
 */
u = tino_html_entity_check(s->str+i, &len);
if (u>=0)
{
if (u>0xff)
{
co('%');
co('u');
cx(u>>12);
cx(u>>8);
cx(u>>4);
cx(u);
}
else
ce(u);
/* Step over the entity text that was just handled. */
i += len;
continue;
}
}
/* Not (the start of) a known entity: escape the character itself. */
ce(s->str[i++]);
}
}
/* Prefixed Message
 *
 * Writes s split at '\n' into one output record per line.  Each
 * record starts with
 *
 *   PREFIX LINENO FLAG -
 *
 * where LINENO counts from 0 and FLAG is 1 if the line is terminated
 * by '\n' inside s and 0 if the end of s was reached first.  The
 * separator from tab(), the line's characters (see note) and a
 * newline follow.
 *
 * NOTE(review): two lines are damaged in this copy of the file: the
 * outer loop condition ("ilen", presumably i < s->len) and the inner
 * while loop that prints the characters (the text between "i" and
 * "str[i++]" is missing, so its bound and its output helper are not
 * visible).  TODO restore from version control.
 */
static void
p_m(const char *prefix, ekhtml_string_t *s)
{
int i;
int l;
for (l=0, i=0; ilen; l++)
{
int j;
printf("%s %d ", prefix, l);
/* Look ahead from i for the end of this line to pick the flag. */
for (j=i;;j++)
if (j>=s->len)
{
printf("0 ");
break;
}
else if (s->str[j]=='\n')
{
printf("1 ");
break;
}
printf("-");
tab();
while (istr[i++]);
lf();
/* Step past the position where the line ended. */
i++;
}
}
/**********************************************************************/
/* Start-tag callback registered with the ekhtml parser.
 *
 * Writes one "open" record for the tag, followed by one "attr" record
 * for every entry of the attribute list att:
 *
 *   attr TAG NAME KIND<sep>VALUE
 *
 * KIND is "B" if the attribute's isBoolean flag is set, "N" if its
 * quoteChar is 0, the quote character itself if that is " or ', and
 * the quote character as two hex digits otherwise.  <sep> comes from
 * tab(), VALUE is printed by p_ent().
 *
 * x is not used.
 */
static void
cb_start(void *x, ekhtml_string_t *tag, ekhtml_attr_t *att)
{
  ekhtml_attr_t *cur = att;

  p_2("open", tag);
  while (cur)
    {
      p_b("attr");
      p_s(tag);
      p_s(&cur->name);
      spc();

      if (cur->isBoolean)
        p_b("B");
      else if (!cur->quoteChar)
        p_b("N");
      else if (cur->quoteChar=='"' || cur->quoteChar=='\'')
        p_b("%c", cur->quoteChar);
      else
        p_b("%02x", (unsigned)(unsigned char)cur->quoteChar);

      tab();
      p_ent(&cur->val);
      p_e();

      cur = cur->next;
    }
}
/* End-tag callback registered with the ekhtml parser: writes a
 * "close" record naming the tag.  x is not used.
 */
static void
cb_end(void *x, ekhtml_string_t *tag)
{
  (void)x;
  p_2("close", tag);
}
/* Comment callback registered with the ekhtml parser: writes the
 * comment text as "comment" records through p_m().  x is not used.
 */
static void
cb_comment(void *x, ekhtml_string_t *comment)
{
  (void)x;
  p_m("comment", comment);
}
/* Data callback registered with the ekhtml parser: writes the text as
 * "text" records through p_m().  x is not used.
 *
 * ekHTML hands over whatever data it has so far, which need not end
 * on a line boundary.  That is correct, but partial lines are not
 * wanted in the output in normal situations, so main() works around
 * it by feeding only full lines into the parser.
 */
static void
cb_data(void *x, ekhtml_string_t *data)
{
  (void)x;
  p_m("text", data);
}
/**********************************************************************/
int
main(int argc, char **argv)
{
ekhtml_parser_t *p;
ekhtml_string_t s;
char buf[BUFSIZ*10];
int fill;
int i;
raw_mode = 0;
for (i=1; iunicode; p++)
printf("%04x\t%5d\t%s\n", p->unicode, p->unicode, p->entity);
return 0;
}
if (!strcmp(argv[1], "-o") || !strcmp(argv[1], "--old"))
{
old_mode = !old_mode;
continue;
}
fprintf(stderr,
"Usage: %s [options] < HTMLFILE > parsed_output\n"
"\t\tVersion " TINOHTMLPARSE_VERSION " compiled " __DATE__ "\n"
"\t--raw\t(also -r) Do not interpret htmlentities\n"
"\t\tIf you depend on the broken values, give --raw\n"
"\t\tElse tinohtmlparse tries to parse things for HTML entities\n"
"\t\tand transfor something clever, which probably is unwanted\n"
"\t--list\t(also -l) List known htmlentities (without & and ;)\n"
"\t\tNote that n; and X; also is known but not listed.\n"
"\t--old\t(also -o) Use old SPC instead of TAB to separate last arg\n"
, argv[0]
);
return 1;
}
p = ekhtml_parser_new(NULL);
ekhtml_parser_datacb_set(p, cb_data);
ekhtml_parser_commentcb_set(p, cb_comment);
ekhtml_parser_startcb_add(p, NULL, cb_start);
ekhtml_parser_endcb_add(p, NULL, cb_end);
fill = 0;
for (;;)
{
int n, i, k;
if (fflush(stdout) || ferror(stdin) || feof(stdout) || ferror(stdout))
return 1;
n = fread(buf+fill, 1, sizeof buf-fill, stdin);
fill += n;
/* Ugly fix:
* Only feed full lines to ekhtml,
* such that cb_data only gets full lines.
*/
k = fill;
for (i=0; i