#!/bin/sh # # $Header: /CVSROOT/tinohtmlparse/tinohtmlabsurl.sh,v 1.6 2009-07-12 22:19:44 tino Exp $ # # THIS IS NOT PERFECT, # it does not process a lot of URI attributes for now, # like applet_codebase, applet_code etc. # # This Works is placed under the terms of the Copyright Less License, # see file COPYRIGHT.CLL. USE AT OWN RISK, ABSOLUTELY NO WARRANTY. # # $Log: tinohtmlabsurl.sh,v $ # Revision 1.6 2009-07-12 22:19:44 tino # Url-Parser fixed in tinohtmlabsurl.sh # # Revision 1.5 2007-12-30 17:57:03 tino # Placed under the CLL, also one entity code was fixed (∧) # # Revision 1.4 2007-02-12 07:00:02 tino # Commit for dist, see ChangeLog # # Revision 1.3 2006/06/16 20:35:41 tino # Jump now name anchor, which can be understood better. # Capability to send all non-URL-lines elsewhere, if you concentrate on URLs # # Revision 1.2 2005/02/06 00:17:06 tino # Only full lines are fed to the parser to make output more easy to parse. # # Revision 1.1 2005/02/05 23:07:28 tino # first commit, tinohtmlparse.c is missing "text" aggregation if [ 1 != "$#" -a 2 != "$#" ] then echo "Usage: tinohtmlparse | `basename "$0"` BASEURL [/dev/stdout] Second argument gives file for non-URL lines" >&2 exit 1 fi awk -v BASE="$1" -v NOURLOUT="${2:-/dev/stdout}" ' function shift(x) { sub(/^[^[:space:]]*[[:space:]]/,"",x); return x; } # http://user:pass@hostname/path/to/index.html?var=data#text # -type-!!-user---!!-host-!!-path--!!-file---!!-query-!!-anchor- # # relative URLs: path does not start with / # Special characters: # # starts the anchor anywhere in the URL # ? starts the query part anywhere in the URL # @ must preceede host, which must precede / # # It does not assume directory structure: # .. becomes path "" name ".." # ../.. becomes path "../" name ".." function parseuri(u, b, c) { delete parsed # fetch #anchor b = u; gsub(/^[^#]*/, "", b); gsub(/#.*$/,"",u); parsed["anchor"]=b # Fetch ?query b = u; gsub(/^[^?]*/, "", b); gsub(/\?.*$/,"",u); parsed["query"]=b # Fetch schema:// b = ""; if (match(u,/^[a-z]*:\/?\/?/)) { b = substr(u,RSTART,RLENGTH); u = substr(u,RSTART+RLENGTH); } parsed["type"]=b # Fetch host if there is a schema c = "" if (b!="") { b = u gsub(/\/.*$/,"",b); gsub(/^[^/]*/,"",u); # check user:password@ if (match(b,/@/)) { c = substr(b,1,RSTART); b = substr(b,RSTART+1); } } parsed["user"]=c; parsed["host"]=b; # Now left is path/file b = u gsub(/^.*\//,"",b); parsed["file"]=b; gsub(/[^/]*$/,"",u); parsed["path"]=u } function dump(t,a,s) { s = sprintf("# %s", t); a="type"; s=s sprintf(" %s=\"%s\"", a, parsed[a]); a="user"; s=s sprintf(" %s=\"%s\"", a, parsed[a]); a="host"; s=s sprintf(" %s=\"%s\"", a, parsed[a]); a="path"; s=s sprintf(" %s=\"%s\"", a, parsed[a]); a="file"; s=s sprintf(" %s=\"%s\"", a, parsed[a]); a="query"; s=s sprintf(" %s=\"%s\"", a, parsed[a]); a="anchor"; s=s sprintf(" %s=\"%s\"", a, parsed[a]); print s >NOURLOUT } function setbase(b) { parseuri(b) # Make sure, BASE path starts and ends with / # (the latter is the same as base["path"]=="") if (parsed["path"]!~/^\// || parsed["path"]!~/\/$/) parsed["path"]="/"; dump("base"); basetype = parsed["type"] basehost = parsed["host"] basepath = parsed["path"] basefile = parsed["file"] basequery = parsed["query"] } # Make a full URI from a relative one # The idea is function makefull(u,p) { parseuri(u) dump("1"); # Bugfix: # We have two options here: # base: http://example.com/dir/file?query#anchor # uri=: ?query # Does the result have a file? # Browsers think 'yes' # This now shall parse #anchor type destinations, too. if (parsed["type"]=="" && parsed["host"]=="" && parsed["path"]=="" && parsed["file"]=="" && parsed["query"]=="") parsed["query"]=basequery; if (parsed["type"]=="" && parsed["host"]=="" && parsed["path"]=="" && parsed["file"]=="") parsed["file"]=basefile; # Take over type, host and path from BASE URI if (parsed["type"]=="") parsed["type"]=basetype; # Well, what to do when switching from http: to https:? # news:whatever has no host! if (parsed["host"]=="" && parsed["type"]==basetype) parsed["host"]=basehost; # This apparently only works when there was no host if (parsed["path"]!~/^\//) parsed["path"] = basepath parsed["path"] dump("2"); return parsed["type"] parsed["user"] parsed["host"] parsed["path"] parsed["file"] parsed["query"] parsed["anchor"]; } BEGIN { tag["base","href"]=1 # I am conservative # Do not touch background,href,src I do not understand. tag["a","href"]=2 tag["area","href"]=2 tag["bgsound","src"]=2 tag["body","background"]=2 tag["embed","src"]=2 tag["form","action"]=2 tag["frame","src"]=2 tag["iframe","src"]=2 tag["input","src"]=2 tag["img","src"]=2 tag["layer","background"]=2 tag["layer","src"]=2 tag["link","href"]=2 tag["script","src"]=2 tag["table","background"]=2 tag["td","background"]=2 tag["th","background"]=2 setbase(BASE) IGNORECASE=1 } $1=="attr" && (t=tag[tolower($2),tolower($3)])!=0 { uri=shift(shift(shift(shift($0)))) if (t==1) setbase(uri) else { n=makefull(uri) if (uri!=n) { print "# " $0 > NOURLOUT print $1 " " $2 " " $3 " " $4 " " n next } print next } } { print >NOURLOUT } '