From d8234df96ab8fb03ed71f6358f7211ebe725b495 Mon Sep 17 00:00:00 2001
From: Adam Dickmeiss
Date: Thu, 7 Dec 2000 20:16:11 +0000
Subject: [PATCH] Added -nonest for htmlSwitch statement. Robot puts reference
 to bad URLs in bad area.

---
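Usage note (a sketch based on this patch, not part of the commit): the new
-nonest flag appears to make htmlSwitch run a tag's handler as soon as the
start tag is seen, instead of waiting for a matching end tag, so tags such as
meta (which has no end tag) still trigger their block. The call form used by
dcdot.tcl and robot.tcl in this patch looks like the sketch below; $html and
the puts output are illustrative only:

    htmlSwitch $html \
        title {
            # $body holds the text between the start and end tag
            puts "title: $body"
        } -nonest meta {
            # with -nonest the handler fires at the start tag itself;
            # the tag's attributes are available in the parm() array
            if {[info exists parm(name)] && [info exists parm(content)]} {
                puts "meta $parm(name): $parm(content)"
            }
        }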
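Record format note (illustrative, not part of the commit): the robot now
remembers which page referred to a URL that turns out to be bad. The entry in
the unvisited area holds the referring URL, and Robot401/Robot404/Robot301
copy it into a record in the bad area. Going by the puts calls in Robot404, a
bad-area record would look roughly like this (host and paths are made up):

    URL=http://www.example.org/missing.html 404
    Reference http://www.example.org/index.html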
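dcdot.tcl note (a sketch, not part of the commit): the new script fetches a
single page and prints any Dublin Core meta fields it finds, using the
,meta,DC.* keys of the URL array that the -nonest meta handler fills in. A
possible run, with an invented URL and output:

    ./dcdot.tcl http://www.example.org/page.html
    URL(http://www.example.org/page.html,meta,DC.Title,) = Example page title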
 configure    |    2 +-
 configure.in |    7 +-
 dcdot.tcl    |  192 +++++++++++++++++++++++++++++++++++++
 hswitch.c    |   37 ++--
 robot.tcl    |  299 +++++++++++++++++++++++++++++++++++++++++-----------------
 5 files changed, 441 insertions(+), 96 deletions(-)
 create mode 100755 dcdot.tcl

diff --git a/configure b/configure
index 9ac06b3..ecbc3a3 100755
--- a/configure
+++ b/configure
@@ -1213,7 +1213,7 @@ if test "x$yazpath" != "xNONE"; then
 else
 	for i in ../yaz* ../yaz; do
 		if test -d $i; then
-			if test -r $i/include/yaz/yaz-version.h; then
+			if test -r $i/yaz-config; then
 				yazconfig=$i/yaz-config
 			fi
 		fi
diff --git a/configure.in b/configure.in
index 66ba230..aacc46b 100644
--- a/configure.in
+++ b/configure.in
@@ -1,6 +1,6 @@
 dnl (c) Index Data 1996-2000
 dnl See the file LICENSE for details.
-dnl $Id: configure.in,v 1.4 1999/12/27 11:49:31 adam Exp $
+dnl $Id: configure.in,v 1.5 2000/12/07 20:16:11 adam Exp $
 AC_INIT(hswitch.c)
 dnl ------ Substitutions
 AC_SUBST(CC)
@@ -27,6 +27,9 @@ if test "x$tclconfig" = xNONE; then
     saveprefix=${prefix}
     AC_PREFIX_PROGRAM(tclsh)
     if test "x$prefix" = xNONE; then
+	AC_PREFIX_PROGRAM(tclsh8.3)
+    fi
+    if test "x$prefix" = xNONE; then
 	AC_PREFIX_PROGRAM(tclsh8.2)
     fi
     if test "x$prefix" = xNONE; then
@@ -102,7 +105,7 @@ if test "x$yazpath" != "xNONE"; then
 else
 	for i in ../yaz* ../yaz; do
 		if test -d $i; then
-			if test -r $i/include/yaz/yaz-version.h; then
+			if test -r $i/yaz-config; then
 				yazconfig=$i/yaz-config
 			fi
 		fi
diff --git a/dcdot.tcl b/dcdot.tcl
new file mode 100755
index 0000000..aeb4f91
--- /dev/null
+++ b/dcdot.tcl
@@ -0,0 +1,192 @@
+#!/usr/bin/tclsh
+# $Id: dcdot.tcl,v 1.1 2000/12/07 20:16:11 adam Exp $
+#
+
+proc RobotRestart {} {
+    global robotMoreWork
+
+    set robotMoreWork 0
+}
+
+proc RobotTextHtml {url} {
+    global URL
+
+    set head 0
+    htmlSwitch $URL($url,buf) \
+	title {
+	    set URL($url,title) $body
+	} -nonest meta {
+	    set scheme {}
+	    if {[info exist parm(scheme)]} {
+		set scheme $parm(scheme)
+		unset parm(scheme)
+	    }
+	    if {[info exist parm(name)]} {
+		if {[info exist parm(content)]} {
+		    set URL($url,meta,$parm(name),$scheme) $parm(content)
+		    unset parm(content)
+		}
+		unset parm(name)
+	    }
+	} a {
+	    if {[info exists parm(href)]} {
+		lappend URL($url,links) $parm(href)
+	    }
+	}
+}
+
+proc Robot200 {url} {
+    global URL domains
+
+    # puts "Parsing $url"
+    switch $URL($url,head,content-type) {
+	text/html {
+	    RobotTextHtml $url
+	}
+    }
+    # puts "Parsing done"
+}
+
+proc RobotReadContent {url sock} {
+    global URL
+
+    set buffer [read $sock 16384]
+    set readCount [string length $buffer]
+
+    if {$readCount <= 0} {
+	close $sock
+	Robot200 $url
+	RobotRestart
+    } else {
+	# puts "Got $readCount bytes"
+	set URL($url,buf) $URL($url,buf)$buffer
+    }
+}
+
+proc RobotReadHeader {url sock} {
+    global URL
+
+    set buffer [read $sock 2148]
+    set readCount [string length $buffer]
+
+    if {$readCount <= 0} {
+	close $sock
+	RobotRestart
+    } else {
+	# puts "Got $readCount bytes"
+	set URL($url,buf) $URL($url,buf)$buffer
+
+	set n [string first \n\n $URL($url,buf)]
+	if {$n > 1} {
+	    set code 0
+	    set version {}
+	    set headbuf [string range $URL($url,buf) 0 $n]
+	    incr n
+	    incr n
+	    set URL($url,buf) [string range $URL($url,buf) $n end]
+
+	    regexp {^HTTP/([0-9.]+)[ ]+([0-9]+)} $headbuf x version code
+	    set lines [split $headbuf \n]
+	    foreach line $lines {
+		if {[regexp {^([^:]+):[ ]+(.*)} $line x name value]} {
+		    set URL($url,head,[string tolower $name]) $value
+		}
+	    }
+	    set URL($url,state) skip
+	    switch $code {
+		200 {
+		    if {![info exists URL($url,head,content-type)]} {
+			set URL($url,head,content-type) {}
+		    }
+		    switch $URL($url,head,content-type) {
+			text/html {
+			    fileevent $sock readable [list RobotReadContent $url $sock]
+			}
+			text/plain {
+			    fileevent $sock readable [list RobotReadContent $url $sock]
+			}
+			default {
+			    close $sock
+			    Robot200 $url
+			    RobotRestart
+			}
+		    }
+		}
+		default {
+		    Robot404 $url
+		    close $sock
+		    RobotRestart
+		}
+	    }
+	}
+    }
+}
+
+proc RobotConnect {url sock} {
+    global URL agent
+
+    fconfigure $sock -translation {auto crlf} -blocking 0
+    fileevent $sock readable [list RobotReadHeader $url $sock]
+    puts $sock "GET $URL($url,path) HTTP/1.0"
+    puts $sock "Host: $URL($url,host)"
+    puts $sock "User-Agent: $agent"
+    puts $sock ""
+    flush $sock
+}
+
+proc RobotGetUrl {url phost} {
+    global URL
+    if {![regexp {([^:]+)://([^/]+)([^ ]*)} $url x method hostport path]} {
+	return -1
+    }
+    if {![regexp {([^:]+):([0-9]+)} $hostport x host port]} {
+	set port 80
+	set host $hostport
+    }
+    set URL($url,method) $method
+    set URL($url,host) $host
+    set URL($url,port) $port
+    set URL($url,path) $path
+    set URL($url,state) head
+    set URL($url,buf) {}
+    if [catch {set sock [socket -async $host $port]}] {
+	return -1
+    }
+    RobotConnect $url $sock
+
+    return 0
+}
+
+if {![llength [info commands htmlSwitch]]} {
+    set e [info sharedlibextension]
+    if {[catch {load ./tclrobot$e}]} {
+	load tclrobot$e
+    }
+}
+
+set agent "zmbot/0.0"
+if {![catch {set os [exec uname -s -r]}]} {
+    set agent "$agent ($os)"
+}
+
+proc RobotGetDCDOT {url} {
+    global robotMoreWork 1
+
+    set robotMoreWork 1
+    if [RobotGetUrl $url {}] {
+	set robotMoreWork 0
+    }
+
+    while {$robotMoreWork} {
+	vwait robotMoreWork
+    }
+}
+
+if {$argc == 1} {
+    set url [lindex $argv 0]
+    RobotGetDCDOT $url
+    set mask {,meta,[Dd][Cc]\.*}
+    foreach a [array names URL $url$mask] {
+	puts "URL($a) = $URL($a)"
+    }
+}
\ No newline at end of file
diff --git a/hswitch.c b/hswitch.c
index 3a05f5e..7c7f5c3 100644
--- a/hswitch.c
+++ b/hswitch.c
@@ -1,5 +1,5 @@
 /*
- * $Id: hswitch.c,v 1.2 1998/10/15 12:31:01 adam Exp $
+ * $Id: hswitch.c,v 1.3 2000/12/07 20:16:11 adam Exp $
  */
 #include 
 #include 
@@ -8,11 +8,11 @@
 
 #include "tclrobot.h"
 
-#define TAG_MAX_LEN 32
+#define TAG_MAX_LEN 64
 
 #define SPACECHR " \t\r\n\f"
 
-#define DEBUG(x)
+#define DEBUG(x)
 
 static int skipSpace (const char *cp)
 {
@@ -81,6 +81,7 @@ struct tagParm {
 struct tagInfo {
     int level;
+    int nest;
     char *pattern;
     char *code;
 
@@ -163,6 +164,7 @@ static int tagEnd (Tcl_Interp *interp, struct tagInfo *tag,
     -- (tag->level);
     if (!tag->level)
     {
+        int tcl_err;
         struct tagParm *tp = tag->tagParms;
         char *value = malloc (body_end - tag->body_start + 1);
 
@@ -183,7 +185,12 @@
             tp = tp->next;
             free (tp0);
         }
-        Tcl_Eval (interp, tag->code);
+        tcl_err = Tcl_Eval (interp, tag->code);
+        if (tcl_err == TCL_ERROR)
+        {
+            printf ("Error: code=%d %s\n", tcl_err, interp->result);
+            exit (1);
+        }
     }
     }
     return i;
@@ -195,7 +202,7 @@ int htmlSwitch (ClientData clientData, Tcl_Interp *interp,
     struct tagInfo *tags;
     int noTags;
     const char *cp;
-    int i, argi = 1;
+    int i = 0, argi = 1;
 
     cp = argv[argi++];
     noTags = (argc - argi)/2;
@@ -207,24 +214,42 @@ int htmlSwitch (ClientData clientData, Tcl_Interp *interp,
     }
     tags = malloc (sizeof(*tags) * noTags);
     assert (tags);
-    for (i = 0; i
= 0 ? tags+tagI : NULL, tagStr, cp);
+            if (tagI >= 0 && tags[tagI].nest == 0)
+            {
+                cp += tagEnd (interp, tags+tagI, tagStr, body_start, cp);
+            }
         }
         else if (cp[0] == '<' && cp[1] == '/')/* end tag */
         {
diff --git a/robot.tcl b/robot.tcl
index ab3cef4..93c4541 100755
--- a/robot.tcl
+++ b/robot.tcl
@@ -1,5 +1,5 @@
 #!/usr/bin/tclsh
-# $Id: robot.tcl,v 1.5 1999/12/27 11:49:31 adam Exp $
+# $Id: robot.tcl,v 1.6 2000/12/07 20:16:11 adam Exp $
 #
 proc RobotFileNext {area} {
     if {[catch {set ns [glob ${area}/*]}]} {
@@ -15,7 +15,9 @@ proc RobotFileNext {area} {
 	}
 	return http://[string range $n $off end]
     }
-    if {[file isdirectory $n]} {
+    }
+    foreach n $ns {
+	if {[file isdirectory $n]} {
 	    set sb [RobotFileNext $n]
 	    if {[string length $sb]} {
 		return $sb
@@ -55,11 +57,20 @@ proc RobotFileUnlink {area host path} {
     }
 }
 
-proc RobotFileOpen {area host path} {
+proc RobotFileClose {out} {
+    if [string compare $out stdout] {
+	close $out
+    }
+}
+
+proc RobotFileOpen {area host path {mode w}} {
     set orgPwd [pwd]
     global workdir
 
-    #puts "RobotFileOpen orgPwd=$orgPwd area=$area host=$host path=$path"
+    if {![info exists workdir]} {
+	return stdout
+    }
+    puts "RobotFileOpen orgPwd=$orgPwd area=$area host=$host path=$path"
     if {[string compare $orgPwd $workdir]} {
 	puts "workdir = $workdir"
 	puts "pwd = $orgPwd"
@@ -77,9 +88,13 @@ proc RobotFileOpen {area host path} {
     }
     set d [lindex $comp $len]
     if {[string length $d]} {
-	set out [open $d w]
+	if {[file isdirectory $d]} {
+	    set out [open $d/:.html $mode]
+	} else {
+	    set out [open $d $mode]
+	}
     } else {
-	set out [open :.html w]
+	set out [open :.html $mode]
     }
     cd $orgPwd
     #puts "RobotFileStop"
@@ -88,11 +103,11 @@
 
 proc RobotRestart {} {
     global URL
+    global robotMoreWork
 
     while {1} {
 	set url [RobotFileNext unvisited]
 	if {![string length $url]} {
-	    puts "No more unvisited"
	    break
 	}
 	set r [RobotGetUrl $url {}]
@@ -103,14 +118,13 @@ proc RobotRestart {} {
	    RobotFileUnlink unvisited $URL($url,host) $URL($url,path)
	}
     }
-    exit 0
+    set robotMoreWork 0
 }
 
-proc headSave {url out title} {
+proc headSave {url out} {
     global URL
 
-    puts $out {}
-    puts $out "$title"
+    puts $out {}
     if {[info exists URL($url,head,last-modified)]} {
	puts $out "$URL($url,head,last-modified)"
     }
@@ -139,7 +153,7 @@ proc RobotHref {url hrefx hostx pathx} {
     upvar $hostx host
     upvar $pathx path
 
-    # puts "Ref url = $url href=$href"
+    puts "Ref url = $url href=$href"
     # get method (if any)
     if {![regexp {^([^/:]+):(.*)} $href x method hpath]} {
	set hpath $href
@@ -150,12 +164,9 @@ proc RobotHref {url hrefx hostx pathx} {
	}
     }
     # get host (if any)
-    if {![regexp {^//([^/]+)(.*)} $hpath x host epath]} {
-	set epath $hpath
-	set host $URL($url,host)
-    } else {
-	if {![string length $epath]} {
-	    set epath /
+    if {[regexp {^//([^/]+)([^\#]*)} $hpath x host surl]} {
+	if {![string length $surl]} {
+	    set surl /
	}
	set ok 0
	foreach domain $domains {
@@ -167,22 +178,24 @@ proc RobotHref {url hrefx hostx pathx} {
	    if {!$ok} {
		return 0
	    }
+    } else {
+	regexp {^([^\#]*)} $hpath x surl
+	set host $URL($url,host)
     }
-    if {[regexp {^(\#|\?)} $epath]} {
-	# within page
+    if {![string length $surl]} {
	return 0
-    } elseif {![regexp {^([/][^\#?]*)} $epath x path]} {
+    }
+    if {[string first / $surl]} {
	# relative path
-	set ext [file extension $URL($url,path)]
-	if {[string compare $ext {}]} {
-	    set dpart [file dirname $URL($url,path)]
+	regexp {^([^\#?]*)} $URL($url,path) x dpart
+	set l [string last / $dpart]
+	if {[expr $l >= 0]} {
+	    set surl [string range $dpart 0 $l]$surl
	} else {
-	    set dpart $URL($url,path)
+	    set surl $dpart/$surl
	}
-	regexp {^([^\#?]+)} $epath x path
-	set path [string trimright $dpart /]/$path
     }
-    set c [split $path /]
+    set c [split $surl /]
     set i [llength $c]
     incr i -1
     set path [lindex $c $i]
@@ -202,60 +215,100 @@ proc RobotHref {url hrefx hostx pathx} {
	}
     }
     set href "$method://$host$path"
-    # puts "Ref href = $href"
+    puts "Ref href = $href"
     return 1
 }
 
 proc Robot401 {url} {
     global URL
 
-    puts "Bad link $url"
+    puts "Bad URL $url"
+    set fromurl {}
+    catch {
+	set inf [RobotFileOpen unvisited $URL($url,host) $URL($url,path) r]
+	set fromurl [gets $inf]
+	close $inf
+    }
     RobotFileUnlink unvisited $URL($url,host) $URL($url,path)
-    if {![RobotFileExist forbidden $URL($url,host) $URL($url,path)]} {
-	set outf [RobotFileOpen forbidden $URL($url,host) $URL($url,path)]
-	close $outf
+    if {![RobotFileExist bad $URL($url,host) $URL($url,path)]} {
+	set outf [RobotFileOpen bad $URL($url,host) $URL($url,path)]
+	puts $outf "URL=$url 401"
+	puts $outf "Reference $fromurl"
+	RobotFileClose $outf
     }
 }
 
 proc Robot404 {url} {
     global URL
 
-    puts "Bad link $url"
+    puts "Bad URL $url"
+    set fromurl {}
+    catch {
+	set inf [RobotFileOpen unvisited $URL($url,host) $URL($url,path) r]
+	set fromurl [gets $inf]
+	RobotFileClose $inf
+    }
     RobotFileUnlink unvisited $URL($url,host) $URL($url,path)
     if {![RobotFileExist bad $URL($url,host) $URL($url,path)]} {
	set outf [RobotFileOpen bad $URL($url,host) $URL($url,path)]
-	close $outf
+	puts $outf "URL=$url 404"
+	puts $outf "Reference $fromurl"
+	RobotFileClose $outf
     }
-}
+ }
 
 proc Robot301 {url tourl} {
     global URL
 
     puts "Redirecting from $url to $tourl"
+
+    set fromurl {}
+    catch {
+	set inf [RobotFileOpen unvisited $URL($url,host) $URL($url,path) r]
+	set fromurl [gets $inf]
+	RobotFileClose $inf
+    }
     RobotFileUnlink unvisited $URL($url,host) $URL($url,path)
+    if {![RobotFileExist bad $URL($url,host) $URL($url,path)]} {
+	set outf [RobotFileOpen bad $URL($url,host) $URL($url,path)]
+	puts $outf "URL=$url to $tourl 301"
+	puts $outf "Reference $fromurl"
+	RobotFileClose $outf
+    }
     if {[RobotHref $url tourl host path]} {
	if {![RobotFileExist unvisited $host $path]} {
+	    puts "Mark as unvisited"
	    set outf [RobotFileOpen unvisited $host $path]
-	    close $outf
+	    puts $outf 301
+	    RobotFileClose $outf
	}
     }
 }
 
-proc Robot200 {url} {
-    global URL domains
-
-    # puts "Parsing $url"
-    set out [RobotFileOpen visited $URL($url,host) $URL($url,path)]
-    set ti 0
-    if {[info exists URL($url,buf)]} {
-	set htmlContent $URL($url,buf)
-
-	htmlSwitch $htmlContent \
+proc RobotTextHtml {url out} {
+    global URL
+
+    set head 0
+    htmlSwitch $URL($url,buf) \
	title {
-	    if {!$ti} {
-		headSave $url $out $body
-		set ti 1
+	    if {!$head} {
+		headSave $url $out
+		set head 1
+	    }
+	    puts $out "$body"
+	} -nonest meta {
+	    if {!$head} {
+		headSave $url $out
+		set head 1
+	    }
+	    puts -nonewline $out "}
	} body {
	    regsub -all -nocase {} $body {} abody
	    regsub -all {<[^\>]+>} $abody {} nbody
@@ -267,9 +320,9 @@ proc Robot200 {url} {
		puts "no href"
		continue
	    }
-	    if {!$ti} {
-		headSave $url $out "untitled"
-		set ti 1
+	    if {!$head} {
+		headSave $url $out
+		set head 1
	    }
	    if {1} {
		set href $parm(href)
@@ -279,28 +332,59 @@ proc Robot200 {url} {
		puts $out "$href"
		puts $out "$body"
		puts $out ""
-
+		
		if {![RobotFileExist visited $host $path]} {
-		    if {[catch {set outf [RobotFileOpen unvisited $host $path]} msg]} {
-			puts "--- Error $msg"
-			exit 1
+		    if {![RobotFileExist bad $host $path]} {
+			if {[catch {set outf [RobotFileOpen unvisited $host $path]} msg]} {
+			    puts "--- Error $msg"
+			    exit 1
+			}
+			puts $outf $url
+			RobotFileClose $outf
		    }
-		    close $outf
		}
	    }
	}
-    }
-    if {!$ti} {
-	headSave $url $out "untitled"
-	set ti 1
+    }
+    if {!$head} {
+	headSave $url $out
+	set head 1
     }
+    puts $out ""
+}
+
+proc RobotTextPlain {url out} {
+    global URL
+
+    headSave $url $out
+    puts $out ""
+    puts $out $URL($url,buf)
+    puts $out ""
     puts $out ""
-    close $out
+}
+
+proc Robot200 {url} {
+    global URL domains
+
+    puts "Parsing $url"
+    set out [RobotFileOpen visited $URL($url,host) $URL($url,path)]
+    switch $URL($url,head,content-type) {
+	text/html {
+	    RobotTextHtml $url $out
+	}
+	text/plain {
+	    RobotTextPlain $url $out
+	}
+	default {
+	    headSave $url $out
+	    puts $out ""
+	}
+    }
+    RobotFileClose $out
     # puts "Parsing done"
     RobotFileUnlink unvisited $URL($url,host) $URL($url,path)
 }
 
-proc RobotReadBody {url sock} {
+proc RobotReadContent {url sock} {
     global URL
 
     set buffer [read $sock 16384]
@@ -316,10 +400,10 @@
     }
 }
 
-proc RobotReadHead {url sock} {
+proc RobotReadHeader {url sock} {
     global URL
 
-    set buffer [read $sock 8192]
+    set buffer [read $sock 2148]
     set readCount [string length $buffer]
 
     if {$readCount <= 0} {
@@ -370,17 +454,21 @@
		    RobotRestart
		}
		200 {
-		    if {[info exists URL($url,head,content-type)]} {
-			if {![string compare $URL($url,head,content-type) text/html]} {
-			    set URL($url,state) html
-			}
+		    if {![info exists URL($url,head,content-type)]} {
+			set URL($url,head,content-type) {}
		    }
-		    if {[string compare $URL($url,state) html]} {
-			close $sock
-			Robot200 $url
-			RobotRestart
-		    } else {
-			fileevent $sock readable [list RobotReadBody $url $sock]
+		    switch $URL($url,head,content-type) {
+			text/html {
+			    fileevent $sock readable [list RobotReadContent $url $sock]
+			}
+			text/plain {
+			    fileevent $sock readable [list RobotReadContent $url $sock]
+			}
+			default {
+			    close $sock
+			    Robot200 $url
+			    RobotRestart
+			}
		    }
		}
		default {
@@ -394,13 +482,14 @@
 }
 
 proc RobotConnect {url sock} {
-    global URL
+    global URL agent
 
     fconfigure $sock -translation {auto crlf} -blocking 0
     puts "Reading $url"
-    fileevent $sock readable [list RobotReadHead $url $sock]
+    fileevent $sock readable [list RobotReadHeader $url $sock]
     puts $sock "GET $URL($url,path) HTTP/1.0"
     puts $sock "Host: $URL($url,host)"
+    puts $sock "User-Agent: $agent"
     puts $sock ""
     flush $sock
 }
@@ -441,21 +530,57 @@ if {![llength [info commands htmlSwitch]]} {
     }
 }
 
-if {[llength $argv] < 2} {
-    puts "Tclrobot: usage "
-    puts " Example: '*.dk' www.indexdata.dk"
+
+set agent "zmbot/0.0"
+if {![catch {set os [exec uname -s -r]}]} {
+    set agent "$agent ($os)"
+    puts "agent: $agent"
+}
+
+proc bgerror {m} {
+    puts "BGERROR $m"
+}
+
+if {0} {
+    proc RobotRestart {} {
+	global robotMoreWork
+	set robotMoreWork 0
+	puts "myrestart"
+    }
+    set robotMoreWork 1
+    set url {http://www.indexdata.dk/zap/}
+    RobotGetUrl $url {}
+    while {$robotMoreWork} {
+	vwait robotMoreWork
+    }
+    puts "-----------"
+    puts $URL($url,buf)
+    puts "-----------"
     exit 1
 }
+set robotMoreWork 0
 
 set workdir [pwd]
 
+if {[llength $argv] < 2} {
+    puts "Tclrobot: usage "
+    puts " Example: '*.indexdata.dk' http://www.indexdata.dk/"
+    exit 1
+}
+
 set domains [lindex $argv 0]
 set site [lindex $argv 1]
 
 if {[string length $site]} {
-    set x [RobotFileOpen unvisited $site /]
-    close $x
+    set robotMoreWork 1
+    if [RobotGetUrl $site {}] {
+	set robotMoreWork 0
+	puts "Couldn't process $site"
+    } else {
+	#set x [RobotFileOpen unvisited $site /robots.txt]
+	#RobotFileClose $x
+    }
 }
-
-RobotRestart
-vwait forever
+while {$robotMoreWork} {
+    vwait robotMoreWork
+}
-- 
1.7.10.4