From 73e20940379b3da1488bba449758f9a48833e65a Mon Sep 17 00:00:00 2001 From: Adam Dickmeiss Date: Thu, 7 Jun 2001 08:10:10 +0000 Subject: [PATCH] Bug fix for relative links. --- robot.tcl | 78 +++++++++++++++++++++++++++++++++++++------------------------ 1 file changed, 48 insertions(+), 30 deletions(-) diff --git a/robot.tcl b/robot.tcl index 5c2b518..fa3c595 100755 --- a/robot.tcl +++ b/robot.tcl @@ -1,8 +1,8 @@ #!/usr/bin/tclsh -# $Id: robot.tcl,v 1.16 2001/06/06 07:10:31 adam Exp $ +# $Id: robot.tcl,v 1.17 2001/06/07 08:10:10 adam Exp $ # proc RobotFileNext1 {area lead} { - puts "RobotFileNext1 area=$area lead=$lead" + # puts "RobotFileNext1 area=$area lead=$lead" if {[catch {set ns [glob ${area}/*]}]} { return {} } @@ -43,7 +43,7 @@ proc RobotReadRecord {inf fromurlx distancex} { gets $inf gets $inf set distance [string trim [gets $inf]] - puts "got distance = $distance" + # puts "got distance = $distance" gets $inf gets $inf set fromurl [string trim [gets $inf]] @@ -52,7 +52,7 @@ proc RobotReadRecord {inf fromurlx distancex} { proc RobotFileNext {area} { global robotSeq global idleTime ns - puts "RobotFileNext robotSeq=$robotSeq" + # puts "RobotFileNext robotSeq=$robotSeq" if {$robotSeq < 0} { return {} } @@ -87,27 +87,27 @@ proc RobotFileNext {area} { proc RobotFileExist {area host path} { - puts "RobotFileExist begin area=$area host=$host path=$path" + # puts "RobotFileExist begin area=$area host=$host path=$path" set lpath [split $path /] set l [llength $lpath] incr l -1 set t [lindex $lpath $l] incr l -1 set npath $area/$host[join [lrange $lpath 0 $l] /d]/f$t - puts "RobotFileExist end npath=$npath" + # puts "RobotFileExist end npath=$npath" return [file exists $npath] } proc RobotFileUnlink {area host path} { - puts "RobotFileUnlink begin" - puts "area=$area host=$host path=$path" + # puts "RobotFileUnlink begin" + # puts "area=$area host=$host path=$path" set lpath [split $path /] set l [llength $lpath] incr l -1 set t [lindex $lpath $l] incr l -1 set npath $area/$host[join [lrange $lpath 0 $l] /d]/f$t - puts "npath=$npath" + # puts "npath=$npath" set comp [split $npath /] set l [llength $comp] incr l -1 @@ -118,7 +118,7 @@ proc RobotFileUnlink {area host path} { if {![catch {glob $path/*}]} return exec rmdir ./$path } - puts "RobotFileUnlink end" + # puts "RobotFileUnlink end" } proc RobotFileClose {out} { @@ -134,7 +134,7 @@ proc RobotFileOpen {area host path {mode w}} { if {![info exists workdir]} { return stdout } - puts "RobotFileOpen orgPwd=$orgPwd area=$area host=$host path=$path mode=$mode" + #puts "RobotFileOpen orgPwd=$orgPwd area=$area host=$host path=$path mode=$mode" if {[string compare $orgPwd $workdir]} { puts "ooops. RobotFileOpen failed" puts "workdir = $workdir" @@ -203,7 +203,7 @@ proc RobotStart {} { global URL global robotsRunning robotsMax idleTime - puts "RobotStart" + # puts "RobotStart" while {1} { set url [RobotFileNext unvisited] if {![string length $url]} { @@ -313,28 +313,31 @@ proc RobotHref {url hrefx hostx pathx} { set surl $dpart/$surl } } - set c [split $surl /] - set i [llength $c] - incr i -1 - set path [lindex $c $i] - incr i -1 - while {$i >= 0} { - switch -- [lindex $c $i] { + set surllist [split $surl /] + catch {unset path} + set pathl 0 + foreach c $surllist { + switch -- $c { .. { - incr i -2 - if {$i < 0} { - set i 0 + if {$pathl > 0} { + incr pathl -1 + set path [lrange $path 0 $pathl] } } - . { - incr i -1 - } - default { - set path [lindex $c $i]/$path - incr i -1 + . { + + } + default { + incr pathl + lappend path $c } } } + if {$pathl} { + set path [join $path /] + } else { + set path "" + } regsub -all {~} $path {%7E} path set href "$method://$host$path" puts "Ref href = $href" @@ -429,7 +432,7 @@ proc RobotTextHtml {url out} { } puts $out {>} } body { - regsub -all -nocase {} $body {} abody + regsub -all -nocase {))*} $body {} abody regsub -all {<[^\>]+>} $abody {} nbody puts $out "" puts $out $nbody @@ -587,6 +590,10 @@ proc RobotTextPlain {url out} { proc Robot200 {url} { global URL domains + set out [RobotFileOpen raw $URL($url,hostport) $URL($url,path)] + puts -nonewline $out $URL($url,buf) + RobotFileClose $out + set out [RobotFileOpen visited $URL($url,hostport) $URL($url,path)] puts $out "" @@ -609,7 +616,7 @@ proc Robot200 {url} { } } text/plain { - RobotTextPlain $url $out + RobotTextPlain $url $out $outr } application/pdf { set pdff [open test.pdf w] @@ -810,6 +817,17 @@ set idleTime 60000 set i 0 set l [llength $argv] +# For testing only +if {0} { + set url "http://www.sportsfiskeren.dk/sportsfiskeren/corner/index.htm" + set href "../../data/../../data2/newsovs.asp?Mode=5" + + set URL($url,path) /sportsfiskeren/corner/index.htm + set URL($url,hostport) www.sportsfiskeren.dk + RobotHref $url href host path + exit 0 +} + if {$l < 2} { puts {tclrobot: usage [-j jobs] [-i idle] [-c count] [-d domain] [url ..]} puts " Example: -c 3 -d '*.dk' http://www.indexdata.dk/" -- 1.7.10.4