From: Adam Dickmeiss
Date: Tue, 13 Nov 2001 11:17:26 +0000 (+0000)
Subject: MIME check when reading HTTP header (not when reading content).
X-Git-Tag: ZMBOT.0.1~8
X-Git-Url: http://sru.miketaylor.org.uk/cgi-bin?a=commitdiff_plain;h=7476a63e6732f7f51eea10bf38daaea4a31be57f;p=tclrobot.git

MIME check when reading HTTP header (not when reading content).
File robots.txt always read - even when text/plain is denied.
---
diff --git a/robot.tcl b/robot.tcl
index f94c8d3..ee70b9a 100755
--- a/robot.tcl
+++ b/robot.tcl
@@ -1,5 +1,5 @@ #!/usr/bin/tclsh -# $Id: robot.tcl,v 1.27 2001/11/09 13:26:50 adam Exp $ +# $Id: robot.tcl,v 1.28 2001/11/13 11:17:26 adam Exp $ # proc RobotFileNext1 {area lead} { # puts "RobotFileNext1 area=$area lead=$lead"
@@ -50,7 +50,9 @@ proc RobotReadRecord {inf fromurlx distancex} { } proc RobotFileNext {area} { - global robotSeq global idletime ns + global robotSeq + global idletime ns + global status # puts "RobotFileNext robotSeq=$robotSeq" if {$robotSeq < 0} {
@@ -67,7 +69,7 @@ proc RobotFileNext {area} { if {![string length $n]} { set robotSeq -1 flush stdout - puts "Round robin" + puts "Round robin un,ba,vi=$status(unvisited),$status(bad),$status(visited)" return wait } incr robotSeq
@@ -99,6 +101,7 @@ proc RobotFileExist {area host path} { } proc RobotFileUnlink {area host path} { + global status # puts "RobotFileUnlink begin" # puts "area=$area host=$host path=$path" set lpath [split $path /]
@@ -109,10 +112,12 @@ proc RobotFileUnlink {area host path} { set npath $area/$host[join [lrange $lpath 0 $l] /d]/f$t # puts "npath=$npath" set comp [split $npath /] + if {[catch {exec rm [join $comp /]}]} return + set l [llength $comp] incr l -1 - if {[catch {exec rm [join $comp /]}]} return incr l -1 + incr status($area) -1 for {set i $l} {$i > 0} {incr i -1} { set path [join [lrange $comp 0 $i] /] if {![catch {glob $path/*}]} return
@@ -130,6 +135,7 @@ proc RobotFileClose {out} { proc RobotFileOpen {area host path {mode w}} { set orgPwd [pwd] global
workdir + global status if {![info exists workdir]} { return stdout
@@ -157,6 +163,7 @@ proc RobotFileOpen {area host path {mode w}} { set out [open frobots.txt w] puts "creating robots.txt in $d" close $out + incr status(unvisited) } } }
@@ -170,6 +177,9 @@ proc RobotFileOpen {area host path {mode w}} { } else { set out [open f $mode] } + if {$mode == "w"} { + incr status($area) + } cd $orgPwd return $out }
@@ -676,11 +686,6 @@ proc RobotWriteMetadata {url out} { text/plain { RobotTextPlain $url $out } - application/pdf { - set pdff [open test.pdf w] - puts -nonewline $pdff $URL($url,buf) - close $pdff - } } puts $out "" }
@@ -692,10 +697,6 @@ proc Robot200 {url} { puts -nonewline $out $URL($url,buf) RobotFileClose $out - if {![checkrule mime $URL($url,head,content-type)]} { - RobotError $url mimedeny - return - } set out [RobotFileOpen visited $URL($url,hostport) $URL($url,path)] RobotWriteMetadata $url $out RobotFileClose $out
@@ -730,6 +731,7 @@ proc RobotReadHeader {url sock} { if {[catch {set buffer [read $sock 2148]}]} { RobotError $url 404 RobotRestart $url $sock + return } set readCount [string length $buffer]
@@ -770,12 +772,19 @@ proc RobotReadHeader {url sock} { if {![info exists URL($url,head,content-type)]} { set URL($url,head,content-type) {} } - set binary 0 - switch $URL($url,head,content-type) { - application/pdf { - set binary 1 + set binary 1 + switch -glob -- $URL($url,head,content-type) { + text/* { + set binary 0 } } + if {![regexp {/robots.txt$} $url]} { + if {![checkrule mime $URL($url,head,content-type)]} { + RobotError $url mimedeny + RobotRestart $url $sock + return + } + } fileevent $sock readable [list RobotReadContent $url $sock $binary] } default {
@@ -892,6 +901,10 @@ set workdir [pwd] set idletime 60000 set acceptLanguage {} set debuglevel 0 +set status(unvisited) 0 +set status(visited) 0 +set status(bad) 0 +set status(raw) 0 # Rules: allow, deny, url
@@ -1061,9 +1074,12 @@ puts "domains=$domains" puts "max distance=$maxdistance"
puts "max jobs=$robotsMax" + RobotStart while {$robotsRunning} { vwait robotsRunning } + +puts "End un,ba,vi=$status(unvisited),$status(bad),$status(visited)"