From 5c476d6b3055153cfcb6972965b6e450f685ddeb Mon Sep 17 00:00:00 2001 From: Adam Dickmeiss Date: Tue, 10 Jun 2003 11:43:52 +0000 Subject: [PATCH] Tasks. TKL integration --- robot.tcl | 694 +++++++++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 464 insertions(+), 230 deletions(-) diff --git a/robot.tcl b/robot.tcl index e67a9b8..55e7f2a 100755 --- a/robot.tcl +++ b/robot.tcl @@ -1,5 +1,5 @@ #!/usr/bin/tclsh -# $Id: robot.tcl,v 1.34 2002/06/18 19:57:53 adam Exp $ +# $Id: robot.tcl,v 1.35 2003/06/10 11:43:52 adam Exp $ # proc RobotFileNext1 {area lead} { # puts "RobotFileNext1 area=$area lead=$lead" @@ -49,32 +49,34 @@ proc RobotReadRecord {inf fromurlx distancex} { set fromurl [string trim [gets $inf]] } -proc RobotFileNext {area} { +proc RobotFileNext {task area} { global robotSeq global idletime ns global status - # puts "RobotFileNext robotSeq=$robotSeq" - if {$robotSeq < 0} { + # puts "RobotFileNext robotSeq=$robotSeq($task)" + if {$robotSeq($task) < 0} { return {} } - if {$robotSeq == 0} { - if {[catch {set ns [glob ${area}/*]}]} { - return {} + if {$robotSeq($task) == 0} { + if {[catch {set ns($task) [glob $task/$area/*]}]} { + return done } } - set off [string length $area] + # puts "ns=$ns($task)" + set off [string length $task/$area] incr off - set n [lindex $ns $robotSeq] + set n [lindex $ns($task) $robotSeq($task)] + # puts "n=$n" if {![string length $n]} { - set robotSeq -1 + set robotSeq($task) -1 flush stdout - set statusfile [open status w] - puts $statusfile "$status(unvisited) $status(bad) $status(visited)" + set statusfile [open $task/status w] + puts $statusfile "$status($task,unvisited) $status($task,bad) $status($task,visited)" close $statusfile return wait } - incr robotSeq + incr robotSeq($task) if {[file isfile $n/frobots.txt]} { puts "ok returning http://[string range $n $off end]/robots.txt" return http://[string range $n $off end]/robots.txt @@ -85,12 +87,12 @@ proc RobotFileNext {area} { } } puts "no more work at end of RobotFileNext n=$n" - puts "ns=$ns" + puts "ns=$ns($task)" return {} } -proc RobotFileExist {area host path} { +proc RobotFileExist {task area host path} { global debuglevel if {$debuglevel > 3} { @@ -101,14 +103,14 @@ proc RobotFileExist {area host path} { incr l -1 set t [lindex $lpath $l] incr l -1 - set npath $area/$host[join [lrange $lpath 0 $l] /d]/f$t + set npath $task/$area/$host[join [lrange $lpath 0 $l] /d]/f$t if {$debuglevel > 3} { puts "RobotFileExist end npath=$npath" } return [file exists $npath] } -proc RobotFileUnlink {area host path} { +proc RobotFileUnlink {task area host path} { global status # puts "RobotFileUnlink begin" # puts "area=$area host=$host path=$path" @@ -117,7 +119,7 @@ proc RobotFileUnlink {area host path} { incr l -1 set t [lindex $lpath $l] incr l -1 - set npath $area/$host[join [lrange $lpath 0 $l] /d]/f$t + set npath $task/$area/$host[join [lrange $lpath 0 $l] /d]/f$t # puts "npath=$npath" set comp [split $npath /] if {[catch {exec rm [join $comp /]}]} return @@ -125,7 +127,7 @@ proc RobotFileUnlink {area host path} { set l [llength $comp] incr l -1 incr l -1 - incr status($area) -1 + incr status($task,$area) -1 for {set i $l} {$i > 0} {incr i -1} { set path [join [lrange $comp 0 $i] /] if {![catch {glob $path/*}]} return @@ -140,7 +142,7 @@ proc RobotFileClose {out} { } } -proc RobotFileOpen {area host path {mode w}} { +proc RobotFileOpen {task area host path {mode w}} { set orgPwd [pwd] global workdir global status @@ -158,28 +160,44 @@ proc RobotFileOpen {area host path {mode w}} { puts "pwd = $orgPwd" exit 1 } - set comp [split $area/$host$path /] + + set comp [split $task/$area/$host /] set len [llength $comp] incr len -1 - for {set i 0} {$i < $len} {incr i} { - if {$i > 1} { - set d "d[lindex $comp $i]" - } else { - set d [lindex $comp $i] - } - if {[catch {cd ./$d}]} { + + # puts "1 comp=$comp" + + for {set i 0} {$i <= $len} {incr i} { + set d [lindex $comp $i] + if {[catch {cd $d}]} { exec mkdir $d cd ./$d - if {![string compare $area unvisited] && $i == 1 && $mode == "w"} { + if {![string compare $area unvisited] && $i == $len && $mode == "w"} { if {[string compare $path /robots.txt]} { set out [open frobots.txt w] puts "creating robots.txt in $d" close $out - incr status(unvisited) + incr status($task,unvisited) } } } } + + set comp [split $path /] + set len [llength $comp] + incr len -1 + + # puts "2 path=$path comp=$comp" + + for {set i 0} {$i < $len} {incr i} { + set d "d[lindex $comp $i]" + if {[string length $d] > 1} { + if {[catch {cd $d}]} { + exec mkdir $d + cd ./$d + } + } + } set d [lindex $comp $len] if {[string length $d]} { set out [open f$d $mode] @@ -187,93 +205,248 @@ proc RobotFileOpen {area host path {mode w}} { set out [open f $mode] } if {$mode == "w"} { - incr status($area) + incr status($task,$area) } cd $orgPwd return $out } -proc RobotRR {} { - global robotSeq robotsRunning +proc RobotStartJob {fname t} { + global control + + + set f [open $fname r] + set xml [read $f] + puts "Reading $fname" + regexp {([^<]*)} $xml x status + close $f + if {$status == "done"} { + puts "already done" + return + } + puts "status = $status" + if {![task $t]} { + return + } + htmlSwitch $xml \ + url { + url $body + } filter { + set type $parm(type) + set action $parm(action) + if {$type == "domain"} { + $action url http://$body/* + } + if {$type == "url"} { + $action url $body + } + if {$type == "mime"} { + $action mime $body + } + } distance { + set control($t,distance) $body + } status { + set control($t,filestatus) $body + } + if {$status == "pending"} { + regsub {[^<]*} $xml {running} xml2 + set f [open $fname w] + puts -nonewline $f $xml2 + close $f + } +} + +proc RobotDoneJob {t} { + global daemon_dir + + if {![info exists daemon_dir]} { + return + } + + set fname $t.tkl + + set f [open $fname r] + set xml [read $f] + puts "Reading $fname" + regexp {([^<]*)} $xml x status + puts "------" + puts "status = $status" + close $f + + regsub {[^<]*} $xml {done} xml2 + set f [open $fname w] + puts -nonewline $f $xml2 + close $f +} + +proc RobotScanDir {} { + global daemon_dir + + if {![info exists daemon_dir]} { + return + } + foreach d $daemon_dir { + if {[catch {set files [glob $d/*.tkl]}]} { + return + } + foreach fname $files { + if {[file isfile $fname] && [file readable $fname]} { + set t [file rootname $fname] + RobotStartJob $fname $t + } + } + } +} + +proc RobotRR {task} { + global robotSeq robotsRunning tasks robotsMax status + + puts "RobotRR -- running=$robotsRunning max=$robotsMax---------------" incr robotsRunning -1 + + # only one task gets through... + if {[string compare [lindex $tasks 0] $task]} { + return + } + puts "RobotRR. task = $task" while {$robotsRunning} { vwait robotsRunning } - set robotSeq 0 - RobotStart + puts "Scan" + if {[catch {RobotScanDir} msg]} { + puts "RobotScanDir failed" + puts $msg + } + foreach t $tasks { + set statusfile [open $t/status w] + puts $statusfile "$status($t,unvisited) $status($t,bad) $status($t,visited)" + close $statusfile + set robotSeq($t) 0 + RobotStart $t + } +} + +proc RobotDaemonSig {} { + global daemon_cnt + + incr daemon_cnt +} + +proc RobotDaemonLoop {} { + global daemon_cnt tasks robotsRunning status + + set daemon_cnt 0 + while 1 { + puts $daemon_cnt + + RobotScanDir + + if {[info exists tasks]} { + puts "daemon loop tasks $tasks" + foreach t $tasks { + set robotSeq($t) 0 + RobotStart $t + } + while {$robotsRunning} { + vwait robotsRunning + } + } + after 30000 RobotDaemonSig + vwait daemon_cnt + } } -proc RobotRestart {url sock} { +proc RobotRestart {task url sock} { global URL robotsRunning close $sock after cancel $URL($sock,cancel) - foreach v [array names URL $url,*] { + foreach v [array names URL $task,$url,*] { unset URL($v) } incr robotsRunning -1 - RobotStart + RobotStart $task } -proc RobotStart {} { +proc RobotStart {task} { global URL - global robotsRunning robotsMax idletime + global robotsRunning robotsMax idletime status tasks - # puts "RobotStart" + # puts "RobotStart $task running=$robotsRunning" while {1} { - set url [RobotFileNext unvisited] + set url [RobotFileNext $task unvisited] + if {[string compare $url done] == 0} { + puts "In RobotStart task $task done" + + catch {unset ntasks} + foreach t $tasks { + if {[string compare $t $task]} { + lappend ntasks $t + } else { + puts "task $t done" + } + } + if {![info exists ntasks]} { + unset tasks + puts "all done" + } else { + set tasks $ntasks + } + RobotDoneJob $task + return + } if {![string length $url]} { return } - incr robotsRunning + incr robotsRunning if {[string compare $url wait] == 0} { - after $idletime RobotRR - return + after $idletime [list RobotRR $task] + return } - set r [RobotGetUrl $url {}] + set r [RobotGetUrl $task $url {}] if {!$r} { if {$robotsRunning >= $robotsMax} return } else { incr robotsRunning -1 - if {![RobotFileExist bad $URL($url,hostport) $URL($url,path)]} { - set outf [RobotFileOpen bad $URL($url,hostport) $URL($url,path)] + if {![RobotFileExist $task bad $URL($task,$url,hostport) $URL($task,$url,path)]} { + set outf [RobotFileOpen $task bad $URL($task,$url,hostport) $URL($task,$url,path)] RobotFileClose $outf } - RobotFileUnlink unvisited $URL($url,hostport) $URL($url,path) + RobotFileUnlink $task unvisited $URL($task,$url,hostport) $URL($task,$url,path) } } } -proc headSave {url out} { +proc headSave {task url out} { global URL - if {[info exists URL($url,head,last-modified)]} { - puts $out "$URL($url,head,last-modified)" + if {[info exists URL($task,$url,head,last-modified)]} { + puts $out "$URL($task,$url,head,last-modified)" } puts $out {} - if {[info exists URL($url,head,date)]} { - puts $out " $URL($url,head,date)" + if {[info exists URL($task,$url,head,date)]} { + puts $out " $URL($task,$url,head,date)" } - if {[info exists URL($url,head,content-length)]} { - puts $out " $URL($url,head,content-length)" + if {[info exists URL($task,$url,head,content-length)]} { + puts $out " $URL($task,$url,head,content-length)" } - if {[info exists URL($url,head,server)]} { - puts $out " $URL($url,head,server)" + if {[info exists URL($task,$url,head,server)]} { + puts $out " $URL($task,$url,head,server)" } puts $out {} puts $out {} puts $out " $url" - if {[info exists URL($url,head,content-type)]} { - puts $out " $URL($url,head,content-type)" + if {[info exists URL($task,$url,head,content-type)]} { + puts $out " $URL($task,$url,head,content-type)" } puts $out {} } -proc RobotHref {url hrefx hostx pathx} { - global URL domains debuglevel +proc RobotHref {task url hrefx hostx pathx} { + global URL control debuglevel upvar $hrefx href upvar $hostx host upvar $pathx path @@ -307,13 +480,13 @@ proc RobotHref {url hrefx hostx pathx} { if {![string length $surl]} { set surl / } - if {[info exist domains]} { + if {[info exist control($task,domains)]} { set ok 0 - foreach domain $domains { + foreach domain $control($task,domains) { if {[string match $domain $host]} { set ok 1 break - } + } } if {!$ok} { return 0 @@ -321,16 +494,16 @@ proc RobotHref {url hrefx hostx pathx} { } } else { regexp {^([^\#]*)} $hpath x surl - set host $URL($url,hostport) + set host $URL($task,$url,hostport) } if {![string length $surl]} { return 0 } if {[string first / $surl]} { # relative path - set curpath $URL($url,path) - if {[info exists URL($url,bpath)]} { - set curpath $URL($url,bpath) + set curpath $URL($task,$url,path) + if {[info exists URL($task,$url,bpath)]} { + set curpath $URL($task,$url,bpath) } regexp {^([^\#?]*)} $curpath x dpart set l [string last / $dpart] @@ -374,55 +547,55 @@ proc RobotHref {url hrefx hostx pathx} { if {$debuglevel > 1} { puts "Ref result = $href" } - return [checkrule url $href] + return [checkrule $task url $href] } -proc RobotError {url code} { +proc RobotError {task url code} { global URL puts "Bad URL $url (code $code)" set fromurl {} set distance -1 - if {[RobotFileExist unvisited $URL($url,hostport) $URL($url,path)]} { - set inf [RobotFileOpen unvisited $URL($url,hostport) $URL($url,path) r] + if {[RobotFileExist $task unvisited $URL($task,$url,hostport) $URL($task,$url,path)]} { + set inf [RobotFileOpen $task unvisited $URL($task,$url,hostport) $URL($task,$url,path) r] RobotReadRecord $inf fromurl distance RobotFileClose $inf } - RobotFileUnlink unvisited $URL($url,hostport) $URL($url,path) - if {![RobotFileExist bad $URL($url,hostport) $URL($url,path)]} { - set outf [RobotFileOpen bad $URL($url,hostport) $URL($url,path)] + RobotFileUnlink $task unvisited $URL($task,$url,hostport) $URL($task,$url,path) + if {![RobotFileExist $task bad $URL($task,$url,hostport) $URL($task,$url,path)]} { + set outf [RobotFileOpen $task bad $URL($task,$url,hostport) $URL($task,$url,path)] RobotWriteRecord $outf $fromurl $distance RobotFileClose $outf } } -proc RobotRedirect {url tourl code} { +proc RobotRedirect {task url tourl code} { global URL puts "Redirecting from $url to $tourl" set distance {} set fromurl {} - if {[RobotFileExist unvisited $URL($url,hostport) $URL($url,path)]} { - set inf [RobotFileOpen unvisited $URL($url,hostport) $URL($url,path) r] + if {[RobotFileExist $task unvisited $URL($task,$url,hostport) $URL($task,$url,path)]} { + set inf [RobotFileOpen $task unvisited $URL($task,$url,hostport) $URL($task,$url,path) r] RobotReadRecord $inf fromurl distance RobotFileClose $inf } - if {![RobotFileExist bad $URL($url,hostport) $URL($url,path)]} { - set outf [RobotFileOpen bad $URL($url,hostport) $URL($url,path)] + if {![RobotFileExist $task bad $URL($task,$url,hostport) $URL($task,$url,path)]} { + set outf [RobotFileOpen $task bad $URL($task,$url,hostport) $URL($task,$url,path)] RobotWriteRecord $outf $fromurl $distance RobotFileClose $outf } - if {[RobotHref $url tourl host path]} { - if {![RobotFileExist visited $host $path]} { - if {![RobotFileExist unvisited $host $path]} { - set outf [RobotFileOpen unvisited $host $path] + if {[RobotHref $task $url tourl host path]} { + if {![RobotFileExist $task visited $host $path]} { + if {![RobotFileExist $task unvisited $host $path]} { + set outf [RobotFileOpen $task unvisited $host $path] RobotWriteRecord $outf $fromurl $distance RobotFileClose $outf } } else { set olddistance {} - set inf [RobotFileOpen visited $host $path r] + set inf [RobotFileOpen $task visited $host $path r] RobotReadRecord $inf oldurl olddistance RobotFileClose $inf if {[string length $olddistance] == 0} { @@ -433,34 +606,34 @@ proc RobotRedirect {url tourl code} { } puts "distance=$distance olddistance=$olddistance" if {[expr $distance < $olddistance]} { - set outf [RobotFileOpen unvisited $host $path] + set outf [RobotFileOpen $task unvisited $host $path] RobotWriteRecord $outf $tourl $distance RobotFileClose $outf } } } - if {[catch {RobotFileUnlink unvisited $URL($url,hostport) $URL($url,path)}]} { + if {[catch {RobotFileUnlink $task unvisited $URL($task,$url,hostport) $URL($task,$url,path)}]} { puts "unlink failed" exit 1 } } -proc link {url out href body distance} { - global URL maxdistance - if {[expr $distance > $maxdistance]} return +proc link {task url out href body distance} { + global URL control + if {[expr $distance > $control($task,distance)]} return - if {![RobotHref $url href host path]} return + if {![RobotHref $task $url href host path]} return puts $out "" puts $out "$href" puts $out "$body" puts $out "" - if {![RobotFileExist visited $host $path]} { + if {![RobotFileExist $task visited $host $path]} { set olddistance 1000 - if {![RobotFileExist bad $host $path]} { - if {[RobotFileExist unvisited $host $path]} { - set inf [RobotFileOpen unvisited $host $path r] + if {![RobotFileExist $task bad $host $path]} { + if {[RobotFileExist $task unvisited $host $path]} { + set inf [RobotFileOpen $task unvisited $host $path r] RobotReadRecord $inf oldurl olddistance RobotFileClose $inf } @@ -471,12 +644,12 @@ proc link {url out href body distance} { set olddistance 1000 } if {[expr $distance < $olddistance]} { - set outf [RobotFileOpen unvisited $host $path] + set outf [RobotFileOpen $task unvisited $host $path] RobotWriteRecord $outf $url $distance RobotFileClose $outf } } elseif {[string compare $href $url]} { - set inf [RobotFileOpen visited $host $path r] + set inf [RobotFileOpen $task visited $host $path r] RobotReadRecord $inf xurl olddistance close $inf if {[string length $olddistance] == 0} { @@ -486,15 +659,15 @@ proc link {url out href body distance} { puts "OK remarking url=$url href=$href" puts "olddistance = $olddistance" puts "newdistance = $distance" - set outf [RobotFileOpen unvisited $host $path] + set outf [RobotFileOpen $task unvisited $host $path] RobotWriteRecord $outf $url $distance RobotFileClose $outf } } } -proc RobotTextHtml {url out} { - global URL maxdistance +proc RobotTextHtml {task url out} { + global URL control # set title so we can emit it for the body set title {} @@ -505,11 +678,11 @@ proc RobotTextHtml {url out} { set distance 0 set fdistance 0 - if {$maxdistance < 1000 && [info exists URL($url,dist)]} { - set fdistance $URL($url,dist) + if {$control($task,distance) < 1000 && [info exists URL($task,$url,dist)]} { + set fdistance $URL($task,$url,dist) set distance [expr $fdistance + 1] } - htmlSwitch $URL($url,buf) \ + htmlSwitch $URL($task,$url,buf) \ title { set title $body } -nonest meta { @@ -562,8 +735,8 @@ proc RobotTextHtml {url out} { continue } set href [string trim $parm(href)] - if {![RobotHref $url href host path]} continue - set URL($url,bpath) $path + if {![RobotHref $task $url href host path]} continue + set URL($task,$url,bpath) $path } a { # .. # we're not using nonest - otherwise body isn't set @@ -571,32 +744,33 @@ proc RobotTextHtml {url out} { if {![info exists parm(href)]} { continue } - link $url $out [string trim $parm(href)] $body $distance + link $task $url $out [string trim $parm(href)] $body $distance } -nonest area { if {$nofollow} continue if {![info exists parm(href)]} { continue } - link $url $out [string trim $parm(href)] $body $distance + link $task $url $out [string trim $parm(href)] $body $distance } -nonest frame { if {![info exists parm(src)]} { continue } - link $url $out [string trim $parm(src)] $body $fdistance + link $task $url $out [string trim $parm(src)] $body $fdistance } } -proc RobotsTxt {url} { +proc RobotsTxt {task url} { global agent URL - RobotsTxt0 URL(URL($url,hostport),robots) $URL($url,buf) + RobotsTxt0 $task URL(URL($task,$url,hostport),robots) $URL($task,$url,buf) } -proc RobotsTxt0 {v buf} { +proc RobotsTxt0 {task v buf} { global URL agent set section 0 foreach l [split $buf \n] { if {[regexp {([-A-Za-z]+):[ ]*([^\# ]+)} $l match cmd arg]} { + set arg [string trim $arg] puts "cmd=$cmd arg=$arg" switch -- [string tolower $cmd] { user-agent { @@ -621,186 +795,189 @@ proc RobotsTxt0 {v buf} { } } -proc RobotTextPlain {url out} { +proc RobotTextPlain {task url out} { global URL puts $out "" - regsub -all {<} $URL($url,buf) {\<} content + regsub -all {<} $URL($task,$url,buf) {\<} content puts $out $content puts $out "" - if {![string compare $URL($url,path) /robots.txt]} { - RobotsTxt $url + if {![string compare $URL($task,$url,path) /robots.txt]} { + RobotsTxt $task $url } } -proc RobotWriteMetadata {url out} { - global URL domains +proc RobotWriteMetadata {task url out} { + global URL puts $out "" set distance 1000 - if {[RobotFileExist unvisited $URL($url,hostport) $URL($url,path)]} { - set inf [RobotFileOpen unvisited $URL($url,hostport) $URL($url,path) r] + if {[RobotFileExist $task unvisited $URL($task,$url,hostport) $URL($task,$url,path)]} { + set inf [RobotFileOpen $task unvisited $URL($task,$url,hostport) $URL($task,$url,path) r] RobotReadRecord $inf fromurl distance RobotFileClose $inf } - set URL($url,dist) $distance + set URL($task,$url,dist) $distance puts $out "" puts $out " $distance" puts $out "" - headSave $url $out + headSave $task $url $out puts "Parsing $url distance=$distance" - switch $URL($url,head,content-type) { + switch $URL($task,$url,head,content-type) { text/html { if {[string length $distance]} { - RobotTextHtml $url $out + RobotTextHtml $task $url $out } } text/plain { - RobotTextPlain $url $out + RobotTextPlain $task $url $out } } puts $out "" } -proc Robot200 {url} { - global URL domains +proc Robot200 {task url} { + global URL - set out [RobotFileOpen raw $URL($url,hostport) $URL($url,path)] - puts -nonewline $out $URL($url,buf) + set out [RobotFileOpen $task raw $URL($task,$url,hostport) $URL($task,$url,path)] + puts -nonewline $out $URL($task,$url,buf) RobotFileClose $out - set out [RobotFileOpen visited $URL($url,hostport) $URL($url,path)] - RobotWriteMetadata $url $out + set out [RobotFileOpen $task visited $URL($task,$url,hostport) $URL($task,$url,path)] + RobotWriteMetadata $task $url $out RobotFileClose $out - RobotFileUnlink unvisited $URL($url,hostport) $URL($url,path) + RobotFileUnlink $task unvisited $URL($task,$url,hostport) $URL($task,$url,path) } -proc RobotReadContent {url sock binary} { +proc RobotReadContent {task url sock binary} { global URL set buffer [read $sock 16384] set readCount [string length $buffer] if {$readCount <= 0} { - Robot200 $url - RobotRestart $url $sock + Robot200 $task $url + RobotRestart $task $url $sock } elseif {!$binary && [string first \0 $buffer] >= 0} { - Robot200 $url - RobotRestart $url $sock + Robot200 $task $url + RobotRestart $task $url $sock } else { # puts "Got $readCount bytes" - set URL($url,buf) $URL($url,buf)$buffer + set URL($task,$url,buf) $URL($task,$url,buf)$buffer } } -proc RobotReadHeader {url sock} { +proc RobotReadHeader {task url sock} { global URL debuglevel if {$debuglevel > 1} { puts "HTTP head $url" } if {[catch {set buffer [read $sock 2148]}]} { - RobotError $url 404 - RobotRestart $url $sock + RobotError $task $url 404 + RobotRestart $task $url $sock return } set readCount [string length $buffer] if {$readCount <= 0} { - RobotError $url 404 - RobotRestart $url $sock + RobotError $task $url 404 + RobotRestart $task $url $sock } else { # puts "Got $readCount bytes" - set URL($url,buf) $URL($url,buf)$buffer + set URL($task,$url,buf) $URL($task,$url,buf)$buffer - set n [string first \r\n\r\n $URL($url,buf)] + set n [string first \r\n\r\n $URL($task,$url,buf)] if {$n > 1} { set code 0 set version {} - set headbuf [string range $URL($url,buf) 0 $n] + set headbuf [string range $URL($task,$url,buf) 0 $n] incr n 4 - set URL($url,buf) [string range $URL($url,buf) $n end] + set URL($task,$url,buf) [string range $URL($task,$url,buf) $n end] regexp {^HTTP/([0-9.]+)[ ]+([0-9]+)} $headbuf x version code set lines [split $headbuf \n] foreach line $lines { if {[regexp {^([^:]+):[ ]+([^;]*)} $line x name value]} { - set URL($url,head,[string tolower $name]) [string trim $value] + set URL($task,$url,head,[string tolower $name]) [string trim $value] } } puts "HTTP CODE $code" - set URL($url,state) skip + set URL($task,$url,state) skip switch $code { 301 { - RobotRedirect $url $URL($url,head,location) 301 - RobotRestart $url $sock + RobotRedirect $task $url $URL($task,$url,head,location) 301 + RobotRestart $task $url $sock } 302 { - RobotRedirect $url $URL($url,head,location) 302 - RobotRestart $url $sock + RobotRedirect $task $url $URL($task,$url,head,location) 302 + RobotRestart $task $url $sock } 200 { - if {![info exists URL($url,head,content-type)]} { - set URL($url,head,content-type) {} + if {![info exists URL($task,$url,head,content-type)]} { + set URL($task,$url,head,content-type) {} } set binary 1 - switch -glob -- $URL($url,head,content-type) { + switch -glob -- $URL($task,$url,head,content-type) { text/* { set binary 0 } } if {![regexp {/robots.txt$} $url]} { - if {![checkrule mime $URL($url,head,content-type)]} { - RobotError $url mimedeny - RobotRestart $url $sock + if {![checkrule $task mime $URL($task,$url,head,content-type)]} { + RobotError $task $url mimedeny + RobotRestart $task $url $sock return } } - fileevent $sock readable [list RobotReadContent $url $sock $binary] + fileevent $sock readable [list RobotReadContent $task $url $sock $binary] } default { - RobotError $url $code - RobotRestart $url $sock + RobotError $task $url $code + RobotRestart $task $url $sock } } } } } -proc RobotSockCancel {url sock} { +proc RobotSockCancel {task url sock} { puts "RobotSockCancel sock=$sock url=$url" - RobotError $url 401 - RobotRestart $url $sock + RobotError $task $url 401 + RobotRestart $task $url $sock } -proc RobotConnect {url sock} { +proc RobotConnect {task url sock} { global URL agent acceptLanguage fconfigure $sock -translation {lf crlf} -blocking 0 - fileevent $sock readable [list RobotReadHeader $url $sock] - puts $sock "GET $URL($url,path) HTTP/1.0" - puts $sock "Host: $URL($url,host)" + fileevent $sock readable [list RobotReadHeader $task $url $sock] + puts $sock "GET $URL($task,$url,path) HTTP/1.0" + puts $sock "Host: $URL($task,$url,host)" puts $sock "User-Agent: $agent" if {[string length $acceptLanguage]} { puts $sock "Accept-Language: $acceptLanguage" } puts $sock "" - flush $sock - set URL($sock,cancel) [after 30000 [list RobotSockCancel $url $sock]] + set URL($sock,cancel) [after 30000 [list RobotSockCancel $task $url $sock]] + if {[catch {flush $sock}]} { + RobotError $task $url 404 + RobotRestart $task $url $sock + } } proc RobotNop {} { } -proc RobotGetUrl {url phost} { +proc RobotGetUrl {task url phost} { global URL robotsRunning flush stdout - puts "Retrieve $robotsRunning url=$url" + puts "Retrieve running=$robotsRunning url=$url task=$task" if {![regexp {([^:]+)://([^/]+)(.*)} $url x method hostport path]} { return -1 } @@ -808,25 +985,25 @@ proc RobotGetUrl {url phost} { set port 80 set host $hostport } - set URL($url,method) $method - set URL($url,host) $host - set URL($url,hostport) $hostport - set URL($url,path) $path - set URL($url,state) head - set URL($url,buf) {} + set URL($task,$url,method) $method + set URL($task,$url,host) $host + set URL($task,$url,hostport) $hostport + set URL($task,$url,path) $path + set URL($task,$url,state) head + set URL($task,$url,buf) {} if {[string compare $path /robots.txt]} { set ok 1 if {![info exists URL($hostport,robots)]} { puts "READING robots.txt for host $hostport" - if {[RobotFileExist visited $hostport /robots.txt]} { - set inf [RobotFileOpen visited $hostport /robots.txt r] + if {[RobotFileExist $task visited $hostport /robots.txt]} { + set inf [RobotFileOpen $task visited $hostport /robots.txt r] set buf [read $inf 32768] close $inf } else { set buf "User-agent: *\nAllow: /\n" } - RobotsTxt0 URL($hostport,robots) $buf + RobotsTxt0 $task URL($hostport,robots) $buf } if {[info exists URL($hostport,robots)]} { foreach l $URL($hostport,robots) { @@ -844,7 +1021,7 @@ proc RobotGetUrl {url phost} { if [catch {set sock [socket -async $host $port]}] { return -1 } - RobotConnect $url $sock + RobotConnect $task $url $sock return 0 } @@ -856,7 +1033,7 @@ if {![llength [info commands htmlSwitch]]} { } } -set agent "zmbot/0.1" +set agent "zmbot/0.2" if {![catch {set os [exec uname -s -r]}]} { set agent "$agent ($os)" } @@ -870,28 +1047,22 @@ proc bgerror {m} { } set robotsRunning 0 -set robotSeq 0 set workdir [pwd] -set idletime 60000 +set idletime 30000 set acceptLanguage {} set debuglevel 0 -set status(unvisited) 0 -set status(visited) 0 -set status(bad) 0 -set status(raw) 0 - # Rules: allow, deny, url -proc checkrule {type this} { - global alrules +proc checkrule {task type this} { + global control global debuglevel if {$debuglevel > 3} { puts "CHECKRULE $type $this" } - if {[info exist alrules]} { - foreach l $alrules { + if {[info exist control($task,alrules)]} { + foreach l $control($task,alrules) { if {$debuglevel > 3} { puts "consider $l" } @@ -938,11 +1109,11 @@ proc checkrule {type this} { proc url {href} { - global debuglevel + global debuglevel task - if {[RobotHref http://www.indexdata.dk/ href host path]} { - if {![RobotFileExist visited $host $path]} { - set outf [RobotFileOpen unvisited $host $path] + if {[RobotHref $task http://www.indexdata.dk/ href host path]} { + if {![RobotFileExist $task visited $host $path]} { + set outf [RobotFileOpen $task unvisited $host $path] RobotWriteRecord $outf href 0 RobotFileClose $outf } @@ -950,15 +1121,15 @@ proc url {href} { } proc deny {type stuff} { - global alrules + global control task - lappend alrules [list deny $type $stuff] + lappend control($task,alrules) [list deny $type $stuff] } proc allow {type stuff} { - global alrules + global control task - lappend alrules [list allow $type $stuff] + lappend control($task,alrules) [list allow $type $stuff] } proc debug {level} { @@ -967,6 +1138,37 @@ proc debug {level} { set debuglevel $level } +proc task {t} { + global tasks task status robotSeq control + + set task $t + + if {[info exists tasks]} { + if {[lsearch -exact $tasks $t] >= 0} { + return 0 + } + } + + lappend tasks $t + set status($t,unvisited) 0 + set status($t,visited) 0 + set status($t,bad) 0 + set status($t,raw) 0 + set status($t,active) 1 + set robotSeq($t) 0 + set control($t,distance) 10 + return 1 +} + +# Little utility that ensures that at least one task is present (main). +proc chktask {} { + global tasks + if {![info exist tasks]} { + task main + } +} + + # Parse options set i 0 @@ -979,9 +1181,25 @@ if {$l < 2} { exit 1 } + + while {$i < $l} { set arg [lindex $argv $i] switch -glob -- $arg { + -t* { + set t [string range $arg 2 end] + if {![string length $t]} { + set t [lindex $argv [incr i]] + } + task $t + } + -D* { + set dir [string range $arg 2 end] + if {![string length $dir]} { + set dir [lindex $argv [incr i]] + } + lappend daemon_dir $dir + } -j* { set robotsMax [string range $arg 2 end] if {![string length $robotsMax]} { @@ -989,17 +1207,19 @@ while {$i < $l} { } } -c* { - set maxdistance [string range $arg 2 end] - if {![string length $maxdistance]} { - set maxdistance [lindex $argv [incr i]] + chktask + set control($task,distance) [string range $arg 2 end] + if {![string length $control($task,distance)]} { + set control($task,distance) [lindex $argv [incr i]] } } -d* { + chktask set dom [string range $arg 2 end] if {![string length $dom]} { set dom [lindex $argv [incr i]] } - lappend domains $dom + lappend control($task,domains) $dom } -i* { set idletime [string range $arg 2 end] @@ -1008,23 +1228,30 @@ while {$i < $l} { } } -l* { + chktask set acceptLanguage [string range $arg 2 end] if {![string length $acceptLanguage]} { set acceptLanguage [lindex $argv [incr i]] } } -r* { + chktask set rfile [string range $arg 2 end] if {![string length $rfile]} { set rfile [lindex $argv [incr i]] } + catch {unset maxdistance} source $rfile + if {[info exists maxdistance]} { + set control($task,distance) $maxdistance + } } default { + chktask set href $arg - if {[RobotHref http://www.indexdata.dk/ href host path]} { - if {![RobotFileExist visited $host $path]} { - set outf [RobotFileOpen unvisited $host $path] + if {[RobotHref $task http://www.indexdata.dk/ href host path]} { + if {![RobotFileExist $task visited $host $path]} { + set outf [RobotFileOpen $task unvisited $host $path] RobotWriteRecord $outf href 0 RobotFileClose $outf } @@ -1034,29 +1261,36 @@ while {$i < $l} { incr i } -if {![info exist domains]} { - set domains {*} -} -if {![info exist maxdistance]} { - set maxdistance 50 -} if {![info exist robotsMax]} { set robotsMax 5 } -puts "domains=$domains" -puts "max distance=$maxdistance" -puts "max jobs=$robotsMax" - - -RobotStart - - -while {$robotsRunning} { - vwait robotsRunning +if {[info exist daemon_dir]} { + RobotDaemonLoop +} else { + foreach t $tasks { + puts "task $t" + puts "max distance=$control($t,distance)" + if {[info exists control($t,domains)]} { + puts "domains=$control($t,domains)" + } + } + puts "max jobs=$robotsMax" + + foreach t $tasks { + RobotStart $t + } + + while {$robotsRunning} { + vwait robotsRunning + } + + if {[info exists tasks]} { + foreach t $tasks { + set statusfile [open $t/status w] + puts $statusfile "$status($t,unvisited) $status($t,bad) $status($t,visited)" + close $statusfile + } + } } -set statusfile [open status w] -puts $statusfile "$status(unvisited) $status(bad) $status(visited)" -close $statusfile - -- 1.7.10.4