From: Adam Dickmeiss
Date: Wed, 6 Jun 2001 07:10:31 +0000 (+0000)
Subject: Added README. Ignore case in keywords in robots.txt.
X-Git-Tag: ZMBOT.0.1~23
X-Git-Url: http://sru.miketaylor.org.uk/cgi-bin?a=commitdiff_plain;h=9d508bb1bb6e7479fb9e6753797fc78151cfc0e4;p=tclrobot.git

Added README. Ignore case in keywords in robots.txt.
---

diff --git a/README b/README
new file mode 100644
index 0000000..e7b1184
--- /dev/null
+++ b/README
@@ -0,0 +1,71 @@
+zmbot: a simple Web harvesting robot for Z'mbol.
+
+Introduction
+
+ zmbot is a simple Web harvester written in Tcl. The following
+ summarizes its features:
+
+  o Simple administration. One script does the job and no external
+    database is required to operate.
+
+  o Interruptible. Harvesting may safely be stopped/interrupted at any
+    point.
+
+  o Gentle harvesting. By default a site is visited at most once per
+    minute, and robots.txt is honored.
+
+  o Concurrent harvesting (jobs) in one process and one thread.
+
+  o Inspects the Content-Type header to determine the structure of each
+    page.
+
+  o Written in Tcl and quite portable. (Some may not regard this as a
+    feature; a Perl version is welcome!)
+
+  o Creates simple XML output. One file per URL.
+
+ The robot is started from the command line and takes one or more URLs
+ as parameters. Options, prefixed with a minus, alter the behaviour of
+ the harvesting. The following options are supported:
+
+  -j jobs    The maximum number of concurrent HTTP sessions; default 5 jobs.
+
+  -i idle    Idle time in milliseconds between visits to the same site;
+             default 60000 = 60 seconds.
+
+  -c count   Maximum distance from the original URL as given on the command
+             line; default 50.
+
+  -d domain  Only sites matching domain are visited. The domain given is
+             a Tcl glob expression (e.g. *.somewhere.com). Remember to
+             quote the domain when given on the command line so that your
+             shell doesn't expand it. This option may be repeated, thus
+             allowing you to specify many "allowed" domains.
+
+ Example 1: Harvest three links away from www.somewhere.com, world-wide:
+   ./robot.tcl -c 3 http://www.somewhere.com/
+
+ Example 2: Harvest the site www.somewhere.com only:
+   ./robot.tcl -d www.somewhere.com http://www.somewhere.com/
+
+ Example 3: Harvest up to two clicks from www.a.dk and www.b.dk in the dk domain:
+   ./robot.tcl -d '*.dk' -c 2 http://www.a.dk/ http://www.b.dk/
+
+ The zmbot robot creates three directories, visited, unvisited and bad,
+ for visited pages, unvisited pages and bad pages respectively. The
+ visited area holds keywords and metadata for all successfully retrieved
+ pages. The unvisited area serves as a "todo" list of pages to be visited
+ in the future. The bad area holds pages that for some reason cannot be
+ retrieved: non-existent, permission denied, robots.txt disallow, etc.
+
+Installation:
+
+   $ ./configure
+   $ make
+
+ The configure script looks for the Tcl shell, tclsh, to determine the
+ location of Tcl and its configuration file tclConfig.sh. To manually
+ specify Tcl's location, add --with-tclconfig and specify the directory
+ where tclConfig.sh is installed. For example:
+
+   ./configure --with-tclconfig=/usr/local/lib
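
 The -d option above relies on Tcl's built-in glob matching (string match),
 the same call robot.tcl uses to match User-agent patterns. The sketch below
 shows how such a domain filter behaves; the DomainAllowed proc and the list
 of allowed patterns are illustrative only and are not part of robot.tcl:

   # Minimal sketch of glob-based domain filtering, as described for -d.
   # DomainAllowed is a hypothetical helper, not a proc in robot.tcl.
   proc DomainAllowed {host domains} {
       foreach pat $domains {
           # string match applies Tcl glob rules, e.g. *.somewhere.com
           if {[string match $pat $host]} {
               return 1
           }
       }
       return 0
   }

   puts [DomainAllowed www.a.dk {*.dk}]            ;# 1: host falls under *.dk
   puts [DomainAllowed www.somewhere.com {*.dk}]   ;# 0: host is filtered out
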
diff --git a/robot.tcl b/robot.tcl
index 5bd9f82..5c2b518 100755
--- a/robot.tcl
+++ b/robot.tcl
@@ -1,5 +1,5 @@
 #!/usr/bin/tclsh
-# $Id: robot.tcl,v 1.15 2001/06/05 08:44:50 adam Exp $
+# $Id: robot.tcl,v 1.16 2001/06/06 07:10:31 adam Exp $
 #
 proc RobotFileNext1 {area lead} {
     puts "RobotFileNext1 area=$area lead=$lead"
@@ -484,6 +484,56 @@ proc RobotTextHtml {url out} {
             }
         }
     }
+    } -nonest area {
+        if {![info exists parm(href)]} {
+            puts "no href"
+            continue
+        }
+        if {[expr $distance <= $maxDistance]} {
+            set href [string trim $parm(href)]
+            if {![RobotHref $url href host path]} continue
+
+            puts $out ""
+            puts $out "$href"
+            puts $out ""
+            puts $out ""
+
+            if {![RobotFileExist visited $host $path]} {
+                set olddistance 1000
+                if {![RobotFileExist bad $host $path]} {
+                    if {[RobotFileExist unvisited $host $path]} {
+                        set inf [RobotFileOpen unvisited $host $path r]
+                        RobotReadRecord $inf oldurl olddistance
+                        RobotFileClose $inf
+                    }
+                } else {
+                    set olddistance 0
+                }
+                if {[string length $olddistance] == 0} {
+                    set olddistance 1000
+                }
+                if {[expr $distance < $olddistance]} {
+                    set outf [RobotFileOpen unvisited $host $path]
+                    RobotWriteRecord $outf $url $distance
+                    RobotFileClose $outf
+                }
+            } elseif {[string compare $href $url]} {
+                set inf [RobotFileOpen visited $host $path r]
+                RobotReadRecord $inf xurl olddistance
+                close $inf
+                if {[string length $olddistance] == 0} {
+                    set olddistance 1000
+                }
+                if {[expr $distance < $olddistance]} {
+                    puts "OK remarking url=$url href=$href"
+                    puts "olddistance = $olddistance"
+                    puts "newdistance = $distance"
+                    set outf [RobotFileOpen unvisited $host $path]
+                    RobotWriteRecord $outf $url $distance
+                    RobotFileClose $outf
+                }
+            }
+        }
     }
 }
 
@@ -497,21 +547,21 @@ proc RobotsTxt0 {v buf} {
     global URL agent
     set section 0
     foreach l [split $buf \n] {
-        if {[regexp {([-A-Za-z]+):[ \t]*([^\#]+)} $l match cmd arg]} {
+        if {[regexp {([-A-Za-z]+):[ \t]*([^\#\t ]+)} $l match cmd arg]} {
             puts "cmd=$cmd arg=$arg"
-            switch $cmd {
-                User-Agent {
+            switch -- [string tolower $cmd] {
+                user-agent {
                     if {$section} break
                     set pat [string tolower $arg]*
                     set section [string match $pat $agent]
                 }
-                Disallow {
+                disallow {
                     if {$section} {
                         puts "rule [list 0 $arg]"
                         lappend $v [list 0 $arg]
                     }
                 }
-                Allow {
+                allow {
                     if {$section} {
                         puts "rule [list 1 $arg]"
                         lappend $v [list 1 $arg]
@@ -707,7 +757,7 @@ proc RobotGetUrl {url phost} {
             set buf [read $inf 32768]
             close $inf
         } else {
-            set buf "User-Agent: *\nAllow: /\n"
+            set buf "User-agent: *\nAllow: /\n"
         }
         RobotsTxt0 URL($hostport,robots) $buf
     }
@@ -720,6 +770,7 @@
             }
         }
         if {!$ok} {
+            puts "skipped due to robots.txt"
             return -1
         }
     }
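
 The change to RobotsTxt0 above lowercases each robots.txt keyword before the
 switch, so User-Agent, user-agent and USER-AGENT are treated alike. The
 sketch below reduces that parsing loop to a standalone, runnable script; the
 sample robots.txt text and the local rules list are made up for illustration,
 whereas the real RobotsTxt0 appends rules to a caller-supplied variable and
 is invoked from RobotGetUrl:

   # Reduced, standalone rendering of the case-insensitive keyword
   # handling shown in RobotsTxt0. Sample input and the rules list are
   # illustrative only.
   set agent zmbot
   set section 0
   set rules {}
   set buf "User-AGENT: *\nDisallow: /cgi-bin\nALLOW: /docs\n"
   foreach l [split $buf \n] {
       if {[regexp {([-A-Za-z]+):[ \t]*([^\#\t ]+)} $l match cmd arg]} {
           switch -- [string tolower $cmd] {
               user-agent {
                   if {$section} break
                   set pat [string tolower $arg]*
                   set section [string match $pat $agent]
               }
               disallow {
                   if {$section} { lappend rules [list 0 $arg] }
               }
               allow {
                   if {$section} { lappend rules [list 1 $arg] }
               }
           }
       }
   }
   puts $rules   ;# {0 /cgi-bin} {1 /docs}, regardless of keyword case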