--- /dev/null
+zmbot: a simple Web harvesting robot for Z'mbol.
+
+Introduction
+
+ zmbot is a simple web harvester written in Tcl. The following
+  summarizes its features:
+
+  o Simple administration. One script does the job, and no external
+    database is required.
+
+ o Interruptible. Harvesting may safely be stopped/interrupted at any
+ point.
+
+  o Gentle harvesting. By default a site is visited at most once per
+    minute, and robots.txt is honored.
+
+ o Concurrent harvesting (jobs) in one process and one thread.
+
+  o Inspects the Content-Type header to determine the structure of a page.
+
+  o Written in Tcl, so it is quite portable. (Some may not consider this
+    a feature; a Perl version is welcome!)
+
+  o Creates simple XML output, one file per URL (see the example below).
+
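+  As an illustration, each link found on a harvested page is recorded in
+  that page's XML file as a small <cr> element, as can be seen in
+  robot.tcl below. The rest of the per-page record (keywords, metadata)
+  is not shown in this excerpt, and the URL is made up:
+
+    <cr>
+     <identifier>http://www.somewhere.com/about.html</identifier>
+     <description></description>
+    </cr>
+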
+  The robot is started from the command line and takes one or more URLs
+  as parameters. Options, prefixed with a minus sign, alter the behaviour
+  of the harvesting. The following options are supported:
+
+ -j jobs The maximum number of concurrent HTTP sessions; default 5 jobs.
+
+  -i idle   Idle time in milliseconds between visits to the same site;
+            default 60000 = 60 seconds.
+
+  -c count  Maximum distance (in links followed) from the original URL as
+            given on the command line; default 50.
+
+  -d domain Only sites matching domain are visited. The domain given is
+            a Tcl glob expression (e.g. *.somewhere.com). Remember to
+            quote the domain on the command line so that your shell does
+            not expand it. This option may be repeated, allowing you to
+            specify several allowed domains.
+
+  Example 1: Harvest up to three links away from www.somewhere.com, world-wide:
+    ./robot.tcl -c 3 http://www.somewhere.com/
+
+  Example 2: Harvest the site www.somewhere.com only:
+ ./robot.tcl -d www.somewhere.com http://www.somewhere.com/
+
+  Example 3: Harvest up to two clicks from www.a.dk and www.b.dk in the dk domain:
+ ./robot.tcl -d '*.dk' -c 2 http://www.a.dk/ http://www.b.dk/
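+
+  The options may be combined; for instance, to run 10 concurrent jobs
+  with a 2 second pause between visits to the same site, restricted to
+  the somewhere.com domain:
+    ./robot.tcl -j 10 -i 2000 -d '*.somewhere.com' http://www.somewhere.com/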
+
+  The zmbot robot creates three directories, visited, unvisited and bad,
+  which hold visited pages, unvisited pages and bad pages respectively.
+  The visited area holds keywords and metadata for all successfully
+  retrieved pages. The unvisited area serves as a "todo" list of pages
+  still to be visited. The bad area holds pages that for some reason
+  cannot be retrieved: non-existent, permission denied, disallowed by
+  robots.txt, etc.
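+
+  For example, after a short run against www.somewhere.com the three
+  areas might contain files along these lines (illustrative only; the
+  exact layout appears to mirror the host and path of each URL, as
+  suggested by the RobotFileOpen calls in robot.tcl):
+
+    visited/www.somewhere.com/index.html
+    unvisited/www.somewhere.com/about.html
+    bad/www.somewhere.com/members/secret.html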
+
+Installation:
+
+ $ ./configure
+ $ make
+
+ The configure script looks for the Tcl shell, tclsh, to determine the
+  location of Tcl and its configuration file tclConfig.sh. To specify
+  Tcl's location manually, use --with-tclconfig and give the directory
+  where tclConfig.sh is installed. For example:
+ ./configure --with-tclconfig=/usr/local/lib
+
#!/usr/bin/tclsh
-# $Id: robot.tcl,v 1.15 2001/06/05 08:44:50 adam Exp $
+# $Id: robot.tcl,v 1.16 2001/06/06 07:10:31 adam Exp $
#
proc RobotFileNext1 {area lead} {
puts "RobotFileNext1 area=$area lead=$lead"
}
}
}
+ } -nonest area {
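+        # Handler for an <area> tag: if the link is within the distance
+        # limit, record its href as a <cr> element and queue the target
+        # for harvesting.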
+ if {![info exists parm(href)]} {
+ puts "no href"
+ continue
+ }
+        if {$distance <= $maxDistance} {
+ set href [string trim $parm(href)]
+ if {![RobotHref $url href host path]} continue
+
+ puts $out "<cr>"
+ puts $out "<identifier>$href</identifier>"
+ puts $out "<description></description>"
+ puts $out "</cr>"
+
+ if {![RobotFileExist visited $host $path]} {
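+                # Not visited yet. A distance of 1000 stands for "unknown";
+                # 0 is used for pages already marked bad, so they are never
+                # queued again.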
+ set olddistance 1000
+ if {![RobotFileExist bad $host $path]} {
+ if {[RobotFileExist unvisited $host $path]} {
+ set inf [RobotFileOpen unvisited $host $path r]
+ RobotReadRecord $inf oldurl olddistance
+ RobotFileClose $inf
+ }
+ } else {
+ set olddistance 0
+ }
+ if {[string length $olddistance] == 0} {
+ set olddistance 1000
+ }
+                if {$distance < $olddistance} {
+ set outf [RobotFileOpen unvisited $host $path]
+ RobotWriteRecord $outf $url $distance
+ RobotFileClose $outf
+ }
+ } elseif {[string compare $href $url]} {
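+                # Already visited and not a self-reference: if this link
+                # reaches the page by a shorter path, put it back on the
+                # unvisited list so the smaller distance is recorded.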
+ set inf [RobotFileOpen visited $host $path r]
+ RobotReadRecord $inf xurl olddistance
+                RobotFileClose $inf
+ if {[string length $olddistance] == 0} {
+ set olddistance 1000
+ }
+                if {$distance < $olddistance} {
+ puts "OK remarking url=$url href=$href"
+ puts "olddistance = $olddistance"
+ puts "newdistance = $distance"
+ set outf [RobotFileOpen unvisited $host $path]
+ RobotWriteRecord $outf $url $distance
+ RobotFileClose $outf
+ }
+ }
+ }
}
}
global URL agent
set section 0
foreach l [split $buf \n] {
- if {[regexp {([-A-Za-z]+):[ \t]*([^\#]+)} $l match cmd arg]} {
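+        # A robots.txt line has the form "Field: value"; the value stops
+        # at whitespace or '#', so trailing comments are ignored.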
+ if {[regexp {([-A-Za-z]+):[ \t]*([^\#\t ]+)} $l match cmd arg]} {
puts "cmd=$cmd arg=$arg"
- switch $cmd {
- User-Agent {
+ switch -- [string tolower $cmd] {
+ user-agent {
if {$section} break
set pat [string tolower $arg]*
set section [string match $pat $agent]
}
- Disallow {
+ disallow {
if {$section} {
puts "rule [list 0 $arg]"
lappend $v [list 0 $arg]
}
}
- Allow {
+ allow {
if {$section} {
puts "rule [list 1 $arg]"
lappend $v [list 1 $arg]
set buf [read $inf 32768]
close $inf
} else {
- set buf "User-Agent: *\nAllow: /\n"
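+        # No robots.txt could be read; fall back to a default that
+        # allows everything.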
+ set buf "User-agent: *\nAllow: /\n"
}
RobotsTxt0 URL($hostport,robots) $buf
}
}
}
if {!$ok} {
+ puts "skipped due to robots.txt"
return -1
}
}