#!/usr/bin/perl
# Simple brute-force robots.txt validator.
#
# Reads URLs, one per line, from the files named on the command line (or
# from STDIN when none are given).  For each URL it fetches the robots.txt
# of the URL's origin (scheme://authority) whenever the origin differs from
# that of the previous line, and then reports on STDOUT whether the
# "Robo Check" agent may retrieve the URL:
#
#   GOOD<TAB>url     the rules allow the URL
#   BAD<TAB>url      the rules disallow the URL
#   ILLEGAL: url     the line has no usable host part
#   REJECTED: url    the line is not an http:// or https:// URL
#
# Progress messages (FETCHING / GOT) are written to STDERR.
# Sorting the input by host keeps the number of robots.txt fetches low,
# because only the most recently seen origin is remembered.

use strict;
use warnings;

use URI;
use WWW::RobotRules;
use LWP::Simple qw($ua get);

my $AGENT = 'Robo Check';

my $rules = WWW::RobotRules->new($AGENT);

# Do not hang forever on an unresponsive host.
$ua->timeout(10);

# Origin whose robots.txt was fetched most recently; empty until the first
# valid URL has been seen.
my $host = '';

URL:
while (my $line = <>) {
    chomp $line;

    # Undo one HTML-escaped ampersand.
    # NOTE(review): the pattern as found matched only the literal "&;"; the
    # /i flag suggests "amp" was lost in transit, so both forms are accepted
    # here -- confirm against the real input data.
    $line =~ s/&(?:amp)?;/&/i;

    # Some URI classes (e.g. mailto:) have no host() method at all, and
    # host() may return undef, so probe before measuring the length.
    my $u = URI->new($line);
    my $hostname = ($u && $u->can('host')) ? $u->host : undef;
    unless (defined $hostname && length($hostname) > 3) {
        print "ILLEGAL: $line\n";
        next URL;
    }

    # Only http and https URLs are checked.  The authority stops at the
    # first '/', '?' or '#' so a query or fragment never leaks into it.
    unless ($line =~ m{\A(https?)://([^/?#]+)}) {
        print "REJECTED: $line\n";
        next URL;
    }
    my $nhost = "$1://$2";
    my $url   = $line;

    if ($nhost ne $host) {
        $host = $nhost;
        $rules->agent($AGENT);

        my $robots_url = "$host/robots.txt";
        print STDERR "FETCHING: $robots_url\n";

        # get() returns undef when the fetch fails; report that as length 0
        # and leave the rule set untouched.
        my $robots_txt = get($robots_url);
        printf STDERR "GOT %d length\n",
            defined $robots_txt ? length($robots_txt) : 0;
        $rules->parse($robots_url, $robots_txt) if defined $robots_txt;
    }

    if ($rules->allowed($url)) {
        print "GOOD\t$url\n";
    }
    else {
        print "BAD\t$url\n";
    }
}