[Perlfect-search] patch to support robots.txt

Jerrad Pierce belg4mit@MIT.EDU
Wed, 20 Mar 2002 19:26:15 -0500
As promised, here is a patch to support robots.txt. It requires libwww-perl,
since it hacks in code reuse from WWW::RobotRules. It's untested with HTTP
indexing (I use filesystem indexing), but I anticipate no issues. If the
reuse seems too hacked in, the second patch below makes it trivial to
support HTTP indexing properly. To use it, add something to the effect of
the following to your conf.pl, and of course add rules to your robots.txt
at your discretion:

  # Play nice
  $ROBOT_AGENT = "Perlfect/Search";
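
For example, a robots.txt rule that keeps the indexer (and only the
indexer) out of one directory might look like this; the path is purely
illustrative:

  User-agent: Perlfect
  Disallow: /scratch/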

Enjoy!

PS> Of course the last chunk of the first patch is simply a matter of taste,
    except for the addition of $ROBOT_AGENT to the list, though I suspect
    the compiler might be able to optimize it better.
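
PPS> For anyone who wants to see what's being reused before applying the
     patch, the WWW::RobotRules calls boil down to the following sketch
     (the URL and rules here are made up):

  use WWW::RobotRules;

  # Same agent name as $ROBOT_AGENT in conf.pl
  my $rules = WWW::RobotRules->new("Perlfect/Search");

  # Hand it the robots.txt contents, keyed by the URL it came from
  my $robots_txt = "User-agent: Perlfect\nDisallow: /scratch/\n";
  $rules->parse("http://localhost/robots.txt", $robots_txt);

  # Then ask before indexing each document
  print "skipped\n" unless $rules->allowed("http://localhost/scratch/a.html");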

=cut 1
--- indexer.bak Wed Mar 20 21:35:15 2002
+++ indexer.pl  Thu Mar 21 00:11:36 2002
@@ -116,6 +116,35 @@

 print "Building string of special characters...\n";
 build_char_string();
+if( $ROBOT_AGENT ){
+  print "Loading robots.txt...\n";
+  eval "use WWW::RobotRules";
+  if( $@ ){
+    # Degrade gracefully if libwww-perl is missing (the error is in $@, not $!)
+    warn("Ignoring robots.txt, is libwww installed? $@");
+    $ROBOT_AGENT = '';
+  }
+  else{
+    $ROBOT = WWW::RobotRules->new($ROBOT_AGENT);
+    my($robots_txt, $url);
+    if( $HTTP_START_URL ) {
+      # get_url() is provided by indexer_web.pl, which has its own user agent
+      (undef, $robots_txt) = get_url($url = $HTTP_START_URL.'/robots.txt');
+    }
+    else{
+      $url = "$DOCUMENT_ROOT/robots.txt";
+      if( open(ROBOT, $url) ){
+        read(ROBOT, $robots_txt, -s $url);
+        close(ROBOT);
+      }
+      else{
+        warn("Could not fetch robots.txt: $!");
+      }
+    }
+    # TODO: report robots.txt exclusions in the output, as with no_index?
+    $ROBOT->parse($url, $robots_txt) if $robots_txt;
+  }
+}
 print "Loading \'no index\' regular expressions:\n";
 load_excludes();
 print "Loading stopwords...";
@@ -477,6 +506,11 @@
       return "listed in no_index.txt";
     }
   }
+  if( $ROBOT_AGENT ){
+    # Rules are keyed on URLs, so fake one up when indexing the filesystem
+    my $url = $HTTP_START_URL ? $file : "http://localhost$file_relative";
+    return "disallowed by robots.txt" unless $ROBOT->allowed($url);
+  }
   # For PDF files check filename for security reasons (it later gets handed to a shell!):
   if( $file =~ m/\.pdf$/i && $PDFTOTEXT ) {
     if( $file !~ m/^[\/\\a-zA-Z0-9_.:+-]*$/ || $file =~ m/\.\./ ) {
@@ -569,20 +603,22 @@
 # Shut up misguided -w warnings about "used only once". Has no functional meaning.
 sub warnings_sillyness {
   my $zz;
-  $zz = $SIZES_DB_FILE;
-  $zz = $TITLE_WEIGHT;
-  $zz = $SPECIAL_CHARACTERS;
-  $zz = $H_WEIGHT;
-  $zz = $INDEX_URLS;
-  $zz = $DESC_WORDS;
-  $zz = $INV_INDEX_DB_FILE;
-  $zz = $MINLENGTH;
-  $zz = $BASE_URL;
-  $zz = $DESC_DB_FILE;
-  $zz = $TITLES_DB_FILE;
-  $zz = $TERMS_DB_FILE;
-  $zz = $DOCS_DB_FILE;
-  $zz = $TMP_DIR;
-  $zz = $CONTENT_DB_FILE;
-  $zz = $INDEX_NUMBERS;
+  # NB: one chained statement; if this sub were ever called it would clobber these settings
+  $zz = $SIZES_DB_FILE
+      = $TITLE_WEIGHT
+      = $SPECIAL_CHARACTERS
+      = $H_WEIGHT
+      = $INDEX_URLS
+      = $DESC_WORDS
+      = $INV_INDEX_DB_FILE
+      = $MINLENGTH
+      = $BASE_URL
+      = $DESC_DB_FILE
+      = $TITLES_DB_FILE
+      = $TERMS_DB_FILE
+      = $DOCS_DB_FILE
+      = $TMP_DIR
+      = $CONTENT_DB_FILE
+      = $INDEX_NUMBERS
+      = $ROBOT_AGENT;
=cut 2
+++ indexer_web.pl      Thu Mar 21 00:15:20 2002
@@ -18,12 +18,21 @@
 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
 # USA

-use LWP::UserAgent;
+# This pulls in LWP::UserAgent for us; it saves an eval at the expense
+# of some potentially unused code
+use LWP::RobotUA;
 use URI;
 use MD5;

 my $md5 = new MD5;
-my $http_user_agent = LWP::UserAgent->new;
+my $http_user_agent;
+if( $ROBOT_AGENT ){
+  # The "from" argument should be an email address, but $BASE_URL is better than nothing
+  $http_user_agent = LWP::RobotUA->new($ROBOT_AGENT, $BASE_URL);
+}
+else{
+  $http_user_agent = LWP::UserAgent->new;
+}
 my $host = "";
 my $base = "";
 my %list;
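
For what it's worth, LWP::RobotUA is a drop-in replacement for
LWP::UserAgent that fetches and obeys robots.txt on its own and throttles
its requests; roughly like this (the URL and contact address are made up):

  use LWP::RobotUA;
  use HTTP::Request;

  # new() takes the agent name and a contact address
  my $ua = LWP::RobotUA->new("Perlfect/Search", 'webmaster@example.com');
  $ua->delay(1);  # at most one request per minute per host

  # A URL disallowed by robots.txt comes back as a 403 instead of being fetched
  my $res = $ua->request(HTTP::Request->new(GET => "http://localhost/index.html"));
  print $res->code, "\n";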