Perlfect Solutions
 

[Perlfect-search] problems with 3.31b

Luhman, Rick perlfect-search@perlfect.com
Tue, 25 Mar 2003 13:02:57 -0600
This message is in MIME format. Since your mail reader does not understand
this format, some or all of this message may not be legible.

------_=_NextPart_000_01C2F301.25CDDB00
Content-Type: text/plain

Tools.pl file for 3.31b

-----Original Message-----
From: Daniel Naber [mailto:daniel.naber@t-online.de] 
Sent: Thursday, March 20, 2003 3:44 PM
To: perlfect-search@perlfect.com
Subject: Re: [Perlfect-search] problems with 3.31b

On Thursday 20 March 2003 21:57, Luhman, Rick wrote:

> '-' is not recognized as an internal or external command,
> operable program or batch file.

Please send me the "tools.pl" file from your Perlfect Search installation.

Regards
 Daniel

-- 
http://www.danielnaber.de

_______________________________________________
perlfect-search mailing list
perlfect-search@perlfect.com
To unsubscribe, set other personal options or view the list archives please
visit:





------_=_NextPart_000_01C2F301.25CDDB00
Content-Type: application/octet-stream;
        name="tools.pl"
Content-Transfer-Encoding: quoted-printable
Content-Disposition: attachment;
        filename="tools.pl"

# Perlfect Search - tools.pl=0A=
# Mapping of Numbers to HTML entities and other useful functions=0A=
# Hash taken from htdig's SGMLEntities.cc (shortened)=0A=
#$rcs =3D ' $Id: tools.pl,v 1.13 2003/02/24 22:45:42 daniel Exp $ ' =
;=0A=
=0A=
# Copyright (C) 1999-2002 Giorgos Zervas <giorgos@perlfect.com> and =0A=
#  Daniel Naber <daniel.naber@t-online.de>=0A=
# =0A=
# This program is free software; you can redistribute it and/or =
modify=0A=
# it under the terms of the GNU General Public License as published =
by=0A=
# the Free Software Foundation; either version 2 of the License, or =
(at=0A=
# your option) any later version.=0A=
#=0A=
# This program is distributed in the hope that it will be useful, =
but=0A=
# WITHOUT ANY WARRANTY; without even the implied warranty of=0A=
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU=0A=
# General Public License for more details.=0A=
#=0A=
# You should have received a copy of the GNU General Public License=0A=
# along with this program; if not, write to the Free Software=0A=
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA =
02111-1307=0A=
# USA=0A=
=0A=
my $http_max_pages_counter =3D 0;=0A=
=0A=
# Remove some HTML special characters from a string. This is =
necessary=0A=
# to avoid cross site scripting attacks. =0A=
# See http://www.cert.org/advisories/CA-2000-02.html=0A=
sub cleanup {=0A=
  my $str =3D $_[0];=0A=
  if( ! defined($str) ) {=0A=
    return "";=0A=
  }=0A=
  $str =3D~ s/[<>"'&]/ /igs;=0A=
  return $str;=0A=
}=0A=
=0A=
# Escape some HTML special characters in a string. This is necessary=0A=
# to avoid cross site scripting attacks. =0A=
# See http://www.cert.org/advisories/CA-2000-02.html=0A=
sub html_escape {=0A=
  my $str =3D $_[0];=0A=
  if( ! defined($str) ) {=0A=
    return "";=0A=
  }=0A=
  $str =3D~ s/&/&amp;/igs;=0A=
  $str =3D~ s/</&lt;/igs;=0A=
  $str =3D~ s/>/&gt;/igs;=0A=
  $str =3D~ s/"/&quot;/igs;=0A=
  $str =3D~ s/'/&apos;/igs;=0A=
  return $str;=0A=
}=0A=
=0A=
sub init_config {=0A=
        if( $DOCUMENT_ROOT !~ m#/$# ) {=0A=
                # add ending slash if necessary=0A=
                $DOCUMENT_ROOT =3D $DOCUMENT_ROOT."/";=0A=
        }=0A=
        $BASE_URL =3D~ s/\/$//;         # remove trailing slash=0A=
}=0A=
=0A=
sub init_robot_check {=0A=
        my $base =3D shift;=0A=
        if( $ROBOT_AGENT && $HTTP_START_URL ) {=0A=
                eval "use WWW::RobotRules";=0A=
                if( $@ ) {=0A=
                        die("Cannot use robots.txt, maybe WWW::RobotRules is not installed? =
$!");=0A=
                }=0A=
                $main::robot =3D WWW::RobotRules->new($ROBOT_AGENT);=0A=
                my $url =3D "$base/robots.txt";=0A=
                print "Loading $url...\n";=0A=
                my $http_user_agent =3D LWP::UserAgent->new;=0A=
                my $robots_txt;=0A=
                (undef, undef, undef, $robots_txt) =3D get_url($http_user_agent, =
$url);=0A=
                if( $robots_txt ) {=0A=
                        $main::robot->parse($url, $robots_txt);=0A=
                } else {=0A=
                        warn("Not using any robots.txt.\n");=0A=
                }=0A=
        }=0A=
}=0A=
=0A=
sub isHTML {=0A=
  my $filename =3D shift;=0A=
  my $ext =3D get_suffix($filename);=0A=
  if( $ext ) {=0A=
    return grep(/^$ext$/i, @HIGHLIGHT_EXT);=0A=
  } else {=0A=
    return 0;=0A=
  }=0A=
}=0A=
=0A=
sub get_suffix {=0A=
  my $filename =3D shift;=0A=
  ($suffix) =3D ($filename =3D~ m/\.([^.]*)$/);=0A=
  return $suffix;=0A=
}=0A=
=0A=
sub filterFile {=0A=
  my $filename =3D shift;=0A=
  my $ext =3D shift;=0A=
  my $buffer;=0A=
  my $pid =3D open(FILE, "-|");=0A=
  my $tmp =3D $/;=0A=
  undef $/;=0A=
  # flush STDOUT;=0A=
  {=0A=
    local $|;=0A=
    $|=3D1;=0A=
    print "";=0A=
  }=0A=
  if( ! defined($pid) ) {=0A=
    warn "cannot fork: $!";=0A=
    return;=0A=
  }=0A=
  if( ! $pid ) {=0A=
    # child=0A=
    my $cmd =3D $EXT_FILTER{$ext};=0A=
    my @args =3D split(/\s+/, $cmd);=0A=
    foreach (@args) {=0A=
      if( $_ eq 'FILENAME' ) {=0A=
        $_ =3D~ s/FILENAME/$filename/g;=0A=
      }=0A=
    }=0A=
    # We're using exec() with a list of argument because =0A=
    # this makes sure the filename is escaped automatically:=0A=
    exec(@args) || die "can't exec program $args[0]: $!";=0A=
    # NOTREACHED=0A=
  }=0A=
  $buffer =3D <FILE>;=0A=
  close(FILE);=0A=
  $/ =3D $tmp;=0A=
  return $buffer;=0A=
}=0A=
=0A=
sub debug {=0A=
        my $str =3D shift;=0A=
        if( $HTTP_DEBUG && $ENV{'REQUEST_METHOD'} ) {=0A=
                print $str;=0A=
        } elsif( $HTTP_DEBUG && ! $ENV{'REQUEST_METHOD'} ) {=0A=
                print STDERR $str;=0A=
        }=0A=
}=0A=
=0A=
sub error {=0A=
        my $str =3D shift;=0A=
        if( $ENV{'REQUEST_METHOD'} ) {=0A=
                print $str;=0A=
        } else {=0A=
                print STDERR $str;=0A=
        }=0A=
}=0A=
=0A=
# Fetch URL via http, return real URL (differs only in case of =
redirect) and=0A=
# document's contents. Return nothing in case of error or unwanted =
Content-Type=0A=
sub get_url {=0A=
  my $http_user_agent =3D shift;=0A=
  my $url =3D shift;=0A=
  my $search_mode =3D shift;            # $search_mode =3D don't show =
debugging=0A=
=0A=
  # Avoid endless loops:=0A=
  if( $http_max_pages_counter >=3D $HTTP_MAX_PAGES ) {=0A=
    error("Error: Ignoring '$url': \$HTTP_MAX_PAGES=3D$HTTP_MAX_PAGES =
limit reached.\n");=0A=
    return;=0A=
  }=0A=
  $http_max_pages_counter++;=0A=
=0A=
  my $request =3D HTTP::Request->new(GET =3D> $url);=0A=
  my $response =3D $http_user_agent->request($request);=0A=
  if( $response->is_error ) {=0A=
    error("Error: Couldn't get '$url': response code " =
.$response->code. "\n");=0A=
    return;=0A=
  }=0A=
=0A=
  if( $response->headers_as_string =3D~ m/^Content-Type:\s*(.+)$/im ) =
{=0A=
    my $content_type =3D $1;=0A=
    $content_type =3D~ s/^(.*?);.*$/$1/;                # ignore possible charset =
value=0A=
    if( ! grep(/^$content_type$/i, @HTTP_CONTENT_TYPES) ) {=0A=
      debug("Ignoring '$url': content-type '$content_type'\n");=0A=
      return;=0A=
    }=0A=
  }=0A=
=0A=
  my $buffer =3D $response->content;=0A=
  my $size =3D length($buffer);=0A=
  debug("Fetched  '$url', $size bytes\n") if( ! $search_mode );=0A=
  # Maybe we are we redirected, so use the new URL.=0A=
  # Note: this also uses <base href=3D"...">, so href=3D"..." has to =
point=0A=
  # to the page itself, not to the directory (even though the latter =
=0A=
  # will work okay in browsers):=0A=
  $url =3D $response->base;=0A=
  return ($url, $response->last_modified, $size, $buffer);=0A=
}=0A=
=0A=
## Are there meta tags that forbid visiting this page /=0A=
## following its URLs? Returns "", "none", "noindex" or "nofollow"=0A=
sub robot_meta_tag {=0A=
  my $content =3D shift;=0A=
  my $meta_tags =3D "";=0A=
  while( ${$content} =3D~ m/<meta(.*?)>/igs ) {=0A=
    my $tag =3D $1;=0A=
    if( $tag =3D~ m/name\s*=3D\s*"robots"/is ) {=0A=
      my ($value) =3D ($tag =3D~ m/content\s*=3D\s*"(.*?)"/igs);=0A=
      if( $value =3D~ m/none/is ) {=0A=
        $meta_tags =3D "none";=0A=
      } elsif( $value =3D~ m/noindex/is && $value =3D~ m/nofollow/is ) =
{=0A=
        $meta_tags =3D "none";=0A=
      } elsif( $value =3D~ m/noindex/is ) {=0A=
        $meta_tags =3D "noindex";=0A=
      } elsif( $value =3D~ m/nofollow/is ) {=0A=
        $meta_tags =3D "nofollow";=0A=
      }=0A=
    }=0A=
  }=0A=
  return $meta_tags;=0A=
}=0A=
=0A=
# Load the user's list of (common) words that should not be indexed.=0A=
# Use a hash so lookup is faster. Well-chosen stopwords can make =0A=
# indexing faster.=0A=
sub load_stopwords {=0A=
  my %stopwords;=0A=
  open(FILE, $STOPWORDS_FILE) or (warn "Cannot open '$STOPWORDS_FILE': =
$!" and return);=0A=
  while (<FILE>) {=0A=
    chomp;=0A=
    $_ =3D~ s/\r//g; # get rid of carriage returns=0A=
    $stopwords{$_} =3D 1;=0A=
  }=0A=
  close(FILE);=0A=
  return %stopwords;=0A=
}=0A=
=0A=
my $special_chars;      # special characters we have to replace=0A=
# Build list of special characters that will be replaced in =
normalize(),=0A=
# put this list in global variable $special_chars.=0A=
sub build_char_string {=0A=
  foreach my $number (keys %entities) {=0A=
    $special_chars .=3D chr($number);=0A=
  }=0A=
}=0A=
=0A=
# Represent all special characters as the character they are based =
on.=0A=
sub remove_accents {=0A=
  my $buffer =3D $_[0];=0A=
  # Special cases:=0A=
  $buffer =3D~ s/&thorn;/th/igs;=0A=
  $buffer =3D~ s/&eth;/d/igs;=0A=
  $buffer =3D~ s/&szlig;/ss/igs;=0A=
  # Now represent special characters as the characters they are based =
on:=0A=
  $buffer =3D~ =
s/&(..?)(grave|acute|circ|tilde|uml|ring|cedil|slash|lig);/$1/igs;=0A=
  return $buffer;=0A=
}=0A=
=0A=
# Represent all special characters as HTML entities like &<entitiy>;=0A=
sub normalize_special_chars {=0A=
  my $buffer =3D $_[0];=0A=
  # There may be special characters that are not encoded, so encode =
them:=0A=
  $buffer =3D~ s/([$special_chars])/"&#".ord($1).";"/gse;=0A=
  # Special characters can be encoded using hex values:=0A=
  $buffer =3D~ s/&#x([\dA-F]{2});/"&#".hex("0x".$1).";"/igse;=0A=
  # Special characters may be encoded with numbers, undo that (use the =
if() to avoid warnings):=0A=
  $buffer =3D~ s/&#(\d\d\d);/if( $1 >=3D 192 && $1 <=3D 255 ) { =
"&$entities{$1};"; }/gse;=0A=
  return $buffer;=0A=
}=0A=
=0A=
%entities =3D (=0A=
        192 =3D> 'Agrave',   #  capital A, grave accent =0A=
        193 =3D> 'Aacute',   #  capital A, acute accent =0A=
        194 =3D> 'Acirc',            #  capital A, circumflex accent =0A=
        195 =3D> 'Atilde',   #  capital A, tilde =0A=
        196 =3D> 'Auml',             #  capital A, dieresis or umlaut mark =0A=
        197 =3D> 'Aring',            #  capital A, ring =0A=
        198 =3D> 'AElig',            #  capital AE diphthong (ligature) =0A=
        199 =3D> 'Ccedil',   #  capital C, cedilla =0A=
        200 =3D> 'Egrave',   #  capital E, grave accent =0A=
        201 =3D> 'Eacute',   #  capital E, acute accent =0A=
        202 =3D> 'Ecirc',            #  capital E, circumflex accent =0A=
        203 =3D> 'Euml',             #  capital E, dieresis or umlaut mark =0A=
        205 =3D> 'Igrave',   #  capital I, grave accent =0A=
        204 =3D> 'Iacute',   #  capital I, acute accent =0A=
        206 =3D> 'Icirc',            #  capital I, circumflex accent =0A=
        207 =3D> 'Iuml',             #  capital I, dieresis or umlaut mark =0A=
        208 =3D> 'ETH',              #  capital Eth, Icelandic (Dstrok) =0A=
        209 =3D> 'Ntilde',   #  capital N, tilde =0A=
        210 =3D> 'Ograve',   #  capital O, grave accent =0A=
        211 =3D> 'Oacute',   #  capital O, acute accent =0A=
        212 =3D> 'Ocirc',            #  capital O, circumflex accent =0A=
        213 =3D> 'Otilde',   #  capital O, tilde =0A=
        214 =3D> 'Ouml',             #  capital O, dieresis or umlaut mark =0A=
        216 =3D> 'Oslash',   #  capital O, slash =0A=
        217 =3D> 'Ugrave',   #  capital U, grave accent =0A=
        218 =3D> 'Uacute',   #  capital U, acute accent =0A=
        219 =3D> 'Ucirc',            #  capital U, circumflex accent =0A=
        220 =3D> 'Uuml',             #  capital U, dieresis or umlaut mark =0A=
        221 =3D> 'Yacute',   #  capital Y, acute accent =0A=
        222 =3D> 'THORN',            #  capital THORN, Icelandic =0A=
        223 =3D> 'szlig',            #  small sharp s, German (sz ligature) =0A=
        224 =3D> 'agrave',   #  small a, grave accent =0A=
        225 =3D> 'aacute',   #  small a, acute accent =0A=
        226 =3D> 'acirc',            #  small a, circumflex accent =0A=
        227 =3D> 'atilde',   #  small a, tilde=0A=
        228 =3D> 'auml',             #  small a, dieresis or umlaut mark =0A=
        229 =3D> 'aring',            #  small a, ring=0A=
        230 =3D> 'aelig',            #  small ae diphthong (ligature) =0A=
        231 =3D> 'ccedil',   #  small c, cedilla =0A=
        232 =3D> 'egrave',   #  small e, grave accent =0A=
        233 =3D> 'eacute',   #  small e, acute accent =0A=
        234 =3D> 'ecirc',            #  small e, circumflex accent =0A=
        235 =3D> 'euml',             #  small e, dieresis or umlaut mark =0A=
        236 =3D> 'igrave',   #  small i, grave accent =0A=
        237 =3D> 'iacute',   #  small i, acute accent =0A=
        238 =3D> 'icirc',            #  small i, circumflex accent =0A=
        239 =3D> 'iuml',             #  small i, dieresis or umlaut mark =0A=
        240 =3D> 'eth',              #  small eth, Icelandic =0A=
        241 =3D> 'ntilde',   #  small n, tilde =0A=
        242 =3D> 'ograve',   #  small o, grave accent =0A=
        243 =3D> 'oacute',   #  small o, acute accent =0A=
        244 =3D> 'ocirc',            #  small o, circumflex accent =0A=
        245 =3D> 'otilde',   #  small o, tilde =0A=
        246 =3D> 'ouml',             #  small o, dieresis or umlaut mark =0A=
        248 =3D> 'oslash',   #  small o, slash =0A=
        249 =3D> 'ugrave',   #  small u, grave accent =0A=
        250 =3D> 'uacute',   #  small u, acute accent =0A=
        251 =3D> 'ucirc',            #  small u, circumflex accent =0A=
        252 =3D> 'uuml',             #  small u, dieresis or umlaut mark =0A=
        253 =3D> 'yacute',   #  small y, acute accent =0A=
        254 =3D> 'thorn',            #  small thorn, Icelandic =0A=
        255 =3D> 'yuml',             #  small y, dieresis or umlaut mark=0A=
);=0A=
=0A=
1;=0A=

------_=_NextPart_000_01C2F301.25CDDB00--