|
|
[Perlfect-search] problems with 3.31b
Luhman, Rick perlfect-search@perlfect.com
Tue, 25 Mar 2003 13:02:57 -0600
This message is in MIME format. Since your mail reader does not understand
this format, some or all of this message may not be legible.
------_=_NextPart_000_01C2F301.25CDDB00
Content-Type: text/plain
Tools.pl file for 3.31b
-----Original Message-----
From: Daniel Naber [mailto:daniel.naber@t-online.de]
Sent: Thursday, March 20, 2003 3:44 PM
To: perlfect-search@perlfect.com
Subject: Re: [Perlfect-search] problems with 3.31b
On Thursday 20 March 2003 21:57, Luhman, Rick wrote:
> '-' is not recognized as an internal or external command,
> operable program or batch file.
Please send me the "tools.pl" file from your Perlfect Search installation.
Regards
Daniel
--
http://www.danielnaber.de
_______________________________________________
perlfect-search mailing list
perlfect-search@perlfect.com
To unsubscribe, set other personal options or view the list archives please
visit:
------_=_NextPart_000_01C2F301.25CDDB00
Content-Type: application/octet-stream;
name="tools.pl"
Content-Transfer-Encoding: quoted-printable
Content-Disposition: attachment;
filename="tools.pl"
# Perlfect Search - tools.pl=0A=
# Mapping of Numbers to HTML entities and other useful functions=0A=
# Hash taken from htdig's SGMLEntities.cc (shortened)=0A=
#$rcs =3D ' $Id: tools.pl,v 1.13 2003/02/24 22:45:42 daniel Exp $ ' =
;=0A=
=0A=
# Copyright (C) 1999-2002 Giorgos Zervas <giorgos@perlfect.com> and =0A=
# Daniel Naber <daniel.naber@t-online.de>=0A=
# =0A=
# This program is free software; you can redistribute it and/or =
modify=0A=
# it under the terms of the GNU General Public License as published =
by=0A=
# the Free Software Foundation; either version 2 of the License, or =
(at=0A=
# your option) any later version.=0A=
#=0A=
# This program is distributed in the hope that it will be useful, =
but=0A=
# WITHOUT ANY WARRANTY; without even the implied warranty of=0A=
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU=0A=
# General Public License for more details.=0A=
#=0A=
# You should have received a copy of the GNU General Public License=0A=
# along with this program; if not, write to the Free Software=0A=
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA =
02111-1307=0A=
# USA=0A=
=0A=
my $http_max_pages_counter =3D 0;=0A=
=0A=
# Remove some HTML special characters from a string. This is =
necessary=0A=
# to avoid cross site scripting attacks. =0A=
# See http://www.cert.org/advisories/CA-2000-02.html=0A=
sub cleanup {=0A=
my $str =3D $_[0];=0A=
if( ! defined($str) ) {=0A=
return "";=0A=
}=0A=
$str =3D~ s/[<>"'&]/ /igs;=0A=
return $str;=0A=
}=0A=
=0A=
# Escape some HTML special characters in a string. This is necessary=0A=
# to avoid cross site scripting attacks. =0A=
# See http://www.cert.org/advisories/CA-2000-02.html=0A=
sub html_escape {=0A=
my $str =3D $_[0];=0A=
if( ! defined($str) ) {=0A=
return "";=0A=
}=0A=
$str =3D~ s/&/&/igs;=0A=
$str =3D~ s/</</igs;=0A=
$str =3D~ s/>/>/igs;=0A=
$str =3D~ s/"/"/igs;=0A=
$str =3D~ s/'/'/igs;=0A=
return $str;=0A=
}=0A=
=0A=
sub init_config {=0A=
if( $DOCUMENT_ROOT !~ m#/$# ) {=0A=
# add ending slash if necessary=0A=
$DOCUMENT_ROOT =3D $DOCUMENT_ROOT."/";=0A=
}=0A=
$BASE_URL =3D~ s/\/$//; # remove trailing slash=0A=
}=0A=
=0A=
sub init_robot_check {=0A=
my $base =3D shift;=0A=
if( $ROBOT_AGENT && $HTTP_START_URL ) {=0A=
eval "use WWW::RobotRules";=0A=
if( $@ ) {=0A=
die("Cannot use robots.txt, maybe WWW::RobotRules is not installed? =
$!");=0A=
}=0A=
$main::robot =3D WWW::RobotRules->new($ROBOT_AGENT);=0A=
my $url =3D "$base/robots.txt";=0A=
print "Loading $url...\n";=0A=
my $http_user_agent =3D LWP::UserAgent->new;=0A=
my $robots_txt;=0A=
(undef, undef, undef, $robots_txt) =3D get_url($http_user_agent, =
$url);=0A=
if( $robots_txt ) {=0A=
$main::robot->parse($url, $robots_txt);=0A=
} else {=0A=
warn("Not using any robots.txt.\n");=0A=
}=0A=
}=0A=
}=0A=
=0A=
sub isHTML {=0A=
my $filename =3D shift;=0A=
my $ext =3D get_suffix($filename);=0A=
if( $ext ) {=0A=
return grep(/^$ext$/i, @HIGHLIGHT_EXT);=0A=
} else {=0A=
return 0;=0A=
}=0A=
}=0A=
=0A=
sub get_suffix {=0A=
my $filename =3D shift;=0A=
($suffix) =3D ($filename =3D~ m/\.([^.]*)$/);=0A=
return $suffix;=0A=
}=0A=
=0A=
sub filterFile {=0A=
my $filename =3D shift;=0A=
my $ext =3D shift;=0A=
my $buffer;=0A=
my $pid =3D open(FILE, "-|");=0A=
my $tmp =3D $/;=0A=
undef $/;=0A=
# flush STDOUT;=0A=
{=0A=
local $|;=0A=
$|=3D1;=0A=
print "";=0A=
}=0A=
if( ! defined($pid) ) {=0A=
warn "cannot fork: $!";=0A=
return;=0A=
}=0A=
if( ! $pid ) {=0A=
# child=0A=
my $cmd =3D $EXT_FILTER{$ext};=0A=
my @args =3D split(/\s+/, $cmd);=0A=
foreach (@args) {=0A=
if( $_ eq 'FILENAME' ) {=0A=
$_ =3D~ s/FILENAME/$filename/g;=0A=
}=0A=
}=0A=
# We're using exec() with a list of argument because =0A=
# this makes sure the filename is escaped automatically:=0A=
exec(@args) || die "can't exec program $args[0]: $!";=0A=
# NOTREACHED=0A=
}=0A=
$buffer =3D <FILE>;=0A=
close(FILE);=0A=
$/ =3D $tmp;=0A=
return $buffer;=0A=
}=0A=
=0A=
sub debug {=0A=
my $str =3D shift;=0A=
if( $HTTP_DEBUG && $ENV{'REQUEST_METHOD'} ) {=0A=
print $str;=0A=
} elsif( $HTTP_DEBUG && ! $ENV{'REQUEST_METHOD'} ) {=0A=
print STDERR $str;=0A=
}=0A=
}=0A=
=0A=
sub error {=0A=
my $str =3D shift;=0A=
if( $ENV{'REQUEST_METHOD'} ) {=0A=
print $str;=0A=
} else {=0A=
print STDERR $str;=0A=
}=0A=
}=0A=
=0A=
# Fetch URL via http, return real URL (differs only in case of =
redirect) and=0A=
# document's contents. Return nothing in case of error or unwanted =
Content-Type=0A=
sub get_url {=0A=
my $http_user_agent =3D shift;=0A=
my $url =3D shift;=0A=
my $search_mode =3D shift; # $search_mode =3D don't show =
debugging=0A=
=0A=
# Avoid endless loops:=0A=
if( $http_max_pages_counter >=3D $HTTP_MAX_PAGES ) {=0A=
error("Error: Ignoring '$url': \$HTTP_MAX_PAGES=3D$HTTP_MAX_PAGES =
limit reached.\n");=0A=
return;=0A=
}=0A=
$http_max_pages_counter++;=0A=
=0A=
my $request =3D HTTP::Request->new(GET =3D> $url);=0A=
my $response =3D $http_user_agent->request($request);=0A=
if( $response->is_error ) {=0A=
error("Error: Couldn't get '$url': response code " =
.$response->code. "\n");=0A=
return;=0A=
}=0A=
=0A=
if( $response->headers_as_string =3D~ m/^Content-Type:\s*(.+)$/im ) =
{=0A=
my $content_type =3D $1;=0A=
$content_type =3D~ s/^(.*?);.*$/$1/; # ignore possible charset =
value=0A=
if( ! grep(/^$content_type$/i, @HTTP_CONTENT_TYPES) ) {=0A=
debug("Ignoring '$url': content-type '$content_type'\n");=0A=
return;=0A=
}=0A=
}=0A=
=0A=
my $buffer =3D $response->content;=0A=
my $size =3D length($buffer);=0A=
debug("Fetched '$url', $size bytes\n") if( ! $search_mode );=0A=
# Maybe we are we redirected, so use the new URL.=0A=
# Note: this also uses <base href=3D"...">, so href=3D"..." has to =
point=0A=
# to the page itself, not to the directory (even though the latter =
=0A=
# will work okay in browsers):=0A=
$url =3D $response->base;=0A=
return ($url, $response->last_modified, $size, $buffer);=0A=
}=0A=
=0A=
## Are there meta tags that forbid visiting this page /=0A=
## following its URLs? Returns "", "none", "noindex" or "nofollow"=0A=
sub robot_meta_tag {=0A=
my $content =3D shift;=0A=
my $meta_tags =3D "";=0A=
while( ${$content} =3D~ m/<meta(.*?)>/igs ) {=0A=
my $tag =3D $1;=0A=
if( $tag =3D~ m/name\s*=3D\s*"robots"/is ) {=0A=
my ($value) =3D ($tag =3D~ m/content\s*=3D\s*"(.*?)"/igs);=0A=
if( $value =3D~ m/none/is ) {=0A=
$meta_tags =3D "none";=0A=
} elsif( $value =3D~ m/noindex/is && $value =3D~ m/nofollow/is ) =
{=0A=
$meta_tags =3D "none";=0A=
} elsif( $value =3D~ m/noindex/is ) {=0A=
$meta_tags =3D "noindex";=0A=
} elsif( $value =3D~ m/nofollow/is ) {=0A=
$meta_tags =3D "nofollow";=0A=
}=0A=
}=0A=
}=0A=
return $meta_tags;=0A=
}=0A=
=0A=
# Load the user's list of (common) words that should not be indexed.=0A=
# Use a hash so lookup is faster. Well-chosen stopwords can make =0A=
# indexing faster.=0A=
sub load_stopwords {=0A=
my %stopwords;=0A=
open(FILE, $STOPWORDS_FILE) or (warn "Cannot open '$STOPWORDS_FILE': =
$!" and return);=0A=
while (<FILE>) {=0A=
chomp;=0A=
$_ =3D~ s/\r//g; # get rid of carriage returns=0A=
$stopwords{$_} =3D 1;=0A=
}=0A=
close(FILE);=0A=
return %stopwords;=0A=
}=0A=
=0A=
my $special_chars; # special characters we have to replace=0A=
# Build list of special characters that will be replaced in =
normalize(),=0A=
# put this list in global variable $special_chars.=0A=
sub build_char_string {=0A=
foreach my $number (keys %entities) {=0A=
$special_chars .=3D chr($number);=0A=
}=0A=
}=0A=
=0A=
# Represent all special characters as the character they are based =
on.=0A=
sub remove_accents {=0A=
my $buffer =3D $_[0];=0A=
# Special cases:=0A=
$buffer =3D~ s/þ/th/igs;=0A=
$buffer =3D~ s/ð/d/igs;=0A=
$buffer =3D~ s/ß/ss/igs;=0A=
# Now represent special characters as the characters they are based =
on:=0A=
$buffer =3D~ =
s/&(..?)(grave|acute|circ|tilde|uml|ring|cedil|slash|lig);/$1/igs;=0A=
return $buffer;=0A=
}=0A=
=0A=
# Represent all special characters as HTML entities like &<entitiy>;=0A=
sub normalize_special_chars {=0A=
my $buffer =3D $_[0];=0A=
# There may be special characters that are not encoded, so encode =
them:=0A=
$buffer =3D~ s/([$special_chars])/"&#".ord($1).";"/gse;=0A=
# Special characters can be encoded using hex values:=0A=
$buffer =3D~ s/&#x([\dA-F]{2});/"&#".hex("0x".$1).";"/igse;=0A=
# Special characters may be encoded with numbers, undo that (use the =
if() to avoid warnings):=0A=
$buffer =3D~ s/&#(\d\d\d);/if( $1 >=3D 192 && $1 <=3D 255 ) { =
"&$entities{$1};"; }/gse;=0A=
return $buffer;=0A=
}=0A=
=0A=
%entities =3D (=0A=
192 =3D> 'Agrave', # capital A, grave accent =0A=
193 =3D> 'Aacute', # capital A, acute accent =0A=
194 =3D> 'Acirc', # capital A, circumflex accent =0A=
195 =3D> 'Atilde', # capital A, tilde =0A=
196 =3D> 'Auml', # capital A, dieresis or umlaut mark =0A=
197 =3D> 'Aring', # capital A, ring =0A=
198 =3D> 'AElig', # capital AE diphthong (ligature) =0A=
199 =3D> 'Ccedil', # capital C, cedilla =0A=
200 =3D> 'Egrave', # capital E, grave accent =0A=
201 =3D> 'Eacute', # capital E, acute accent =0A=
202 =3D> 'Ecirc', # capital E, circumflex accent =0A=
203 =3D> 'Euml', # capital E, dieresis or umlaut mark =0A=
205 =3D> 'Igrave', # capital I, grave accent =0A=
204 =3D> 'Iacute', # capital I, acute accent =0A=
206 =3D> 'Icirc', # capital I, circumflex accent =0A=
207 =3D> 'Iuml', # capital I, dieresis or umlaut mark =0A=
208 =3D> 'ETH', # capital Eth, Icelandic (Dstrok) =0A=
209 =3D> 'Ntilde', # capital N, tilde =0A=
210 =3D> 'Ograve', # capital O, grave accent =0A=
211 =3D> 'Oacute', # capital O, acute accent =0A=
212 =3D> 'Ocirc', # capital O, circumflex accent =0A=
213 =3D> 'Otilde', # capital O, tilde =0A=
214 =3D> 'Ouml', # capital O, dieresis or umlaut mark =0A=
216 =3D> 'Oslash', # capital O, slash =0A=
217 =3D> 'Ugrave', # capital U, grave accent =0A=
218 =3D> 'Uacute', # capital U, acute accent =0A=
219 =3D> 'Ucirc', # capital U, circumflex accent =0A=
220 =3D> 'Uuml', # capital U, dieresis or umlaut mark =0A=
221 =3D> 'Yacute', # capital Y, acute accent =0A=
222 =3D> 'THORN', # capital THORN, Icelandic =0A=
223 =3D> 'szlig', # small sharp s, German (sz ligature) =0A=
224 =3D> 'agrave', # small a, grave accent =0A=
225 =3D> 'aacute', # small a, acute accent =0A=
226 =3D> 'acirc', # small a, circumflex accent =0A=
227 =3D> 'atilde', # small a, tilde=0A=
228 =3D> 'auml', # small a, dieresis or umlaut mark =0A=
229 =3D> 'aring', # small a, ring=0A=
230 =3D> 'aelig', # small ae diphthong (ligature) =0A=
231 =3D> 'ccedil', # small c, cedilla =0A=
232 =3D> 'egrave', # small e, grave accent =0A=
233 =3D> 'eacute', # small e, acute accent =0A=
234 =3D> 'ecirc', # small e, circumflex accent =0A=
235 =3D> 'euml', # small e, dieresis or umlaut mark =0A=
236 =3D> 'igrave', # small i, grave accent =0A=
237 =3D> 'iacute', # small i, acute accent =0A=
238 =3D> 'icirc', # small i, circumflex accent =0A=
239 =3D> 'iuml', # small i, dieresis or umlaut mark =0A=
240 =3D> 'eth', # small eth, Icelandic =0A=
241 =3D> 'ntilde', # small n, tilde =0A=
242 =3D> 'ograve', # small o, grave accent =0A=
243 =3D> 'oacute', # small o, acute accent =0A=
244 =3D> 'ocirc', # small o, circumflex accent =0A=
245 =3D> 'otilde', # small o, tilde =0A=
246 =3D> 'ouml', # small o, dieresis or umlaut mark =0A=
248 =3D> 'oslash', # small o, slash =0A=
249 =3D> 'ugrave', # small u, grave accent =0A=
250 =3D> 'uacute', # small u, acute accent =0A=
251 =3D> 'ucirc', # small u, circumflex accent =0A=
252 =3D> 'uuml', # small u, dieresis or umlaut mark =0A=
253 =3D> 'yacute', # small y, acute accent =0A=
254 =3D> 'thorn', # small thorn, Icelandic =0A=
255 =3D> 'yuml', # small y, dieresis or umlaut mark=0A=
);=0A=
=0A=
1;=0A=
------_=_NextPart_000_01C2F301.25CDDB00--
|
|