################################################################################
#
# .urlmon.mfs - filters for urlmon used for www-page content monitoring
# including 'print' and 'expand'
#
# Copyright (C) 2001-2008, 2011-2014 Dimitar Ivanov
#
# urlmon is Copyright (C) by Jeremy Impson
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
################################################################################
### Expand Filter ###
#
# 'expand' filter: monitors a web page either as a link database (fetch new
# links and report them) or as plain ascii content (report new/changed lines).
# Registered in %filtercode; urlmon calls this sub with the page content and
# the user's filter arguments.
$filtercode{'expand'} = sub {
$version_expand = 'expand 1.6.5';
# One-time setup: modules, html->ascii dumper selection, start timestamp.
BEGIN {
require 5.004;
use Carp;
use Time::Local;
use POSIX qw(strftime);
use LWP::MediaTypes qw(guess_media_type);
use Fcntl qw(:DEFAULT :flock);
use File::Copy;
use File::Basename;
use Encode;
$myagent = "MIR/20.14";
# Use $URLMON_CTYPE if you want to override $LC_CTYPE (handled by Perl).
# See also: perllocale - Perl locale handling for i18n and l10n
$ENV{'LC_CTYPE'} = $ENV{'URLMON_CTYPE'} if defined $ENV{'URLMON_CTYPE'};
# Although links is pretty fast and can read frames and tables
# smoothly, here the default program for dumping to ascii is lynx
$whichlynx = ();
# NOTE(review): the bareword dumper names below rely on running without
# 'use strict'; they evaluate to the strings "lynx"/"links".
$dumpers[0] = lynx;
$dumpopt[0] = "-dump -force_html -nolist -image_links -verbose -accept_all_cookies -cookie_file=/dev/null -cookie_save_file=/dev/null -useragent=$myagent";
$dumpers[1] = links;
$dumpopt[1] = "-dump";
# links/lynx program-specific filters to be applied to the content after
# converting to ascii
@lynxfilters = qw( s/[\s]*\[IMG\]//g||/.*/
s/^Warn.*L_y_n_x.*\n//mg||/.*/
s/[\s-]*\[(LINK|IMAGE|INLINE|USEMAP|BUTTON)\][-]*//g||/.*/ );
# Check if the first choice is installed, if not then use next.
# The dumper's options are set in the main part below. Here, $whichlynx is
# set to point to the first existing dumper and is used later.
for $dumper ( @dumpers ) {
$whichlynx = "/usr/bin/$dumper";
if( ! -x "$whichlynx" ) {
chomp( $whichlynx = `which $dumper 2>&1` );
}
last unless ( $whichlynx =~ /(which:|no |not found)/i );
}
my @fulldatetime = localtime();
my (@time) = (@fulldatetime)[0..5];
my ($sec,$min,$hour,$day,$mon,$year) = @time;
# start time in Epoch units set at time the routine is entered
$start = timelocal(@time);
# my date-tag
$Dtag = sprintf(".%4d.%02d.%02d", $year+1900, $mon+1, $day);
# present time
$time = sprintf("%02d:%02d:%02d", $hour, $min, $sec);
$datetime = strftime( "%a %d.%m.%y %H:%M", @fulldatetime );
$daddy=$$;
$debug=0;
}
# Tear-down: locate the mail program; in forked mode wait for the workers,
# in the parent drain the mail-command queue written by the children.
END {
#
# Mail program
if ( -x "/bin/mail" ) { $mail = "/bin/mail"; }
else { $mail = `which mail`; chomp $mail; }
my ($dadsmail, $mailcom) = (".mail.$daddy", "");
local *MAILFH;
#
my ($subj_short, $subj_long, @readlines, @unifile, %seen, $FH);
if( defined $forktab ) {
print "DEBUG: This is the end for proc: $$.
Waiting forks to finish if existing.\n" if $debug;
# Remove process-id from the fork table
ManageFork("$forktab", $mypid, "remove");
# WAIT if there are forked processes still busy.
# Wait here but do not lock then (last argument is missing)!!
WaitThenLock("$forktab", 0.5, $patientlimit);
} else {
# For forked process only the parent executes this part of the code
# at the end!! The children have prepared a list of mails to be delivered
print "DEBUG: I'm the parent proc: $$.\n" if $debug;
if( -e "$dadsmail" ) { # send only unique mails from the list in $dadsmail
# from the kids
OpenFile(\*IN, "<", "$dadsmail");
# FIX: readline operator was missing here (lost '<IN>'); read the
# queued mail commands written by the child processes.
chomp ( @unifile = <IN> );
CloseFile(\*IN, "$dadsmail");
unlink("$dadsmail");
# Send only unique orders!
%seen = (); @unifile = grep { ! $seen{$_} ++ } @unifile;
foreach $delivery (@unifile) {
# system() low 7 bits carry the terminating signal number
if( ($signo = system("$delivery")) &= 127 ) { # catch INT and QUIT
die "ERROR: program killed by signal $signo\n";
}
}
}
}
# Per-address delivery: tag each queued report file with the item counts and
# either mail it directly (single process) or append the mail command to the
# parent's queue file ($dadsmail) for deduplicated delivery.
foreach $muser ( keys %addresses ) { # only children go through this
next if ( $muser =~ /^#/ );
# Eliminate duplicate files before sending to user: works only for
# a single process!! Incompatible with 'urlmon -F ...'
%seen = ();
@unifile = grep { ! $seen{$_} ++ } @{$addresses{$muser}};
### Tag files and send mail
foreach $file ( @unifile ) {
$FH = WaitThenLock( $file, $sliipp, $patientlimit, $lock) if $forktab;
my $flag = 0;
my $bname = basename($file);
my $nisum = Grep_wc("$file", "^[0-9]+ $newitemstr", 0);
my $lisum = Grep_wc("$file", "^[0-9]+ $newlinestr", 0);
### Check if the file was already taged
foreach $tagstr ($nisum, $lisum ) {
%seen = ();
my @sum = split(/\n/, $tagstr);
@readlines = grep { ! $seen{$_} ++ } @sum;
if ( (scalar @sum) != (scalar @readlines) ) { $flag++; last;}
}
### Count the total of new items
my $items = Grep_wc("$file", "^[0-9][0-9][0-9].*(http|https|ftp|file):.*$progname\$", 1);
# delete space characters
$items =~ s/\s+//;
### Count the total of new lines
my $lines=0;
my @lines = split( '\n', $lisum );
foreach $l (@lines) { $lines += (split( ' ', $l))[0]; }
# put the number of new items in front of the subject (if any);
# the single quotes are deliberate: '$items'/'$lines' are expanded
# later by the /gee substitutions below
if( $items > 0 ) {
$subj_long = '$items items ' . "$subject_long";
$subj_short = '$items items ' . "$subject_short";
} elsif ( $lines > 0 ) {
$subj_long = '$lines lines ' . "$subject_long";
$subj_short = '$lines lines ' . "$subject_short";
} else {
$subj_long = $subject_long;
$subj_short = $subject_short;
}
### Interpolate variables: $items/$lines, $file, and $bname
$subj_long =~ s/(\$\w+)/$1/gee;
$subj_short =~ s/(\$\w+)/$1/gee;
if( ! $flag ) {
### Tag the summary file, edit in place
OpenFile(\*EDIT, "+<", "$file");
# FIX: readline operator was missing here (lost '<EDIT>')
@readlines = <EDIT>;
@readlines =
("\n --- Code : $version_expand ---\n\n",
"$subj_long\n", "$nisum", "$lisum", @readlines);
$_sni_=q{}; # number of new items in the current section
$_i_=1; # asign global numbers of the items
@readlines = grep{ (s/^([0-9][0-9][0-9])\s(.*(http|https|ftp|file):.*$progname$)/$1\.$_sni_\.$_i_\.$items $2/ && $_i_++)
|| (/^([0-9]+) new item.*/ && (($_sni_=$_) =~ s/^([0-9]+) new.*\n/$1/))
|| /.*\n/
} @readlines;
seek(EDIT,0,0);
select((select(EDIT), $| = 1)[0]);
print EDIT @readlines;
truncate(EDIT,tell(EDIT)) or die "ERROR: truncating '$file': $!";
CloseFile(\*EDIT, "$file");
}
if( $forktab ) { unlink("$file$lock"); close($FH); }
$mailcom = "$mail -s \"$subj_short\" $muser < $file";
### Send mail if the $user variable is valid and $debug is off
if( $muser !~ /^$defstr$/ ) {
if( $debug == 0 ) {
if( ! $forktab ) { # send mail at the instance if single process
system("$mailcom"); # Send mail
} else { # otherwise prepare $dadsmail such that the parent process
# will sort the unique entries and send mail
sysopen(MAILFH, "$dadsmail", O_RDWR|O_CREAT|O_APPEND)
or die "ERROR: can't open '$dadsmail': $!\n";
while( ! flock(MAILFH, LOCK_EX) ) {;}
print MAILFH "$mailcom\n";
close(MAILFH) or die "ERROR: can't close '$dadsmail': $!\n";
}
} else {
print "NEW items in $file:\n$nisum";
print "NEW lines in $file:\n$lisum";
print "TOTAL of items in mail: $items\n";
print "$mailcom\n";
}}
}}
} # End of END
# Unpack the arguments urlmon passes to the filter and initialize the
# per-invocation globals (file names, tokens, lock settings, mail subjects).
($content, $lookforstr, $dburl, $repdir, $Filename, $Summary,
$mailtouser, $accupage, $getby , $htmlconv, $debug) = @_;
($Old, $New, $Rep) = (".old", ".new", ".rep");
my (@oldlines, @newlines, @difflines, @todolist);
$urlin='';
my ($urlnr, $cwd);
# FIX: was 'my $fh, $accurls, $pagecontent;' which only declared $fh
# lexically and silently left the other two as package globals.
my ($fh, $accurls, $pagecontent);
my $pagenr=1;
$debug = $debug || 0;
$progname = (split(':',$0))[0];
chomp( $cwd = `pwd` );
$repstr = '##########################################################################';
$nextstr = '--------------------------------------------------------------------------';
$newitemstr = "new item"; # new item label
$newlinestr = "new line"; # new line label
# Vars $subject_long and $subject_short are evaluated in the END subroutine
$subject_long = "/$datetime/ $progname: ".'$file';
$subject_short = "/$datetime/ from ".'$bname';
$nr=0; # counter for the number of items
$defstr = "_DeFaUlT_"; # Dummy string
$lock = "..LOCK"; # Extension of the lock file
$sliipp = "0.1"; # Sleep period for lock mode
$patientlimit = "1000"; # Waiting-limit in sec.
$ppid = getppid(); # parent pid
$mypid = $$; # pid
$fft0 = "_Fi";
$fft1 = "Fi_:";
$fftoken = "$fft0$fft1"; # token for filter file
$lotoken = "_LO"; # token for link only prefilter
#############################################################################
#
# Find out whether urlmon is forked and if necessary create an forking table
#
if( $mypid != $daddy ) { # if forked
$forktab = "/tmp/.forktab.$daddy"; # forking-table name
# Add process to the fork table
ManageFork($forktab, $mypid, "add");
}
#
# Undefined or "-" arguments fall back to the dummy default string
#
for $variable ( $lookforstr, $dburl, $repdir, $Filename,
$Summary, $mailtouser, $accupage, $getby, $htmlconv ) {
$variable = $defstr if ( ! defined $variable || $variable eq "-" )
}
$urlnr = $0; # Extract the internal number of URL request
$urlnr =~ s/(.*)\s\((.+)\)\s(.*)/$2/g;
$urlnr =~ s/([0-9]+)\/[0-9]+/$1/g;
# This number is not unique!!!!
# It changes if the list of URL's is changed or if forking is used.
# Then the internal index is also changed. Thus the name of the
# automatically generated file corresponding to this url is changed
#
# Extract the requested URL address
# NOTE(review): the pattern part '\/\.*' below looks truncated (probably
# once '\/\/.*'); left untouched -- confirm against a pristine copy.
($urlin = $0) =~ s/(.+)((http|https|ftp|file):\/\.*)/$2/g;
# Derive the base URL when none was given: the page itself in ascii mode,
# or the server part of $0 in link-monitoring mode.
if ( $dburl eq $defstr ) {
if ( $lookforstr eq $defstr ) {
$dburl = $urlin;
} else {
($dburl = $0) =~ s/(.*)((http|https|ftp|file):\/\/\S+\/)(.+)/$2/g;
}
}
if ( $dburl !~ /["']{2}/ && $dburl !~ /^(http|https|ftp|file):\/\/.*/ ) {
print "Your base URL is not a valid one!\n";
print "DBURL: $dburl\n";
chdir $cwd;
exit 1;
}
if ( $lookforstr eq $defstr ) { $lookforstr = ''; }
my (@server) = split('/',$urlin);
# Default names for the state/report files and the report directory
if ( $Filename eq $defstr ) {
$Filename = "$server[2]_$urlnr";
}
if ( $repdir eq $defstr ) {
# $repdir = "$ENV{'HOME'}/.$progname";
$repdir = $cwd;
}
if ( $Summary eq $defstr ) {
$Summary = "Summary$Rep";
}
for $variable ($Filename , $repdir, $Summary, $mailtouser, $getby, $htmlconv) {
$variable = ExpandPath($variable); # Evaluate path
}
# assume that strings containing '/' are paths!
if ( $mailtouser =~ /.*\/.*/ ) { $mailfile = $mailtouser; }
else { $mailfile = ""; }
if ( $mailfile ) {
if( -e "$mailfile" ) {
$mailtouser = CatFile( "$mailfile" );
} else {
$mailtouser = "$defstr";
print "WARNING: There is no such mail file: $mailfile\n"; }
}
# Normalize the address list to single-space separated words
for ( $mailtouser ) {
s/[&,\n]/\ /gm; # replace ampersand, comma and new-line with space
s/\s+/\ /g; # replace multiple space characters with a single one
s/^\s+//g; # delete leading spaces
s/\s+$//g; # delete trailing spaces
}
if ( $Summary =~ /''/ || $Summary =~ /""/ ) { $Summary = ''; }
if ( $repdir !~ /^\/.*/ ) { $repdir = "$cwd/$repdir"; }
# Check html-convert method and set proper command line options
$htmlconv =~ s/^['"]//;
$htmlconv =~ s/['"]$//;
if ( $htmlconv =~ /$defstr/ ) {$htmlconv = $whichlynx;}
chomp( $lynx = `which $htmlconv 2>&1` );
$lynxopt = "";
if ( $lynx =~ /(lynx|links)$/ ) {
my $i = 0;
for $dumper ( @dumpers ) {
$lynxopt = "$dumpopt[$i]" if ( $lynx =~ /$dumper/ );
$i++;
}
}
if ( $lynx =~ /(which:|no |not found)/i || $lynx eq "" ) {
die "ERROR: ascii converter '$htmlconv' for '$urlin' not found\n";
}
if ( $accupage =~ /$defstr/ ) { $accupage = 1; }
chdir $repdir or die "ERROR: can't cd to $repdir: $!\n";
if ( $debug >= 1 ) {
printf("-------------------- DEBUG -----------------------\n");
print "Filter 'expand' version: $version_expand\n";
print "progname= $progname\n\$0= $0\n";
print "lookforstr= $lookforstr\n";
print "dburl= $dburl\n";
print "urlin= $urlin\n";
print "urlnr= $urlnr\n";
print "server= $server[2]\nFilename= $Filename\n";
print "lynx= $lynx lynxopt= $lynxopt\n" if $lynx;
print "downloader= $getby\n";
print "htmlconv= $htmlconv\n";
print "repdir=$repdir cwd=$cwd pwd=".`pwd`;
print "mailtouser=$mailtouser mailfile=$mailfile\n";
print "Accumulate pages: accupage=$accupage\n";
printf("--------------------- END ------------------------\n");
}
# Insert patterns from filter files into $lookforstring
$lookforstr = InsertFilterFiles($lookforstr) if ($lookforstr ne $defstr);
# Unescape commas
$lookforstr =~ s/\\,/,/g;
#
# Find out if complex $lookforstr i.e. with pre- and post-filtering
# and/or multiple filter patterns
# The pre- and post-string are separated by unescaped '&' character
# Individual patterns in the pre- and post-strings are separated
# by unescaped ';' character
#
(@prefilters, @postfilters, $prestr, $poststr) = ();
# Pre- and Post-filters are divided by unescaped '&', not followed by '&'
# NOTE(review): the next line is garbled -- the split pattern (the
# unescaped-'&' separator regex), the pre/post filter list construction
# (CheckForLinkOnlyPrefilter, $loflag), the $accufile name setup and the
# start of the 'if ( $accupage > 1 )' test have been lost, most likely
# when angle-bracket text was stripped from this file.  Restore this
# section from a pristine copy; do not "fix" it piecemeal.
($prestr, $poststr) = split( /(? 1 ) {
if ( ! -e "$accufile" ) { # I'm the first member of the club
CopyNewToOld("$Filename");
OpenFile(\*EDIT, ">", "$accufile");
flock(EDIT, LOCK_EX) or die "ERROR: can't flock '$accufile': $!";
select((select(EDIT), $| = 1)[0]);
print EDIT "$pagenr\n$urlin";
CloseFile(\*EDIT, "$accufile");
} else { # All following members of the club go through a dummy wait
undef $pagenr;
while( ! defined $pagenr ) { # Wait if the first member is not ready
OpenFile(\*IN, "<", "$accufile");
# FIX: readline operator was missing here (lost '<IN>'); read the page
# counter the first process wrote into the accumulation file.
$pagenr = <IN>;
CloseFile(\*IN, "$accufile");
}
$pagenr = 0;
}
print "PAGEnr = $pagenr\n" if ($debug > 2 );
}
if ( $pagenr == 1 && $accupage == 1 ) { CopyNewToOld("$Filename"); }
### Make a List of DB-entries to be fetched
if ( $urlin ne $dburl ) {
# Split the content in lines and pre-filter them
my (@clines) = split( '\n', $content);
@clines = grep(s/^(.*)$/$1\n/g, @clines);
@clines = FilterLines( \@clines, \@prefilters);
OpenFile(\*OUT, "+>>", "$Filename$New"); # READ, WRITE, APPEND, CREATE
foreach $line (@clines) { # Find new item to pull them out
print "--- new line\n$line\n" if ($debug > 1 );
# in case of link only prefilter, reset the loflag to 1
$loflag = 1 if ( $loflag > 0 );
# loop over all references in the line
foreach $ref ( split /\s+href\s*=/i, $line ) {
print "\n--- ref before split\n$ref" if ($debug > 1 );
# one-element array: FilterLines expects an array ref
my @ref = $ref;
if ( ! $loflag ) {
# Apply the pre-filters again if not "link only" prefilter:
# the returned value is an array with only one element
$ref = (FilterLines( \@ref, \@prefilters))[0];
} else {
# In case of "link only" filter, do not apply the prefilter again,
# but take the first http-link found in the present $line which
# is the second iteration value in the split-loop
$ref = 0 if ( $loflag++ != 2 );
}
next if ! $ref;
chomp $ref;
# extract the reference:
# this is the first non-blank field [followed by blank]
$ref = (split(' ', $ref))[0];
# and the first field ending with '>' if the blank-field was following
$ref = (split('>', $ref))[0];
# Delete quotation marks if any
$ref =~ tr/'"//d;
$ref =~ s/(\S+)\s+(.*)/$1/gi;
print "--- ref after extract\n$ref\n" if ($debug > 1 );
print "URL_base=$dburl\n" if ($debug > 1 );
# Look up for overlapping in the strings : this could be
# very NASTY and could BREAK the correct construction of the uri's
$ref = "$dburl $ref" =~ /^([\w+-\/\:\.\"\']+?)([^\/]+) ([\S]+)\2(\/[\S]+)$/ ? "$1$2$4\n" : "$dburl$ref\n";
print "--- ref after overlap\n$ref" if ($debug > 1 );
# Delete $dburl if there is a (second) absolute url in the uri
$ref =~ s/^http.*(http.*)$/$1/;
# Add the reference URI to the list
push(@newlines,"$ref");
print "--- ref final\n$ref\n" if ($debug > 1 );
}
}
# Deduplicate and sort the collected references, then store them
my %seen = ();
@newlines = grep { ! $seen{$_} ++ } @newlines; # keep only unique elements
@newlines = sort( @newlines ); # sort the list
print OUT @newlines;
CloseFile(\*OUT, "$Filename$New");
#
} else { # Ascii-content monitoring mode
my $record = '';
# The content is retrieved in lynx2ascii().
# Postfilters are not applied to the content here.
$record = join('', lynx2ascii( $urlin, $content )) . "\n";
$record = "$record\n";
@newlines = split('\n', $record);
# FIX: replacement used the deprecated '\1' backreference; '$1' is the
# correct form in a replacement (identical result, no warning) and
# matches the same construct at the top of this branch's sibling above.
@newlines = grep(s/^(.*)$/$1\n/g, @newlines);
#
# Apply the PRE-FILTERS
#
@newlines = FilterLines( \@newlines, \@prefilters) if @prefilters;
$/="\n"; # To be sure the record separator is set to default
print "NEWLINES (debug):\n@newlines\n" if ( $debug > 1 );
OpenFile(\*OUT, ">>", "$Filename$New");
print OUT @newlines;
CloseFile(\*OUT, "$Filename$New");
}
# If accumulating pages: bump the shared page counter, record this url,
# and either return early (more pages pending) or build the page list.
if ( -e "$accufile" )
{
my @pagenr = ();
OpenFile(\*EDIT, "+<", "$accufile");
while( ! flock(EDIT, LOCK_EX) ) {;}
select((select(EDIT), $| = 1)[0]);
# FIX: readline operator was missing here (lost '<EDIT>')
@pagenr = <EDIT>;
@pagenr = grep { s/^\s+// || /./ } @pagenr; # strip blanks at begin of line
chomp ($pagenr = $pagenr[0]);
if ( ! $pagenr )
{ print "WARNING: problem reading the accumulated page number\n";
$pagenr = 0;
}
seek(EDIT,0,0);
unshift (@pagenr, "$urlin\n");
print EDIT eval ($pagenr + 1) . "\n" . "@pagenr";
truncate(EDIT,tell(EDIT)) or die "ERROR: truncating '$accufile': $!";
CloseFile(\*EDIT, "$accufile");
print "PAGENR = $pagenr\n" if ($debug > 2 );
# Do return if there are still pages to accumulate
if ( $pagenr < $accupage ) {
chdir $cwd;
$pagecontent=join(' ', sort( @newlines) );
print "Returning: $pagecontent\n" if ($debug > 2 );
return $pagecontent;
}
unlink("$accufile");
# Adjust the list of all accumulated pages
@pagenr = grep { s/^\s*\d+\s*$//g || /./ } @pagenr;
@pagenr = grep { s/^\s+// || /./ } @pagenr; # strip blanks at begin of line
@pagenr = grep { s/\s+$// || /./ } @pagenr; # strip blanks at end of line
@pagenr = grep { s/\n/\ /mg || /./ } @pagenr;
# Keep only unique entries in the list and then reverse the order
my %seen = ();
@pagenr = reverse( grep { ! $seen{$_} ++ } @pagenr);
$accurls = join( "\n", @pagenr );
print "ACCURLS = " . join( " ", @pagenr ) . "\n" if( $debug > 2);
}
# Read new items for any case (needed when they have been accumulated)
# and also to sort them (if not ascii-content monitoring mode)
if ( -e "$Filename$New" && $urlin ne $dburl ) {
@newlines = ();
OpenFile(\*IN, "+<", "$Filename$New");
# FIX: readline operator was missing here (lost '<IN>')
@newlines = <IN>;
# Keep only unique elements and sort them
my %seen = ();
@newlines = sort( grep { ! $seen{$_} ++ } @newlines );
seek(IN,0,0);
print IN @newlines;
truncate( IN, tell(IN) ) or die "ERROR: sorting '$Filename$New' in place: $!";
CloseFile(\*IN, "$Filename$New");
}
# Load previous (.old) and current (.new) state for the comparison below
OpenFile(\*IN, "<", "$Filename$Old");
# FIX: readline operator was missing here (lost '<IN>')
@oldlines = <IN>;
CloseFile(\*IN, "$Filename$Old");
@newlines=();
OpenFile(\*IN, "<", "$Filename$New");
# FIX: readline operator was missing here (lost '<IN>')
@newlines = <IN>;
CloseFile(\*IN, "$Filename$New");
### Find out new DB-entries and fetch them
if ( $urlin ne $dburl ) {
@todolist = LookForNews( \@newlines, \@oldlines);
print "--- DEBUG\nTODO:\n@todolist\n--- END\n"
if ( @todolist && $debug > 1 );
if ( @todolist ) {
($nr, $repfile) = RetrieveNewItems (@todolist);
if ( $nr ) { # $nr is returned zero if there are not new items
if ( $Summary =~ /STDOUT/i ) {
print CatFile( "$repfile" );
} else { # Add the report to the summary and send mail
AddToSummary("$repfile","$Summary") if $Summary;
QueueMail();
}
}
}
} elsif( ! -z "$Filename$New" ) {
### Ascii-content monitoring mode:
# find out new lines, or small differences, or consider the whole new
# content as new if the difference btw. old and new is too big
my $difnrl;
# This boundary value gives the meaning of "big" and "small"
my $upbou = 0.05;
my $norm = $upbou;
my $diff = "diff -c";
@difflines = `$diff $Filename$Old $Filename$New |grep ^+`;
@difflines = grep(s/^\+\ (.*)$/$1/g, @difflines);
$difnrl = scalar @difflines;
print "DIFFLINES (debug):\n@difflines\n" if ( $debug > 1 );
if( ! @difflines ) { # If no additional lines in the new file was found
# try to see whether both files are too different!
# If so, assume that the whole content is new.
@difflines = `$diff $Filename$Old $Filename$New`;
# Only changed lines are considered in the weights
$difnrl = scalar (grep { /^!/ } @difflines);
if( $difnrl > 0 ) {
OpenFile(\*IN, "<", "$Filename$Old");
@oldlines = ;
CloseFile(\*IN, "$Filename$Old");
my $newnrl = scalar @newlines;
my $oldnrl = scalar @oldlines;
$norm = sprintf( "%.2g", $difnrl / ($newnrl + $oldnrl) );
if( $norm > $upbou ) {
print "WARNING $urlin :\nBig difference ($norm) between old and new content!\n";
print "Assuming the whole new content is different.\n";
@difflines = @newlines;
} else {
unshift( @difflines,
"WARNING $urlin :\nSmall difference ($norm) between old and new content!\n"
."Assuming there is no difference within the bound of $upbou\n"
. "Result from '$diff ' follows:\n"
."~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n" );
print "@difflines\n";
}
}
}
#
# Apply the POST-FILTERS to the content as a single string
#
# The whole list goes to the first element
if ( @postfilters && @difflines ) {
@difflines = join('', @difflines);
@difflines = FilterLines( \@difflines, \@postfilters);
$/="\n"; # To be sure the record separator is set to default
undef @difflines if ( @difflines && "$difflines[0]" eq "" );
print "DIFFLINES postfilter:\n--@difflines--\n" if ( $debug > 1 );
}
# Anything left after filtering becomes a new report file
if ( @difflines ) {
@difflines = join('', @difflines); # if it was not post-filtered
$repfile = InitFile("$Filename$Rep","$newlinestr",
(scalar split('\n', $difflines[0])));
OpenFile(\*OUT, ">>", "$repfile");
if ( $accupage == 1 )
{ print OUT "$nextstr\n";
if( $difnrl > 0 && $norm >= $upbou ) { # if there is a difference
print OUT "$urlin\n\n";
}
} else { # If the result is accumulated from a list of pages
print OUT "List of accumulated pages:\n";
print OUT "--------------------------\n";
print OUT "$accurls\n";
print OUT "$nextstr\n\n";
}
print OUT "@difflines\n";
CloseFile(\*OUT, "$repfile");
if ( $Summary =~ /STDOUT/i ) {
print CatFile( "$repfile" );
} else {
AddToSummary("$repfile","$Summary") if $Summary;
QueueMail();
}
}
}
#
# Restore previous state if the new file has a size of zero!!
# It means that something bad happened and this can not be the
# real new state of affair ;-)
#
if ( -z "$Filename$New" && !-z "$Filename$Old" ) {
rename("$Filename$Old","$Filename$New");
print "INFO: '$Filename$New' has zero size, old state restored. May be check your pre-filters..\n";
@newlines = @oldlines;
}
chdir $cwd;
return join(' ', @newlines);
};
#-------------------------------------------------------------------------
# Rotate the freshly fetched state: ".new" becomes ".old" for the next diff.
# When there is no non-empty ".new" yet, make sure an (empty) ".old" exists
# so a later comparison has something to work with.  Always returns 0.
sub CopyNewToOld {
my $base = shift;
if ( -s "$base$New" ) {
rename("$base$New", "$base$Old");
}
elsif ( ! -e "$base$Old" ) {
copy("/dev/null", "$base$Old");
}
return 0;
}
# Expand a path string: resolve a leading tilde (~, ~/foo, ~user, ~user/foo)
# to the matching home directory, then interpolate embedded Perl expressions
# such as $ENV{'HOME'} or plain $vars via double-eval substitution.
sub ExpandPath ($)
{
my $expanded = shift;
for ($expanded) {
# leading "~" or "~user" -> home dir (HOME/LOGDIR env or passwd entry)
s{^~([^/]*)}
{ $1 ? (getpwnam($1))[7]
: ($ENV{'HOME'} || $ENV{'LOGDIR'} || (getpwuid($>))[7])
}ex;
# double-eval: first $ENV{'NAME'} style expressions, then bare $vars.
# NOTE: /ee executes text found in the path as Perl code.
s/(\$\w+\{'\w+'\})/$1/gee;
s/(\$\w+)/$1/gee;
}
return $expanded;
}
# Detect and strip the "link only" pre-filter token.  A pre-filter of the
# form "$lotoken:rest" means: take hrefs as-is without re-applying the
# pre-filter per link.  Returns the filter with the token removed (or the
# filter unchanged when no token is present).
sub CheckForLinkOnlyPrefilter($)
{
my $raw = shift;
my @parts = split(':', $raw);
# the token, if any, is the first colon-separated field
if ( $parts[0] eq $lotoken ) {
$raw = join(':', @parts[1 .. $#parts]);
print "Found link only prefilter: $parts[1]\n" if $debug;
}
return $raw;
}
# Replace "filter file" tokens inside a filter string with the files'
# contents.  A token looks like "$fftoken<filename>;"; the named file holds
# one filter pattern per line ('#' comments and blank lines are skipped).
# Returns the filter string with every token expanded in place.
sub InsertFilterFiles($)
{
my $filters = shift;
# FIX: was 'my $filename, $add;' which only declared $filename lexically
# and left $add as a package global.
my ($filename, $add);
my (@filters, @files, %seen) = ();
# Split on tokens skipping empty lines
# First pass
@files = grep { /^$fft1/ } split( /$fft0/, $filters );
$add = join( '', @files );
# Second pass
@files = grep { /./ } split( /$fft1/, $add );
@files = grep { s/^([^;]*){1}?;(.*)$/$1/} @files;
# Keep only unique
@files = grep { ! $seen{$_} ++ } @files;
print "FF: Filter Files= '@files'\n" if ($debug > 3);
foreach $file (@files) {
$filename = $file;
$file = ExpandPath($file); # Evaluate path
if( -e $file ) {
OpenFile(\*IN, "<", "$file");
# FIX: readline operator was missing here (lost '<IN>')
@filters = <IN>;
CloseFile(\*IN, "$file");
# Skip all comments, remove space characters and empty lines
@filters = grep { !/^\s*\#/ } @filters;
@filters = grep { s/\s+//g } @filters;
@filters = grep { /./ } @filters;
chomp @filters;
$add = join( '' , @filters );
# Add the new filters to the present
print "FF: FILTERS = '$filters'\n" if ($debug > 3);
$filters =~ s/$fftoken$filename;/$add/g;
print "FF: FILTERS after insert = '$filters'\n" if ($debug > 3);
} else {
print "WARNING: There is no such filter file: $file\n";
}
}
return $filters;
}
# Apply a list of s///-style filter expressions to content lines.
sub FilterLines ($$)
{
my ($content, $filters) = @_;
my @content = @$content;
my @filters = @$filters;
# In list context the filters are applied to each element.
# In scalar context (the first element of the list contains a multiline
# single string) the filters are applied on a single string, such
# that you can control them with the /..../smg options for perl regexps
foreach $filter ( @filters ) {
# NOTE(review): the next line is garbled -- the remainder of
# FilterLines (the filter-eval loop body and its return) and the head
# of RetrieveNewItems (its 'sub' declaration, the LWP::UserAgent
# construction and the %requests setup) have been lost, most likely
# when angle-bracket text was stripped from this file.  Restore both
# routines from a pristine copy; only the UserAgent tail survives.
if( $filter !~ /(?new();
$ua->env_proxy(); # use proxy servers as defined in env. variables
$ua->proxy(['http', 'ftp'], $myproxy) if $myproxy;
$ua->agent($myagent);
$ua->timeout([$secs]);
my $n=0;
foreach $url (keys %requests) {
sleep $ENV{'URLMON_SLEEP_FORNEXT'}
if ( $ENV{'URLMON_SLEEP_FORNEXT'} && $n && !$debug );
$n++;
$content_type=guess_media_type($url);
print "CONTENT TYPE of $url : $content_type\n" if ($debug > 4 );
if ( ! defined $downloader ) {
# Fetch the target url
$response = q{};
$response = $ua->request($requests{$url}) if ($debug < 7 );
if ((! ref $response) or (! $response->is_success) ) {
# NOTE(review): method calls do not interpolate in double-quoted
# strings -- this stores '...->status_line' literally; probably
# meant @{[ $response->status_line ]} or a separate variable.
$responses{$url} = "ERROR: retrieving '$url': $response->status_line";
print "$responses{$url}\n" if ( $debug > 2 );
} else {
$responses{$url} = $response->content();
}
} else {
if ( $downloader =~ /wget/ ) { $downloader .= " -q -O-"; }
print "Trying to retrieve by '$downloader': $url\n" if ( $debug > 2 );
$responses{$url} = `$downloader $url`;
}
print "Done: $url\n" if ( $debug > 2 );
}
my @res_content = ();
my $allrecords = "";
foreach $url (keys %responses) { # Convert to ascii, postfilter and save result
@res_content = lynx2ascii( $url, $responses{$url} );
$allrecords .= Apply_ContentLine_PostFilters( \@res_content, \@postfilters);
}
my $repfile = '';
if ( $allrecords ) {
# Initialize the report file
$repfile = InitFile("$Filename$Rep", "$newitemstr", $nr-1);
# Write all records to the file
OpenFile(\*OUT, ">>", "$repfile");
print OUT "$allrecords";
CloseFile(\*OUT, "$repfile");
}
return ($nr-1, $repfile);
} # End of RetrieveNewItems
# Create a fresh, date+sequence tagged report file and point a symlink
# (named $fname) at it.  Args: base file name, item label, item count.
# Returns the tagged report file name.
sub InitFile () {
my ($fname, $label, $nrits ) = @_;
# Count today's existing reports; the two-digit sequence wraps at 100
my @existing = <$fname$Dtag.[0-9][0-9]>;
my $seq = sprintf("%02d", (scalar @existing) % 100);
my $actname = "$fname$Dtag.$seq";
# Stamp the new report with the date and the item count.
# NOTE(review): file names are interpolated into shell commands here;
# unusual characters in $fname could break or inject shell syntax.
system( "date > $actname;
echo $nrits $label\\(s\\) in $actname \\
>> $actname" );
# Maintain "$fname" as a symlink to the newest report
print "WARNING: $fname is not a link.\n" if( -e "$fname" && ! -l "$fname" );
my $path = dirname("$actname");
my $base = basename("$actname");
my $link = basename("$fname");
system("cd $path; ln -sf $base $link; cd - >/dev/null");
return "$actname";
}
# Append the report $fname to the (date+time tagged) summary file, keeping
# "$summary" as a symlink to the current tagged summary.  Under forked
# operation (urlmon -F) the summary is lock-protected around the append.
sub AddToSummary ($$) { # Accumulate data for all url's into a summary
my ($fname, $summary) = @_;
my $longname = "$summary$Dtag.$time";
my ($FH, $IN, $OUT, @input);
# Lock the file if you use -F option of the urlmon
$FH = WaitThenLock( $summary, $sliipp, $patientlimit, $lock) if $forktab;
# Create and link a new summary file if not existing
# otherwise append to previous
if ( ! -e "$longname" ) {
print "WARNING: $summary is not a link.\n"
if( -e "$summary" && ! -l "$summary" );
unlink("$summary");
TouchFile( "$longname" );
my ($path, $base, $link) = ( dirname("$longname"),
basename("$longname"),
basename("$summary") );
system("cd $path; ln -sf $base $link; cd - >/dev/null");
}
OpenFile(\*IN, "<", "$fname");
OpenFile(\*OUT, ">>", "$summary");
# FIX: readline operator was missing here (lost '<IN>')
@input = <IN>;
select((select(OUT), $| = 1)[0]);
# Put first a header, then paste the file
print OUT "\n$repstr\n#\n# $fname\n#\n$repstr\n";
print OUT @input;
CloseFile(\*OUT, "$summary");
CloseFile(\*IN, "$fname");
if( $forktab ) { # Unlock
unlink("$summary$lock"); close($FH);
}
}
# Convert the raw html $content of $url to plain ascii by piping it through
# the external dumper ($lynx $lynxopt), then strip dumper-specific scrap
# via @lynxfilters.  Returns the ascii content as a list of lines.
sub lynx2ascii
{ print "LYNX2ASCII\n" if ( $debug > 2 );
my ($url, $content) = @_;
my @content = ();
my $lynxpid;
if( length $content ) {
my ($file) = ".$mypid.$progname";
OpenFile(\*OUT, ">", "$file");
print OUT "$content";
# FIX: pass a glob ref, consistent with every other CloseFile call
# (the bareword only worked via a symbolic reference under no strict)
CloseFile(\*OUT,"$file");
# $lynx is used as ascii-converter,
# but you can provide also an external program
$lynxpid = open( LYNX, "$lynx $lynxopt $file 2>&1 |") or
die "ERROR: forking '$lynx $lynxopt $file' to convert '$url': $!\n";
# FIX: readline operator was missing here (lost '<LYNX>')
@content = <LYNX>;
unlink("$file") or print "Warning: can't delete \"$file\"\n";
close(LYNX) or die "ERROR: closing LYNX-pipe: $!\n";
} else {
print "INFO: '$url' has zero content\n";
}
# Delete some links/lynx program-specific scrap
@content = FilterLines( \@content, \@lynxfilters);
return @content;
} # End of lynx2ascii
# Post-filter one converted page and return the (optionally numbered)
# record as a single string.  Takes refs to the ascii content lines and to
# the post-filter list; '' is returned when filtering removed everything.
sub Apply_ContentLine_PostFilters {
my ($content_ref, $filter_ref) = @_;
my @content = @$content_ref;
my @postfilters = @$filter_ref;
my $record = "";
print "--- Content before POST filtering ---\n@content\n" if ( $debug > 5 );
# Post-filters work on the whole content joined into one string so that
# /smg style regexps can match across line boundaries
if ( @postfilters ) {
@content = join('', @content);
@content = FilterLines( \@content, \@postfilters);
undef @content if ( @content && "$content[0]" eq "" );
$/="\n"; # To be sure the record separator is set to default
}
print "--- Content after POST filtering ---\n@content\n" if ( $debug > 5 );
# Anything that survived the filters gets a numbered header and is kept
if ( @content ) {
if( $urlin ne $dburl || $accupage == 1 ) {
$record = TagNext($record, $url);
}
$record .= join('', @content)."\n";
}
return $record;
}
# Append a separator plus a numbered "url --> progname" header to $string.
# Only database entries are numbered ($urlin ne $dburl); the global counter
# $nr is incremented here.  The "%03d ..." layout is parsed again in the
# END block, so keep both places in sync.
sub TagNext {
my ($acc, $uri) = @_;
if( $nr > 0 && $urlin ne $dburl ) { # only database entries are numbered
$acc .= "$nextstr\n"
. sprintf("%03d %s --> %s\n\n", $nr++, $uri, $progname);
}
return $acc;
}
# Close the filehandle in $_[0] (a glob ref); die with $_[1], the file
# name, on failure.
sub CloseFile {
my $fname = $_[1];
close ($_[0]) or die "ERROR: can't close file $fname: $!\n";
print "'$fname' closed successfully\n" if ( $debug > 2 );
}
# Open the file named in $_[2] with mode $_[1] on the glob ref in $_[0];
# die with the file name on failure.
sub OpenFile {
my $fname = $_[2];
open ($_[0], "$_[1]", $fname)
or die "ERROR: can't open file '$fname': $!\n";
print "'$fname' opened successfully\n" if ( $debug > 2 );
}
# Queue the report/summary file for mailing: record, per mail address, the
# absolute path of the file to be sent later (in the END block).
sub QueueMail () {
### Only summary files are tagged which will not be mailed!
return if ( $mailtouser =~ /^$defstr$/ && ! $Summary );
# Prefer the summary file; fall back to the per-url report file
my $target = "$Summary" || "$Filename$Rep";
chomp( my $here = `pwd` );
# Make the queued path absolute so mail works from any directory
$target = "$here/$target" if ( $target !~ /^\/.*/ );
%addresses = () unless %addresses;
# One queue entry per (space separated) mail address
foreach $address (split( '\ ', $mailtouser)) {
push( @{$addresses{$address}}, "$target" );
}
}
# Maintain the fork table: a lock-protected file listing the pids of all
# forked urlmon workers.  $dowhat is "add" (append $id) or "remove" (delete
# $id's line; the table file itself is removed once the last pid is gone).
sub ManageFork ($$$) {
my ($ffile,$id,$dowhat) = @_;
my $limit = "10";
my $sltime = "0.1";
my (@readlines);
my $FH;
if( $dowhat eq "add" ) {
print "DEBUG: ManageFork add $id\n" if( $debug > 2);
$FH = WaitThenLock( $ffile, $sltime, $limit, $lock);
open(OUT, ">> $ffile"); close(OUT); # make sure the table file exists
OpenFile(\*EDIT, "+<", "$ffile");
# FIX: readline operator was missing here (lost '<EDIT>')
@readlines = <EDIT>;
push(@readlines, "$id\n");
seek(EDIT,0,0);
print EDIT @readlines;
truncate(EDIT, tell(EDIT)) or die "ERROR: truncating '$ffile': $!";
CloseFile(\*EDIT, "$ffile");
print "DEBUG: ManageFork add unlock: $ffile\n" if( $debug > 2);
# FIX: was close(FH) -- a never-opened bareword handle, leaking the
# lock handle; close the handle returned by WaitThenLock instead
# (matches the unlock idiom used elsewhere: unlink lock + close $FH).
unlink("$ffile$lock"); close($FH);
print "DEBUG: ManageFork add end $id\n" if( $debug > 2);
return 0;
}
if( $dowhat eq "remove" ) {
print "DEBUG: ManageFork remove $id\n" if( $debug > 2);
$FH = WaitThenLock( $ffile, $sltime, $limit, $lock);
if( -e "$ffile" ) {
OpenFile(\*EDIT, "+<", "$ffile");
# FIX: readline operator was missing here (lost '<EDIT>')
@readlines = <EDIT>;
# FIX: anchor the pid so e.g. id 12 does not also remove 123
@readlines = grep(!/^$id$/, @readlines ); # weed out lines with this id
seek(EDIT,0,0);
print EDIT @readlines;
truncate(EDIT, tell(EDIT)) or die "ERROR: truncating '$ffile': $!";
CloseFile(\*EDIT, "$ffile");
# Delete file if no more processes in the table
unlink ("$ffile") if( ! (scalar @readlines) );
}
print "DEBUG: ManageFork remove unlock: $ffile\n" if( $debug > 2);
# FIX: was close(FH); close the lock handle from WaitThenLock
unlink("$ffile$lock"); close($FH);
print "DEBUG: ManageFork remove end $id\n" if( $debug > 2);
return 0;
}
}
sub WaitThenLock ($$$$) {
    # Wait (up to $limit seconds, polling every $speriod seconds) for the
    # file "$lfile$lext" to disappear.  When $lext is empty, only wait on
    # $lfile itself and return 0 without locking.  Otherwise create the
    # lock file, take an exclusive flock on it, and return the filehandle
    # glob; the caller releases the lock by unlinking "$lfile$lext" and
    # closing the returned handle.
    my ($lfile, $speriod, $limit, $lext) = @_;
    $lext = '' unless $lext;
    local *LOCKFH;
    print "DEBUG: WaitThenLock $lfile, $speriod, $limit, $lext\n" if( $debug > 2);
    my $total = 0;
    $lfile = "$lfile$lext" if $lext;
    # This first part of the routine is just good if we only need to wait
    # (with a defined upper time-limit) as long as a file exists
    while ( -e "$lfile" ) {
        select(undef,undef,undef,$speriod); # Trick for waiting less than 1 sec.
        $total += $speriod;
        if( $total > $limit ) { # Check against the upper limit
            # Assume the lock is stale and steal it
            unlink("$lfile");
            print "Waiting Godot too long: $total sec...\n".
                  "Lock file $lfile removed. Going ahead ;-)\n";
        }
    }
    # If $lext is not defined, then we are just waiting and we will
    # not lock but return!
    return 0 if( ! $lext );
    # Open and lock if $lext is defined!
    # (fixed: an unchecked sysopen failure made the flock loop below spin
    # forever on an unopened handle)
    sysopen(LOCKFH, "$lfile", O_RDWR|O_CREAT)
        or die "ERROR: can't create lock file '$lfile': $!\n";
    # Here is the exclusive lock which is unlocked (unlink+close)
    # in the corresponding routines where it was called.
    while( ! flock(LOCKFH, LOCK_EX) ) {;}
    return *LOCKFH;
}
sub TouchFile {
    # Create the named file if it does not exist yet (like touch(1), minus
    # the timestamp update of an existing file).  Dies on failure.
    my $_filename = shift;
    # Lexical filehandle instead of the bareword FILE, and explicit checks
    # instead of the fragile "sysopen && close || die" precedence chain.
    sysopen( my $fh, "$_filename", O_RDWR|O_CREAT )
        or die "ERROR: can't create file '$_filename': $!\n";
    close( $fh )
        or die "ERROR: can't create file '$_filename': $!\n";
}
sub CatFile {
    # Slurp the whole file and return its content as one string (cat(1)).
    my $_file_name = shift;
    my @_content = ();
    OpenFile(\*IN, "<", "$_file_name");
    @_content = <IN>;    # fixed: the <IN> filehandle read was lost
    CloseFile(\*IN, "$_file_name");
    return join('', @_content);
}
sub Grep_wc {
    # grep/"wc -l" emulation: return the number of lines of $_file_name
    # matching $_regex when $_count is true, otherwise the matching lines
    # joined into a single string.
    my ($_file_name, $_regex, $_count) = @_;
    OpenFile(\*WC, "<", "$_file_name");
    my @_wc_data = <WC>;    # fixed: the <WC> filehandle read was lost
    CloseFile(\*WC, "$_file_name");
    @_wc_data = grep { /$_regex/ } @_wc_data;
    if ( $_count ) {
        return (scalar @_wc_data);
    } else {
        return join('', @_wc_data);
    }
}
### Print Filter ###
#
$filtercode{'print'} = sub {
    # Pass-through filter: echo the fetched page to STDOUT and hand the
    # content back unchanged.
    my ($content) = @_;
    print $content;
    return $content;
};
1;
__END__
################################################################################
#
# Documentation of the filters in POD
#
=head1 NAME
C<.urlmon.mfs> - filters for B<urlmon> used for www-page content monitoring
=head1 DESCRIPTION
C<.urlmon.mfs> includes the 'print' and 'expand' filters.
You can instruct B<urlmon> to load the filter's code from current working
directory by putting at the beginning of your urlmon's database file
following line:
CODE=$ENV{'PWD'}/.urlmon.mfs
If the above doesn't work, then:
CODE=///.urlmon.mfs
For more details see the documentation of B<urlmon>.
The 'B<print>' filter just prints out the fetched HTML page. It doesn't
require arguments.
The remaining documentation deals with the 'B<expand>' filter which is
a general purpose content filter aimed to produce reports and a summary
of reports from monitoring. It examines differences (see the paragraph
NOTES below) between the previous and present text content of the
monitored URL and reports them! Alternatively, it can pre-filter
a HTML document and make a list of included URLs (hyperlinks) which will be
retrieved, their text content filtered, and eventually reported. This is
the most powerful feature of that filter! The monitored documents are
fetched by API provided by the LWP modules and can be converted to ASCII
by 'lynx' or 'links'. The filter can be used in forking mode: C<'-F'>
option of B. Simultaneous reading and updating of files by the child
processes is based on rudimentary file locking. This mechanism is not very
elegant or efficient, but it works :-)
There are three possibilities for monitoring. Which one is used depends
on the first and second arguments passed to the filter (see also
the ARGUMENTS description):
=head2
A) You want to monitor the text content of a given page and you don't want to apply filtering
In this case the first and second arguments can be skipped, alternatively you
can say FARGS=-,-
=head2
B) You have the situation as in A), but you want to do text content filtering
You must put as second argument the same URL as given in the 'URL=' variable
of the urlmon database file:
URL=http://www.example.org/pub/test.html MOD=abcdefghijklmnopqrstuvwxyz FILTER=expand FARGS=&,http://www.example.org/pub/test.html
=head2
C) You want to monitor a list of specific hyperlinks as extracted from the target URL, whereas new members in the list should be retrieved and filtered
First you need to apply a pre-filter, which is defined by regular expressions,
to capture the wanted hyperlinks found in the observed page. Consider the
targeted URL used in case B) as follows:
URL=http://www.example.org/pub/test.html MOD=abcdefghijklmnopqrstuvwxyz FILTER=expand FARGS=details.cgi&,-
Assume that you expect to find in the file "test.html" lines with links
which point to some script called "details.cgi":
This is First
This is Second
Hi everybody!
Now, if you are interested in these links (and aim to request and evaluate
them), you can make a list of the wanted URL's by using the pre-filter
"details.cgi" as defined in FARGS above. After processing the HTML code in
"test.html", the 'B' filter will find and request following list:
http://www.example.org/pub/details.cgi?quest=1
http://www.example.org/pub/details.cgi?quest=2
Note that the string "http://www.example.org/pub" is automatically put in
front of every listed item. Or to be precise, an extracted hyperlink is
appended to the monitored URL-string stripped from its basename.
In difference, if the lines with links in the above example look like
This is First
This is Second
you need to give a second argument in FARGS defining the in-front part of
the extracted links:
URL=http://www.example.org/pub/test.html MOD=abcdefghijklmnopqrstuvwxyz FILTER=expand FARGS=details.cgi&,http://www.example.org/
The resulting list will be:
http://www.example.org/cgi/details.cgi?quest=1
http://www.example.org/cgi/details.cgi?quest=2
These documents are retrieved then, and filtered by .
In all examples in C), the string "details.cgi" plays the role of a pre-filter
submitted as FARGS argument. You are not restricted to use only one
pre-filter, but as many as you want. They apply to the HTML content of the
monitored URL, and are used to find out the wanted links in the HTML document.
Hence, these pre-filters behave a little bit differently compared to the cases
A) and B)! Furthermore, they are applied two times at different steps in the
procedure of determining the list of documents to be retrieved.
Be careful when using more than one pre-filter - define the filters such
that they include only regular expressions which are found at the
right-hand side of the http-link looking for! Still, if you want that
the pre-filters are applied only one time, and that the filter works as
a regex to the whole input line, then the token "_LO:" must be put in front
of the pre-filter string (alias "link only", or LO). Using again the last
example, a link only filter definition could be:
URL=http://www.example.org/pub/test.html MOD=abcdefghijklmnopqrstuvwxyz FILTER=expand FARGS=_LO:/This.is/&,http://www.example.org/
One link per line is expected when using a LO-filter.
Note, that in case C) the comparison between the last and present content of
the monitored page is actually a comparison between the last and present list
of extracted URL's. Also, in the discussed examples we implicitly assumed that
the last list is missing as we just start to monitor the page.
Enjoy!
=head1 ARGUMENTS
Arguments which are skipped (undetermined) receive default values! If you
want to skip arguments between determined arguments, then you have to put
the '-' character, as for example: FARGS=,-,-,
For some arguments the empty string '' or "", which means an empty
argument value, can be given! In such cases '-' and '' are not the same!
The arguments used by the 'B' filter, and ordered by their index,
are described as follows:
B<1)> A string defining pre- and post-filters.
Although from the point of view of B the whole code of
'B' is considered as an external filter, the actual content
filtering is determined by this argument. Hence, the argument is called
"filter(s)" in this documentation. All filters are applied by using the
grep() function. They are separated by unescaped ';' character not
followed by ';'. Thus ';;' is not interpreted as a filter separator, and
one can use it in braced blocks (i.e. { BLOCK }, see below) instead of
';'. A combination of pre- and post-filters is a complex string consisting
of two substrings separated by unescaped '&' not followed by '&' which
means that you can have '&&' in braced block or elsewhere.
The pre-filters can be applied to the content before the old and new contents
are compared (case A and B in the examples above), or can be used for
extracting wanted links (case C). In latter case, the URL's content is
processed line by line. Only when the pre-filter is passed, the line is
further evaluated. It is split into strings separated by the substring
' href=', and then the pre-filter is applied again to every single string.
This behavior can be changed by defining a pre-filter as "link only".
In order to do so put in front of your pre-filter the token "_LO:" (look
at the example in C). Consequently, the prefilter is applied to whole
lines without splitting them.
The post-filters are used to filter the text content of a retrieved page.
For post-filtering the content is joined into a single record (i.e.
single string).
Ultimately, the pre- and post- filters have generally
a slightly different syntax since the first are applied by grep() in list
context, the latter in scalar context, respectively. This complication has
the advantage that by using post-filters one can easily discard a whole
record based on various criteria, as for example looking for given
keywords in the record. For both, pre- and post-filters, every single
filter is automatically wrapped within '//' (e.g. is wrapped to
//) if it doesn't include an unescaped '/' and if it isn't defined
in block. One can also give already wrapped filter(s), e.g.
//;!//i; etc., or a mixture of wrapped and unwrapped
filters. Any complex filter(s) must be defined as a single string without
space characters!! Every filter from the complex filter string is
implemented as block expression, i.e filtering is applied by using: '@list
= grep { // } @list;' Eventually, you could apply also filters
which are explicitly defined in a block! This allows writing sophisticated
filter code. Also escaped commas can be used. Before applying the filters
commas are then unescaped which means that for an escaped comma one needs
to have '\\,'.
EXAMPLES FOR VALID PRE-FILTERS:
(?i)BoY -> will be wrapped to /(?i)BoY/, equal to /BoY/i
!/girl/i -> delete lines containing the word 'girl'
(?i)(Woman|girl) -> keep lines containing the word 'woman' or 'girl'
s/sex/SEX/g||/.*/;!/violence/i -> substitute global + abandon violence
^^^^^^---> this part is needed in case that the whole list
corresponding to the record does not contain the
word 'sex' but you want to keep the record anyway
{$I0++<=9&&$_>100} -> the first 10 lines of the record; if these are
numbers, then include only numbers > 100 ; $I0 is
declared as a global variable! Be careful whether
it is used elsewhere!
Using /./ instead of /.*/ in the regular expressions will discard all
blank lines!
EXAMPLES FOR VALID POST-FILTERS:
!/bad/ -> delete the whole record if it contains the word 'bad'
/(Happy.*Days|Days.*Happy)/si -> keep the record if it contains
somewhere in it the two words 'Happy' and 'Days', case insensitive
s/EVIL/GOOD/mg||/.*/ -> global change of 'EVIL' into 'GOOD'
s/^.*ADVERTisement.*\n//mgi||/.*/ -> delete lines with 'advertisement'
s/^.*ADVERTisement.*\n//mgi -> this is wrong, because if there is not
such word in the record, it will be discarded by grep!!
s/^(?:(?!Love).)*\n//mg||/.*/;s/^(?:(?!Peace).)*\n//mg||/.*/ -> keep
lines which contain both 'Love' and 'Peace'
/Luck(?=(?:(?!Trouble).)*$)/si -> Luck which is not followed by Trouble
/smart/si&/^(?:(?!soft).)*$/si -> smart but not soft
COMBINATIONS OF PRE- AND POST-FILTERS:
simple -> this applies a pre-filter only
&easy -> this applies a post-filter only
simple&easy -> it's the same as /simple/&/easy/
(?i)simple;!/complex/i&easy;s/^(?:(?!dummy).)*\n//mig||/.*/
(?i)simple;!/complex/i;&easy;s/^(?:(?!dummy).)*\n//mig||/.*/; -> the same
as previous but with explicit separator ';' put after every one filter
Well, this is not all about filters. One can put them (or part of them) in
a file which is parsed and included into the complex filter string. And
that's really fun! To include such file(s) the token '_FiFi_:' should be
given somewhere in the complex filter string e.g.:
simple&easy;_FiFi_:~/urlmon/myfilters;
Then all of the filters defined in '~/urlmon/myfilters' will be pasted
exactly at this place in the filter string. Or another example:
simple;_FiFi_:~/urlmon/addpre;&easy;_FiFi_:~/urlmon/addpost;
Finally, one can define all of his filters (pre- and post-filters) in an
external file and just say:
_FiFi_:~/urlmon/allmyfilters;
It's important to put ';' after the file name! Before pasting the filter
file it is parsed such that all blank characters, empty lines and also
comments (lines starting with any number of blank characters followed by
'#') are removed. Consequently, you can define filters in an external file
and use blank characters in the code as opposed to the default situation
where the complex filter string should be a single string without blanks.
The other rules for writing filters, as previously explained, are still
valid. So, separating pre- and post-filters in a filter file is done as
usual by putting '&'. But don't forget, at the end, when the complex filter
string is returned to the program, it should contain only one separating
'&'.
B<2)> Base URL (use '-' for auto-find-out, '' or "" for none). For
database requests, if the extracted document's path/URL for the request
has an overlapping part with the base URL-string, then this part of the
extracted string which is in front of the overlapping is deleted. For
example, if the base URL is: "http://www.example.org/pub/documents" and the
document's path is: "../somedir/somesubdir/documents/fetchme?id=123" then
the constructed URL for the database request will be:
"http://www.example.org/pub/documents/fetchme?id=123".
B<3)> Directory (for working) where the data files will be saved - the
current working directory is default. Paths should be absolute or relative
to the working directory. One can also use Perl variables $ENV{'HOME'},
$ENV{'PWD'}, and also ~/, ~// etc.
B<4)> Basename of the file in which the results for this URL will be saved
(use '-' for auto naming). Generally, three files are created: one with
extension '.old', next with extension '.new', and the last with extension
'.rep' which will be a link to the report file. The name of the actual
report file is generated automatically and consists of the basename, date
and time tag, and increasing counter number.
B<5)> File where to save summary of the results (use '-' for auto, '' or
"" for none). "Summary.rep" is the default (see also the previous
argument). The magic word "STDOUT" (case insensitive) will cause the
summary to be sent to the standard output. In this case mail will not be
sent and the output will not have a header. Use absolute paths in the
filename if you have different directories for different URL entries and
you want to be sure that all reports are written to the same file! In case
the 3rd argument is given, the relative paths for this file are then
relative to it.
B<6)> E-mail address(es) or file with address(es) to send report to. In
case you have a summary (see argument 5) the summary is e-mailed.
Otherwise the file with the results for this URL is sent. The validity of
the address is not checked. Multiple e-mail addresses can be passed to the
program by giving a string containing different addresses separated by
'&': "user1@host1&user2@host2&user3" etc. When using file with e-mail
addresses, write the absolute path, $ENV{'HOME'}//, or
at least ./! E-mail addresses in this file can be separated by
blanks, new lines, or '&'. If there are no changes between the old and new
documents, mail will not be sent. The mail queue is checked for unique
delivery orders such that a report will be sent only once to the user.
Hence one can give repeatedly his e-mail address in different URL entries
(of the urlmon database) for the same report.
B<7)> Total number of pages in a collection. This argument is needed if
you want to observe many pages (URLs) as a single one collected page. As
for example, when you monitor an URL with a list of database items
(links), but the number of items in the list is limited by the provider,
you have to load in sequence all following pages with items to go through
the whole list. The solution is to define explicitly additional URL
entries in the urlmon's database file for every following page in the
provider's generated list. When monitoring a collection of pages and
B is executed in forking mode, the present argument is essential.
Nevertheless, it has to be defined in any case when pages are collected.
The basename of the file where the results are saved (see argument 4) must
be the same for all URLs in the collection!
B<8)> Program to use for retrieving the extracted list of items from the
monitored URL - like in case B above. It should print out the downloaded
HTML documents to standard output. Such a program could be 'wget' for example.
In this case the wget-options '-q -O-' are automatically added on execution.
B<9)> Program to use for converting the HTML document to ASCII. 'lynx',
'links', or user-defined program can be used:
'lynx' or //lynx - use 'lynx' (default)
'links' or //links - use 'links'
// - user-defined program that converts
HTML to ASCII. The program should expect as (only) command-line
argument the name of the file to be converted, and should print out the
ASCII text to STDOUT. Eventually, a private converter normally serves
as an external content filter that presumably does [quite] complicated
filtering and executes additional user-defined tasks.
B<10)> Debug level: 1-5 are possible values
=head1 NOTES
The difference between the new and old text content of a document is
considered to be the additional lines in the new document compared to the
old one. If there are no additional lines but only few lines have
changed, or alternatively [almost] the whole text is new, then only the
changed parts of the documents are reported in the first case; the whole
content is reported to be changed in the latter case, respectively. For
details on the method of how the boundary is defined between small and big
difference, look at the code :-)
=head1 ENVIRONMENT VARIABLES
There are following variables which can be defined in the OS environment to
further configure the behavior of the program:
B - could be used to override the LC_CTYPE setting and to define
the character encoding of the resulting output text.
B - defines the time (in floating seconds) which the
'expand'-filter waits before fetching the next item on its list of extracted
urls. "Don't wait" is the default behavior.
=head1 COPYRIGHT
Copyright (C) 2001-2008, 2011-2014 Dimitar Ivanov,
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3 of the License, or
(at your option) any later version.
You can find the latest version of the program at http://gnu.mirendom.net
=head1 SEE ALSO
Check the documentation in the B distribution.
=cut
#
# End of POD
#