################################################################################
#
# .urlmon.mfs - filters for urlmon used for www-page content monitoring
#               including 'print' and 'expand'
#
# Copyright (C) 2001-2008, 2011-2014 Dimitar Ivanov
#
# urlmon is Copyright (C) by Jeremy Impson
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
#
################################################################################

### Expand Filter ###
#
$filtercode{'expand'} = sub {

  $version_expand = 'expand 1.6.5';

  BEGIN {
    require 5.004;
    use Carp;
    use Time::Local;
    use POSIX qw(strftime);
    use LWP::MediaTypes qw(guess_media_type);
    use Fcntl qw(:DEFAULT :flock);
    use File::Copy;
    use File::Basename;
    use Encode;

    $myagent = "MIR/20.14";

    # Use $URLMON_CTYPE if you want to override $LC_CTYPE (handled by Perl).
    # See also: perllocale - Perl locale handling for i18n and l10n
    $ENV{'LC_CTYPE'} = $ENV{'URLMON_CTYPE'} if defined $ENV{'URLMON_CTYPE'};

    # Although links is pretty fast and can read frames and tables
    # smoothly, here the default program for dumping to ascii is lynx
    $whichlynx = ();
    $dumpers[0] = "lynx";
    $dumpopt[0] = "-dump -force_html -nolist -image_links -verbose -accept_all_cookies -cookie_file=/dev/null -cookie_save_file=/dev/null -useragent=$myagent";
    $dumpers[1] = "links";
    $dumpopt[1] = "-dump";

    # links/lynx program-specific filters to be applied to the content after
    # converting to ascii
    @lynxfilters = qw(
      s/[\s]*\[IMG\]//g||/.*/
      s/^Warn.*L_y_n_x.*\n//mg||/.*/
      s/[\s-]*\[(LINK|IMAGE|INLINE|USEMAP|BUTTON)\][-]*//g||/.*/
    );

    # Check if the first choice is installed; if not, use the next one.
    # The dumper's options are set in the main part below. Here, $whichlynx is
    # set to point to the first existing dumper and is used later.
    for $dumper ( @dumpers ) {
      $whichlynx = "/usr/bin/$dumper";
      if( ! -x "$whichlynx" ) {
        chomp( $whichlynx = `which $dumper 2>&1` );
      }
      last unless ( $whichlynx =~ /(which:|no |not found)/i );
    }

    my @fulldatetime = localtime();
    my (@time) = (@fulldatetime)[0..5];
    my ($sec,$min,$hour,$day,$mon,$year) = @time;
    # start time in Epoch units, set at the time the routine is entered
    $start = timelocal(@time);
    # my date-tag
    $Dtag = sprintf(".%4d.%02d.%02d", $year+1900, $mon+1, $day);
    # present time
    $time = sprintf("%02d:%02d:%02d", $hour, $min, $sec);
    $datetime = strftime( "%a %d.%m.%y %H:%M", @fulldatetime );
    $daddy=$$;
    $debug=0;
  }

  END {
    #
    # Mail program
    if ( -x "/bin/mail" ) {
      $mail = "/bin/mail";
    } else {
      $mail = `which mail`; chomp $mail;
    }
    my ($dadsmail, $mailcom) = (".mail.$daddy", "");
    local *MAILFH;
    #
    my ($subj_short, $subj_long, @readlines, @unifile, %seen, $FH);

    if( defined $forktab ) {
      print "DEBUG: This is the end for proc: $$. Waiting for forks to finish, if any exist.\n" if $debug;
      # Remove process-id from the fork table
      ManageFork("$forktab", $mypid, "remove");
      # WAIT if there are forked processes still busy.
      # Wait here but do not lock (the last argument is missing)!!
WaitThenLock("$forktab", 0.5, $patientlimit); } else { # For forked process only the parent executes this part of the code # at the end!! The children have prepared a list of mails to be delivered print "DEBUG: I'm the parent proc: $$.\n" if $debug; if( -e "$dadsmail" ) { # send only unique mails from the list in $dadsmail # from the kids OpenFile(\*IN, "<", "$dadsmail"); chomp ( @unifile = ); CloseFile(\*IN, "$dadsmail"); unlink("$dadsmail"); # Send only unique orders! %seen = (); @unifile = grep { ! $seen{$_} ++ } @unifile; foreach $delivery (@unifile) { if( ($signo = system("$delivery")) &= 127 ) { # catch INT and QUIT die "ERROR: program killed by signal $signo\n"; } } } } foreach $muser ( keys %addresses ) { # only children go through this next if ( $muser =~ /^#/ ); # Eliminate duplicate files before sending to user: works only for # a single process!! Incompatible with 'urlmon -F ...' %seen = (); @unifile = grep { ! $seen{$_} ++ } @{$addresses{$muser}}; ### Tag files and send mail foreach $file ( @unifile ) { $FH = WaitThenLock( $file, $sliipp, $patientlimit, $lock) if $forktab; my $flag = 0; my $bname = basename($file); my $nisum = Grep_wc("$file", "^[0-9]+ $newitemstr", 0); my $lisum = Grep_wc("$file", "^[0-9]+ $newlinestr", 0); ### Check if the file was already taged foreach $tagstr ($nisum, $lisum ) { %seen = (); my @sum = split(/\n/, $tagstr); @readlines = grep { ! $seen{$_} ++ } @sum; if ( (scalar @sum) != (scalar @readlines) ) { $flag++; last;} } ### Count the total of new items my $items = Grep_wc("$file", "^[0-9][0-9][0-9].*(http|https|ftp|file):.*$progname\$", 1); # delete space characters $items =~ s/\s+//; ### Count the total of new lines my $lines=0; my @lines = split( '\n', $lisum ); foreach $l (@lines) { $lines += (split( ' ', $l))[0]; } # put the number of new items in front of the subject (if any) if( $items > 0 ) { $subj_long = '$items items ' . "$subject_long"; $subj_short = '$items items ' . "$subject_short"; } elsif ( $lines > 0 ) { $subj_long = '$lines lines ' . "$subject_long"; $subj_short = '$lines lines ' . "$subject_short"; } else { $subj_long = $subject_long; $subj_short = $subject_short; } ### Interpolate variables: $items/$lines, $file, and $bname $subj_long =~ s/(\$\w+)/$1/gee; $subj_short =~ s/(\$\w+)/$1/gee; if( ! $flag ) { ### Tag the summary file, edit in place OpenFile(\*EDIT, "+<", "$file"); @readlines = ; @readlines = ("\n --- Code : $version_expand ---\n\n", "$subj_long\n", "$nisum", "$lisum", @readlines); $_sni_=q{}; # number of new items in the current section $_i_=1; # asign global numbers of the items @readlines = grep{ (s/^([0-9][0-9][0-9])\s(.*(http|https|ftp|file):.*$progname$)/$1\.$_sni_\.$_i_\.$items $2/ && $_i_++) || (/^([0-9]+) new item.*/ && (($_sni_=$_) =~ s/^([0-9]+) new.*\n/$1/)) || /.*\n/ } @readlines; seek(EDIT,0,0); select((select(EDIT), $| = 1)[0]); print EDIT @readlines; truncate(EDIT,tell(EDIT)) or die "ERROR: truncating '$file': $!"; CloseFile(\*EDIT, "$file"); } if( $forktab ) { unlink("$file$lock"); close($FH); } $mailcom = "$mail -s \"$subj_short\" $muser < $file"; ### Send mail if the $user variable is valid and $debug is off if( $muser !~ /^$defstr$/ ) { if( $debug == 0 ) { if( ! $forktab ) { # send mail at the instance if single process system("$mailcom"); # Send mail } else { # otherwise prepare $dadsmail such that the parent process # will sort the unique entries and send mail sysopen(MAILFH, "$dadsmail", O_RDWR|O_CREAT|O_APPEND) or die "ERROR: can't open '$dadsmail': $!\n"; while( ! 
flock(MAILFH, LOCK_EX) ) {;} print MAILFH "$mailcom\n"; close(MAILFH) or die "ERROR: can't close '$dadsmail': $!\n"; } } else { print "NEW items in $file:\n$nisum"; print "NEW lines in $file:\n$lisum"; print "TOTAL of items in mail: $items\n"; print "$mailcom\n"; }} }} } # End of END ($content, $lookforstr, $dburl, $repdir, $Filename, $Summary, $mailtouser, $accupage, $getby , $htmlconv, $debug) = @_; ($Old, $New, $Rep) = (".old", ".new", ".rep"); my (@oldlines, @newlines, @difflines, @todolist); $urlin=''; my ($urlnr, $cwd); my $fh, $accurls, $pagecontent; my $pagenr=1; $debug = $debug || 0; $progname = (split(':',$0))[0]; chomp( $cwd = `pwd` ); $repstr = '##########################################################################'; $nextstr = '--------------------------------------------------------------------------'; $newitemstr = "new item"; # new item label $newlinestr = "new line"; # new line label # Vars $subject_long and $subject_short are evaluated in the END subroutine $subject_long = "/$datetime/ $progname: ".'$file'; $subject_short = "/$datetime/ from ".'$bname'; $nr=0; # counter for the number of items $defstr = "_DeFaUlT_"; # Dummy string $lock = "..LOCK"; # Extension of the lock file $sliipp = "0.1"; # Sleep period for lock mode $patientlimit = "1000"; # Waiting-limit in sec. $ppid = getppid(); # parent pid $mypid = $$; # pid $fft0 = "_Fi"; $fft1 = "Fi_:"; $fftoken = "$fft0$fft1"; # token for filter file $lotoken = "_LO"; # token for link only prefilter ############################################################################# # # Find out whether urlmon is forked and if necessary create an forking table # if( $mypid != $daddy ) { # if forked $forktab = "/tmp/.forktab.$daddy"; # forking-table name # Add process to the fork table ManageFork($forktab, $mypid, "add"); } # # for $variable ( $lookforstr, $dburl, $repdir, $Filename, $Summary, $mailtouser, $accupage, $getby, $htmlconv ) { $variable = $defstr if ( ! defined $variable || $variable eq "-" ) } $urlnr = $0; # Extract the internal number of URL request $urlnr =~ s/(.*)\s\((.+)\)\s(.*)/$2/g; $urlnr =~ s/([0-9]+)\/[0-9]+/$1/g; # This number is not unique!!!! # It changes if the list of URL's is changed or if forking is used. # Then the internal index is also changed. Thus the name of the # automatically generated file corresponding to this url is changed # # Extract the requested URL address ($urlin = $0) =~ s/(.+)((http|https|ftp|file):\/\.*)/$2/g; if ( $dburl eq $defstr ) { if ( $lookforstr eq $defstr ) { $dburl = $urlin; } else { ($dburl = $0) =~ s/(.*)((http|https|ftp|file):\/\/\S+\/)(.+)/$2/g; } } if ( $dburl !~ /["']{2}/ && $dburl !~ /^(http|https|ftp|file):\/\/.*/ ) { print "Your base URL is not a valid one!\n"; print "DBURL: $dburl\n"; chdir $cwd; exit 1; } if ( $lookforstr eq $defstr ) { $lookforstr = ''; } my (@server) = split('/',$urlin); if ( $Filename eq $defstr ) { $Filename = "$server[2]_$urlnr"; } if ( $repdir eq $defstr ) { # $repdir = "$ENV{'HOME'}/.$progname"; $repdir = $cwd; } if ( $Summary eq $defstr ) { $Summary = "Summary$Rep"; } for $variable ($Filename , $repdir, $Summary, $mailtouser, $getby, $htmlconv) { $variable = ExpandPath($variable); # Evaluate path } # assume that strings containing '/' are paths! 
if ( $mailtouser =~ /.*\/.*/ ) { $mailfile = $mailtouser; } else { $mailfile = ""; } if ( $mailfile ) { if( -e "$mailfile" ) { $mailtouser = CatFile( "$mailfile" ); } else { $mailtouser = "$defstr"; print "WARNING: There is no such mail file: $mailfile\n"; } } for ( $mailtouser ) { s/[&,\n]/\ /gm; # replace ampersand, comma and new-line with space s/\s+/\ /g; # replace multiple space characters with a single one s/^\s+//g; # delete leading spaces s/\s+$//g; # delete trailing spaces } if ( $Summary =~ /''/ || $Summary =~ /""/ ) { $Summary = ''; } if ( $repdir !~ /^\/.*/ ) { $repdir = "$cwd/$repdir"; } # Check html-convert method and set proper command line options $htmlconv =~ s/^['"]//; $htmlconv =~ s/['"]$//; if ( $htmlconv =~ /$defstr/ ) {$htmlconv = $whichlynx;} chomp( $lynx = `which $htmlconv 2>&1` ); $lynxopt = ""; if ( $lynx =~ /(lynx|links)$/ ) { my $i = 0; for $dumper ( @dumpers ) { $lynxopt = "$dumpopt[$i]" if ( $lynx =~ /$dumper/ ); $i++; } } if ( $lynx =~ /(which:|no |not found)/i || $lynx eq "" ) { die "ERROR: ascii converter '$htmlconv' for '$urlin' not found\n"; } if ( $accupage =~ /$defstr/ ) { $accupage = 1; } chdir $repdir or die "ERROR: can't cd to $repdir: $!\n"; if ( $debug >= 1 ) { printf("-------------------- DEBUG -----------------------\n"); print "Filter 'expand' version: $version_expand\n"; print "progname= $progname\n\$0= $0\n"; print "lookforstr= $lookforstr\n"; print "dburl= $dburl\n"; print "urlin= $urlin\n"; print "urlnr= $urlnr\n"; print "server= $server[2]\nFilename= $Filename\n"; print "lynx= $lynx lynxopt= $lynxopt\n" if $lynx; print "downloader= $getby\n"; print "htmlconv= $htmlconv\n"; print "repdir=$repdir cwd=$cwd pwd=".`pwd`; print "mailtouser=$mailtouser mailfile=$mailfile\n"; print "Accumulate pages: accupage=$accupage\n"; printf("--------------------- END ------------------------\n"); } # Insert patterns from filter files into $lookforstring $lookforstr = InsertFilterFiles($lookforstr) if ($lookforstr ne $defstr); # Unescape commas $lookforstr =~ s/\\,/,/g; # # Find out if complex $lookforstr i.e. with pre- and post-filtering # and/or multiple filter patterns # The pre- and post-string are separated by unescaped '&' character # Individual patterns in the pre- and post-strings are separated # by unescaped ';' character # (@prefilters, @postfilters, $prestr, $poststr) = (); # Pre- and Post-filters are divided by unescaped '&', not followed by '&' ($prestr, $poststr) = split( /(? 1 ) { if ( ! -e "$accufile" ) { # I'm the first member of the club CopyNewToOld("$Filename"); OpenFile(\*EDIT, ">", "$accufile"); flock(EDIT, LOCK_EX) or die "ERROR: can't flock '$accufile': $!"; select((select(EDIT), $| = 1)[0]); print EDIT "$pagenr\n$urlin"; CloseFile(\*EDIT, "$accufile"); } else { # All following members of the club go through a dummy wait undef $pagenr; while( ! 
defined $pagenr ) { # Wait if the first member is not ready OpenFile(\*IN, "<", "$accufile"); $pagenr = ; CloseFile(\*IN, "$accufile"); } $pagenr = 0; } print "PAGEnr = $pagenr\n" if ($debug > 2 ); } if ( $pagenr == 1 && $accupage == 1 ) { CopyNewToOld("$Filename"); } ### Make a List of DB-entries to be fetched if ( $urlin ne $dburl ) { # Split the content in lines and pre-filter them my (@clines) = split( '\n', $content); @clines = grep(s/^(.*)$/$1\n/g, @clines); @clines = FilterLines( \@clines, \@prefilters); OpenFile(\*OUT, "+>>", "$Filename$New"); # READ, WRITE, APPEND, CREATE foreach $line (@clines) { # Find new item to pull them out print "--- new line\n$line\n" if ($debug > 1 ); # in case of link only prefilter, reset the loflag to 1 $loflag = 1 if ( $loflag > 0 ); # loop over all references in the line foreach $ref ( split /\s+href\s*=/i, $line ) { print "\n--- ref before split\n$ref" if ($debug > 1 ); my @ref = $ref; if ( ! $loflag ) { # Apply the pre-filters again if not "link only" prefilter: # the returned value is an array with only one element $ref = (FilterLines( \@ref, \@prefilters))[0]; } else { # In case of "link only" filter, do not apply the prefilter again, # but take the first http-link found in the present $line which # is the second iteration value in the split-loop $ref = 0 if ( $loflag++ != 2 ); } next if ! $ref; chomp $ref; # extract the reference: # this is the first non-blank field [followed by blank] $ref = (split(' ', $ref))[0]; # and the first field ending with '>' if the blank-field was following $ref = (split('>', $ref))[0]; # Delete quotation marks if any $ref =~ tr/'"//d; $ref =~ s/(\S+)\s+(.*)/$1/gi; print "--- ref after extract\n$ref\n" if ($debug > 1 ); print "URL_base=$dburl\n" if ($debug > 1 ); # Look up for overlapping in the strings : this could be # very NASTY and could BREAK the correct construction of the uri's $ref = "$dburl $ref" =~ /^([\w+-\/\:\.\"\']+?)([^\/]+) ([\S]+)\2(\/[\S]+)$/ ? "$1$2$4\n" : "$dburl$ref\n"; print "--- ref after overlap\n$ref" if ($debug > 1 ); # Delete $dburl if there is a (second) absolute url in the uri $ref =~ s/^http.*(http.*)$/$1/; # Add the reference URI to the list push(@newlines,"$ref"); print "--- ref final\n$ref\n" if ($debug > 1 ); } } my %seen = (); @newlines = grep { ! $seen{$_} ++ } @newlines; # keep only unique elements @newlines = sort( @newlines ); # sort the list print OUT @newlines; CloseFile(\*OUT, "$Filename$New"); # } else { # Ascii-content monitoring mode my $record = ''; # The content is retrieved in lynx2ascii(). # Postfilters are not applied to the content here. $record = join('', lynx2ascii( $urlin, $content )) . "\n"; $record = "$record\n"; @newlines = split('\n', $record); @newlines = grep(s/^(.*)$/\1\n/g, @newlines); # # Apply the PRE-FILTERS # @newlines = FilterLines( \@newlines, \@prefilters) if @prefilters; $/="\n"; # To be sure the record separator is set to default print "NEWLINES (debug):\n@newlines\n" if ( $debug > 1 ); OpenFile(\*OUT, ">>", "$Filename$New"); print OUT @newlines; CloseFile(\*OUT, "$Filename$New"); } # If accumulating pages if ( -e "$accufile" ) { my @pagenr = (); OpenFile(\*EDIT, "+<", "$accufile"); while( ! flock(EDIT, LOCK_EX) ) {;} select((select(EDIT), $| = 1)[0]); @pagenr = ; @pagenr = grep { s/^\s+// || /./ } @pagenr; # strip blanks at begin of line chomp ($pagenr = $pagenr[0]); if ( ! $pagenr ) { print "WARNING: problem reading the accumulated page number\n"; $pagenr = 0; } seek(EDIT,0,0); unshift (@pagenr, "$urlin\n"); print EDIT eval ($pagenr + 1) . "\n" . 
"@pagenr"; truncate(EDIT,tell(EDIT)) or die "ERROR: truncating '$accufile': $!"; CloseFile(\*EDIT, "$accufile"); print "PAGENR = $pagenr\n" if ($debug > 2 ); # Do return if there are still pages to accumulate if ( $pagenr < $accupage ) { chdir $cwd; $pagecontent=join(' ', sort( @newlines) ); print "Returning: $pagecontent\n" if ($debug > 2 ); return $pagecontent; } unlink("$accufile"); # Adjust the list of all accumulated pages @pagenr = grep { s/^\s*\d+\s*$//g || /./ } @pagenr; @pagenr = grep { s/^\s+// || /./ } @pagenr; # strip blanks at begin of line @pagenr = grep { s/\s+$// || /./ } @pagenr; # strip blanks at end of line @pagenr = grep { s/\n/\ /mg || /./ } @pagenr; # Keep only unique entries in the list and then reverse the order my %seen = (); @pagenr = reverse( grep { ! $seen{$_} ++ } @pagenr); $accurls = join( "\n", @pagenr ); print "ACCURLS = " . join( " ", @pagenr ) . "\n" if( $debug > 2); } # Read new items for any case (needed when they have been accumulated) # and also to sort them (if not ascii-content monitoring mode) if ( -e "$Filename$New" && $urlin ne $dburl ) { @newlines = (); OpenFile(\*IN, "+<", "$Filename$New"); @newlines = ; # Keep only unique elements and sort them my %seen = (); @newlines = sort( grep { ! $seen{$_} ++ } @newlines ); seek(IN,0,0); print IN @newlines; truncate( IN, tell(IN) ) or die "ERROR: sorting '$Filename$New' in place: $!"; CloseFile(\*IN, "$Filename$New"); } OpenFile(\*IN, "<", "$Filename$Old"); @oldlines = ; CloseFile(\*IN, "$Filename$Old"); @newlines=(); OpenFile(\*IN, "<", "$Filename$New"); @newlines = ; CloseFile(\*IN, "$Filename$New"); my $repfile = ''; $nr=1; # index of the url which is incremented in lynx2ascii ### Find out new DB-entries and fetch them if ( $urlin ne $dburl ) { @todolist = LookForNews( \@newlines, \@oldlines); print "--- DEBUG\nTODO:\n@todolist\n--- END\n" if ( @todolist && $debug > 1 ); if ( @todolist ) { ($nr, $repfile) = RetrieveNewItems (@todolist); if ( $nr ) { # $nr is returned zero if there are not new items if ( $Summary =~ /STDOUT/i ) { print CatFile( "$repfile" ); } else { # Add the report to the summary and send mail AddToSummary("$repfile","$Summary") if $Summary; QueueMail(); } } } } elsif( ! -z "$Filename$New" ) { ### Ascii-content monitoring mode: # find out new lines, or small differences, or consider the whole new # content as new if the difference btw. old and new is too big my $difnrl; # This boundary value gives the meaning of "big" and "small" my $upbou = 0.05; my $norm = $upbou; my $diff = "diff -c"; @difflines = `$diff $Filename$Old $Filename$New |grep ^+`; @difflines = grep(s/^\+\ (.*)$/$1/g, @difflines); $difnrl = scalar @difflines; print "DIFFLINES (debug):\n@difflines\n" if ( $debug > 1 ); if( ! @difflines ) { # If no additional lines in the new file was found # try to see whether both files are too different! # If so, assume that the whole content is new. 
@difflines = `$diff $Filename$Old $Filename$New`; # Only changed lines are considered in the weights $difnrl = scalar (grep { /^!/ } @difflines); if( $difnrl > 0 ) { OpenFile(\*IN, "<", "$Filename$Old"); @oldlines = ; CloseFile(\*IN, "$Filename$Old"); my $newnrl = scalar @newlines; my $oldnrl = scalar @oldlines; $norm = sprintf( "%.2g", $difnrl / ($newnrl + $oldnrl) ); if( $norm > $upbou ) { print "WARNING $urlin :\nBig difference ($norm) between old and new content!\n"; print "Assuming the whole new content is different.\n"; @difflines = @newlines; } else { unshift( @difflines, "WARNING $urlin :\nSmall difference ($norm) between old and new content!\n" ."Assuming there is no difference within the bound of $upbou\n" . "Result from '$diff ' follows:\n" ."~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n" ); print "@difflines\n"; } } } # # Apply the POST-FILTERS to the content as a single string # # The whole list goes to the first element if ( @postfilters && @difflines ) { @difflines = join('', @difflines); @difflines = FilterLines( \@difflines, \@postfilters); $/="\n"; # To be sure the record separator is set to default undef @difflines if ( @difflines && "$difflines[0]" eq "" ); print "DIFFLINES postfilter:\n--@difflines--\n" if ( $debug > 1 ); } if ( @difflines ) { @difflines = join('', @difflines); # if it was not post-filtered $repfile = InitFile("$Filename$Rep","$newlinestr", (scalar split('\n', $difflines[0]))); OpenFile(\*OUT, ">>", "$repfile"); if ( $accupage == 1 ) { print OUT "$nextstr\n"; if( $difnrl > 0 && $norm >= $upbou ) { # if there is a difference print OUT "$urlin\n\n"; } } else { # If the result is accumulated from a list of pages print OUT "List of accumulated pages:\n"; print OUT "--------------------------\n"; print OUT "$accurls\n"; print OUT "$nextstr\n\n"; } print OUT "@difflines\n"; CloseFile(\*OUT, "$repfile"); if ( $Summary =~ /STDOUT/i ) { print CatFile( "$repfile" ); } else { AddToSummary("$repfile","$Summary") if $Summary; QueueMail(); } } } # # Restore previous state if the new file has a size of zero!! # It means that something bad happened and this can not be the # real new state of affair ;-) # if ( -z "$Filename$New" && !-z "$Filename$Old" ) { rename("$Filename$Old","$Filename$New"); print "INFO: '$Filename$New' has zero size, old state restored. May be check your pre-filters..\n"; @newlines = @oldlines; } chdir $cwd; return join(' ', @newlines); }; #------------------------------------------------------------------------- sub CopyNewToOld { my $filename = shift; if ( -s "$filename$New" ) { rename("$filename$New","$filename$Old"); } elsif ( ! -e "$filename$Old") { copy("/dev/null","$filename$Old"); } return 0; } sub ExpandPath ($) { my $path = shift; # we want to catch: ~, ~/foo, ~user, and ~user/foo $path =~ s{^~([^/]*)} { $1 ? (getpwnam($1))[7] : ($ENV{'HOME'} || $ENV{'LOGDIR'} || (getpwuid($>))[7]) }ex; # evaluate $ENV-variables like $ENV{'HOME'}, $ENV{'LOGNAME'} $path =~ s/(\$\w+\{'\w+'\})/$1/gee; $path =~ s/(\$\w+)/$1/gee; return $path; } sub CheckForLinkOnlyPrefilter($) { my $filter = shift; my @lo = split(':',$filter); # If a "link only" filter, join the split array skipping 0-element # i.e. just remove the "$lotoken" string from the filters if ( $lo[0] eq $lotoken ) { $filter = join(':', @lo[1 .. 
$#lo] ); print "Found link only prefilter: $lo[1]\n" if $debug; } return $filter; } sub InsertFilterFiles($) { my $filters = shift; my $filename, $add; my (@filters, @files, %seen) = (); # Split on tokens skipping empty lines # First pass @files = grep { /^$fft1/ } split( /$fft0/, $filters ); $add = join( '', @files ); # Second pass @files = grep { /./ } split( /$fft1/, $add ); @files = grep { s/^([^;]*){1}?;(.*)$/$1/} @files; # Keep only unique @files = grep { ! $seen{$_} ++ } @files; print "FF: Filter Files= '@files'\n" if ($debug > 3); foreach $file (@files) { $filename = $file; $file = ExpandPath($file); # Evaluate path if( -e $file ) { OpenFile(\*IN, "<", "$file"); @filters=; CloseFile(\*IN, "$file"); # Skip all comments, remove space characters and empty lines @filters = grep { !/^\s*\#/ } @filters; @filters = grep { s/\s+//g } @filters; @filters = grep { /./ } @filters; chomp @filters; $add = join( '' , @filters ); # Add the new filters to the present print "FF: FILTERS = '$filters'\n" if ($debug > 3); $filters =~ s/$fftoken$filename;/$add/g; print "FF: FILTERS after insert = '$filters'\n" if ($debug > 3); } else { print "WARNING: There is no such filter file: $file\n"; } } return $filters; } sub FilterLines ($$) { my ($content, $filters) = @_; my @content = @$content; my @filters = @$filters; # In list context the filters are applied to each element. # In scalar context (the first element of the list contains a multiline # single string) the filters are applied on a single string, such # that you can control them with the /..../smg options for perl regexps foreach $filter ( @filters ) { if( $filter !~ /(?new(); $ua->env_proxy(); # use proxy servers as defined in env. variables $ua->proxy(['http', 'ftp'], $myproxy) if $myproxy; $ua->agent($myagent); $ua->timeout([$secs]); my $n=0; foreach $url (keys %requests) { sleep $ENV{'URLMON_SLEEP_FORNEXT'} if ( $ENV{'URLMON_SLEEP_FORNEXT'} && $n && !$debug ); $n++; $content_type=guess_media_type($url); print "CONTENT TYPE of $url : $content_type\n" if ($debug > 4 ); if ( ! defined $downloader ) { # Fetch the target url $response = q{}; $response = $ua->request($requests{$url}) if ($debug < 7 ); if ((! ref $response) or (! 
$response->is_success) ) { $responses{$url} = "ERROR: retrieving '$url': $response->status_line"; print "$responses{$url}\n" if ( $debug > 2 ); } else { $responses{$url} = $response->content(); } } else { if ( $downloader =~ /wget/ ) { $downloader .= " -q -O-"; } print "Trying to retrieve by '$downloader': $url\n" if ( $debug > 2 ); $responses{$url} = `$downloader $url`; } print "Done: $url\n" if ( $debug > 2 ); } my @res_content = (); my $allrecords = ""; foreach $url (keys %responses) { # Convert to ascii, postfilter and save result @res_content = lynx2ascii( $url, $responses{$url} ); $allrecords .= Apply_ContentLine_PostFilters( \@res_content, \@postfilters); } my $repfile = ''; if ( $allrecords ) { # Initialize the report file $repfile = InitFile("$Filename$Rep", "$newitemstr", $nr-1); # Write all records to the file OpenFile(\*OUT, ">>", "$repfile"); print OUT "$allrecords"; CloseFile(\*OUT, "$repfile"); } return ($nr-1, $repfile); } # End of RetrieveNewItems sub InitFile () { my ($fname, $label, $nrits ) = @_; my @replist = <$fname$Dtag.[0-9][0-9]>; my $nextrep = scalar @replist; $nextrep = sprintf("%02d",$nextrep%100); # Revolve after 100 reports my $actname = "$fname$Dtag.$nextrep"; ### Tag the file system( "date > $actname; echo $nrits $label\\(s\\) in $actname \\ >> $actname" ); ### Make the new link print "WARNING: $fname is not a link.\n" if( -e "$fname" && ! -l "$fname" ); my ($path, $base, $link) = ( dirname("$actname"), basename("$actname"), basename("$fname") ); system("cd $path; ln -sf $base $link; cd - >/dev/null"); return "$actname"; } sub AddToSummary ($$) { # Accumulate data for all url's into a summary my ($fname, $summary) = @_; my $longname = "$summary$Dtag.$time"; my ($FH, $IN, $OUT, @input); # Lock the file if you use -F option of the urlmon $FH = WaitThenLock( $summary, $sliipp, $patientlimit, $lock) if $forktab; # Create and link a new summary file if not existing # otherwise append to previous if ( ! -e "$longname" ) { print "WARNING: $summary is not a link.\n" if( -e "$summary" && ! 
-l "$summary" ); unlink("$summary"); TouchFile( "$longname" ); my ($path, $base, $link) = ( dirname("$longname"), basename("$longname"), basename("$summary") ); system("cd $path; ln -sf $base $link; cd - >/dev/null"); } OpenFile(\*IN, "<", "$fname"); OpenFile(\*OUT, ">>", "$summary"); @input = ; select((select(OUT), $| = 1)[0]); # Put first a header, then paste the file print OUT "\n$repstr\n#\n# $fname\n#\n$repstr\n"; print OUT @input; CloseFile(\*OUT, "$summary"); CloseFile(\*IN, "$fname"); if( $forktab ) { # Unlock unlink("$summary$lock"); close($FH); } } sub lynx2ascii { print "LYNX2ASCII\n" if ( $debug > 2 ); my ($url, $content) = @_; my @content = (); my $lynxpid; if( length $content ) { my ($file) = ".$mypid.$progname"; OpenFile(\*OUT, ">", "$file"); print OUT "$content"; CloseFile(OUT,"$file"); # $lynx is used as ascii-converter, # but you can provide also an external program $lynxpid = open( LYNX, "$lynx $lynxopt $file 2>&1 |") or die "ERROR: forking '$lynx $lynxopt $file' to convert '$url': $!\n"; @content = ; unlink("$file") or print "Warning: can't delete \"$file\"\n"; close(LYNX) or die "ERROR: closing LYNX-pipe: $!\n"; } else { print "INFO: '$url' has zero content\n"; } # Delete some links/lynx program-specific scrap @content = FilterLines( \@content, \@lynxfilters); return @content; } # End of lynx2ascii sub Apply_ContentLine_PostFilters { my ($c, $pf) = @_; my @content = @$c; my @postfilters = @$pf; my $result_str = ""; # # Apply the POST-FILTERS to the content as a single string # print "--- Content before POST filtering ---\n@content\n" if ( $debug > 5 ); if ( @postfilters ) { @content = join('', @content); @content = FilterLines( \@content, \@postfilters); undef @content if ( @content && "$content[0]" eq "" ); $/="\n"; # To be sure the record separator is set to default } print "--- Content after POST filtering ---\n@content\n" if ( $debug > 5 ); # Enumerate the result and save it into the string if ( @content ) { if( $urlin ne $dburl || $accupage == 1 ) { $result_str = TagNext($result_str, $url); } $result_str .= join('', @content)."\n"; } return $result_str; } sub TagNext { my ($string, $url) = @_; if( $nr > 0 && $urlin ne $dburl ) { # only database entries are numbered $string .= "$nextstr\n"; $string .= sprintf("%03d %s --> %s\n\n", $nr++, $url, $progname); # if you change the previous line then the counter in the END-subroutine # have to to be adjusted! } return $string; } sub CloseFile { my ($file) = $_[1]; close ($_[0]) or die "ERROR: can't close file $file: $!\n"; print "'$file' closed successfully\n" if ( $debug > 2 ); } sub OpenFile { my ($file) = $_[2]; open ($_[0], "$_[1]", $file) or die "ERROR: can't open file '$file': $!\n"; print "'$file' opened successfully\n" if ( $debug > 2 ); } sub QueueMail () { ### Only summary files are tagged which will not be mailed! return if ( $mailtouser =~ /^$defstr$/ && ! 
$Summary ); ### If a valid mail user then push into the queue my $file = "$Summary" || "$Filename$Rep"; chomp( my $path = `pwd` ); if ( $file !~ /^\/.*/ ) { # If the file name does not $file = "$path/$file"; # include an absolute path } %addresses = () unless %addresses; foreach $address (split( '\ ', $mailtouser)) { push( @{$addresses{$address}}, "$file" ); } } sub ManageFork ($$$) { my ($ffile,$id,$dowhat) = @_; my $limit = "10"; my $sltime = "0.1"; my (@readlines); my $FH; if( $dowhat eq "add" ) { print "DEBUG: ManageFork add $id\n" if( $debug > 2); $FH = WaitThenLock( $ffile, $sltime, $limit, $lock); open(OUT, ">> $ffile"); close(OUT); OpenFile(\*EDIT, "+<", "$ffile"); @readlines = ; push(@readlines, "$id\n"); seek(EDIT,0,0); print EDIT @readlines; truncate(EDIT, tell(EDIT)) or die "ERROR: truncating '$ffile': $!"; CloseFile(\*EDIT, "$ffile"); print "DEBUG: ManageFork add unlock: $ffile\n" if( $debug > 2); unlink("$ffile$lock"); close(FH); print "DEBUG: ManageFork add end $id\n" if( $debug > 2); return 0; } if( $dowhat eq "remove" ) { print "DEBUG: ManageFork remove $id\n" if( $debug > 2); $FH = WaitThenLock( $ffile, $sltime, $limit, $lock); if( -e "$ffile" ) { OpenFile(\*EDIT, "+<", "$ffile"); @readlines = ; @readlines = grep(!/$id/, @readlines ); # weed out lines with this id seek(EDIT,0,0); print EDIT @readlines; truncate(EDIT, tell(EDIT)) or die "ERROR: truncating '$ffile': $!"; CloseFile(\*EDIT, "$ffile"); # Delete file if no more processes in the table unlink ("$ffile") if( ! (scalar @readlines) ); } print "DEBUG: ManageFork remove unlock: $ffile\n" if( $debug > 2); unlink("$ffile$lock"); close(FH); print "DEBUG: ManageFork remove end $id\n" if( $debug > 2); return 0; } } sub WaitThenLock ($$$$) { my ($lfile, $speriod, $limit, $lext) = @_; $lext = '' unless $lext; local *LOCKFH; print "DEBUG: WaitThenLock $lfile, $speriod, $limit, $lext\n" if( $debug > 2); my $total = 0; $lfile = "$lfile$lext" if $lext; # This first part of the routine is just good if only need to wait # (with defined upper time-limit) as long as a file exists while ( -e "$lfile" ) { select(undef,undef,undef,$speriod); # Trick for waiting less than 1 sec. $total += $speriod; if( $total > $limit ) { # Check against the upper limit unlink("$lfile"); print "Waiting Godot too long: $total sec...\n". "Lock file $lfile removed. Going ahead ;-)\n"; } } # If $lext is not defined, than we are just waiting and we will # not lock but return! return 0 if( ! $lext ); # Open and lock if $lext is defined! sysopen(LOCKFH, "$lfile", O_RDWR|O_CREAT); # Here is the exclusive lock which is unlocked (unlik+close) # in the corresponding routines where it was called. while( ! 
      flock(LOCKFH, LOCK_EX) ) {;}
  return *LOCKFH;
}

sub TouchFile {
  my $_filename = shift;
  sysopen( FILE, "$_filename", O_RDWR|O_CREAT ) && close FILE
    || die "ERROR: can't create file '$_filename': $!\n";
}

sub CatFile {
  my $_file_name = shift;
  my @_content = ();
  OpenFile(\*IN, "<", "$_file_name");
  @_content = <IN>;
  CloseFile(\*IN, "$_file_name");
  return join('', @_content);
}

sub Grep_wc {
  my ($_file_name, $_regex, $_count) = @_;
  OpenFile(\*WC, "<", "$_file_name");
  my @_wc_data = <WC>;
  CloseFile(\*WC, "$_file_name");
  @_wc_data = grep { /$_regex/ } @_wc_data;
  if ( $_count ) {
    return (scalar @_wc_data);
  } else {
    return join('', @_wc_data);
  }
}

### Print Filter ###
#
$filtercode{'print'} = sub {
  my ($content) = @_;
  print "$content";
  return $content;
};

1;

__END__

################################################################################
#
# Documentation of the filters in POD
#

=head1 NAME

C<.urlmon.mfs> - filters for B<urlmon> used for www-page content monitoring

=head1 DESCRIPTION

C<.urlmon.mfs> includes the 'print' and 'expand' filters.

You can instruct B<urlmon> to load the filter's code from the current working
directory by putting the following line at the beginning of your urlmon
database file:

    CODE=$ENV{'PWD'}/.urlmon.mfs

If the above doesn't work, then:

    CODE=/<path>/<to>/.urlmon.mfs

For more details see the documentation of B<urlmon>.

The 'B<print>' filter just prints out the fetched HTML page. It doesn't
require arguments.

The remaining documentation deals with the 'B<expand>' filter, which is a
general purpose content filter aimed to produce reports and a summary of
reports from monitoring. It examines differences (see the paragraph NOTES
below) between the previous and present text content of the monitored URL and
reports them! Alternatively, it can pre-filter an HTML document and make a
list of included URLs (hyperlinks) which will be retrieved, their text content
filtered, and eventually reported. This is the most powerful feature of that
filter! The monitored documents are fetched via the API provided by the LWP
modules and can be converted to ASCII by 'lynx' or 'links'.

The filter can be used in forking mode: C<'-F'> option of B<urlmon>.
Simultaneous reading and updating of files by the child processes is based on
rudimentary file locking. This mechanism is not very elegant or efficient,
but it works :-)

There are three possibilities for monitoring. Which one is used depends on
the first and second arguments passed to the filter (see also the ARGUMENTS
description):

=head2 A) You want to monitor the text content of a given page and you don't want to apply filtering

In this case the first and second arguments can be skipped; alternatively you
can say

    FARGS=-,-

=head2 B) You have the situation as in A), but you want to do text content filtering

You must put as second argument the same URL as given in the 'URL=' variable
of the urlmon database file:

    URL=http://www.example.org/pub/test.html
    MOD=abcdefghijklmnopqrstuvwxyz
    FILTER=expand
    FARGS=&,http://www.example.org/pub/test.html

=head2 C) You want to monitor a list of specific hyperlinks as extracted from the target URL, whereas new members in the list should be retrieved and filtered

First you need to apply a pre-filter, which is defined by regular expressions,
to capture the wanted hyperlinks found in the observed page.
Consider the targeted URL used in case B) as follows:

    URL=http://www.example.org/pub/test.html
    MOD=abcdefghijklmnopqrstuvwxyz
    FILTER=expand
    FARGS=details.cgi&,-

Assume that you expect to find in the file "test.html" lines with links which
point to some script called "details.cgi":

    <a href="details.cgi?quest=1">This is First</a>
    <a href="details.cgi?quest=2">This is Second</a>
    Hi everybody!

Now, if you are interested in these links (and aim to request and evaluate
them), you can make a list of the wanted URL's by using the pre-filter
"details.cgi" as defined in FARGS above. After processing the HTML code in
"test.html", the 'B<expand>' filter will find and request the following list:

    http://www.example.org/pub/details.cgi?quest=1
    http://www.example.org/pub/details.cgi?quest=2

Note that the string "http://www.example.org/pub" is automatically put in
front of every listed item. Or, to be precise, an extracted hyperlink is
appended to the monitored URL-string stripped of its basename.

In contrast, if the lines with links in the above example look like

    <a href="cgi/details.cgi?quest=1">This is First</a>
    <a href="cgi/details.cgi?quest=2">This is Second</a>

you need to give a second argument in FARGS defining the in-front part of the
extracted links:

    URL=http://www.example.org/pub/test.html
    MOD=abcdefghijklmnopqrstuvwxyz
    FILTER=expand
    FARGS=details.cgi&,http://www.example.org/

The resulting list will be:

    http://www.example.org/cgi/details.cgi?quest=1
    http://www.example.org/cgi/details.cgi?quest=2

These documents are then retrieved and filtered.

In all examples in C), the string "details.cgi" plays the role of a pre-filter
submitted as a FARGS argument. You are not restricted to a single pre-filter;
you can use as many as you want. They apply to the HTML content of the
monitored URL, and are used to find the wanted links in the HTML document.
Hence, these pre-filters behave a little bit differently compared to the cases
A) and B)! Furthermore, they are applied two times at different steps in the
procedure of determining the list of documents to be retrieved. Be careful
when using more than one pre-filter - define the filters such that they
include only regular expressions which are found at the right-hand side of the
http-link you are looking for!

Still, if you want the pre-filters to be applied only one time, and the filter
to work as a regex on the whole input line, then the token "_LO:" must be put
in front of the pre-filter string (alias "link only", or LO). Using again the
last example, a link only filter definition could be:

    URL=http://www.example.org/pub/test.html
    MOD=abcdefghijklmnopqrstuvwxyz
    FILTER=expand
    FARGS=_LO:/This.is/&,http://www.example.org/

One link per line is expected when using a LO-filter.

Note that in case C) the comparison between the last and present content of
the monitored page is actually a comparison between the last and present list
of extracted URL's. Also, in the discussed examples we implicitly assumed that
the last list is missing, as we are just starting to monitor the page.

Enjoy!

=head1 ARGUMENTS

Arguments which are skipped (undetermined) receive default values! If you want
to skip arguments between determined arguments, then you have to put the '-'
character, as for example:

    FARGS=<first>,-,-,<fourth>

For some arguments the empty string '' or "", which means an empty argument
value, can be given! In such cases '-' and '' are not the same!

The arguments used by the 'B<expand>' filter, and ordered by their index, are
described as follows:

B<1)> A string defining pre- and post-filters. Although from the point of view
of B<urlmon> the whole code of 'B<expand>' is considered as an external
filter, the actual content filtering is determined by this argument.
Hence, the argument is called "filter(s)" in this documentation.

All filters are applied by using the grep() function. They are separated by
an unescaped ';' character not followed by ';'. Thus ';;' is not interpreted
as a filter separator, and one can use it in braced blocks (i.e. { BLOCK },
see below) instead of ';'.

A combination of pre- and post-filters is a complex string consisting of two
substrings separated by an unescaped '&' not followed by '&', which means that
you can have '&&' in a braced block or elsewhere.

The pre-filters can be applied to the content before the old and new contents
are compared (cases A and B in the examples above), or can be used for
extracting wanted links (case C). In the latter case, the URL's content is
processed line by line. Only when the pre-filter matches is the line evaluated
further. It is split into strings separated by the substring ' href=', and
then the pre-filter is applied again to every single string. This behavior
can be changed by defining a pre-filter as "link only". In order to do so,
put the token "_LO:" in front of your pre-filter (look at the example in C).
Consequently, the pre-filter is applied to whole lines without splitting them.

The post-filters are used to filter the text content of a retrieved page. For
post-filtering the content is joined into a single record (i.e. a single
string). Ultimately, the pre- and post-filters generally have a slightly
different syntax, since the former are applied by grep() in list context and
the latter in scalar context. This complication has the advantage that by
using post-filters one can easily discard a whole record based on various
criteria, as for example looking for given keywords in the record.

For both pre- and post-filters, every single filter is automatically wrapped
within '//' (e.g. <filter> is wrapped to /<filter>/) if it doesn't include an
unescaped '/' and if it isn't defined in a block. One can also give already
wrapped filter(s), e.g. /<filter1>/;!/<filter2>/i; etc., or a mixture of
wrapped and unwrapped filters. Any complex filter(s) must be defined as a
single string without space characters!! Every filter from the complex filter
string is implemented as a block expression, i.e. filtering is applied by
using: '@list = grep { /<filter>/ } @list;'

You can also apply filters which are explicitly defined in a block! This
allows writing sophisticated filter code. Escaped commas can also be used.
Commas are unescaped before the filters are applied, which means that for an
escaped comma one needs to write '\\,'.

EXAMPLES FOR VALID PRE-FILTERS:

    (?i)BoY           -> will be wrapped to /(?i)BoY/, equal to /BoY/i
    !/girl/i          -> delete lines containing the word 'girl'
    (?i)(Woman|girl)  -> keep lines containing the word 'woman' or 'girl'
    s/sex/SEX/g||/.*/;!/violence/i -> substitute globally + abandon violence
               ^^^^^^---> this part is needed in case the whole list
                          corresponding to the record does not contain the
                          word 'sex' but you want to keep the record anyway
    {$I0++<=9&&$_>100} -> the first 10 lines of the record; if these are
                          numbers, then include only numbers > 100;
                          $I0 is declared as a global variable!
                          Be careful whether it is used elsewhere!

Using /./ instead of /.*/ in the regular expressions will discard all blank
lines!
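To make the difference between line-wise pre-filtering (list context) and
record-wise post-filtering (scalar context) concrete, here is a minimal
stand-alone Perl sketch; it is not part of the filter code, and the sample
lines and patterns are invented for illustration only:

    use strict;
    use warnings;

    my @lines = ("good Boy\n", "bad girl\n", "just noise\n");

    # An unwrapped pre-filter '(?i)BoY' is wrapped to /(?i)BoY/ and
    # applied to every line of the list:
    my @kept = grep { /(?i)BoY/ } @lines;      # keeps only "good Boy\n"

    # A post-filter '!/bad/' is applied to the whole joined record at once,
    # so a single match discards the complete record:
    my $record = join('', @lines);
    my @result = grep { !/bad/ } ($record);    # empty - record discarded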
EXAMPLES FOR VALID POST-FILTERS:

    !/bad/            -> delete the whole record if it contains the word 'bad'
    /(Happy.*Days|Days.*Happy)/si -> keep the record if it contains somewhere
                          in it the two words 'Happy' and 'Days',
                          case insensitive
    s/EVIL/GOOD/mg||/.*/ -> global change of 'EVIL' into 'GOOD'
    s/^.*ADVERTisement.*\n//mgi||/.*/ -> delete lines with 'advertisement'
    s/^.*ADVERTisement.*\n//mgi -> this is wrong, because if there is no such
                          word in the record, it will be discarded by grep!!
    s/^(?:(?!Love).)*\n//mg||/.*/;s/^(?:(?!Peace).)*\n//mg||/.*/
                      -> keep lines which contain both 'Love' and 'Peace'
    /Luck(?=(?:(?!Trouble).)*$)/si -> Luck which is not followed by Trouble
    /smart/si&/^(?:(?!soft).)*$/si -> smart but not soft

COMBINATIONS OF PRE- AND POST-FILTERS:

    simple      -> this applies a pre-filter only
    &easy       -> this applies a post-filter only
    simple&easy -> it's the same as /simple/&/easy/
    (?i)simple;!/complex/i&easy;s/^(?:(?!dummy).)*\n//mig||/.*/
    (?i)simple;!/complex/i;&easy;s/^(?:(?!dummy).)*\n//mig||/.*/;
                -> the same as the previous one but with the explicit
                   separator ';' put after every single filter

Well, this is not all about filters. One can put them (or part of them) in a
file which is parsed and included into the complex filter string. And that's
really fun! To include such file(s) the token '_FiFi_:' should be given
somewhere in the complex filter string, e.g.:

    simple&easy;_FiFi_:~/urlmon/myfilters;

Then all of the filters defined in '~/urlmon/myfilters' will be pasted exactly
at this place in the filter string. Or another example:

    simple;_FiFi_:~/urlmon/addpre;&easy;_FiFi_:~/urlmon/addpost;

Finally, one can define all of one's filters (pre- and post-filters) in an
external file and just say:

    _FiFi_:~/urlmon/allmyfilters;

It's important to put ';' after the file name! Before pasting the filter file
it is parsed such that all blank characters, empty lines and also comments
(lines starting with any number of blank characters followed by '#') are
removed. Consequently, you can define filters in an external file and use
blank characters in the code, as opposed to the default situation where the
complex filter string should be a single string without blanks. The other
rules for writing filters, as previously explained, are still valid. So,
separating pre- and post-filters in a filter file is done as usual by putting
'&'. But don't forget: at the end, when the complex filter string is returned
to the program, it should contain only one separating '&'.

B<2)> Base URL (use '-' for auto-find-out, '' or "" for none). For database
requests, if the extracted document's path/URL for the request has an
overlapping part with the base URL-string, then the part of the extracted
string which is in front of the overlap is deleted. For example, if the base
URL is "http://www.example.org/pub/documents" and the document's path is
"../somedir/somesubdir/documents/fetchme?id=123", then the constructed URL
for the database request will be
"http://www.example.org/pub/documents/fetchme?id=123".

B<3)> Working directory where the data files will be saved - the current
working directory is the default. Paths should be absolute or relative to the
working directory. One can also use the Perl variables $ENV{'HOME'} and
$ENV{'PWD'}, and tilde paths like ~/<path> or ~<user>/<path>.

B<4)> Basename of the file in which the results for this URL will be saved
(use '-' for auto naming). Generally, three files are created: one with
extension '.old', the next with extension '.new', and the last with extension
'.rep' which will be a link to the report file. The name of the actual report
file is generated automatically and consists of the basename, a date and time
tag, and an increasing counter number.
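For instance, with the basename "www.example.org_1" and a first report created
on 1 May 2014, the working directory would contain files along these lines
(the names below are illustrative, derived from the naming scheme in the code
above):

    www.example.org_1.old
    www.example.org_1.new
    www.example.org_1.rep.2014.05.01.00
    www.example.org_1.rep -> www.example.org_1.rep.2014.05.01.00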
B<5)> File where the summary of the results is saved (use '-' for auto, '' or
"" for none). "Summary.rep" is the default (see also the previous argument).
The magic word "STDOUT" (case insensitive) will cause the summary to be sent
to the standard output; in this case mail will not be sent and the output will
not have a header. Use absolute paths in the filename if you have different
directories for different URL entries and you want to be sure that all reports
are written to the same file! In case the 3rd argument is given, relative
paths for this file are relative to it.

B<6)> E-mail address(es) or file with address(es) to send the report to. In
case you have a summary (see argument 5) the summary is e-mailed, otherwise
the file with the results for this URL is sent. The validity of the address is
not checked. Multiple e-mail addresses can be passed to the program by giving
a string containing different addresses separated by '&':
"user1@host1&user2@host2&user3" etc. When using a file with e-mail addresses,
write the absolute path, $ENV{'HOME'}/<path>/<file>, or at least ./<file>!
E-mail addresses in this file can be separated by blanks, new lines, or '&'.
If there are no changes between the old and new documents, mail will not be
sent. The mail queue is checked for unique delivery orders such that a report
will be sent only once to the user. Hence one can repeatedly give the same
e-mail address in different URL entries (of the urlmon database) for the same
report.

B<7)> Total number of pages in a collection. This argument is needed if you
want to observe many pages (URLs) as a single collected page. For example,
when you monitor a URL with a list of database items (links), but the number
of items in the list is limited by the provider, you have to load in sequence
all following pages with items to go through the whole list. The solution is
to define explicitly additional URL entries in urlmon's database file for
every following page in the provider's generated list. When monitoring a
collection of pages and B<urlmon> is executed in forking mode, the present
argument is essential. Nevertheless, it has to be defined in any case when
pages are collected. The basename of the file where the results are saved
(see argument 4) must be the same for all URLs in the collection!

B<8)> Program to use for retrieving the extracted list of items from the
monitored URL - like in case B above. It should print out the downloaded HTML
documents to standard output. Such a program could be 'wget', for example; in
this case the wget options '-q -O-' are automatically added on execution.

B<9)> Program to use for converting the HTML document to ASCII. 'lynx',
'links', or a user-defined program can be used:

    'lynx' or /<path>/lynx   - use 'lynx' (default)
    'links' or /<path>/links - use 'links'
    /<path>/<program>        - user-defined program that converts HTML
                               to ASCII

The program should expect as its (only) command-line argument the name of the
file to be converted, and should print the ASCII text to STDOUT. In practice,
a private converter serves as an external content filter that presumably does
[quite] complicated filtering and executes additional user-defined tasks.

B<10)> Debug level: 1-5 are possible values.

=head1 NOTES

The difference between the new and old text content of a document is
considered to be the additional lines in the new document compared to the old
one. If there are no additional lines but only a few lines have changed, only
the changed parts of the documents are reported; if, alternatively, [almost]
the whole text is new, the whole content is reported as changed. For details
on how the boundary between a small and a big difference is defined, look at
the code :-)
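In short, the check in the code above boils down to the following condensed
sketch (the 0.05 boundary is the value used in the code; the line counts here
are invented example numbers):

    my $upbou = 0.05;                        # boundary between small and big
    my ($changed, $old, $new) = (3, 40, 42); # changed, old and new line counts
    my $norm = sprintf( "%.2g", $changed / ($old + $new) );
    if ( $norm > $upbou ) {
        print "big difference: the whole new content is reported\n";
    } else {
        print "small difference: only the changed lines are reported\n";
    }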
=head1 ENVIRONMENT VARIABLES

The following variables can be defined in the OS environment to further
configure the behavior of the program:

B<URLMON_CTYPE> - could be used to override the LC_CTYPE setting and to define
the character encoding of the resulting output text.

B<URLMON_SLEEP_FORNEXT> - defines the time (in floating-point seconds) which
the 'expand'-filter waits before fetching the next item on its list of
extracted urls. "Don't wait" is the default behavior.

=head1 COPYRIGHT

Copyright (C) 2001-2008, 2011-2014 Dimitar Ivanov,

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

You can find the latest version of the program at http://gnu.mirendom.net

=head1 SEE ALSO

Check the documentation in the B<urlmon> distribution.

=cut

#
# End of POD
#