0001 #!/opt/perl-5.8.8/bin/perl 0002 0003 # ========= ========= ========= ========= ========= ========= 0004 # spy.pl 0005 # 0006 # PURPOSE - This program spies on websites -- checks them 0007 # regularly for changes -- then saves the most 0008 # recent time saved into a DBM file. It also can 0009 # execute 0010 # 0011 # AUTHOR - Dave Jacoby 0012 # 0013 # Copyright (c) 2006. David Jacoby. All Rights Reserved. 0014 # 0015 # This program is free software; you can redistribute it 0016 # and/or modify it under the same terms as Perl itself. 0017 # 0018 # See http://www.perl.com/perl/misc/Artistic.html 0019 # 0020 # --------- --------- --------- --------- --------- --------- 0021 0022 use AnyDBM_File ; 0023 use Getopt::Long ; 0024 use HTTP::Request ; 0025 use HTTP::Response ; 0026 use LWP::UserAgent ; 0027 use Digest::MD5 qw( md5_hex ) ; 0028 use Digest::Nilsimsa ; 0029 use subs qw( 0030 make_agent_string 0031 time_as_string 0032 0033 read_data_file 0034 check_page 0035 eval_response_code 0036 0037 open_log_file 0038 print_log 0039 close_log_file 0040 0041 send_email_response 0042 ) ; 0043 #grouping by function 0044 0045 # ######## ######### ######### ######### ######### ######### 0046 my %md5 ; 0047 my %nils ; 0048 my %lastmod ; 0049 my %opt ; 0050 my $system ; 0051 my $LOGFILE ; 0052 0053 0054 # ######## ######### ######### ######### ######### ######### 0055 # 0056 # Use more standard 0057 # -A => ALL -> Ignores time and runs all elements 0058 # -d => DEBUG -> Prints all sorts of output to STDOUT 0059 # -f => FORCE -> Force a change 0060 # -r => RESET -> Resets changed data 0061 # -i => INITIALIZE -> Sets initial variables, doesn't run responses 0062 # -n => NO LOG -> Doesn't print to log file 0063 0064 # CUT OPTIONS 0065 # -s => SOURCE -> Names Spy Resource file 0066 # -l => LOGFILE -> Names Logfile 0067 0068 my $logfile = "" ; 0069 my $source = "" ; 0070 my $debug = "" ; 0071 my $reset = "" ; 0072 my $force = "" ; 0073 my $all = "" ; 0074 my $nolog = "" ; 0075 my $initial = "" ; 0076 0077 GetOptions( 0078 'logfile=s' => \$logfile , 0079 'source=s' => \$source , 0080 'debug' => \$debug , 0081 'reset' => \$reset , 0082 'force' => \$force , 0083 'all' => \$all , 0084 'nolog' => \$nolog , 0085 'initial' => \$initial , 0086 ); 0087 0088 if ( $logfile !~ /\w/ ) { $logfile = q(.spylog) ; } 0089 if ( $source !~ /\w/ ) { $source = q(.spyrc) ; } 0090 0091 0092 0093 # ######## ######### ######### ######### ######### ######### 0094 # The Work Starts 0095 0096 chdir $ENV{'HOME'} ; 0097 open_log_file( $logfile ) ; 0098 0099 my @hits = read_data_file() ; 0100 my $agent = make_agent_string() ; 0101 0102 my $spy = new LWP::UserAgent ; # Pull Agentmaking to main 0103 $spy->agent($agent) ; 0104 $spy->timeout(10) ; 0105 0106 dbmopen %md5 , '.Spydata/md5' , 0644 or die "Can't open md5 $!\n"; 0107 dbmopen %lastmod , '.Spydata/lastmod' , 0644 or die "Can't open lastmod $!\n"; 0108 dbmopen %nils , '.Spydata/nils' , 0644 or die "Can't open nilsimsa $!\n"; 0109 0110 print_log q(Starting) ; 0111 0112 for my $hit ( @hits ) { 0113 0114 next if $hit !~ /\w/ ; 0115 $hit =~ s/^\s//o ; 0116 print_log q(------) ; 0117 print_log qq($hit) ; 0118 0119 my ( $url , undef , $top , $bottom , $action , $target ) 0120 = split /\s+/ , $hit ; 0121 print_log qq(\tURL:\t$url) ; 0122 print_log qq(\tAction:\t$action) ; 0123 print_log qq(\tTarget:\t$target) ; 0124 0125 my $response = check_page $url , $action , $target , $top , $bottom ; 0126 if ( $force ) { $response++ } 0127 if ( $response ) { print_log qq(\t$url changed) ; } 0128 else { print_log qq(\t$url unchanged) ; } 0129 0130 if ( ( $response ) && ( !$initial ) ) { 0131 #If there's a change 0132 #and if there's an action 0133 #and we're not initializing 0134 0135 { 0136 my $time = time ; 0137 $lastmod{$url} = $time ; # Store the last modification 0138 print_log q(+=+=+=+=+=+=+=+=+) ; 0139 print_log $url ; 0140 print_log $time ; 0141 print_log $lastmod{$url} ; 0142 print_log ; 0143 } 0144 0145 if ( $action ) { 0146 my $kid = fork ; 0147 if ( !defined $kid ){ print_log qq(fork fail: $!\n) ; } 0148 if ( $kid ) { 0149 print_log qq(THIS IS THE FORK) ; 0150 print_log join " " , qx(pwd) , $action , $target , $url ; 0151 print_log qx( .spy_tools/$action -t "$target" -u "$url" ) ; 0152 print_log qq(THIS IS THE FORK) ; 0153 kill(TERM => $kid ) ; 0154 } 0155 } 0156 } 0157 0158 print_log q(------) ; 0159 print_log q() ; 0160 } 0161 0162 print_log q(Finishing) ; 0163 dbmclose %md5 ; 0164 dbmclose %lastmod ; 0165 dbmclose %nils ; 0166 close_log_file() ; 0167 0168 exit 0 ; 0169 0170 # ######## ######### ######### ######### ######### ######### 0171 0172 0173 0174 0175 0176 0177 0178 # ######## ######### ######### ######### ######### ######### 0179 # The Subroutines 0180 # ######## ######### ######### ######### ######### ######### 0181 0182 0183 0184 0185 # ######## ######### ######### ######### ######### ######### 0186 # 0187 # Read Data File 0188 # 0189 # This program parses the configuration file and pushes 0190 # the output in an array, which should be time-delimited 0191 # (every hour). 0192 # 0193 # ######## ######### ######### ######### ######### ######### 0194 sub read_data_file { 0195 0196 # We use a hash instead of an array for return_vals because 0197 # this way, we can guard against multiple instances of the 0198 # same search during a submit all run. By returning the keys, 0199 # we get a list of non-replicated elements. 0200 0201 if ( $debug ) { print_log q(open read_data_file) ; } 0202 0203 my $hour = ( localtime time )[2] ; 0204 my $home = $ENV{'HOME'} ; 0205 my $resource = $source ; 0206 my %return_vals ; 0207 0208 if ( $debug ) { print_log qq(open $resource) } 0209 0210 open my $RESOURCE , q{<} , "$resource" or print_log qq(Can't open $resource: $! \n) ; 0211 while (<$RESOURCE>) { 0212 chomp ; 0213 my $line = ( split /#/ )[0] ; 0214 $line =~ s/^\s+//o ; 0215 next unless $line =~ /\w/ ; 0216 next unless length $line > 10 ; 0217 my @line = split /\s+/ , $line ; 0218 my $time = $line[1] ; 0219 $line[1] = q(TIME) ; 0220 my $newline = join ' ' , @line ; 0221 #we reattach to make the search time-neutral from here on 0222 0223 next if ( ( $time ne 'A' ) && ( $time != $hour ) && ( !$all ) ) ; 0224 #there's no good way to wildcard it 0225 0226 $return_vals{$line}++ ; 0227 if ( $debug ) { print_log qq(\t $time) ; } 0228 if ( $debug ) { print_log qq(\t $line) ; } 0229 } 0230 close $RESOURCE ; 0231 0232 if ( $debug ) { print_log q(close read_data file) ; } 0233 return ( sort keys %return_vals ) ; 0234 } 0235 # ######## ######### ######### ######### ######### ######### 0236 0237 0238 0239 0240 # ######## ######### ######### ######### ######### ######### 0241 # 0242 # Check Page 0243 # 0244 # This program checks if the page in question has changed 0245 # and that's it. We should work on linking the response 0246 # and the URL, so that we're only checking whether this 0247 # 0248 # 0249 # ######## ######### ######### ######### ######### ######### 0250 sub check_page { 0251 0252 if ( $debug ) { print_log q(check_page) ; } 0253 0254 my $bottom_toggle = 0 ; 0255 my $url = shift ; 0256 my $action = shift ; 0257 my $target = shift ; 0258 my $top = shift ; 0259 my $bot = shift ; 0260 my $check = qq($url\t$action\t$target) ; 0261 0262 # $url is used by LWP, but $action and $target are only 0263 # used to make a unique spying incident, so we can watch 0264 # the same URL for different email addresses, or watch the 0265 # same URL with different responses. 0266 0267 my $request = new HTTP::Request('GET',$url) ; 0268 my $response = $spy->request($request); 0269 my $output = $response->content ; 0270 $output =~ s/\r\n/\n/g ; 0271 $output =~ s/\r/\n/g ; 0272 # We come up with a unified definition of newline so we can do 0273 # a certain number of lines. 0274 0275 my @output = split /\n/ , $output ; 0276 if ( $bot < 0 ) { $bottom_toggle = -1 ; $bot = abs $bot ; } 0277 elsif ( $bot > 0 ) { $bottom_toggle = 1 ; } 0278 0279 for ( 0 .. $top ) { 0280 shift @output ; 0281 } 0282 if ( $bottom_toggle == 1 ) { 0283 $#output = $bot ; 0284 } 0285 elsif ( $bottom_toggle == -1 ) { 0286 for ( 0 .. $bot ) { 0287 pop @output ; 0288 } 0289 } 0290 0291 # We take the output file, of the size we specify, and we take an 0292 # MD5 hash of it, to compare with an existing hash. We convert the 0293 # hash to a big-endian hexidecimal number because we want it to be 0294 # human-readable for debugging purposes and hexidecimal numbers are 0295 # shorter. We then compare the page hash with an earlier page hash. 0296 # MD5 hashes are great for this, because any change in the number 0297 # means the file has been changed. 0298 0299 my $key = $check . "\t" . time ; 0300 $output = join "\n" , @output ; 0301 my $nilsimsa = new Digest::Nilsimsa ; 0302 my $digest = $nilsimsa->text2digest($output) ; 0303 $output = md5_hex $output ; 0304 $nils{$key} = $digest ; #log it each time, every time 0305 0306 print_log qq(New: $output) ; 0307 print_log qq(Old: $md5{$check}) ; 0308 print_log q(-----) ; 0309 0310 if ( $debug ) { print_log q(check_page) ; } 0311 0312 if ( ( $md5{$check} ne $output ) || ( $initial ) ) { 0313 $md5{$check} = $output ; # Store the new Hash 0314 # relog every time there's a change 0315 #$lastmod{$key}++ ; # Store the last modification 0316 #maybe put this outside? 0317 return 1 ; # changed 0318 } 0319 return 0 ; # unchanged 0320 } 0321 # ######## ######### ######### ######### ######### ######### 0322 0323 0324 0325 0326 # ######## ######### ######### ######### ######### ######### 0327 # 0328 # Make Agent String 0329 # 0330 # I need to make this read from a SPY_AGENT variable or 0331 # somethingt. 0332 # 0333 # ######## ######### ######### ######### ######### ######### 0334 sub make_agent_string { 0335 # make this configurable 0336 $system = 'Spy-zilla/0.1' ; 0337 0338 return $system ; 0339 } 0340 # ######## ######### ######### ######### ######### ######### 0341 0342 0343 0344 0345 # ######## ######### ######### ######### ######### ######### 0346 # 0347 # Send Email Response 0348 # 0349 # I need a better, scriptable(?) response text here 0350 # 0351 # ######## ######### ######### ######### ######### ######### 0352 sub send_email_response { 0353 # It would be nice to have some agnosticism here. I'll likely be 0354 # happy with everything else before I go into this, though 0355 0356 if ( $debug ) { print_log q(send_email_resposne) ; } 0357 my $date = time_as_string() ; 0358 my $url = shift ; 0359 my $to = shift ; 0360 0361 print_log qq(\tChanged URL:\t$url) ; 0362 print_log qq(\tRecipient:\t$to) ; 0363 0364 open my $MAIL , q{|} , "$sendmail" or print_log qq(Can't Open Sendmail: $!) ; 0365 print {$MAIL} qq(To: $to\n) ; 0366 print {$MAIL} qq(From: $to\n) ; # Change to a spy address 0367 print {$MAIL} qq(Subject: Page Spy - $url\n) ; 0368 print {$MAIL} qq(\n) ; 0369 print {$MAIL} <<'EOF' ; 0370 The url $url changed before $date. 0371 0372 -- 0373 Your Web Spy, $agent 0374 0375 EOF 0376 0377 close $MAIL or print_log qq(Can't close Sendmail: $!) ; 0378 if ( $debug ) { print_log q(send_email_response) ; } 0379 return 1 ; 0380 } 0381 # ######## ######### ######### ######### ######### ######### 0382 0383 0384 0385 0386 # ######## ######### ######### ######### ######### ######### 0387 # 0388 # Open Log File 0389 # 0390 # Start logging, handling sending errors to console 0391 # 0392 # ######## ######### ######### ######### ######### ######### 0393 sub open_log_file { 0394 0395 return 0 if $nolog ; 0396 my $logfile = shift ; 0397 open $LOGFILE , q{>} , qq{$logfile} ; 0398 0399 print_log q(#Spy Log: ) . time_as_string() ; 0400 if ( $logfile ) { print_log qq(Logfile:\t$logfile) ; } 0401 if ( $source ) { print_log qq(Source File:\t$source) ; } 0402 if ( $reset ) { print_log q(reset) ; } 0403 if ( $force ) { print_log q(force) ; } 0404 if ( $all ) { print_log q(all) ; } 0405 if ( $nolog ) { print_log q(nolog) ; } 0406 if ( $initial ) { print_log q(initial) ; } 0407 if ( $debug ) { print_log q(debug) ; } 0408 print_log q() ; 0409 0410 return 0 ; 0411 } 0412 # ######## ######### ######### ######### ######### ######### 0413 0414 0415 0416 0417 # ######## ######### ######### ######### ######### ######### 0418 # 0419 # Close Log File 0420 # 0421 # Close Log File 0422 # 0423 # ######## ######### ######### ######### ######### ######### 0424 sub close_log_file { 0425 return close $LOGFILE ; 0426 } 0427 # ######## ######### ######### ######### ######### ######### 0428 0429 0430 0431 # ######## ######### ######### ######### ######### ######### 0432 # 0433 # Print Log 0434 # 0435 # Handle debug, logging 0436 # 0437 # ######## ######### ######### ######### ######### ######### 0438 sub print_log { 0439 return 0 if $nolog ; 0440 my @content = @_ ; 0441 for my $line ( @content ) { 0442 print {$LOGFILE} qq($line\n) ; 0443 if ( $debug ) { print qq($line\n) ; } 0444 } 0445 return 0 ; 0446 } 0447 # ######## ######### ######### ######### ######### ######### 0448 0449 0450 0451 # ######## ######### ######### ######### ######### ######### 0452 # 0453 # Time as String 0454 # 0455 # Returns time as a localtime string 0456 # 0457 # ######## ######### ######### ######### ######### ######### 0458 sub time_as_string { 0459 my $time = shift || time ; 0460 my @months = qw( Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec ) ; 0461 my @days = qw( Sun Mon Tue Wed Thu Fri Sat ) ; 0462 0463 my ( $seconds , 0464 $minute , 0465 $hour , 0466 $mday , 0467 $mon , 0468 $year , 0469 $wday , 0470 $yday , 0471 $isdst ) = localtime $time ; 0472 0473 my $am ; 0474 $seconds = sprintf '%02d' , $seconds ; 0475 $minute = sprintf '%02d' , $minute ; 0476 if ( $hour < 1 or $hour > 12 ) { $am = q(PM) } 0477 else { $am = q(AM) ; } 0478 $hour = $hour % 12 ; 0479 if ( $hour == 0 ) { $hour = 12 ; } 0480 $mon = $months[$mon] ; 0481 $wday = $days[$wday] ; 0482 $year = $year + 1900 ; 0483 0484 return qq($wday, $hour:$minute:$seconds $am, $mon $mday, $year) ; 0485 } 0486 # ######## ######### ######### ######### ######### ######### 0487 0488 0489 0490 # ######## ######### ######### ######### ######### ######### 0491 # 0492 # Eval Response Code 0493 # 0494 # xxxxxxx xxxxxxxxxxx xxx x xxxxxxxxx xxxx 0495 # 0496 # ######## ######### ######### ######### ######### ######### 0497 sub eval_response_code { 0498 print_log q(eval_response_code) ; 0499 my $url = shift ; 0500 my $script = shift ; 0501 my $content ; 0502 0503 open my $FILE , q{<},qq{$script} ; 0504 print_log q(**********) ; 0505 while (<$FILE>) { 0506 my $line = $_ ; 0507 $content .= $line ; 0508 print_log $line ; 0509 } 0510 print_log q(**********) ; 0511 close $FILE ; 0512 0513 print_log q(Starting Eval) ; 0514 eval $content ; 0515 #LET'S SWITCH TO QX{}ing an outside program 0516 print_log q(Ending Eval) ; 0517 if ( $@ ) { print_log qq(\tWARNING:\t$@\n\n) ; } 0518 send_email_response ( 'http://csociety.ecn.purdue.edu/~jacoby/spy.html' , 'jacoby@csociety.org' ) ; 0519 print_log q(eval_response_code) ; 0520 return ; 0521 } 0522 # ######## ######### ######### ######### ######### #########