#! /usr/bin/perl -w

# this script shall be run regularly:
#   it queries the 'entrez' nucleotide database for a defined string (like 16S)
#   and appends the number of items to a text file; errors are sent by email to the maintainer

# 2004-02-18 Felipe Wettstein


use  LWP::Simple;
use  strict;
use  warnings;
use  Mail::Mailer;

# Maintainer's e-mail address; failure notifications are sent here.
my $maintainer = 'maps@smpa.com';

# Base address that is queried; the search term is appended to it.
my $adress = 'http://www.ncbi.nlm.nih.gov:80/entrez/query.fcgi?cmd=Search&db=nucleotide&term=';

# Search term submitted to the nucleotide database.
my $query_string = '((16s[Title] AND rRNA[Title]) OR (16s[Title] AND ribosomal[Title] AND RNA[Title]))';

# Results are appended to this file. A full path may be safer, since the
# working directory depends on how the script is started:
# my $data = '/home/felipe/cgi-bin/entrez_daten.txt';
my $data = 'entrez_daten.txt';

# Every whitespace character in the query must be replaced with '+'.
# NOTE(review): brackets and parentheses are sent unescaped; proper URL
# escaping would be more robust -- confirm the server still accepts them.
$query_string =~ s/\s/+/g;

# Retrieve the whole result page. get() yields undef when the request fails,
# so $doc has to be checked for definedness before it is matched.
my $doc = get($adress . $query_string);
# for offline tests: my $doc = "Items 1-20 of 134530</div>";
# The line of interest looks like
#   Items 1-20 of 134530</div>
# (the closing </div is required by the pattern below).

my $time = localtime();

if (defined $doc && $doc =~ m{Items\s\d+-\d+\s+of\s(\d+)\s*</div}) {

    # Copy the capture right away so that no later regex can clobber it.
    my $item_count = $1;
    append_record($data, $time, $item_count);
}
else {    # page not retrieved, or the expected string was not found

    # Log the failure first: if sending the mail dies, the data file still
    # records that this run failed.
    append_record($data, $time, 'error');

    my $body = "It seams as if today at ($time), no correct string could be retriefed from ($adress).\n";
    send_error_mail($maintainer, $body);
}
exit 0;

# Append one line of the form "<timestamp>\t<value>\n" to $file.
#   $file      - path of the data file (created if it does not exist)
#   $timestamp - time string written in the first column
#   $value     - item count, or the literal 'error', written in the second column
# Dies with the system error text if the file cannot be opened, written or
# closed (buffered write errors may only show up at close).
sub append_record {
    my ($file, $timestamp, $value) = @_;

    open(my $fh, '>>', $file)
        or die("cannot open file '$file': $!");
    print {$fh} "$timestamp\t$value\n"
        or die("cannot write to file '$file': $!");
    close($fh)
        or die("cannot close file '$file': $!");

    return;
}

# Send the failure notification through the local sendmail program.
#   $recipient - address placed in the To: header
#   $body      - complete message text
sub send_error_mail {
    my ($recipient, $body) = @_;

    my %headers = (
        'To'      => $recipient,
        'Subject' => 'Cannot reach server',
    );

    my $mailer = Mail::Mailer->new('sendmail');
    $mailer->open(\%headers);
    print {$mailer} $body;
    $mailer->close;

    return;
}