#! /usr/bin/perl 
# This script generates an XML RSS feed for the NZZ site, which currently has none.
# On every request the NZZ site is searched for new articles; a feature is the option
# to choose the subjects of the subscription.
# 2004-04-21 Felipe Wettstein
# 2005-06-19 Felipe Wettstein: select the correct image and title2, and cache the parsed
#            index files (stored as XML, used only when up to date)

# THERE IS ABSOLUTELY NO ERROR HANDLING!!!


use  LWP::Simple;
use  strict;
use  warnings;
use  HTML::Entities  qw(:DEFAULT encode_entities_numeric);
use  Time::Local;
use  CGI;


# INITIALIZE
############
# new CGI object
my $cgi_eval = CGI->new;

# site that is searched
my  $site  =  "http://www.nzz.ch";

# name of index page
my  $index  ="index.html";

# choose script home
#my $host = "maria";
my  $host  =  "gromski";

my  ($file_cache,  $html_home,  $css);



if  ($host  eq  "maria")  {
     # physical path to cache, might be a htdocs folder
     $file_cache  =  "../Documents/rss_cache";  # maria
     $html_home  =  "http://maria.local/nzzrss.html";  # home of html page to get subscribed
     $css  =  "/gromski.css";
}  else  {
     $file_cache  ="../gromski/rss_cache/";  # gromski.ch
     $html_home  =  "http://gromski.ch/nzzrss.html";  # home of html page to get subscribed
     $css  =  "/gromski.css";
}

# To install, make sure the script has write access to the files in $file_cache.
# It might be necessary to first create all possible cache files (touch al.txt, touch wi.txt, ...)
# and give them the correct owner and write privileges.
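# A sketch of that setup, assuming the gromski cache path above and a web server
# user of www-data (both are assumptions, adjust to your host); the keys match %rubrik below:
#
#   mkdir -p ../gromski/rss_cache
#   for key in kommentar al wi bm il zh sp fe vm wetter ft to em li zf; do
#       touch ../gromski/rss_cache/$key.txt
#   done
#   chown www-data:www-data ../gromski/rss_cache/*.txt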

# German month names (as they appear in the HTML) in a hash (key = month name, value = number 0-11)
my  %month;
my  $counter=0;
foreach  (qw (Januar Februar März April Mai Juni Juli August September Oktober November Dezember))  {
     $month{$_}=$counter  ;
     $counter  ++;
}
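# quick sanity check of the mapping above: $month{"Januar"} == 0, $month{"April"} == 3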

# keys for subject: key = directory name, value = meaning
my  %rubrik  =  (
         'kommentar'=>'Kommentar',
         'al'=>'International',
         'wi'=>'Wirtschaft',
         'bm'=>'Börsen - Märkte',
         'il'=>'Inland',
         'zh'=>'Zürich',
         'sp'=>'Sport',
         'fe'=>'Feuilleton',
         'vm'=>'Vermischtes',
         'wetter'=>'Wetter',
         'ft'=>'Forschung - Technik',
         'to'=>'Tourismus',
         'em'=>'Medien - Informatik',
         'li'=>'Literatur - Kunst',
         'zf'=>'Zeitfragen');

# file containing the xml header template (used by write_xml_header below)
my  $head_xml_file  =  "head_xml.txt";
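# The template file is not part of this script. A minimal sketch of what head_xml.txt
# is assumed to look like; write_xml_header() substitutes the two placeholders
# --feed_titel-- and --heute--, the channel details and RSS version are hypothetical:
#
#   <?xml version="1.0" encoding="iso-8859-1"?>
#   <rss version="2.0" xmlns:dc="http://purl.org/dc/elements/1.1/">
#   <channel>
#     <title>--feed_titel--</title>
#     <link>http://www.nzz.ch</link>
#     <description>--feed_titel--</description>
#     <lastBuildDate>--heute--</lastBuildDate>
#
# (the <channel> element presumably stays open here; the <item>s from the cache are printed after it)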


# ANALYSE REQUEST
#################
if  ($cgi_eval->param("evaluate"))  {
     # called from a web browser; answer to the browser in html
     answer_browser  ();
}  elsif  ($cgi_eval->param("clear_cache"))  {
     # clear the cache, i.e. give every cache file a very old timestamp
     clear_cache  ();
}  else  {
     # called from an rss newsreader requesting information; answer to the reader in xml
     answer_reader  ();
}
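# Example requests (the script name nzzrss.pl is hypothetical, the parameter names are
# the ones checked above and the rubrik keys come from %rubrik):
#   nzzrss.pl?rubrik=il&rubrik=vm      ->  rss feed with the subjects Inland and Vermischtes
#   nzzrss.pl?rubrik=sp&evaluate=1     ->  html page showing the subscription url for Sport
#   nzzrss.pl?clear_cache=1            ->  reset the timestamps of all cache files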

exit  0;

# SUBS
######

# html page containing url for newsreader
sub  answer_browser  {

     # get url with query string
     my  $query_string  =  $cgi_eval->url(-query=>1);
     # chop 'submit button' from url
     $query_string  =~  s/evaluate.*$//;


     # start html
     print  "Content-Type: text/html; charset=iso-8859-1

<!DOCTYPE html PUBLIC '-//W3C//DTD HTML 4.01 Transitional//EN'>
<html>
<head>
    <LINK REL='stylesheet' TYPE='text/css' HREF=\"".$css."\">
    <meta http-equiv='content-type' content='text/html; charset=ISO-8859-1'>
    <title>** Alles Anmalen * Kopieren * Neuen Feed Anmelden **</title>
</head>

<body>
    <h1>
        <a href=\"".$query_string."\">".$query_string."</a>
    </h1>
</body>
</html>\n";
}  # end answer_browser


# construct xml page containing all the rss information
sub  answer_reader  {

     # get directories that will be searched; if none is given, fall back to the default selection below
     my  @directories=  $cgi_eval->param("rubrik");
     
     if  (!@directories)  {
         #@directories =  keys %rubrik;
         @directories  =   qw (il vm);
     }
     
     # feed title from selected directories
     my  $feed_title  =  "NZZ: ".join  (", ",@rubrik{(@directories)});  # i am learning:)

     # header 
     write_xml_header($feed_title);

     # get all index files from all selected directories, and parse
     foreach  my  $dir  (@directories)  {

         # get timestamp of index file
          my  $timestamp  =  (head("$site/$dir/$index"))[2];  # [2] = last-modified time; see LWP::Simple::head
#       print "\n****\nindex file timestamp: ".localtime($timestamp)."\n";

         # compose path to cache_file
         my  $cache_filename  =  "$file_cache/$dir.txt";

          # check whether an up-to-date cache file exists
         if  (!((stat($cache_filename))[9]==  $timestamp))  {
              # cache file does not exist or its timestamp differs from the index file's
              # => fetch the index file, rebuild the cache and use it to compose the xml feed

             # get index file
             my  $doc  =  get("$site/$dir/$index");

             # remove leading crap from $doc
             $doc  =~  s/^.+?\<span\s+class=\"A4\"\>(.*)/$1/is;  # '?' in '.+?' makes the match non-greedy: match minimum
             # remove trailing crap from $doc
             $doc  =~  s/^(.+\<span\s+class=\"B4\"\>.+\<\/span\>).+/$1/is;  # the useful text ends with the last <span class="B4"> ... </span>

             # put xml lines of the index file in @cache; $block_ref is a reference to a string holding one information block
             my  (@cache,$block_ref);

             while  ($doc)  {
             # parse $doc $block by $block and put xml in @cache

                 $block_ref  =  get_next_block  (\$doc);

                 # write_xml builds the xml of one item from a bunch of variables; the result is pushed onto @cache
                 # variables: $rubrik, $date, $time, $publication, $link, $title, $title2, $image
                 push  (@cache,write_xml  ($rubrik{$dir},parse_block  ($block_ref)));
             }  # end while $doc ..

             # save cache
             open  (CACHE,  ">$cache_filename")  or  print  "could not open cache $cache_filename\n";
                 print  CACHE  "<!-- '$rubrik{$dir}' cached at ".localtime(time())." -->\n";
                 print  CACHE  "<!-- with timestamp ".localtime($timestamp)." -->\n";
                 print  CACHE  @cache;
             close  CACHE;

             # put index file timestamp on cache file
             utime  $timestamp,  $timestamp,  $cache_filename;  # set the cache file's timestamp to the index file's timestamp

         }  # end if (no actual cache)
         else  {  # xml is from cache
             print  "<!-- from cache, last update ".localtime($timestamp)." -->\n";
         }

         # print xml cache files to stdout
         open  (CACHE,  "$cache_filename");
             print  STDOUT  <CACHE>;
         close  CACHE;

     }  # end foreach $dir (@directories)
     print  "\n\n";
}  # end answer_reader

# extract next block to parse: a block is delimited by the current <span class="A4"> tag and the same tag of the next entry
# the leading <span class="A4"> has already been removed
sub  get_next_block  {

     # expects reference to the remaining doc
     # returns reference to the current block; $doc has already been shortened by that block
     
     my  $doc_ref  =  shift;
     my  $block;

     # first entry starts with: <span class="A4"> (the date)
     if  ($$doc_ref  =~  s/^(.+?)\<span\s+class=\"A4\"\>(.*)/$2/is)  {  # '?' in '.+?' makes the match non-greedy: match minimum
         $block  =  $1;
     }  else  {
         $block  =  $$doc_ref;
         $$doc_ref  =  "0";
     }  # end if/else

     # return reference to the current block
     \$block;

}  # end sub get_next_block

# parse information of one information-block out of $doc
sub  parse_block  {

     # expects a reference to an html block ($block_ref) that is parsed into a bunch of variables holding the information for one item
     # returns a bunch of variables: ($date, $time, $publication, $link, $title, $title2, $image)

     my  $block_ref  =  shift  @_;
     my  ($date,  $time,  $publication,  $link,  $title,  $title2,  $image);

     # get date and publication type
     ($date,$publication)  =  get_date  ($block_ref);
#   print "\n\n***************\n\$date: $date\n\$publication: $publication\n";

     # get link, title1 and title2
     ($link,  $title,  $title2)  =  get_link_and_title  ($block_ref);
#   print "\$link: $link\n\$title: $title\n\$title2: $title2\n";

     # get image location
     ($image)  =  get_image  ($block_ref);
#   print "\$image: $image\n";

     return  ($date,  $time,  $publication,  $link,  $title,  $title2,  $image);
}  # end sub parse_block

# extract date from html block
sub  get_date  {

     # expects reference to the current block, starting with the date
     # returns date and publication
     
     my  $block_ref  =  shift;
     my  ($date,$publication);
     # DATE              1day       2month        3year     4hour  5min   6publication
     # parse this string: 25. April 2005, 10:54, NZZ Online
     if ($$block_ref =~ /^(\d+?)\.\s*(\w+?)\&nbsp\;(\d+)\,\s*(\d+)\:(\d+)\,(.+?)\<br\>/) {
#         print "\n\n#####\$1: $1, \$2: $2, \$3: $3, \$4: $4, \$5: $5, \$6: $6\n";
          $date = make_RFC_822_date (timelocal(0,$5,$4,$1,$month{$2},$3));
          $publication = encode_entities_numeric(decode_entities($6)) || "nopublication";
     } else {
          # parse this string: 25. April 2005, Neue Zürcher Zeitung
          # DATE              1day       2month        3year  4publication
          $$block_ref =~ /^(\d+?)\.\s*(\w+?)\&nbsp\;(\d+)\,(.+?)\<br\>/;
          $date = make_RFC_822_date (timelocal(0,0,0,$1,$month{$2},$3));
          $publication = encode_entities_numeric(decode_entities($4)) || "nopublication";
     }

     # return date (RFC 822 compliant) and publication type
     ($date, $publication);
}  # end sub get_date


# extract link to article and titles from html block
sub  get_link_and_title  {

     # expects reference to the current block
     # returns link, title1 and title2

     my $block_ref = shift;
     my ($link,$title,$title2);
     my $title3;   # is later concatenated to $title2

     # LINK and TITLE1                    1link        2title
     $$block_ref =~ /\<a\s+href\s*\=\s*\"(.+?)\"\s*\>(.+?)\<\//i;
     $link = $1 || "nolink";
     $title = $2 || "notitle";
     $title =~ s/\&nbsp;/ /ig;   # replace non-breaking space with a normal space
     $title =~ s/\s*$//;         # remove trailing whitespace
     $title = encode_entities_numeric(decode_entities($title)) || "notitle";
     $link =~ s/\.\.(.+)/$site$1/;

     # title2
     if (($$block_ref =~ /\<span\s+class\s*\=\s*\"B2\"\s*>(.+?)<\/span>/i)
         || ($$block_ref =~ /\<span\s+class\s*\=\s*\"newzz-ressort-headline-maintitle\"\s*>(.+?)<\/span>/i)) {
          $title2 = $1;
          $title2 =~ s/\<a\s*.*?\>//i;   # remove possible links (i.e. when block == kommentar, second chance in || )
          $title2 =~ s/\&nbsp;/ /ig;     # replace non-breaking space with a normal space
          $title2 =~ s/\s*$//;           # remove trailing whitespace
          $title2 = encode_entities_numeric(decode_entities("<b>$title2.</b> ")) || "notitle2";

          # title3
          if ($$block_ref =~ /<span\s+class\s*\=\s*\"B3\"\s*>(.+?)<\/span>/is) {
               $title3 = $1;
               $title3 =~ s/\&nbsp;/ /ig;       # replace non-breaking space with a normal space
               $title3 =~ s/\s*$//;             # remove trailing whitespace
               $title3 =~ s/\<a.+?\<\/a\>$//;   # remove trailing link
               $title3 = encode_entities_numeric(decode_entities($title3)) || "notitle3";
          } else {
               $title3 = "";
          }
          $title2 = $title2.$title3;
     } else {
          # there are small titles, especially old ones and those from NZZ print
          if ($$block_ref =~ /\<span\s+class\s*\=\s*\"B4\"\s*>(.+?)<\/span>/i) {
               $title2 = $1;
               $title2 =~ s/\&nbsp;/ /ig;             # replace non-breaking space with a normal space
               $title2 =~ s/\s*\<a.+?\<\/a\>\s*$//;   # remove trailing link and whitespace
               $title2 = encode_entities_numeric(decode_entities("<b>$title2.</b>")) || "notitle2";
          } else {
               $title2 = "";
          }
     }

     # return values
     ($link, $title, $title2);
}  # end sub get_link_and_title

# extract image location from html block
sub  get_image  {

     # expects reference to the current block
     # returns image location

     my $block_ref = shift;
     my ($image);

     if ($$block_ref =~ /<img\s+alt=\"\"\s+height=\"100\"\s+src="(.+?)\"\s+width=\"100"\s*\>/i) {
          $image = $site.$1;
          $image = encode_entities("<img src=\"$image\" height=\"100\" width=\"100\" align=\"left\" hspace=\"5\">");
     } else {
          $image = "";
     }

     # return values
     $image;
}  # end sub get_image

# write header of xml file
sub  write_xml_header  {

     my $titel = shift;

     # current local time for the xml header
     my $date = make_RFC_822_date ();

     print "Content-type: text/xml\n\n";

     open (TEMP, "$head_xml_file") || die "Can't open file $head_xml_file\n";
          my $temp = join("",<TEMP>);
          $temp =~ s/--heute--/$date/s;
          $temp =~ s/--feed_titel--/$titel/s;
          print $temp;
     close (TEMP);
}  # end sub write_xml_header

# put values in xml
sub  write_xml  {

     # expects a bunch of variables
     # returns the xml lines of one item

     my ($rubrik, $date, $time, $publication, $link, $title, $title2, $image) = @_;

     return "<item>
<title>$title</title>
<description>$image $title2\n<p>$publication: $rubrik </description>
<link>$link</link>
<dc:subject>$rubrik</dc:subject>
<dc:creator>gromski feed</dc:creator>
<pubDate>$date</pubDate>
</item>\n\n";
}  # end sub write_xml

sub  make_RFC_822_date  {

     # gets a unix time; if no argument is given, uses the current time
     # returns an RFC 822 compatible date

     # lists for the RFC 822 compatible date
     my @Mon = qw (Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec);
     my @Wda = qw (Sun Mon Tue Wed Thu Fri Sat);

     my ($sec,$min,$hour,$mday,$mon,$year,$wday) = localtime(shift||time());   # use the argument or the current time

     if ($hour>3) { $hour = $hour-2;}   # nzz is located in switzerland: two hours ahead of gmt

     $year += 1900;
     $wday = $Wda[$wday];
     $mon = $Mon[$mon];

     return sprintf("%s, %02d %s %04d %02d:%02d:%02d GMT",$wday, $mday, $mon, $year, $hour, $min, 0);
}  # end sub make_RFC_822_date

# clear entire cache == put an old timestamp on every cache file
sub  clear_cache  {

     # set the timestamp of the whole cache back to the beginnings of life
     my $timestamp = timelocal(0,10,0,9,$month{"September"},1969);

     print "Content-Type: text/html; charset=iso-8859-1\n\n";
     print "<!DOCTYPE html PUBLIC '-//W3C//DTD HTML 4.01 Transitional//EN'>\n<html>\n<head>\n\t<LINK REL='stylesheet' TYPE='text/css' HREF='$css'>\n\t<meta http-equiv='content-type' content='text/html; charset=ISO-8859-1'>\n<title>Cache geleert</title>\n</head>\n<body>\n\n";
     print "<h1><a href=\"$html_home\">Timestamp ist jetzt überall: ".localtime($timestamp)."</a></h1>\n";
     print "</body>\n</html>\n";

     foreach (keys %rubrik) {
          utime $timestamp, $timestamp, "$file_cache/$_.txt";
     }
}  # end sub clear_cache