#! /usr/bin/perl # dieses Programm macht einen xml rss feed aus der nzz seite. bei jedem aufruf wird die nzz-site nach neuen artikeln durchsucht # this script generates xml rss feed of nzz, that currently has none. a feature is the option to chose the subjects of subscription # 2004-04-21 Felipe Wettstein # 2005-06-19 Felipe Wettstein: select correct image and title2 and cache with parsed index files stored in xml and used only when up2date # THERE IS ABSOLUTELY NO ERROR HANDLING!!! use LWP::Simple; use strict; use warnings; use HTML::Entities qw(:DEFAULT encode_entities_numeric); use Time::Local; use CGI; # INITIALIZE ############ # new CGI object my $cgi_eval = new CGI; # site that is searched my $site = "http://www.nzz.ch"; # name of index page my $index ="index.html"; # choose script home #my $host = "maria"; my $host = "gromski"; my ($file_cache, $html_home, $css); if ($host eq "maria") { # physical path to cache, might be a htdocs folder $file_cache = "../Documents/rss_cache"; # maria $html_home = "http://maria.local/nzzrss.html"; # home of html page to get subscribed $css = "/gromski.css"; } else { $file_cache ="../gromski/rss_cache/"; # gromski.ch $html_home = "http://gromski.ch/nzzrss.html"; # home of html page to get subscribed $css = "/gromski.css"; } # to install make shure to have write acces to files in $file_cache # it might be neccessary to first make all possible temp files: touch al.txt, touch wi.txt, .. and give them the correct owner and write privilges # german month names (html) in hash (key = month, value = number (0-11)) my %month; my $counter=0; foreach (qw (Januar Februar März April Mai Juni Juli August September Oktober November Dezember)) { $month{$_}=$counter ; $counter ++; } # keys for subject: key = directory name, value = meaning my %rubrik = ( 'kommentar'=>'Kommentar', 'al'=>'International', 'wi'=>'Wirtschaft', 'bm'=>'Börsen - Märkte', 'il'=>'Inland', 'zh'=>'Zürich', 'sp'=>'Sport', 'fe'=>'Feuilleton', 'vm'=>'Vermischtes', 'wetter'=>'Wetter', 'ft'=>'Forschung - Technik', 'to'=>'Tourismus', 'em'=>'Medien - Informatik', 'li'=>'Literatur - Kunst', 'zf'=>'Zeitfragen'); # file xml heading my $head_xml_file = "head_xml.txt"; # ANALYSE REQUEST ################# if ($cgi_eval->param("evaluate")) { # called from webbrowser, anwer to browser in html answer_browser (); } elsif ($cgi_eval->param("clear_cache")) { # clear cache, means, put everywhere a very old timestamp clear_cache (); } else { # called from rss-newsreader requesting information, answer to reader in xml answer_reader (); } exit 0; # SUBS ###### # html page containing url for newsreader sub answer_browser { # get url with query string my $query_string = $cgi_eval->url(-query=>1); # chop 'submit button' from url $query_string =~ s/evaluate.*$//; # start html print "Content-Type: text/html; charset=iso-8859-1 <!DOCTYPE html PUBLIC '-//W3C//DTD HTML 4.01 Transitional//EN'> <html> <head> <LINK REL='stylesheet' TYPE='text/css' HREF=\"".$css."\"> <meta http-equiv='content-type' content='text/html; charset=ISO-8859-1'> <title>** Alles Anmalen * Kopieren * Neuen Feed Anmelden **</title> </head> <body> <h1> <a href=\"".$query_string."\">".$query_string."</a> </h1> </body> </html>\n"; } # end answer_browser # construct xml page containing all the rss information sub answer_reader { # get directories that will be searched, if none is defined, get em all my @directories= $cgi_eval->param("rubrik"); if (!@directories) { #@directories = keys %rubrik; @directories = qw (il vm); } # feed title from selected directories my $feed_title = "NZZ: ".join (", ",@rubrik{(@directories)}); # i am learning:) # header write_xml_header($feed_title); # get all index files from all selected directories, and parse foreach my $dir (@directories) { # get timestamp of index file my $timestamp = (head("$site/$dir/$index"))[2]; # perldoc -f head # print "\n****\nindex file timestamp: ".localtime($timestamp)."\n"; # compose path to cache_file my $cache_filename = "$file_cache/$dir.txt"; # check if actual cache exists if (!((stat($cache_filename))[9]== $timestamp)) { # cache_file does not exist or timestamp is different from index file # => use cache file to compose xml feed # get index file my $doc = get("$site/$dir/$index"); # remove leading crap from $doc $doc =~ s/^.+?\<span\s+class=\"A4\"\>(.*)/$1/is; # '?' in '.+?' to change greediness: match minum # remove tailing crap from $doc $doc =~ s/^(.+\<span\s+class=\"B4\"\>.+\<\/span\>).+/$1/is; # the text ends with a .... tag # put xml lines of index file in @cache, $block_ref is reference to string holding one informaton block my (@cache,$block_ref); while ($doc) { # parse $doc $block by $block and put xml in @cache $block_ref = get_next_block (\$doc); # write_xml makes xml out of a bunch of variables, it then writes them to @cache # variables: $rubrik, $date, $time, $publication, $link, $title, $title2, $image push (@cache,write_xml ($rubrik{$dir},parse_block ($block_ref))); } # end while $doc .. # save cache open (CACHE, ">$cache_filename") or print "could not open cache $cache_filename\n"; print CACHE "<!-- '$rubrik{$dir}' cached at ".localtime(time())." -->\n"; print CACHE "<!-- with timestamp ".localtime($timestamp)." -->\n"; print CACHE @cache; close CACHE; # put index file timestamp on cache file utime $timestamp, $timestamp, $cache_filename; # set timestamp to index files timestamp } # end if (no actual cache) else { # xml is from cache print "<!-- from cache, last update ".localtime($timestamp)." -->\n"; } # print xml cache files to stdout open (CACHE, "$cache_filename"); print STDOUT <CACHE>; close CACHE; } # end foreach $dir (@directories) print "\n\n"; } # end answer_reader # extract next block to parse: a block is embraced by the actual "" tag and the equal tag of the next entry # the leading "" has been removed already sub get_next_block { # expexts reference to resting doc # returns reference to actual block, $doc has then be chopped already my $doc_ref = shift; my $block; # first entry starts with: <span class="A4"> (the date)) if ($$doc_ref =~ s/^(.+?)\<span\s+class=\"A4\"\>(.*)/$2/is) { # '?' in '.+?' to change greediness: match minum $block = $1; } else { $block = $$doc_ref; $$doc_ref = "0"; } # end while $doc.. # return reference to actual block \$block; } # end sub get_next_block # parse information of one information-block out of $doc sub parse_block { # expects a html block ($block_ref) that will be transferred to a bunch of variables, containing information that will be inserted in item # returns a bunch of variables: ($date, $time, $publication, $link, $title, $title2, $image) my $block_ref = shift @_; my ($date, $time, $publication, $link, $title, $title2, $image); # get date and publication type ($date,$publication) = get_date ($block_ref); # print "\n\n***************\n\$date: $date\n\$publication: $publication\n"; # get link, title1 and title2 ($link, $title, $title2) = get_link_and_title ($block_ref); # print "\$link: $link\n\$title: $title\n\$title2: $title2\n"; # get image location ($image) = get_image ($block_ref); # print "\$image: $image\n"; return ($date, $time, $publication, $link, $title, $title2, $image); } # end sub parse_block # extract date from html block sub get_date { # expect reference to acutal block, starting with date # return date and publication my $block_ref = shift; my ($date,$publication); # DATE 1day 2month 3year 4hour 5min 6publication # parse this string: 25. April 2005, 10:54, NZZ Online
if ($$block_ref =~ /^(\d+?)\.\s*(\w+?)\ \;(\d+)\,\s*(\d+)\:(\d+)\,(.+?)\<br\>/) { # print "\n\n#####\$1: $1, \$2: $2, \$3: $3, \$4: $4, \$5: $5, \$6: $6\n"; $date = make_RFC_822_date (timelocal(0,$5,$4,$1,$month{$2},$3)); $publication = encode_entities_numeric(decode_entities($6)) || "nopublication"; } else { # parse this string: 25. April 2005, Neue Zürcher Zeitung
# DATE 1day 2month 3year 6publication $$block_ref =~ /^(\d+?)\.\s*(\w+?)\ \;(\d+)\,(.+?)\<br\>/; $date = make_RFC_822_date (timelocal(0,0,0,$1,$month{$2},$3)); $publication = encode_entities_numeric(decode_entities($4)) || "nopublication"; } # return date (RFC 822 compliant) and publicaton-type ($date, $publication); } # end sub get_date # extract link to article and titles from html block sub get_link_and_title { # expect reference to acutal block # return link, title1 and title2 my $block_ref = shift; my ($link,$title,$title2); my $title3; # is then cat to $title2 # LINK and TITLE1 1link 2title $$block_ref =~ /\<a\s+href\s*\=\s*\"(.+?)\"\s*\>(.+?)\<\//i; $link = $1 || "nolink"; $title = $2 || "notitle"; $title =~ s/\ / /ig; # replace non-braking-space with normal $title =~ s/\s*$//; # remove tailing whitespace $title = encode_entities_numeric(decode_entities($title)) || "notitle"; $link =~ s/\.\.(.+)/$site$1/; # title2 if (($$block_ref =~ /\<span\s+class\s*\=\s*\"B2\"\s*>(.+?)<\/span>/i) || ($$block_ref =~ /\<span\s+class\s*\=\s*\"newzz-ressort-headline-maintitle\"\s*>(.+?)<\/span>/i)) { $title2 = $1; $title2 =~ s/\<a\s*.*?\>//i; # remove possible links (i.e. when block == kommentar, second chance in || ) $title2 =~ s/\ / /ig; # replace non-braking-space with normal $title2 =~ s/\s*$//; # remove tailing whitespace $title2 = encode_entities_numeric(decode_entities("<b>$title2.</b> "))|| "notitle2"; # title3 if ($$block_ref =~ /<span\s+class\s*\=\s*\"B3\"\s*>(.+?)<\/span>/is) { $title3 = $1; $title3 =~ s/\ / /ig; # replace non-braking-space with normal $title3 =~ s/\s*$//; # remove tailing whitespace $title3 =~ s/\<a.+?\<\/a\>$//; # remove tailing link $title3 = encode_entities_numeric(decode_entities($title3))|| "notitle3"; } else { $title3 = ""; } $title2 = $title2.$title3; } else { # there are small titles especially old ones and from nzz print if ($$block_ref =~ /\<span\s+class\s*\=\s*\"B4\"\s*>(.+?)<\/span>/i) {; $title2 = $1; $title2 =~ s/\ / /ig; # replace non-braking-space with normal $title2 =~ s/\s*\<a.+?\<\/a\>\s*$//; # remove tailing link and whitespace $title2 = encode_entities_numeric(decode_entities("<b>$title2.</b>"))||"notitle2"; } else { $title2 = ""; } } # return values ($link, $title, $title2); } # extract image location from html block sub get_image { # expect reference to acutal block # return image location my $block_ref = shift; my ($image); if ($$block_ref =~ /<img\s+alt=\"\"\s+height=\"100\"\s+src="(.+?)\"\s+width=\"100"\s*\>/i) { $image = $site.$1; $image = encode_entities("<img src=\"$image\" height=\"100\" width=\"100\" align=\"left\" hspace=\"5\">"); } else { $image =""; } # return values $image; } # end sub get_image # write header of xml file sub write_xml_header { my $titel = shift; # actual local time for xml header my $date = make_RFC_822_date (); print "Content-type: text/xml\n\n"; open (TEMP, "$head_xml_file") || die "Can't open file $head_xml_file\n"; my $temp = join("",<TEMP>); $temp =~ s/--heute--/$date/s; $temp =~ s/--feed_titel--/$titel/s; print $temp; close (TEMP); } # put values in xml- sub write_xml { # expects a bunch of variables # returns xml lines of one item my ($rubrik, $date, $time, $publication, $link, $title, $title2, $image) = @_; return "<item> <title>$title</title> <description>$image $title2\n<p>$publication: $rubrik </description> <link>$link</link> <dc:subject>$rubrik</dc:subject> <dc:creator>gromski feed</dc:creator> <pubDate>$date</pubDate> </item>\n\n"; } # end sub write_xml sub make_RFC_822_date { # get unix time, if no argument, calculate with actual time # return RFC_822 compatible Date # lists for RFC-822 compatible Date my @Mon = qw (Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec); my @Wda = qw (Sun Mon Tue Wed Thu Fri Sat ); my ($sec,$min,$hour,$mday,$mon,$year,$wday,,) = localtime(shift||time()); # check argument or use actual time if ($hour>3) { $hour = $hour-2;} # nzz is located in switzerland: two more hours than gmt $year += 1900; $wday = $Wda[$wday]; $mon = $Mon[$mon]; return sprintf("%s, %02d %s %04d %02d:%02d:%02d GMT",$wday, $mday, $mon, $year, $hour, $min,0); } # end sub make_RFC_822_date # clear entire cache == put an old timestamp sub clear_cache { # set the timestamp of the whole cache back to the beginnings of life my $timestamp =timelocal(0,10,0,9,$month{"September"},1969); print "Content-Type: text/html; charset=iso-8859-1\n\n"; print "<!DOCTYPE html PUBLIC '-//W3C//DTD HTML 4.01 Transitional//EN'>\n<html>\n<head>\n\t<LINK REL='stylesheet' TYPE='text/css' HREF='$css'>\n\t<meta http-equiv='content-type' content='text/html; charset=ISO-8859-1'>\n<title>Cache geleert</title>\n</head>\n<body>\n\n"; print "<h1><a href=\"$html_home\">Timestamp ist jetzt überall: ".localtime($timestamp)."</h1>\n"; foreach (keys %rubrik) { utime $timestamp, $timestamp, "$file_cache/$_.txt"; } }