#!/usr/bin/perl -w use strict; use XML::RSS; use LWP::Simple; use HTML::Entities; use HTML::TokeParser::Simple; my $rss = new XML::RSS (version => '1.0'); my $url = "http://www.linux.org.uk/~telsa/Diary/diary.html"; my $page = get($url); my $stream = HTML::TokeParser::Simple->new(\$page); my $tag; $rss->channel(title => "The more accurate diary. Really.", link => $url, description => "Telsa's diary of life with a hacker:" . " the current ramblings"); while ($tag = $stream->get_token) { next unless $tag->is_start_tag ('a'); next unless $tag->return_attr("name") ne ""; my $link = $tag->return_attr("name"); $tag = $stream->get_token; next unless $tag->is_start_tag ('strong'); $tag = $stream->get_token; my $title = $tag->as_is; $tag = $stream->get_token; next unless $tag->is_end_tag ('/strong'); $tag = $stream->get_token; next unless $tag->is_end_tag ('/a'); $tag = $stream->get_token; next unless $tag->is_end_tag ('/dt'); $tag = $stream->get_token; #We've got whitespace; on to the next tag $tag = $stream->get_token; next unless $tag->is_start_tag ('dd'); my $content = ""; $tag = $stream->get_token; until ($tag->is_end_tag('/dd')) { $content .= $tag->as_is; $tag = $stream->get_token; next; } $rss->add_item(title => $title, link => "$url#$link", description => encode_entities($content)); } print $rss->as_string;