#!/usr/bin/perl
#
# Builds Apache RewriteRule lines that map old numeric archive URLs
# (e.g. ".../000123.shtml") onto new title-based archive URLs.
#
# For every line of the input file, the third whitespace-separated field is
# taken as the old post URL. The matching archived HTML page is read to find
# the post's title and date, and a rewrite rule is written to the output file.

use strict;
use warnings;

# --- configuration ---------------------------------------------------------

# Captured output of the Java program; the third column holds the old URL.
my $infile = 'refered_to.dat';

# The generated rewrite rules are written here.
my $outfile = 'apache_rewrites.conf';

# Directory holding copies of the original individual archive HTML pages.
my $archivedir = 'writeon/archives';

# MT constructs page titles as 'AkuAku SF: <post title>'.
# That prefix has to be stripped from the <title> data.
my $blogname = 'AkuAku SF';

# Run the conversion only when executed as a script, so the helper subs can
# be loaded (e.g. by a test) without touching the filesystem.
main() unless caller;

# Reads $infile line by line and writes one RewriteRule per resolvable post
# to $outfile. Dies if either file cannot be opened, or if the output file
# cannot be closed cleanly. Posts whose archive page is missing, or whose
# title/date cannot be found, are skipped silently (best effort).
sub main {
    open my $in,  '<', $infile  or die "cannot open '$infile': $!\n";
    open my $out, '>', $outfile or die "cannot open '$outfile': $!\n";

    while ( my $line = <$in> ) {

        # Fields are: count, (ignored), post URL.
        my ( undef, undef, $post ) = split ' ', $line;
        next unless defined $post && $post =~ /(\d+\.shtml)/;
        my $entry = $1;

        # A missing archive page is not an error; just move on.
        open my $html, '<', "$archivedir/$entry" or next;
        my ( $title, $date ) = parseHTML($html);
        close $html;

        next unless defined $title && defined $date;

        my $newurl = getNewUrl( $title, $date );
        print {$out} "RewriteRule ^$post\$ http://AkuAku.org/archives/$newurl\n";
    }

    # Buffered write errors only surface at close time.
    close $out or die "cannot close '$outfile': $!\n";
    close $in;
    return;
}

# Extracts the post title and date from an archived HTML page.
#
#   $fh - readable filehandle positioned at the start of the page
#
# Returns the list ($title, $date). $title is the text inside a single-line
# <title>...</title> with the "$blogname: " prefix removed; $date is "YYYY/MM"
# taken from a dc:date="YYYY-MM-DD..." attribute. Either value is undef when
# the page does not contain it. If several lines match, the last one wins.
sub parseHTML {
    my ($fh) = @_;
    my ( $title, $date );

    while ( my $line = <$fh> ) {
        if ( $line =~ m{<title>(.*)</title>} ) {
            $title = $1;

            # \Q...\E: the blog name is literal text, not a pattern.
            $title =~ s/\Q$blogname\E: //;
        }
        if ( $line =~ /dc:date="(\d{4})-(\d{2})-\d{2}/ ) {
            $date = "$1/$2";
        }
    }

    return ( $title, $date );
}

# Derives the new archive path for a post. The following is a lot of educated
# guesswork about how the new file names are built.
#
#   $title - post title with the blog-name prefix already removed
#   $date  - "YYYY/MM" directory part
#
# Returns "$date/<slug>.shtml", where <slug> is the title with a fixed set of
# punctuation removed, runs of spaces collapsed, spaces turned into
# underscores, ASCII letters lower-cased, truncated to 15 characters, and
# finally stripped of leading and trailing underscores.
sub getNewUrl {
    my ( $title, $date ) = @_;

    my $slug = $title;
    $slug =~ s/[:.\@=\-,'";?()]//g;    # drop punctuation
    $slug =~ s/ {2,}/ /g;              # collapse runs of spaces
    $slug =~ s/ /_/g;
    $slug =~ tr/A-Z/a-z/;              # ASCII-only lower-casing
    $slug = substr( $slug, 0, 15 );

    # Truncation can leave an underscore at the edge, so trim afterwards.
    $slug =~ s/^_+//;
    $slug =~ s/_+$//;

    return $date . '/' . $slug . '.shtml';
}