#!/usr/bin/perl
# Generate Apache RewriteRule lines that map old numbered archive pages
# (NNN.shtml) to their new date/title based locations.
#
# TODO: enable `use strict; use warnings;` for the whole file once every sub
# in it is strict-clean (the pragma is file-scoped).

# Configuration.
my $infile     = 'refered_to.dat';        # captured output of the java program
my $outfile    = 'apache_rewrites.conf';  # the generated rewrite rules go here
my $archivedir = 'writeon/archives';      # directory holding the copies of the
                                          # original individual archive HTML files

# My MT constructs titles as '<title>AkuAku SF: ENTRY TITLE</title>'.
# The blog-name prefix needs to be stripped from that data.
# These stay package globals (not lexicals) because the helper subs in this
# file have read $blogname, $title and $date directly from the package.
our $blogname = 'AkuAku SF';
our ($title, $date);

open(my $in,  '<', $infile)  or die "cannot open '$infile'\n";
open(my $out, '>', $outfile) or die "cannot open '$outfile'";

LINE: while (my $line = <$in>) {
    # Whitespace-separated fields; only the third one (the old URL) is used.
    my (undef, undef, $post) = split ' ', $line;
    next LINE unless defined $post;    # blank or short line

    # Only numbered individual-archive pages are rewritten.
    next LINE unless $post =~ /(\d+\.shtml)/;
    my $entry = $1;

    # The HTML handle stays a bareword global so parseHTML can fall back to
    # it; entries whose archive copy cannot be opened are skipped silently.
    open(HTML, '<', "$archivedir/$entry") or next LINE;
    ($title, $date) = parseHTML(\*HTML);
    close HTML;

    # Without both a title and a date no new URL can be built.
    next LINE unless defined $title && defined $date;

    my $newurl = getNewUrl($title, $date);

    # NOTE(review): $post is emitted unescaped into the rule's regex, so the
    # '.' in '.shtml' matches any character -- confirm that is acceptable.
    print {$out} "RewriteRule ^$post\$ http://AkuAku.org/archives/$newurl\n";
}

# Buffered write errors only surface when the handle is closed.
close($out) or die "cannot close '$outfile': $!\n";
close($in);
exit;
# Extract the entry title and the "YYYY/MM" date from an archive HTML page.
#
# Parameters:
#   $handle - open filehandle (glob, glob reference or lexical handle) to
#             read from.  Anything else -- including the bareword/string
#             'HTML' that legacy callers pass -- falls back to the global
#             HTML filehandle.
# Returns: the list ($title, $date).  Either element is undef when the page
#   has no matching line; if several lines match, the last one wins.
# Reads: $main::blogname, whose "<blogname>: " prefix is removed from the title.
sub parseHTML {
    my ($handle) = @_;

    my $fh = ( ref($handle) || ref(\$handle) eq 'GLOB' )
        ? $handle
        : \*main::HTML;

    # Both must be lexical: a package-level $date would leak the previous
    # page's date into a page that carries no dc:date attribute.
    my ($title, $date);

    while (my $line = <$fh>) {
        if ($line =~ m{<title>(.*)</title>}i) {
            $title = $1;
            # \Q...\E keeps the blog name from being interpreted as a regex.
            $title =~ s/\Q$main::blogname\E: //
                if defined $main::blogname && length $main::blogname;
        }
        if ($line =~ /dc:date="(\d{4})-(\d{2})-\d{2}/) {
            $date = "$1/$2";
        }
    }

    return ($title, $date);
}
# the following is a lot of educated guesswork
# Build the new archive path for an entry from its title and date.
#
# Parameters:
#   $title - entry title
#   $date  - directory prefix for the result (the caller passes "YYYY/MM")
# Returns: "$date/<slug>.shtml", where <slug> is the title with the listed
#   punctuation removed, runs of spaces collapsed and turned into
#   underscores, A-Z lowercased, cut to 15 characters and then stripped of
#   leading/trailing underscores.
sub getNewUrl {
    # Take the arguments from @_ instead of relying on same-named globals.
    my ($title, $date) = @_;

    my $slug = $title;
    $slug =~ s/[:\.\@=\-,'";\?\(\)]//g;    # drop punctuation
    $slug =~ s/ {2,}/ /g;                  # collapse runs of spaces
    $slug =~ s/ /_/g;                      # spaces become underscores
    $slug =~ tr/A-Z/a-z/;                  # ASCII-only lowercasing
    $slug = substr($slug, 0, 15);

    # Trim only after truncating: the cut may leave an underscore at the end.
    $slug =~ s/^_+//;
    $slug =~ s/_+$//;

    return $date . '/' . $slug . '.shtml';
}