#!/usr/bin/env perl # Author: David Griffith <dave@661.org> # Date: March 4, 2020 # Version: 1.0 # License: Artistic 2.0 # # Recursively convert Index files into gophermap files. # # To make the IF Archive's gopher mirror more pleasant to browse, the # Index files should be converted into proper gophermap files. This # script takes a path to the root of the IF Archive. If a path is not # supplied, then a default of /var/gopher/if-archive is used. If an Index # file is not present, something intelligent will be done with the files. # # This script is intended to be run as a cron(8) job immediately after the # cron(8) job to update the IF Archive mirror has run. By default, it # will check to see if the Index file being processed is less than a day # old. If so, the file will be processed and a new gophermap file # produced. If the Index file is older than a day, then we assume that it # hasn't changed since the last update. To force all gophermap files to # be rewritten, use the -f flag. # # For testing purposes, the -s flag can be used to print a single # gophermap file to standard output. The idea is to point the script at a # particular directory in the IF Archive and get a gophermap for it. # # Option flags: # # -? --usage Print simple usage message. # -h --help Verbose help message. # -d --dryrun Dry run. Don't write anything. # -f --force Force. Rewrite gophermap even if Index hasn't changed. # -s --single Single. Print one gophermap to stdout and exit. # -v --verbose Verbosity. Say which Index we're working on now. # # Examples: # # Really just executing "index2gophermap.pl" is enough if the IF Archive # mirror lives in /var/gopher/if-archive. This default can be changed by # editing the values of $gopherroot and $ifarchive. Otherwise the Archive # can be specified on the command line like this: # # index2gophermap.pl /export/disk3/gopher/if-archive # use strict; use warnings; use utf8; use File::Basename; use File::Temp qw(tempfile); use Cwd qw(getcwd); use Getopt::Long qw(:config no_ignore_case); use Net::Domain qw(hostname hostfqdn hostdomain); use Pod::Usage; use File::LibMagic; # libfile-libmagic-perl use DateTime; # libdatetime-perl my $progname = basename($0); my $version = "0.1"; my $dir = $ARGV[0]; my $hostname = hostname(); my $port = 70; my $index = "Index"; my $gophermap = "gophermap"; my $gopherroot = "/var/gopher"; my $ifarchive = "if-archive"; my $ifroot = "$gopherroot/$ifarchive"; my $ifdb = "https://ifdb.tads.org/viewgame?id="; my %options; my $dt; my $footer = "\n\n". "The IF Archive is a public service of the\n". "Interactive Fiction Technology Foundation.\n". "hhttp://iftechfoundation.org/\tURL:http://iftechfoundation.org/\n\n". "This mirror is a public service of 661.org.\n". "hhttp://661.org/\tURL:http://661.org/\n"; # These extensions are binary, but are detected as text. # There's got to be a less nasty way to do this. my @binary = ("pdf", "ps", "lha"); # These should be marked as images in some fashion. my @image = ("gif", "png", "jpg", "jpeg", "tif", "tiff", "pcx", "bmp", "ico"); # These are for binhex encoded files (primarily for Macs). # In the IF Archive, there are assorted *.bin files that could be # MacBinary or TTComp Macintosh archives, a strange ersatz shell archive # for Unix, or Atari 2600 cartridge images. my @macbin = ("hqx", "sit", "bin"); # These should be checked if they're for MSDOS. my @dosexec = ("exe", "com"); # Audio file extensions. my @audio = ("wav", "au", "aiff", "mp2", "mp3", "ogg"); # HTML file extensions. my @html = ("html", "htm"); my %audio = map { $_ => 1 } @audio; my %dosexec = map { $_ => 1 } @dosexec; my %macbin = map { $_ => 1 } @macbin; my %image = map { $_ => 1 } @image; my %binary = map { $_ => 1 } @binary; my %html = map { $_ => 1 } @html; # Avoid putting gophermaps in these directories: my @avoid = ("$ifroot", "$ifroot/unprocessed"); my %avoid = map { $_ => 1 } @avoid; GetOptions( 'usage|?' => \$options{usage}, 'h|help' => \$options{help}, 'd|dryrun' => \$options{dryrun}, 'f|force' => \$options{force}, 's|single' => \$options{single}, 'v|verbose' => \$options{verbose}, 'V|version' => \$options{version} ); if ($options{version}) { print "$progname version $version\n"; exit; } pod2usage(1) if $options{usage}; pod2usage(-verbose => 2) if $options{help}; $dt = DateTime->now(); my $starttime = $dt->epoch(); print "$progname: Starting at "; print $dt->day() . "-" . $dt->month_abbr() . "-" . $dt->year() . " " . $dt->hms(":")."\n"; if ($options{dryrun}) { print "$progname: Starting dry run.\n"; } if ($options{force}) { print "$progname: Forcing rewrite of all gophermaps.\n"; } if ($ARGV[0]) { traverse($ARGV[0]); } else { traverse($ifroot); } my $dt_done = DateTime->now(); print "$progname: Complete at "; print $dt_done->day() . "-" . $dt_done->month_abbr() . "-" . $dt_done->year() . " " . $dt_done->hms(":")."\n"; print "$progname: Runtime " . (($dt_done->epoch()) - ($starttime)) . " seconds\n"; ###################################################################### # Make a gophermap for this directory, then recurse through any # subdirectories. # sub traverse { my ($dir) = @_; my $pwd; chdir $dir; # On the first call, $dir is a complete path. Subsequently, # it's just the directory name. So, we need to use getcwd() # here to make sure to keep things straight. $pwd = getcwd(); # Make a gophermap for this directory unless it's one of those # that's not supposed to get one. if (!exists $avoid{$pwd} and ref($avoid{$pwd}) ne 'DateTime') { make_gophermap(); } elsif ($options{verbose}) { print "$progname: Avoiding $dir\n"; } # Check this directory for non-symlink subdirectories. foreach my $file (glob("*")) { next if -l $file or $file eq '.' or $file eq '..'; if (-d $file) { traverse($file); } } chdir ".."; return; } # This is where most of the magic happens. # # 1. If we have an Index file, proceed only if this Index file is newer # than X minutes ago. # # 2. Get the page header from Index and store it in an array, # translating any HTML URLs into a gopher URI. # # 3. Get the next filename, IFDB hashes (if any), and description (if # any). Store these line by line in a hash of arrays.. # # 4. Get lists of all normal files, subdirectories, and symlinks. # # 5. Write out the header to a gophermap file. # # 6. Write out the list of subdirectories to the gophermap file. # # 7. Check a list of files found in Index against the list of all # normal files. If there's a file missing from Index, aside from # certain ones we don't want to list, add it to the hash of arrays # and a list of all files in the hash of array. # # 8. Use the list of all files to access the hash of arrays in # alphabetical order and write out the file entries to the gophermap # file. # sub make_gophermap { my $pwd = getcwd(); my $index_fh; my $out_fh; my $outfile; my @chunk; my @header; my @symlinks; my @subdirs; my @files; my $filename; my @indexed_files; my %body; if ($options{verbose}) { print "$progname: processing $pwd\n"; } # If there's an Index file, check it. if (-f $index) { my $modtime = -M "$index"; $modtime = $modtime *60*24; # If the Index file is newer than X minutes ago, # rewrite the gophermap. if ($modtime > 30 && !$options{force}) { if ($options{verbose}) { print "$progname: skipping $pwd/$index\n"; return; } } if ($options{verbose}) { print "$progname: rewriting $pwd/$gophermap\n"; } open ($index_fh, "<", $index) or die "$progname: Unable to read $index. $!\n"; # Get the page header. If there's nothing before the # first file description, we get nothing. @header = getchunk($index_fh); # Parse the list of files in Index and store in a hash # of arrays. while ($filename = getfilename($index_fh)) { push @indexed_files, ($filename); # If present, the IFDB key is always right after # the filename. @chunk = getifdb($index_fh); if (@chunk) { my @fixed; foreach my $line (@chunk) { push @fixed, ("hIFDB entry\tURL:$ifdb$line"); } push (@{$body{$filename}}, @fixed); } # If there's an HTML URL in the body, we're not # catching it yet. @chunk = getchunk($index_fh); if (@chunk) { push (@{$body{$filename}}, @chunk); } } close($index_fh); } # Regardless of if we have an Index file or not, save lists of # subdirectories, symlinks, and regular files. opendir(SUBDIR, $pwd); while (my $file_test = readdir(SUBDIR)) { next if $file_test eq '.' or $file_test eq '..'; next if $file_test eq $index or $file_test eq $gophermap; next if $file_test =~ /^\./; if (-l $file_test) { push @symlinks, ($file_test); } elsif (-d $file_test) { push @subdirs, ($file_test); } else { push @files, ($file_test); } } closedir(SUBDIR); if ($options{dryrun}) { $outfile = "/dev/null"; } else { $outfile = $gophermap; } # Now put it all together. if ($options{single}) { $out_fh = *STDOUT; } else { open ($out_fh, ">", $outfile) or die "$progname: Unable to write to $pwd/$outfile. $!\n"; } # Print the header. if (@header) { foreach my $line (@header) { print $out_fh "$line\n"; } } my $path = $pwd; $path =~ s/$gopherroot//g; if (@subdirs) { my $subdir_count = @subdirs; print $out_fh "\n\n$subdir_count Subdirector"; if ($subdir_count > 1) { print $out_fh "ies:\n"; } else { print $out_fh "y:\n"; } foreach my $line (sort @subdirs) { print $out_fh "1" . $line . "\t$path/$line\n"; } } if (@indexed_files) { # Merge unindexed files in with the indexed files and # their descriptions. my %indexed_files_hash = map{$_ => 1} @indexed_files; my @newfiles; my $count = 0; foreach my $file (@files) { if (!exists $indexed_files_hash{$file}) { push @newfiles, ($file); } } push @indexed_files, @newfiles; @indexed_files = sort @indexed_files; # Write out the files and descriptions. # Timestamp format is defined here. my $items = @indexed_files; print $out_fh "\n\n$items item"; if ($items > 1) { print $out_fh "s"; } print $out_fh ":\n"; foreach my $file (@indexed_files) { $dt = DateTime->from_epoch(epoch => (stat($file))[9]); print $out_fh filetype($file)."$file\t$path/$file\n"; print $out_fh "[" . $dt->day() . "-" . $dt->month_abbr() . "-" . $dt->year() . "]\n"; foreach my $line (@{$body{$file}}) { # Bullet points if ($line =~ /^-/) { $line =~ s/^-\s*//; print $out_fh "i$line\tfoo\t$port\n"; } else { print $out_fh "$line\n"; } } $count++; if ($count != @indexed_files) { print $out_fh "\n"; } } } elsif (@files) { # We don't have an Index, so just print file with gopher # descriptors. # Timestamp format is defined here. my $items = @files; my $count = 0; print $out_fh "\n\n$items item"; if ($items > 1) { print $out_fh "s"; } print $out_fh ":\n"; foreach my $file (@files) { $dt = DateTime->from_epoch(epoch => (stat($file))[9]); print $out_fh filetype($file)."$file\t$path/$file\n"; print $out_fh "[" . $dt->day() . "-" . $dt->month_abbr() . "-" . $dt->year() . "]\n\n"; } } print $out_fh $footer; if ($options{single}) { exit; } close($out_fh); return; } ################################################################### # These functions below are all called only from make_gophermap() # or subordinate functions. ################################################################### # Return the standard gopher filetype for the supplied filename. # See https://gopher.zone/posts/how-to-gophermap/ # # According to # https://dev.to/dotcomboom/the-gopher-protocol-in-brief-1d88, only .wav # files should be given an item type of s. All other audio formats # should have an item type of 9. Not sure how correct this is. # sub filetype { my ($file, @junk) = @_; my $ext; ($ext) = $file =~ /(\.[^.]+)$/; if (!$ext) { $ext = "NeVeRhApPeNs"; } else { $ext =~ s/\.//; $ext = lc($ext); } if (exists $html{$ext}) { return "h"; } elsif (exists $audio{$ext}) { my $magic = File::LibMagic->new(); my $info = $magic->info_from_filename($file); my $description = substr($info->{description}, 0, 22); if ($description =~ /^RIFF.*/ or $description =~ /Sun\/NeXT\saudio\sdata/) { return "s"; } elsif ($description =~ /IFF\sdata,\sAIFF\saudio/) { return "s"; } elsif ($description =~ /^Ogg.*/ or $description =~ /MPEG\sADTS,\slayer II.*/) { return "s"; } return 9; } elsif (exists $dosexec{$ext}) { my $magic = File::LibMagic->new(); my $info = $magic->info_from_filename($file); my $description = substr($info->{description}, 0, 20); if ($description =~ /^DOS.*/ or $description =~ /^MS-DOS.*/) { return 5; } elsif ($description =~ /^ASCII.*/) { return 0; } return 9; } elsif (exists $macbin{$ext}) { # In the IF Archive, there are assorted *.bin files that # could be MacBinary or TTComp Macintosh archives, a # strange ersatz shell archive for Unix, or Atari 2600 # cartridge images. if ($ext eq "bin") { my $magic = File::LibMagic->new(); my $info = $magic->info_from_filename($file); my $description = substr($info->{description}, 0, 20); if ($description =~ /^MacBinary.*/ or $description =~ /^TTComp.*/) { return 4; # Macintosh archives. } elsif ($description =~ /^POSIX.*/) { return 0; # Text for a shell archive } return 9; # Generic binary. } return 4; # Other Macintosh archives. } elsif (exists $image{$ext}) { if ($ext eq "gif") { return "g"; } return "I"; } elsif (exists $binary{$ext}) { return 9; } elsif (-d $file) { return 1; } elsif (!-T $file) { return 9; } return 0; } # Get a key to an IFDB entry. If present, it will always be on the next # line after a filename. If there's something on the next line after an # IFDB key, it'll be another IFDB key. # sub getifdb { my $fh = shift; my @tuids; my $got_tuid = 0; my $pos; my $line; $line = <$fh>; if ($line =~ /^tuid:/) { $line =~ s/^tuid:\s*//; chomp($line); push @tuids, ($line); while (my $nextline = <$fh>) { if ($nextline =~ /^\s*$/) { return @tuids; } else { $nextline =~ s/^\s*//; chomp($nextline); push @tuids, ($nextline); } } } return; } # Return the next filename or nothing if not found (EOF). # Filename is marked like this: /^#\sfilename/ # sub getfilename { my $fh = shift; my $line; while (my $line = <$fh>) { if ($line =~ /^#/) { chomp($line); $line =~ s/^#\s*//; return $line; } } return; } # Return a chunk of text in the form of an array, one line per element, # describing a file. # sub getchunk { my $fh = shift; my $line; my $topdone; my $filename; my @chunk; my $pos; # Get text until the next hashmark or EOF. $pos = tell $fh; while ($line = <$fh>) { # Get rid of whitespace at top of file. if (($line =~ /^\s*$/) && !$topdone) { next; } # If EOF, simply return what we got. if (!$line) { return @chunk; } # A hashmark indicates the next file, so if that's what we # have, go back to the previous line and terminate the loop. if ($line =~ /^#/) { seek $fh, $pos, 0; last; } # Angle brackets mark URLs if ($line =~ /\<.*\>/) { my @result; chomp($line); $line =~ s/^[^\<]*\<//; $line =~ s/\>.*//; $line = "h" . $line . "\tURL:$line"; } $topdone = 1; # We're past any leading blank lines. chomp($line); push @chunk, $line; $pos = tell $fh; } # Get rid of trailing blank lines. foreach my $foo (@chunk) { if ($chunk[-1] =~ /^\s*$/) { pop @chunk; } } return @chunk; } __END__ =head1 NAME index2gophermap.pl - Recursively convert Index files into gophermap files =head1 SYNOPSIS index2gophermap.pl [options] [<directory>] =head1 DESCRIPTION To make the IF Archive's gopher mirror more pleasant to browse, the Index files should be converted into proper gophermap files. This script takes a path to the root of the IF Archive. If a path is not supplied, then a default of /var/gopher/if-archive is used. If an Index file is not present, something intelligent will be done with the files. This script is intended to be run as a cron(8) job immediately after the cron(8) job to update the IF Archive mirror has run. By default, it will check to see if the Index file being processed is less than a day old. If so, the file will be processed and a new gophermap file produced. If the Index file is older than a day, then we assume that it hasn't changed since the last update. To force all gophermap files to be rewritten, use the -f flag. For testing purposes, the -s flag can be used to print a single gophermap file to standard output. The idea is to point the script at a particular directory in the IF Archive and get a gophermap for it. =head2 Option flags -? --usage Print simple usage message. -h --help Verbose help message. -d --dryrun Dry run. Don't write anything. -f --force Force. Rewrite gophermap even if Index hasn't changed. -s --single Single. Print one gophermap to stdout and exit. -v --verbose Verbosity. Say which Index we're working on now. =head2 Examples Really just executing "index2gophermap.pl" is enough if the IF Archive mirror lives in /var/gopher/if-archive. This default can be changed by editing the values of $gopherroot and $ifarchive. Otherwise the Archive can be specified on the command line like this: index2gophermap.pl /export/disk3/gopher/if-archive =head1 LICENSE Artistic License 2.0 =head1 AUTHOR David Griffith <dave@661.org>