#!/usr/bin/env perl

# Author:	David Griffith <dave@661.org>
# Date:		March 4, 2020
# Version:	1.0
# License:	Artistic 2.0
#
# Recursively convert Index files into gophermap files.
#
# To make the IF Archive's gopher mirror more pleasant to browse, the 
# Index files should be converted into proper gophermap files.  This 
# script takes a path to the root of the IF Archive.  If a path is not 
# supplied, then a default of /var/gopher/if-archive is used.  If an Index 
# file is not present, something intelligent will be done with the files.
#
# This script is intended to be run as a cron(8) job immediately after the
# cron(8) job to update the IF Archive mirror has run.  By default, it
# will check to see if the Index file being processed is less than a day
# old.  If so, the file will be processed and a new gophermap file
# produced.  If the Index file is older than a day, then we assume that it
# hasn't changed since the last update.  To force all gophermap files to
# be rewritten, use the -f flag.
#
# For testing purposes, the -s flag can be used to print a single 
# gophermap file to standard output.  The idea is to point the script at a 
# particular directory in the IF Archive and get a gophermap for it.
#
# Option flags:
#
#  -?  --usage           Print simple usage message.
#  -h  --help            Verbose help message.
#  -d  --dryrun          Dry run.  Don't write anything.
#  -f  --force           Force.  Rewrite gophermap even if Index hasn't changed.
#  -s  --single          Single.  Print one gophermap to stdout and exit.
#  -v  --verbose         Verbosity.  Say which Index we're working on now.
#
# Examples:
#
# Really just executing "index2gophermap.pl" is enough if the IF Archive 
# mirror lives in /var/gopher/if-archive.  This default can be changed by 
# editing the values of $gopherroot and $ifarchive.  Otherwise the Archive 
# can be specified on the command line like this:
# 
#   index2gophermap.pl /export/disk3/gopher/if-archive
#

use strict;
use warnings;
use utf8;
use File::Basename;
use File::Temp qw(tempfile);
use Cwd qw(getcwd);
use Getopt::Long qw(:config no_ignore_case);
use Net::Domain qw(hostname hostfqdn hostdomain);
use Pod::Usage;

use File::LibMagic;	# libfile-libmagic-perl
use DateTime;		# libdatetime-perl

my $progname = basename($0);
my $version = "0.1";
my $dir = $ARGV[0];
my $hostname = hostname();
my $port = 70;
my $index = "Index";
my $gophermap = "gophermap";
my $gopherroot = "/var/gopher";
my $ifarchive = "if-archive";
my $ifroot = "$gopherroot/$ifarchive";
my $ifdb = "https://ifdb.tads.org/viewgame?id=";
my %options;
my $dt;

my $footer =	"\n\n".
		"The IF Archive is a public service of the\n".
		"Interactive Fiction Technology Foundation.\n".
		"hhttp://iftechfoundation.org/\tURL:http://iftechfoundation.org/\n\n".
		"This mirror is a public service of 661.org.\n".
		"hhttp://661.org/\tURL:http://661.org/\n";

# These extensions are binary, but are detected as text.
# There's got to be a less nasty way to do this.
my @binary = ("pdf", "ps", "lha"); 

# These should be marked as images in some fashion.
my @image = ("gif", "png", "jpg", "jpeg", "tif", "tiff", "pcx", "bmp", "ico");

# These are for binhex encoded files (primarily for Macs).
# In the IF Archive, there are assorted *.bin files that could be 
# MacBinary or TTComp Macintosh archives, a strange ersatz shell archive 
# for Unix, or Atari 2600 cartridge images.
my @macbin = ("hqx", "sit", "bin");

# These should be checked if they're for MSDOS.
my @dosexec = ("exe", "com");

# Audio file extensions.
my @audio = ("wav", "au", "aiff", "mp2", "mp3", "ogg");

# HTML file extensions.
my @html = ("html", "htm");

my %audio	= map { $_ => 1 } @audio;
my %dosexec	= map { $_ => 1 } @dosexec;
my %macbin	= map { $_ => 1 } @macbin;
my %image	= map { $_ => 1 } @image;
my %binary	= map { $_ => 1 } @binary;
my %html	= map { $_ => 1 } @html;


# Avoid putting gophermaps in these directories:
my @avoid = ("$ifroot", "$ifroot/unprocessed");
my %avoid = map { $_ => 1 } @avoid;	


GetOptions(
	'usage|?' => \$options{usage},
	'h|help' => \$options{help},
	'd|dryrun' => \$options{dryrun},
	'f|force' => \$options{force},
	's|single' => \$options{single},
	'v|verbose' => \$options{verbose},
	'V|version' => \$options{version}
	);

if ($options{version}) {
	print "$progname version $version\n";
	exit;
}

pod2usage(1) if $options{usage};
pod2usage(-verbose => 2) if $options{help};

$dt = DateTime->now();
my $starttime = $dt->epoch();
print "$progname: Starting at ";
print	$dt->day() . "-" .
	$dt->month_abbr() . "-" .
	$dt->year() . " " .
	$dt->hms(":")."\n";


if ($options{dryrun}) {
	print "$progname: Starting dry run.\n";
}
if ($options{force}) {
	print "$progname: Forcing rewrite of all gophermaps.\n";
}

if ($ARGV[0]) {
	traverse($ARGV[0]);
} else {
	traverse($ifroot);
}

my $dt_done = DateTime->now();
print "$progname: Complete at ";
print	$dt_done->day() . "-" .
	$dt_done->month_abbr() . "-" .
	$dt_done->year() . " " .
	$dt_done->hms(":")."\n";
print "$progname: Runtime " . (($dt_done->epoch()) - ($starttime)) . " seconds\n";


######################################################################

# Make a gophermap for this directory, then recurse through any 
# subdirectories.
#
sub traverse {
	my ($dir) = @_;
	my $pwd;

	chdir $dir;

	# On the first call, $dir is a complete path.  Subsequently, 
	# it's just the directory name.  So, we need to use getcwd() 
	# here to make sure to keep things straight.
	$pwd = getcwd();

	# Make a gophermap for this directory unless it's one of those
	# that's not supposed to get one.
	if (!exists $avoid{$pwd} and ref($avoid{$pwd}) ne 'DateTime') {
		make_gophermap();
	} elsif ($options{verbose}) {
		 print "$progname: Avoiding $dir\n";
	}

	# Check this directory for non-symlink subdirectories.
	foreach my $file (glob("*")) {
		next if -l $file or $file eq '.' or $file eq '..';
		if (-d $file) {
			traverse($file);
		}
	}
	chdir "..";
	return;
}


# This is where most of the magic happens.
#
# 1.  If we have an Index file, proceed only if this Index file is newer 
#     than X minutes ago.
#
# 2.  Get the page header from Index and store it in an array, 
#     translating any HTML URLs into a gopher URI.
#
# 3.  Get the next filename, IFDB hashes (if any), and description (if 
#     any).  Store these line by line in a hash of arrays..
#
# 4.  Get lists of all normal files, subdirectories, and symlinks.
#
# 5.  Write out the header to a gophermap file.
#
# 6.  Write out the list of subdirectories to the gophermap file.
#
# 7.  Check a list of files found in Index against the list of all 
#     normal files.  If there's a file missing from Index, aside from 
#     certain ones we don't want to list, add it to the hash of arrays 
#     and a list of all files in the hash of array.
#
# 8.  Use the list of all files to access the hash of arrays in 
#     alphabetical order and write out the file entries to the gophermap 
#     file.
#
sub make_gophermap {
	my $pwd = getcwd();
	my $index_fh;
	my $out_fh;
	my $outfile;

	my @chunk;
	my @header;

	my @symlinks;
	my @subdirs;
	my @files;
	my $filename;
	my @indexed_files;
	my %body;

	if ($options{verbose}) {
		print "$progname: processing $pwd\n";
	}

	# If there's an Index file, check it.
	if (-f $index) {
		my $modtime = -M "$index";
		$modtime = $modtime *60*24;

		# If the Index file is newer than X minutes ago,
		# rewrite the gophermap.
		if ($modtime > 30 && !$options{force}) {
			if ($options{verbose}) {
				print "$progname: skipping $pwd/$index\n";
				return;
			}
		}

		if ($options{verbose}) {
			print "$progname: rewriting $pwd/$gophermap\n";
		}
		open ($index_fh, "<", $index) or die "$progname: Unable to read $index. $!\n";

		# Get the page header. If there's nothing before the 
		# first file description, we get nothing.
		@header = getchunk($index_fh);

		# Parse the list of files in Index and store in a hash
		# of arrays.
		while ($filename = getfilename($index_fh)) {
			push @indexed_files, ($filename);

			# If present, the IFDB key is always right after
			# the filename.
			@chunk = getifdb($index_fh);
			if (@chunk) {
				my @fixed;
				foreach my $line (@chunk) {
					push @fixed, ("hIFDB entry\tURL:$ifdb$line");
				}
				push (@{$body{$filename}}, @fixed);
			}

			# If there's an HTML URL in the body, we're not 
			# catching it yet.
			@chunk = getchunk($index_fh);
			if (@chunk) {
				push (@{$body{$filename}}, @chunk);
			}
		}
		close($index_fh);
	}

	# Regardless of if we have an Index file or not, save lists of 
	# subdirectories, symlinks, and regular files.
	opendir(SUBDIR, $pwd);
	while (my $file_test = readdir(SUBDIR)) {
		next if $file_test eq '.' or $file_test eq '..';
		next if $file_test eq $index or $file_test eq $gophermap;
		next if $file_test =~ /^\./;

		if (-l $file_test) {
			push @symlinks, ($file_test);
		} elsif (-d $file_test) {
			push @subdirs, ($file_test);
		} else {
			push @files, ($file_test);
		}
	}	
	closedir(SUBDIR);
	
	if ($options{dryrun}) {
		$outfile = "/dev/null";
	} else {
		$outfile = $gophermap;
	}

	# Now put it all together.
	if ($options{single}) {
		$out_fh = *STDOUT;
	} else {
		open ($out_fh, ">", $outfile) or die "$progname: Unable to write to $pwd/$outfile. $!\n";
	}

	# Print the header.
	if (@header) {
		foreach my $line (@header) {
			print $out_fh "$line\n";
		}
	}

	my $path = $pwd;
	$path =~ s/$gopherroot//g;

	if (@subdirs) {
		my $subdir_count = @subdirs;
		print $out_fh "\n\n$subdir_count Subdirector";
		if ($subdir_count > 1) {
			print $out_fh "ies:\n";
		} else {
			print $out_fh "y:\n";
		}
 
		foreach my $line (sort @subdirs) {
			print $out_fh "1" . $line . "\t$path/$line\n";
		}
	}

	if (@indexed_files) {
		# Merge unindexed files in with the indexed files and 
		# their descriptions.
		my %indexed_files_hash = map{$_ => 1} @indexed_files;
		my @newfiles;
		my $count = 0;
		foreach my $file (@files) {
			if (!exists $indexed_files_hash{$file}) {
				push @newfiles, ($file);
			}
		}
		push @indexed_files, @newfiles;
		@indexed_files = sort @indexed_files;

		# Write out the files and descriptions.
		# Timestamp format is defined here.
		my $items = @indexed_files;
		print $out_fh "\n\n$items item";
		if ($items > 1) {
			print $out_fh "s";
		}
		print $out_fh ":\n";

		foreach my $file (@indexed_files) {
			$dt = DateTime->from_epoch(epoch => (stat($file))[9]);
			print $out_fh filetype($file)."$file\t$path/$file\n";
			print $out_fh "[" .	$dt->day() . "-" .
						$dt->month_abbr() . "-" .
						$dt->year() . "]\n";
			foreach my $line (@{$body{$file}}) {
				# Bullet points
				if ($line =~ /^-/) {
					$line =~ s/^-\s*//;
					print $out_fh "i$line\tfoo\t$port\n";
				} else {
					print $out_fh "$line\n";
				}
			}
			$count++;
			if ($count != @indexed_files) {
				print $out_fh "\n";
			}
		}
	} elsif (@files) {
		# We don't have an Index, so just print file with gopher 
		# descriptors.
		# Timestamp format is defined here.
		my $items = @files;
		my $count = 0;
		print $out_fh "\n\n$items item";
		if ($items > 1) {
			print $out_fh "s";
		}
		print $out_fh ":\n";
		foreach my $file (@files) {
			$dt = DateTime->from_epoch(epoch => (stat($file))[9]);			
			print $out_fh filetype($file)."$file\t$path/$file\n";
			print $out_fh "[" .	$dt->day() . "-" .
						$dt->month_abbr() . "-" .
						$dt->year() . "]\n\n";
		}
	}		

	print $out_fh $footer;

	if ($options{single}) {
		exit;
	}

	close($out_fh);
	return;
}


###################################################################
# These functions below are all called only from make_gophermap()
# or subordinate functions.
###################################################################


# Return the standard gopher filetype for the supplied filename.
# See https://gopher.zone/posts/how-to-gophermap/
#
# According to 
# https://dev.to/dotcomboom/the-gopher-protocol-in-brief-1d88, only .wav 
# files should be given an item type of s.  All other audio formats 
# should have an item type of 9.  Not sure how correct this is.
#
sub filetype {
	my ($file, @junk) = @_;
	my $ext;

	($ext) = $file =~ /(\.[^.]+)$/;
	if (!$ext) {
		$ext = "NeVeRhApPeNs";
	} else {
		$ext =~ s/\.//;
		$ext = lc($ext);
	}

	if (exists $html{$ext}) {
		return "h";
	} elsif (exists $audio{$ext}) {
		my $magic = File::LibMagic->new();
		my $info = $magic->info_from_filename($file);
		my $description = substr($info->{description}, 0, 22);
		if ($description =~ /^RIFF.*/ or $description =~ /Sun\/NeXT\saudio\sdata/) {
			return "s";
		}
		elsif ($description =~ /IFF\sdata,\sAIFF\saudio/) {
			return "s";
		}
		elsif ($description =~ /^Ogg.*/ or $description =~ /MPEG\sADTS,\slayer II.*/) {
			return "s";
		}
		return 9;
	} elsif (exists $dosexec{$ext}) {
		my $magic = File::LibMagic->new();
		my $info = $magic->info_from_filename($file);
		my $description = substr($info->{description}, 0, 20);
		if ($description =~ /^DOS.*/ or $description =~ /^MS-DOS.*/) {
			return 5;
		} elsif ($description =~ /^ASCII.*/) {
			return 0;
		}
		return 9;
	} elsif (exists $macbin{$ext}) {
		# In the IF Archive, there are assorted *.bin files that 
		# could be MacBinary or TTComp Macintosh archives, a 
		# strange ersatz shell archive for Unix, or Atari 2600 
		# cartridge images.
		if ($ext eq "bin") {
			my $magic = File::LibMagic->new();
			my $info = $magic->info_from_filename($file);
			my $description = substr($info->{description}, 0, 20);
			if ($description =~ /^MacBinary.*/ or
			    $description =~ /^TTComp.*/) {
				return 4;	# Macintosh archives.
			} elsif ($description =~ /^POSIX.*/) {
				return 0;	# Text for a shell archive
			}
			return 9;		# Generic binary.
		}
		return 4;	# Other Macintosh archives.
	} elsif (exists $image{$ext}) {
		if ($ext eq "gif") {
			return "g";
		}
		return "I";
	} elsif (exists $binary{$ext}) {
		return 9;
	} elsif (-d $file) {
		return 1;
	} elsif (!-T $file) {
		return 9;
	}
	return 0;
}


# Get a key to an IFDB entry.  If present, it will always be on the next 
# line after a filename.  If there's something on the next line after an 
# IFDB key, it'll be another IFDB key.
#
sub getifdb {
	my $fh = shift;
	my @tuids;
	my $got_tuid = 0;
	my $pos;
	my $line;

	$line = <$fh>;
	if ($line =~ /^tuid:/) {
		$line =~ s/^tuid:\s*//;
		chomp($line);
		push @tuids, ($line);
		while (my $nextline = <$fh>) {
			if ($nextline =~ /^\s*$/) {
				return @tuids;
			} else {
				$nextline =~ s/^\s*//;
				chomp($nextline);
				push @tuids, ($nextline);
			}
		}
	}
	return;
}


# Return the next filename or nothing if not found (EOF).
# Filename is marked like this: /^#\sfilename/
#
sub getfilename {
	my $fh = shift;
	my $line;

	while (my $line = <$fh>) {
		if ($line =~ /^#/) {
			chomp($line);
			$line =~ s/^#\s*//;
			return $line;
		}
	}
	return;
}


# Return a chunk of text in the form of an array, one line per element, 
# describing a file.
#
sub getchunk {
	my $fh = shift;
	my $line;
	my $topdone;
	my $filename;
	my @chunk;
	my $pos;

	# Get text until the next hashmark or EOF.
	$pos = tell $fh;
	while ($line = <$fh>) {
		# Get rid of whitespace at top of file.
		if (($line =~ /^\s*$/) && !$topdone) { next; }
		# If EOF, simply return what we got.
		if (!$line) {
			return @chunk;
		}
		# A hashmark indicates the next file, so if that's what we
		# have, go back to the previous line and terminate the loop. 
		if ($line =~ /^#/) {
			seek $fh, $pos, 0;
			last;
		}
		# Angle brackets mark URLs
		if ($line =~ /\<.*\>/) {
			my @result;
			chomp($line);
			$line =~ s/^[^\<]*\<//;
			$line =~ s/\>.*//;
			$line = "h" . $line . "\tURL:$line";
		}
		$topdone = 1;	# We're past any leading blank lines.
		chomp($line);
		push @chunk, $line;
		$pos = tell $fh;
	}
	# Get rid of trailing blank lines.
	foreach my $foo (@chunk) {
		if ($chunk[-1] =~ /^\s*$/) {
			pop @chunk;
		}
	}
	return @chunk;
}


__END__


=head1 NAME

index2gophermap.pl - Recursively convert Index files into gophermap files

=head1 SYNOPSIS

index2gophermap.pl [options] [<directory>]

=head1 DESCRIPTION

To make the IF Archive's gopher mirror more pleasant to browse, the 
Index files should be converted into proper gophermap files.  This 
script takes a path to the root of the IF Archive.  If a path is not 
supplied, then a default of /var/gopher/if-archive is used.  If an Index 
file is not present, something intelligent will be done with the files.

This script is intended to be run as a cron(8) job immediately after the 
cron(8) job to update the IF Archive mirror has run.  By default, it 
will check to see if the Index file being processed is less than a day 
old.  If so, the file will be processed and a new gophermap file 
produced.  If the Index file is older than a day, then we assume that it 
hasn't changed since the last update.  To force all gophermap files to 
be rewritten, use the -f flag.

For testing purposes, the -s flag can be used to print a single 
gophermap file to standard output.  The idea is to point the script at a 
particular directory in the IF Archive and get a gophermap for it.

=head2 Option flags

  -?  --usage		Print simple usage message.
  -h  --help		Verbose help message.
  -d  --dryrun		Dry run.  Don't write anything.
  -f  --force		Force. Rewrite gophermap even if Index hasn't changed.
  -s  --single          Single. Print one gophermap to stdout and exit.
  -v  --verbose		Verbosity. Say which Index we're working on now.
    
=head2 Examples

Really just executing "index2gophermap.pl" is enough if the IF Archive 
mirror lives in /var/gopher/if-archive.  This default can be changed by 
editing the values of $gopherroot and $ifarchive.  Otherwise the Archive 
can be specified on the command line like this:
 
  index2gophermap.pl /export/disk3/gopher/if-archive


=head1 LICENSE

Artistic License 2.0

=head1 AUTHOR

David Griffith <dave@661.org>