Getting PTV GTFS Into Your MySQL Database
PTV has finally released their trip information in GTFS format, but that’s only half of the battle; to make use of this data, you’ll need it in a format other than ‘zip file of text files’. Because I (decided to, this week) love Bash, I though I’d use everyone’s favourite shell to get up close with GTFS to MySQL translations.
You’ll need a schema:
CREATE DATABASE IF NOT EXISTS `gtfs` DEFAULT CHARACTER SET latin1;
USE `gtfs`;
-- --------------------------------------------------------
--
-- Table structure for table `agency`
--
CREATE TABLE IF NOT EXISTS `agency` (
`agency_id` varchar(11) NOT NULL,
`agency_name` varchar(255) DEFAULT NULL,
`agency_url` varchar(255) DEFAULT NULL,
`agency_timezone` varchar(50) DEFAULT NULL,
`agency_lang` varchar(2) NOT NULL,
`mode` int(11) NOT NULL DEFAULT '0'
) ENGINE=MyISAM DEFAULT CHARSET=latin1;
-- --------------------------------------------------------
--
-- Table structure for table `calendar`
--
CREATE TABLE IF NOT EXISTS `calendar` (
`service_id` varchar(15) NOT NULL DEFAULT '',
`monday` tinyint(1) DEFAULT NULL,
`tuesday` tinyint(1) DEFAULT NULL,
`wednesday` tinyint(1) DEFAULT NULL,
`thursday` tinyint(1) DEFAULT NULL,
`friday` tinyint(1) DEFAULT NULL,
`saturday` tinyint(1) DEFAULT NULL,
`sunday` tinyint(1) DEFAULT NULL,
`start_date` varchar(8) DEFAULT NULL,
`end_date` varchar(8) DEFAULT NULL,
`mode` int(11) NOT NULL DEFAULT '0'
) ENGINE=MyISAM DEFAULT CHARSET=latin1;
-- --------------------------------------------------------
--
-- Table structure for table `calendar_dates`
--
CREATE TABLE IF NOT EXISTS `calendar_dates` (
`service_id` varchar(15) NOT NULL DEFAULT '',
`date` varchar(8) NOT NULL DEFAULT '',
`date_timestamp` int(11) DEFAULT NULL,
`exception_type` int(2) DEFAULT NULL,
`mode` int(11) NOT NULL DEFAULT '0'
) ENGINE=MyISAM DEFAULT CHARSET=latin1;
-- --------------------------------------------------------
--
-- Table structure for table `routes`
--
CREATE TABLE IF NOT EXISTS `routes` (
`route_id` varchar(15) NOT NULL,
`agency_id` varchar(11) DEFAULT NULL,
`route_short_name` varchar(50) DEFAULT NULL,
`route_long_name` varchar(255) DEFAULT NULL,
`route_type` int(2) DEFAULT NULL,
`route_text_color` varchar(255) DEFAULT NULL,
`route_color` varchar(255) DEFAULT NULL,
`route_url` varchar(255) DEFAULT NULL,
`route_desc` varchar(255) DEFAULT NULL,
`mode` int(11) NOT NULL DEFAULT '0'
) ENGINE=MyISAM DEFAULT CHARSET=latin1;
-- --------------------------------------------------------
--
-- Table structure for table `shapes`
--
CREATE TABLE IF NOT EXISTS `shapes` (
`shape_id` varchar(30) NOT NULL,
`shape_pt_lat` decimal(18,15) NOT NULL,
`shape_pt_lon` decimal(18,15) NOT NULL,
`shape_pt_sequence` int(11) NOT NULL,
`shape_dist_traveled` double NOT NULL,
`mode` int(11) NOT NULL DEFAULT '0'
) ENGINE=MyISAM DEFAULT CHARSET=latin1;
-- --------------------------------------------------------
--
-- Table structure for table `stops`
--
CREATE TABLE IF NOT EXISTS `stops` (
`stop_id` varchar(20) NOT NULL,
`stop_code` varchar(50) DEFAULT NULL,
`stop_name` varchar(255) DEFAULT NULL,
`stop_desc` varchar(255) DEFAULT NULL,
`stop_lat` decimal(18,15) DEFAULT NULL,
`stop_lon` decimal(18,15) DEFAULT NULL,
`zone_id` int(11) DEFAULT NULL,
`stop_url` varchar(255) DEFAULT NULL,
`location_type` int(2) DEFAULT NULL,
`parent_station` int(11) DEFAULT NULL,
`mode` int(11) NOT NULL DEFAULT '0'
) ENGINE=MyISAM DEFAULT CHARSET=latin1;
-- --------------------------------------------------------
--
-- Table structure for table `stop_times`
--
CREATE TABLE IF NOT EXISTS `stop_times` (
`trip_id` varchar(35) NOT NULL,
`arrival_time` time DEFAULT NULL,
`arrival_time_seconds` int(11) DEFAULT NULL,
`departure_time` time DEFAULT NULL,
`departure_time_seconds` int(11) DEFAULT NULL,
`stop_id` int(11) DEFAULT NULL,
`stop_sequence` int(11) DEFAULT NULL,
`stop_headsign` varchar(50) DEFAULT NULL,
`pickup_type` int(2) DEFAULT NULL,
`drop_off_type` int(2) DEFAULT NULL,
`shape_dist_traveled` varchar(50) DEFAULT NULL,
`mode` int(11) NOT NULL DEFAULT '0'
) ENGINE=MyISAM DEFAULT CHARSET=latin1;
-- --------------------------------------------------------
--
-- Table structure for table `trips`
--
CREATE TABLE IF NOT EXISTS `trips` (
`route_id` varchar(15) DEFAULT NULL,
`service_id` varchar(15) DEFAULT NULL,
`trip_id` varchar(30) NOT NULL,
`trip_headsign` varchar(255) DEFAULT NULL,
`trip_short_name` varchar(255) DEFAULT NULL,
`direction_id` tinyint(1) DEFAULT NULL,
`block_id` int(11) DEFAULT NULL,
`shape_id` varchar(20) DEFAULT NULL,
`mode` int(11) NOT NULL DEFAULT '0'
) ENGINE=MyISAM DEFAULT CHARSET=latin1;
--
-- Indexes for dumped tables
--
--
-- Indexes for table `agency`
--
ALTER TABLE `agency`
ADD PRIMARY KEY (`agency_id`);
--
-- Indexes for table `calendar`
--
ALTER TABLE `calendar`
ADD PRIMARY KEY (`service_id`);
--
-- Indexes for table `calendar_dates`
--
ALTER TABLE `calendar_dates`
ADD PRIMARY KEY (`service_id`,`date`), ADD KEY `service_id` (`service_id`), ADD KEY `date_timestamp` (`date_timestamp`), ADD KEY `exception_type` (`exception_type`);
--
-- Indexes for table `routes`
--
ALTER TABLE `routes`
ADD PRIMARY KEY (`route_id`), ADD KEY `agency_id` (`agency_id`), ADD KEY `route_type` (`route_type`);
--
-- Indexes for table `stops`
--
ALTER TABLE `stops`
ADD PRIMARY KEY (`stop_id`), ADD KEY `zone_id` (`zone_id`), ADD KEY `stop_lat` (`stop_lat`), ADD KEY `stop_lon` (`stop_lon`), ADD KEY `location_type` (`location_type`), ADD KEY `parent_station` (`parent_station`);
--
-- Indexes for table `stop_times`
--
ALTER TABLE `stop_times`
ADD KEY `trip_id` (`trip_id`), ADD KEY `arrival_time_seconds` (`arrival_time_seconds`), ADD KEY `departure_time_seconds` (`departure_time_seconds`), ADD KEY `stop_id` (`stop_id`), ADD KEY `stop_sequence` (`stop_sequence`), ADD KEY `pickup_type` (`pickup_type`), ADD KEY `drop_off_type` (`drop_off_type`);
--
-- Indexes for table `trips`
--
ALTER TABLE `trips`
ADD PRIMARY KEY (`trip_id`), ADD KEY `route_id` (`route_id`), ADD KEY `service_id` (`service_id`), ADD KEY `direction_id` (`direction_id`), ADD KEY `block_id` (`block_id`), ADD KEY `shape_id` (`shape_id`);
I went a little bit key-happy; feel free to omit these in the name of space, albeit at the cost of query speed.
Code (which I assume you’ll install in /home/joel/gtfs/ – adjust accordingly if not)
#!/bin/bash
FOLDER=/tmp/gtfs
if [ ! -d "$FOLDER" ]; then
mkdir /tmp/gtfs
fi
rm -rf "$FOLDER"/*
curl http://data.ptv.vic.gov.au/downloads/gtfs.zip > "$FOLDER"/gtfs.zip
unzip -d "$FOLDER"/ "$FOLDER"/gtfs.zip
rm "$FOLDER"/gtfs.zip
python /home/joel/gtfs/gtfs-sql/truncate.py
for mode in "$FOLDER"/2 "$FOLDER"/3 "$FOLDER"/4
do
echo "Processing $file"
unzip -d "$mode"/ "$mode"/google_transit.zip
for file2 in $mode/*.txt
do
python /home/joel/gtfs/gtfs-sql/bom.py "$file2"
done
for file in "$mode"/*.txt
do
COLS=$(head -n 1 "$file"|tr -d '\r\n')
mysqlimport --verbose --local --columns="$COLS" --fields-terminated-by=',' --fields-optionally-enclosed-by='\"' --lines-terminated-by='\r\n' --ignore-lines=1 --use-threads=5 gtfs "$file"
t=${file%.txt}
mysql gtfs --batch --silent -e "update `basename ${t}` set mode = `basename ${mode}` where mode = 0"
done
done
echo 'Done'
You will also need /home/joel/gtfs/gtfs-sql/bom.py:
import os, sys, codecs
BUFSIZE = 4096
BOMLEN = len(codecs.BOM_UTF8)
path = sys.argv[1]
with open(path, "r+b") as fp:
chunk = fp.read(BUFSIZE)
if chunk.startswith(codecs.BOM_UTF8):
i = 0
chunk = chunk[BOMLEN:]
while chunk:
fp.seek(i)
fp.write(chunk)
i += len(chunk)
fp.seek(BOMLEN, os.SEEK_CUR)
chunk = fp.read(BUFSIZE)
fp.seek(-BOMLEN, os.SEEK_CUR)
fp.truncate()
This is required because PTV specifically breaks the recommendation of the W3C and a bunch of other organisations and puts a Byte Order Mark (BOM) right where Unix utilities will choke on it.
Anyway, this will get you the Melbourne Metro PTV data, at the cost of around 1.5GB.