#!/usr/bin/perl -w
#
# redzip - A script for parsing (loading) or serialising an archive of RDF
#          graphs to/from a persistent Redland model with contexts. 
#
# Usage: ./redzip ( parse [ --keep ] [ --test ] [ --reset ] [ --list ]
#                 | serialise [ --graph <uri> ]
#                   [ --output (rdfxml|turtle|ntriples) ] )
#                 [ --verbose ] <redland-uri> <archive>
#
# Example: ./redzip parse mysql://user:password@mysql/db/model file.zip
#
# URL: http://www.wasab.dk/morten/blog/archives/2008/01/08/named-graph-exchange
#
# Revision: $ bzr-revision-id $
#
# Changelog
# ---------
# Version 1.4 - 2007.01.14 (morten@mfd-consult.dk):
# - Added generatorAgent, added --list.
# Version 1.3 - 2007.01.09 (morten@mfd-consult.dk):
# - Added --reset option.
# Version 1.2 - 2007.01.09 (morten@mfd-consult.dk):
# - Documented --verbose option.
# Version 1.1 - 2007.01.08 (morten@mfd-consult.dk):
# - Added --test option, fixed graph labels.
# Version 1.0 - 2007.01.06 (morten@mfd-consult.dk):
# - First release based on redzip-parse and redzip-serialise.
#
# Copyright 2005-2007, Morten Frederiksen, mfd-consult.dk
# Licensed under the Eiffel Forum License, version 2.
#

use strict;
use Getopt::Long;
use Pod::Usage;
use URI;
use RDF::Redland;
use Archive::Zip qw( :ERROR_CODES :CONSTANTS );

# Command line variables.
my $operation='';
my $redland='';
my $archive='';
my $keep=-1;
my $test=-1;
my $reset=-1;
my $list=-1;
my $graph='';
my $output='ntriples';
my $verbose=-1;
my $help=0;
my %formats=('ttl'=>'turtle', 'rdf'=>'rdfxml', 'nt'=>'ntriples');
my %extensions=('turtle'=>'ttl', 'rdfxml'=>'rdf', 'ntriples'=>'nt');

# Parse command line.
GetOptions(
		'keep:i'=>\$keep,
		'test:i'=>\$test,
		'reset:i'=>\$reset,
		'list:i'=>\$list,
		'graph=s'=>\$graph,
		'output=s'=>\$output,
		'verbose:i'=>\$verbose,
		'help|h'=>\$help)
		|| pod2usage(2);
pod2usage(-exitval=>1,-verbose=>2) if ($help || @ARGV<3);
$operation=shift(@ARGV);
$redland=shift(@ARGV);
$archive=shift(@ARGV);

# Validate/fix parameters.
pod2usage(-exitval=>1,-verbose=>2) if ('parse' ne $operation && 'serialise' ne $operation || '' eq $archive);
$keep++ if ($keep<=0);
$verbose++ if ($verbose<=0);
$test++ if ($test<=0);
$verbose=$test if ($test);
$list++ if ($list<=0);
$test=$list if ($list);
$reset++ if ($reset<=0);
$keep=$reset if ($reset);
print STDERR "$0: initialising\n" if ($verbose);

# Parse Redland URI: <type>://[<user>[:<password>]@]<host>/<database>/<model>
my %redland=('user'=>'default','password'=>'default','database'=>'redland','model'=>'redland');
$redland=URI->new($redland);
$redland{'type'}=$redland->scheme;
$redland->scheme('http');
$redland{'hash-type'}=$redland{'host'}=$redland->host;
($redland{'user'},$redland{'password'})=split(':',$redland->userinfo) if ($redland->userinfo);
($redland{'dummy'},$redland{'database'},$redland{'model'})=$redland->path_segments if ($redland->path_segments);
$redland{'dir'}=$redland{'database'};

# Create Redland objects...
my $storage;
my $model;
if (!$list) {
	$storage=new RDF::Redland::Storage($redland{'type'},
			$redland{'model'},"host='".$redland{'host'}."',database='".
			$redland{'database'}."',user='".$redland{'user'}."',password='".
			$redland{'password'}."',contexts='yes',hash-type='".
			$redland{'hash-type'}."',dir='".$redland{'dir'}."'".
			($reset?",new='yes'":""))
			or die ($0.': unable to create RDF::Redland::Storage');
	$model=new RDF::Redland::Model($storage,'')
			or die ($0.': unable to create RDF::Redland::Model');
}
my $ms=new RDF::Redland::Storage('hashes', 'dummy',
		"new='yes',hash-type='memory'")
		or die $0.': unable to create temporary RDF::Redland::Storage';
my $mm=new RDF::Redland::Model($ms,'')
		or die $0.': unable to create temporary RDF::Redland::Model';
my @tempfiles=();

redzip_parse() if ('parse' eq $operation);
redzip_serialise() if ('serialise' eq $operation);

# Clean up...
print STDERR "$0: cleaning up\n" if ($verbose);
foreach my $tfn (@tempfiles) { unlink($tfn); }
$mm=undef;
$ms=undef;
$model=undef;
$storage=undef;
print STDERR $0.': done.'."\n" if ($verbose);

# Parse...
sub redzip_parse {

# Open archive.
(-r $archive)
		or die ($0.': archive does not exist ('.$archive.')');
my $zip=Archive::Zip->new();
$zip->read($archive) == AZ_OK
		or die ($0.': unable to read archive ('.$archive.')');

# Locate and extract manifest.
print STDERR "$0: locating manifest\n" if ($verbose);
my @manifests=$zip->membersMatching('^META-INF/rdf-manifest\.('.join('|',keys %formats).')$');
(@manifests==1)
		or die ($0.': rdf-manifest missing from archive');
my $manifest=shift(@manifests);
my $syntax=$manifest->fileName;
$syntax=~s/^[^\.]+\.(.+)$/$1/;
$formats{$syntax}
		or die ($0.': unknown extension for rdf-manifest, '.$syntax.' (expecting one of '.join('/',keys %formats).')');
my ($tfh, $tfn)=Archive::Zip::tempFile();
$zip->extractMemberWithoutPaths($manifest, $tfn) == AZ_OK
		or die ($0.': unable to extract rdf-manifest from archive');
push(@tempfiles, $tfn);

# Create parser.
my $parser=new RDF::Redland::Parser($formats{$syntax})
		or die ($0.': unable to create RDF::Redland::Parser');

# Parse map of names into temporary model.
$manifest=new RDF::Redland::Node->new_from_uri('file:/'.$manifest->fileName);
print STDERR "$0: parsing manifest ".$manifest->as_string."\n" if ($verbose);
$parser->parse_into_model(new RDF::Redland::URI('file:'.$tfn), new RDF::Redland::URI('file:/'), $mm);
print STDERR "$0: manifest: ".$mm->size." statements\n" if ($verbose);

# List?
if ($list) {
	my $cs=$mm->find_statements(new RDF::Redland::Statement(
		new RDF::Redland::Node($manifest),
		new RDF::Redland::URI('http://www.w3.org/2000/01/rdf-schema#comment'),
		undef));
	print STDERR "$0: content: ".$cs->current->object->as_string."\n" if ($cs && !$cs->end);
	my $gs=$mm->find_statements(new RDF::Redland::Statement(
		new RDF::Redland::Node($manifest),
		new RDF::Redland::URI('http://webns.net/mvcb/generatorAgent'),
		undef));
	print STDERR "$0: generator: ".$gs->current->object->as_string."\n" if ($gs && !$gs->end);
}

# Iterate over graphs referenced from manifest doc with rdfs:seeAlso.
my $stream=$mm->find_statements(new RDF::Redland::Statement(
		new RDF::Redland::Node($manifest),
		new RDF::Redland::URI('http://www.w3.org/2000/01/rdf-schema#seeAlso'),
		undef));
unless ($stream && !$stream->end) {
		die $0.': unable to locate graphs in manifest graph'; };
while (!$stream->end) {
	my $graph=$stream->current->object;
	$stream->next;
	# Find label for graph.
	my $ls=$mm->find_statements(new RDF::Redland::Statement(
		new RDF::Redland::Node($graph),
		new RDF::Redland::URI('http://www.w3.org/2000/01/rdf-schema#label'),
		undef));
	if (!$ls || $ls->end) {
		print STDERR "$0: label not found for ".$graph->as_string.", skipping\n" if ($verbose);
		next;
	}
	my $label=$ls->current->object;
	if ($list) {
		my $cs=$mm->find_statements(new RDF::Redland::Statement(
			new RDF::Redland::Node($graph),
			new RDF::Redland::URI('http://www.w3.org/2000/01/rdf-schema#comment'),
			undef));
		print STDERR "$0: ".$label->as_string.": ".$cs->current->object->as_string."\n" if ($cs && !$cs->end);
		print STDERR "$0: ".$label->as_string."\n" if (!$cs || $cs->end);
		next;
	}
	print STDERR "$0: parsing and loading graph ".$label->as_string."\n" if ($verbose);
	# Extract graph to temporary file.
	my ($tfh, $tfn)=Archive::Zip::tempFile();
	my $member=$graph->uri->as_string;
	$member=~s/file:\/*//;
	$zip->extractMemberWithoutPaths($member, $tfn) == AZ_OK
			or die ($0.': unable to extract graph ('.$graph->as_string.') from archive');
	push(@tempfiles, $tfn);
	# Truncate context?
	$model->remove_context_statements($label) if (!$keep && !$test);
	# Create parser.
	my $parser=new RDF::Redland::Parser($formats{$syntax})
			or die ($0.': unable to create RDF::Redland::Parser');
	# Parse graph into context.
	if (!$test) {
	  my $cs=$parser->parse_as_stream(new RDF::Redland::URI('file:'.$tfn));
	  !$model->add_statements($cs, $label)
			  or die ($0.': unable to add graph to persistent storage');
	}
}
$stream=undef;

$parser=undef;
}

# Serialise...
sub redzip_serialise {

# Create serializer.	
my $serialiser=new RDF::Redland::Serializer($output)
		or die ($0.': unable to create RDF::Redland::Serializer');

# Build list of graphs to serialise.
my @graphs;
@graphs=new RDF::Redland::Node->new_from_uri($graph) if ($graph);
@graphs=$model->contexts if (!$graph);

# Create archive.
my $zip=Archive::Zip->new();

# Create temporary model to hold map of names.
my $ms=new RDF::Redland::Storage('hashes', 'dummy',
		"new='yes',hash-type='memory'")
		or die $0.': unable to create temporary RDF::Redland::Storage';
my $mm=new RDF::Redland::Model($ms,'')
		or die $0.': unable to create temporary RDF::Redland::Model';

# Iterate through graphs, building map in model on the way.
my $triplecount=0;
my $filenum=0;
foreach my $graph (sort { $a->as_string cmp $b ->as_string } @graphs) {
	print STDERR "$0: serialising graph ".$graph->as_string."\n" if ($verbose);
	# Create temporary storage to hold statements before serialisation.
	my $cs=new RDF::Redland::Storage('hashes', 'dummy',
			"new='yes',hash-type='memory'")
			or die $0.': unable to create temporary RDF::Redland::Storage';
	my $cm=new RDF::Redland::Model($cs,'')
			or die $0.': unable to create temporary RDF::Redland::Model';
	# Add statements from graph to temporary model.
	$cm->add_statements($model->as_stream($graph));
	$triplecount+=$cm->size;
	# Create temporary file and serialise temporary model.
	my ($tfh, $tfn)=Archive::Zip::tempFile();
	!$serialiser->serialize_model_to_file($tfn, new RDF::Redland::URI(''), $cm)
			or die $0.': unable to serialise temporary model';
	# Add contents of temporary file to archive.
	my $graphfile='graph'.++$filenum.'.'.$extensions{$output};
	$zip->addFile($tfn, $graphfile);
	# Add graph name and statistics to map.
	$mm->add(new RDF::Redland::URI($graphfile),
			new RDF::Redland::URI('http://www.w3.org/2000/01/rdf-schema#label'),
			$graph);
	$mm->add(new RDF::Redland::URI($graphfile),
			new RDF::Redland::URI('http://www.w3.org/2000/01/rdf-schema#comment'),
			new RDF::Redland::Node($cm->size.' statements'));
	$mm->add(new RDF::Redland::URI('META-INF/rdf-manifest.'.$extensions{$output}),
			new RDF::Redland::URI('http://www.w3.org/2000/01/rdf-schema#seeAlso'),
			new RDF::Redland::URI($graphfile));
	# Clean up after graph.
	push(@tempfiles, $tfn);
	$cm=undef;
	$cs=undef;
}
$filenum or die $0.': no graph(s) found to serialise?';
$mm->add(new RDF::Redland::URI('META-INF/rdf-manifest.'.$extensions{$output}),
		new RDF::Redland::URI('http://www.w3.org/2000/01/rdf-schema#comment'),
		new RDF::Redland::Node($filenum.' graphs, '.$triplecount.' statements'));
$mm->add(new RDF::Redland::URI('META-INF/rdf-manifest.'.$extensions{$output}),
		new RDF::Redland::URI('http://webns.net/mvcb/generatorAgent'),
		new RDF::Redland::URI('http://bzr.mfd-consult.dk/named-graph-exchange/redzip.pl'));

# Add map to archive.
print STDERR "$0: creating map\n" if ($verbose);
my ($tfh, $tfn)=Archive::Zip::tempFile();
!$serialiser->serialize_model_to_file($tfn, new RDF::Redland::URI(''), $mm)
		or die $0.': unable to serialise map model';
$zip->addFile($tfn, 'META-INF/rdf-manifest.'.$extensions{$output});
push(@tempfiles, $tfn);

# "Serialise" archive.
$zip->writeToFileNamed($archive) == AZ_OK
		or die $0.': unable to create archive';

$serialiser=undef;
}


__END__

=head1 NAME

redzip - A script for parsing (loading) or serialising an archive of RDF
graphs to/from a persistent Redland model with contexts. 

=head1 SYNOPSIS

B<redzip>
[ B<--keep> ]
[ B<--reset> ]
[ B<--graph> I<uri> ]
[ B<--output> I<format> ]
[ B<--verbose> [ I<level> ] ]
[ B<--help> ]
( B<parse> | B<serialise> )
I<redland-uri>
I<archive>

=head1 DESCRIPTION

I<redzip> creates or reads a .zip-archive containing a manifest describing
documents with RDF graphs.

When parsing, the persistent Redland storage is updated with the contents of
the archive.

When serialising, the archive is created based on persistent Redland storage
and optionally named graph.

=head1 OPTIONS AND ARGUMENTS

=over 4

=item B<--keep>

Keep statements already existing in each graph. By default, each incoming
graph is truncated before the statements are added.

=item B<--reset>

Reset (truncate) model prior to insertion. Also creates model if not existing
(storage option "new").

Implies B<--keep> for performance reasons.

=item B<--graph> I<uri>

URI of single context to serialise.

=item B<--output> I<format>

Serialization format, I<ntriples>, I<turtle> or I<rdfxml>. Default is
I<ntriples>.

=item B<--verbose> I<level>

Output progress on STDERR.

=item B<--help> | B<--?>

Output manual page with usage instructions on stdout and exit.

=item I<redland-uri>

URI for Redland model:
  <type>://[<user>[:<password>]@]<host/hash-type>/<database/dir>/<model>

Examples:
  mysql://user:password@mysql/db/model
  hashes://bdb/./test

  Note that the model/graph must exist in the database when parsing.

=item I<archive>

Path to archive file.

=back

=head1 BUGS

Temporary files are currently left as is when die'ing during execution.

=head1 AUTHOR

MFD Consult, Morten Frederiksen E<lt>morten@mfd-consult.dkE<gt>

URL: http://www.wasab.dk/morten/blog/archives/2005/01/04/ exchange-of-named-rdf-graphs

=cut
