#!/usr/bin/perl

use strict;
use warnings;

use WWW::Mechanize;
use Getopt::Long;
use Pod::Usage;
use OpenGuides::RDF::Reader;
use XML::RSS;
use Data::Dumper;
use OpenGuides;
use OpenGuides::Config;

our $VERSION = 0.03;

# Command-line state; each variable is filled in by GetOptions below.
my ($site, $config_file);
my $days         = 0;
my $scrape       = 0;
my $help         = 0;
my $list_version = 0;

# An unknown or malformed option produces the brief usage message.
GetOptions(
    'site=s'   => \$site,
    'config=s' => \$config_file,
    'days=i'   => \$days,
    'scrape!'  => \$scrape,
    'help+'    => \$help,
    'version!' => \$list_version,
) or pod2usage( -verbose => 0 );

# --version: report the script version and stop.
if ($list_version) {
    print STDERR "$0 version $VERSION\n\n";
    exit 0;
}

# Show usage when asked to (each extra --help raises the verbosity), or when
# either of the mandatory options --site / --config is missing.
if ( $help or not ( $site and $config_file ) ) {
    pod2usage( -verbose => $help );
}

=head1 NAME

og_mirror - Replicate an OpenGuides site

=head1 SYNOPSIS

  og_mirror --site http://from.site.url/ --config /path/to/wiki.conf [--days 1] [--scrape]

=head1 DESCRIPTION

This is a script to mirror the contents from another OpenGuides website.
It can be run from a cron job to update periodically.

To load the wiki initially, run the script without the --days option. After
that, the script can be run periodically with the --days option to keep the
local site in step with the remote one.

=head1 OPTIONS

=over 4

=item C<--site>

Specify the guide website to mirror from.

=item C<--config | -c>

Path to the config file for the wiki on the localhost.

=item C<--days | -d>

Number of days back to look at in the RSS feed. Omit this option to work
in "hoover" mode, which fetches every page listed in the remote site's index.

=item C<--scrape>

If the remote OpenGuides site runs a version prior to 0.51, it will not
support format=raw. Specify the option --scrape to extract the page text
from the HTML edit form (action=edit) instead.

=item C<--help | -h>

Show this list of options.

=item C<--help --help | -h -h>

Display man page.

=item C<--version>

Show the mirror script's version number.

=back

=head1 HISTORY

   0.01 18-Oct-2005 Initial version

   0.02 19-Oct-2005 Exclude updates for pages that haven't changed

   0.03 29-Oct-2005 Add HTML scraping option for guides that don't have format=raw
                    Check source URL against database for mirroring multiple guides 

=head1 BUGS

Please report any bugs in this package using http://rt.cpan.org/ or posting to
bugs-openguides-rdf-reader (at) rt.cpan.org.

=head1 SUPPORT

For discussion of all matters relating to OpenGuides, there is a mailing list
http://openguides.org/mm/listinfo/openguides-dev.

=head1 AUTHOR

	Ivor Williams
	CPAN ID: IVORW
	 
	ivorw-openguides (at) xemaps.com
	http://openguides.org/

=head1 COPYRIGHT

This program is free software licensed under the...

	The General Public License (GPL)
	Version 2, June 1991

The full text of the license can be found in the
LICENSE file included with this module.

=cut

# HTTP client used for every request to the remote guide.
my $agent = WWW::Mechanize->new();

# Handle on the local wiki that receives the mirrored pages.
my $config = OpenGuides::Config->new( file => $config_file );
my $guide = OpenGuides->new( config => $config );
my $wiki = $guide->wiki;

# With --days only the pages in the recent-changes feed are processed;
# otherwise every page in the remote index is.
my @pagelist = $days ? get_recent_changes($agent, $site, $days) :
	get_all_pages($agent, $site);

# Unbuffered output, so per-page progress appears as it happens.
$|=1;

for my $page (@pagelist) {
    chomp $page;
    print $page,":";

    my %meta = eval { get_page_metadata($agent, $site, $page) };
    (print "Failed to parse metadata\n$@\n"),next if $@;

#    print Dumper \%meta;

    # The content request can die as well (WWW::Mechanize raises an exception
    # on HTTP errors when autocheck is enabled); one bad page must not abort
    # the whole mirror run.
    my $text = eval {
        $scrape ? scrape_page_content($agent, $site, $page)
                : get_page_content($agent, $site, $page);
    };
    (print "Failed to fetch content\n$@\n"),next if $@;

    # Never hand an error body (failed request with autocheck off) or a
    # missing scrape result to the local wiki as page text.
    (print "Failed to fetch content\n"),next
        unless defined $text && $agent->success;

#    print $text;

    populate_local_wiki($wiki, $page, $text, \%meta);
}

# Fetch the list of page names from the remote guide's index, requested
# with action=index;format=plain.
#
#   $ua  - user agent object providing get(), success() and content()
#   $url - base URL of the remote guide
#
# Returns the non-empty lines of the response, one page name per element.
# Dies if the index request was not successful.
sub get_all_pages {
    my ($ua, $url) = @_;

    $ua->get("$url?action=index;format=plain");

    # If the agent does not itself die on HTTP errors, the error body would
    # otherwise be split into bogus page names.
    die "Could not fetch page index from $url\n"
        unless $ua->success;

    # Accept CRLF as well as LF line endings and ignore blank lines, so that
    # no page name carries a stray "\r" or is empty.
    return grep { length } split /\r?\n/, $ua->content;
}

# Fetch the remote guide's RSS feed for the last $days days and return the
# item titles in the reverse of the order the feed lists them.
#
#   $ua   - user agent object providing get() and content()
#   $url  - base URL of the remote guide
#   $days - value passed as the feed's "days" parameter
sub get_recent_changes {
    my ($ua, $url, $days) = @_;

    my $feed_url = "$url?action=rss;days=$days";
    $ua->get($feed_url);

    my $feed = XML::RSS->new;
    $feed->parse($ua->content);

    my @titles = map { $_->{title} } @{ $feed->{items} };

    return reverse @titles;
}

# Request one page of the remote guide with format=rdf and pass the response
# body to parse_rdf(), returning whatever that call returns.
#
#   $ua   - user agent object providing get() and content()
#   $url  - base URL of the remote guide
#   $page - page identifier, used as the "id" parameter
sub get_page_metadata {
    my ($ua, $url, $page) = @_;

    my $rdf_url = "$url?id=$page;format=rdf";
    $ua->get($rdf_url);

    # The body is taken in scalar context before being handed on.
    return parse_rdf(scalar $ua->content);
}

# Request one page of the remote guide with format=raw and return the
# response body.
#
#   $ua   - user agent object providing get() and content()
#   $url  - base URL of the remote guide
#   $page - page identifier, used as the "id" parameter
sub get_page_content {
    my ($ua, $url, $page) = @_;

    my $raw_url = "$url?id=$page;format=raw";
    $ua->get($raw_url);

    return $ua->content;
}

# Request the edit view (action=edit) of one page of the remote guide and
# return the value of the first form input named "content".
#
#   $ua   - user agent object providing get() and forms()
#   $url  - base URL of the remote guide
#   $page - page identifier, used as the "id" parameter
#
# Returns undef when no form on the page has such an input.
sub scrape_page_content {
    my ($ua, $url, $page) = @_;

    my $edit_url = "$url?id=$page;action=edit";
    $ua->get($edit_url);

    foreach my $form ($ua->forms) {
        my $field = $form->find_input('content') or next;
        return $field->value;
    }

    # Explicit undef (rather than a bare return) keeps the result a single
    # value in every calling context.
    return undef;
}

# Create or update one page in the local wiki from data fetched remotely,
# printing a one-line status for the page.
#
#   $wiki     - local wiki object providing formatter(), retrieve_node()
#               and write_node()
#   $page     - page identifier as used in URLs; converted to a node name
#               with the formatter's node_param_to_node_name()
#   $content  - page text to store; undef means the fetch produced nothing
#   $metadata - hash reference of remote metadata; the "source" and
#               "version" entries are compared with the stored values
#
# An existing local page is left alone when its stored source differs from
# the remote source ("Skipping"), or when the stored and remote versions are
# numerically equal ("Unchanged").
#
# Returns true only when the page was written.
sub populate_local_wiki {
    my ($wiki, $page, $content, $metadata) = @_;

    my $node = $wiki->formatter->node_param_to_node_name( $page );
    my %old_data = $wiki->retrieve_node($node);

    if ($old_data{version}) {
        # Either side may lack these entries; normalise them so the
        # comparisons give the same answers as before without emitting
        # "uninitialized value" warnings.
        my $from       = $old_data{metadata}{source}[0];
        my $new_source = $metadata->{source};
        $from       = '' unless defined $from;
        $new_source = '' unless defined $new_source;

        if ($from ne $new_source) {
            print "Skipping as source URL is $from\n";
            return;
        }

        my $old_version = $old_data{metadata}{version}[0];
        my $new_version = $metadata->{version};
        $old_version = 0 unless defined $old_version;
        $new_version = 0 unless defined $new_version;

        if ($old_version == $new_version) {
            print "Unchanged\n";
            return;
        }

        print "Updating... ";
    }
    else {
        print "Creating... ";
    }

    # Without any text there is nothing to mirror; writing it would replace
    # the stored page with an empty one.
    unless (defined $content) {
        print "Failed\n";
        return;
    }

    my $written = $wiki->write_node( $node, $content, $old_data{checksum}, $metadata);

    print $written ? "Done\n" : "Failed\n";

#    print Dumper \%old_data;

    return $written ? 1 : ();
}
