#!/usr/bin/perl -w

# settings are located in $HOME/.corpus

use strict;
use POSIX qw(nice);
use constant MONTH => 60*60*24*30;

nice(15);

my $configuration = "$ENV{HOME}/.corpus";
my %opt;
my $revision = "unknown";
my %revision;
my @files;
my $skip = '';
my $time_start = time;

&configure;
&version;
&init;
&update;
&locate;
&rename;
&current;
&clean_up;

sub version {
    my $line;
    if (open(TESTING, "$opt{tree}/rules/70_testing.cf")) {
	chomp($line = <TESTING>);
	if ($line =~ m/^#.*Rev(?:ision)?:\s*(\S+).*/) {
	    $revision = $1;
	}
	close(TESTING);
    }
}

sub configure {
    # does rough equivalent of source
    open(C, $configuration) || die "open failed: $configuration: $!\n";
    while(<C>) {
	chomp;
	s/#.*//;
	if (/^\s*(.*?)\s*=\s*(.*?)\s*$/) {
	    $opt{$1} = $2;
	}
    }
    close(C);
}

sub clean_up {
    system "rm -f $opt{tmp}/*.$$";
}

sub init {
    $SIG{INT} = \&clean_up;
    $SIG{TERM} = \&clean_up;

    $ENV{RSYNC_PASSWORD} = $opt{password};
    $ENV{TIME} = '%e,%U,%S';
    $ENV{TZ} = 'UTC';
}

sub update {
    chdir $opt{corpus};

    # allow non-running of rsync under some circumstances
    if ($opt{rsync_command}) {
      system $opt{rsync_command};
    } else {
      system "rsync -CPcvuzt --timeout=300 $opt{username}" . '@rsync.spamassassin.org::corpus/*.log .';
    }

    if (-f "rsync.last") {
	open(FIND, "find . -type f -newer rsync.last |");
	my $files = "";
	while(<FIND>) {
	    $files .= $_;
	}
	close(FIND);
	if (! $files) {
	    print STDERR "no new corpus files\n";
	    if (!$opt{always_update_html} && rand(24) > 1) {
		exit 0;
	    }
	    else {
		print STDERR "updating anyway\n";
	    }
	}
    }
    open(RSYNC, "> rsync.last");
    close(RSYNC);
    system "chmod +r *.log";
}

sub locate {
    chdir "$opt{tree}/masses";
    opendir(CORPUS, $opt{corpus});
    @files = sort readdir(CORPUS);
    closedir(CORPUS);

    @files = grep { /^(?:spam|ham)-(?:net-)?\w+\.log$/ && -f "$opt{corpus}/$_" && -M _ < 10 } @files;
    @files = grep {
	my $time = 0;
	my $tag = 0;
	$revision{$_} = "unknown";
	open(FILE, "$opt{corpus}/$_");
	while (my $line = <FILE>) {
	    last if $line !~ /^#/;
	    $time++ if $line =~ /\b(?!08)\d\d:\d\d:\d\d\b/;
	    $revision{$_} = $1 if $line =~ m/(?:CVS|SVN) revision:\s*(\S+)/;
	}
	close(FILE);
	if (!$time) {
	    $skip .= "# skipped $_: time is between 0800 UTC and 0900 UTC\n";
	}
	$time;
    } @files;
}

sub rename {
    use File::Copy qw(move);

    my $hour = (gmtime($time_start))[2];
    if ($hour == 9) {
	chdir $opt{html};
	opendir(HTML, $opt{html});
	my @html = readdir(HTML);
	closedir(HTML);
	@html = grep { -f } @html;
	for (@html) {
	    move($_, "last/$_");
	}
    }
}

sub sort_all {
    my ($a1, $a2) = ($a =~ m/(\(.*?\)|\S+)(?::(\S+))?$/);
    my ($b1, $b2) = ($b =~ m/(\(.*?\)|\S+)(?::(\S+))?$/);

    my $n = ($a1 cmp $b1) || (($a2 || '') cmp ($b2 || ''));
    if ($a1 =~ /^OVERALL/)			{ $n -= 1000; }
    elsif ($a1 =~ /^\(all messages\)/)		{ $n -= 100; }
    elsif ($a1 =~ /^\(all messages as \%\)/)	{ $n -= 10; }
    if ($b1 =~ /^OVERALL/)			{ $n += 1000; }
    elsif ($b1 =~ /^\(all messages\)/)		{ $n += 100; }
    elsif ($b1 =~ /^\(all messages as \%\)/)	{ $n += 10; }
    return $n;
}

sub time_filter {
    my ($after, $before) = @_;
    if (/time=(\d+)/) {
	return (($time_start - $1 >= MONTH * $after) &&
		($time_start - $1 < MONTH * $before));
    }
    return 0;
}

sub current {
    my $classes = $opt{output_classes};
    $classes ||= "DETAILS.new DETAILS.all DETAILS.age HTML.new HTML.all HTML.age NET.new NET.all NET.age";

    foreach my $entry (split(' ', $classes)) {
        $entry =~ /^(\S+)\.(\S+)$/;
        my $class = $1;
        my $age = $2;
        if (!$age) { warn "no age in $entry"; next; }
        gen_class ($class, $age);
    }
}

sub gen_class {
    my ($class, $age) = @_;

    print STDERR "generating $class.$age\n";

    next if ($class eq "NET" && $age !~ /^(?:new|all|age|7day)$/);

    my @ham = grep { /^ham/ } @files;
    my @spam = grep { /^spam/ } @files;

    print STDERR "ham: " . join(' ', @ham) . "\n";
    print STDERR "spam: " . join(' ', @spam) . "\n";

    chdir $opt{corpus};

    # net vs. local
    if ($class eq "NET") {
        @ham = grep { /-net-/ } @ham;
        @spam = grep { /-net-/ } @spam;
        print STDERR "ham: " . join(' ', @ham) . "\n";
        print STDERR "spam: " . join(' ', @spam) . "\n";
    }
    else {
        # if both net and local exist, use newer
        my %spam;
        my %ham;
        
        for my $file (@spam) {
            $spam{$1}++ if ($file =~ m/-(\w+)\.log$/);
        }
        for my $file (@ham) {
            $ham{$1}++ if ($file =~ m/-(\w+)\.log$/);
        }
        while (my ($user, $count) = each %ham) {
            if ($count > 1) {
                my $nightly = "ham-$user.log";
                my $weekly = "ham-net-$user.log";
                if ($revision{$nightly} >= $revision{$weekly}) {
                    @ham = grep { $_ ne $weekly } @ham;
                }
                else {
                    @ham = grep { $_ ne $nightly } @ham;
                }
            }
        }
        while (my ($user, $count) = each %spam) {
            if ($count > 1) {
                my $nightly = "spam-$user.log";
                my $weekly = "spam-net-$user.log";
                if ($revision{$nightly} >= $revision{$weekly}) {
                    @spam = grep { $_ ne $weekly } @spam;
                }
                else {
                    @spam = grep { $_ ne $nightly } @spam;
                }
            }
        }
        print STDERR "ham: " . join(' ', @ham) . "\n";
        print STDERR "spam: " . join(' ', @spam) . "\n";
    }
    
    # age
    if ($class eq "NET" && $age ne "7day") {
        @ham = grep { -M "$_" < 10 } @ham;
        @spam = grep { -M "$_" < 10 } @spam;
        # find most recent CVS revision
        my $wanted = 0.0;
        for (@spam, @ham) {
            $wanted = $revision{$_} if ($revision{$_} > $wanted);
        }
        @spam = grep { $revision{$_} eq $wanted } @spam;
        @ham = grep { $revision{$_} eq $wanted } @ham;
        print STDERR "ham: " . join(' ', @ham) . "\n";
        print STDERR "spam: " . join(' ', @spam) . "\n";
    }
    elsif ($age =~ /^(?:new|all|age)$/) {
        @ham = grep { -M "$_" < -M $opt{tagtime} } @ham;
        @spam = grep { -M "$_" < -M $opt{tagtime} } @spam;
        if (!$opt{ignore_revisions}) {
            @ham = grep { $revision{$_} eq $revision } @ham;
            @spam = grep { $revision{$_} eq $revision } @spam;
        }
        print STDERR "ham: " . join(' ', @ham) . "\n";
        print STDERR "spam: " . join(' ', @spam) . "\n";
    }
    elsif ($age =~ /(\d+)day/) {
        my $mtime = $1;
        @ham = grep { -M "$_" < $mtime } @ham;
        @spam = grep { -M "$_" < $mtime } @spam;
        print STDERR "ham: " . join(' ', @ham) . "\n";
        print STDERR "spam: " . join(' ', @spam) . "\n";
    }
    
    open(OUT, "> $opt{html}/$class.$age");
    print OUT "# ham results used: " . join(" ", @ham) . "\n";
    print OUT "# spam results used: " . join(" ", @spam) . "\n";
    for (@ham) {
        print OUT "# $_=$revision{$_}\n" if $revision{$_} ne $revision;
    }
    for (@spam) {
        print OUT "# $_=$revision{$_}\n" if $revision{$_} ne $revision;
    }

    my $flags = "";
    $flags = "-t net -s 1" if $class eq "NET";
    $flags = "-M HTML_MESSAGE" if $class eq "HTML";
    $flags = "-o" if $class eq "OVERLAP";

    if ($age eq "all") {
        my %spam;
        my %ham;
        my @output;
        
        for my $file (@spam) {
            $spam{$1} = $file if ($file =~ m/-(\w+)\.log$/);
        }
        for my $file (@ham) {
            $ham{$1} = $file if ($file =~ m/-(\w+)\.log$/);
        }
        unlink "$opt{tmp}/ham.log.$$";
        unlink "$opt{tmp}/spam.log.$$";
        next unless (scalar keys %spam && scalar keys %ham);
        for my $user (sort keys %spam) {
            next unless defined $ham{$user};
            chdir "$opt{tree}/masses";
            system("cat $opt{corpus}/$ham{$user} >> $opt{tmp}/ham.log.$$");
            system("cat $opt{corpus}/$spam{$user} >> $opt{tmp}/spam.log.$$");
            open(IN, "./hit-frequencies -xpa $flags $opt{corpus}/$spam{$user} $opt{corpus}/$ham{$user} |");
            while(<IN>) {
                chomp;
                push @output, "$_:$user\n";
            }
            close(IN);
        }
        open(IN, "./hit-frequencies -xpa $flags $opt{tmp}/spam.log.$$ $opt{tmp}/ham.log.$$ |");
        while(<IN>) {
            push @output, $_;
        }
        close(IN);
        for (sort sort_all @output) {
            print OUT $_;
        }
    }
    elsif ($age eq "age") {
        my @output;

        for my $which (("0-1", "1-3", "3-6")) {
            my ($after, $before) = split(/-/, $which);
            # get and filter logs
            chdir $opt{corpus};
            for my $type (("ham", "spam")) {
                open(TMP, "> $opt{tmp}/$type.log.$$");
                my @array = ($type eq "ham") ? @ham : @spam;
                for my $file (@array) {
                    open(IN, $file);
                    while (<IN>) {
                        print TMP $_ if time_filter($after, $before);
                    }
                    close(IN);
                }
                close (TMP);
            }
            # print out by age
            chdir "$opt{tree}/masses";
            open(IN, "./hit-frequencies -xpa $flags $opt{tmp}/spam.log.$$ $opt{tmp}/ham.log.$$ |");
            while(<IN>) {
                chomp;
                push @output, "$_:$which\n";
            }
            close(IN);
        }
        for (sort sort_all @output) {
            print OUT $_;
        }
    }
    elsif (@ham && @spam) {
        # get logs
        system("cat " . join(" ", @ham) . " > $opt{tmp}/ham.log.$$");
        system("cat " . join(" ", @spam) . " > $opt{tmp}/spam.log.$$");

        chdir "$opt{tree}/masses";
        open(IN, "./hit-frequencies -xpa $flags $opt{tmp}/spam.log.$$ $opt{tmp}/ham.log.$$ |");
        while(<IN>) {
            print(OUT);
        }
        close(IN);
    }
    close(OUT);
}
