#!/usr/bin/env perl
#-*-perl-*-
#
#  attempt to combine -u and -p
#  --> implementation not complete
#  --> problem: sentences in TU's are found in different partitions


=head1 NAME

tmx2opus - convert TMX into OPUS XML

=head1 USAGE

  tmx2opus [OPTIONS] [-o outfile] tmxfile

=head1 OPTIONS

  -o outfile ........ name of the output file (default = tmxfile)
  -p size ........... new files after <size> translation units
  -P ................ do NOT change some property names from ParaCrawl TMX
  -r ................ always remove regional codes
  -u ................ store unique sentences only
  -d ................ use DB_File to store sentences when using -u
  -v ................ verbose output

=head1 DESCRIPTION

C<tmx2opus> converts TMX files into OPUS format. It handles translation units with several languages and it also does sentence-splitting based on Lingua::Sentence. Regional codes can be removed from the language attribute. If the C<outfile> has the extension C<.gz> then it will write to compressed files (corpus files and XCES link files)

=cut


use utf8;
use strict;

use IO::File;
use File::Basename qw/dirname basename/;
use XML::Parser;
use XML::Writer;
use Lingua::Sentence;

use File::Temp qw/ tempfile /;
use DB_File;
# use DBM_Filter;

use vars qw($opt_d $opt_o $opt_p $opt_P $opt_r $opt_u $opt_v);
use Getopt::Std;
getopts('do:p:Pruv');


my $outfile = $opt_o || @ARGV[0] || 'text.xml.gz';

if ($opt_u && $opt_p){
    warn "Options -u and -p are not compatible! Value of -p is ignored!\n";
    $opt_p = undef;
}

my $maxsize = $opt_p;          # split into parts of size <p>
my $part    = $maxsize ? 1 : 0;

## partial files
if ($part){
    my $ext  = sprintf('%04d',$part);
    $outfile =~s/(\.xml)(\.gz)?$/.$ext$1$2/
}


my %LangSeg = ();
my %LangDB = ();
my $CurrentLang = 'en';

my %SentID = ();
my %LinkID = ();
my %LinkProp = ();

my %CorpusWriter = ();
my %BitextWriter = ();
my %TextSplitter = ();


my $TMXParser = new XML::Parser(Handlers => {Start => \&XmlTagStart,
					     End => \&XmlTagEnd,
					     Char => \&XmlChar});
my $TMXHandler = $TMXParser->parse_start;


my $count = 0;
binmode(STDIN);
while (<>){

    ## fix un-escaped XML entities
    s/&(?!(#\d+|\w+);)/&amp;/g;

    ## make it possible to pipe more than one TMX file
    ## delete everything before the XML declaration
    if (/<\?xml/){
	s/^.*(<\?xml)/$1/s;
	print STDERR "\nparse TMX file\n" if ($opt_v);
	$TMXParser = new XML::Parser(Handlers => {Start => \&XmlTagStart,
						  End => \&XmlTagEnd,
						  Char => \&XmlChar});
	$TMXHandler = $TMXParser->parse_start;
    }

    eval { $TMXHandler->parse_more($_); };
    if ($@){
	print STDERR $_;
	die $@;
    }
}

&close_corpora();
&close_bitexts();



sub close_corpora{
    foreach my $c (values %CorpusWriter){
	close_corpus($c);
    }
    %CorpusWriter = ();
}

sub close_bitexts{
    foreach my $c (values %BitextWriter){
	close_bitext($c);
    }
    %BitextWriter = ();
}






sub XmlTagStart{
    my ($p,$e,%a)=@_;

    if ($e eq 'tuv'){
	$CurrentLang = $a{'xml:lang'} if (exists $a{'xml:lang'});
	$CurrentLang = $a{'lang'} if (exists $a{'lang'});

	## replace hyphens with underscores
	## lowercase the language IDs
	## remove regional identifier if it is a copy of the lang ID
	$CurrentLang=~s/\-/\_/g;
	$CurrentLang=lc($CurrentLang);   # lower-case
	$CurrentLang=~s/^(.*)\_\1$/$1/g; # remove copies of lang ID
	# $CurrentLang=~s/\_[^a-z]$//g;    # remove non-alphabetic extension

	## always remove regional extension if -r
	$CurrentLang=~s/^(.*)\_.*$/$1/g if ($opt_r);

	$LangSeg{$CurrentLang} = [];
    }
    elsif ($e eq 'seg'){
	$p->{OPEN_SEG} = 1;
	push(@{$LangSeg{$CurrentLang}},'');
    }
    elsif ($e eq 'prop'){
	## skip source document properties from ParaCrawl!
	if ($opt_P && $a{type}=~/^[a-zA-Z\-\_]*$/){
	    $p->{OPEN_PROP} = $a{type};
	}
	# elsif ($a{type}=~/^(score[a-zA-Z\-\_]*|type|info)$/){
	elsif ($a{type}=~/^(score[a-zA-Z\-\_]*|info)$/){
	    $p->{OPEN_PROP} = $a{type};
	}
    }
}

sub XmlTagEnd{
    my ($p,$e)=@_;

    if ($e eq 'tu'){
	## print the translations collected in segments
	print_alignments(\%LangSeg);
	%LangSeg = ();
	%LinkProp = ();
    }
    elsif ($e eq 'seg'){
	$p->{OPEN_SEG} = 0;
    }
    elsif ($e eq 'prop'){
	if ($p->{OPEN_PROP}){
	    if (exists $LinkProp{$p->{OPEN_PROP}}){
		if ($LinkProp{$p->{OPEN_PROP}}!~/\S/){
		    delete $LinkProp{$p->{OPEN_PROP}};
		}
		## fix type to standard in xces align
		elsif ( (! $opt_P) && $p->{OPEN_PROP} eq 'type' ){
		    $LinkProp{$p->{OPEN_PROP}}=~s/\:/\-/;
		}
	    }
	    $p->{OPEN_PROP} = undef;
	}
    }
}

sub XmlChar{
    my ($p,$c)=@_;
    if ($p->{OPEN_SEG}){
	$LangSeg{$CurrentLang}[-1] .= $c;
    }
    elsif ($p->{OPEN_PROP}){
	$LinkProp{$p->{OPEN_PROP}} .= ';' 
	    if ($LinkProp{$p->{OPEN_PROP}});
	$LinkProp{$p->{OPEN_PROP}} .= $c;
    }
}


sub print_alignments{
    my $seg = shift;

    return unless (ref($seg) eq 'HASH');
    my @langs = sort keys %{$seg};
    return unless ($#langs);

    ## in case we want to split the data
    ## close and reopen data files
    if ($maxsize && $count>=$maxsize){
	$count = 0;
	$part++;
	my $ext  = sprintf('%04d',$part);
	$outfile =~s/\.[0-9]{4}(\.xml)(\.gz)?$/.$ext$1$2/
	&close_corpora();
	&close_bitexts();
	%SentID = ();
    }

    my %IDs = {};
    foreach my $s (0..$#langs){
	@{$IDs{$langs[$s]}} = print_sentences($seg,$langs[$s]);
    }

    foreach my $s (0..$#langs-1){
	foreach my $t ($s+1..$#langs){
	    print_links($IDs{$langs[$s]},$IDs{$langs[$t]},$langs[$s],$langs[$t]);
	}
    }

    $count++;
    print STDERR '.' if (! ($count % 1000));
    print STDERR " $count\n" if (! ($count % 50000));


}

sub print_links{
    my ($SrcIDs,$TrgIDs,$SrcLang,$TrgLang) = @_;

    my $LangPair = "$SrcLang-$TrgLang";
    $BitextWriter{$LangPair} = open_bitext($outfile,$SrcLang,$TrgLang) 
	unless ($BitextWriter{$LangPair});
    $LinkID{$LangPair}++;
    my $link = join(' ',@{$SrcIDs}).';'.join(' ',@{$TrgIDs});

    ## make some minor adjustments to link properties coming from ParaCrawl data
    unless ($opt_P){
	if (exists $LinkProp{'score-aligner'}){
	    $LinkProp{'hunalign'} = $LinkProp{'score-aligner'};
	    delete $LinkProp{'score-aligner'};
	}
	if (exists $LinkProp{'score-bicleaner'}){
	    $LinkProp{'score'} = $LinkProp{'score-bicleaner'};
	    delete $LinkProp{'score-bicleaner'};
	}
	delete $LinkProp{'lengthRatio'};
    }

    $BitextWriter{$LangPair}->emptyTag('link',
				       'id' => 'L'.$LinkID{$LangPair},
				       'xtargets' => $link, 
				       %LinkProp );
}

sub print_sentences{
    my ($segments,$lang) = @_;
    $CorpusWriter{$lang} = open_corpus($outfile,$lang) 
	unless ($CorpusWriter{$lang});

    my @ids = ();
    my @sentences = get_sentences($segments->{$lang},$lang);
    my $key;
    
    $CorpusWriter{$lang}->startTag('p') if ( $#sentences && (!$opt_u) );
    foreach my $sent (@sentences){
	my $sid = undef;
	if ($opt_u){
	    $key = $sent;
	    utf8::encode($key) if ($opt_d);  # to make DB_FILE happy
	    $sid = &find_sentence($key,$lang);
	}
	# if ($opt_v && $sid){
	#     print STDERR "repeated sentence found (id = $sid)\n";
	# }
	unless ($sid){
	    $SentID{$lang}++;
	    $sid=$SentID{$lang};

	    $CorpusWriter{$lang}->startTag('s', 'id' => $sid);
	    $CorpusWriter{$lang}->characters($sent);
	    $CorpusWriter{$lang}->endTag('s');

	    $LangDB{$lang}{$key} = $sid if ($opt_u);
	}
	push(@ids,$sid);
    }
    $CorpusWriter{$lang}->endTag('p') if ($#sentences && (!$opt_u));
    return @ids;
}


## check whether the sentence is stored in our cache
sub find_sentence{
    my ($sent,$lang) = @_;

    unless (exists $LangDB{$lang}){
	%{$LangDB{$lang}} = ();
	## create a new DB file if necessary
	if ($opt_d){
	    my ($fh, $filename) = tempfile();
	    close $fh;
	    tie %{$LangDB{$lang}}, "DB_File", $filename ;
	    ## this does not seem to work
	    ## from https://www.perl.com/pub/2012/06/perlunicook-unicode-text-in-dbm-files-the-easy-way.html/
	    # my $dbobj = tie %{$LangDB{$lang}}, "DB_File", $filename ;
	    # $dbobj->Filter_Value_Push("utf8");
	    print STDERR "Sentence in language $lang stored in $filename\n" if ($opt_v);
	}
    }
    if (exists $LangDB{$lang}{$sent}){
	return $LangDB{$lang}{$sent};
    }
    return undef;
}

sub get_sentences{
    my $segments = shift;
    my $lang = shift;

    my @sentences = ();
    foreach my $seg (@{$segments}){
	if ($seg=~/\S/){
	    $seg =~s/\n/ /gs;
	    $seg =~s/\s{2,}/ /g;
	    push(@sentences,$TextSplitter{$lang}->split_array($seg));
	}
    }
    return @sentences;
}



sub open_file{
    my $file = shift;

    ## make sub dir if necessary
    my $dir = dirname($file);
    system("mkdir -p ".$dir) unless (-d $dir);

    ## open pipe to gzip if necessary
    return $file=~/\.gz$/ ?
	IO::File->new("| gzip -c > $file") :
	IO::File->new(">$file");
}

sub open_corpus{
    my $file = shift;
    my $lang = shift;

    my $fh = open_file($lang.'/'.$file);
    binmode($fh,":encoding(utf-8)");
    my $XmlWriter = XML::Writer->new( OUTPUT => $fh,
				      DATA_MODE => 1,
				      DATA_INDENT => 1 );
    $XmlWriter->xmlDecl("UTF-8");
    $XmlWriter->startTag("text");

    ## Lingua::Sentences requires 2-letter language codes
    my $isolang = $lang;
    $isolang =~s/[-_].+$//;
    $isolang = 'en' unless ($isolang =~ /^[a-z][a-z]$/i);

    $TextSplitter{$lang} = Lingua::Sentence->new($isolang) unless $TextSplitter{$lang};

    return $XmlWriter;
}

sub open_bitext{
    my $file = shift;
    my $srclang = shift;
    my $trglang = shift;

    my $fh = open_file($srclang.'-'.$trglang.'/'.$file);
    my $XmlWriter = XML::Writer->new( OUTPUT => $fh,
				      DATA_MODE => 1,
				      DATA_INDENT => 1 );
    $XmlWriter->xmlDecl("UTF-8");
    $XmlWriter->doctype('cesAlign', "-//CES//DTD XML cesAlign//EN", "");
    $XmlWriter->startTag('cesAlign','version' => '1.0');
    $XmlWriter->startTag('linkGrp',
			 'targType' => 's',
			 'fromDoc'  => $srclang.'/'.$file,
			 'toDoc'    => $trglang.'/'.$file);
    return $XmlWriter;
}


sub close_corpus{
    my $XmlWriter = shift;
    $XmlWriter->endTag('text');
}

sub close_bitext{
    my $XmlWriter = shift;
    $XmlWriter->endTag('linkGrp');
    $XmlWriter->endTag('cesAlign');
}


