#!/usr/bin/env perl
# C&C NLP tools
# Copyright (c) Universities of Edinburgh, Oxford and Sydney
# Copyright (c) James R. Curran
#
# This software is covered by a non-commercial use licence.
# See LICENCE.txt for the full text of the licence.
#
# If LICENCE.txt is not included in this distribution
# please email candc@it.usyd.edu.au to obtain a copy.

use strict;
use warnings;
use Scalar::Util qw(looks_like_number);

# distribute: split a sentence file into numbered chunk files
# <directory>/0, <directory>/1, ...
#
# Arguments (all five are required, in this order):
#   nsentences  total number of sentences expected in the input
#   nsplits     number of chunks the per-file quota is derived from
#   sep         pattern marking the end of a sentence (see below)
#   directory   directory the numbered output files are written to
#   file        input file; it stays in @ARGV and is read through <>
@ARGV == 5
    or die "usage: distribute <nsentences> <nsplits> <sep> <directory> <file>\n";

my $total    = shift;
my $nbuckets = shift;
my $sep      = shift;
my $dir      = shift;

# Reject bad counts up front: a zero or non-numeric <nsplits> would
# otherwise surface as "Illegal division by zero", and a non-numeric
# <nsentences> would silently be treated as 0.
looks_like_number($total) && $total >= 0
    or die "distribute: <nsentences> must be a non-negative number, got '$total'\n";
looks_like_number($nbuckets) && $nbuckets > 0
    or die "distribute: <nsplits> must be a positive number, got '$nbuckets'\n";

# Sentences per output file.  The "+ 1" keeps the quota strictly above
# $total/$nbuckets, so no more than $nbuckets files are produced as long
# as the input holds at most $total sentences.
my $sbucket = int($total / $nbuckets) + 1;

my $file  = 0;    # index (and name) of the output file currently written
my $sents = 0;    # sentences written to the current output file so far

# Header placed at the top of every output file.  It records this
# invocation; preface comments of the input are appended to it later.
my $command_line = "# this file was generated by the following command(s):\n";
$command_line .= "# $0 $total $nbuckets \"$sep\" $dir @ARGV\n";
$command_line .= "#    number of sentences per file is $sbucket\n";
$command_line .= "#    number of files is $nbuckets\n";

# The separator is later used as a regex against whole input lines; with
# the newline appended, a match has to run up to the end of the line.
$sep .= "\n";

# Consume the input's preface: the leading run of "# " comment lines up
# to and including the first empty line.  Comment lines are carried over
# into the header written to every output file; anything else before the
# empty line is a fatal error.
while (defined(my $line = <>)) {
    # An empty line terminates the preface.
    last if $line =~ /^$/;

    if ($line =~ /^\# /) {
        # Drop the "generated by" banner left by an earlier tool; the
        # header being built already starts with one of its own.
        next if $line =~ /^\# this file .*generated by the following command/;
        $command_line .= $line;
    }
    else {
        chomp $line;
        # Interpolate the offending line.  Perl's % is numeric modulus,
        # not string formatting, so a "...%s..." % $line form would die
        # with "Illegal modulus zero" rather than this message.
        die "unrecognised preface comment line '$line'\n";
    }
}

# Blank line separating the header from the sentence data.
$command_line .= "\n";

# Compile the separator once (what the /o flag used to achieve).  It is
# interpolated as a regex, so any metacharacters in it are active.
my $sep_re = qr/$sep/;

# Copy the rest of the input into $dir/0, $dir/1, ..., moving on to the
# next file once the current one holds $sbucket sentences.  Every output
# file starts with the same header.
open(my $out, '>', "$dir/$file")
    or die("can't open OUT $dir/$file: $!\n");
print $out $command_line;

while (defined(my $line = <>)) {
    # Roll over only when another line is actually available, so no
    # header-only file is created when the input ends on a boundary.
    if ($sents == $sbucket) {
        # Buffered write errors (e.g. a full disk) only show up at close.
        close($out) or die("can't close OUT file $dir/$file: $!\n");
        $sents = 0;
        $file++;
        open($out, '>', "$dir/$file")
            or die("can't open OUT file $dir/$file: $!\n");
        print $out $command_line;
    }

    # A line matching the separator ends one sentence.
    $sents++ if $line =~ $sep_re;
    print $out $line;
}

close($out) or die("can't close OUT file $dir/$file: $!\n");
