#!/usr/bin/env perl

use strict;
use warnings;
use autodie;
use Data::Dumper;
use Getopt::Long;
use List::Util 'shuffle';

# Built-in defaults; each one seeds the command-line option of similar name.
use constant SEED    => 666;     # initial value for --seed
use constant RTC_NUM => 100;     # initial value for --rtc_num
use constant LENGTH  => 1000;    # initial value for --length

# Nothing at all on the command line: show the help text and exit cleanly.
if (@ARGV < 1) {
	usage();
	exit 0;
}

# Start from the built-in defaults; GetOptions overwrites them in place.
my $seed = SEED;
my $num = RTC_NUM;
my $length = LENGTH;

GetOptions(
	"seed=i"    => \$seed,
	"rtc_num=i" => \$num,
	"length=i"  => \$length
) or die "Error in command line arguments\n";

# "=i" accepts any integer, including negative ones.  $length is later used
# as a negative substr offset, so zero or a negative value would silently
# yield whole or front-truncated sequences instead of a fixed-size tail.
die "--length must be a positive integer (got $length)\n"
	unless $length > 0;
die "--rtc_num must not be negative (got $num)\n"
	if $num < 0;

# First remaining argument is the FASTA path.  Test definedness and length
# rather than truth so that a file literally named "0" is still accepted.
my $fasta_file = shift;
unless (defined $fasta_file && length $fasta_file) {
	usage();
	exit 1;
}

# Seed the random number generator before anything random happens.
srand($seed);

# Gene id => array ref of sequences read from the FASTA file.
my $idx_h = index_fasta($fasta_file);

# Gene id => longest sequence of that gene, kept only if it is at least
# $length characters long.
my %idx_large;

while (my ($gene, $transcripts) = each %$idx_h) {
	# Single pass for the longest sequence; on a tie the earliest one wins.
	my $longest;
	for my $candidate (@$transcripts) {
		$longest = $candidate
			if !defined $longest || length($candidate) > length($longest);
	}

	$idx_large{$gene} = $longest if length($longest) >= $length;
}

# Sort before shuffling so the shuffle always starts from the same key order
# (hash key order is not stable between runs).
my @genes = shuffle sort keys %idx_large;

# Array indices are 0-based: the first $num genes are 0 .. $num - 1.  The
# upper bound is also clamped to the genes actually available, so the loop
# never reads past the end of @genes.
my $last = $num < @genes ? $num - 1 : $#genes;

warn "Only " . scalar(@genes) . " gene(s) have a sequence of at least "
	. "$length characters; $num requested\n"
	if $num > @genes;

for my $gene (@genes[0 .. $last]) {
	# Negative offset: keep the last $length characters of the sequence.
	my $seq = substr $idx_large{$gene}, - $length;
	print "$gene\t$seq\n";
}

# Print a short command-line synopsis to STDOUT.
# Takes no arguments.  The options listed are the ones the script registers
# with GetOptions: --seed, --rtc_num and --length.
sub usage {
	print "Usage: $0 [--seed=INT] [--rtc_num=INT] [--length=INT] <FASTA>\n";
	print "FASTA: transcripts of protein-coding genes\n";
}

# Read a gzip-compressed FASTA file and group its sequences by identifier.
#
# Argument: path of the compressed file (decompressed through `zcat`).
# Returns:  hash ref mapping identifier => array ref of sequences, one
#           element per header line that carried that identifier, in file
#           order.  Multi-line sequences are concatenated.
# Dies:     when a header has no 6th '|'-separated field, or when sequence
#           data appears before any header.  With autodie in effect a failed
#           open or close of the zcat pipe also raises an exception.
#
# The identifier is the 6th '|'-separated field of the header line with
# surrounding whitespace removed (presumably the gene name of a
# GENCODE-style header -- confirm against the input files).
sub index_fasta {
	my $genome = shift;

	# List-form pipe open: the path reaches zcat as one argument and never
	# goes through a shell, so spaces and metacharacters in it are harmless.
	# "--" stops a path that starts with "-" from being read as an option.
	open my $fh, '-|', 'zcat', '--', $genome;

	my %idx;
	my ($id, $i);

	while (my $line = <$fh>) {
		chomp $line;
		next if $line =~ /^;/;    # lines starting with ';' are skipped

		if ($line =~ /^>/) {
			my @f = split /\|/, $line;

			$id = $f[5];
			die "Error reading fasta file '$genome': header '$line' has no 6th '|'-separated field"
				unless defined $id;
			$id =~ s/^\s+|\s+$//g;

			# Each header opens a new slot after the sequences already
			# stored for this identifier.
			$i = exists $idx{$id} ? scalar @{ $idx{$id} } : 0;
		} else {
			die "Error reading fasta file '$genome': Not defined id"
				unless defined $id;
			$idx{$id}[$i] .= $line;
		}
	}

	close $fh;
	return \%idx;
}
