#!/usr/local/bin/perl


#-------------------------------------PolyAna-----------------------------------#
#--------Authors: Smruthy Sivakumar, Dr. Jerry Fowler, Dr. Paul Scheet----------#
#-----------------------------------version: v1---------------------------------#
#---    This script takes as input a SNP text file and detects potential     ---#
#---    homopolymer sites based on their locaton in the reference genome     ---#

use strict;

#Dependencies
#use lib "/path/to/lib/perl5";
use Bio::Perl;
use Bio::SeqIO;
use Getopt::Long;

#Variables
my $help;
my $min_length;
my %n1;
my %n2;
my $hom_base;
my $window;

#Defaults
my $ref_path="absent";
my $variant_file="absent";
my $out="absent";
my $homopolymer_min_length_d=6;
my $window_d=10;
my $stretch_length1=4;
my $stretch_length2=5;

#Prints usage if no command line parameters are passed or there is an unknown parameter or help option is passed
usage() if ( @ARGV < 5 or
          ! GetOptions('help|?' => \$help, 'ref:s' => \$ref_path, 'variant_file:s' => \$variant_file, 'homopolymer_min_length:i' => \$min_length, 'window:i' => \$window, 'output:s' => \$out)
          or defined $help);
#Usage 
sub usage
{
  print "Unknown option: @_\n" if ( @_ );
  print "Usage: perl PolyAna.pl --ref <reference_fasta> --variant_file <variant_file> --homopolymer_min_length <min_length> --window <window> --output <output_file_name>[--help|-?]\n";
  print "--ref <reference_fasta> : Provide path to the reference fasta file\n";
  print "--variant_file <variant_file> : Provide path to input variant file. The variant file must be tab separated and ordered in the following columns: Chr, Pos, Ref, Alt. You can specify a header file starting with #.\n";
  print "--homopolymer_min_length <min_length> : Provide minimum length to chose as cut off for homopolymer. (Default = 6)\n"; 
  print "--window <window> : Provide the length of the window to check for homopolymers in the vicinity.(Default = 10)\n";
  print "--output <out> : Provide full path and output file name\n";
  print "--help|-? : Provides help for executing this script.\n";
  exit;
}

#Checking reference file parameter
if ($ref_path eq "absent" || $ref_path eq "")
{
	print "Reference fasta is required\n\n";
	usage();
	exit;
}
#Checking if reference file exists
if (-e $ref_path)
{
    print "Reference file: $ref_path exists\n";
    open (REF, $ref_path);
}
else
{
	print "Reference file: $ref_path not found\n\n";
	usage();
	exit;
}

#Checking variant file parameter
if ($variant_file eq "absent" || $variant_file eq "")
{
	print "Input variant file is required\n\n";
	usage();
	exit;
}
#Checking if variant file exists
if (-e $variant_file)
{
    print "Variant file: $variant_file exists\n";
    open (VARIANT, $variant_file);
}
else
{
	print "Variant file: $variant_file not found\n\n";
	usage();
	exit;
}

#checking min_length
if ($min_length)
{
	print "Using homopolymer_min_length cut off : $min_length\n";
}
elsif ($min_length == "")
{
	print "You must specify an integer for homopolymer_min_length if you want to use a custom cut off.\n";
	print "Using default homopolymer_min_length cut off: $homopolymer_min_length_d\n";
	$min_length=$homopolymer_min_length_d;
}
else
{
	print "Using default homopolymer_min_length cut off: $homopolymer_min_length_d\n";
	$min_length=$homopolymer_min_length_d;
}

#checking window specification
if ($window)
{
	print "Using window cut off : $window\n";
}
elsif ($window == "")
{
	print "You must specify an integer for window if you want to use a custom cut off.\n";
	print "Using default window cut off: $window_d\n";
	$window=$window_d;
}
else
{
	print "Using default window cut off: $window_d\n";
	$window=$window_d;
}


#checking output file
if ($out eq "absent" || $out eq "")
{
	print "Output file is required\n\n";
	usage();
	exit;
}
open( OUT, '>', $out) or die "Cannot open output file: $out\n";

#initializations
my %reference;
my %polyx;
my %count_left = 0;
my %count_right = 0;
my %left_boundary;
my %right_boundary;
my %ref;
my %alt;
my %hom_base;
my @keys_ar;
my %flag =0;
my %n1_base;
my %n2_base;

#Reading reference fasta and storing in a hash
my $reffa = Bio::SeqIO->new(-file => $ref_path, '-format' => 'Fasta');
while(my $seqobj = $reffa->next_seq) {
	my $chr  = $seqobj->display_id;
    my $seq = $seqobj->seq;
    chomp $seq;
    $reference{$chr} = $seq;
}


#Reading variant file
while (<VARIANT>)
{
	next if /^#/;
	chomp $_;
	
	my @line=split("\t",$_);
	my $chr=$line[0];
	my $pos=$line[1];

	my $refal=$line[2];
	my $altal=$line[3];

	my $key=join(":",$chr,$pos,$refal,$altal);

	$ref{$key}=$line[2];
	$alt{$key}=$line[3];
	
	#chromosome - sanity check
	if (! exists $reference{$chr})
	{
		print "Sequence for $chr not found in Reference\n";
		exit;
	}

	my $pos_char= uc (substr $reference{$chr}, $pos-1, 1);
	
	#chromosome - position reference allele check
	if (uc($ref{$key}) ne $pos_char)
	{
		print "Reference base doesn't match at $chr:$pos; uc($ref{$key}) in reference and $pos_char in the variant file\nExiting..\n";
		exit;
	}

	push @keys_ar,$key;

}

#Header in output
print OUT "CHR\tPOS\tREF\tALT\tPOLYX\tHP_LEN\tHP_BASE\tN_LEFT\tN_RIGHT\tOFFSET\tCALL\n";

#Processing each variant..
foreach my $key (@keys_ar)
{
	my @ar=split(":",$key);
	my $chr = $ar[0];
	my $pos = $ar[1];

	#Part1: calculating repeat length

	#going left
	$left_boundary{$key} = $pos;
	for (my $i=$pos-1; $i>0; $i--)
	{
		if (uc (substr $reference{$chr}, $i-1,1) eq uc ($ref{$key}))
		{
			$count_left{$key}++;
			$left_boundary{$key}=$i;
		}
		else
		{
			last;
		}
	}

	#going right
	$right_boundary{$key} = $pos;
	for (my $i=$pos+1; $i<length($reference{$chr}); $i++)
	{
		if (uc (substr $reference{$chr}, $i-1,1) eq uc ($ref{$key}))
		{
			$count_right{$key}++;
			$right_boundary{$key}=$i;
		}
		else
		{
			last;
		}
	}
	#repeat length
	$polyx{$key}=$count_left{$key}+$count_right{$key}+1;

	$flag{$key}=0;

	#Part1: If repeat length is between 4 and min_length
	if ($polyx{$key}>=4 && $polyx{$key}<$min_length)
	{

		if ($left_boundary{$key} == $pos) #left_offset=0
		{
			$n1_base{$key}=uc (substr $reference{$chr}, $left_boundary{$key}-2, 1);
			$n2_base{$key}=uc (substr $reference{$chr}, $right_boundary{$key}, 1);
			if ($n1_base{$key} eq $alt{$key})
			{
				$flag{$key} = 1;
				print OUT "$chr\t$pos\t$ref{$key}\t$alt{$key}\t$polyx{$key}\t$polyx{$key}\t$ref{$key}\t$n1_base{$key}\t$n2_base{$key}\t0\tHomopolymer(based_on_alt)\n";
			}
		}
		elsif ($right_boundary{$key} == $pos) #right_offset=0
		{
			$n1_base{$key}=uc (substr $reference{$chr}, $left_boundary{$key}-2, 1);
			$n2_base{$key}=uc (substr $reference{$chr}, $right_boundary{$key}, 1);
			if ($n2_base{$key} eq $alt{$key})
			{
				$flag{$key} = 1;
				print OUT "$chr\t$pos\t$ref{$key}\t$alt{$key}\t$polyx{$key}\t$polyx{$key}\t$ref{$key}\t$n1_base{$key}\t$n2_base{$key}\t0\tHomopolymer(based_on_alt)\n";
			}
		}
	}


	#Part2: Looking for long homopolymer in the window (if repeat length < minimum length to call homopolymer)
	if ($polyx{$key} < $min_length)
	{
		my $left_window_seq;
		my $right_window_seq;
		my $left_margin=$left_boundary{$key}-$window;
		if ( $left_margin >0)
		{
			$left_window_seq=uc (substr $reference{$chr}, $left_margin-1, $window);
		}
		else
		{
			$left_window_seq=uc (substr $reference{$chr}, 0 , $left_boundary{$key}-1);	
		}
		
		my $right_margin=$right_boundary{$key}+$window;
		if (length($reference{$chr}) - $right_margin > $window)
		{
			$right_window_seq=uc (substr $reference{$chr}, $right_boundary{$key}, $window);
		}
		else
		{
			$right_window_seq=uc (substr $reference{$chr},-(length($reference{$chr}) - $right_margin));	
		}

		#Part A: Immediately adjacent HP
		my $pos2 = length($left_window_seq);
		my $count_left_adj=1; my $left_adj=0;
		if ($polyx{$key}>=$stretch_length1)
		{
			if ($flag{$key} != 1)
			{
				#going left to check for adjacent HP
				for (my $i=$pos2-1; $i>0; $i--)
				{
					if (uc (substr $left_window_seq, $i-1,1) eq uc (substr $left_window_seq, $pos2-1,1))
					{
						$count_left_adj++;
						$left_adj=$i;
					}
					else
					{
						last;
					}

				}

				$hom_base{$key}=uc (substr $left_window_seq, $pos2-1,1);

				if ($left_boundary{$key} == $pos && ($alt{$key} eq $hom_base{$key}))
				{
					$flag{$key}=1;
					$n1_base{$key}=uc (substr $reference{$chr}, $left_boundary{$key}-$left_adj-1, 1);
					$n2_base{$key}=uc (substr $reference{$chr}, $right_boundary{$key}, 1);
					my $right_coordinate=$right_boundary{$key}-$pos;
					my $left_coordinate=-$left_adj-1;
					print OUT "$chr\t$pos\t$ref{$key}\t$alt{$key}\t$polyx{$key}\t$count_left_adj\t$hom_base{$key}\t$n1_base{$key}\t$n2_base{$key}\t[$left_coordinate,$right_coordinate]\tHomopolymer_stretch(adjacent_left)\n"; 
				}
				elsif (((($count_left_adj>=$stretch_length1 && $polyx{$key} >=$stretch_length2) || ($count_left_adj>=$stretch_length2 && $polyx{$key} >=$stretch_length1)) && $count_left_adj+$polyx{$key}>=$min_length))
				{
					$flag{$key}=1;
					$n1_base{$key}=uc (substr $reference{$chr}, $left_boundary{$key}-$left_adj-1, 1);
					$n2_base{$key}=uc (substr $reference{$chr}, $right_boundary{$key}, 1);
					my $right_coordinate=$right_boundary{$key}-$pos;
					my $left_coordinate=-$left_adj-1;
					print OUT "$chr\t$pos\t$ref{$key}\t$alt{$key}\t$polyx{$key}\t$count_left_adj\t$hom_base{$key}\t$n1_base{$key}\t$n2_base{$key}\t[$left_coordinate,$right_coordinate]\tHomopolymer_stretch(adjacent_left)\n"; 
				}	
			}

			if ($flag{$key} != 1)
			{
				#going right to check for adjacent HP
				my $count_right_adj=1; my $right_adj=0;
				for (my $i=1; $i<length($right_window_seq); $i++)
				{
					if (uc (substr $right_window_seq, $i,1) eq uc (substr $right_window_seq, 0,1))
					{
						$count_right_adj++;
						$right_adj=$i;
					}
					else
					{
						last;
					}
				}
				$hom_base{$key} = uc (substr $right_window_seq, 0,1);
				if ($right_boundary{$key} == $pos && ($alt{$key} eq $hom_base{$key}))
				{
					$flag{$key}=1;
					$n1_base{$key}=uc (substr $reference{$chr}, $left_boundary{$key}-2, 1);
					$n2_base{$key}=uc (substr $reference{$chr}, $right_boundary{$key}+$right_adj+1, 1);
					my $left_coordinate = $left_boundary{$key}-$pos;
					my $right_coordinate = $right_adj+1;
					print OUT "$chr\t$pos\t$ref{$key}\t$alt{$key}\t$polyx{$key}\t$count_right_adj\t$hom_base{$key}\t$n1_base{$key}\t$n2_base{$key}\t[$left_coordinate,$right_coordinate]\tHomopolymer_stretch(adjacent_right)\n"; 				
				}
				elsif (((($count_right_adj>=$stretch_length2 && $polyx{$key}>=$stretch_length1) || ($count_right_adj>=$stretch_length1 && $polyx{$key}>=$stretch_length2)) && $count_right_adj+$polyx{$key}>=$min_length))
				{
					$flag{$key}=1;
					$n1_base{$key}=uc (substr $reference{$chr}, $left_boundary{$key}-2, 1);
					$n2_base{$key}=uc (substr $reference{$chr}, $right_boundary{$key}+$right_adj+1, 1);
					my $left_coordinate = $left_boundary{$key}-$pos;
					my $right_coordinate = $right_adj+1;
					print OUT "$chr\t$pos\t$ref{$key}\t$alt{$key}\t$polyx{$key}\t$count_right_adj\t$hom_base{$key}\t$n1_base{$key}\t$n2_base{$key}\t[$left_coordinate,$right_coordinate]\tHomopolymer_stretch(adjacent_right)\n"; 
				}
			}
		}

		#Part B: long HP in vicinity 
		
		if ($flag{$key} != 1)
		{

			my ($left_longest_hp,$left_hp_l, $hom_base_left) = longest_homopolymer ($left_window_seq);
			my ($right_longest_hp,$right_hp_l, $hom_base_right) = longest_homopolymer ($right_window_seq);

			#Index of longest HP
			my $hompos_left = index (reverse ($left_window_seq), reverse ($left_longest_hp))+1;
			my $hompos_right = index ($right_window_seq, $right_longest_hp)+1;

			if ($right_hp_l >= $min_length || $left_hp_l >= $min_length)
			{
				$flag{$key}=1;
				if ($right_hp_l >= $min_length && $left_hp_l >= $min_length)
				{
					#Getting n1 & n2
					$n1{$key}=join (",", uc (substr $reference{$chr}, $right_boundary{$key}+$hompos_right-2,1), uc (substr $reference{$chr}, $left_boundary{$key}-$hompos_left-$left_hp_l-1, 1));
					$n2{$key}=join (",", uc (substr $reference{$chr}, $right_boundary{$key}+$hompos_right+$right_hp_l-1,1), uc (substr $reference{$chr}, $left_boundary{$key}-$hompos_left, 1));
					print OUT "$chr\t$pos\t$ref{$key}\t$alt{$key}\t$polyx{$key}\t$right_hp_l,$left_hp_l\t$hom_base_right,$hom_base_left\t$n1{$key}\t$n2{$key}\t$hompos_right,$hompos_left\tHomopolymer(both)\n";
				}
				elsif ($right_hp_l>$left_hp_l)
				{
					#Getting n1 & n2
					$n1{$key}=uc (substr $reference{$chr}, $right_boundary{$key}+$hompos_right-2,1);
					$n2{$key}=uc (substr $reference{$chr}, $right_boundary{$key}+$hompos_right+$right_hp_l-1,1);
					print OUT "$chr\t$pos\t$ref{$key}\t$alt{$key}\t$polyx{$key}\t$right_hp_l\t$hom_base_right\t$n1{$key}\t$n2{$key}\t$hompos_right\tHomopolymer(right)\n";
				}
				elsif ($right_hp_l<$left_hp_l)
				{
					#Getting n1 & n2
					$n1{$key}=uc (substr $reference{$chr}, $left_boundary{$key}-$hompos_left-$left_hp_l-1, 1);
					$n2{$key}=uc (substr $reference{$chr}, $left_boundary{$key}-$hompos_left, 1);
					print OUT "$chr\t$pos\t$ref{$key}\t$alt{$key}\t$polyx{$key}\t$left_hp_l\t$hom_base_left\t$n1{$key}\t$n2{$key}\t$hompos_left\tHomopolymer(left)\n";
				}
			}
		}
		
		#Part C: Consecutive short streches of Homopolymers in vicinity
		if ($flag{$key} != 1)
		{	
	 		my ($right_longest_hp,$right_hp_l2,$hom_base_right) = longest_consecutive_hp ($right_window_seq);
	 		my ($left_longest_hp,$left_hp_l2,$hom_base_left) = longest_consecutive_hp ($left_window_seq);

	 		#Index of longest HP
			my $hompos_left = index (reverse ($left_window_seq), reverse ($left_longest_hp))+1;
			my $hompos_right = index ($right_window_seq, $right_longest_hp)+1;

	 		if ($left_hp_l2 >= $min_length)
	 		{
	 			$flag{$key}=1;
	 			$n1{$key}=uc (substr $reference{$chr}, $left_boundary{$key}-$hompos_left-$left_hp_l2-1, 1);
				$n2{$key}=uc (substr $reference{$chr}, $left_boundary{$key}-$hompos_left, 1);
	 			print OUT "$chr\t$pos\t$ref{$key}\t$alt{$key}\t$polyx{$key}\t$left_hp_l2\t$hom_base_left\t$n1{$key}\t$n2{$key}\t$hompos_left\tHomopolymer_Stretch(left)\n";
	 		}
	 		elsif ($right_hp_l2 >= $min_length)
	 		{
	 			$flag{$key}=1;
	 			$n1{$key}=uc (substr $reference{$chr}, $right_boundary{$key}+$hompos_right-2,1);
				$n2{$key}=uc (substr $reference{$chr}, $right_boundary{$key}+$hompos_right+$right_hp_l2-1,1);
	 			print OUT "$chr\t$pos\t$ref{$key}\t$alt{$key}\t$polyx{$key}\t$right_hp_l2\t$hom_base_right\t$n1{$key}\t$n2{$key}\t$hompos_right\tHomopolymer_Stretch(right)\n";
	 		}
		}

		if ($flag{$key}==0)
		{
			$n1_base{$key}=uc (substr $reference{$chr}, $left_boundary{$key}-2, 1);
			$n2_base{$key}=uc (substr $reference{$chr}, $right_boundary{$key}, 1);
			print OUT "$chr\t$pos\t$ref{$key}\t$alt{$key}\t$polyx{$key}\tNA\tNA\t$n1_base{$key}\t$n2_base{$key}\tNA\tPotentially_Valid\n";
		}
	}
	else
	{
		$n1_base{$key}=uc (substr $reference{$chr}, $left_boundary{$key}-2, 1);
		$n2_base{$key}=uc (substr $reference{$chr}, $right_boundary{$key}, 1);
		my $left_coordinate=$left_boundary{$key}-$pos;
		my $right_coordinate=$right_boundary{$key}-$pos;
		if ($left_boundary{$key} == $pos) #left_offset=0
		{
			if ($n1_base{$key} eq $alt{$key})
			{
				$flag{$key} = 1;
				print OUT "$chr\t$pos\t$ref{$key}\t$alt{$key}\t$polyx{$key}\t$polyx{$key}\t$ref{$key}\t$n1_base{$key}\t$n2_base{$key}\t[$left_coordinate,$right_coordinate]\tHomopolymer\n";
			}
			else
			{
				print OUT "$chr\t$pos\t$ref{$key}\t$alt{$key}\t$polyx{$key}\t$polyx{$key}\t$ref{$key}\t$n1_base{$key}\t$n2_base{$key}\t[$left_coordinate,$right_coordinate]\tPotentially_Valid\n";
			}
		}
		elsif ($right_boundary{$key} == $pos) #right_offset=0
		{
			if ($n2_base{$key} eq $alt{$key})
			{
				$flag{$key} = 1;
				print OUT "$chr\t$pos\t$ref{$key}\t$alt{$key}\t$polyx{$key}\t$polyx{$key}\t$ref{$key}\t$n1_base{$key}\t$n2_base{$key}\t[$left_coordinate,$right_coordinate]\tHomopolymer\n";
			}
			else
			{
				print OUT "$chr\t$pos\t$ref{$key}\t$alt{$key}\t$polyx{$key}\t$polyx{$key}\t$ref{$key}\t$n1_base{$key}\t$n2_base{$key}\t[$left_coordinate,$right_coordinate]\tPotentially_Valid\n";
			}
		}
		else #somewhere in the middle
		{
			print OUT "$chr\t$pos\t$ref{$key}\t$alt{$key}\t$polyx{$key}\t$polyx{$key}\t$ref{$key}\t$n1_base{$key}\t$n2_base{$key}\t[$left_coordinate,$right_coordinate]\tHomopolymer\n";
		}
	}
}

#Finding the longest homopolymer in the window
sub longest_homopolymer {

    my ( $sequence ) = @_;

    my @base=("A", "T", "G", "C");

    my $longest;
    my $hombase;
    foreach my $base (@base)
    {
    	my @matches = $sequence =~ /((?:$base)+)/g ;  

	    foreach my $match(@matches)
	    {  
	    	if( length( $longest ) < length( $match ))
			{
				$longest = $match;
				$hom_base=$base;
			}
	    }
    }
    return $longest, length($longest), $hom_base;
}

#Finding the longest consecutive short streches of homopolymers in the window
sub longest_consecutive_hp {

    my ( $sequence ) = @_;

    my @base1=("A", "T", "G", "C");
    my @base2=("A", "T", "G", "C");

    my $longest;
    my $hom_base1;
    my $hom_base2;
    foreach my $b1 (@base1)
    {
   		foreach my $b2 (@base2)
   		{
   			if ($b1 ne $b2)
   			{
   				my @matches = $sequence =~ /(?:$b1){$stretch_length1,}(?:$b2){$stretch_length2,}/g ;  

				foreach my $match ( @matches ) 
			 	{  
				   	$longest = $match if length( $longest ) < length( $match );
				   	$hom_base1=$b1;
				   	$hom_base2=$b2;
				}
   			}
   		}
    }
    my $hom_base=join("-",$hom_base1,$hom_base2);
    return $longest, length($longest),$hom_base;
}
