#!/usr/bin/perl 
#
# dbSNPtoROD.pl                          
# v 1.0                       
# CRB GADIE/GABI/INRA/FRANCE  
# http://crb-gadie.inra.fr    
# author: Sylvain MARTHEY     
# 23/03/2010               
#
# 
#   
#  This program is free software: you can redistribute it and/or modify
#     it under the terms of the GNU General Public License as published by
#     the Free Software Foundation, either version 3 of the License, or
#     (at your option) any later version.
# 
#     This program is distributed in the hope that it will be useful,
#     but WITHOUT ANY WARRANTY; without even the implied warranty of
#     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#     GNU General Public License for more details.
# 
#     You should have received a copy of the GNU General Public License
#     along with this program.  If not, see <http://www.gnu.org/licenses/>.
#
#


=head1 NAME

 dbSNPtoROD.pl

=head1 SYNOPSIS

 dbSNPtoROD.pl --chr_name <chars> --fasta_file <file_path> --SNP_file <file_path> --contig_info_file <file_path> --contig_loc_file <file_path> --ucsc_out_file <file_path> [--contig_locus_id_file]
 
=head1 OPTIONS

	--chr_name	: chromosome name

	--fasta_file	: path of fasta dbSNP file, this file should correspond to the selected chromosome

	--SNP_file	: path of SNP file
	
	--contig_info_file	: SNPContigInfo file

	--contig_loc_file	: SNPContigLoc file

	--map_info_file	: SNPContigLoc file

	--ucsc_out_file	: path to the output file

	--contig_locus_id_file	:  SNPContigLocusID file (not implement)
	
=over

=back

=head1 B<DESCRIPTION>

	This script can be used to convert NCBI dbSNP dump files to generate the UCSC SNP track (https://cgwb.nci.nih.gov/cgi-bin/hgTables?db=hg18&hgta_group=varRep&hgta_track=snp129&hgta_table=snp129&hgta_doSchema=describe+table+schema)
        The generated format can be used to create a .rod file (http://www.broadinstitute.org/gsa/wiki/index.php/The_DBSNP_rod)
	

=cut

use strict;
use Getopt::Long;
use Pod::Usage;

my $help ;
my $chr_name ;
my $fasta_file ;
my $SNP_file ;
my $contig_info_file ;
my $contig_loc_file ;
my $map_info_file ;
my $contig_locus_id_file ;
my $ucsc_out_file ;


# we retrieve options
GetOptions("help|?" => \$help,
           "chr_name:s" => \$chr_name,
           "fasta_file:s" => \$fasta_file,
           "SNP_file:s" => \$SNP_file,
           "contig_info_file:s" => \$contig_info_file,
           "contig_loc_file:s" => \$contig_loc_file,
           "contig_locus_id_file:s" => \$contig_locus_id_file,
           "map_info_file:s" => \$map_info_file,
           "ucsc_out_file:s" => \$ucsc_out_file
           )
or
pod2usage(-message=>"Try `$0' for more information.", -exitval => 2,-verbose=> 0);


pod2usage(-exitval =>1, -verbose => 2) if ($help);
# on test si les parametres ont bien ete rentres
if ($chr_name eq "")
  {
     pod2usage(-message=>"--chr_name parameter needed.\nTry `$0 --help' for more information.",
              -exitval => 2,-verbose=> 0);
  }
if ($fasta_file eq "" || !-f($fasta_file))
  {
     pod2usage(-message=>"--fasta_file parameter missing or is not a valid file path.\nTry `$0 --help' for more information.",
              -exitval => 2,-verbose=> 0);
  }
if ($SNP_file eq "" || !-f($SNP_file))
  {
     pod2usage(-message=>"--SNP_file parameter missing or is not a valid file path.\nTry `$0 --help' for more information.",
              -exitval => 2,-verbose=> 0);
  }
if ($contig_info_file eq "" || !-f($contig_info_file))
  {
     pod2usage(-message=>"--contig_info_fil parameter missing or is not a valid file path.\nTry `$0 --help' for more information.",
              -exitval => 2,-verbose=> 0);
  }
if ($contig_loc_file eq "" || !-f($contig_loc_file))
  {
     pod2usage(-message=>"--contig_loc_file parameter missing or is not a valid file path.\nTry `$0 --help' for more information.",
              -exitval => 2,-verbose=> 0);
  }
if ($map_info_file eq "" || !-f($map_info_file))
  {
     pod2usage(-message=>"--map_info_file parameter missing or is not a valid file path.\nTry `$0 --help' for more information.",
              -exitval => 2,-verbose=> 0);
  }
# if ($contig_locus_id_file eq "" || !-f($contig_locus_id_file))
#   {
#      pod2usage(-message=>"--contig_locus_id_file parameter missing or is not a valid file path.\nTry `$0 --help' for more information.",
#               -exitval => 2,-verbose=> 0);
#   }

my %NCBIdbSNPInfos;
print "extract informations from fasta file\n";
# read the fasta file and store all SNP informations
extractSNPInfosFormFasta(	fasta_file => $fasta_file,
				results => \%NCBIdbSNPInfos);

print "map informations from SNP file\n";
# read the SNP and store the folowing values: avg_heterozygosity, het_se, validation_status
mapInfosFromTabulateFile(	file => $SNP_file,
				id => 1,
				avg_heterozygosity => 2,
				prefix => 'rs',
				het_se => 3,
				validation_status => 8,
				results => \%NCBIdbSNPInfos);

print "map informations from contig_loc_file file\n";
# read the SNPContigLoc and store the folowing values: loc_type, orientation, phys_pos_from, allele
mapInfosFromTabulateFile(	file => $contig_loc_file,
				id => 2,
				prefix => 'rs',
				ctg_id => 3,
				loc_type => 10,
				phys_pos_from => 11,
				orientation => 15,
				allele => 16,
				results => \%NCBIdbSNPInfos);

print "map informations from map_info_file file\n";
# read the SNPContigLoc and store the folowing values: weight
mapInfosFromTabulateFile(	file => $map_info_file,
				id => 2,
				prefix => 'rs',
				weight => 6,
				results => \%NCBIdbSNPInfos);

print "map informations from contig_locus_id_file file\n";
# # read the SNPContigLocusID and store the folowing values: fxn_class
# mapInfosFromTabulateFile(	file => $contig_locus_id_file,
# 				id => 1,
#				prefix => 'rs',
# 				fxn_class => 12,
# 				results => \%NCBIdbSNPInfos);


print "map informations from SNPContiInfoFile file\n";
# read the SNPContigInfoFile and store the folowing values: contig_chr, orient
mapContigInfosFromTabulateFile(		file => $contig_info_file,
					results => \%NCBIdbSNPInfos);

print "convert NCBI dbSNP Infos to UCSC\n";
my %UCSCdbNSP = %{convertNCBIdbToUCSC(	NCBI => \%NCBIdbSNPInfos,
					bin => 666)};

print "print the ucsc_out_file file\n";
my @rod_columns = ('bin','chrom','chromStart','chromEnd','name','score','strand','refNCBI','refUCSC','observed','molType','class','valid','avHet','avHetSE','func','locType','weight');
open (OUT, ">".$ucsc_out_file) or die "unable to open ucsc_out_file.\n";

foreach my $SNP_name (keys(%UCSCdbNSP)){
	my $line;
	foreach my $k (@rod_columns){
		$line =  $line.$UCSCdbNSP{$SNP_name}{$k}."\t";
	}
	$line =~ s/\t$/\n/;
	print OUT $line;
}

close OUT;


###########################################################################################################################################


=begin function

  Function    : convertNCBIdbToUCSC
  Description : convert the NCBI SNP attributs to UCSC SNP track attributs
  Usage       : convertNCBIdbToUCSC( NCBI => \%hash);
  Parameters  : NCBI -> hash table which contains the NCBI SNP attributs/values. The key of the hash is the snp Name.
  Returns     : hash ref. This Hash table contains the keys/values for all the UCSC SNP track attributs.
  Author      : Sylvain Marthey INRA/FRANCE
  Version     : v1.0

=end function

=cut
sub convertNCBIdbToUCSC {
	my %args = @_;
	my %NCBI = %{$args{NCBI}};
	my %UCSC;
	my @class = ('SNP','DIP','HETEROZYGOUS','STR','NAMED','NO VARIATON','MIXED','MNP');
	# foreach variation 
	foreach my $SNP_name (keys(%NCBI)){
		my %SNP;
		# convert keys/values
		$SNP{bin} = $args{bin};
		# chrom fragment
		$SNP{chrom} = 'chr'.$NCBI{$SNP_name}{contig_chr};
		# start variation
		$SNP{chromStart} = $NCBI{$SNP_name}{phys_pos_from};
		# stop variation
		# calcul depend of the variation type
		if($NCBI{$SNP_name}{allele} eq '-'){
			$SNP{chromEnd} = $SNP{chromStart};
		}else{
			$SNP{chromEnd} = $SNP{chromStart}+length($NCBI{$SNP_name}{allele});
		}
		# name 
		$SNP{name} = $SNP_name;
		# score
		$SNP{score} = 0;
		# strand 
		if($NCBI{$SNP_name}{orientation} == 0){
			$SNP{strand} = '+';
		}else{
			$SNP{strand} = '-';
		}
		
		my $allele = $NCBI{$SNP_name}{allele};
		$allele =~ s/\t/ /g;
		# refNCBI
		$SNP{refNCBI} = $allele;
		# refUCSC (the same as NCBI)
		$SNP{refUCSC} = $allele;
		# observed alleles
		my $alleles = $NCBI{$SNP_name}{alleles};
		$alleles =~ s/"//g;
		$SNP{observed} = $alleles;
		$SNP{observed} =~ s/"//g;
		# moltype
		$SNP{molType} = $NCBI{$SNP_name}{mol};
		$SNP{molType} =~ s/"//g;
		# SNP class 
		# only detect insertions/delections/single/indel/microsats
		if(!($alleles =~ m/[^A|T|G|C|\/|-]/) && !($allele =~ m/[^A|T|G|C|\/|-]/)){
			my $ins;
			my $del;
			my @obs = split(/\//,$alleles);
			if($allele eq '-'){
				# insertion 1
				foreach my $ob (@obs){
					if(length($ob)>1){
						$ins=1;
					}
				}
			}else{
				foreach my $ob (@obs){
					# deletion 1
					if($ob eq '-' || length($ob) < length($allele) || length($ob) < length($allele)){
						$del=1;
					# insertion 2
					}elsif(length($ob) > length($allele) || length($ob) > length($allele)){
						$ins=1;
					}
				}
			}
			if($ins && !$del){
				$SNP{class} = 'insertion';
			}elsif(!$ins && $del){
				$SNP{class} = 'deletion';
			}elsif($ins && $del){
				$SNP{class} = 'in-del';
			}elsif($allele =~ m/([A|T|G|C][A|T|G|C]){5}/){
				$SNP{class} = 'microsatellite';
			}else{
				$SNP{class} = 'single';
			}
		}else{
			# microsats
                        if($allele =~ m/.*([[A|T|G|C][A|T|G|C]]){5}.*/ || $alleles =~ m/^\([A|T|G|C][A|T|G|C]\)\d+/){
				print "micosat: ".$1."\n";
				$SNP{class} = 'microsatellite';
			}elsif(uc($alleles) =~ m/INSERTION/ || uc($alleles) =~ m/DELETION/ || uc($allele) =~ m/INSERTION/ || uc($allele) =~ m/DELETION/){
				$SNP{class} = 'named';
			}else{
				$SNP{class} = $class[$NCBI{$SNP_name}{class}];
			}
		}
		# single
                # insertion 
                # delection
		# validation statuts
		$SNP{valid} = 'unknown';
		# avHet
		if($NCBI{$SNP_name}{avg_heterozygosity} eq ''){
			  $SNP{avHet} = 0;
		}else{
			  $SNP{avHet} = $NCBI{$SNP_name}{avg_heterozygosity};
		}
		# avHetSE
		if($NCBI{$SNP_name}{het_se} eq ''){
			  $SNP{avHetSE} = 0;
		}else{
			  $SNP{avHetSE} = $NCBI{$SNP_name}{het_se};
		}
		# func
		$SNP{func} = 'unknown';
		# locType
		$SNP{locType} = 'exact';
		# weight
		$SNP{weight} = $NCBI{$SNP_name}{weight};
		
		$UCSC{$SNP_name} = \%SNP;
	}
	return \%UCSC;
}

=begin function

  Function    : extractSNPInfosFormFasta
  Description : extract all SNP infos wich are contain in the fasta Header
  Usage       : extractSNPInfosFormFasta( fasta_file => $file_path,
                                          results => \%hash);
  Parameters  : file -> path to the fasta file.
                results -> hash table which contains the results. The key of the hash is the snpID, the value is a hash table which contain all the attributes of the header
  Returns     : nothing.
  Author      : Sylvain Marthey INRA/FRANCE
  Version     : v1.0

=end function

=cut
sub extractSNPInfosFormFasta {
	my %args = @_;
	my $file = $args{fasta_file};
	my $results = $args{results};
	
	open (IN,$file) or die "unable to open the file: $file\n";
	while(<IN>){
		# next if the line is not a fasta heaser
		next if($_ =~ m/^[^\>]/);
		chomp;
		my $id;
		my %infos;
		my @line = split(/\|/,$_);
		foreach  my $el (@line){
			# next if the element contains not information
			next if(!($el =~ m/\=/));
			# detect the informations in element
			$el =~ m/^(([^\s]+)\s){0,1}([^\=]+)\=([^\=]+)$/;
			# if we detect the snpID
			if($1){
				if($id){
					die "problem with header $_. Ambigous detection of snpID\n";
				}else{
					$id = $1;
					$id =~ s/\s//g;
				}
			}
			next if(!$3);
			$infos{$3} = $4;
		}
		$$results{$id} = \%infos;
	}
	close (IN)
}

=begin function

  Function    : mapInfosFromTabulateFile
  Description : extract informations from a tabulate file and add them to a has table.
  Usage       : mapInfosFromTabulateFile( file => file_path (string),
					  id => column (int),
					  prefix => (tab),
					  attribut_1 => column_1 (int),
					  ...
					  attribut_n => column_n (int),
                                          results => hash (reference to hashtable);
  Parameters  : file -> path to the tabulate file wich contain foreach line a set of value corresponding to one key.
		id -> column which contains the id 
# 		prefix -> add this prefix to the ID from the tabulate file to the results keys
                results -> hash table which contains the results of mapping. In the form of hastable:
		results {
			key1 => { att1 => val1,
			          ...
			          attN => valN,
				}
			...
			keyN => { att1 => val1,
			          ...
			          attN => valN,
				}
		}
		XXX => YYY -> XXX is the atribut YYY is the value
  Returns     : nothing.
  Author      : Sylvain Marthey INRA/FRANCE
  Version     : v1.0

=end function

=cut
sub mapInfosFromTabulateFile {
	my %args = @_;
	my $file = $args{file};
	my $id = $args{id};
	my $results = $args{results};
	my $prefix = $args{prefix};
	my %vals;
	foreach my $arg (keys(%args)){
		next if($arg eq "file" || $arg eq "id" || $arg eq "results" || $arg eq "prefix");
		$vals{$arg} = $args{$arg};
	}
	open (IN,$file) or die "unable to open the file: $file\n";
	while(<IN>){
		chomp;
		next if(!$_);
		my @line = split(/\t/,$_);
		my $key;
		if(!$prefix){
			next if(!$$results{$line[$id-1]});
			$key = $line[$id-1];
		}else{
			next if(!$$results{$prefix.$line[$id-1]});
			$key =  $prefix.$line[$id-1];
		}
		foreach my $k (keys(%vals)){
			$$results{$key}{$k} = $line[$vals{$k}-1];
		}
	}
	close (IN);
}

=begin function

  Function    : mapContigInfosFromTabulateFile
  Description : extract contig infos from the file Contiginfo file and map this information to the SNP hash 
  Usage       : mapInfosFromTabulateFile( file => file_path (string),
                                          results => hash (reference to hashtable);
  Parameters  : file -> contig info file
                results -> results hash
  Returns     : nothing.
  Author      : Sylvain Marthey INRA/FRANCE
  Version     : v1.0

=end function
=cut

sub mapContigInfosFromTabulateFile {
	my %args = @_;
	my $file = $args{file};
	my $results = $args{results};
	my %contigs_info = %{createHashContigInfo(	file => $contig_info_file,
							id => 1)};
	foreach my $k (keys(%{$results})){
		my $ctg = $$results{$k}{ctg_id};
		$$results{$k}{contig_chr} = @{$contigs_info{$ctg}}[5];
		$$results{$k}{orient} = @{$contigs_info{$ctg}}[8];
	}
}

sub createHashContigInfo {
	my %args = @_;
	my $file = $args{file};
	my $id = $args{id};
	
	my %results;
	open (IN,$file) or die "unable to open the file: $file\n";
	while(<IN>){
		chomp;
		my @line = split(/\t/,$_);
		$results{$line[$id-1]} = \@line;
	}
	close IN;
	return \%results;
}

1;