#!/usr/bin/perl
#
# dbSNPtoROD.pl
# v 1.0
# CRB GADIE/GABI/INRA/FRANCE
# http://crb-gadie.inra.fr
# author: Sylvain MARTHEY
# 23/03/2010
#
#
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see .
#
#
=head1 NAME
dbSNPtoROD.pl
=head1 SYNOPSIS
dbSNPtoROD.pl --chr_name --fasta_file --SNP_file --contig_info_file --contig_loc_file --ucsc_out_file [--contig_locus_id_file]
=head1 OPTIONS
--chr_name : chromosome name
--fasta_file : path of fasta dbSNP file, this file should correspond to the selected chromosome
--SNP_file : path of SNP file
--contig_info_file : SNPContigInfo file
--contig_loc_file : SNPContigLoc file
--map_info_file : SNPContigLoc file
--ucsc_out_file : path to the output file
--contig_locus_id_file : SNPContigLocusID file (not implement)
=over
=back
=head1 B
This script can be used to convert NCBI dbSNP dump files to generate the UCSC SNP track (https://cgwb.nci.nih.gov/cgi-bin/hgTables?db=hg18&hgta_group=varRep&hgta_track=snp129&hgta_table=snp129&hgta_doSchema=describe+table+schema)
The generated format can be used to create a .rod file (http://www.broadinstitute.org/gsa/wiki/index.php/The_DBSNP_rod)
=cut
use strict;
use Getopt::Long;
use Pod::Usage;
my $help ;
my $chr_name ;
my $fasta_file ;
my $SNP_file ;
my $contig_info_file ;
my $contig_loc_file ;
my $map_info_file ;
my $contig_locus_id_file ;
my $ucsc_out_file ;
# we retrieve options
GetOptions("help|?" => \$help,
"chr_name:s" => \$chr_name,
"fasta_file:s" => \$fasta_file,
"SNP_file:s" => \$SNP_file,
"contig_info_file:s" => \$contig_info_file,
"contig_loc_file:s" => \$contig_loc_file,
"contig_locus_id_file:s" => \$contig_locus_id_file,
"map_info_file:s" => \$map_info_file,
"ucsc_out_file:s" => \$ucsc_out_file
)
or
pod2usage(-message=>"Try `$0' for more information.", -exitval => 2,-verbose=> 0);
pod2usage(-exitval =>1, -verbose => 2) if ($help);
# on test si les parametres ont bien ete rentres
if ($chr_name eq "")
{
pod2usage(-message=>"--chr_name parameter needed.\nTry `$0 --help' for more information.",
-exitval => 2,-verbose=> 0);
}
if ($fasta_file eq "" || !-f($fasta_file))
{
pod2usage(-message=>"--fasta_file parameter missing or is not a valid file path.\nTry `$0 --help' for more information.",
-exitval => 2,-verbose=> 0);
}
if ($SNP_file eq "" || !-f($SNP_file))
{
pod2usage(-message=>"--SNP_file parameter missing or is not a valid file path.\nTry `$0 --help' for more information.",
-exitval => 2,-verbose=> 0);
}
if ($contig_info_file eq "" || !-f($contig_info_file))
{
pod2usage(-message=>"--contig_info_fil parameter missing or is not a valid file path.\nTry `$0 --help' for more information.",
-exitval => 2,-verbose=> 0);
}
if ($contig_loc_file eq "" || !-f($contig_loc_file))
{
pod2usage(-message=>"--contig_loc_file parameter missing or is not a valid file path.\nTry `$0 --help' for more information.",
-exitval => 2,-verbose=> 0);
}
if ($map_info_file eq "" || !-f($map_info_file))
{
pod2usage(-message=>"--map_info_file parameter missing or is not a valid file path.\nTry `$0 --help' for more information.",
-exitval => 2,-verbose=> 0);
}
# if ($contig_locus_id_file eq "" || !-f($contig_locus_id_file))
# {
# pod2usage(-message=>"--contig_locus_id_file parameter missing or is not a valid file path.\nTry `$0 --help' for more information.",
# -exitval => 2,-verbose=> 0);
# }
my %NCBIdbSNPInfos;
print "extract informations from fasta file\n";
# read the fasta file and store all SNP informations
extractSNPInfosFormFasta( fasta_file => $fasta_file,
results => \%NCBIdbSNPInfos);
print "map informations from SNP file\n";
# read the SNP and store the folowing values: avg_heterozygosity, het_se, validation_status
mapInfosFromTabulateFile( file => $SNP_file,
id => 1,
avg_heterozygosity => 2,
prefix => 'rs',
het_se => 3,
validation_status => 8,
results => \%NCBIdbSNPInfos);
print "map informations from contig_loc_file file\n";
# read the SNPContigLoc and store the folowing values: loc_type, orientation, phys_pos_from, allele
mapInfosFromTabulateFile( file => $contig_loc_file,
id => 2,
prefix => 'rs',
ctg_id => 3,
loc_type => 10,
phys_pos_from => 11,
orientation => 15,
allele => 16,
results => \%NCBIdbSNPInfos);
print "map informations from map_info_file file\n";
# read the SNPContigLoc and store the folowing values: weight
mapInfosFromTabulateFile( file => $map_info_file,
id => 2,
prefix => 'rs',
weight => 6,
results => \%NCBIdbSNPInfos);
print "map informations from contig_locus_id_file file\n";
# # read the SNPContigLocusID and store the folowing values: fxn_class
# mapInfosFromTabulateFile( file => $contig_locus_id_file,
# id => 1,
# prefix => 'rs',
# fxn_class => 12,
# results => \%NCBIdbSNPInfos);
print "map informations from SNPContiInfoFile file\n";
# read the SNPContigInfoFile and store the folowing values: contig_chr, orient
mapContigInfosFromTabulateFile( file => $contig_info_file,
results => \%NCBIdbSNPInfos);
print "convert NCBI dbSNP Infos to UCSC\n";
my %UCSCdbNSP = %{convertNCBIdbToUCSC( NCBI => \%NCBIdbSNPInfos,
bin => 666)};
print "print the ucsc_out_file file\n";
my @rod_columns = ('bin','chrom','chromStart','chromEnd','name','score','strand','refNCBI','refUCSC','observed','molType','class','valid','avHet','avHetSE','func','locType','weight');
open (OUT, ">".$ucsc_out_file) or die "unable to open ucsc_out_file.\n";
foreach my $SNP_name (keys(%UCSCdbNSP)){
my $line;
foreach my $k (@rod_columns){
$line = $line.$UCSCdbNSP{$SNP_name}{$k}."\t";
}
$line =~ s/\t$/\n/;
print OUT $line;
}
close OUT;
###########################################################################################################################################
=begin function
Function : convertNCBIdbToUCSC
Description : convert the NCBI SNP attributs to UCSC SNP track attributs
Usage : convertNCBIdbToUCSC( NCBI => \%hash);
Parameters : NCBI -> hash table which contains the NCBI SNP attributs/values. The key of the hash is the snp Name.
Returns : hash ref. This Hash table contains the keys/values for all the UCSC SNP track attributs.
Author : Sylvain Marthey INRA/FRANCE
Version : v1.0
=end function
=cut
sub convertNCBIdbToUCSC {
my %args = @_;
my %NCBI = %{$args{NCBI}};
my %UCSC;
my @class = ('SNP','DIP','HETEROZYGOUS','STR','NAMED','NO VARIATON','MIXED','MNP');
# foreach variation
foreach my $SNP_name (keys(%NCBI)){
my %SNP;
# convert keys/values
$SNP{bin} = $args{bin};
# chrom fragment
$SNP{chrom} = 'chr'.$NCBI{$SNP_name}{contig_chr};
# start variation
$SNP{chromStart} = $NCBI{$SNP_name}{phys_pos_from};
# stop variation
# calcul depend of the variation type
if($NCBI{$SNP_name}{allele} eq '-'){
$SNP{chromEnd} = $SNP{chromStart};
}else{
$SNP{chromEnd} = $SNP{chromStart}+length($NCBI{$SNP_name}{allele});
}
# name
$SNP{name} = $SNP_name;
# score
$SNP{score} = 0;
# strand
if($NCBI{$SNP_name}{orientation} == 0){
$SNP{strand} = '+';
}else{
$SNP{strand} = '-';
}
my $allele = $NCBI{$SNP_name}{allele};
$allele =~ s/\t/ /g;
# refNCBI
$SNP{refNCBI} = $allele;
# refUCSC (the same as NCBI)
$SNP{refUCSC} = $allele;
# observed alleles
my $alleles = $NCBI{$SNP_name}{alleles};
$alleles =~ s/"//g;
$SNP{observed} = $alleles;
$SNP{observed} =~ s/"//g;
# moltype
$SNP{molType} = $NCBI{$SNP_name}{mol};
$SNP{molType} =~ s/"//g;
# SNP class
# only detect insertions/delections/single/indel/microsats
if(!($alleles =~ m/[^A|T|G|C|\/|-]/) && !($allele =~ m/[^A|T|G|C|\/|-]/)){
my $ins;
my $del;
my @obs = split(/\//,$alleles);
if($allele eq '-'){
# insertion 1
foreach my $ob (@obs){
if(length($ob)>1){
$ins=1;
}
}
}else{
foreach my $ob (@obs){
# deletion 1
if($ob eq '-' || length($ob) < length($allele) || length($ob) < length($allele)){
$del=1;
# insertion 2
}elsif(length($ob) > length($allele) || length($ob) > length($allele)){
$ins=1;
}
}
}
if($ins && !$del){
$SNP{class} = 'insertion';
}elsif(!$ins && $del){
$SNP{class} = 'deletion';
}elsif($ins && $del){
$SNP{class} = 'in-del';
}elsif($allele =~ m/([A|T|G|C][A|T|G|C]){5}/){
$SNP{class} = 'microsatellite';
}else{
$SNP{class} = 'single';
}
}else{
# microsats
if($allele =~ m/.*([[A|T|G|C][A|T|G|C]]){5}.*/ || $alleles =~ m/^\([A|T|G|C][A|T|G|C]\)\d+/){
print "micosat: ".$1."\n";
$SNP{class} = 'microsatellite';
}elsif(uc($alleles) =~ m/INSERTION/ || uc($alleles) =~ m/DELETION/ || uc($allele) =~ m/INSERTION/ || uc($allele) =~ m/DELETION/){
$SNP{class} = 'named';
}else{
$SNP{class} = $class[$NCBI{$SNP_name}{class}];
}
}
# single
# insertion
# delection
# validation statuts
$SNP{valid} = 'unknown';
# avHet
if($NCBI{$SNP_name}{avg_heterozygosity} eq ''){
$SNP{avHet} = 0;
}else{
$SNP{avHet} = $NCBI{$SNP_name}{avg_heterozygosity};
}
# avHetSE
if($NCBI{$SNP_name}{het_se} eq ''){
$SNP{avHetSE} = 0;
}else{
$SNP{avHetSE} = $NCBI{$SNP_name}{het_se};
}
# func
$SNP{func} = 'unknown';
# locType
$SNP{locType} = 'exact';
# weight
$SNP{weight} = $NCBI{$SNP_name}{weight};
$UCSC{$SNP_name} = \%SNP;
}
return \%UCSC;
}
=begin function
Function : extractSNPInfosFormFasta
Description : extract all SNP infos wich are contain in the fasta Header
Usage : extractSNPInfosFormFasta( fasta_file => $file_path,
results => \%hash);
Parameters : file -> path to the fasta file.
results -> hash table which contains the results. The key of the hash is the snpID, the value is a hash table which contain all the attributes of the header
Returns : nothing.
Author : Sylvain Marthey INRA/FRANCE
Version : v1.0
=end function
=cut
sub extractSNPInfosFormFasta {
my %args = @_;
my $file = $args{fasta_file};
my $results = $args{results};
open (IN,$file) or die "unable to open the file: $file\n";
while(){
# next if the line is not a fasta heaser
next if($_ =~ m/^[^\>]/);
chomp;
my $id;
my %infos;
my @line = split(/\|/,$_);
foreach my $el (@line){
# next if the element contains not information
next if(!($el =~ m/\=/));
# detect the informations in element
$el =~ m/^(([^\s]+)\s){0,1}([^\=]+)\=([^\=]+)$/;
# if we detect the snpID
if($1){
if($id){
die "problem with header $_. Ambigous detection of snpID\n";
}else{
$id = $1;
$id =~ s/\s//g;
}
}
next if(!$3);
$infos{$3} = $4;
}
$$results{$id} = \%infos;
}
close (IN)
}
=begin function
Function : mapInfosFromTabulateFile
Description : extract informations from a tabulate file and add them to a has table.
Usage : mapInfosFromTabulateFile( file => file_path (string),
id => column (int),
prefix => (tab),
attribut_1 => column_1 (int),
...
attribut_n => column_n (int),
results => hash (reference to hashtable);
Parameters : file -> path to the tabulate file wich contain foreach line a set of value corresponding to one key.
id -> column which contains the id
# prefix -> add this prefix to the ID from the tabulate file to the results keys
results -> hash table which contains the results of mapping. In the form of hastable:
results {
key1 => { att1 => val1,
...
attN => valN,
}
...
keyN => { att1 => val1,
...
attN => valN,
}
}
XXX => YYY -> XXX is the atribut YYY is the value
Returns : nothing.
Author : Sylvain Marthey INRA/FRANCE
Version : v1.0
=end function
=cut
sub mapInfosFromTabulateFile {
my %args = @_;
my $file = $args{file};
my $id = $args{id};
my $results = $args{results};
my $prefix = $args{prefix};
my %vals;
foreach my $arg (keys(%args)){
next if($arg eq "file" || $arg eq "id" || $arg eq "results" || $arg eq "prefix");
$vals{$arg} = $args{$arg};
}
open (IN,$file) or die "unable to open the file: $file\n";
while(){
chomp;
next if(!$_);
my @line = split(/\t/,$_);
my $key;
if(!$prefix){
next if(!$$results{$line[$id-1]});
$key = $line[$id-1];
}else{
next if(!$$results{$prefix.$line[$id-1]});
$key = $prefix.$line[$id-1];
}
foreach my $k (keys(%vals)){
$$results{$key}{$k} = $line[$vals{$k}-1];
}
}
close (IN);
}
=begin function
Function : mapContigInfosFromTabulateFile
Description : extract contig infos from the file Contiginfo file and map this information to the SNP hash
Usage : mapInfosFromTabulateFile( file => file_path (string),
results => hash (reference to hashtable);
Parameters : file -> contig info file
results -> results hash
Returns : nothing.
Author : Sylvain Marthey INRA/FRANCE
Version : v1.0
=end function
=cut
sub mapContigInfosFromTabulateFile {
my %args = @_;
my $file = $args{file};
my $results = $args{results};
my %contigs_info = %{createHashContigInfo( file => $contig_info_file,
id => 1)};
foreach my $k (keys(%{$results})){
my $ctg = $$results{$k}{ctg_id};
$$results{$k}{contig_chr} = @{$contigs_info{$ctg}}[5];
$$results{$k}{orient} = @{$contigs_info{$ctg}}[8];
}
}
sub createHashContigInfo {
my %args = @_;
my $file = $args{file};
my $id = $args{id};
my %results;
open (IN,$file) or die "unable to open the file: $file\n";
while(){
chomp;
my @line = split(/\t/,$_);
$results{$line[$id-1]} = \@line;
}
close IN;
return \%results;
}
1;