#!/usr/bin/perl # # dbSNPtoROD.pl # v 1.0 # CRB GADIE/GABI/INRA/FRANCE # http://crb-gadie.inra.fr # author: Sylvain MARTHEY # 23/03/2010 # # # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see . # # =head1 NAME dbSNPtoROD.pl =head1 SYNOPSIS dbSNPtoROD.pl --chr_name --fasta_file --SNP_file --contig_info_file --contig_loc_file --ucsc_out_file [--contig_locus_id_file] =head1 OPTIONS --chr_name : chromosome name --fasta_file : path of fasta dbSNP file, this file should correspond to the selected chromosome --SNP_file : path of SNP file --contig_info_file : SNPContigInfo file --contig_loc_file : SNPContigLoc file --map_info_file : SNPContigLoc file --ucsc_out_file : path to the output file --contig_locus_id_file : SNPContigLocusID file (not implement) =over =back =head1 B This script can be used to convert NCBI dbSNP dump files to generate the UCSC SNP track (https://cgwb.nci.nih.gov/cgi-bin/hgTables?db=hg18&hgta_group=varRep&hgta_track=snp129&hgta_table=snp129&hgta_doSchema=describe+table+schema) The generated format can be used to create a .rod file (http://www.broadinstitute.org/gsa/wiki/index.php/The_DBSNP_rod) =cut use strict; use Getopt::Long; use Pod::Usage; my $help ; my $chr_name ; my $fasta_file ; my $SNP_file ; my $contig_info_file ; my $contig_loc_file ; my $map_info_file ; my $contig_locus_id_file ; my $ucsc_out_file ; # we retrieve options GetOptions("help|?" => \$help, "chr_name:s" => \$chr_name, "fasta_file:s" => \$fasta_file, "SNP_file:s" => \$SNP_file, "contig_info_file:s" => \$contig_info_file, "contig_loc_file:s" => \$contig_loc_file, "contig_locus_id_file:s" => \$contig_locus_id_file, "map_info_file:s" => \$map_info_file, "ucsc_out_file:s" => \$ucsc_out_file ) or pod2usage(-message=>"Try `$0' for more information.", -exitval => 2,-verbose=> 0); pod2usage(-exitval =>1, -verbose => 2) if ($help); # on test si les parametres ont bien ete rentres if ($chr_name eq "") { pod2usage(-message=>"--chr_name parameter needed.\nTry `$0 --help' for more information.", -exitval => 2,-verbose=> 0); } if ($fasta_file eq "" || !-f($fasta_file)) { pod2usage(-message=>"--fasta_file parameter missing or is not a valid file path.\nTry `$0 --help' for more information.", -exitval => 2,-verbose=> 0); } if ($SNP_file eq "" || !-f($SNP_file)) { pod2usage(-message=>"--SNP_file parameter missing or is not a valid file path.\nTry `$0 --help' for more information.", -exitval => 2,-verbose=> 0); } if ($contig_info_file eq "" || !-f($contig_info_file)) { pod2usage(-message=>"--contig_info_fil parameter missing or is not a valid file path.\nTry `$0 --help' for more information.", -exitval => 2,-verbose=> 0); } if ($contig_loc_file eq "" || !-f($contig_loc_file)) { pod2usage(-message=>"--contig_loc_file parameter missing or is not a valid file path.\nTry `$0 --help' for more information.", -exitval => 2,-verbose=> 0); } if ($map_info_file eq "" || !-f($map_info_file)) { pod2usage(-message=>"--map_info_file parameter missing or is not a valid file path.\nTry `$0 --help' for more information.", -exitval => 2,-verbose=> 0); } # if ($contig_locus_id_file eq "" || !-f($contig_locus_id_file)) # { # pod2usage(-message=>"--contig_locus_id_file parameter missing or is not a valid file path.\nTry `$0 --help' for more information.", # -exitval => 2,-verbose=> 0); # } my %NCBIdbSNPInfos; print "extract informations from fasta file\n"; # read the fasta file and store all SNP informations extractSNPInfosFormFasta( fasta_file => $fasta_file, results => \%NCBIdbSNPInfos); print "map informations from SNP file\n"; # read the SNP and store the folowing values: avg_heterozygosity, het_se, validation_status mapInfosFromTabulateFile( file => $SNP_file, id => 1, avg_heterozygosity => 2, prefix => 'rs', het_se => 3, validation_status => 8, results => \%NCBIdbSNPInfos); print "map informations from contig_loc_file file\n"; # read the SNPContigLoc and store the folowing values: loc_type, orientation, phys_pos_from, allele mapInfosFromTabulateFile( file => $contig_loc_file, id => 2, prefix => 'rs', ctg_id => 3, loc_type => 10, phys_pos_from => 11, orientation => 15, allele => 16, results => \%NCBIdbSNPInfos); print "map informations from map_info_file file\n"; # read the SNPContigLoc and store the folowing values: weight mapInfosFromTabulateFile( file => $map_info_file, id => 2, prefix => 'rs', weight => 6, results => \%NCBIdbSNPInfos); print "map informations from contig_locus_id_file file\n"; # # read the SNPContigLocusID and store the folowing values: fxn_class # mapInfosFromTabulateFile( file => $contig_locus_id_file, # id => 1, # prefix => 'rs', # fxn_class => 12, # results => \%NCBIdbSNPInfos); print "map informations from SNPContiInfoFile file\n"; # read the SNPContigInfoFile and store the folowing values: contig_chr, orient mapContigInfosFromTabulateFile( file => $contig_info_file, results => \%NCBIdbSNPInfos); print "convert NCBI dbSNP Infos to UCSC\n"; my %UCSCdbNSP = %{convertNCBIdbToUCSC( NCBI => \%NCBIdbSNPInfos, bin => 666)}; print "print the ucsc_out_file file\n"; my @rod_columns = ('bin','chrom','chromStart','chromEnd','name','score','strand','refNCBI','refUCSC','observed','molType','class','valid','avHet','avHetSE','func','locType','weight'); open (OUT, ">".$ucsc_out_file) or die "unable to open ucsc_out_file.\n"; foreach my $SNP_name (keys(%UCSCdbNSP)){ my $line; foreach my $k (@rod_columns){ $line = $line.$UCSCdbNSP{$SNP_name}{$k}."\t"; } $line =~ s/\t$/\n/; print OUT $line; } close OUT; ########################################################################################################################################### =begin function Function : convertNCBIdbToUCSC Description : convert the NCBI SNP attributs to UCSC SNP track attributs Usage : convertNCBIdbToUCSC( NCBI => \%hash); Parameters : NCBI -> hash table which contains the NCBI SNP attributs/values. The key of the hash is the snp Name. Returns : hash ref. This Hash table contains the keys/values for all the UCSC SNP track attributs. Author : Sylvain Marthey INRA/FRANCE Version : v1.0 =end function =cut sub convertNCBIdbToUCSC { my %args = @_; my %NCBI = %{$args{NCBI}}; my %UCSC; my @class = ('SNP','DIP','HETEROZYGOUS','STR','NAMED','NO VARIATON','MIXED','MNP'); # foreach variation foreach my $SNP_name (keys(%NCBI)){ my %SNP; # convert keys/values $SNP{bin} = $args{bin}; # chrom fragment $SNP{chrom} = 'chr'.$NCBI{$SNP_name}{contig_chr}; # start variation $SNP{chromStart} = $NCBI{$SNP_name}{phys_pos_from}; # stop variation # calcul depend of the variation type if($NCBI{$SNP_name}{allele} eq '-'){ $SNP{chromEnd} = $SNP{chromStart}; }else{ $SNP{chromEnd} = $SNP{chromStart}+length($NCBI{$SNP_name}{allele}); } # name $SNP{name} = $SNP_name; # score $SNP{score} = 0; # strand if($NCBI{$SNP_name}{orientation} == 0){ $SNP{strand} = '+'; }else{ $SNP{strand} = '-'; } my $allele = $NCBI{$SNP_name}{allele}; $allele =~ s/\t/ /g; # refNCBI $SNP{refNCBI} = $allele; # refUCSC (the same as NCBI) $SNP{refUCSC} = $allele; # observed alleles my $alleles = $NCBI{$SNP_name}{alleles}; $alleles =~ s/"//g; $SNP{observed} = $alleles; $SNP{observed} =~ s/"//g; # moltype $SNP{molType} = $NCBI{$SNP_name}{mol}; $SNP{molType} =~ s/"//g; # SNP class # only detect insertions/delections/single/indel/microsats if(!($alleles =~ m/[^A|T|G|C|\/|-]/) && !($allele =~ m/[^A|T|G|C|\/|-]/)){ my $ins; my $del; my @obs = split(/\//,$alleles); if($allele eq '-'){ # insertion 1 foreach my $ob (@obs){ if(length($ob)>1){ $ins=1; } } }else{ foreach my $ob (@obs){ # deletion 1 if($ob eq '-' || length($ob) < length($allele) || length($ob) < length($allele)){ $del=1; # insertion 2 }elsif(length($ob) > length($allele) || length($ob) > length($allele)){ $ins=1; } } } if($ins && !$del){ $SNP{class} = 'insertion'; }elsif(!$ins && $del){ $SNP{class} = 'deletion'; }elsif($ins && $del){ $SNP{class} = 'in-del'; }elsif($allele =~ m/([A|T|G|C][A|T|G|C]){5}/){ $SNP{class} = 'microsatellite'; }else{ $SNP{class} = 'single'; } }else{ # microsats if($allele =~ m/.*([[A|T|G|C][A|T|G|C]]){5}.*/ || $alleles =~ m/^\([A|T|G|C][A|T|G|C]\)\d+/){ print "micosat: ".$1."\n"; $SNP{class} = 'microsatellite'; }elsif(uc($alleles) =~ m/INSERTION/ || uc($alleles) =~ m/DELETION/ || uc($allele) =~ m/INSERTION/ || uc($allele) =~ m/DELETION/){ $SNP{class} = 'named'; }else{ $SNP{class} = $class[$NCBI{$SNP_name}{class}]; } } # single # insertion # delection # validation statuts $SNP{valid} = 'unknown'; # avHet if($NCBI{$SNP_name}{avg_heterozygosity} eq ''){ $SNP{avHet} = 0; }else{ $SNP{avHet} = $NCBI{$SNP_name}{avg_heterozygosity}; } # avHetSE if($NCBI{$SNP_name}{het_se} eq ''){ $SNP{avHetSE} = 0; }else{ $SNP{avHetSE} = $NCBI{$SNP_name}{het_se}; } # func $SNP{func} = 'unknown'; # locType $SNP{locType} = 'exact'; # weight $SNP{weight} = $NCBI{$SNP_name}{weight}; $UCSC{$SNP_name} = \%SNP; } return \%UCSC; } =begin function Function : extractSNPInfosFormFasta Description : extract all SNP infos wich are contain in the fasta Header Usage : extractSNPInfosFormFasta( fasta_file => $file_path, results => \%hash); Parameters : file -> path to the fasta file. results -> hash table which contains the results. The key of the hash is the snpID, the value is a hash table which contain all the attributes of the header Returns : nothing. Author : Sylvain Marthey INRA/FRANCE Version : v1.0 =end function =cut sub extractSNPInfosFormFasta { my %args = @_; my $file = $args{fasta_file}; my $results = $args{results}; open (IN,$file) or die "unable to open the file: $file\n"; while(){ # next if the line is not a fasta heaser next if($_ =~ m/^[^\>]/); chomp; my $id; my %infos; my @line = split(/\|/,$_); foreach my $el (@line){ # next if the element contains not information next if(!($el =~ m/\=/)); # detect the informations in element $el =~ m/^(([^\s]+)\s){0,1}([^\=]+)\=([^\=]+)$/; # if we detect the snpID if($1){ if($id){ die "problem with header $_. Ambigous detection of snpID\n"; }else{ $id = $1; $id =~ s/\s//g; } } next if(!$3); $infos{$3} = $4; } $$results{$id} = \%infos; } close (IN) } =begin function Function : mapInfosFromTabulateFile Description : extract informations from a tabulate file and add them to a has table. Usage : mapInfosFromTabulateFile( file => file_path (string), id => column (int), prefix => (tab), attribut_1 => column_1 (int), ... attribut_n => column_n (int), results => hash (reference to hashtable); Parameters : file -> path to the tabulate file wich contain foreach line a set of value corresponding to one key. id -> column which contains the id # prefix -> add this prefix to the ID from the tabulate file to the results keys results -> hash table which contains the results of mapping. In the form of hastable: results { key1 => { att1 => val1, ... attN => valN, } ... keyN => { att1 => val1, ... attN => valN, } } XXX => YYY -> XXX is the atribut YYY is the value Returns : nothing. Author : Sylvain Marthey INRA/FRANCE Version : v1.0 =end function =cut sub mapInfosFromTabulateFile { my %args = @_; my $file = $args{file}; my $id = $args{id}; my $results = $args{results}; my $prefix = $args{prefix}; my %vals; foreach my $arg (keys(%args)){ next if($arg eq "file" || $arg eq "id" || $arg eq "results" || $arg eq "prefix"); $vals{$arg} = $args{$arg}; } open (IN,$file) or die "unable to open the file: $file\n"; while(){ chomp; next if(!$_); my @line = split(/\t/,$_); my $key; if(!$prefix){ next if(!$$results{$line[$id-1]}); $key = $line[$id-1]; }else{ next if(!$$results{$prefix.$line[$id-1]}); $key = $prefix.$line[$id-1]; } foreach my $k (keys(%vals)){ $$results{$key}{$k} = $line[$vals{$k}-1]; } } close (IN); } =begin function Function : mapContigInfosFromTabulateFile Description : extract contig infos from the file Contiginfo file and map this information to the SNP hash Usage : mapInfosFromTabulateFile( file => file_path (string), results => hash (reference to hashtable); Parameters : file -> contig info file results -> results hash Returns : nothing. Author : Sylvain Marthey INRA/FRANCE Version : v1.0 =end function =cut sub mapContigInfosFromTabulateFile { my %args = @_; my $file = $args{file}; my $results = $args{results}; my %contigs_info = %{createHashContigInfo( file => $contig_info_file, id => 1)}; foreach my $k (keys(%{$results})){ my $ctg = $$results{$k}{ctg_id}; $$results{$k}{contig_chr} = @{$contigs_info{$ctg}}[5]; $$results{$k}{orient} = @{$contigs_info{$ctg}}[8]; } } sub createHashContigInfo { my %args = @_; my $file = $args{file}; my $id = $args{id}; my %results; open (IN,$file) or die "unable to open the file: $file\n"; while(){ chomp; my @line = split(/\t/,$_); $results{$line[$id-1]} = \@line; } close IN; return \%results; } 1;