#! /usr/local/bin/perl # ################################################################################ # Requirements # ################################################################################ # require 5.002; #require "GUSI.ph"; #MacOS # # External References # use Carp; use FileHandle; use File::Path; use File::Basename; use Getopt::Std; #UNIX use vars qw($opt_d $opt_n $opt_h); # ################################################################################ # Main program default variables # ################################################################################ # my $max_width = 300; #number of bases scanned up- and downstream of each 5mC # ################################################################################ # Main program global variables # ################################################################################ # my $seq ; my $filelist ; my $file ; my @files ; my $seqname ; my $umgeb_C ; my @umgeb_block ; my @r_umgeb_block; my $umgeb_feld ; my $z = 0; my $pos = 0; my $counter = 0; my $next = 0; my $frame = 0; my $laenge = 0; my $position = 0; my $summe = 0; my $Hsumme = 0; my $max_H = 0; my $C_anzahl = 0; my $c_anzahl = 0; my $a_anzahl = 0; my $t_anzahl = 0; my $g_anzahl = 0; my $n_anzahl = 0; my $f_a = 0; my $f_t = 0; my $f_c = 0; my $f_C = 0; my $f_g = 0; my $e = 0; $| = 1; #print immediately # # MAN page # my $MAN = <; #write all .seq1-files into a list if ($#files <0) {die "Sorry! No files available. Please try option -h.\n"}; print "Scanning information content upstream of 5mC\n"; foreach $filelist(@files) { $seq = "-"; #consequtive transfer of sequences open (SEQUENCE, "<$filelist") || die ("Cannot open file '$filelist'."); print ("."); #show program is running while () { # #remove Newline, make sequences from lines, ignore FASTA-name # $file = $_; chomp $file; unless ($file =~ /^>/){$seq = $seq . $file;} $seq =~ s/-//g; #replace "-" } close SEQUENCE || die ("Cannot close open file '$filelist'."); $laenge = length($seq); $next = $laenge; # # determine global base number # while ($seq =~ /C/g) {$C_anzahl++;} while ($seq =~ /c/g) {$c_anzahl++;} while ($seq =~ /a/g) {$a_anzahl++;} while ($seq =~ /t/g) {$t_anzahl++;} while ($seq =~ /g/g) {$g_anzahl++;} while ($seq =~ /n/g) {$n_anzahl++;} # # find 5mC positions # while ($seq) { $position = rindex ($seq,"C",$next); $next = $position - 1; if ($position == -1) {last;} $umgeb_C = substr($seq,0,$position+1); # #transform surrounding from string to list # @r_umgeb_block = split (//, $umgeb_C); @umgeb_block = reverse(@r_umgeb_block); #reverse list $i = 0; for ( $i=0; $i<($position); #from current 5mC until end $i++ ) { if ($umgeb_block[$i] eq "a") {$umgeb_feld[$i][1]++;} elsif ($umgeb_block[$i] eq "t") {$umgeb_feld[$i][2]++;} elsif ($umgeb_block[$i] eq "g") {$umgeb_feld[$i][3]++;} elsif ($umgeb_block[$i] eq "c") {$umgeb_feld[$i][4]++;} elsif ($umgeb_block[$i] eq "C") {$umgeb_feld[$i][5]++;} elsif ($umgeb_block[$i] eq "n") {$umgeb_feld[$i][6]++;} } } } print "\n"; # # Write results to table file. # $path = `pwd`; chomp $path; if (!defined $opt_n) { ($out_file = $path) =~ s#.*[/:]##; #from path name end to last / or : } else { $opt_n =~ tr/a-zA-Z0-9_\.\///cd; #UNIX $out_file = $opt_n; } if ($out_file !~ /\//) {$out_file = substr ($out_file,0,28);} #UNIX unless ($out_file =~ /\.inf$/i) {$out_file = $out_file.".inf";} open (INFO, ">$out_file") || die ("Cannot open '$out_file'."); # determine globale frequencies of C (represents 5mC),c,t,g,a $f_a = $a_anzahl/($a_anzahl+$t_anzahl+$c_anzahl+$C_anzahl+$g_anzahl); $f_t = $t_anzahl/($a_anzahl+$t_anzahl+$c_anzahl+$C_anzahl+$g_anzahl); $f_c = $c_anzahl/($a_anzahl+$t_anzahl+$c_anzahl+$C_anzahl+$g_anzahl); $f_C = $C_anzahl/($a_anzahl+$t_anzahl+$c_anzahl+$C_anzahl+$g_anzahl); $f_g = $g_anzahl/($a_anzahl+$t_anzahl+$c_anzahl+$C_anzahl+$g_anzahl); printf INFO "%6s %6s %6s %6s %6s %6s %6s %6s %6s %6s\n", "Pos.","f(a)","f(t)","f(g)","f(c)","f(C)","f(n)","R","H'","Cov."; $z = 0; for ( $z=$max_width; $z>0; $z=$z-1 ) { $summe = $umgeb_feld[$z][1] + $umgeb_feld[$z][2] + $umgeb_feld[$z][3] + $umgeb_feld[$z][4] + $umgeb_feld[$z][5] + $umgeb_feld[$z][6]; if ($summe == 0) {last}; $f[$z][1] = $umgeb_feld[$z][1]/$summe; $f[$z][2] = $umgeb_feld[$z][2]/$summe; $f[$z][3] = $umgeb_feld[$z][3]/$summe; $f[$z][4] = $umgeb_feld[$z][4]/$summe; $f[$z][5] = $umgeb_feld[$z][5]/$summe; $f[$z][6] = $umgeb_feld[$z][6]/$summe; if ($umgeb_feld[$z][1] > 0 && $f_a > 0) {$H[$z][1] = $umgeb_feld[$z][1]/$summe*(log($umgeb_feld[$z][1]/$summe)/log(2)); $Hr[$z][1] = $umgeb_feld[$z][1]/$summe*(log(($umgeb_feld[$z][1]/$summe)/$f_a)/log(2))}; if ($umgeb_feld[$z][2] > 0 && $f_t > 0) {$H[$z][2] = $umgeb_feld[$z][2]/$summe*(log($umgeb_feld[$z][2]/$summe)/log(2)); $Hr[$z][2] = $umgeb_feld[$z][2]/$summe*(log(($umgeb_feld[$z][2]/$summe)/$f_t)/log(2))}; if ($umgeb_feld[$z][3] > 0 && $f_g > 0 ) {$H[$z][3] = $umgeb_feld[$z][3]/$summe*(log($umgeb_feld[$z][3]/$summe)/log(2)); $Hr[$z][3] = $umgeb_feld[$z][3]/$summe*(log(($umgeb_feld[$z][3]/$summe)/$f_g)/log(2))}; if ($umgeb_feld[$z][4] > 0 && $f_c > 0) {$H[$z][4] = $umgeb_feld[$z][4]/$summe*(log($umgeb_feld[$z][4]/$summe)/log(2)); $Hr[$z][4] = $umgeb_feld[$z][4]/$summe*(log(($umgeb_feld[$z][4]/$summe)/$f_c)/log(2))}; if ($umgeb_feld[$z][5] > 0 && $f_C > 0) {$H[$z][5] = $umgeb_feld[$z][5]/$summe*(log($umgeb_feld[$z][5]/$summe)/log(2)); $Hr[$z][5] = $umgeb_feld[$z][5]/$summe*(log(($umgeb_feld[$z][5]/$summe)/$f_C)/log(2))}; if ($umgeb_feld[$z][6] > 0 && $f_n > 0) {$H[$z][6] = $umgeb_feld[$z][6]/$summe*(log($umgeb_feld[$z][6]/$summe)/log(2)); $Hr[$z][6] = $umgeb_feld[$z][6]/$summe*(log(($umgeb_feld[$z][6]/$summe)/$f_n)/log(2))}; $Hsumme = $H[$z][1] + $H[$z][2] + $H[$z][3] + $H[$z][4] + $H[$z][5]; $rel_H = $Hr[$z][1] + $Hr[$z][2] + $Hr[$z][3] + $Hr[$z][4] + $Hr[$z][5]; # # "n" for max. entropy not considered:-> binaery logarythm of 1/5 # $max_H = -(log(1/5))/(log(2)); # # since it's difficult to determine exact error, estimation is used: # $e = ((5-1)/2*log(2)/$summe); printf INFO "%6d %6.2f %6.2f %6.2f %6.2f %6.2f %6.2f %6.2f %6.2f %6d\n", "-".$z, $f[$z][1], $f[$z][2], $f[$z][3], $f[$z][4], $f[$z][5], $f[$z][6], $max_H + $Hsumme - $e, $rel_H, $summe; } #---------------------------------------------------------------- #INFORMATION CONTENT DOWNSTREAM OF 5MC-SITES: for ($z=0; $z<=$max_width+1; $z++) { $umgeb_feld[$z][1] = 0; #set variables for a,t,g,c,C and n back to zero $umgeb_feld[$z][2] = 0; $umgeb_feld[$z][3] = 0; $umgeb_feld[$z][4] = 0; $umgeb_feld[$z][5] = 0; $umgeb_feld[$z][6] = 0; $H[$z][1] = 0; $H[$z][2] = 0; $H[$z][3] = 0; $H[$z][4] = 0; $H[$z][5] = 0; $H[$z][6] = 0; $Hr[$z][1] = 0; $Hr[$z][2] = 0; $Hr[$z][3] = 0; $Hr[$z][4] = 0; $Hr[$z][5] = 0; $Hr[$z][6] = 0; $f[$z][1] = 0; $f[$z][2] = 0; $f[$z][3] = 0; $f[$z][4] = 0; $f[$z][5] = 0; $f[$z][6] = 0; } @files = <*.seq1>; #write all .seq1-files into a list print "Scanning information content downstream of 5mC\n"; foreach $filelist(@files) { $seq = "-"; open (SEQUENCE, "<$filelist") || die ("Cannot open file '$filelist'."); print ("."); #show program is running while () { # # read line by line, cut Newline, make sequences from lines # ignore FASTA-names, remove all "-" $file = $_; chomp $file; unless ($file =~ /^>/){$seq = $seq . $file} $seq =~ s/-//g; } close SEQUENCE || die ("Cannot close open file '$filelist'."); $laenge = length($seq); while ($seq) { # # determine stepwise positions of 5mCs $next = $position + 1; $position = index ($seq,"C",$next); if ($position == -1) {last}; $umgeb_C = substr($seq,$position,$laenge); # # transform surrounding from string to list # @umgeb_block = split (//, $umgeb_C); $i = 0; for ( $i=0; $i<($laenge-$position); #from 5mC position to the end $i++ ) { if ($umgeb_block[$i] eq "a") {$umgeb_feld[$i][1]++;} elsif ($umgeb_block[$i] eq "t") {$umgeb_feld[$i][2]++;} elsif ($umgeb_block[$i] eq "g") {$umgeb_feld[$i][3]++;} elsif ($umgeb_block[$i] eq "c") {$umgeb_feld[$i][4]++;} elsif ($umgeb_block[$i] eq "C") {$umgeb_feld[$i][5]++;} elsif ($umgeb_block[$i] eq "n") {$umgeb_feld[$i][6]++;} } } } print "\n"; $z = 0; for ( $z=0; $z<$max_width; $z++ ) { $summe = $umgeb_feld[$z][1] + $umgeb_feld[$z][2] + $umgeb_feld[$z][3] + $umgeb_feld[$z][4] + $umgeb_feld[$z][5] + $umgeb_feld[$z][6]; if ($summe == 0) {last}; $f[$z][1] = $umgeb_feld[$z][1]/$summe; $f[$z][2] = $umgeb_feld[$z][2]/$summe; $f[$z][3] = $umgeb_feld[$z][3]/$summe; $f[$z][4] = $umgeb_feld[$z][4]/$summe; $f[$z][5] = $umgeb_feld[$z][5]/$summe; $f[$z][6] = $umgeb_feld[$z][6]/$summe; if ($umgeb_feld[$z][1] > 0 && $f_a > 0) {$H[$z][1] = $umgeb_feld[$z][1]/$summe*(log($umgeb_feld[$z][1]/$summe)/log(2)); $Hr[$z][1] = $umgeb_feld[$z][1]/$summe*(log(($umgeb_feld[$z][1]/$summe)/$f_a)/log(2))}; if ($umgeb_feld[$z][2] > 0 && $f_t > 0) {$H[$z][2] = $umgeb_feld[$z][2]/$summe*(log($umgeb_feld[$z][2]/$summe)/log(2)); $Hr[$z][2] = $umgeb_feld[$z][2]/$summe*(log(($umgeb_feld[$z][2]/$summe)/$f_t)/log(2))}; if ($umgeb_feld[$z][3] > 0 && $f_g > 0) {$H[$z][3] = $umgeb_feld[$z][3]/$summe*(log($umgeb_feld[$z][3]/$summe)/log(2)); $Hr[$z][3] = $umgeb_feld[$z][3]/$summe*(log(($umgeb_feld[$z][3]/$summe)/$f_g)/log(2))}; if ($umgeb_feld[$z][4] > 0 && $f_c > 0) {$H[$z][4] = $umgeb_feld[$z][4]/$summe*(log($umgeb_feld[$z][4]/$summe)/log(2)); $Hr[$z][4] = $umgeb_feld[$z][4]/$summe*(log(($umgeb_feld[$z][4]/$summe)/$f_c)/log(2))}; if ($umgeb_feld[$z][5] > 0 && $f_C > 0) {$H[$z][5] = $umgeb_feld[$z][5]/$summe*(log($umgeb_feld[$z][5]/$summe)/log(2)); $Hr[$z][5] = $umgeb_feld[$z][5]/$summe*(log(($umgeb_feld[$z][5]/$summe)/$f_C)/log(2))}; if ($umgeb_feld[$z][6] > 0 && $f_n > 0) {$H[$z][6] = $umgeb_feld[$z][6]/$summe*(log($umgeb_feld[$z][6]/$summe)/log(2)); $Hr[$z][6] = $umgeb_feld[$z][6]/$summe*(log(($umgeb_feld[$z][6]/$summe)/$f_n)/log(2))}; $Hsumme = $H[$z][1] + $H[$z][2] + $H[$z][3] + $H[$z][4] + $H[$z][5]; $rel_H = $Hr[$z][1] + $Hr[$z][2] + $Hr[$z][3] + $Hr[$z][4] + $Hr[$z][5]; # # "n" for max. entropy not considered:-> binaery logarythm of 1/5 # $max_H = -(log(1/5))/(log(2)); # # since it's difficult to determine exact error estimation is used: # $e = ((5-1)/2*log(2)/$summe); # #write results into table: # printf INFO "%6d %6.2f %6.2f %6.2f %6.2f %6.2f %6.2f %6.2f %6.2f %6d\n", $z, $f[$z][1], $f[$z][2], $f[$z][3], $f[$z][4], $f[$z][5], $f[$z][6], $max_H + $Hsumme - $e, $rel_H, $summe; } print INFO "\n"; print INFO "Sequenzes analyzed in \n",`pwd`,"\n"; #path of analyzed sequences print INFO "F(a) = ",$f_a,"\n"; # global nucleotide frequencies print INFO "F(t) = ",$f_t,"\n"; print INFO "F(g) = ",$f_g,"\n"; print INFO "F(c) = ",$f_c,"\n"; print INFO "F(C) = ",$f_C,"\n"; print INFO "\n"; print INFO <<'Ende'; ------------------- F(x) = global frequency of a base f(x) = frequency of a base at position Pos. R = decrease in uncertainty at position Pos. in bits (1) - correction factor approximated according to (2) Maximum uncertainty for 5 bases = -(log(1/5))/(log(2)) = aprx. -2.32 H' = relative entropy (3) Takes global base frequency into account. Will be 0 when bases occure at a given position exactly with their global frequency. Cov. = number of bases analysed per position ("Coverage"). This number should be >50 Ref.: (1) Schneider TD, Stephens RM: Nucl.Acid.Res.,16: 6097-6100 (1990) (2) Schneider TD, Stormo, GD, Gold L, Ehrenfeucht A: J.Mol.Biol., 188:415-431 (1986) (3) Durbin R, et.al: Biological sequence analysis: 308ff. (1998) Ende close (INFO) || die ("Cannot close open file '$out_file'"); print "\nResults written into file '$out_file'.\n"; ############################################################################## # SUBROUTINES # # part of the Standard File Package Utility for MacPerl 4.1.1 # # # # 1994.01.05 v4.1.1 Matthias Neeracher # # Minor changes to reflect future plans for standard file support. # # # # 1993.10.27 v1.2 wm # # Change the calling syntax to adopt the 4.1.0 release. # # # # 1993.10.19 v1.1 wm # # convert for 4.1b6 # # # # 1993.8.10 V1.0 # # Watanabe Maki (Watanabe.Maki@tko.dec.com) # # # ############################################################################## # Name # PutFile/GetNewFile # Syntax # $filename = &PutFile($prompt [, $default]); # $filename = &GetNewFile($prompt [, $default]); # Description # Query a new file name to user by Standard File Dialog Box. # $prompt is a prompt sting on the dialog box. # $default is a default file name. # # sub PutFile { # local($prompt, $default) = @_; # # &MacPerl'Choose( # &GUSI'AF_FILE, # domain # 0, # type # $prompt, # prompt # "", # constraint # &GUSI'CHOOSE_NEW + ($default ? # &GUSI'CHOOSE_DEFAULT : 0), # # flag # $default # default filename # ); # } ###### # Name # GetFolder # Syntax # $foldername = &GetFolder($prompt [, $default]); # Description # Query a folder name to user by Standard File Dialog Box. # $default is the default dialog # # sub GetFolder { # local($prompt, $default) = @_; # # &MacPerl'Choose( # &GUSI'AF_FILE, # domain # 0, # type # $prompt, # prompt # "", # constraint # &GUSI'CHOOSE_DIR + ($default ? # &GUSI'CHOOSE_DEFAULT : 0), # # flag # $default # ); # } #