#!/usr/local/bin/perl -w # # Bernd Senf - bsenf@fli-leibniz.de - FLI Leibniz e.V. Jena # # Version 0.1 with some bugs ("sometimes" uncorrect alignment < 10% from reverse sequences) # Please change lines below => my @fnafn = glob("EIK149*.fna"); # Please change lines below => my @qualfn = glob("EIK149*.qual"); # # Version 1.0 - 2007-05-04 # exec external align_to_scf -f 454Contigs.qry -o 454Contigs.aln # nearly perfect conversion # all bases at exact position # all quality values correct # all hidden cuttoff-bases switched on/off # # Version 1.01 - 2007-05-10 # - option [-c cutoff ] added; # Reads short then cutoff are discarded # - option -help added; # - option -i 0/1 added # - option -f file.ace added # - align_to_scf moved from ./align_to_scf to ../bin/align_to_scf # # Version 1.02 - 2007-05-22 # - sometime double rev sequence in case of missing query file # # Version 1.03 - 2007-06-22 # - sometime 1 or 2 empty lines between contig fasta and contig quality values # causes from running runPhoenix or runMapping # # Version 1.04 - 2007-06-25 # - some styling bugs - fixed # - endless loop (i++ missing) in case of no alignment found - fixed # # Version 1.05 - 2007-06-26 # - bug in case of missing sff and missing traces - fixed # - new option [-q qual_file ] added: # quality values from traces readen so from this file # accept different styles of quality files: # # #>name LEN=... (from align_to_scf) # #1 2 3 4 5 # #>name length=....(from consed export) # #1 2 3 4 5 # # - allow negative AF positions # # # Version 1.06 - # - allow double reads with same SFF trace # (same also for read like 'ABCDEFGHIJ12345.1-99' # or like 'ABCDEFGHIJ12345.100-122.fm123' # # Hint: IT IS DIFFERENT to "_to" and "_from" # # Version 1.06.1 - 2008-03-26 # - fixed some bugs with option -i and -c # - add option -x for test/development purposes # Version 1.07 - 2008-03-30 # - keep in caf output same order of contigs and traces as in ace input # # Version 1.08 - 2008-04-14 # - since version 1.1.03 from runAssembly/runMapper QA values contains real # clipping informations # - runs parallel variant of align_to_scf if present # # Version 1.08a - 2008-05-20 # - due to significant decreasing of the run-time from align_to_scf (hints by jb) # parallel variant was removed again # # Version 1.09 - 2008-10-21 # - new feature: add sequence contigXXX as single read # (gap4 calculates his own CONSENSUS sequence and his own CONSENSUS quality values) # # - bug fixed: negative starting position ( value < 1 ) calculated # # Version 1.10 - 2010-12-08 # - new feature add sequence contigXXX as single read --- is now an option '-a' # (because gap4 calculates his own CONSENSUS sequence and his own CONSENSUS quality values) # my $VERSION="1.10"; use Getopt::Long; use strict; #use diagnostics; my %seq; my %contig; my $as_contigs=0; # Number of Contigs from ACE File AS-Line my $as_reads=0; # Number of Reads from ACE File AS-Line my $co_contigs=0; # Number of Contigs from ACE File CO-Lines my $rd_reads=0; # Number or Reads from ACE File RD-Line my $cutoff=1; my $help=0; my $invtrn=1; my $opt_q=''; my $opt_x=0; my $wContigAsRead=0; my $fatal=0; my $StadenID=1; # make caf2gap happy $|=1; # autoflush # # Important files, change this if necessary # my @fnafn = glob("*.fna"); my @qualfn = glob("*.qual"); #my $ALIGNPROG ="$ENV{'HOME'}/ace/bin/align_to_scf"; # external program, chmod 755 ? my $ALIGNPROG ="$ENV{'RPATH'}/align_to_scf"; # external program, chmod 755 ? #my $ALIGNPROG ='/gen/fly/biosw/roche454ace2gap/align_to_scf'; my $CONTIGACE ='454Contigs.ace'; my @contig_order=(); my @traces_order=(); sub usage () { print STDERR "\n"; print STDERR "usage:\n"; print STDERR " roche454ace2caf [-i 0|1 ] [-c cutoff] [-f 454Contigs.ace] [-q 454.qual ] > 454.caf 2>454.err\n"; print STDERR " VERSION: $VERSION\n"; print STDERR "\n"; print STDERR " -f ACE file, Default = '454Contigs.ace'\n"; print STDERR " -c Reads shorter than cutoff are discarded; Default = '1'\n"; print STDERR " -i 0 or 1 Allow partial trace names '_to' '_fm' ; Default = '1'\n"; print STDERR " -g Read quality values from this file\n"; print STDERR " -a Add/duplicate contig as special read\n"; print STDERR " -x Don't read fasta/qual files (only for debugging purposes)\n"; print STDERR " -h|-? This help\n"; print STDERR "\n"; exit; } # usage GetOptions ( "f=s" => \$CONTIGACE, "c=i" => \$cutoff, "i=i" => \$invtrn, "q=s" => \$opt_q, "x=i" => \$opt_x, "a" => \$wContigAsRead, "v|h|help|?" => \$help, ) or usage; usage() if ( $help ); #------------------------------------------------ sub pretty_print_string { my ($seq)=@_; my $len; my $i; $len=length($seq); for ($i = 0; $i < $len; $i+=50) { print substr($seq, $i, 50),"\n"; } } sub pretty_print_hash { my (@qual)=@_; my ($len,$fr,$to); my $max=$#qual+1; $fr=0; $to=50; while ($fr < $max ) { my $line=''; $to=( $to < $max ? $to : $max ); for my $j ( $fr .. $to - 1 ) { if ( $j != $to ) { $line.=$qual[$j] . ' '; } else { $line.=$qual[$j]; } } print "$line\n"; $fr+=50; $to+=50; } } sub rev_comp_seq { my ($seq) = @_; $seq = reverse $seq; $seq =~ tr/acgtACGT/tgcaTGCA/; return $seq; } sub print_quality_4 { my ($len) = @_; my $i=0; my $n=0; for ($i = 0; $i< $len; $i++, $n++ ) { print "4 "; if ( $n == 50 ) { print "\n"; $n=0; } } if ($n != 50 || $n != 0 ) { print "\n" }; print "\n"; } sub print_trace { my $s; my $seqlen; my $trlen; my $scf; my $rseq; my $ql; my $qr; my $forward=1; $s=shift; if ( ! defined $seq{$s}{seq} ) { print STDERR "MISS: SEQ undefined $s\n"; } if ( ! defined $seq{$s}{len} ) { print STDERR "MISS: LEN undefined $s\n"; } if ( ! defined $seq{$s}{strand} ) { print STDERR "MISS: STRAND undefined $s\n"; } $rseq=$seq{$s}{seq}; $seqlen=length($rseq); $scf=$seq{$s}{scf}; print "Sequence : $s\n"; print "Is_read\n"; print "Padded\n"; print "Staden_id $StadenID\n"; $StadenID++; print "SCF_File $scf\n"; print "Template $scf\n"; if ( $seq{$s}{strand} eq 'U' ) { $forward=1; print "Strand Forward\n"; } if ( $seq{$s}{strand} eq 'C' ) { $forward=0; print "Strand Reverse\n"; } if (defined $seq{$s}{qr} and defined $seq{$s}{ql} ) { $qr=$seq{$s}{qr}; $ql=$seq{$s}{ql}; print "Clipping QUAL $ql $qr\n"; } #if ( defined $seq{$s}{trlen} ) { # $trlen=$seq{$s}{trlen}; # print "DNA $s $seqlen\n"; # print "FNA $s $trlen\n"; #} if ( defined $seq{$s}{align} && $#{$seq{$s}{align}} >=0 ) { foreach my $line ( @{$seq{$s}{align}} ) { print "Align_to_SCF $line\n"; } } else { if ( $seqlen > 0 ) { print "Align_to_SCF 1 $seqlen 1 $seqlen\n"; #print "Tag COMM 1 $seqlen UnCorrectMatch\n"; } else { print STDERR "MISS: $s without ALIGN\n"; } } print "\n"; print "DNA : $s\n"; pretty_print_string($rseq); print "\n"; print "BaseQuality : $s\n"; if ( defined($seq{$s}{qual}) ) { my @newqual; my $line; my @ind; my $nind; my @cqual; @cqual=split(/\s+/,$seq{$s}{qual}); if ( defined $seq{$s}{align} ) { my ($acefr,$aceto,$qualfr,$qualto); my $i; my $j; my $nind; my $nfnd; $i=0; $j=0; $nind=0; foreach $line ( @{$seq{$s}{align}} ) { ($acefr,$aceto,$qualfr,$qualto) = (split(/\s+/,$line)); $ind[$nind][0]=$acefr; $ind[$nind][1]=$aceto; $ind[$nind][2]=$qualfr; $ind[$nind][3]=$qualto; $nind++; } for ( $i = 1; $i <= $seqlen ; $i++) { my $err; $nfnd=1; for ($j=0; $nfnd && $j < $nind ; $j++ ) { if ( $nfnd && ( $i >= $ind[$j][0] ) && ( $i <= $ind[$j][1] )) { my $k; my $v; $k = $i - $ind[$j][0] + $ind[$j][2] - 1; if (defined $cqual[$k] ) { $v = $cqual[$k]; } else { $v = 2; }; #print STDERR "MISS : $s : pushqual : qual($i) <= [$k] ",$v,"\n" if ($s eq "EIK149E02BDNCU" ); push @newqual,$v; $nfnd=0; } } if ( $nfnd ) { push @newqual,'4'; } } pretty_print_hash(@newqual); print "\n"; } else { if ( $#cqual == $seqlen ) { pretty_print_hash(@cqual); } else # Seq not aligned { $fatal++; if ( $fatal < 100 ) { print STDERR "MISS $s: LEN trace ($seqlen) != LEN quality ($#cqual) := Value 4\n"; } if ( $fatal == 100) { print STDERR " too much errors here; dont report any more ...\n"; } print_quality_4($seqlen); } print "\n"; } } else { print_quality_4($seqlen); } } #----------------------------------------------------- print STDERR "-" x 30, "\n"; print STDERR "\n"; print STDERR "Options:\n\n"; print STDERR "Cutoff ........................... = $cutoff\n"; print STDERR "Allow partial trace names _to _fm = ", $invtrn ? "Yes\n" : "No\n"; print STDERR "ACE file ......................... = $CONTIGACE\n"; print STDERR "\n"; print STDERR "Check all neccessary 454-Files:\n\n"; # # check all files readable # if ( $opt_q ne '' && -f "$opt_q" ) { undef @qualfn; push (@qualfn,$opt_q); } foreach my $f ( @qualfn,@fnafn,"$CONTIGACE","$ALIGNPROG") { open(FH,"<$f") || die "Can't read \"$f\": $!"; close (FH); print STDERR "? $f\n"; } print STDERR "? cutoff = $cutoff\n"; print STDERR "? allow partial trace names = ", $invtrn ? "YES\n" : "NO\n"; print STDERR "-" x 30, "\n"; #----------------------------------------------------- print STDERR "Reading ACE ...\n"; open (ACE,"<$CONTIGACE") || die "Can't read $CONTIGACE: $!\n"; while( defined($_=) ) { chomp(); # Syntax: # AS # AS 1 102939 # AS 1035 19330 # if (/^AS\s+(\d+)\s+(\d+)/ ) { ($as_contigs,$as_reads) = ($1,$2); print STDERR "AS uncorrect entry: number contigs ($as_contigs,$as_reads)\n" unless (defined($as_contigs) and $as_contigs > 0 ); print STDERR "AS uncorrect entry: number reads ($as_contigs,$as_reads)\n" unless (defined($as_reads) and $as_reads > 0 ); } # Syntax: # RD <# of padded bases> <# of whole read info items> <# of read tags> # # RD D8QSF4J01CU6TZ 96 0 0 # AGGGAGGG*AAG*AAATACACCTATG*CTTCCTCAGTAGGTAAAG***** # ***AAGA*C*TGGG***AACC*ACCACG*CCAGAGTTA*GAAAATA # if (/^RD\s+(\S+)\s+(\d+)/) { # # naming variations prior 1.1.02 # # name = D8QSF4J01CU6TZ.1-20 # name = D8QSF4J01CU6TZ_fm123 # name = D8QSF4J01CU6TZ_to123 # name = D8QSF4J01CU6TZ.1-20.fm123 # +naming variations since 1.1.03 # name = D8QSF4J01CU6TZ.1-20 # name = D8QSF4J01CU6TZ.1-20.to123 my ($name, $len) = ($1, $2); my ($scf,$show); $rd_reads++; # statistics $show=1; $scf=substr($name,0,14); if ( $len < $cutoff ) { print STDERR "Read $rd_reads: too short ($name LEN=$len)\n"; $show=0; } if ( $name =~ /_to/ && ! $invtrn ) { print STDERR "Read $rd_reads: partial trace name ($name LEN=$len)\n"; $show=0; } if ( $name =~ /_fm/ && ! $invtrn ) { print STDERR "Read $rd_reads: partial trace name ($name LEN=$len)\n"; $show=0; } if ( defined $seq{$name} && defined $seq{$name}{len} ) { my $x=$seq{$name}{len}; print STDERR "MISS: Read twice defined: $name (len: $x <=> $len )\n"; } $seq{$name}{len} = $len; $seq{$name}{useit} = $show; $seq{$name}{scf} = $scf; $seq{$name}{ql} = 1; $seq{$name}{qr} = $len; my $s = ""; while () { chomp(); $s .= $_; last if (/^$/); } $s =~ tr/xXnN\*/-/; $seq{$name}{seq} = $s; # Syntax: # # QA # while () { my ($qa_clip_start,$qa_clip_end); chomp(); last if (/^$/); if (/^QA\s+(\d+)\s+(\d+)/) { ($qa_clip_start,$qa_clip_end)=($1,$2); print STDERR "MISS: $name: QA uncorrect entry: qa_clip_start < 1 (QR=$qa_clip_start)\n" unless ( $qa_clip_start > 0 ); print STDERR "MISS: $name: QA uncorrect entry: qa_clip_end < 1 (QL=$qa_clip_end)\n" unless ( $qa_clip_end > 0 ); print STDERR "MISS: $name: QA uncorrect entry: qa_clip_end > len (L=$len,QR=$qa_clip_end)\n" if ( $qa_clip_end > $len ); $seq{$name}{ql}=$qa_clip_start; $seq{$name}{qr}=$qa_clip_end; } } # progress report if ( $as_reads> 1e5 && $rd_reads % 10000 == 0 ) { print STDERR "\rRead $rd_reads / $as_reads"; }; } # QA 1 96 1 96 # DS CHROMAT_FILE: D8QSF4J01CU6TZ PHD_FILE: D8QSF4J01CU6TZ.phd.1 TIME: Thu Jul 27 12:33:48 2000 #... skip # CO <# of bases> <# of reads in contig> <# of base segments in contig> [C=Complement U=Uncomplement] # # CO contig00001 1163 523 89 U # AAA*GAGAGAAGG*AGGgAGGGaGGAAGgAAGGAAGGAGAG*GAAaGAAA # *GAAA*GAGAGAGaGAAAGAAA*GAAA*GG*AAA*GAAGG*AAA*GAAGG # ... # AF D8QSF4J01CZSRB C 209 # AF D8QSF4J01CU6TZ U 205 # AF D8QSF4J01E6204 U 163 # AF D8QSF4J01A09VN U 214 # AF D8QSF4J01D2IAU C 176 # ... if (defined ($_) && /^CO\s+(\S+)\s+(\d+)/) { my ($cname, $len) = ($1, $2); print STDERR "CO uncorrect entry: len ($cname LEN=$len)\n" unless (defined($len) and $len > 0 ) ; $co_contigs++; push (@contig_order,$cname); $contig{$cname}{len} = $2; # Read consensus my $seq = ""; while () { last if (/^$/); chomp; $seq .= $_; } $seq =~ s/\*/-/g; $contig{$cname}{seq} = $seq; # # sometimes 1x NewLine 1xNewLine 1xBQ # sometimes 1x NewLine 1xBQ # my $qual = ""; my $aflines = 0; ; while () { last if ( /^$/ ); next if ( /^BQ/ ); $qual .= $_; } $contig{$cname}{qual} = $qual; # Read AF $aflines=0; while () { chomp(); last if (/^$/ && $aflines ); if (/^AF/) { $aflines++; push(@{$contig{$cname}{AF}}, $_); if ( $_ =~ /^AF\s+(\S+)\s+(\S)\s+(\-?\d+)/ ) { my ($name,$dir,$pos)=($1,$2,$3); print STDERR "AF uncorrect entry: dir ($name,$dir,$pos)\n" unless (defined($dir) and ( $dir =~ m/[U|C]/ )) ; $seq{$name}{strand}=$dir; } else { print STDERR "AF unreadable entry: $_\n"; } } } } } print STDERR "\n" if ($as_reads > 1e5 ); #----------------------------------------------------- print STDERR "Number of Contigs: $co_contigs\n"; print STDERR "Number of Reads : $rd_reads\n"; print STDERR "-" x 30, "\n"; #----------------------------------------------------- my $valid; # maybe skip unexpected entries #----------------------------------------------------- print STDERR "Load QUAL Values of Reads\n"; # # again # foreach my $f ( @qualfn ) { my ($len,$name,%quality,$rdhit,$entry); next if ( $opt_x ); print STDERR "Load $f\n"; open(FH,"<$f") || die "Can't read \"$f\": $!"; $entry=0; $valid=0; while () { chomp; if (/^>(\S+) LEN=(\d+)/ || # >EMDW8LR05C1HB5 LEN=117 QL=5 QR=117 /^>(\S+) length=(\d+) uaccno=/ || # >000086_0088_0326 length=111 uaccno=EIK149E01AH0NS /^>(\S+) length=(\d+) xy=/ || # >EEP4CQW01AK78Q length=115 xy=0124_2664 region=1 run=R_2006_12_05_16_06_16_ /^>(contig\d+).*length=(\d+)\s+numreads=/ # >contig00001 NC_000964, 1..1367 length=1367 numreads=89 ) { ($name,$len)=($1,$2); $quality{$name}{len}=$len; $quality{$name}{qual}=''; $entry++; $valid=1; } elsif ( /^>/ ) { print STDERR "MISS Entry: $_\n"; $valid=0; ; } elsif ( $valid) { $quality{$name}{qual}.=$_; } } close (FH); $rdhit=0; $fatal=0; foreach my $s (keys %seq) { next if ! defined $seq{$s}{seq}; next if ! defined $seq{$s}{len}; if (defined($quality{$s}) && defined($quality{$s}{qual}) ) { if ( defined $seq{$s}{qual} ) # already defined in case of more the one data files ...? { if ($fatal < 10 ) { print STDERR " ? DUP QUAL entry: $s\n"; } if ($fatal == 10) { print STDERR " ... and more\n" } $fatal++; } else { $seq{$s}{qual}=$quality{$s}{qual}; $seq{$s}{qlen}=$quality{$s}{len}; $rdhit++; } } else { my $x=$seq{$s}{scf}; if (defined( $quality{$x}) && defined($quality{$x}{qual}) ) { $rdhit++; $seq{$s}{qual}=$quality{$x}{qual}; $seq{$s}{qlen}=$quality{$x}{len}; } } } if ( $fatal ) { print STDERR " $f used $rdhit/$entry ($fatal/$entry dup)\n";} else { print STDERR " $f used $rdhit/$entry\n"; } } print STDERR "-" x 30, "\n"; #----------------------------------------------------- print STDERR "Load Fasta Values of Reads\n"; # # again # foreach my $f ( @fnafn ) { next if ( $opt_x ); my ($name,$len,$ql,$qr,%trace,$rdhit,$entry); print STDERR "Load $f\n"; open(FH,"<$f") || die "Can't read \"$f\": $!"; $entry=0; $valid=0; while () { chomp; #>EIK149E01ASB1O LEN=118 QL=5 QR=116 #TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG if ( /^>(\S+) LEN=(\d+) QL=(\d+) QR=(\d+)/) { ($name,$len,$ql,$qr)=($1,$2,$3,$4); $trace{$name}{len}=$len; $trace{$name}{seq}=''; $trace{$name}{ql} =$ql; $trace{$name}{qr} =$qr; $entry++; $valid=1; } elsif ( /^>(\S+) length=(\d+) xy=/ || /^>(contig\d+).*length=(\d+)\s+numreads=/ ) { # >EEP4CQW01AK78Q length=115 xy=0124_2664 region=1 run=R_2006_12_05_16_06_16_ # GATATTGGCCTTGGCTTTATCTCAATATTATATGGATCATAGCTGGCAACTAATTCAGTC # CAGTAAATATCCTCAATAGGGAATAATATATGCTT ($name,$len)=($1,$2); $trace{$name}{len}=$len; $trace{$name}{seq}=''; $trace{$name}{ql} =1; $trace{$name}{qr} =$len; $entry++; $valid=1; } elsif ( /^>/ ) { print STDERR "MISS $name = $_\n"; ; # next line $valid=0; } elsif ( $valid ) { $trace{$name}{seq} .= $_; } } close (FH); $rdhit=0; $fatal=0; foreach my $s (keys %seq) { next if ! defined $seq{$s}{seq}; next if ! defined $seq{$s}{len}; if (defined($trace{$s}) && defined($trace{$s}{seq}) ) { if ( defined $seq{$s}{trace} ) { if ($fatal < 10 ) { print STDERR " ? DUP FASTA entry: $s\n"; } if ($fatal == 10) { print STDERR " ... and more\n" } $fatal++; } else { $seq{$s}{trace}=$trace{$s}{seq}; $seq{$s}{trlen}=$trace{$s}{len}; $rdhit++; } } else { my $x=$seq{$s}{scf}; if (defined($trace{$x}) && defined($trace{$x}{seq}) ) { $rdhit++; $seq{$s}{trace}=$trace{$x}{seq}; $seq{$s}{trlen}=$trace{$x}{len}; } } } if ( $fatal ) { print STDERR " $f used $rdhit/$entry ($fatal/$entry dup)\n";} else { print STDERR " $f used $rdhit/$entry\n"; } } $fatal=0; print STDERR "-" x 30, "\n"; #----------------------------------------------------- print STDERR "Start Contig Quality Correction\n"; foreach my $c (keys %contig) { if (defined($contig{$c}{qual})) { my @cqual; my $ba_len; my $co_len; $co_len=$contig{$c}{len}; @cqual=split(/\s+/,$contig{$c}{qual}); $ba_len = $#cqual + 1; if ($co_len != $ba_len ) { my @newqual; my $cqual; my @seq; @seq=split(//,$contig{$c}{seq}); my $i=0; my $q=0; for ($i=0, $q=0; $i < $co_len; $i++ ) { if ( $seq[$i] ne '-' ) { push(@newqual,$cqual[$q]); $q++; } else { push (@newqual,"4"); } } $contig{$c}{qual}=join(" ",@newqual); } else { #print STDERR "Match ContigLength Basequality: ",$c," : $co_len == $ba_len\n"; } } } print STDERR "-" x 30, "\n"; #----------------------------------------------------- print STDERR "Calculate Re-Alignments of ACE to TRACE\n"; my $cnt=0; if ( ! -f "454Contigs.aln" ) { print STDERR "Create 454Contigs.qry\n"; open(FH,">454Contigs.qry") || die "Can't write to 454Contigs.qry:$!"; my $hit = 0; foreach my $s (keys %seq) { if ( defined $seq{$s}{trace} && defined $seq{$s}{seq} && $seq{$s}{len} > 5 && $seq{$s}{len} < 1024 ) { my @align=(); my $sseq; my $tseq; my $ssseq; # second try of $sseq $sseq=$seq{$s}{seq}; $tseq=$seq{$s}{trace}; if ( $seq{$s}{strand} eq 'C' ) { $sseq=rev_comp_seq($sseq); } $hit++; print FH ">$s\n"; print FH "$sseq\n"; print FH "$tseq\n"; } else { if ( $cnt++ < 100 ) { print STDERR "MISS: $s without trace or too short\n"; } $cnt++; } } close (FH); print STDERR " $hit entries - please be patient for align_to_scf ...\n"; my @args=("$ALIGNPROG","-f","454Contigs.qry","-o","454Contigs.aln"); print STDERR "EXEC @args\n"; system(@args) == 0 or die "system @args failed: $?" } else { print STDERR " -> Found 454Contigs.aln\n"; } # # Statistik .... # print STDERR "-" x 30, "\n"; #----------------------------------------------------- print STDERR "\n"; $cnt=0; open(FH,"<454Contigs.qry") || die "Can't open 454Contigs.qry:$!"; while () { $cnt++ if /^>/ }; close (FH); print STDERR " ALIGN_TO_SCF: 454Contigs.qry -> $cnt queries \n"; $cnt=0; open(FH,"<454Contigs.aln") || die "Can't open 454Contigs.aln:$!"; while () { $cnt++ if /^>/ }; close (FH); print STDERR " ALIGN_TO_SCF: 454Contigs.aln -> $cnt results \n"; print STDERR "-" x 30, "\n"; #----------------------------------------------------- print STDERR "\n"; print STDERR "\n"; print STDERR "Parse Alignments 454Contigs.aln\n"; open(ALN,"<454Contigs.aln") || die "Can't open 454Contigs.aln:$!"; while () { $cnt++; chomp(); if (/^>(\S+)/ ) { my $s=$1; if ( not defined $seq{$s} ) { print STDERR "MISS: 454Contigs.aln: Seq $s not defined\n"; next; } while () { $cnt++; chomp(); last if (/^$/); if ( /^Align_to_SCF\s+(\d+ \d+ \d+ \d+)/ ) { push(@{$seq{$s}{align}}, $1); } else { print STDERR "MISS: Unparseable ($s:$cnt): $_"; } } } } close(ALN); print STDERR "-" x 30, "\n"; #----------------------------------------------------- print STDERR "Start Reads Quality/Clipping/FNA Correction\n"; my $hit = 0; foreach my $s (keys %seq) { my $sseq; my ( $ql,$qr,$len); $hit++; $sseq=$seq{$s}{seq}; $ql =$seq{$s}{ql}; $qr =$seq{$s}{qr}; $len =$seq{$s}{len}; if ( $ql > 1 || $qr < $len ) { my ($s1,$s2,$s3); $s1=substr($sseq,0,$ql-1 ); $s2=substr($sseq,$ql-1,$qr-$ql+1 ); $s3=substr($sseq,$qr,$len-$qr); $s1=~tr/ACGT/acgt/; $s3=~tr/ACGT/acgt/; $seq{$s}{seq} = $sseq = $s1 . $s2 . $s3; } if ( ! defined $seq{$s}{strand} ) { print STDERR "MISS: Strand undefined $s\n"; } if ( $seq{$s}{strand} eq 'C' ) { $sseq=$seq{$s}{seq}=rev_comp_seq($sseq); } if ( defined $seq{$s}{trace} ) { my @align=(); my $tseq; my $ssseq; # second try of $sseq $hit++; $tseq=$seq{$s}{trace}; if (defined $seq{$s}{align} ) { @align=@{$seq{$s}{align}}; if ( $#align >= 0 ) { my ($ace1,$ace2,$from,$to); my $news; my $afpos; my $loffs; ($ace1,$ace2,$from,$to)=split(/\s+/,$align[0]); # # !!!! # $seq{$s}{afpos}=$afpos=$from; ## $left-offset mostly 4 # 1 => +0 # 5 => +1 and so on $loffs=$from - 1; if ( $loffs > 0 ) { $news = substr($tseq,0, $loffs); $news =~ tr/ATGC/atgc/; } else { $news =''; } $news.= $sseq; if ( $seq{$s}{strand} ne 'X' ) { my $trlen=$seq{$s}{trlen}; # # @align new setup with offset correction # my @newalign; my $hit=0; foreach my $algline (@align) { ($ace1,$ace2,$from,$to)=split(/\s+/,$algline); if ( $hit == 0 ) { $ace2+=$loffs; $from=1; $hit=1; } else { $ace1+=$loffs; $ace2+=$loffs; } push (@newalign,"$ace1 $ace2 $from $to"); } my $roffs; $roffs=$trlen-$to; if ( $roffs > 0 ) { $ace1=$ace2+1; $ace2=$ace1+$roffs-1; $from=$to+1; $to=$from+$roffs-1; push (@newalign,"$ace1 $ace2 $from $to"); my $ends=substr($seq{$s}{trace},$from-1,$roffs); $ends =~ tr/ATGC/atgc/; $news.=$ends; } @align=@newalign; $seq{$s}{seq}=$news; } else { print STDERR "MISS: $s with none alignment\n"; if ( $afpos > 1 ) { my @newalign; $afpos=$afpos-1; push( @newalign,"1 $from 1 $from"); foreach my $algline (@align) { ($ace1,$ace2,$from,$to)=split(/\s+/,$algline); $ace1+=$afpos; $ace2+=$afpos; push (@newalign,"$ace1 $ace2 $from $to"); } @align=@newalign; } $seq{$s}{seq}=$news; } @{$seq{$s}{align}}=@align; } else { print STDERR "MISS: $s with negative alignment [$#align]\n"; my @string = split(//, $sseq); my %count; $count{'-'}=0; foreach(@string) { $count{$_} ++; } #foreach (keys(%count)) { # print "$_ is $count{$_}\n"; #} print STDERR "MISS: $s : matching failed : $seq{$s}{strand}: $count{'-'}\n"; print STDERR "MISS: $s : ACE1 : $sseq\n"; print STDERR "MISS: $s : FNA : $tseq\n\n"; } } } elsif (defined $seq{$s}{qual} ) { my $ba_len; my $co_len; my @cqual; $co_len=$seq{$s}{len}; @cqual=split(/\s+/,$seq{$s}{qual}); $ba_len = $#cqual + 1; if ($co_len != $ba_len ) { my @newqual; my $cqual; my @seq; @seq=split(//,$seq{$s}{seq}); my $i=0; my $q=0; for ($i=0, $q=0; $i < $co_len; $i++ ) { if ( $seq[$i] ne '-' ) { push(@newqual,$cqual[$q]); $q++; } else { push (@newqual,"4"); } } $seq{$s}{qual}=join(" ",@newqual); } else { #print STDERR "Match [$hit/$rd_reads] ReadLength Basequality: ",$s," : $co_len == $ba_len\n"; } # progress report if ( $rd_reads> 1e4 && $hit % 1000 != 0 ) { print STDERR "\rRead $hit / $as_reads"; }; } } print STDERR "-" x 30, "\n"; #----------------------------------------------------- my $check=1; print STDERR "Selfcheck ($check) - Number of Contigs uniq ?\n"; $check++; if ( $as_contigs != $co_contigs ) { print STDERR "MISS: AS Entry <> Number Contigs found: $as_contigs <> $co_contigs\n"; } print STDERR "-" x 30, "\n"; #----------------------------------------------------- print STDERR "Selfcheck ($check) - Number Bases == Number Quality values (C)?\n"; $check++; foreach my $c (keys %contig) { if (defined($contig{$c}{qual})) { my @cqual; my $co_len; my $ba_len; $co_len=$contig{$c}{len}; @cqual=split(/\s+/,$contig{$c}{qual}); $ba_len = $#cqual + 1; if ($co_len != $ba_len ) { print STDERR "Missmatch: ContigLength <> Basequality: ",$c," : $co_len <> $ba_len\n"; } else { #print STDERR "Match ContigLength Basequality: ",$c," : $co_len == $ba_len\n"; } } } print STDERR "-" x 30, "\n"; #----------------------------------------------------- print STDERR "Selfcheck ($check) - Minimal Informations for Reads ?\n"; $check++; my $maxerr=0; foreach my $s (keys %seq) { if ( ! defined $seq{$s}{qual} ) { print STDERR "MISS $s - no quality values given\n" if ( $maxerr++ < 100 ); } if ( ! defined $seq{$s}{strand} ) { print STDERR "MISS $s - no strand given\n" if ( $maxerr++ < 100 ); } if ( ! defined $seq{$s}{len} ) { print STDERR "MISS $s - no len given\n" if ( $maxerr++ < 100 ); } if ( ! defined $seq{$s}{seq} ) { print STDERR "MISS $s - no seq given\n" if ( $maxerr++ < 100 ); } } if ( $maxerr > 1 ) { print STDERR "MISS ... total $maxerr of $rd_reads reads without important values\n" ; } print STDERR "-" x 30, "\n"; #----------------------------------------------------- print STDERR "Writing CAF\n"; my $cnt_contig=0; my $cnt_traces=0; $fatal=0; foreach my $c (@contig_order) { $cnt_contig++; print "Sequence : $c#C\n"; print "Is_contig\n"; print "Padded\n"; print "Staden_id $StadenID\n"; $StadenID++; @traces_order=(); foreach my $af (@{$contig{$c}{AF}}) { my ($len,$name,$dir,$pos); my $afpos; my ($ql,$qr,$qlen); my ($s1,$s2,$r1,$r2); # Assembled_from NAME S1 S2 R1 R2 ($name,$dir,$pos) = $af =~ /^AF\s+(\S+)\s+(\S)\s+(\-?\d+)/; print STDERR "WARN: Unparseable entry $c: AF: $af\n" unless (defined($name) && defined($dir) && defined($pos) ); # # Skip unwanted sequences # next if ( $seq{$name}{useit} eq 0 ); push (@traces_order,$name); if ( ! defined $seq{$name}{seq} ) { print STDERR "MISS: Contig $c can't found AF $name ($name)\n"; next; } if ( ! defined $seq{$name}{len} ) { print STDERR "MISS: Contig $c can't found AF $name ($name), len==0 ??? \n"; next; } if (defined $seq{$name} ) { $len = $seq{$name}{len}; } else { print STDERR "MISS: undefined seq{name} : $af: $name\n"; }; if ( defined $seq{$name}{afpos} ) { $afpos=$seq{$name}{afpos}; } else { $afpos=1; } $ql =$seq{$name}{ql}; $qr =$seq{$name}{qr}; $qlen=$qr-$ql; # QL == 1 # QR == LEN # if ( $dir eq 'U' ) { printf "Assembled_from %s %d %d %d %d\n", $name, $pos, $pos+$len-1, $afpos, $afpos+$len-1; } # if ( $dir eq 'C' ) { printf "Assembled_from %s %d %d %d %d\n", $name, $pos+$len-1, $pos, $afpos, $afpos+$len-1; } if ( $dir eq 'U' ) { $r1 = $afpos + ( $ql - 1 ); $r2 = $r1 + $qlen; $s1 = $pos + ( $ql - 1 ); $s2 = $s1 + $qlen; } elsif ( $dir eq 'C' ) { $r1=$afpos + ( $len - $qr ) ; $r2=$r1 + $qlen; $s2 = $pos + ( $ql - 1 ); $s1 = $s2 + $qlen; } else { print STDERR "MISS: $name : unkown direction\n"; } printf "Assembled_from %s %d %d %d %d\n", $name, $s1, $s2, $r1, $r2; $seq{$name}{ql}=$r1; $seq{$name}{qr}=$r2; } if ( $wContigAsRead ) { print "Assembled_from $c.ace 1 ",$contig{$c}{len}," 1 " ,$contig{$c}{len},"\n"; print "\nDNA : $c\n"; pretty_print_string( $contig{$c}{seq}); } print "\n"; print "BaseQuality : $c#C\n"; if (defined($contig{$c}{qual})) { my @cqual; @cqual=split(/\s+/,$contig{$c}{qual}); pretty_print_hash(@cqual); print "\n"; } else { print_quality_4($contig{$c}{len}); } foreach my $s (@traces_order) { $cnt_traces++; print_trace($s); } if ( $wContigAsRead ) { # # add contig sequence as reference # my $clen=$contig{$c}{len}; print "Sequence : $c.ace\n"; print "Is_read\n"; print "Padded\n"; print "Strand Forward\n"; print "Align_to_SCF 1 $clen 1 $clen\n"; print "Template $c\n"; print "Clipping QUAL 1 $clen\n"; print "\nDNA : $c.ace\n"; pretty_print_string( $contig{$c}{seq} ); if (defined($contig{$c}{qual})) { my @cqual; print "\n"; print "BaseQuality : $c.ace\n"; @cqual=split(/\s+/,$contig{$c}{qual}); pretty_print_hash(@cqual); print "\n"; } } } printf STDERR " %6d contigs (of %d contigs)\n",$cnt_contig,$co_contigs ; printf STDERR " %6d reads (of %d reads)\n",$cnt_traces,$rd_reads ;