#!/usr/bin/env perl # # encodename.pl -- see below or run w/o arguments for switch list # if (!@ARGV) { print <) { if (!$macfile && /\r/){ # if you find mac cr in line print "Macfile detected.\n" unless $textout; $/="\r"; # set indicator to mac CR $macfile=1; seek FILE, 0,0; next; } # next if /^\s*$/; # next if /^\s*#/; # comment line skip if (/^>/) { die "First line super long -- try running with Mac option on/off (-m) ?" if (length($_)>250); # $name = $_; if (/^>DL;/ || /^>P1;\ */){ print "NBRF format detected.\n" unless ($nbrf || $textout); $nbrf=1; ($name, $nothing) = ($_ =~ /^>\w\w;\ *(.*)(\r|\n)/ ) ; # print "$name\n"; $noname=; #grab line after next } else { #normal fasta ($name) = ($_ =~ /^\>(.*)$/); } # print "N$name\n"; $name =~ s/\t/_/g ; $name =~ s/(\r|\n)//g ; push (@names,$name); # print "name: $name"; $numotu++; } #need a provision for blank lines before first name elsif ($name ne "") { chomp; $seq = $_; $seq =~ s/\s//g; if ($nbrf){ $seq =~ s/\*//g; } $seqs{$name} .= $seq; } } # while file w/in fasta $numsite = length($seqs{$names[0]}); # print "numsite: $numsite\n"; close FILE; } else { # not fasta, so must be phylip **interleaved** # can write seq or interleaved, but only reads interleaved.. $notblank=1; while ($notblank) { # first line $_=; $notblank = 0 unless /^\s*$/; # white line skip $notblank = 0 unless /^\s*#/; # comment line skip } chomp; die "First line too long -- try running with Mac option on/off (-m) ?" if (length($_)>2500); ($numotu,$numsite) = ( $_ =~ /(\d+)\s+(\d+)/ ); $i=0; # print "$i $name\n"; # print "$seq\n"; die "Format error: Phylip files must begin with numbers of taxa and sites: $_\n" unless ($numotu>0 and $numsite>1) ; while ( ($i < $numotu) && ($ticker++ < 10000) ){ $_ = ; next if /^\s*$/; # white line skip next if /^\s*#/; # comment line skip chomp; ($name,$seq) = ( $_ =~ /\s*(\S*)\s*(.*)$/); # print "$i $name\n"; # print "$seq\n"; #temp cheat changing name >16 die "name abnormal or too long (>10): $_\n" unless ($name and (length($name)<18)); die "Identifier \"$name\" is double defined\n" if grep(/^$name$/,@names); push(@names,$name); # print "hash: $names[$i]\n"; $seq =~ s/\s//g; $seqs{$name} .= $seq; # $infos{$name} = $info; $i++; } # while die "\ngot stuck in a loop near 68\n" if ($ticker>9999); $ticker=0; $i=0; $leng = length($seqs{$names[$i]}); # print "\nLENGTH: $leng \n"; while(($leng < $numsite) && ($ticker++<10000)){ $i=0; while (($i < $numotu) && ($ticker++ < 10000)) { $_ = ; next if /^\s*$/; # white line skip chomp; #next if /^\s*#/; # comment line skip s/\s//g; $seq = $_; $seqs{$names[$i]} .= $seq; $leng = length($seqs{$names[$i]}); # print "$names[$i] $leng $seq\n"; die "$names[$i]: abnormal sequence size, $leng sites\n" if ($leng > $numsite); $i++; } } print "$leng\n"; die "\ngot stuck in a loop near 210\n" if ($ticker>9999); close FILE; } # end not fasta $/="\n"; # set indicator back to unix CR # generate ctxnames here $ctxi=1; foreach $name (@names){ $ctxname{$name}= sprintf("$root%03d",$ctxi++); if ((!$fastaout) && (!$pad) && (length($seqs{$name}) != $numsite)){ die "\n** Sorry, can't output phylip format with variable length sequences\nTry pad option (-x) or fasta output format (-f)\n"; } # for debugging... # $tl=length($seqs{$name}) ; # print"name: $ctxname{$name} len: $tl - maxlen: $maxlen\n"; # if ($tl > $maxlen ){ # print "resetting maxlen\n"; # $maxlen=length($seqs{$name}) ; # # } } $fileroot=$file; $fileroot =~ s/\.\w+$//; $outroot="phy"; $outroot="fta" if ($fastaout); $outfile = $fileroot . ".$root.$outroot"; if ((-e $outfile) && (!$textout)) { print "\n*** File $outfile exists.\n Overwrite? ([y]/n/new_filename): "; $answer = ; chomp $answer; if (length($answer) <4) { #anything more than 3 chars is interpreted as a file name if ( $answer =~ /^(N|n)/) { die "Gracefully exiting without overwrite."; } else { print "Overwriting... \n"; } } else { $outfile = $answer; $lutfile=$outfile; $lutfile =~ s/\.\w+$/\.lut/; } } # print "\nSaving to $outfile \n"; # OUTPUT OPTIONS if (!$textout){ open (OUTFILE, ">$outfile"); select (OUTFILE); } if ($fastaout) { foreach $name (@names) { if ($convert){ printf (">%-10s\n",$name); }else{ printf (">%-10s\n",$ctxname{$name}); } $seq = $seqs{$name}; $totalsite=length($seq); if (($pad) && ($totalsite < $maxlen)) { for ($di=$totalsite; $di<$maxlen; $di += 1){ $seq .= "-"; } } $totalsite=length($seq); $line = substr($seq, 0, 60); print "$line\n"; for ($offset = 60; $offset<$totalsite +1; $offset += 60) { $line = substr($seq, $offset, 60); if ($line ne "" ){ print "$line\n"; } } } } else { # not fasta, phylip of one kind or another $numsite2 = length($seqs{$names[0]}); if (not($interleave)) { print " $numotu $numsite2\n"; foreach $name (@names) { if ($convert){ printf ("%-14s",$name); }else{ printf ("%-14s",$ctxname{$name}); } $seq = $seqs{$name}; $line = substr($seq, 0, 50); print "$line\n"; for ($offset = 50; $offset<($numsite2+1); $offset += 50) { if ($offset < length($seq)){ $line = substr($seq, $offset, 50); print " "; print "$line\n"; } } } } # end not interleave else{ # interlaced phylip # print $datainfo ? "$numotu2 $numsite2 $datainfo\n" : "$numotu2 $numsite2\n"; print " $numotu $numsite2\n"; foreach $name (@names) { if ($convert){ printf ("%-14s",$name); }else{ printf ("%-14s",$ctxname{$name}); } $seq = $seqs{$name}; $line = substr($seq, 0, 50); print "$line\n"; } #end foreach print "\n"; for ($offset = 50; $offset<($numsite2 +1); $offset += 50) { foreach $name (@names) { #printf("%-3d %-10s %s\n", $num, $name, $info); $seq = $seqs{$name}; if ($offset < length($seq)){ $line = substr($seq, $offset, 50); print " "; print "$line\n"; } } print "\n"; } } } # end phylip formats (else fastaout) close(OUTFILE) unless $textout; select (STDOUT); if ($numotu>0){ print "Saved $numotu sequences to: $outfile...\n" unless ($textout); }else{ print "No sequences found in data file. Check format...\n"; } if ($makelut){ if ($lutfile eq "") { $lutfile = $fileroot . ".$root.lut"; } open(LUTFILE, ">$lutfile") || die "can't open output file"; foreach $name (@names){ $nt = sprintf ("%-10s",$name); print LUTFILE "$ctxname{$name}\t$nt\n"; } close (LUTFILE); print "Created lookup file: $lutfile\n" unless ($textout); } # makelut