Primitive documentation.

Privates (from "my" definitions)

%CodonTable = ( 'gac', 'D', 'caa', 'Q', 'gca', 'A', 'ctg', 'L', 'gat', 'D', 'cag', 'Q', 'gcc', 'A', 'ctt', 'L', 'gaa', 'E', 'agc', 'S', 'gcg', 'A', 'ata', 'I', 'gag', 'E', 'agt', 'S', 'gct', 'A', 'atc', 'I', 'aga', 'R', 'tca', 'S', 'gga', 'G', 'att', 'I', 'agg', 'R', 'tcc', 'S', 'ggc', 'G', 'cca', 'P', 'cga', 'R', 'tcg', 'S', 'ggg', 'G', 'ccc', 'P', 'cgc', 'R', 'tct', 'S', 'ggt', 'G', 'ccg', 'P', 'cgg', 'R', 'aca', 'T', 'gta', 'V', 'cct', 'P', 'cgt', 'R', 'acc', 'T', 'gtc', 'V', 'atg', 'M', 'aaa', 'K', 'acg', 'T', 'gtg', 'V', 'tgg', 'W', 'aag', 'K', 'act', 'T', 'gtt', 'V', 'tgc', 'C', 'cac', 'H', 'tac', 'Y', 'tta', 'L', 'tgt', 'C', 'cat', 'H', 'tat', 'Y', 'ttg', 'L', 'taa', '/', 'aac', 'N', 'ttc', 'F', 'cta', 'L', 'tag', '/', 'aat', 'N', 'ttt', 'F', 'ctc', 'L', 'tga', '/' )

Name: DoubleHelix - we all love the DNA molecule! Description: This method prints the given sequence as an ASCII text-based graphic depicting a DNA molecule. Enjoy :-) Usage: DoubleHelix($seq); Options: -speed controls the time to display one base-pair. default is 0.05 (second) Author: Kazuharu Arakawa (gaou@sfc.keio.ac.jp)
History: 20060511-01 initial posting

Name: shuffleseq - create randomized sequence with conserved k-mer composition Description: Shuffle and randomize the given sequence, conserving the nucleotide/peptide k-mer content of the original sequence. For k=1, i.e. shuffling sequencing preserving single nucleotide composition, Fisher-Yates Algorithm is employed. For k>1, shuffling preserves all k-mers (all k where k=1~k). For example, k=3 preserves all triplet, doublet, and single nucleotide composition. Algorithm for k-mer preserved shuffling is non-trivial, which is solved by graph theoretical approach with Eulerian random walks in the graph of k-1-mers. See references 3-5 for details of this algorithm. Usage: $shuffled_seq = shuffleseq($gb); Options: -k k-mer to preserve the composition (default: 1) k=1 means only the base composition is preserved and the order is shuffled. k=1 preserves dinucleotide composition as well as the single nucleotide composotion. Similarly, k=n preserves all of 1~n -mers. Note that k < 5 with nucleotide sequence will take several minutes to finish. References: 1. Fisher R.A. and Yates F. (1938) "Example 12", Statistical Tables, London 2. Durstenfeld R. (1964) "Algorithm 235: Random permutation", CACM 7(7):420 3. Jiang M., Anderson J., Gillespie J., and Mayne M. (2008) "uShuffle: a useful ool for shuffling biological sequences while preserving the k-let counts", BMC Bioinformatics 9:192 4. Kandel D., Matias Y., Unver R., and Winker P. (1996) "Shuffling biological sequences", Discrete Applied Mathematics 71(1-3):171-185 5. Propp J.G. and Wilson D.B. (1998) "How to get a perfectly random sample "from a generic Markov chain and generate a random spanning tree of a directed graph", Journal of Algorithms 27(2):170-217 Author: Kazuharu Arakawa (gaou@sfc.keio.ac.jp)
History: 20080421-01 added k-mer count preservation option 20070612-01 initial posting

Name: splitprintseq - format sequence data for printing Description: This method splits the given sequence string in segments of certain length and inserts newline code ("\n") to print the sequence in a formatted way. By default, this function splits the string in segments of 60 letters. This length can be changed by supplying the second argument. Usage : print splitprintseq($seq); # print sequence in a formatted way. or $formatted_seq = splitprintseq($seq, 100); # split into segments of 100 characters and add "\n" to each line. Options: Length of segments can be specified as the second argument. Default is 60 characters to match GenBank. Author: Kazuharu Arakawa (gaou@sfc.keio.ac.jp)
History: 20050116-01 initial posting

Name: to_fasta - output given sequence to a FASTA file Description: This method outputs the given sequence as a FASTA file. Usage : to_fasta($seq, -name=>"My sequence"); # output the sequence to out.fasta or $fasta_string = to_fasta($gb, -output=>"return"); Options: -name string for FASTA header (default: sequence) if the first argument is an instance of new G(), $gb->{LOCUS}->{id} is used as default -length number of characters per line (default: 60) -filename output filename (default: "out.fasta") -output "f" to output to file, and return the filename. "return" to return the fasta as a string Author: Kazuharu Arakawa (gaou@sfc.keio.ac.jp)
History: 20070612-01 added -output option 20050116-01 initial posting

sub DoubleHelix {

    SubOpt::opt_default(speed=>0.05);
    my @args = SubOpt::opt_get(@_);  
    my $gb = SubOpt::opt_as_gb(shift @args);
    my $speed = SubOpt::opt_val("speed");

    $| = 1;

    my $i;
    my (@offset) = qw/1 0 0 0 1 2 3 4 5 5 4 3 2 1 0 0 0 1/;
    my (@dist)   = qw/0 2 3 4 4 4 3 2 0 0 2 3 4 4 4 3 2 0/;

    foreach my $base (split(//, $gb->{SEQ})){
        print "         ", q// /x$offset[($i%scalar(@offset))];
	print uc($base);
	$i ++;
	print q//-/x$dist[($i%scalar(@dist))];
	print uc(complement($base)), "\n";
	select(undef,undef,undef,$speed);
    }

}

sub shuffleseq {

    opt_default("k"=>1);
    my @args = SubOpt::opt_get(@_);
    my $gb = SubOpt::opt_as_gb(shift @args);
    my $k = opt_val("k");

    if($k == 1){
	return join('', shuffle(split(//, $gb->{SEQ})));
    }

    my $graph = {};
    my $revgraph = {};
    my $arborescence = {};

    my $length = length($gb->{SEQ});
    my %oligo = oligomer_counter($gb, -length=>$k);
    
    foreach my $nuc (keys %oligo){
	$graph->{substr($nuc, 0, $k - 1)}->{substr($nuc, -1, 1)} = $oligo{$nuc};
	$revgraph->{substr($nuc, 1, $k - 1)}->{substr($nuc, 0, 1)} = $oligo{$nuc};
    }
    my $numoligos = keys(%$graph);
    undef %oligo;


    msg_error("Compiling arborescene...\n");
    
    my %edged;
    my $count = 1;
    $edged{substr($gb->{SEQ}, - ($k - 1))} ++;
    my %visited = %edged;

    while($numoligos > $count){
	my @keys = keys %edged;
	my $node = $keys[int(rand(@keys))];
	my @nucs = keys %{$revgraph->{$node}};
	my $rnuc = $nucs[int(rand(@nucs))];

	my $newnode = $rnuc . substr($node, 0, $k - 2);
	unless($visited{$newnode}){
	    $count ++;
	    my $first = substr($node, -1, 1);
	    $visited{$newnode} ++;
	    $edged{$newnode} ++;
	    $arborescence->{$newnode} = $first;
	    $graph->{$newnode}->{$first} --;
	}
	delete($revgraph->{$node}->{$rnuc});
	delete($edged{$node}) if(scalar(@nucs) == 1);
    }
    undef $revgraph;

    msg_error("Compilation of arborescence finished.\nNow generating shuffled sequence.\n");

    my $seed = substr($gb->{SEQ}, 0, $k - 1);
    my $seq = $seed;

    while(length($seq) < $length){
	my $num = 0;
	foreach my $val (values %{$graph->{$seed}}){
	    $num += $val;
	}
	if ($num < 1){
	    if(defined $arborescence->{$seed}){
		my $oldseed = $seed;

		$seq .= $arborescence->{$seed};
		$seed = substr($seed, 1) . $arborescence->{$seed};

		delete($arborescence->{$oldseed});

		next;
	    }else{
		die("Dead end: failed to trace the Eulerian. Try shuffling again.");
		#Mathematically, this should not happen.
	    }
	}

	my $rand = int(rand($num));
	foreach my $key (keys %{$graph->{$seed}}){
	    if($rand <= $graph->{$seed}->{$key}){
		$graph->{$seed}->{$key} --;
		delete($graph->{$seed}->{$key}) if($graph->{$seed}->{$key} < 1);
		
		$seed = substr($seed, 1) . $key;
		$seq .= $key;
		    
		last;
	    }else{
		$rand -= $graph->{$seed}->{$key};
	    }
	}
    }

    msg_error("Done.\n");
    return $seq;

}

sub to_fasta {

    SubOpt::opt_default(length=>60, filename=>"out.fasta", name=>"sequence", output=>"f");
    my @args = SubOpt::opt_get(@_);
    my $gb = SubOpt::opt_as_gb(shift @args);
    my $filename = SubOpt::opt_val("filename");
    my $name = SubOpt::opt_val("name");
    my $length = SubOpt::opt_val("length");
    my $output = SubOpt::opt_val("output");

    $name = $gb->{LOCUS}->{id} if($name eq 'sequence' && length $gb->{LOCUS}->{id});

    if($output eq 'f'){
	open(OUT, ">$filename") || die($!);
	printf OUT ">%s\n%s", $name, splitprintseq($gb->{SEQ}, $length);
	close(OUT);
	return $filename;
    }else{
	return sprintf ">%s\n%s", $name, splitprintseq($gb->{SEQ}, $length);
    }

}

my $seq = lc(shift); my $amino = ''; while(3 <= length($seq)){ my $codon = substr($seq, 0, 3); substr($seq, 0, 3) = ''; if ($codon =~ /[^atgc]/){ $amino .= 'X'; }else{ $amino .= $CodonTable{$codon}; } } msg_error("Translation: illegal length.\n") if(length($seq)); return $amino;

DoubleHelix	Description	Code
complement	No description	Code
shuffleseq	Description	Code
splitprintseq	Description	Code
to_fasta	Description	Code
translate	No description	Code