sub signature
{ &opt_default(output=>'stdout',filename=>'signature.csv',wordlength=>2,start=>1,end=>0);
my @args = opt_get(@_);
my $gb = opt_as_gb(shift @args);
my $output = opt_val("output");
my $filename = opt_val("filename");
my $wordlength = opt_val("wordlength");
my $start = opt_val("start");
my $end = opt_val("end");
my %mono;
my %obs;
my %exp;
my %oe;
$end = length($gb->{SEQ}) unless($end);
my $seq = $gb->getseq($start, $end);
my $seq2 = reverse($seq);
$seq2 =~ tr [acgturymkdhbvwsn] [tgcaayrkmhdvbwsn];
for(my $i; $i < length($seq) - $wordlength; $i ++){
$mono{substr($seq, $i, 1)} ++;
$mono{substr($seq2, $i, 1)} ++;
$obs{substr($seq, $i, $wordlength)} ++;
$obs{substr($seq2, $i, $wordlength)} ++;
}
my $total;
foreach (keys %mono){ $total += $mono{$_}; }
foreach (keys %mono){ $mono{$_} = $mono{$_} / $total; } $total = 0; foreach (keys %obs){ $total += $obs{$_}; }
foreach (keys %obs){ $obs{$_} = $obs{$_} / $total; } foreach (keys %obs){ delete $obs{$_} if($_ =~ /[^acgt]/); } foreach (keys %obs){ $exp{$_} = 1; for(my $i = 0; $i < $wordlength; $i ++){ $exp{$_} *= $mono{substr($_,$i,1)}; }
}
foreach (keys %obs){ $oe{$_} = sprintf("%.3f", $obs{$_} / $exp{$_}); }
$gb->{"$start..$end"}->{signature} = \%oe; # input in the G instance
my @allkey = qw(a c g t); for(1..($wordlength-1)){
my @tmp = ();
for my $base (qw(a c g t)){
foreach my $key (@allkey){ push(@tmp, $key.$base); }
}
@allkey = @tmp;
}
@allkey = sort @allkey;
if($output =~ /stdout/){
msg_send("\n\nstart\tend\t"); foreach(@allkey){ msg_send("$_\t"); }
msg_send("\n$start\t$end\t"); foreach(@allkey){ msg_send("$oe{$_}\t"); }
}
if($output eq 'f'){
mkdir ("data", 0777);
open(FILE,">>data/$filename");
unless($gb->{"header_signature"}){
print FILE "start,end,";
foreach(@allkey){ print FILE "$_,"; }
$gb->{"header_signature"} = 1;
}
print FILE "\n$start,$end,";
foreach(@allkey){ print FILE (exists $oe{$_}) ? "$oe{$_}," : "0,"; }
close(FILE);
}
}
=head2 palindrome
Name: palindrome - searches palindrome sequences
Description:
Searches palindrome sequences
Usage:
palindrome(sequence);
Options:
-shortest shortest palindrome to search (default:4)
-loop longest stem loop to allow (default: 0)
-gtmatch if 1, allows g-t match (default: 0)
-output "f" for file output
Author:
Kazuharu Arakawa (gaou@sfc.keio.ac.jp)
History:
20010829-01 initial posting
=cut
sub palindrome {
&opt_default(gtmatch=>0, loop=>0, shortest=>4, -output=>"stdout", -filename=>"palindrome.csv");
my @args = opt_get(@_);
my $gb = opt_as_gb(shift @args);
my $length = int(opt_val("shortest") / 2); my $output = opt_val("output");
my $filename = opt_val("filename");
my %palindrome;
my $i = $length - 1;
my ($len, $j, $k, $stem);
if (opt_val("output") eq "f"){
open(OUT, '>' . $filename) || &msg_error("G::Seq::PatSearch::palindrome() $! $filename");
print OUT "Length, start, end, sequence\n";
}
while($i <= length($gb->{SEQ}) - 1 - $length - opt_val("loop")){
$stem = opt_val("loop");
while($stem >= 0){
$j = $i;
$k = $stem + 1 + $i;
$len = 0;
last if ($k > length($gb->{SEQ}) - 1);
while(&baseParingTest(substr($gb->{SEQ}, $j, 1),
substr($gb->{SEQ}, $k, 1),
&opt_val("gtmatch"))
)
{
$j --;
$k ++;
last if ($j < 0 || $k > length($gb->{SEQ}) - 1);
$len += 2;
}
if ($len >= opt_val("shortest")){
&msg_send(sprintf("Length: %2d Position: %7d %7d Sequence: %s %s %s\n",
$len, $j + 1, $k - 2,
substr($gb->{SEQ}, $j + 1, $len/2), substr($gb->{SEQ}, $j + 1 + $len/2, $stem),
substr($gb->{SEQ}, $j + 1 + $len/2 + $stem, $len/2))) if ($output eq 'stdout');
if ($output eq "f"){
printf OUT "%d,%d,%d,%s %s %s\n",
$len, $j + 1, $k - 2,
substr($gb->{SEQ}, $j + 1, $len/2), substr($gb->{SEQ}, $j + 1 + $len/2, $stem),
substr($gb->{SEQ}, $j + 1 + $len/2 + $stem, $len/2);
}
$palindrome{$j + 1} = sprintf("%s %s %s",
substr($gb->{SEQ}, $j + 1, $len/2), substr($gb->{SEQ}, $j + 1 + $len/2, $stem),
substr($gb->{SEQ}, $j + 1 + $len/2 + $stem, $len/2)
);
}
$stem --;
}
$i ++;
}
close(OUT) if ($output eq "f");
return\% palindrome;
}
=head2 find_dif
Name: find_dif - finds dif sequence (chromosome partitioning site recognized by XerCD)
Description:
Finds dif sequence (chromosome partitioning site recognized by XerCD) in both strands.
dif is a 28bp sequence element recognized by XerCD located near the replication
terminus used for chromosome dimer resolution by recombination.
For E. coli, 5'-GGTGCGCATAATGTATATTATGTTAAAT-3', (Blakely and Sherratt, 1994)
for Proteobacteria, 5'-RNTKCGCATAATGTATATTATGTTAAAT-3', (Hendrickson and Lawrence, 2007)
for B. subtilis, 5'-ACTTCCTAGAATATATATTATGTAAACT-3', (Sciochetti et al., 2001)
for Firmicute, 5'-ACTKYSTAKAATRTATATTATGTWAACT-3', (Hendrickson and Lawrence, 2007)
for Actinobacteria, 5'-TTSRCCGATAATVNACATTATGTCAAGT-3'. (Hendrickson and Lawrence, 2007)
Usage:
@position = find_dif($sequence)
Options:
-type ecoli for E.coli dif (default)
proteobacteria, bsub, firmicute, actinobacteria, for corresponding dif sequences.
References:
1. Hendrickson H, Lawrence JG (2007) "Mutational bias suggests that replication
termination occurs near the dif site, not at Ter sites", Mol Microbiol. 64(1):42-56.
2. Blakely G, Sherratt D (1994) "Determinants of selectivity in Xer site-specific
recombination" Genes Dev. 10:762-773.
3. Sciochetti SA, Piggot PJ, Blakely GW (2001) "Identification and characterization
of the dif site from Bacillus subtilis." J Bacteriol. 183:1058-1068.
Author:
Kazuharu Arakawa (gaou@sfc.keio.ac.jp)
History:
20071028-01 fixed a bug that did not properly search the complement
20071023-01 updated to include dif consensus of other organisms
20060711-01 initial posting
=cut
sub find_dif {
opt_default(-type=>'ecoli');
my @argv = opt_get(@_);
my $type = lc opt_val('ecoli');
my $dif = 'ggtgcgcataatgtatattatgttaaat';
$dif = 'rntkcgcataatgtatattatgttaaat' if ($type eq 'proteobacteria');
$dif = 'acttcctagaatatatattatgtaaact' if ($type eq 'bsub');
$dif = 'actkystakaatrtatattatgtwaact' if ($type eq 'firmicute');
$dif = 'ttsrccgataatvnacattatgtccagt' if ($type eq 'actinobacteria');
return find_pattern(@_, $dif);
}
=head2 find_ter
Name: find_ter - finds ter sequence (replication termination site)
Description:
Finds ter sequence (replication termination site, recognized by Ter protein) in both strands.
For E. coli, 5'-AGNATGTTGTAAYKAA-3', (Coskun-Ari and Hill, 1997)
for B. subtilis, 5'-KMACTAANWNNWCTATGTACYAAATNTTC-3', (Wake, 1997)
Note that E.coli Ter consensus allows substitutions at bases 1, 4, and 16,
that are NOT considered in this method.
Usage:
@position = find_ter($sequence)
Options:
-type ecoli for E.coli ter (default)
bsub, for corresponding ter sequence.
References:
1. Hendrickson H, Lawrence JG (2007) "Mutational bias suggests that replication
termination occurs near the dif site, not at Ter sites", Mol Microbiol. 64(1):42-56
2. Coskun-Ari, FF, Hill TM (1997) "Sequence-specific interactions in the Tus-Ter
complex and the effect of base pair substitutions on arrest of DNA replication
in Escherichia coli", J Biol Chem. 272:26448-26456.
3. Wake RG (1997) "Replication fork arrest and termination of chromosome replication
in Bacillus subtilis", FEMS Microbiol Lett. 153:24-56.
Author:
Kazuharu Arakawa (gaou@sfc.keio.ac.jp)
History:
20071028-01 fixed a bug that did not properly search the complement
20071022-01 initial posting
=cut
sub find_ter {
opt_default(-type=>'ecoli');
my @argv = opt_get(@_);
my $type = lc opt_val('ecoli');
my $ter = 'agnatgttgtaaykaa';
$ter = 'kmactaanwnnwctatgtacyaaatnttc' if ($type eq 'bsub');
return find_pattern(@_, $ter);
}
=head2 find_dnaAbox
Name: find_dnaAbox - finds dnaA box in both strands
Description:
Finds dnaA box(TT A/T TNCACA) in both strands. Usage: @positions = find_dnaAbox($genome) Options: -output stdout to print data (default: stdout) References: 1. Schaper S, Messer W (1995) "Interaction of the initiator protein DnaA of Escherichia coli with its DNA target", J Biol Chem, 270(29):17622-17626
Author: Kazuharu Arakawa (gaou@sfc.keio.ac.jp) History: 20071028-01 fixed a bug that did not properly search the complement 20071022-01 updated the code to use oligomer_search() 20021125-01 initial posting
=cut
sub find_dnaAbox { return find_pattern(@_, "ttwtncaca"); }
=head2 find_iteron
Name: find_iteron - finds iteron in both strands
Description:
Finds iteron (TGAGGG G/A C/T) in both strands.
Usage:
@positions = find_iteron($genome)
Options:
-output stdout to print data (default: stdout)
References:
1. Haines AS, Akhtar P, Stephens ER, Jones K, Thomas CM, Perkins CD, Williams JR,
Day MJ, Fry JC (2006) "Plasmids from freshwater environments capable of IncQ
retrotransfer are diverse and include pQKH54, a new IncP-1 subgroup archetype.",
Microbiology, 152(Pt 9):2689-2701
Author:
Kazuharu Arakawa (gaou@sfc.keio.ac.jp)
History:
20071107-01 initial posting
=cut
sub find_iteron {
return find_pattern(@_, "tgaggry");
}
=head2 find_pattern
Name: find_pattern - finds oligomer pattern in both strands
Description:
Finds given oligomer pattern specified in degenerate nucleotide code in both strands.
This method serves as the basic function for other find_* methods.
Usage:
@positions = find_patter($genome, "pattern")
Options:
-output stdout to print data (default: stdout)
Author:
Kazuharu Arakawa (gaou@sfc.keio.ac.jp)
History:
20071107-01 initial posting
=cut
sub find_pattern {
opt_default('output'=>'stdout');
my @argv = opt_get(@_);
my $gb = opt_as_gb(shift @argv);
my $pattern = shift @argv;
my $output = opt_val("output");
my %data = (oligomer_search($gb, $pattern, -return=>"both"), oligomer_search($gb, complement($pattern), -return=>"both"));
if($output eq 'stdout'){
foreach my $pos (sort {$a<=>$b}keys %data){
msg_send(sprintf "%d %s\n", $pos, $data{$pos});
}
}
return sort keys %data;
}
=head2 oligomer_counter
Name: oligomer_counter - counts the number of given oligomers in a sequence
Description:
Counts the number of oligomers in a sequence (by windows optionally).
Oligomer can be specified using degenerate nucleotide alphabet, or by
regular expressions.
Usage:
$count = oligomer_counter($genome, $oligomer);
or
%octamers = oligomer_counter($genome, -length=>8);
Options:
-window int window size.
If specified, seeks oligomer in specified windows
Method returns an array of numbers at each windows
If not specified, seeks oligomer in the genome
Method returns the number of oligomers
-output "f" for file output, "g" for graph output, "show"
to display the graph.
Only available when -window option is specified
-length If specified, returns a hash containing number counts
for all n-mers.
Author:
Kazuharu Arakawa (gaou@sfc.keio.ac.jp)
-based on atg7.wind + gcwind [rsaito]
History:
20071107-01 added -length option
20071022-01 oligomer can be now degenerate nucleotide code or regular expressions
20010829-01 initial posting
=cut
sub oligomer_counter {
opt_default("window"=>0);
my @args = opt_get(@_);
my $gb = opt_as_gb(shift @args);
my $seq = shift @args;
my $window = opt_val("window");
my $output = opt_val("output");
my $length = opt_val("length");
$window = length($gb->{SEQ}) if($window <= 0);
if (opt_val("window")){
die("Error in oligomer_counter: oligomer not specified.") unless(length($seq));
open(OUT, '>oligo_count.csv') || &msg_error($!) if ($output eq "f");
my (@wincount, @winnum);
for (my $i = 0; $i <= length($gb->{SEQ}) - $window; $i += $window){
my $partial = substr($gb->{SEQ}, $i, $window);
my $count = 0;
if (length($seq) == 1 && $seq =~ /[atgc]/){
$count = $partial =~ tr/a/a/ if ($seq eq 'a');
$count = $partial =~ tr/t/t/ if ($seq eq 't');
$count = $partial =~ tr/g/g/ if ($seq eq 'g');
$count = $partial =~ tr/c/c/ if ($seq eq 'c');
}else{
$count = scalar oligomer_search($partial, $seq);
}
push (@wincount, $count);
push (@winnum, $i);
printf OUT "%d,%d\n", $i, $count if ($output eq "f");
}
close(OUT) if ($output eq 'f');
if ($output eq 'g' || $output eq 'show'){
grapher(\@winnum,\@ wincount, -x=>'window(bp)',
-y=>'number of oligomer',
-title=>'oligomer by window',
-outfile=>'oligo_count.png', -output=>$output
);
}
return (@wincount);
}elsif($length){
my %oligo;
for(my $i = 0; $i <= length($gb->{SEQ}) - $length; $i ++){
$oligo{substr($gb->{SEQ}, $i, $length)} ++;
}
return %oligo;
}else{
die("Error in oligomer_counter: oligomer not specified.") unless(length($seq));
return scalar oligomer_search($gb, $seq);
}
}
=head2 oligomer_search
Name: oligomer_search - searches oligomers in given sequence
Description:
Searches for the given oligomer in given sequence. Oligomer can be
specified using degenerate nucleotide alphabet, or by regular expressions.
Performance is optimized for fast searching.
This method changes the returning value according to the given options.
Usage:
@positions = oligomer_search($genome, $oligomer);
@oligomers = oligomer_search($genome, $oligomer, -return=>"oligo");
%positions_to_oligomers = oligomer_search($genome, $oligomer, -return=>"both");
($number_direct, $number_complement, $number_total, $ratio_direct) =
oligomer_search($genome, $oligomer, -return=>"distribution");
$oligomer can be degenerate nucleotide alphabet or regular expressions.
ex: "grtggngg" (degenerate code), or "g[ag]tgg[a-z]gg" (regular expression)
Options:
-return "position" to return list of positions where oligomers are found (default),
"oligo" to return list of oligomers found ordered by positions,
"both" to return a hash with positions as keys and oligomers as values,
"distribution" to return four values (see above) about the distribution
of given oligomer.
Author:
Kazuharu Arakawa (gaou@sfc.keio.ac.jp)
History:
20071022-01 initial posting
=cut
sub oligomer_search{
opt_default(return=>"position");
my @argv = opt_get(@_);
my $gb = opt_as_gb(shift @argv);
my $oligo = lc(shift @argv);
my $return = opt_val("return");
if($oligo !~ /[^atgc]/ && $return eq 'position'){
my $start = 0;
my @result;
while(0 <= ($start = index($gb->{SEQ}, $oligo, $start + 1))){
push(@result, $start);
}
return @result;
}elsif($return eq 'distribution'){
my $direct = scalar oligomer_search($gb, $oligo);
my $comp = scalar oligomer_search($gb, complement $oligo);
return ($direct, $comp, $direct + $comp, $direct/($direct + $comp)); }
unless($oligo =~ /[^a-z]/){
$oligo =~ s/r/[ag]/g;
$oligo =~ s/k/[gt]/g;
$oligo =~ s/s/[gc]/g;
$oligo =~ s/y/[ct]/g;
$oligo =~ s/m/[ac]/g;
$oligo =~ s/w/[at]/g;
$oligo =~ s/b/[gct]/g;
$oligo =~ s/h/[act]/g;
$oligo =~ s/n/[a-z]/g;
$oligo =~ s/d/[agt]/g;
$oligo =~ s/v/[acg]/g;
}
my @result;
{
no strict 'refs';
while($gb->{SEQ} =~ m/($oligo)/g){
if($return eq 'oligo'){
push(@result, ${1});
}elsif($return eq 'both'){
push(@result, $-[1], ${1});
}else{
push(@result, $-[1]);
}
}
}
return @result;
}
=head2 baseParingTest
Name: baseParingTest - checks if the two bases forms a pair
Description:
Base pairing check. 1 if the two bases pair, and 0 if they do not pair.
G-T match is also considered when third argument is given.
Usage:
boolean $match = match_test(char $first, char $second, boolean $gtmatch);
Options:
none
Author:
Kazuharu Arakawa (gaou@sfc.keio.ac.jp)
History:
20010829-01 initial posting
=cut
sub baseParingTest {
my $first = lc(shift);
my $second = lc(shift);
my $gtmatch = shift;
die("First two arguments must be single base (i.e. a, t, g, or c).\n")
unless(length($first) == 1 && length($second) == 1);
if ($first eq 'a' && $second eq 't' ||
$first eq 't' && $second eq 'a' ||
$first eq 'g' && $second eq 'c' ||
$first eq 'c' && $second eq 'g' ||
$first eq 't' && $second eq 'g' && $gtmatch ||
$first eq 'g' && $second eq 't' && $gtmatch
)
{
return 1;
}else{
return 0;
}
}
=head2 nucleotide_periodicity
Name: nucleotide_periodicity - checks the periodicity of certain oligonucleotides
Description:
Checks the periodicity of certain nucleotide (best known with AA dinucleotide)
Usage:
array data = nucleotide_periodicity(sequence);
Options:
-nucleotide nucleotide to search (default:aa)
-window window size to seek periodicity (default:50)
-filename output filename (default:aa_frequency.png)
-output "g" for graph file output only,
"show" for graph file output and display.
(default: show)
ToDo:
data output
Author:
Kazuharu Arakawa (gaou@sfc.keio.ac.jp)
History:
20070206-01 initial posting
=cut
sub nucleotide_periodicity {
opt_default("nucleotide"=>"aa", "window"=>50, "filename"=>"aa_frequency.png", "output"=>"show");
my @argv = opt_get(@_);
my $gb = opt_as_gb(shift @argv);
my $nuc = opt_val("nucleotide");
my $window = opt_val("window");
my $filename = opt_val("filename");
my $output = opt_val("output");
my @data = ();
$data[$_] = 0 for (0..($window - 1));
my $start = -1;
while(0 <= ($start = index($gb->{SEQ}, $nuc, $start + 1))){
my $innerPos = -1;
my $localSeq = substr($gb->{SEQ}, $start + length($nuc), $window);
while(0 <= ($innerPos = index($localSeq, $nuc, $innerPos + 1))){
$data[$innerPos]++;
}
}
_Unimultigrapher([0..($window - 1)],\@ data, -filename=>$filename);
msg_gimv("graph/$filename") if ($output eq 'show');
return @data;
}
1;} |