G::IO
Handler
Summary
G::IO::Handler - Internal class with basic sequence manipulation methods
Package variables
No package variables defined.
Included modules
Carp
File::ShareDir ' :ALL '
Synopsis
require G::IO::Handler;
use base qw(G::IO::Handler);
Description
Intended for internal use only. Super class for the core. Provides
the native methods.
Methods
_interpret_format | No description | Code |
after_startcodon | No description | Code |
after_stopcodon | No description | Code |
annotate | No description | Code |
around_startcodon | No description | Code |
around_stopcodon | No description | Code |
before_startcodon | No description | Code |
before_stopcodon | No description | Code |
cds | No description | Code |
clone | No description | Code |
del_key | No description | Code |
disable_pseudogenes | No description | Code |
disclose | No description | Code |
feature | No description | Code |
feature2id | No description | Code |
filepath | No description | Code |
find | No description | Code |
gene | No description | Code |
gene2id | No description | Code |
get_cdsseq | No description | Code |
get_exon | No description | Code |
get_exons | No description | Code |
get_gbkseq | No description | Code |
get_geneseq | No description | Code |
get_intron | No description | Code |
getseq | No description | Code |
hairpin_cut | No description | Code |
id | No description | Code |
intergenic | No description | Code |
next_cds | No description | Code |
next_feature | No description | Code |
output | No description | Code |
pos2feature | No description | Code |
pos2gene | No description | Code |
previous_cds | No description | Code |
previous_feature | No description | Code |
rRNA | No description | Code |
relocate_origin | No description | Code |
reverse_strand | No description | Code |
seq | No description | Code |
seq_info | No description | Code |
set_essentiality | No description | Code |
set_gene_aliases | No description | Code |
set_generationtime | No description | Code |
set_intergenic | No description | Code |
startcodon | No description | Code |
stopcodon | No description | Code |
tRNA | No description | Code |
Methods description
None available.
Methods code
_interpret_format | description | prev | next | Top |
sub _interpret_format
{ my $filename = shift;
my $ref = ref $filename;
if ($filename =~ /^(.*?):(.*)/){
unless(lc($1) =~ /(swall|swiss|sw|uniprot|genbank|genpept|embl|refseq)/){
croak("Unsupported database name for USA. Supported databases are\n" .
"swiss, sw, uniprot, genbank, genpept, embl, or refseq\n");
}
return "usa";
}elsif (lc($filename) =~ /\.(gb|gbk|gbank|genbank|nuccore)$/i){
return 'genbank';
}elsif (lc($filename) =~ /\.(fasta|fast|seq|fst|fa|fsa|nt|aa|ffn|fna)/i){
return 'fasta';
}elsif (lc($filename) =~ /\.(fastq|fq)/i){
return 'fastq';
}elsif (lc($filename) =~ /\.(gff3)/i){
return 'gff3';
}elsif (lc($filename) =~ /\.(gff2)/i){
return 'gff2';
}elsif (lc($filename) =~ /\.(gff)/i){
return 'gff';
}elsif ($filename =~ /^NC_\d+$/i ||
$filename =~ /^NP_\d+$/i ||
$filename =~ /^(?:[A-Z]|[A-Z][A-Z]).\d+$/i ||
$filename =~ /^NC_\d+\.\d+$/i ||
$filename =~ /^NP_\d+\.\d+$/i ||
$filename =~ /^(?:[A-Z]|[A-Z][A-Z]).\d+\.\d+$/i
){
return 'RefSeq';
}elsif(-e $filename){
my @tmp = split(/\./, $filename);
my $format = $filename =~ /\./ ? pop(@tmp) : '';
if (length $format){
return $format;
}else{
open(FILE, $filename);
while(<FILE>){
if(/^>/){
return 'fasta';
}elsif(/^LOCUS/){
return 'genbank';
}elsif(/^ID/){
return 'embl';
}else{
carp("Unknown file format. Interpreting $filename as GenBank...\n");
return 'genbank';
}
last;
}
close(FILE);
}
}else{
my $id = readFile("http://rest.g-language.org/gbp/orgname.cgi?id=$filename", 1);
return ("named:$id") if(length($id));
} } |
sub after_startcodon
{ if (scalar(@_) < 2){
carp("Not enough arguments.");
return;
}
my $this = shift;
my $object = shift;
my $length = shift || 100;
if ($this->{$object}->{direction} eq 'complement'){
my $start = $this->{$object}->{end} - 1 - 3 - $length + 1;
if($start < 0){
$start = 0;
$length = $this->{$object}->{end} - 3;
}
return complement(substr($this->{SEQ}, $start, $length));
}else{
return substr($this->{SEQ}, $this->{$object}->{start} + 3 - 1, $length);
} } |
sub after_stopcodon
{ if (scalar(@_) < 2){
carp("Not enough arguments.");
return;
}
my $this = shift;
my $object = shift;
my $length = shift || 100;
if ($this->{$object}->{direction} eq 'complement'){
my $start = $this->{$object}->{start} - 1 - $length;
if($start < 0){
$start = 0;
$length = $this->{$object}->{start};
}
return complement(substr($this->{SEQ}, $start, $length));
}else{
return substr($this->{SEQ}, $this->{$object}->{end} +1 - 1, $length);
} } |
sub annotate
{ my $this = shift;
my($fh, $outfile) = tempfile(undef, SUFFIX=>'.gbk');
$this->make_gb($outfile);
require LWP::UserAgent;
my $ua = LWP::UserAgent->new;
my $id = $ua->post('http://rest.g-language.org/upload/upl.pl', 'Content_Type'=>'form-data', 'Content'=>['file'=>[$outfile]])->content;
mirror("http://rest.g-language.org/$id/restauro", $outfile);
my $new = new G::IO($outfile, "no cache", "no msg");
return $new; } |
sub around_startcodon
{ if (scalar(@_) < 2){
carp("Not enough arguments.");
return;
}
my $gb = shift;
my $cds = shift;
my $before = shift || 0;
my $after = shift || 0;
my $option = shift;
my $seq = $gb->before_startcodon($cds, $before);
$seq .= $gb->startcodon($cds) unless($option =~ /without/);
$seq .= $gb->after_startcodon($cds, $after);
return $seq; } |
sub around_stopcodon
{ if (scalar(@_) < 2){
carp("Not enough arguments.");
return;
}
my $gb = shift;
my $cds = shift;
my $before = shift || 0;
my $after = shift || 0;
my $option = shift;
my $seq = $gb->before_stopcodon($cds, $before);
$seq .= $gb->stopcodon($cds) unless($option =~ /without/);
$seq .= $gb->after_stopcodon($cds, $after);
return $seq; } |
sub before_startcodon
{ if (scalar(@_) < 2){
carp("Not enough arguments.");
return;
}
my $this = shift;
my $object = shift;
my $length = shift || 100;
if ($this->{$object}->{direction} eq 'complement'){
return complement(substr($this->{SEQ}, $this->{$object}->{end}, $length));
}else{
my $start = $this->{$object}->{start} - 1 - $length;
if ($start < 0){
$start = 0;
$length = $this->{$object}->{start} - 1;
}
return substr($this->{SEQ}, $start, $length);
} } |
sub before_stopcodon
{ if (scalar(@_) < 2){
carp("Not enough arguments.");
return;
}
my $this = shift;
my $object = shift;
my $length = shift || 100;
if ($this->{$object}->{direction} eq 'complement'){
return complement(substr($this->{SEQ}, $this->{$object}->{start} + 3 - 1, $length));
}else{
my $start = $this->{$object}->{end} - 3 - 1 - $length + 1;
if($start < 0){
$start = 0;
$length = $this->{$object}->{end} - 3;
}
return substr($this->{SEQ}, $start, $length);
} } |
sub cds
{ return feature($_[0], 'CDS', $_[1]); } |
sub clone
{ my $this = shift;
my $sdbPath = _sdb_path();
_set_sdb_path('/tmp');
my $tmpfile = "GINTERNAL-" . time() . rand();
sdb_save($this,$tmpfile);
my $new = sdb_load($tmpfile);
_set_sdb_path($sdbPath);
unlink($tmpfile);
return $new; } |
sub del_key
{ $_[0]->{$_[1]}->{on} = 0;
return 1; } |
sub disable_pseudogenes
{ my $this = shift;
foreach my $feature ($this->feature()){
$this->{$feature}->{on} = 0 if ($this->{$feature}->{pseudo});
} } |
sub disclose
{ my $gb = shift;
my $return = '';
foreach my $level1 (keys %{$gb}){
next if ($level1 eq 'SEQ');
next unless($level1 =~ /LOCUS|FEATURE|HEADER/);
if(ref($gb->{$level1}) =~ /HASH/){
foreach my $level2 (keys %{$gb->{$level1}}){
$return .= join("\t", $level1, $level2, $gb->{$level1}->{$level2}) . "\n";
}
}else{
$return .= join("\t", $level1, $gb->{$level1}) . "\n";
}
}
return $return; } |
sub feature
{ my $this = shift;
my $type = shift;
my $opt = shift;
if($type eq 'all'){
$opt = 'all';
$type = '';
}
my $i = -1;
my @feature;
while(defined($this->{"FEATURE" . ($i + 1)})){
$i ++;
if(length($type)){
next unless ($this->{"FEATURE$i"}->{type} eq $type);
}
if ($opt ne 'all' && defined $this->{"FEATURE$i"}->{on}){
next if ($this->{"FEATURE$i"}->{on} == 0);
}
push (@feature, "FEATURE$i");
}
return @feature; } |
sub feature2id
{ if (scalar(@_) < 2){
carp("Not enough arguments.");
return;
}
return 'FEATURE' . $_[0]->{$_[1]}->{feature}; } |
sub filepath
{ return $_[0]->{FILENAME}; } |
sub find
{ my $this = shift;
my @args = @_;
my (@keywords, %keyhash, @results);
my $i = 0;
my $print = 0;
while(defined $args[$i]){
if (substr($args[$i], 0, 1) eq '-' && substr($args[$i], 1, 1) !~ /[0-9]/){
if (substr($args[$i],1) eq 'print'){
$print = 1;
$i +=2;
}else{
$keyhash{substr($args[$i],1)} = $args[$i + 1];
$i += 2;
}
}else{
push(@keywords, $args[$i]);
$i ++;
}
}
foreach my $feat ($this->feature()){
my $flag = 0;
foreach my $key (keys %keyhash){
my $val = $keyhash{$key};
unless($this->{$feat}->{$key} =~ /$val/i){
$flag = 1;
last;
}
}
next if ($flag);
foreach my $key (@keywords){
unless(join('%%%___%%%', values(%{$this->{$feat}})) =~ /$key/i){
$flag = 1;
last;
}
}
push(@results, $feat) unless($flag);
}
if(msg_ask_interface() eq 'Shell' || $print){
foreach my $feat (@results){
my $gene = $this->{$feat}->{gene} || $this->{$feat}->{locus_tag} || $feat;
my $ec = $this->{$feat}->{EC_number};
$ec =~ s/\s+/,/g;
$ec = '(' . $ec . ')' if (length $ec);
msg_send(
sprintf(" %s\t%s\t%s\t%s..%s\t%s\t%s %s\n", $feat, $gene, $this->{$feat}->{type},
$this->{$feat}->{start}, $this->{$feat}->{end}, $this->{$feat}->{direction}, $this->{$feat}->{product}, $ec)
);
}
}
return @results; } |
sub gene
{ return feature($_[0], 'gene', $_[1]); } |
sub gene2id
{ if (scalar(@_) < 2){
carp("Not enough arguments.");
return;
}
return length($_[0]->{$_[1]}->{feature}) ? 'FEATURE' . $_[0]->{$_[1]}->{feature} : ''; } |
sub get_cdsseq
{ if (scalar(@_) < 2){
carp("Not enough arguments.");
return;
}
my $this = shift;
my $object = shift;
my $cdsseq = '';
if($this->{$object}->{start} > $this->{$object}->{end}){
$cdsseq = substr($this->{SEQ}, $this->{$object}->{start} - 1) .
$this->get_gbkseq(1, $this->{$object}->{end});
}else{
$cdsseq = $this->get_gbkseq($this->{$object}->{start},
$this->{$object}->{end});
}
$cdsseq = &complement($cdsseq)
if ($this->{$object}->{direction} eq 'complement');
return $cdsseq; } |
sub get_exon
{ if (scalar(@_) < 2){
carp("Not enough arguments.");
return;
}
my $this = shift;
my $cds = shift;
return unless (length $this->{$cds}->{join});
my $seq = '';
foreach my $line (split(/,/, $this->{$cds}->{join})){
my $complement = $line =~ tr/c//d;
my ($start, $end) = split(/\.\./, $line, 2);
my $tmp = $this->get_gbkseq($start, $end);
$tmp = complement($tmp) if ($complement);
$seq .= $tmp;
}
$seq = complement($seq) if ($this->{$cds}->{direction} eq 'complement');
return $seq; } |
sub get_exons
{ if (scalar(@_) < 2){
carp("Not enough arguments.");
return;
}
my $this = shift;
my $cds = shift;
return unless (length $this->{$cds}->{join});
my @seq;
foreach my $line (split(/,/, $this->{$cds}->{join})){
my $complement = $line =~ tr/c//d;
my ($start, $end) = split(/\.\./, $line, 2);
my $tmp = $this->get_gbkseq($start, $end);
$tmp = complement($tmp) if ($complement);
if($this->{$cds}->{direction} eq 'complement'){
unshift(@seq, complement($tmp));
}else{
push(@seq, $tmp);
}
}
return @seq; } |
sub get_gbkseq
{ if (scalar(@_) < 3){
carp("Not enough arguments.");
return;
}
return getseq($_[0], $_[1] - 1, $_[2] - 1, $_[3]); } |
sub get_geneseq
{ if (scalar(@_) < 2){
carp("Not enough arguments.");
return;
}
my $this = shift;
my $object = shift;
my $geneseq = $this->get_gbkseq($this->{$object}->{start},
$this->{$object}->{end});
if ($this->{$object}->{join}){
$geneseq = $this->get_exon($object);
}elsif ($this->{$object}->{direction} eq 'complement'){
$geneseq = &complement($geneseq);
}
my $codon_start = $this->{$object}->{codon_start} || 1;
return substr($geneseq, $codon_start - 1); } |
sub get_intron
{ if (scalar(@_) < 2){
carp("Not enough arguments.");
return;
}
my $this = shift;
my $cds = shift;
return unless (length $this->{$cds}->{join});
my @join = split(/\.\./, $this->{$cds}->{join});
shift @join;
pop @join;
my @seq;
foreach my $line (@join){
$line =~ s/c//g;
my ($start, $end) = split(/,/, $line, 2);
my $tmp = $this->get_gbkseq($start + 1, $end - 1);
$tmp = '' if($end - 2 - $start < 0);
push (@seq, $tmp);
}
return @seq; } |
sub getseq
{ if (scalar(@_) < 3){
carp("Not enough arguments.");
return;
}
my $this = shift;
my $start = shift;
my $end = shift;
my $option = shift;
if($start < $end){
return substr($this->{SEQ}, $start, $end-$start+1);
}else{
if($option =~ /circ/){
return substr($this->{SEQ}, $start) . substr($this->{SEQ}, 0, $end + 1);
}else{
my ($start2, $end2) = sort {$a <=> $b} ($start, $end);
return substr($this->{SEQ}, $start2, $end2-$start2+1);
}
} } |
sub hairpin_cut
{ system('firefox http://www.toychan.net/afro/');
return "\n==============\n!!!Afro Man!!!===============\n\n"; } |
sub id
{ return $_[0]->{LOCUS}->{id}; } |
sub intergenic
{ my $this = shift;
my $opt = shift || '';
my $i = 0;
my @cds;
set_intergenic($this) unless(defined($this->{INTER1}));
while($this->{"INTER" . ($i + 1)}){
$i ++;
if ($opt ne 'all'){
next if ($this->{"INTER$i"}->{on} == 0);
}
push (@cds, "INTER$i");
}
return @cds; } |
sub next_cds
{ return next_feature($_[0], $_[1], 'CDS'); } |
sub next_feature
{ my $this = shift;
my $feature = shift || 'FEATURE0';
my $opt = shift;
my @features = $this->feature($opt);
return undef if ($feature eq $features[-1]);
$feature = $this->{$feature}->{left} if ($feature =~ /^INTER/);
my $i = $this->{$feature}->{feature};
$i ++;
while(%{$this->{"FEATURE$i"}}){
my $feat = "FEATURE$i";
$i ++;
if(length($opt)){
next unless($this->{$feat}->{type} eq $opt);
}
return $feat;
} } |
sub output
{ my $gb = shift;
my $file = shift;
my $option = shift;
$option = _interpret_format($file) unless(length $option);
if (lc($option) eq 'genbank'){
$gb->make_gb($file);
}elsif(length $option){
my $in;
my ($fh, $outfile) = tempfile(undef, SUFFIX=>'.gbk');
$gb->make_gb($outfile);
require LWP::UserAgent;
my $ua = LWP::UserAgent->new;
my $response = $ua->post("http://rest.g-language.org/emboss/",
Content_Type=>'multipart/form-data',
Content=>[file1 => [$outfile], 'arg'=>"seqret/osformat2=$option/-feature"]
);
if ($response->is_success) {
writeFile($response->decoded_content, $file);
} else {
die("Error in output:\n" . $response->status_line . "\n File format\" $option\"of\" $file\" is not supported.");
}
unlink($outfile);
}else{
&msg_error("G::output - Unknown format to output.");
} } |
sub pos2feature
{ if (scalar(@_) < 2){
carp("Not enough arguments.");
return;
}
my $this = shift;
my $pos = shift;
my $pos2 = shift;
if(length($pos2) > 1){
my @genes = ();
foreach my $feat ($this->feature()){
next if ($feat eq 'FEATURE0');
if ($this->{$feat}->{end} >= $pos && $pos2 >= $this->{$feat}->{start}){
push(@genes, $feat);
}elsif ($pos2 < $this->{$feat}->{start}){
return @genes;
}
}
}else{
foreach my $feat ($this->feature()){
next if ($feat eq 'FEATURE0');
if ($pos >= $this->{$feat}->{start} && $pos <= $this->{$feat}->{end}){
return $feat;
}elsif ($pos < $this->{$feat}->{start}){
return '';
}
}
} } |
sub pos2gene
{ if (scalar(@_) < 2){
carp("Not enough arguments.");
return;
}
my $this = shift;
my $pos = shift;
my $pos2 = shift;
if(length($pos2) > 1){
my @genes = ();
foreach my $feat ($this->cds()){
if ($this->{$feat}->{end} >= $pos && $pos2 >= $this->{$feat}->{start}){
push(@genes, $feat);
}elsif ($pos2 < $this->{$feat}->{start}){
return @genes;
}
}
}else{
foreach my $feat ($this->cds()){
if ($pos >= $this->{$feat}->{start} && $pos <= $this->{$feat}->{end}){
return $feat;
}elsif ($pos < $this->{$feat}->{start}){
return '';
}
}
} } |
sub previous_cds
{ return previous_feature($_[0], $_[1], 'CDS'); } |
sub previous_feature
{ my $this = shift;
my $feature = shift || 'FEATURE0';
my $opt = shift;
return undef if ($feature eq 'FEATURE0');
$feature = $this->{$feature}->{right} if ($feature =~ /^INTER/);
my $i = $this->{$feature}->{feature};
$i --;
while(%{$this->{"FEATURE$i"}}){
my $feat = "FEATURE$i";
$i --;
if(length($opt)){
next unless($this->{$feat}->{type} eq $opt);
}
return $feat;
} } |
sub rRNA
{ my $this = shift;
my $option = shift;
if($option =~ /\d+S|SSU|LSU/){
my %rRNA;
foreach ($this->feature('rRNA')){
if($this->{$_}->{product} =~ /([0-9\.]+S)/i){
push(@{$rRNA{uc($1)}}, $_);
}elsif($this->{$_}->{gene} =~ /rrf/){
push(@{$rRNA{'5S'}}, $_);
}elsif($this->{$_}->{gene} =~ /rrl/){
push(@{$rRNA{'23S'}}, $_);
}elsif($this->{$_}->{gene} =~ /rrs/){
push(@{$rRNA{'16S'}}, $_);
}elsif($this->{$_}->{gene} =~ /([0-9\.]+S)/i){
push(@{$rRNA{uc($1)}}, $_);
}elsif($this->{$_}->{locus_tag} =~ /([0-9]+S)/i){
push(@{$rRNA{uc($1)}}, $_);
}elsif($this->{$_}->{note} =~ /([0-9\.]+S)+?/){
push(@{$rRNA{uc($1)}}, $_);
}elsif($this->{$_}->{product} =~ /(SSU|small subunit) ribosomal RNA/i){
push(@{$rRNA{'SSU'}}, $_);
}elsif($this->{$_}->{product} =~ /(LSU|large subunit) ribosomal RNA/i){
push(@{$rRNA{'LSU'}}, $_);
}
}
if(defined $rRNA{$option}){
my @result = sort {length($this->get_geneseq($b))<=>length($this->get_geneseq($a))} @{$rRNA{$option}};
return wantarray ? @result : $result[0];
}
}else{
return feature($this, 'rRNA', $option);
} } |
sub relocate_origin
{ require G::IO;
my $this = new G::IO("blessed");
my $tmp = shift;
my $gb = $tmp->clone();
my $pos = shift;
croak("New start position\( in Perl coordinate\) must be given.\n") unless($pos =~ /\d/);
$this->{LOCUS} = $gb->{LOCUS};
$this->{HEADER} = $gb->{HEADER};
$this->{COMMENT} = $gb->{COMMENT};
$this->{SEQ} = substr($gb->{SEQ}, $pos) . substr($gb->{SEQ}, 0, $pos);
$this->{FEATURE0} = $gb->{FEATURE0};
my (@before, @after);
my @features = $gb->feature();
shift @features;
foreach my $feature (@features){
if($gb->{$feature}->{start} >= $pos + 1){
push(@after, $feature);
}else{
push(@before, $feature);
}
}
my $f = 0;
my $c = 0;
foreach my $feature (@after, @before){
$f ++;
$this->{"FEATURE$f"} = $gb->{$feature};
$this->{"FEATURE$f"}->{feature} = $f;
if($gb->{$feature}->{type} eq 'CDS'){
$c ++;
$this->{"FEATURE$f"}->{cds} = $c;
}
if($gb->{$feature}->{end} >= $pos + 1 && $gb->{$feature}->{start} < $pos + 1){
$this->{"FEATURE$f"}->{start} += length($gb->{SEQ}) - $pos;
$this->{"FEATURE$f"}->{end} -= $pos;
if(length $this->{"FEATURE$f"}->{join}){
msg_error("Warning: overriding join definition for FEATURE$f.\n");
msg_error(" this is likely to destroy positional features of this gene entry.\n");
}
$this->{"FEATURE$f"}->{join} = sprintf("%d\.\.%d,%d\.\.%d", $this->{"FEATURE$f"}->{start},
length($gb->{SEQ}), 1, $this->{"FEATURE$f"}->{end});
next;
}
my $lng = length($gb->{SEQ}) - $pos;
if($gb->{$feature}->{start} >= $pos + 1){
$lng = -$pos;
}
$this->{"FEATURE$f"}->{start} += $lng;
$this->{"FEATURE$f"}->{end} += $lng;
if(defined $this->{"FEATURE$f"}->{"join"}){
my @join = split(/\,/,$this->{"FEATURE$f"}->{"join"});
my @num = ();
my @new_join = ();
foreach(@join){
if(tr/c/c/){
@num = split(/\.\./,$_);
push (@new_join, sprintf ("c%d\.\.%d", $num[0] + $lng, $num[1] + $lng));
} else {
@num = split(/\.\./,$_);
push (@new_join, sprintf ("%d\.\.%d", $num[0] + $lng, $num[1] + $lng));
}
}
$this->{"FEATURE$f"}->{join} = join(',', @new_join);
}
}
$this->set_gene_aliases();
return $this; } |
sub reverse_strand
{ require G::IO;
my $this = new G::IO("blessed");
my $tmpG = shift;
my $gb = $tmpG->clone();
$this->{LOCUS} = $gb->{LOCUS};
$this->{HEADER} = $gb->{HEADER};
$this->{COMMENT} = $gb->{COMMENT};
$this->{SEQ} = complement($gb->{SEQ});
$this->{FEATURE0} = $gb->{FEATURE0};
my @feat = $gb->feature();
shift @feat;
my (@features, @tmp);
foreach my $feature (@feat){
if($gb->{$feature}->{type} eq 'gene'){
unshift(@features, @tmp);
@tmp = ($feature);
}else{
push(@tmp, $feature);
}
}
unshift(@features, @tmp);
my $f = 0;
my $c = 0;
my $lng = length($gb->{SEQ}) + 1;
foreach my $feature (@features){
$f ++;
$this->{"FEATURE$f"} = $gb->{$feature};
$this->{"FEATURE$f"}->{feature} = $f;
if($gb->{$feature}->{type} eq 'CDS'){
$c ++;
$this->{"FEATURE$f"}->{cds} = $c;
}
my ($start, $end) = ($lng - $gb->{$feature}->{end}, $lng - $gb->{$feature}->{start});
$this->{"FEATURE$f"}->{start} = $start;
$this->{"FEATURE$f"}->{end} = $end;
$this->{"FEATURE$f"}->{direction} = $gb->{$feature}->{direction} eq 'direct' ? 'complement' : 'direct';
if(defined $this->{"FEATURE$f"}->{"join"}){
my @join = split(/\,/,$this->{"FEATURE$f"}->{"join"});
my @num = ();
my @new_join = ();
foreach(@join){
if(tr/c/c/){
@num = split(/\.\./,$_);
push (@new_join, sprintf ("c%d\.\.%d", $lng - $num[1], $lng - $num[0]));
} else {
@num = split(/\.\./,$_);
push (@new_join, sprintf ("%d\.\.%d", $lng - $num[1], $lng - $num[0]));
}
}
$this->{"FEATURE$f"}->{join} = join(',', reverse(@new_join));
}
}
$this->set_gene_aliases();
return $this; } |
sub seq
{ return $_[0]->{SEQ}; } |
sub seq_info
{ my $this = shift;
my $nomsg = shift;
my $type = shift;
my $length = length($this->{SEQ});
my $a = $this->{SEQ} =~ tr/a/a/;
my $t = $this->{SEQ} =~ tr/t/t/;
my $g = $this->{SEQ} =~ tr/g/g/;
my $c = $this->{SEQ} =~ tr/c/c/;
my $symbol = $this->{SEQ} =~ tr/0123456789 _-/0123456789 _-/;
my $others = $length - $a - $t - $g - $c;
my $msg;
unless($nomsg){
$msg .= sprintf "\n\nAccession Number: %s\n", $this->{LOCUS}->{id} if(length($this->{LOCUS}->{id}));
$msg .= sprintf "\nDefinition: %s\n", $this->{DEFINITION} if(length($this->{DEFINITION}));
$msg .= sprintf "\n Merged LOCUS Count : %9d\n" , scalar(grep{/LOCUS/} keys(%$this)) - 2 if(exists($this->{LOCUS2}));
$msg .= sprintf "\n Length of Sequence : %9d\n" , $length;
if($length == 0){
$msg .= sprintf "\n No Sequence Found.\n\n";
}elsif($symbol > 5){
$msg .= sprintf "\n Sequence seems to be invalid (neither nucleotide nor amino acid).\n\n";
}elsif($others > $a + $t + $g + $c && $type != 1){
amino_info($this);
return;
}else{
$msg .= sprintf " A Content : %9d (%.2f\%)\n" , $a , $a / $length * 100; $msg .= sprintf " T Content : %9d (%.2f\%)\n" , $t , $t / $length * 100; $msg .= sprintf " G Content : %9d (%.2f\%)\n" , $g , $g / $length * 100; $msg .= sprintf " C Content : %9d (%.2f\%)\n" , $c , $c / $length * 100; $msg .= sprintf " Others : %9d (%.2f\%)\n" , $others, $others / $length * 100; $msg .= sprintf " AT Content : %.2f\%\n" , ($a + $t) / $length * 100; $msg .= sprintf " GC Content : %.2f\%\n\n" , ($g + $c) / $length * 100;
my ($genes, $cds, $trna, $rrna) = (scalar($this->gene()), scalar($this->cds()), scalar($this->tRNA()), scalar($this->rRNA()));
if($cds > 0){
$msg .= sprintf " Number of genes : %d (CDSs: %s, tRNAs: %s, rRNAs: %s)\n " , $genes, $cds, $trna, $rrna;
}
}
&msg_send($msg);
}
return ($a, $t, $g, $c); } |
sub set_essentiality
{ my $gb = shift;
if ($gb->{LOCUS}->{id} eq 'U00096' || $gb->{LOCUS}->{id} eq 'NC_000913'){
my $filename = dist_file('g-language', 'data/PECData.dat');
my $i = 1;
open(FILE, $filename) || die($!);
while(<FILE>){
chomp;
my @hoge = split(/\t/);
my @list = ($hoge[0], split(/,/, $hoge[1]));
my $id;
foreach my $gene (@list){
$id = $gb->gene2id($gene);
last if(length $id);
}
if($hoge[7] == 1){
$gb->{$id}->{essentiality} = 1;
}elsif($hoge[7] == 2){
$gb->{$id}->{essentiality} = -1;
}
}
close(FILE);
}else{
msg_error("No essentiality data for this species.\n\n");
}
return 1; } |
sub set_gene_aliases
{ my $this = shift;
foreach my $feat ($this->feature()){
next unless ($this->{$feat}->{type} =~ /CDS|RNA/);
if(length $this->{$feat}->{gene}){
$this->{$this->{$feat}->{gene}} = $this->{$feat};
}
if(length $this->{$feat}->{locus_tag}){
$this->{$this->{$feat}->{locus_tag}} = $this->{$feat};
}
if($this->{$feat}->{type} eq 'CDS'){
$this->{'CDS' . $this->{$feat}->{cds}} = $this->{$feat};
}
} } |
sub set_generationtime
{ my $gb = shift;
my $filename = dist_file('g-language', 'data/generationtime.dat');
for my $line (readFile($filename, 1)){
next unless($line =~ $gb->{LOCUS}->{id});
my @F = split(/\t/, $line);
$gb->{FEATURE0}->{'generationTime'} = $F[3];
$gb->{FEATURE0}->{'optimumGrowthTemperature'} = $F[2];
}
unless(length($gb->{FEATURE0}->{'generationTime'})){
msg_error("No generation time data for this species.\n\n");
}
return 1; } |
sub set_intergenic
{ my $gb = shift;
return if(defined($gb->{INTER1}));
my $num = 1;
my $i = 0;
my $interface = msg_ask_interface();
msg_interface("NULL");
my @cds = $gb->find(-type=>"CDS|RNA");
foreach my $feature (@cds){
if($i == 0 && $gb->{$feature}->{start} < $gb->{$feature}->{end}){
$gb->{"INTER$num"}->{start} = 1;
$gb->{"INTER$num"}->{end} = $gb->{$feature}->{start} - 1;
$gb->{"INTER$num"}->{direction} = "direct";
$gb->{"INTER$num"}->{left} = undef;
$gb->{"INTER$num"}->{right} = $feature;
$gb->{"INTER$num"}->{on} = 1;
$num++;
}elsif($i == $#cds && $gb->{$cds[-1]}->{end} + 1 < $gb->{SEQ}){
$gb->{"INTER$num"}->{start} = $gb->{$cds[-1]}->{end} + 1;
$gb->{"INTER$num"}->{end} = length $gb->{SEQ};
$gb->{"INTER$num"}->{direction} = "direct";
$gb->{"INTER$num"}->{left} = $feature;
$gb->{"INTER$num"}->{right} = undef;
$gb->{"INTER$num"}->{on} = 1;
$num++;
}elsif($i > 0){
if($gb->{$cds[$i - 1]}->{end} < $gb->{$feature}->{start}){
$gb->{"INTER$num"}->{start} = $gb->{$cds[$i - 1]}->{end} + 1;
$gb->{"INTER$num"}->{end} = $gb->{$feature}->{start} - 1;
$gb->{"INTER$num"}->{direction} = "direct";
$gb->{"INTER$num"}->{left} = $cds[$i - 1];
$gb->{"INTER$num"}->{right} = $feature;
$gb->{"INTER$num"}->{on} = 1;
$num++;
}
}
$i++;
}
msg_interface($interface); } |
sub startcodon
{ if (scalar(@_) < 2){
carp("Not enough arguments.");
return;
}
return substr($_[0]->get_geneseq($_[1]), 0, 3); } |
sub stopcodon
{ if (scalar(@_) < 2){
carp("Not enough arguments.");
return;
}
return substr($_[0]->get_geneseq($_[1]), -3, 3); } |
sub tRNA
{ return feature($_[0], 'tRNA', $_[1]); } |
General documentation
Name: $gb->annotate() - reannotate the genome using Restauro-G
Description:
This method reannotates the entire genome, if coding sequences are already available.
Added features includes:
rs_des: description of protein
rs_xr: database cross references, including EMBL, RefSeq, KEGG, BioCyc, BRENDA, GO, InterPro, Pfam
rs_com: comments
Usage:
$new_gb = $gb->annotate();
REST:
http://rest.g-language.org/NC_000913/annotate
References:
1. Tamaki S, Arakawa K, Kono N, Tomita M (2007) "Restauro-G: A Rapid Genome Re-Annotation
System for Comparative Genomics", Genomics Proteomics Bioinformatics, 5(1): 53-58.
Author:
Kazuharu Arakawa (gaou@sfc.keio.ac.jp)
History:
20120830-01 initial posting
Name: $gb->find() - search through the genome data object with keywords
Description:
This method provides powerful means to search within the G-language genome
data object with keywords. Given a set of keywords, this method returns
the list of feature IDs corresponding to the search query. In G-language Shell,
search results are also directly printed out (you can specify -print=>1 option
in API mode to print within your program).
eg. @features = $gb->find('RNA', 'tyrosine'); # multiple keywords are allowed.
Keywords can be specific to each of the feature attributes:
eg. $gb->find(-type=>'CDS', -product=>'metabolism', 'subunit');
Regular expressions are allowed for keywords:
eg. $gb->find(-type=>'CDS', -EC_number=>'^2.7.');
REST:
http://rest.g-language.org/NC_000913/product=metabolism/
Name: $gb->getseq() - get nucleotide sequence of the given positions (Perl coordinates)
Description:
Given the start and end positions (starting from 0 as in Perl),
returns the sequence specified.
eg. $gb->getseq(2,4); # returns the 3rd, 4th, and 5th nucleotides.
Options:
-circular when the first position is larger than the second position,
retrieves the sequece spanning across the end of the circular
chromosome. (ex: $gb->getseq(4639670, 5, -circular))
REST:
http://rest.g-language.org/NC_000913/getseq/1/3
Name: $gb->get_gbkseq() - get nucleotide sequence of the given positions (GenBank coordinates)
Description:
Given the start and end positions (starting from 1 as in
Genbank), returns the sequence specified.
eg. $gb->get_gbkseq(1,3); # returns the 1st, 2nd, and 3rd nucleotides.
Options:
-circular when the first position is larger than the second position,
retrieves the sequece spanning across the end of the circular
chromosome. (ex: $gb->getseq(4639670, 5, -circular))
REST:
http://rest.g-language.org/NC_000913/get_gbkseq/1/3
Name: $gb->get_cdsseq() - get nucleotide sequence of the given CDS
Description:
Given a CDS ID, returns the CDS sequence.
'complement' is properly parsed.
eg. $gb->get_cdsseq('recA'); # returns the 'recA' gene sequence.
REST:
http://rest.g-language.org/NC_000913/get_cdsseq/recA
$gb->around_startcodon() | Top |
Name: $gb->around_startcodon() - get the sequence around the startcodon of the given CDS
Description:
Given a CDS ID, lengths before startcodon and after start codon,
returns the sequence around of start codon.
eg. $gb->around_startcodon('FEATURE5239', 100, 100);
# returns 100 bp sequence before and after the start codon of 'CDS1',
# including the start codon itself
Options:
Optional Fourth argument containing a string "without" returns only the sequence
before and after the start codon, and without the stat codon itself.
eg. $gb->around_startcodon('FEATURE5239', 100, 100, "without");
REST:
http://rest.g-language.org/NC_000913/around_startcodon/recA/200/200
$gb->around_stopcodon() | Top |
Name: $gb->around_stopcodon() - get the sequence around the stopcodon of the given CDS
Description:
Given a CDS ID, lengths before stopcodon and after stop codon,
returns the sequence around of stop codon.
eg. $gb->around_stopcodon('FEATURE5239', 100, 100);
# returns 100 bp sequence before and after the stop codon of 'CDS1',
# including the stop codon itself
Options:
Optional Fourth argument containing a string "without" returns only the sequence
before and after the stop codon, and without the stat codon itself.
eg. $gb->around_stopcodon('FEATURE5239', 100, 100, "without");
REST:
http://rest.g-language.org/NC_000913/around_startcodon/recA/200/200
$gb->before_startcodon() | Top |
Name: $gb->before_startcodon() - get the upstream sequence of the given CDS
Description:
Given a CDS ID and length, returns the sequence upstream of
start codon.
eg. $gb->before_startcodon('CDS1', 100);
# returns 100 bp sequence upstream of the start codon of 'CDS1'.
Options:
Second argument specifying the length of sequence to retrieve is
optional. (default: 100).
REST:
http://rest.g-language.org/NC_000913/before_startcodon/recA/200
$gb->after_startcodon() | Top |
Name: $gb->after_startcodon() - get the sequence downstream of start codon of the given CDS
Description:
Given a CDS ID and length, returns the sequence downstream of
start codon.
eg. $gb->after_startcodon('CDS1', 100);
# returns 100 bp sequence downstream of the start codon of 'CDS1'.
Options:
Second argument specifying the length of sequence to retrieve is
optional. (default: 100).
REST:
http://rest.g-language.org/NC_000913/after_startcodon/recA/200
$gb->before_stopcodon() | Top |
Name: $gb->before_stopcodon() - get the sequence upstream of stop codon of the given CDS
Description:
Given a CDS ID and length, returns the sequence upstream of
stop codon.
eg. $gb->before_stopcodon('CDS1', 100);
# returns 100 bp sequence upstream of the stop codon of 'CDS1'.
Options:
Second argument specifying the length of sequence to retrieve is
optional. (default: 100).
REST:
http://rest.g-language.org/NC_000913/before_stopcodon/recA/200
$gb->after_stopcodon() | Top |
Name: $gb->after_stopcodon() - get the downstream sequence of the given CDS
Description:
Given a CDS ID and length, returns the sequence downstream of
stop codon.
eg. $gb->after_stopcodon('CDS1', 100);
# returns 100 bp sequence downstream of the stop codon of 'CDS1'.
Options:
Second argument specifying the length of sequence to retrieve is
optional. (default: 100).
REST:
http://rest.g-language.org/NC_000913/after_stopcodon/recA/200
Name: $gb->pos2feature() - get a feature ID from position
Description:
Given a GenBank position (sequence starting from position 1)
returns the feature ID (ex. FEATURE123) of the feature at
the given position. If multiple features exist for the given
position, the first feature to appear is returned. Returns
NULL if no feature exists.
When two positions are specified, all features within given
range is returned as an array of feature IDs.
REST:
http://rest.g-language.org/NC_000913/pos2feature/1024
Name: $gb->pos2gene() - get a feature ID of CDS from position
Description:
Given a GenBank position (sequence starting from position 1)
returns the feature ID (ex. FEATURE123) of the gene at
the given position. If multiple genes exists for the given
position, the first gene to appear is returned. Returns
NULL if no gene exists.
When two positions are specified, all genes within given
range is returned as an array of feature IDs.
REST:
http://rest.g-language.org/NC_000913/pos2gene/1024
Name: $gb->get_geneseq() - get nucleotide sequence of the given gene
Description:
Given a CDS ID, returns the CDS sequence, or the exon sequence
If introns are present.
'complement' is properly parsed, and introns are spliced out.
Codon start position is used when codon_start key is available.
eg. $gb->get_geneseq('recA'); # returns the 'recA' gene sequence or exon.
REST:
http://rest.g-language.org/NC_000913/get_geneseq/recA
Name: $gb->intron() - get a list of intron sequences of the given CDS
Description:
Given a CDS ID, returns the intron sequences as array of
sequences.
eg. $gb->get_intron('CDS1');
# returns ($1st_intron, $2nd_intron,..)
REST:
http://rest.g-language.org/NC_000913/get_intron/recA
Name: $gb->get_exon() - get a concatenated exon sequence of the given CDS
Description:
Given a CDS ID, returns the exon sequence.
'complement' is properly parsed, and introns are spliced out.
eg. $gb->get_exon('CDS1'); returns the 'CDS1' exon.
REST:
http://rest.g-language.org/NC_000913/get_exon/recA
Name: $gb->get_exons() - get a list of exon sequences of the given CDS
Description:
Given a CDS ID, returns the sequences of exon sequence.
'complement' is properly parsed, and introns are spliced out.
eg. $gb->get_exons('CDS1'); returns the array of exon sequences.
REST:
http://rest.g-language.org/NG_016275/get_exons/IL10RA
$gb->disable_pseudogenes() | Top |
Name: $gb->feature() - get a list of feature IDs
Description:
Returns the array of all feature IDs.
Features are ignored when $gb->{$feature}->{on} is 0.
eg.
foreach ($gb->feature()){
$gb->get_cdsseq($_);
}
#prints all feature sequences.
Optionally, feature type can be supplied to return only the
specifies features.
eg. $gb->feature("tRNA"); # returns feature IDs only for tRNAs
Option of "all" always returns all features regardless of the
value of $gb->{$feature}->{on}.
REST:
http://rest.g-language.org/NC_000913/feature
Name: $gb->rRNA() - get a list of feature IDs of rRNAs
Description:
Returns the array of all feature IDs of rRNAs.
Options:
Second optional argument can specify the rRNA type
(5S, 16S, 23S, 5.8S, 18S, 28S). When used in scalar context,
only the longest match is returned.
ex:
@SSU = $gb->rRNA('16S');
$longestSSU = $gb->rRNA('16S');
REST:
http://rest.g-language.org/NC_000913/rRNA/16S
Name: $gb->cds() - get a list of CDS IDs
Description:
Returns the array of all feature IDs of CDS.
Features are ignored when $gb->{FEATURE$i}->{on} OR
$gb->{CDS$i}->{on} is 0.
!CAUTION! the object name is actually the FEATURE ID,
to enable access to all feature values. However, most of the
time you do not need to be aware of this difference.
eg.
foreach ($gb->cds()){
$gb->get_geneseq($_);
}
#prints all gene sequences.
Option of "all" always returns all features regardless of the
value of $gb->{$feature}->{on}.
REST:
http://rest.g-language.org/NC_000913/cds
Name: $gb->next_feature() - get the next feature ID
Description:
Given a feature ID, returns the ID of the next feature.
Second argument can be used to specify the type of the
next feature.
eg. $gb->next_feature(FEATURE1234); # returns 'FEATURE1235'
$gb->next_feature(FEATURE1234, 'tRNA');
# returns next feature ID whose type is 'tRNA'
REST:
http://rest.g-language.org/NC_000913/next_feature/recA
$gb->previous_feature() | Top |
Name: $gb->previous_feature() - get the previous feature ID
Description:
Given a feature ID, returns the ID of the previous feature.
Second argument can be used to specify the type of the
next feature.
eg. $gb->previous_feature(FEATURE1234); # returns 'FEATURE1233'
$gb->previous_feature(FEATURE1234, 'tRNA');
# returns previous feature ID whose type is 'tRNA'
REST:
http://rest.g-language.org/NC_000913/previous_feature/recA
Name: $gb->intergenic() - get a list of IDs of intergenic regions
Description:
Returns the array of all IDs of intergenic regions.
Here "intergenic region" is defined as the region in a genome
between coding and stable RNA genes.
REST:
http://rest.g-language.org/NC_000913/intergenic
$gb->relocate_origin() | Top |
Name: $gb->relocate_origin() - create a G instance starting at given position
Description:
Returns a G instance starting at given position, assuming circular
chromosome. All information, including the sequence and feature
annotations are moved. Note that the given position is Perl position
and NOT GenBank position. GenBank position -1 equals Perl position.
Usage:
$new = $gb->relocate_origin($position);
This method would probably be most useful in conjunction with
find_ori_ter(), to create a G instance starting from the
origin of replication, as follows:
($ori, $ter) = find_ori_ter($gb);
$new = $gb->relocate_origin($ori);
Several of related methods can be concatenated. For example,
to create a GenBank file of complementary DNA strand starting
from the origin of replication, do the following:
$gb->reverse_strand()->relocate_origin($ori)->output("out.gbk");
REST:
http://rest.g-language.org/NC_000913/relocate_origin
Name: $gb->reverse_strand() - create a G instance on complementary DNA strand
Description:
Returns a G instance for the complementary DNA strand.
All information, including the sequence and feature annotations
is switched to reflect that of the complementary DNA strand.
In other words, gene order, direction of genes (either direct or
complement), and positions are reversed.
Usage:
$new = $gb->reverse_strand();
REST:
http://rest.g-language.org/NC_000913/reverse_strand
Name: $gb->output() - output the G instance data to file
Description:
Given a filename and an option, outputs the G-language data object
to the specified file in a flat-file database of a given format.
The options are the same as those of new(). Default format is 'GenBank'.
eg. $gb->output("my_genome.embl", "EMBL");
$gb->output("my_genome.gbk"); # with GenBank you can ommit the option.
$gb->output("my_genome.gff3");
REST:
http://rest.g-language.org/NC_000913/output/embl (convert input genome to EMBL format)
$gb->set_generationtime() | Top |
Name: $gb->set_generationtime() - add generation time data for bacterial genomes
Description:
This method adds generation time data for bacterial genomes,
based on the data in Reference 1.
$gb->{FEATURE0}->{generationTime} is the generation time (hour), and
$gb->{feature0}->{optimumGrowthTemperature} is the optimum growth temperature in Celsius.
References:
1. Vieira-Silva S and Rocha RPC (2010) "The systemic imprint of growth and its uses in
ecological (meta)genomics", PLoS Genetics, 6(1):e1000808
REST:
http://rest.g-language.org/NC_000913/set_generationtime
Author:
Kazuharu Arakawa (gaou@sfc.keio.ac.jp)
HISTORY:
20110224-01 initial posting
$gb->set_essentiality() | Top |
Name: $gb->set_essentiality() - add gene essentiality data for E.coli genome
Description:
This method adds gene essentiality data for E.coli genome,
based on the data in PEC database (http://www.shigen.nig.ac.jp/ecoli/pec/index.jsp)
Only works for U00096 or NC_000913 E.coli K-12 MG1655 flatfile.
$gb->{$cds}->{essentiality} is 1 for essential, -1 for non-essential genes.
References:
1. Hashimoto M et al. (2005) "Cell size and nucleoid organization of engineered
Escherichia coli cells with a reduced genome", Mol Microbiol, 55(1):137-149
REST:
http://rest.g-language.org/NC_000913/set_essentiality
Author:
Kazuharu Arakawa (gaou@sfc.keio.ac.jp)
HISTORY:
20100123-01 bundled the data in distribution, moved the function to G::IO::Handler
20070219-01 initial posting