加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
文件
克隆/下载
MaxEntScan.pm 26.79 KB
一键复制 编辑 原始数据 按行查看 历史
Helen Schuilenburg 提交于 2020-02-25 15:02 . Yearly copyright update
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895
=head1 LICENSE
Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
Copyright [2016-2020] EMBL-European Bioinformatics Institute
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
=head1 CONTACT
Ensembl <http://www.ensembl.org/info/about/contact/index.html>
=cut
=head1 NAME
MaxEntScan
=head1 SYNOPSIS
mv MaxEntScan.pm ~/.vep/Plugins
./vep -i variants.vcf --plugin MaxEntScan,/path/to/maxentscan/fordownload
./vep -i variants.vcf --plugin MaxEntScan,/path/to/maxentscan/fordownload,SWA,NCSS
=head1 DESCRIPTION
This is a plugin for the Ensembl Variant Effect Predictor (VEP) that
runs MaxEntScan (http://genes.mit.edu/burgelab/maxent/Xmaxentscan_scoreseq.html)
to get splice site predictions.
The plugin copies most of the code verbatim from the score5.pl and score3.pl
scripts provided in the MaxEntScan download. To run the plugin you must get and
unpack the archive from http://genes.mit.edu/burgelab/maxent/download/; the path
to this unpacked directory is then the param you pass to the --plugin flag.
The plugin executes the logic from one of the scripts depending on which
splice region the variant overlaps:
score5.pl : last 3 bases of exon --> first 6 bases of intron
score3.pl : last 20 bases of intron --> first 3 bases of exon
The plugin reports the reference, alternate and difference (REF - ALT) maximum
entropy scores.
If 'SWA' is specified as a command-line argument, a sliding window algorithm
is applied to subsequences containing the reference and alternate alleles to
identify k-mers with the highest donor and acceptor splice site scores. To assess
the impact of variants, reference comparison scores are also provided. For SNVs,
the comparison scores are derived from sequence in the same frame as the highest
scoring k-mers containing the alternate allele. For all other variants, the
comparison scores are derived from the highest scoring k-mers containing the
reference allele. The difference between the reference comparison and alternate
scores (SWA_REF_COMP - SWA_ALT) are also provided.
If 'NCSS' is specified as a command-line argument, scores for the nearest
upstream and downstream canonical splice sites are also included.
By default, only scores are reported. Add 'verbose' to the list of command-
line arguments to include the sequence output associated with those scores.
=cut
package MaxEntScan;
use strict;
use warnings;
use Digest::MD5 qw(md5_hex);
use Bio::EnsEMBL::Utils::Sequence qw(reverse_comp);
use Bio::EnsEMBL::Variation::Utils::VariationEffect qw(overlap);
use Bio::EnsEMBL::Variation::Utils::BaseVepPlugin;
use base qw(Bio::EnsEMBL::Variation::Utils::BaseVepPlugin);
# how many seq/score pairs to cache in memory
our $CACHE_SIZE = 50;
sub new {
my $class = shift;
my $self = $class->SUPER::new(@_);
# we need sequence, so no offline mode unless we have FASTA
die("ERROR: cannot function in offline mode without a FASTA file\n") if $self->{config}->{offline} && !$self->{config}->{fasta};
my $params = $self->params;
my $dir = shift @$params;
die("ERROR: MaxEntScan directory not specified\n") unless $dir;
die("ERROR: MaxEntScan directory not found\n") unless -d $dir;
$self->{_dir} = $dir;
## setup from score5.pl
$self->{'score5_me2x5'} = $self->score5_makescorematrix($dir.'/me2x5');
$self->{'score5_seq'} = $self->score5_makesequencematrix($dir.'/splicemodels/splice5sequences');
## setup from score3.pl
$self->{'score3_metables'} = $self->score3_makemaxentscores;
my %opts = map { $_ => undef } @$params;
$self->{'run_SWA'} = 1 if exists $opts{'SWA'};
$self->{'run_NCSS'} = 1 if exists $opts{'NCSS'};
$self->{'verbose'} = 1 if exists $opts{'verbose'};
return $self;
}
sub feature_types {
return ['Transcript'];
}
sub get_header_info {
my $self = shift;
my $v = $self->{'verbose'};
my $headers = $self->get_MES_header_info($v);
if ($self->{'run_SWA'}) {
my $swa_headers = $self->get_SWA_header_info($v);
$headers = {%$headers, %$swa_headers};
}
if ($self->{'run_NCSS'}) {
my $ncss_headers = $self->get_NCSS_header_info($v);
$headers = {%$headers, %$ncss_headers};
}
return $headers;
}
sub get_MES_header_info {
my ($self, $verbose) = @_;
my $headers = {
MaxEntScan_ref => "MaxEntScan reference sequence score",
MaxEntScan_alt => "MaxEntScan alternate sequence score",
MaxEntScan_diff => "MaxEntScan score difference",
};
if ($verbose) {
$headers->{'MaxEntScan_ref_seq'} = "MaxEntScan reference sequence";
$headers->{'MaxEntScan_alt_seq'} = "MaxEntScan alternate sequence";
}
return $headers;
}
sub get_SWA_header_info {
my ($self, $verbose) = @_;
my $headers = {
"MES-SWA_donor_ref" => "Highest splice donor reference sequence score",
"MES-SWA_donor_alt" => "Highest splice donor alternate sequence score",
"MES-SWA_donor_ref_comp" => "Donor reference comparison sequence score",
"MES-SWA_donor_diff" => "Difference between the donor reference comparison and alternate sequence scores",
"MES-SWA_acceptor_ref" => "Highest splice acceptor reference sequence score",
"MES-SWA_acceptor_alt" => "Highest splice acceptor alternate sequence score",
"MES-SWA_acceptor_ref_comp" => "Acceptor reference comparison sequence score",
"MES-SWA_acceptor_diff" => "Difference between the acceptor reference comparison and alternate sequence scores",
};
if ($verbose) {
$headers->{'MES-SWA_donor_ref_seq'} = "Highest splice donor reference sequence";
$headers->{'MES-SWA_donor_ref_frame'} = "Position of the highest splice donor reference sequence";
$headers->{'MES-SWA_donor_ref_context'} = "Selected donor sequence context containing the reference allele";
$headers->{'MES-SWA_donor_alt_seq'} = "Highest splice donor alternate sequence";
$headers->{'MES-SWA_donor_alt_frame'} = "Position of the highest splice donor alternate sequence";
$headers->{'MES-SWA_donor_alt_context'} = "Selected donor sequence context containing the alternate allele";
$headers->{'MES-SWA_donor_ref_comp_seq'} = "Donor reference comparison sequence";
$headers->{'MES-SWA_acceptor_ref_seq'} = "Highest splice acceptor reference sequence";
$headers->{'MES-SWA_acceptor_ref_frame'} = "Position of the highest splice acceptor reference sequence";
$headers->{'MES-SWA_acceptor_ref_context'} = "Selected acceptor sequence context containing the reference allele";
$headers->{'MES-SWA_acceptor_alt_seq'} = "Highest splice acceptor alternate sequence";
$headers->{'MES-SWA_acceptor_alt_frame'} = "Position of the highest splice acceptor alternate sequence";
$headers->{'MES-SWA_acceptor_alt_context'} = "Selected acceptor sequence context containing the alternate allele";
$headers->{'MES-SWA_acceptor_ref_comp_seq'} = "Acceptor reference comparison sequence";
}
return $headers;
}
sub get_NCSS_header_info {
my ($self, $verbose) = @_;
my $headers = {
"MES-NCSS_upstream_acceptor" => "Nearest upstream canonical splice acceptor sequence score",
"MES-NCSS_upstream_donor" => "Nearest upstream canonical splice donor sequence score",
"MES-NCSS_downstream_acceptor" => "Nearest downstream canonical splice acceptor sequence score",
"MES-NCSS_downstream_donor" => "Nearest downstream canonical splice donor sequence score",
};
if ($verbose) {
$headers->{'MES-NCSS_upstream_acceptor_seq'} = "Nearest upstream canonical splice acceptor sequence";
$headers->{'MES-NCSS_upstream_donor_seq'} = "Nearest upstream canonical splice donor sequence";
$headers->{'MES-NCSS_downstream_acceptor_seq'} = "Nearest downstream canonical splice acceptor sequence";
$headers->{'MES-NCSS_downstream_donor_seq'} = "Nearest downstream canonical splice donor sequence";
}
return $headers;
}
sub run {
my ($self, $tva) = @_;
my $seq_headers = $self->get_MES_header_info();
my $results = $self->run_MES($tva);
if ($self->{'run_SWA'}) {
my $swa_seq_headers = $self->get_SWA_header_info();
$seq_headers = {%$seq_headers, %$swa_seq_headers};
my $swa_results = $self->run_SWA($tva);
$results = {%$results, %$swa_results};
}
if ($self->{'run_NCSS'}) {
my $ncss_seq_headers = $self->get_NCSS_header_info();
$seq_headers = {%$seq_headers, %$ncss_seq_headers};
my $ncss_results = $self->run_NCSS($tva);
$results = {%$results, %$ncss_results};
}
my %data;
# add the scores
my @scores = grep { exists $results->{$_} } keys %$seq_headers;
@data{@scores} = map { sprintf('%.3f', $_) } @{$results}{@scores};
if ($self->{'verbose'}) {
# add any remaining results
my @non_scores = grep { ! exists $data{$_} } keys %$results;
@data{@non_scores} = @{$results}{@non_scores};
}
return \%data;
}
sub run_MES {
my ($self, $tva) = @_;
my $vf = $tva->variation_feature;
return {} unless $vf->{start} == $vf->{end} && $tva->feature_seq =~ /^[ACGT]$/;
my $tv = $tva->transcript_variation;
my $tr = $tva->transcript;
my $tr_strand = $tr->strand;
my ($vf_start, $vf_end) = ($vf->start, $vf->end);
# use _overlapped_introns() method from BaseTranscriptVariation
# this will use an interval tree if available for superfast lookup of overlapping introns
# we have to expand the search space around $vf because we're looking for the splice region not the intron per se
foreach my $intron(@{$tv->_overlapped_introns($vf_start - 21, $vf_end + 21)}) {
# get coords depending on strand
# MaxEntScan does different predictions for 5 and 3 prime
# and we need to feed it different bits of sequence for each
#
# 5prime, 3 bases of exon, 6 bases of intron:
# ===------
#
# 3prime, 20 bases of intron, 3 bases of exon
# --------------------===
my ($five_start, $five_end, $three_start, $three_end);
if($tr_strand > 0) {
($five_start, $five_end) = ($intron->start - 3, $intron->start + 5);
($three_start, $three_end) = ($intron->end - 19, $intron->end + 3);
}
else {
($five_start, $five_end) = ($intron->end - 5, $intron->end + 3);
($three_start, $three_end) = ($intron->start - 3, $intron->start + 19);
}
if(overlap($vf->start, $vf->end, $five_start, $five_end)) {
my ($ref_seq, $alt_seq) = @{$self->get_seqs($tva, $five_start, $five_end)};
return {} unless defined($ref_seq) && $ref_seq =~ /^[ACGT]+$/;
return {} unless defined($alt_seq) && $alt_seq =~ /^[ACGT]+$/;
my $ref_score = $self->score5($ref_seq);
my $alt_score = $self->score5($alt_seq);
return {
MaxEntScan_ref => $ref_score,
MaxEntScan_ref_seq => $ref_seq,
MaxEntScan_alt => $alt_score,
MaxEntScan_alt_seq => $alt_seq,
MaxEntScan_diff => $ref_score - $alt_score,
}
}
if(overlap($vf->start, $vf->end, $three_start, $three_end)) {
my ($ref_seq, $alt_seq) = @{$self->get_seqs($tva, $three_start, $three_end)};
return {} unless defined($ref_seq) && $ref_seq =~ /^[ACGT]+$/;
return {} unless defined($alt_seq) && $alt_seq =~ /^[ACGT]+$/;
my $ref_score = $self->score3($ref_seq);
my $alt_score = $self->score3($alt_seq);
return {
MaxEntScan_ref => $ref_score,
MaxEntScan_ref_seq => $ref_seq,
MaxEntScan_alt => $alt_score,
MaxEntScan_alt_seq => $alt_seq,
MaxEntScan_diff => $ref_score - $alt_score,
}
}
}
return {};
}
sub run_SWA {
my ($self, $tva) = @_;
my $vf = $tva->variation_feature;
my %results;
# get the donor reference and alternate sequence contexts
my ($donor_ref_context, $donor_alt_context) = @{$self->get_seqs($tva, $vf->start - 8, $vf->end + 8)};
if (defined($donor_ref_context)) {
$results{'MES-SWA_donor_ref_context'} = $donor_ref_context;
if ($donor_ref_context =~ /^[ACGT]+$/) {
my ($seq, $frame, $score) = @{$self->get_max_donor($donor_ref_context)};
$results{'MES-SWA_donor_ref_seq'} = $seq;
$results{'MES-SWA_donor_ref_frame'} = $frame;
$results{'MES-SWA_donor_ref'} = $score;
}
}
if (defined($donor_alt_context)) {
$results{'MES-SWA_donor_alt_context'} = $donor_alt_context;
if ($donor_alt_context =~ /^[ACGT]+$/) {
my ($seq, $frame, $score) = @{$self->get_max_donor($donor_alt_context)};
$results{'MES-SWA_donor_alt_seq'} = $seq;
$results{'MES-SWA_donor_alt_frame'} = $frame;
$results{'MES-SWA_donor_alt'} = $score;
if (defined(my $ref_comp_seq = $results{'MES-SWA_donor_ref_seq'})) {
if ($vf->{start} == $vf->{end} && $tva->feature_seq =~ /^[ACGT]$/) {
# for SNVs, compare to the same frame as the highest scoring ALT k-mer
$ref_comp_seq = substr($donor_ref_context, $frame - 1, 9);
}
$results{'MES-SWA_donor_ref_comp_seq'} = $ref_comp_seq;
$results{'MES-SWA_donor_ref_comp'} = $self->score5($ref_comp_seq);
$results{'MES-SWA_donor_diff'} = $results{'MES-SWA_donor_ref_comp'} - $score;
}
}
}
# get the acceptor reference and alternate sequence contexts
my ($acceptor_ref_context, $acceptor_alt_context) = @{$self->get_seqs($tva, $vf->start - 22, $vf->end + 22)};
if (defined($acceptor_ref_context)) {
$results{'MES-SWA_acceptor_ref_context'} = $acceptor_ref_context;
if ($acceptor_ref_context =~ /^[ACGT]+$/) {
my ($seq, $frame, $score) = @{$self->get_max_acceptor($acceptor_ref_context)};
$results{'MES-SWA_acceptor_ref_seq'} = $seq;
$results{'MES-SWA_acceptor_ref_frame'} = $frame;
$results{'MES-SWA_acceptor_ref'} = $score;
}
}
if (defined($acceptor_alt_context)) {
$results{'MES-SWA_acceptor_alt_context'} = $acceptor_alt_context;
if ($acceptor_alt_context =~ /^[ACGT]+$/) {
my ($seq, $frame, $score) = @{$self->get_max_acceptor($acceptor_alt_context)};
$results{'MES-SWA_acceptor_alt_seq'} = $seq;
$results{'MES-SWA_acceptor_alt_frame'} = $frame;
$results{'MES-SWA_acceptor_alt'} = $score;
if (defined(my $ref_comp_seq = $results{'MES-SWA_acceptor_ref_seq'})) {
if ($vf->{start} == $vf->{end} && $tva->feature_seq =~ /^[ACGT]$/) {
# for SNVs, compare to the same frame as the highest scoring ALT k-mer
$ref_comp_seq = substr($acceptor_ref_context, $frame - 1, 23);
}
$results{'MES-SWA_acceptor_ref_comp_seq'} = $ref_comp_seq;
$results{'MES-SWA_acceptor_ref_comp'} = $self->score3($ref_comp_seq);
$results{'MES-SWA_acceptor_diff'} = $results{'MES-SWA_acceptor_ref_comp'} - $score;
}
}
}
return \%results;
}
sub run_NCSS {
my ($self, $tva) = @_;
my $tv = $tva->transcript_variation;
my $tr = $tva->transcript;
my %results;
if ($tv->intron_number) {
my ($intron_numbers, $total_introns) = split(/\//, $tv->intron_number);
my $intron_number = (split(/-/, $intron_numbers))[0];
my $introns = $tr->get_all_Introns;
my $intron_idx = $intron_number - 1;
my $intron = $introns->[$intron_idx];
if (defined(my $seq = $self->get_donor_seq_from_intron($intron))) {
$results{'MES-NCSS_upstream_donor_seq'} = $seq;
$results{'MES-NCSS_upstream_donor'} = $self->score5($seq) if $seq =~ /^[ACGT]+$/;
}
if (defined(my $seq = $self->get_acceptor_seq_from_intron($intron))) {
$results{'MES-NCSS_downstream_acceptor_seq'} = $seq;
$results{'MES-NCSS_downstream_acceptor'} = $self->score3($seq) if $seq =~ /^[ACGT]+$/;
}
# don't calculate an upstream acceptor score if the intron is the first in the transcript
unless ($intron_number == 1) {
my $upstream_intron = $introns->[$intron_idx - 1];
if (defined(my $seq = $self->get_acceptor_seq_from_intron($upstream_intron))) {
$results{'MES-NCSS_upstream_acceptor_seq'} = $seq;
$results{'MES-NCSS_upstream_acceptor'} = $self->score3($seq) if $seq =~ /^[ACGT]+$/;
}
}
# don't calculate a downstream donor score if the intron is the last in the transcript
unless ($intron_number == $total_introns) {
my $downstream_intron = $introns->[$intron_idx + 1];
if (defined(my $seq = $self->get_donor_seq_from_intron($downstream_intron))) {
$results{'MES-NCSS_downstream_donor_seq'} = $seq;
$results{'MES-NCSS_downstream_donor'} = $self->score5($seq) if $seq =~ /^[ACGT]+$/;
}
}
}
elsif ($tv->exon_number) {
my ($exon_numbers, $total_exons) = split(/\//, $tv->exon_number);
my $exon_number = (split(/-/, $exon_numbers))[0];
my $exons = $tr->get_all_Exons;
my $exon_idx = $exon_number - 1;
my $exon = $exons->[$exon_idx];
# don't calculate upstream scores if the exon is the first in the transcript
unless ($exon_number == 1) {
my $upstream_exon = $exons->[$exon_idx - 1];
if (defined(my $seq = $self->get_donor_seq_from_exon($upstream_exon))) {
$results{'MES-NCSS_upstream_donor_seq'} = $seq;
$results{'MES-NCSS_upstream_donor'} = $self->score5($seq) if $seq =~ /^[ACGT]+$/;
}
if (defined(my $seq = $self->get_acceptor_seq_from_exon($exon))) {
$results{'MES-NCSS_upstream_acceptor_seq'} = $seq;
$results{'MES-NCSS_upstream_acceptor'} = $self->score3($seq) if $seq =~ /^[ACGT]+$/;
}
}
# don't calculate downstream scores if the exon is the last exon in the transcript
unless ($exon_number == $total_exons) {
my $downstream_exon = $exons->[$exon_idx + 1];
if (defined(my $seq = $self->get_donor_seq_from_exon($exon))) {
$results{'MES-NCSS_downstream_donor_seq'} = $seq;
$results{'MES-NCSS_downstream_donor'} = $self->score5($seq) if $seq =~ /^[ACGT]+$/;
}
if (defined(my $seq = $self->get_acceptor_seq_from_exon($downstream_exon))) {
$results{'MES-NCSS_downstream_acceptor_seq'} = $seq;
$results{'MES-NCSS_downstream_acceptor'} = $self->score3($seq) if $seq =~ /^[ACGT]+$/;
}
}
}
return \%results;
}
## Sliding window approach methods
##################################
sub get_max_donor {
my ($self, $sequence) = @_;
my ($seq, $frame, $max);
my @kmers = @{$self->sliding_window($sequence, 9)};
for my $i (0 .. $#kmers) {
my $kmer = $kmers[$i];
my $score = $self->score5($kmer);
if(!$max || $score > $max) {
$seq = $kmer;
$frame = $i + 1;
$max = $score;
}
}
return [$seq, $frame, $max];
}
sub get_max_acceptor {
my ($self, $sequence) = @_;
my ($seq, $frame, $max);
my @kmers = @{$self->sliding_window($sequence, 23)};
for my $i (0 .. $#kmers) {
my $kmer = $kmers[$i];
my $score = $self->score3($kmer);
if(!$max || $score > $max) {
$seq = $kmer;
$frame = $i + 1;
$max = $score;
}
}
return [$seq, $frame, $max];
}
sub sliding_window {
my ($self, $sequence, $winsize) = @_;
my @seqs;
for (my $i = 1; $i <= length($sequence) - $winsize + 1; $i++) {
push @seqs, substr($sequence, $i - 1, $winsize);
}
return \@seqs;
}
## Nearest canonical splice site methods
########################################
sub get_donor_seq_from_exon {
my ($self, $exon) = @_;
my ($start, $end);
if ($exon->strand > 0) {
($start, $end) = ($exon->end - 2, $exon->end + 6);
}
else {
($start, $end) = ($exon->start - 6, $exon->start + 2);
}
my $slice = $exon->slice()->sub_Slice($start, $end, $exon->strand);
my $seq = $slice->seq() if defined($slice);
return $seq;
}
sub get_acceptor_seq_from_exon {
my ($self, $exon) = @_;
my ($start, $end);
if ($exon->strand > 0) {
($start, $end) = ($exon->start - 20, $exon->start + 2);
}
else {
($start, $end) = ($exon->end - 2, $exon->end + 20);
}
my $slice = $exon->slice()->sub_Slice($start, $end, $exon->strand);
my $seq = $slice->seq() if defined($slice);
return $seq;
}
sub get_donor_seq_from_intron {
my ($self, $intron) = @_;
my ($start, $end);
if ($intron->strand > 0) {
($start, $end) = ($intron->start - 3, $intron->start + 5);
}
else {
($start, $end) = ($intron->end - 5, $intron->end + 3);
}
my $slice = $intron->slice()->sub_Slice($start, $end, $intron->strand);
my $seq = $slice->seq() if defined($slice);
return $seq;
}
sub get_acceptor_seq_from_intron {
my ($self, $intron) = @_;
my ($start, $end);
if ($intron->strand > 0) {
($start, $end) = ($intron->end - 19, $intron->end + 3);
}
else {
($start, $end) = ($intron->start - 3, $intron->start + 19);
}
my $slice = $intron->slice()->sub_Slice($start, $end, $intron->strand);
my $seq = $slice->seq() if defined($slice);
return $seq;
}
## Common methods
#################
sub get_seqs {
my ($self, $tva, $start, $end) = @_;
my $vf = $tva->variation_feature;
my $tr_strand = $tva->transcript->strand;
my $ref_slice = $vf->{slice}->sub_Slice($start, $end, $tr_strand);
my ($ref_seq, $alt_seq);
if (defined $ref_slice) {
$ref_seq = $alt_seq = $ref_slice->seq();
my $substr_start = $tr_strand > 0 ? $vf->{start} - $start : $end - $vf->{end};
my $feature_seq = $tva->seq_length > 0 ? $tva->feature_seq : '';
substr($alt_seq, $substr_start, ($vf->{end} - $vf->{start}) + 1) = $feature_seq;
}
return [$ref_seq, $alt_seq];
}
sub score5 {
my $self = shift;
my $seq = shift;
my $hex = md5_hex($seq);
# check cache
if($self->{cache}) {
my ($res) = grep {$_->{hex} eq $hex} @{$self->{cache}->{score5}};
return $res->{score} if $res;
}
my $a = $self->score5_scoreconsensus($seq);
die("ERROR: No score5_scoreconsensus\n") unless defined($a);
my $b = $self->score5_getrest($seq);
die("ERROR: No score5_getrest\n") unless defined($b);
my $c = $self->{'score5_seq'}->{$b};
die("ERROR: No score5_seq for $b\n") unless defined($c);
my $d = $self->{'score5_me2x5'}->{$c};
die("ERROR: No score5_me2x5 for $c\n") unless defined($d);
my $score = $self->log2($a * $d);
# cache it
push @{$self->{cache}->{score5}}, { hex => $hex, score => $score };
shift @{$self->{cache}->{score5}} while scalar @{$self->{cache}->{score5}} > $CACHE_SIZE;
return $score;
}
sub score3 {
my $self = shift;
my $seq = shift;
my $hex = md5_hex($seq);
# check cache
if($self->{cache}) {
my ($res) = grep {$_->{hex} eq $hex} @{$self->{cache}->{score3}};
return $res->{score} if $res;
}
my $a = $self->score3_scoreconsensus($seq);
die("ERROR: No score3_scoreconsensus\n") unless defined($a);
my $b = $self->score3_getrest($seq);
die("ERROR: No score3_getrest\n") unless defined($b);
my $c = $self->score3_maxentscore($b, $self->{'score3_metables'});
die("ERROR: No score3_maxentscore for $b\n") unless defined($c);
my $score = $self->log2($a * $c);
# cache it
push @{$self->{cache}->{score3}}, { hex => $hex, score => $score };
shift @{$self->{cache}->{score3}} while scalar @{$self->{cache}->{score3}} > $CACHE_SIZE;
return $score;
}
## methods copied from score5.pl
################################
sub score5_makesequencematrix {
my $self = shift;
my $file = shift;
my %matrix;
my $n=0;
open(SCOREF, $file) || die "Can't open $file!\n";
while(<SCOREF>) {
chomp;
$_=~ s/\s//;
$matrix{$_} = $n;
$n++;
}
close(SCOREF);
return \%matrix;
}
sub score5_makescorematrix {
my $self = shift;
my $file = shift;
my %matrix;
my $n=0;
open(SCOREF, $file) || die "Can't open $file!\n";
while(<SCOREF>) {
chomp;
$_=~ s/\s//;
$matrix{$n} = $_;
$n++;
}
close(SCOREF);
return \%matrix;
}
sub score5_getrest {
my $self = shift;
my $seq = shift;
my @seqa = split(//,uc($seq));
return $seqa[0].$seqa[1].$seqa[2].$seqa[5].$seqa[6].$seqa[7].$seqa[8];
}
sub score5_scoreconsensus {
my $self = shift;
my $seq = shift;
my @seqa = split(//,uc($seq));
my %bgd;
$bgd{'A'} = 0.27;
$bgd{'C'} = 0.23;
$bgd{'G'} = 0.23;
$bgd{'T'} = 0.27;
my %cons1;
$cons1{'A'} = 0.004;
$cons1{'C'} = 0.0032;
$cons1{'G'} = 0.9896;
$cons1{'T'} = 0.0032;
my %cons2;
$cons2{'A'} = 0.0034;
$cons2{'C'} = 0.0039;
$cons2{'G'} = 0.0042;
$cons2{'T'} = 0.9884;
my $addscore = $cons1{$seqa[3]}*$cons2{$seqa[4]}/($bgd{$seqa[3]}*$bgd{$seqa[4]});
return $addscore;
}
sub log2 {
my ($self, $val) = @_;
return log($val)/log(2);
}
## methods copied from score3.pl
################################
sub score3_hashseq {
#returns hash of sequence in base 4
# $self->score3_hashseq('CAGAAGT') returns 4619
my $self = shift;
my $seq = shift;
$seq = uc($seq);
$seq =~ tr/ACGT/0123/;
my @seqa = split(//,$seq);
my $sum = 0;
my $len = length($seq);
my @four = (1,4,16,64,256,1024,4096,16384);
my $i=0;
while ($i<$len) {
$sum+= $seqa[$i] * $four[$len - $i -1] ;
$i++;
}
return $sum;
}
sub score3_makemaxentscores {
my $self = shift;
my $dir = $self->{'_dir'}."/splicemodels/";
my @list = ('me2x3acc1','me2x3acc2','me2x3acc3','me2x3acc4',
'me2x3acc5','me2x3acc6','me2x3acc7','me2x3acc8','me2x3acc9');
my @metables;
my $num = 0 ;
foreach my $file (@list) {
my $n = 0;
open (SCOREF,"<".$dir.$file) || die "Can't open $file!\n";
while(<SCOREF>) {
chomp;
$_=~ s/\s//;
$metables[$num]{$n} = $_;
$n++;
}
close(SCOREF);
#print STDERR $file."\t".$num."\t".$n."\n";
$num++;
}
return \@metables;
}
sub score3_maxentscore {
my $self = shift;
my $seq = shift;
my $table_ref = shift;
my @metables = @$table_ref;
my @sc;
$sc[0] = $metables[0]{$self->score3_hashseq(substr($seq,0,7))};
$sc[1] = $metables[1]{$self->score3_hashseq(substr($seq,7,7))};
$sc[2] = $metables[2]{$self->score3_hashseq(substr($seq,14,7))};
$sc[3] = $metables[3]{$self->score3_hashseq(substr($seq,4,7))};
$sc[4] = $metables[4]{$self->score3_hashseq(substr($seq,11,7))};
$sc[5] = $metables[5]{$self->score3_hashseq(substr($seq,4,3))};
$sc[6] = $metables[6]{$self->score3_hashseq(substr($seq,7,4))};
$sc[7] = $metables[7]{$self->score3_hashseq(substr($seq,11,3))};
$sc[8] = $metables[8]{$self->score3_hashseq(substr($seq,14,4))};
my $finalscore = $sc[0] * $sc[1] * $sc[2] * $sc[3] * $sc[4] / ($sc[5] * $sc[6] * $sc[7] * $sc[8]);
return $finalscore;
}
sub score3_getrest {
my $self = shift;
my $seq = shift;
my $seq_noconsensus = substr($seq,0,18).substr($seq,20,3);
return $seq_noconsensus;
}
sub score3_scoreconsensus {
my $self = shift;
my $seq = shift;
my @seqa = split(//,uc($seq));
my %bgd;
$bgd{'A'} = 0.27;
$bgd{'C'} = 0.23;
$bgd{'G'} = 0.23;
$bgd{'T'} = 0.27;
my %cons1;
$cons1{'A'} = 0.9903;
$cons1{'C'} = 0.0032;
$cons1{'G'} = 0.0034;
$cons1{'T'} = 0.0030;
my %cons2;
$cons2{'A'} = 0.0027;
$cons2{'C'} = 0.0037;
$cons2{'G'} = 0.9905;
$cons2{'T'} = 0.0030;
my $addscore = $cons1{$seqa[18]} * $cons2{$seqa[19]}/ ($bgd{$seqa[18]} * $bgd{$seqa[19]});
return $addscore;
}
1;
Loading...
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化