# neXtProt.pm
=head1 NAME

neXtProt

=head1 SYNOPSIS

 mv neXtProt.pm ~/.vep/Plugins
 ./vep -i variations.vcf --plugin neXtProt
 ./vep -i variations.vcf --plugin neXtProt,max_set=1

=head1 DESCRIPTION

This is a plugin for the Ensembl Variant Effect Predictor (VEP) that
retrieves data for missense and stop gain variants from neXtProt, which is a comprehensive
human-centric discovery platform that offers integration of and navigation
through protein-related data (https://www.nextprot.org/).
Please cite the neXtProt publication alongside the VEP if you use this resource:
This plugin is only suitable for small sets of variants, as an additional
individual remote API query is run for each variant.
Running options:
(Default) the data retrieved by default is the MatureProtein, NucleotidePhosphateBindingRegion,
Variant, MiscellaneousRegion, TopologicalDomain and InteractingRegion.
The plugin can also be run with other options to retrieve data other than the default.
Options are passed to the plugin as key=value pairs:
max_set : Set value to 1 to return all available protein-related data
(includes the default data)
return_values : The set of data to be returned.
Use file 'neXtProt_headers.txt' to check which data (labels) are available.
url : Set value to 1 to include the URL to link to the neXtProt entry.
all_labels : Set value to 1 to include all labels, even if data is not available.
position : Set value to 1 to include the start and end position in the protein.
* note: 'max_set' and 'return_values' cannot be used simultaneously.
By default, the plugin only returns data that is available. Example (default behaviour):
neXtProt_MatureProtein=Rho guanine nucleotide exchange factor 10
The option 'all_labels' returns a consistent set of the requested fields, using "-" where
values are not available. Same example as above:
neXtProt_MatureProtein=Rho guanine nucleotide exchange factor 10;
The plugin can then be run as default:
./vep -i variations.vcf --plugin neXtProt
or to return only the data specified by the user:
./vep -i variations.vcf --plugin neXtProt,return_values='Domain&InteractingRegion'

=cut

package neXtProt;
use strict;
use warnings;
use JSON::XS;
use Bio::EnsEMBL::Variation::Utils::BaseVepPlugin;
use base qw(Bio::EnsEMBL::Variation::Utils::BaseVepPlugin);
# Fields reported when neither 'max_set' nor 'return_values' is given.
# Keys are the output field names ('neXtProt_' + annotation type); values are
# the descriptions returned by get_header_info(). new() adds 'neXtProt_url'
# to this hash when the 'url' option is used with the default field set.
my $default_output = {
  'neXtProt_MatureProtein'                    => 'Extent of an active peptide or a polypetide chain in the mature protein',
  'neXtProt_NucleotidePhosphateBindingRegion' => 'Nucleotide phosphate binding region',
  'neXtProt_Variant'                          => 'Variant-specific annotations',
  'neXtProt_MiscellaneousRegion'              => 'Region of interest in the sequence',
  'neXtProt_TopologicalDomain'                => 'Location of non-membrane regions of membrane-spanning proteins',
  'neXtProt_InteractingRegion'                => 'Region interacting with another macromolecule'
};
# Constructor. Builds the plugin through the parent class and stores the
# key=value options obtained from params_to_hash() on the object.
#
# Options read: max_set, return_values, url, all_labels, position.
# Dies when 'max_set' and 'return_values' are both given.
# Returns the plugin object.
sub new {
  my $class = shift;

  my $self = $class->SUPER::new(@_);

  my $param_hash = $self->params_to_hash();

  if (defined($param_hash->{max_set}) && defined($param_hash->{return_values})) {
    die "ERROR: Can't use max_set and return_values simultaneously!\n";
  }

  # Return the isoform URL to the neXtProt web page
  if (defined($param_hash->{url})) {
    $self->{url} = $param_hash->{url};

    # The URL field has to be registered in whichever field set is in use.
    if (defined($param_hash->{max_set}) || defined($param_hash->{return_values})) {
      $self->{return_values_hash}->{'neXtProt_url'} = 'neXtProt URL';
    }
    else {
      $default_output->{'neXtProt_url'} = 'neXtProt URL';
    }
  }

  if (defined($param_hash->{max_set})) {
    $self->{max_set} = $param_hash->{max_set};
    # Load every label of the headers file; without this call the field set
    # used by get_header_info() and run() would stay empty.
    $self->build_data_hash(1);
  }

  if (defined($param_hash->{return_values})) {
    $self->{return_values} = $param_hash->{return_values};
    # Validate and load only the labels requested by the user.
    $self->build_data_hash(0);
  }

  foreach my $flag (qw(all_labels position)) {
    $self->{$flag} = $param_hash->{$flag} if defined($param_hash->{$flag});
  }

  return $self;
}
# Returns an array reference holding the single feature type 'Transcript'.
sub feature_types {
  return ['Transcript'];
}
# Returns a hash reference of output field name => description.
#
# With 'max_set' or 'return_values' the fields come from
# $self->{return_values_hash}; otherwise from $default_output.
# A fresh copy is returned, so callers cannot alter the stored field set.
sub get_header_info {
  my $self = shift;

  # Explicit dereference: calling keys() directly on a hash reference is no
  # longer valid Perl. A missing hash is treated as "no fields".
  my $fields = ($self->{max_set} || $self->{return_values})
    ? ($self->{return_values_hash} || {})
    : $default_output;

  my %header = %$fields;

  return \%header;
}
# Annotates one allele with neXtProt data.
#
# Only alleles having a consequence whose SO term matches 'missense_variant'
# or 'stop_gain' are processed. The protein position and the translation
# stable ID are inserted into the query built by get_sparql_query(), which is
# POSTed to https://sparql.nextprot.org/ with curl.
#
# Arguments : $tva - object providing get_all_OverlapConsequences,
#                    transcript_variation and transcript
# Returns   : hash reference of field name => values joined with '|',
#             limited to the requested fields. An empty hash reference is
#             returned when the allele is skipped, the query fails or no
#             annotation overlaps the position.
sub run {
  my ($self, $tva) = @_;

  return {} unless grep { $_->SO_term =~ /missense_variant|stop_gain/ } @{ $tva->get_all_OverlapConsequences };

  my $tv            = $tva->transcript_variation;
  my $peptide_start = $tv->translation_start;
  my $translation   = $tva->transcript->translation;

  return {} unless defined($translation) && defined($peptide_start);

  my $translation_id = $translation->stable_id;

  return {} unless defined($translation_id);

  my $query = $self->get_sparql_query($peptide_start, $translation_id);

  # Run the SPARQL query. The list form of open() hands the arguments to curl
  # directly, so the query text never goes through a shell. '-s' keeps curl
  # quiet.
  my @curl_args = (
    '-s',
    '-X', 'POST',
    '-H', 'Accept:application/sparql-results+json',
    '--data-urlencode', "query=$query",
    'https://sparql.nextprot.org/',
  );

  my $query_output;
  if (open(my $curl_fh, '-|', 'curl', @curl_args)) {
    local $/;
    $query_output = <$curl_fh>;
    close($curl_fh);
  }
  else {
    warn "WARNING: neXtProt plugin could not run curl: $!\n";
    return {};
  }

  return {} unless defined($query_output) && $query_output =~ /\S/;

  # A non-JSON answer (e.g. an error page) must not abort the whole VEP run.
  my $output = eval { decode_json($query_output) };
  if ($@ || ref($output) ne 'HASH') {
    warn "WARNING: neXtProt plugin could not parse the SPARQL response" . ($@ ? ": $@" : "\n");
    return {};
  }

  # Each binding holds: 'iso', 'nx_lnk', 'spos', 'epos', 'annot_type', 'annot_descr'
  # 'nx_lnk' -> isoform URL to neXtProt page; 'spos' -> start position; 'epos' -> end position;
  # 'annot_type' -> annotation type (e.g. PdbMapping, Variant, etc.); 'annot_descr' -> data
  my $sparql_results = $output->{results};
  my $output_list    = ref($sparql_results) eq 'HASH' ? $sparql_results->{bindings} : undef;

  return {} unless ref($output_list) eq 'ARRAY' && @$output_list;

  my %result_hash;

  foreach my $results (@$output_list) {
    my $isoform_url = $results->{nx_lnk}->{value};
    my $start_pos   = $results->{spos}->{value};
    my $end_pos     = $results->{epos}->{value};
    my $annot_type  = $results->{annot_type}->{value};
    my $data        = $results->{annot_descr}->{value};

    next unless defined($annot_type) && defined($data);

    # Keep only the part after the last '#' of the annotation type value
    $annot_type =~ s/.*#//;

    # PdbMapping and Variant values contain ';'
    if ($data =~ /;/) {
      if ($annot_type eq 'Variant') {
        $data =~ s/;/\./g;
      }
      else {
        $data =~ s/;//g;
      }
    }
    # Drop one trailing full stop
    $data =~ s/\.$//;

    # There is only one URL
    if ($self->{url} && !$result_hash{'neXtProt_url'} && defined($isoform_url)) {
      $result_hash{'neXtProt_url'} = [$isoform_url];
    }

    # Some annot_type have more than one value; store each distinct value
    # once, in the order returned by the query.
    my $annot_type_data = $self->{position} ? $start_pos . ',' . $end_pos . ',' . $data : $data;
    my $stored_values   = $result_hash{'neXtProt_' . $annot_type} ||= [];
    push @$stored_values, $annot_type_data unless grep { $_ eq $annot_type_data } @$stored_values;
  }

  # Explicit dereference: calling keys() directly on a hash reference is no
  # longer valid Perl.
  my $requested_fields = ($self->{max_set} || $self->{return_values})
    ? ($self->{return_values_hash} || {})
    : $default_output;

  my %result_hash_final;

  foreach my $key (keys %$requested_fields) {
    if ($result_hash{$key}) {
      $result_hash_final{$key} = join('|', @{ $result_hash{$key} });
    }
    elsif ($self->{all_labels}) {
      # Requested field without data: emit a placeholder
      $result_hash_final{$key} = '-';
    }
  }

  return \%result_hash_final;
}
# Builds the SPARQL query text sent by run().
#
# Arguments : $peptide_start  - position in the protein, bound to ?poi
#             $translation_id - translation stable ID, bound to ?ensp
# Returns   : the query as a string. It selects the positional annotations
#             of the matching isoform whose start/end range contains ?poi,
#             ordered by start position.
sub get_sparql_query {
  my ($self, $peptide_start, $translation_id) = @_;

  # The SERVICE group has to be closed before the neXtProt patterns start;
  # otherwise the braces of the query are unbalanced.
  my $query = "PREFIX : <http://nextprot.org/rdf#>
  PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
  PREFIX up: <http://purl.uniprot.org/core/>
  PREFIX isoform: <http://nextprot.org/rdf/isoform/>
  select distinct ?iso ?nx_lnk ?spos ?epos ?annot_type str(?txt) as ?annot_descr
  where {
    values ?poi {${peptide_start}}
    values ?ensp {'${translation_id}'}
    bind (IRI(CONCAT('http://rdf.ebi.ac.uk/resource/ensembl.protein/',?ensp)) as ?ENSP_IRI)
    SERVICE <http://sparql.uniprot.org/sparql> {
      ?enst up:translatedTo ?ENSP_IRI .
      ?enst rdfs:seeAlso ?upiso .
    }
    BIND(IRI(replace(str(?upiso),'http://purl.uniprot.org/isoforms/','http://nextprot.org/rdf/isoform/NX_')) AS ?iso) .
    ?entry :isoform ?iso .
    ?iso :positionalAnnotation ?statement .
    bind(replace(str(?iso),'http://nextprot.org/rdf/isoform/','') as ?iso_ac) .
    bind(replace(str(?entry),'http://nextprot.org/rdf/entry/','') as ?entry_ac) .
    bind(concat('https://www.nextprot.org/entry/', ?entry_ac, '/sequence?isoform=', ?iso_ac) as ?nx_lnk) .
    ?statement rdfs:comment ?txt .
    ?statement a ?annot_type .
    ?statement :start ?spos; :end ?epos .
    filter((?spos <= ?poi) && (?epos >= ?poi))
  } order by ?spos";

  return $query;
}
# Fills $self->{return_values_hash} from the file 'neXtProt_headers.txt'
# located in the same directory as this module.
#
# The file is read as tab-separated lines: label, description.
#
# Arguments : $option - true : register every label found in the file
#                       false: register only the labels listed in
#                              $self->{return_values} (separated by ';',
#                              '&' or '|')
# Dies when the file is missing or unreadable, when a line has no label, or
# when a requested label is not in the file.
sub build_data_hash {
  my $self = shift;
  my $option = shift; # Set to 1 to return all data from header file, set to 0 to return data specified by the user

  # dirname() also copes with a module path that has no directory part,
  # which would otherwise make the headers file resolve to '/'.
  require File::Basename;
  my $module_path = defined($INC{'neXtProt.pm'}) ? $INC{'neXtProt.pm'} : __FILE__;
  my $plugin_dir  = File::Basename::dirname($module_path);
  my $file        = $plugin_dir . '/neXtProt_headers.txt';

  if (! -e $file) {
    die ("ERROR: neXtProt_headers file is not available in $plugin_dir");
  }

  open(my $headers_fh, '<', $file) or die ("ERROR: Can't open $file: $!\n");

  my %headers_file_hash;
  while (my $line = <$headers_fh>) {
    # Strip the line ending so that it does not end up in the description
    $line =~ s/[\r\n]+\z//;
    next unless $line =~ /\S/;

    my ($value, $description) = split(/\t/, $line);
    die ("ERROR: neXtProt value is missing from file.") if (!defined($value) || $value eq '');
    $headers_file_hash{$value} = defined($description) ? $description : '';
  }
  close($headers_fh);

  # Going to return the data specified by the user
  if (!$option) {
    my $requested = defined($self->{return_values}) ? $self->{return_values} : '';
    my @data_from_user = split(/[\;\&\|]/, $requested);
    foreach my $dfu (@data_from_user) {
      next if $dfu eq '';
      # exists(): a label with an empty description is still a valid label
      if (exists($headers_file_hash{$dfu})) {
        $self->{return_values_hash}->{'neXtProt_' . $dfu} = $headers_file_hash{$dfu};
      }
      else {
        die ("ERROR: $dfu is not available in neXtProt. Check file 'neXtProt_headers.txt' to see the data that is valid to query.\n");
      }
    }
  }
  # Return all data available in header file
  else {
    foreach my $hf (keys %headers_file_hash) {
      $self->{return_values_hash}->{'neXtProt_' . $hf} = $headers_file_hash{$hf};
    }
  }

  return;
}

# A module has to end with a true value
1;
