加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
文件
该仓库未声明开源许可证文件(LICENSE),使用请关注具体项目描述及其代码上游依赖。
克隆/下载
BtToxin_scanner2.pl 30.50 KB
一键复制 编辑 原始数据 按行查看 历史
liaochenlanruo 提交于 2018-05-17 10:08 . Add files via upload
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928
#!/usr/bin/perl -w
use strict;
use Bio::SeqIO;
use Getopt::Long;
use Pod::Usage;
use Getopt::Std;
if ($#ARGV < 1 ) {
&showhelp();
exit;
}else{
foreach my $i (0..$#ARGV) {
if ($ARGV[$i] eq "-h") {
&showhelp();
exit;
}
}
}
my $kmmer ='81';
my $cpus = '6';
my $tr_table ='11';
my $evalue = '1e-5';
my $reads1 = '.R1.clean.fastq.gz';
my $reads2 = '.R2.clean.fastq.gz';
my $postfix = '-8.fa';
my $coverage = '50';
my $database = 'Bt_toxin';
my $type = 'reads';
my $dir = 'CDs';
my $cdsa = '.faa';
my $cdsn ='.ffn';
use vars qw($opt_k $opt_n $opt_d $opt_e $opt_1 $opt_2 $opt_c $opt_g $opt_3 $opt_s $opt_4 $opt_5 $opt_p);
getopts('k:n:g:e:1:2:3:c:d:s:p:4:5:');
$kmmer =$opt_k if $opt_k;
$cpus =$opt_n if $opt_n;
$tr_table = $opt_g if $opt_g;
$evalue = $opt_e if $opt_e;
$reads1 = $opt_1 if $opt_1;
$reads2 = $opt_2 if $opt_2;
$postfix = $opt_3 if $opt_3;
$coverage = $opt_c if $opt_c;
$database = $opt_d if $opt_d;
$type = $opt_s if $opt_s;
$dir = $opt_p if $opt_p;
$cdsa = $opt_4 if $opt_4;
$cdsn = $opt_5 if $opt_5;
if ($type eq "reads") {
system("mkdir scaf");
my @files = glob("*$reads1");
my %lists;
foreach (@files) {
if (/(\S+)$reads1/) {
$lists{$1} = "1";
}
}
my @lists = keys %lists;
foreach my $name(@lists) {
my $read1 = $name . $reads1;
my $read2 = $name . $reads2;
my $str = $name;
print "Assembling...\n";
system("abyss-pe name=$name k=$kmmer in='$read1 $read2' np=$cpus");
print "Assemble complete !\n";
my $assem = $name . "_assembly";
system("mkdir $assem");
my $scaf = $name . "-8.fa";
my $faa = $str . ".faa";
my $fna = $str . ".ffn";
my $pout = $str . ".prodigal";
my $outdir = $str . "_cds";
system("mkdir $outdir");
print "Running ORFs finding ...\n";
system("prodigal -o $pout -i $scaf -a $faa -d $fna -g $tr_table");
print "Genes prediction complete !\n";
system("mv $faa $fna $pout $outdir/");
my $blast = $str . ".blast";
print "Running blastp against the Bt_toxin database...\n";
system("blastp -query $outdir/$faa -db $database -outfmt 6 -out $blast -evalue $evalue -max_target_seqs 1 -num_threads $cpus");
my %ids;
open BLAST, "$blast" or die;
while (<BLAST>) {
chomp;
my @tmps = split /\s+/, $_;
$ids{$tmps[0]} = "1";
}
my $in = Bio::SeqIO->new(-file=>"$outdir/$faa", -format=>"fasta");
my $inn = Bio::SeqIO->new(-file=>"$outdir/$fna", -format=>"fasta");
my $cry = $str . "_cry.faa";
my $cryn = $str . "_cry.fna";
my $out = Bio::SeqIO->new(-file=>">>$cry", -format=>"fasta");
my $outn = Bio::SeqIO->new(-file=>">>$cryn", -format=>"fasta");
while (my $seq = $in->next_seq) {
my $id = $seq->id;
if (exists $ids{$id}) {
$out->write_seq($seq);
}
}
while (my $seq = $inn->next_seq) {
my $id = $seq->id;
if (exists $ids{$id}) {
$outn->write_seq($seq);
}
}
system("mkdir cry CDs assembly reads");
system("cp $scaf scaf/");
system("mv *_cry.faa *_cry.fna *.blast cry/");
system("mv *_cds CDs/");
system("mv $read1 $read2 reads/");
system("mv $name* $assem/");
}
system("mv *_assembly assembly/");
}elsif($type eq "assembled"){
system("mkdir scaf");
my @assem = glob("*$postfix");
foreach my $scaf (@assem) {
$scaf=~/(\S+)$postfix/;
my $name = $1;
my $str = $name;
my $faa = $str . ".faa";
my $fna = $str . ".ffn";
my $pout = $str . ".prodigal";
my $outdir = $str . "_cds";
system("mkdir $outdir");
print "Running ORFs finding and annotating...\n";
system("prodigal -o $pout -i $scaf -a $faa -d $fna -g $tr_table");
print "Genes prediction complete !\n";
system("mv $faa $fna $pout $outdir/");
my $blast = $str . ".blast";
print "Running blastp against the Bt_toxin database...\n";
system("blastp -query $outdir/$faa -db $database -outfmt 6 -out $blast -evalue $evalue -max_target_seqs 1 -num_threads $cpus");
my %ids;
open BLAST, "$blast" or die;
while (<BLAST>) {
chomp;
my @tmps = split /\s+/, $_;
$ids{$tmps[0]} = "1";
}
my $in = Bio::SeqIO->new(-file=>"$outdir/$faa", -format=>"fasta");
my $inn = Bio::SeqIO->new(-file=>"$outdir/$fna", -format=>"fasta");
my $cry = $str . "_cry.faa";
my $cryn = $str . "_cry.fna";
my $out = Bio::SeqIO->new(-file=>">>$cry", -format=>"fasta");
my $outn = Bio::SeqIO->new(-file=>">>$cryn", -format=>"fasta");
while (my $seq = $in->next_seq) {
my $id = $seq->id;
if (exists $ids{$id}) {
$out->write_seq($seq);
}
}
while (my $seq = $inn->next_seq) {
my $id = $seq->id;
if (exists $ids{$id}) {
$outn->write_seq($seq);
}
}
system("mkdir cry CDs");
system("mv $scaf scaf/");
system("mv *_cry.faa *_cry.fna *.blast cry/");
system("mv *_cds CDs/");
}
}elsif($type eq "cds"){
system("mkdir cry");
chdir "$dir";
my @aa = glob("*$cdsa");
chdir "../";
foreach (@aa) {
$_ =~/(\S+)$cdsa/;
my $name = $1;
my $str = $name;
my $faa = $str . $cdsa;
my $fna = $str . $cdsn;
my $blast = $str . ".blast";
print "Running blastp against the Bt_toxin database...\n";
system("blastp -query $dir/$faa -db $database -outfmt 6 -out $blast -evalue $evalue -max_target_seqs 1 -num_threads $cpus");
my %ids;
open BLAST, "$blast" or die;
while (<BLAST>) {
chomp;
my @tmps = split /\s+/, $_;
$ids{$tmps[0]} = "1";
}
my $in = Bio::SeqIO->new(-file=>"$dir/$faa", -format=>"fasta");
my $inn = Bio::SeqIO->new(-file=>"$dir/$fna", -format=>"fasta");
my $cry = $str . "_cry.faa";
my $cryn = $str . "_cry.fna";
my $out = Bio::SeqIO->new(-file=>">>$cry", -format=>"fasta");
my $outn = Bio::SeqIO->new(-file=>">>$cryn", -format=>"fasta");
while (my $seq = $in->next_seq) {
my $id = $seq->id;
if (exists $ids{$id}) {
$out->write_seq($seq);
}
}
while (my $seq = $inn->next_seq) {
my $id = $seq->id;
if (exists $ids{$id}) {
$outn->write_seq($seq);
}
}
system("mv *_cry.faa *_cry.fna *.blast cry/");
}
}
#=====================================================================================================================================
#Function find new genes whose identities lt 95% and the length of aa gt 115aa, delete the genes who have the same aa sequence.
#=====================================================================================================================================
#21_fasta_to_one_line_v3.pl
#Function : make fasta sequences to a line(for fna and faa files).
#--------------------------------------------------------change faa to a line---------------------------------------------------------
print "Changing faa files to a line...\n\n";
chdir "cry";
my @faa = glob("*.faa");
foreach my $faa (@faa) {
my $name = substr($faa,0,(length($faa)-8));
my $out1 = $name . "_faa.out1";
open FAAI,"$faa" or die "Open FAAI Failed!\n";
open FAAO,">>$out1" or die "Open FAAO Failed!\n";
while (<FAAI>) {
chomp;
if (/>(\S+)/) {
print FAAO "\n" . ">" . $name . "_$1\t";
}else{
s/\n/\t/g;
print FAAO "$_";
}
}
}
close FAAI;
close FAAO;
my @out01 = glob("*.out1");
foreach my $out1 (@out01) {
my $name = substr($out1,0,(length($out1)-9));
my $out2 = $name . "_faa.out";
open READA,"$out1" or die "Open READA Failed!\n";
open OUTRA,">>$out2" or die "Open OUTRA Failed!\n";
readline READA; # skip the first line
while (<READA>){
chomp;
s/\*//g;#delete the "*" in the end
print OUTRA "$_\n";
}
}
close READA;
close OUTRA;
system("rm -rf *.out1");
#--------------------------------------------------change fna to a line------------------------------------------------------------
print "Changing fna files to a line...\n\n";
my @fna0 = glob("*.fna");
foreach my $fna (@fna0) {
my $name = substr($fna,0,(length($fna)-8));
my $out2 = $name . "_fna.out2";
open FNAI,"$fna" or die "Open FNAI Failed!\n";
open FNAO,">>$out2" or die "Open FNAO Failed!\n";
while (<FNAI>) {
chomp;
if (/>(\S+)/) {
print FNAO "\n" . ">" . $name . "_$1\t";
}else{
s/\n/\t/g;
print FNAO "$_";
}
}
}
close FNAI;
close FNAO;
my @out2 = glob("*.out2");
foreach my $out2 (@out2) {
my $name = substr($out2,0,(length($out2)-9));
my $out22 = $name . "_fna.out";
open READN,"$out2" or die "Open READN Failed!\n";
open OUTRN,">>$out22" or die "Open OUTRN Failed!\n";
readline READN; # skip the first line
while (<READN>){
chomp;
print OUTRN "$_\n";
}
}
close READN;
close OUTRN;
system("rm -rf *.out2");
#=================================================================================================================================
#22_length_fasta.pl
# Function: add sequence length to the end of the sequence.
print "Andding sequence length...\n\n";
my $chars =0 ;
my @out = glob("*.out");
foreach my $out (@out) {
my $name = substr($out,0,(length($out)-4));
my $length1 = $name . ".length1";
open IN1, "$out" or die "Open IN1 Failed!\n";
open OUT1, ">>$length1" or die "Open OUT1 Failed!\n";
while (<IN1>) {
chomp;
my @lines = split(/\t/);
foreach (@lines) {
while (/([A-Z]+)/g) {
$chars += length($1);
}
print OUT1 $_ . "\t" . "$chars" . "\n";
$chars = 0;
}
}
}
close IN1;
close OUT1;
#--------------------------------------------------------Change to one line------------------------------------------------------
my @length1 = glob("*.length1");
foreach my $length1 (@length1) {
my $name = substr($length1,0,(length($length1)-1));
my $out1 = $name . "2";
open IN2,"$length1" or die "Open IN2 Failed!\n";
open OUT2,">>$out1" or die "Open OUT2 Failed!\n";
while (<IN2>) {
chomp;
if (/>/) {
print OUT2 "\n" . "$_\t";
}else{
s/\n/\t/g;
print OUT2 "$_";
}
}
}
close IN2;
close OUT2;
my @out1 = glob("*.length2");
foreach my $out1 (@out1) {
my $name = substr($out1,0,(length($out1)-1));
my $out2 = $name . "3";
open READL,"$out1" or die "Open READL Failed!\n";
open OUTRL,">>$out2" or die "Open OUTRL Failed!\n";
readline READL; # skip the first line
while (<READL>){
chomp;
print OUTRL "$_\n";
}
}
close READL;
close OUTRL;
system("rm -rf *.length1 *.length2");
#-----------------Print out the needed columsminus the length of the Capitals in the IDs--------------------------------------
my @length3 = glob("*.length3");
foreach my $length3 (@length3) {
my $name = substr($length3,0,(length($length3)-8));
my $out3 = $name . ".length";
open INPUTL,"$length3" or die "Open INPUTL Failed!\n";
open OUTPUTL,">>$out3" or die "Open OUTPUTL Failed!\n";
while (<INPUTL>) {
chomp;
print OUTPUTL join "\t",(split(/\t/))[0,2,3],"\n";
}
}
close INPUTL;
close OUTPUTL;
system("rm -rf *.length3 *.out");
#================================================================================================================================
#23_co_faa_fna_length_v2.pl
# Function: make the fna and faa files to one file.
print "Make the fna and faa files to one file...\n\n";
my @fna1 = glob("*.fna");
foreach my $fna1 (@fna1) {
my $name = substr($fna1,0,(length($fna1)-8));
my $fna = $name . "_fna.length";
my $faa = $name . "_faa.length";
my $out = $name . "_cry.sequence";
my %hash;
open IN3,"$fna" or die "Open IN3 Failed!\n";
while (<IN3>){
chomp;
my ($id,$nucl,$lennl)=split(/\t/);
$hash{$id}=$nucl . "\t" . ($lennl-3);
}
close IN3;
open INPUT1,"$faa" or die "Open INPUT1 Failed!\n";
open OUT3, ">>$out" or die "Open OUT3 Failed!\n";
while (<INPUT1>){
chomp;
my ($id,$aa,$lenaa)=split(/\t/);
if (exists $hash{$id}) {
print OUT3 $id . "\t" . $hash{$id} . "\t" . $aa . "\t" .$lenaa . "\n";
}
}
}
close INPUT1;
close OUT3;
system("rm -rf *.length");
#================================================================================================================================
#24_id_pure.pl
# Function: divide the ids and the hits.
my @sequence = glob("*.sequence");
foreach (@sequence) {
my $name = substr($_,0,(length($_)-13));
my $out = $name . ".id_seqs";
open IN4,"$_" or die "Open IN4 Faild!\n";
open OUT4,">>$out" or die "Open OUT4 Failed!\n";
while (<IN4>) {
chomp;
if (/>(\S+)\s+([\d+\D+]+?)\t([\d+\D+]+)/) {
print OUT4 $1 . "\t" . $2 . "\t" . $3 . "\n";
}
}
}
close IN4;
close OUT4;
system("rm -rf *.sequence");
#===============================================================================================================================
#25_add_sequence_to_blast_v2.pl
# Function: combine the blast file with the "id_seqs" file to one line.
print "Combing the blast file with the 'id_seqs' file to one line...\n\n";
my @fna = glob("*.fna");
foreach my $fna (@fna) {
my $name = substr($fna,0,(length($fna)-8));
my $blast = $name . ".blast";
my $sequence = $name . ".id_seqs";
my $out = $name . "_cry.final";
my %hash;
open IN5,"$blast" or die "Open IN5 Failed!\n";
while (<IN5>){
chomp;
my ($id,$a,$b,$c,$d,$e,$f,$g,$h,$i,$j,$k)=split(/\t/);
my $idg = $name. "_$id";
$hash{$idg}=$a."\t".$b."\t".$c."\t".$d."\t".$e."\t".$f."\t".$g."\t".$h."\t".$i."\t".$j."\t".$k;
}
close IN5;
open INPUT2,"$sequence" or die "Open INPUT2 Failed!\n";
open OUT5, ">>$out" or die "Open OUT5 Failed!\n";
while (<INPUT2>){
chomp;
my ($id,$l,$m,$n,$o)=split(/\t/);
if (exists $hash{$id}) {
print OUT5 $id . "\t" . $hash{$id} . "\t" . $m."\t".$o."\t".$l."\t".$n."\n";
}
}
}
close INPUT2;
close OUT5;
system("rm -rf *.id_seqs");
#================================================================================================================================
#26_co_all_genes.pl
# Function: combine all genes to one line(All_genes.txt).
my @final = glob("*.final");
foreach (@final) {
open IN6,"$_" or die "Open IN6 Failed!\n";
open OUT6,">>All_genes.txt" or die "Open OUT6 Failed!\n";
print OUT6 join("\t", "Gene Ids", "Target", "Percent identity", "Alignment length", "Number of mismatches", "Number of gap opens", "1-based position of start in query", "1-based position of end in query", "1-based position of start in target", "1-based position of end in target", "E-value", "Bit score", "nucl length", "aa length", "nucl sequence", "prot sequence") . "\n";
while (<IN6>) {
chomp;
my @lines = split(/\t/);
my $cover = ($lines[3]/$lines[13])*100;
if ($cover >= $coverage) {
print OUT6 "$_\n";
}
}
}
close IN6;
close OUT6;
system ("mkdir faa fna blast");
system ("mv *.faa faa");
system ("mv *.fna fna");
system ("mv *.blast blast");
system ("rm -rf *.final");
#===============================================================================================================================
#27All_genes_lt95.pl
#-------------find new genes whose identities lt 95% and the length of aa gt 115aa----------------------------------------------
open IN7, "All_genes.txt" or die "Open IN7 Failed!\n";
open OUTPUT3,">>All_genes_real.txt" or die "Open OUTPUT3 Failed!\n";
<IN7>;
while (<IN7>) {
chomp;
if (/\d+\t(\d+)\t[A-Z]+/) {
my $length_aa = $1;
if ($length_aa > 115) {
print OUTPUT3 "$_\n";
}
}
}
close IN7;
close OUTPUT3;
open INPUT4, "All_genes_real.txt" or die "Open INPUT4 Failed!\n";
open OUT8, ">>All_genes_real_lt95.txt" or die "Open OUT8 Failed!\n";
while (<INPUT4>) {
chomp;
if (/^\S+\t\S+\t(\d+)/) {
my $identity = $1;
if ($identity < 95.00) {
print OUT8 "$_\n";
}
}
}
close INPUT4;
close OUT8;
open INPUTX, "All_genes_real.txt" or die "Open INPUTX Failed!\n";
open OUTX, ">>All_genes_real_gt95.txt" or die "Open OUTX Failed!\n";
print OUTX join("\t", "Gene Ids", "Target", "Percent identity", "Alignment length", "Number of mismatches", "Number of gap opens", "1-based position of start in query", "1-based position of end in query", "1-based position of start in target", "1-based position of end in target", "E-value", "Bit score", "nucl length", "aa length", "nucl sequence", "prot sequence") . "\n";
while (<INPUTX>) {
chomp;
if (/^\S+\t\S+\t(\d+)/) {
my $identity = $1;
if ($identity > 95.00) {
print OUTX "$_\n";
}
}
}
close INPUTX;
close OUTX;
#----------------------Print Old Genes Table--------------------------------------------------------------------------------
open AGT,"All_genes_real_gt95.txt" || die "Open AGT Failed !\n";
open AOLD,">>Old_genes.table" || die "Open AOLD Failed !\n";
print AOLD join ("\t", "Protein_ID", "Protein_Length", "Rank", "Best_hit", "Hit_Length", "Coverage", "Identity", "Integrity", "Protein_Sequence", "Nucleotide_Sequence") . "\n";
#print AOLD join ("\t", "Protein_ID", "Protein_Length", "Rank", "Best_hit", "Hit_Length", "Coverage", "Identity", "Protein_Sequence", "Nucleotide_Sequence") . "\n";
<AGT>;
while (<AGT>) {
chomp;
my @line4 = split(/\t/);
$line4[1]=~/(\S+)_len(\d+)/;
my $besthit= $1;
my $hitlen1 = $2;
my $integrity = ($line4[3]/$hitlen1)*100;
my $sintegrity = sprintf "%.2f",$integrity;
my $covera1 = ($line4[3]/$line4[13])*100;
my $scovera1 = sprintf "%.2f",$covera1;
if ($line4[2] >= 95) {
print AOLD join ("\t", $line4[0], $line4[13], "Rank4", $besthit, $hitlen1, $scovera1, $line4[2], $sintegrity, $line4[15], $line4[14]) . "\n";
# print AOLD join ("\t", $line4[0], $line4[14], "Rank4", $line4[1], $hitlen1, $scovera1, $line4[2], $line4[16], $line4[15]) . "\n";
}
}
#-------------------If there are new genes----------------------------------------------------------------------------------
my @All_genes_real_lt95 = <All_genes_real_lt95.txt>;
foreach (@All_genes_real_lt95) {
if (-f $_ and -z _){
print "No new genes found !!!\n\n";
exit;
}
}
#===========================================================================================================================
#28All_genes_unique_v3.pl
#Function : delete the rows whose has the same aa sequences from "All_genes_real.txt" .IN the file "All_genes_unique.txt",there may have the genes with only a litte of difference which could be think the same .
my %hash;
open IN9,"All_genes_real_lt95.txt" or die "Open IN9 Failed!\n";
open OUT9,">>All_genes_real_lt95_unique.txt" or die "Open OUT9 Failed!\n";
print OUT9 join("\t", "Gene Ids", "Target", "Percent identity", "Alignment length", "Number of mismatches", "Number of gap opens", "1-based position of start in query", "1-based position of end in query", "1-based position of start in target", "1-based position of end in target", "E-value", "Bit score", "nucl length", "aa length", "nucl sequence", "prot sequence") . "\n";
while (<IN9>) {
chomp;
my @lines = split(/\t/);
my $keys = pop @lines;
next if(exists $hash{$keys});
$hash{$keys} = "$_";
print OUT9 $_ . "\n";
}
close IN9;
close OUT9;
#===========================================================================================================================
#Function extract ids and sequences of genes,get names of the strains who have new genes.
#-----------------extract ids and sequences of genes from "All_genes_unique.txt" to "genes_nucl.fa"-------------------------
open IN10,"All_genes_real_lt95_unique.txt" or die "Open IN10 Failed!\n";
open OUT10,">>genes_nucl.fa" or die "Open OUT10 Failed!\n";
<IN10>;
while (<IN10>) {
chomp;
print OUT10 ">" . join "\t",(split/\t/)[0]."\n".(split/\t/)[14],"\n",;
}
close IN10;
close OUT10;
#------------get names of the strains who have new genes to file "name_genes_scaf.txt"--------------------------------------
open IN11,"genes_nucl.fa" or die "Open IN11 File Failed!\n";
open OUT11,">>genes_scaf.txt" or die "Open OUT11 Failed!\n";
while (<IN11>) {
chomp;
if (/\>(\S+)_\d+/) {##################################
print OUT11 $1 . "\n";
}
}
close IN11;
close OUT11;
my %hashf;
open IN12,"genes_scaf.txt" or die "Open IN12 Failed!\n";
open OUT12,">>name_genes_scaf.txt" or die "Open OUT12 Failed!\n";
while (<IN12>) {
chomp;
my @lines = split;
my $keys = shift @lines;
next if(exists $hashf{$keys});
$hashf{$keys} = "$_";
print OUT12 $keys. "\n";
}
close IN12;
close OUT12;
system("rm -rf genes_scaf.txt");
#==========================================================================================================================
# Function: Given name_genes_scaf.txt, move the -8.fa to another place.
if (($type eq "reads") or ($type eq "assembled")) {
my $count = 0; # Number of IDs
my $hits = 0; # Number of matched IDs
local $| = 1; # Refreshing every time to improve the speed of display progress.
my %scafs;
#-----------------------------------------------[reads ids]----------------------------------------------------------------
open SCA,"name_genes_scaf.txt" || die "Open SCA Failed!\n";
while(<SCA>) {
my @lines = split(/\s+/);
my $key=shift @lines;
$key =~/(\S+)_\d+/;
next if (exists $scafs{$1});
$scafs{$1} = 1;
}
close SCA;
# -----------------------------------------show number of ids--------------------------------------------------------------
my @ids = keys %scafs;#Reading IDs from a file and extract the sequences from another file.
my $num = @ids;
print "\nRead $num ids.\n\n";
#--------------------------------------------[ searching ]-----------------------------------------------------------------
chdir "../scaf";
my @gbk = glob("*-8.fa");
foreach (@gbk) {
$_=~/(\S+)-8.fa/;#####################2-YBT1518-sp-2-11_L2_I350-8.fa
my $name = $1;
if (exists $scafs{$name}) {
system("cp $_ ../cry ");
}
}
chdir "../cry";
#==========================================================================================================================
# 211co_scaf.pl
# Function: place scaffolds in one file(All_scaf.txt).
my @fa = glob("*-8.fa");
foreach (@fa) {
$_=~/(\S+)-8.fa/;
my $name = $1;
open IN13,"$_" or die "Open IN13 Failed!\n";
open OUT13,">>All_scaf.txt" or die "Open OUT13 Failed!\n";
# local $/ = ">";
# <IN13>;
while (<IN13>) {
chomp;
if(/\>(\S+)/){
print OUT13 ">$name"."-scaf"."$1" . "\n";
}else{
print OUT13 "$_\n";
}
}
}
close IN13;
close OUT13;
#==========================================================================================================================
#212gene_blastn_scaf_v2.pl
#blastn to find the scaffolds where the gene is located.
system("makeblastdb -in All_scaf.txt -out All_scaf.db -dbtype nucl");
system("blastn -query genes_nucl.fa -out genes_nucl.blastn -db All_scaf.db -evalue 0.001 -outfmt 6 -perc_identity 100.00 -max_target_seqs 1 -num_threads $cpus");
#-------------------extract the names of the hit scaffolds(hit_scaf_name.txt)---------------------------------------------
open IN14,"genes_nucl.blastn" or die "Open IN14 Failed!\n";
open OUT14,">>hit_scaf_name.txt" or die "Open OUT14 Failed!\n";
while (<IN14>) {
chomp;
print OUT14 join "\t",(split/\t/)[1]."\t".(split/\t/)[0]."\t".(split/\t/)[8]."\t".(split/\t/)[9]."\n";
}
close IN14;
close OUT14;
#----------------------------extract sequences of scaffolds to file "gene_scaf_seqs.txt"----------------------------------
my $count2 = 0;
my $hits2 = 0;
local $| = 1;
my %ids_hash;
#---------------------------------------------[read ids]------------------------------------------------------------------
open ID,"hit_scaf_name.txt" or die "Can't open ID file!\n";
while(<ID>) {
my @lines = split(/\s+/);
my $key=shift @lines;
next if (exists $ids_hash{$key});
$ids_hash{$key} = $_;
}
# ------------------------------------------show number of ids-------------------------------------------------------------
my @ids1 = keys %ids_hash;
my $num1 = @ids1;
print "\nRead $num1 ids.\n\n";
#----------------------------------------------[ searching ]----------------------------------------------------------------
open IN15, "All_scaf.txt" or die "Failed to open IN15 file!\n";
open OUT15, ">>gene_scaf_seqs.txt" or die "Failed to open OUT15 file!\n";
local $/ = ">";
<IN15>; # ignore the first >.
my ( $head, $seq );
while (<IN15>) {
$count2++;
s/\r?\n>//;
( $head, $seq ) = split "\n", $_, 2;
$seq =~ s/\s+//g;
next unless $head =~ /(.+-scaf\d+)/;
#next unless $head =~/(\d*[a-zA-Z]+\d+\S+)/;
if (exists $ids_hash{$1}){
print OUT15 "$head\t$seq\n";
}
$ids_hash{$1}++;
$hits2++;
}
print "\rProcessing ${count2} th record. hits: $hits2";
# -----------------------------Show the numbers of IDs which had no match to any of the record------------------------------
my @ids2 = grep {$ids_hash{$_} == 0} keys %ids_hash;
my $num2 = @ids2;
print "\n\n$num2 ids did not match any record in all_seqs.fa\n";
print "@ids2\n";
$/ = "\n";
close IN15;
close OUT15;
#===========================================================================================================================
#213gene_scaf_seqs_v2.pl
# Function:genes' ids with scaffolds in a line(gene_scaf_seqs3_2.txt).
open INPUT5,"gene_scaf_seqs.txt" or die "Open INPUT5 Failed!\n";
open OUTPUT5,">>gene_scaf_seqs2.txt" or die "Open OUTPUT5 Failed!\n";
while (<INPUT5>) {
chomp;
if (/^(\S+)\t([A-Z]+)/) {
print OUTPUT5 "$1\t$2\n";
}
}
close INPUT5;
close OUTPUT5;
my %hashs;
open IN16,"gene_scaf_seqs2.txt" or die "Open IN16 Failed!\n";
while (<IN16>){
chomp;
my ($id,$seqs)=split(/\t/);
$hashs{$id}= "$_";
}
open INPUT6,"hit_scaf_name.txt" or die "Open INPUT6 Failed!\n";
open OUT16, ">>gene_scaf_seqs3.txt" or die "Open OUT16 Failed!\n";
print OUT16 join("\t", "Gene Ids", "1-based position of start in scaffold", "1-based position of end in scaffold", "Scaffolds Ids", "Scaffolds Sequences") . "\n";
while (<INPUT6>){
chomp;
my ($ids,$idg,$scafl,$scafr)=split(/\t/);
if (exists $hashs{$ids}) {
print OUT16 "$idg\t$scafl\t$scafr\t$hashs{$ids}\n";
}
}
close IN16;
close INPUT6;
close OUT16;
}
#======================================Print New Genes Table=====================================================================================
open IN17,"All_genes_real_lt95_unique.txt" || die "Open IN17 Failed !\n";
open OUT17,">>New_genes_unique.table" || die "Open OUT17 Failed !\n";
print OUT17 join ("\t", "Protein_ID", "Protein_Length", "Rank", "Best_hit", "Hit_Length", "Coverage", "Identity", "Integrity", "Protein_Sequence", "Nucleotide_Sequence") . "\n";
#print OUT17 join ("\t", "Protein_ID", "Protein_Length", "Rank", "Best_hit", "Hit_Length", "Coverage", "Identity", "Protein_Sequence", "Nucleotide_Sequence") . "\n";
<IN17>;
while (<IN17>) {
chomp;
my @line3 = split(/\t/);
$line3[1]=~/(\S+)_len(\d+)/;
my $besthit2 = $1;
my $hitlen = $2;
my $covera = ($line3[3]/$line3[13])*100;
my $scovera = sprintf "%.2f",$covera;
my $Integrity = ($line3[3]/$hitlen)*100;
my $sIntegrity = sprintf "%.2f",$Integrity;
if ($line3[2] < 45) {
print OUT17 join ("\t", $line3[0], $line3[13], "Rank1", $besthit2, $hitlen, $scovera, $line3[2], $sIntegrity, $line3[15], $line3[14]) . "\n";
# print OUT17 join ("\t", $line3[0], $line3[14], "Rank1", $line3[1], $hitlen, $scovera, $line3[2], $line3[16], $line3[15]) . "\n";
}elsif ($line3[2] >= 45 and $line3[2] < 78) {
print OUT17 join ("\t", $line3[0], $line3[13], "Rank2", $besthit2, $hitlen, $scovera, $line3[2], $sIntegrity, $line3[15], $line3[14]) . "\n";
# print OUT17 join ("\t", $line3[0], $line3[14], "Rank2", $line3[1], $hitlen, $scovera, $line3[2], $line3[16], $line3[15]) . "\n";
}else{
print OUT17 join ("\t", $line3[0], $line3[13], "Rank3", $besthit2, $hitlen, $scovera, $line3[2], $sIntegrity, $line3[15], $line3[14]) . "\n";
# print OUT17 join ("\t", $line3[0], $line3[14], "Rank3", $line3[1], $hitlen, $scovera, $line3[2], $line3[16], $line3[15]) . "\n";
}
}
system("rm -rf gene_scaf_seqs2.txt gene_scaf_seqs.txt All_scaf.db.nhr All_scaf.db.nin All_scaf.db.nsq All_scaf.txt genes_nucl.blastn hit_scaf_name.txt");
system("mkdir new_genes old_genes other_files scaffolds");
system("mv New_genes_unique.table All_genes_real_lt95_unique.txt new_genes/");
system("mv All_genes.txt All_genes_real_lt95.txt name_genes_scaf.txt genes_nucl.fa All_genes_real.txt other_files/");
system("mv Old_genes.table All_genes_real_gt95.txt old_genes/");
if(($type eq "reads") or ($type eq "assembled")){
system("mv gene_scaf_seqs3.txt new_genes/");
system("mv *-8.fa scaffolds/");
}
print "Congratulations, programm completing...\n\n";
sub showhelp{
print STDERR<<EOF;
BtToxin_scanner2.pl (v1.0)
Usage and example:
for reads: BtToxin_scanner2.pl -s reads -1 .R1.clean.fastq.gz -2 .R2.clean.fastq.gz
for contigs/scaffolds: BtToxin_scanner2.pl -s assembled -3 "-8.fa"
for CDs: BtToxin_scanner2.pl -s cds -p CDs -4 ".faa" -5 ".ffn"
Essential:
-1 The suffix name of reads 1 (for example: if the name of reads 1 is "YBT-1520_L1_I050.R1.clean.fastq.gz",
"YBT-1520" is the strain same, so the suffix name should be ".R1.clean.fastq.gz")
-2 The suffix name of reads 2(for example: if the name of reads 2 is "YBT-1520_2.fq", the suffix name should be _2.fq")
-3 The suffix name of contigs or scaffolds(default -8.fa)
-s Input type for analysis(3 choices: reads, assembled, cds. default is reads)
-p The path of CDs contain both amino acid sequences and nucleotide sequences(default CDs)
-4 The suffix name of files contain amino acid sequences(default .faa)
-5 The suffix name of files contain nucleotide sequences(default .ffn)
Options:
-h Show help
-k k-mer size for genome assembly (default 81)
-n number of cpus to use (default 6)
-e evalue for blast against the Bt toxin database (default 1e-5)
-c the minimum size of blast coverage(default 50)
-g Specify a translation table to use for prodigal (default 11)
-d Database for blastp (default Bt_toxin)
EOF
}
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化