master

分支 (1)

管理

管理

master

Bt_toxin_scanner2
/
BtToxin_scanner2.pl

#!/usr/bin/perl -w
use strict;
use Bio::SeqIO;
use Getopt::Long;
use Pod::Usage;
use Getopt::Std;


if ($#ARGV < 1 ) {
	&showhelp();
	exit;
}else{
	foreach my $i (0..$#ARGV) {
       if ($ARGV[$i] eq "-h") {
           &showhelp();
		   exit;
       }
    }
}

my $kmmer ='81';
my $cpus = '6';
my $tr_table ='11';
my $evalue = '1e-5';
my $reads1 = '.R1.clean.fastq.gz';
my $reads2 = '.R2.clean.fastq.gz';
my $postfix = '-8.fa';
my $coverage = '50';
my $database = 'Bt_toxin';
my $type = 'reads';
my $dir = 'CDs';
my $cdsa = '.faa';
my $cdsn ='.ffn';


use vars qw($opt_k $opt_n $opt_d $opt_e $opt_1 $opt_2 $opt_c $opt_g $opt_3 $opt_s $opt_4 $opt_5 $opt_p);
getopts('k:n:g:e:1:2:3:c:d:s:p:4:5:');
$kmmer =$opt_k if $opt_k;
$cpus =$opt_n if $opt_n;
$tr_table = $opt_g if $opt_g;
$evalue = $opt_e if $opt_e;
$reads1 = $opt_1 if $opt_1;
$reads2 = $opt_2 if $opt_2;
$postfix = $opt_3 if $opt_3;
$coverage = $opt_c if $opt_c;
$database = $opt_d if $opt_d;
$type = $opt_s if $opt_s;
$dir = $opt_p if $opt_p;
$cdsa = $opt_4 if $opt_4;
$cdsn = $opt_5 if $opt_5;


if ($type eq "reads") {
   system("mkdir scaf");
   my @files = glob("*$reads1");
   my %lists;
   foreach (@files) {
       if (/(\S+)$reads1/) {
           $lists{$1} = "1";
       }
   }

   my @lists = keys %lists;

   foreach my $name(@lists) {
       my $read1 = $name . $reads1;
       my $read2 = $name . $reads2;
       my $str = $name;
	   print "Assembling...\n";
       system("abyss-pe name=$name k=$kmmer in='$read1 $read2' np=$cpus");
	   print "Assemble complete !\n";
       my $assem = $name . "_assembly";
       system("mkdir $assem");
       my $scaf = $name . "-8.fa";
       my $faa = $str . ".faa";
       my $fna = $str . ".ffn";
       my $pout = $str . ".prodigal";
	   my $outdir = $str . "_cds";
        system("mkdir $outdir");
	   print "Running ORFs finding ...\n";
       system("prodigal -o $pout -i $scaf -a $faa -d $fna -g $tr_table");
	   print "Genes prediction complete !\n";
        system("mv $faa $fna $pout $outdir/");
       my $blast = $str . ".blast";
	   print "Running blastp against the Bt_toxin database...\n";
       system("blastp -query $outdir/$faa -db $database -outfmt 6 -out $blast -evalue $evalue -max_target_seqs 1 -num_threads $cpus");
       my %ids;
       open BLAST, "$blast" or die;
       while (<BLAST>) {
           chomp;
           my @tmps = split /\s+/, $_;
           $ids{$tmps[0]} = "1";
       }
       my $in = Bio::SeqIO->new(-file=>"$outdir/$faa", -format=>"fasta");
       my $inn = Bio::SeqIO->new(-file=>"$outdir/$fna", -format=>"fasta");
       my $cry = $str . "_cry.faa";
       my $cryn = $str . "_cry.fna";
       my $out = Bio::SeqIO->new(-file=>">>$cry", -format=>"fasta");
       my $outn = Bio::SeqIO->new(-file=>">>$cryn", -format=>"fasta");
       while (my $seq = $in->next_seq) {
           my $id = $seq->id;
           if (exists $ids{$id}) {
               $out->write_seq($seq);
           }
       }
       while (my $seq = $inn->next_seq) {
           my $id = $seq->id;
           if (exists $ids{$id}) {
               $outn->write_seq($seq);
           }
       }
       system("mkdir cry CDs assembly reads");
       system("cp $scaf scaf/");
       system("mv *_cry.faa *_cry.fna *.blast cry/");
       system("mv *_cds CDs/");
       system("mv $read1 $read2 reads/");
       system("mv $name* $assem/");
   }

   system("mv *_assembly assembly/");

}elsif($type eq "assembled"){
    system("mkdir scaf");
    my @assem = glob("*$postfix");
    foreach  my $scaf (@assem) {
	   $scaf=~/(\S+)$postfix/;
	   my $name = $1;
       my $str = $name;
       my $faa = $str . ".faa";
       my $fna = $str . ".ffn";
       my $pout = $str . ".prodigal";
	   my $outdir = $str . "_cds";
          system("mkdir $outdir");
	   print "Running ORFs finding and annotating...\n";
       system("prodigal -o $pout -i $scaf -a $faa -d $fna -g $tr_table");
	   print "Genes prediction complete !\n";
       system("mv $faa $fna $pout $outdir/");
       my $blast = $str . ".blast";
	   print "Running blastp against the Bt_toxin database...\n";
       system("blastp -query $outdir/$faa -db $database -outfmt 6 -out $blast -evalue $evalue -max_target_seqs 1 -num_threads $cpus");
       my %ids;
       open BLAST, "$blast" or die;
       while (<BLAST>) {
           chomp;
           my @tmps = split /\s+/, $_;
           $ids{$tmps[0]} = "1";
       }
       my $in = Bio::SeqIO->new(-file=>"$outdir/$faa", -format=>"fasta");
       my $inn = Bio::SeqIO->new(-file=>"$outdir/$fna", -format=>"fasta");
       my $cry = $str . "_cry.faa";
       my $cryn = $str . "_cry.fna";
       my $out = Bio::SeqIO->new(-file=>">>$cry", -format=>"fasta");
       my $outn = Bio::SeqIO->new(-file=>">>$cryn", -format=>"fasta");
       while (my $seq = $in->next_seq) {
           my $id = $seq->id;
           if (exists $ids{$id}) {
               $out->write_seq($seq);
           }
       }
       while (my $seq = $inn->next_seq) {
           my $id = $seq->id;
           if (exists $ids{$id}) {
               $outn->write_seq($seq);
           }
       }
       system("mkdir cry CDs");
       system("mv $scaf scaf/");
       system("mv *_cry.faa *_cry.fna *.blast cry/");
       system("mv *_cds CDs/");
     }
}elsif($type eq "cds"){
     system("mkdir cry");
    chdir "$dir";
    my @aa = glob("*$cdsa");
     chdir "../";
    foreach (@aa) {
	   $_ =~/(\S+)$cdsa/;
	   my $name = $1;
       my $str = $name;
       my $faa = $str . $cdsa;
       my $fna = $str . $cdsn;
       my $blast = $str . ".blast";
	   print "Running blastp against the Bt_toxin database...\n";
       system("blastp -query $dir/$faa -db $database -outfmt 6 -out $blast -evalue $evalue -max_target_seqs 1 -num_threads $cpus");
       my %ids;
       open BLAST, "$blast" or die;
       while (<BLAST>) {
           chomp;
           my @tmps = split /\s+/, $_;
           $ids{$tmps[0]} = "1";
       }
       my $in = Bio::SeqIO->new(-file=>"$dir/$faa", -format=>"fasta");
       my $inn = Bio::SeqIO->new(-file=>"$dir/$fna", -format=>"fasta");
       my $cry = $str . "_cry.faa";
       my $cryn = $str . "_cry.fna";
       my $out = Bio::SeqIO->new(-file=>">>$cry", -format=>"fasta");
       my $outn = Bio::SeqIO->new(-file=>">>$cryn", -format=>"fasta");
       while (my $seq = $in->next_seq) {
           my $id = $seq->id;
           if (exists $ids{$id}) {
               $out->write_seq($seq);
           }
       }
       while (my $seq = $inn->next_seq) {
           my $id = $seq->id;
           if (exists $ids{$id}) {
               $outn->write_seq($seq);
           }
       }

       system("mv *_cry.faa *_cry.fna *.blast cry/");
     }
}
#=====================================================================================================================================
#Function find new genes whose identities lt 95% and the length of aa gt 115aa, delete the genes who have the same aa sequence.
#=====================================================================================================================================
#21_fasta_to_one_line_v3.pl
#Function : make fasta sequences to a line(for fna and faa files).
#--------------------------------------------------------change faa to a line---------------------------------------------------------

print "Changing faa files to a line...\n\n";
chdir "cry";
my @faa = glob("*.faa");
foreach my $faa (@faa) {
	my $name = substr($faa,0,(length($faa)-8));
    my $out1 = $name . "_faa.out1";
	open FAAI,"$faa" or die "Open FAAI Failed!\n";
    open FAAO,">>$out1" or die "Open FAAO Failed!\n";
	while (<FAAI>) {
		chomp;
		if (/>(\S+)/) {
			print FAAO "\n" . ">" . $name . "_$1\t";
		}else{
		s/\n/\t/g;
		print FAAO "$_";
		}
	}

}
close FAAI;
close FAAO;

my @out01 = glob("*.out1");
foreach my $out1 (@out01) {
	my $name = substr($out1,0,(length($out1)-9));
	my $out2 = $name . "_faa.out";
    open READA,"$out1" or die "Open READA Failed!\n";
	open OUTRA,">>$out2" or die "Open OUTRA Failed!\n";
    readline READA;    # skip the first line
    while (<READA>){
		 chomp;
		 s/\*//g;#delete the "*" in the end
         print OUTRA "$_\n";
	}
}
close READA;
close OUTRA;
system("rm -rf *.out1");

#--------------------------------------------------change fna to a line------------------------------------------------------------

print "Changing fna files to a line...\n\n";
my @fna0 = glob("*.fna");
foreach my $fna (@fna0) {
	my $name = substr($fna,0,(length($fna)-8));
    my $out2 = $name . "_fna.out2";
	open FNAI,"$fna" or die "Open FNAI Failed!\n";
    open FNAO,">>$out2" or die "Open FNAO Failed!\n";
	while (<FNAI>) {
		chomp;
		if (/>(\S+)/) {
			print FNAO "\n" . ">" . $name . "_$1\t";
		}else{
		  s/\n/\t/g;
		  print FNAO "$_";
		}
	}

}
close FNAI;
close FNAO;

my @out2 = glob("*.out2");
foreach my $out2 (@out2) {
	my $name = substr($out2,0,(length($out2)-9));
	my $out22 = $name . "_fna.out";
    open READN,"$out2" or die "Open READN Failed!\n";
	open OUTRN,">>$out22" or die "Open OUTRN Failed!\n";
    readline READN;    # skip the first line
     while (<READN>){
		 chomp;
         print OUTRN "$_\n";
	 }
}
close READN;
close OUTRN;
system("rm -rf *.out2");

#=================================================================================================================================
#22_length_fasta.pl
# Function: add sequence length to the end of the sequence.

print "Andding sequence length...\n\n";
my $chars =0 ;
my @out = glob("*.out");
foreach my $out (@out) {
	my $name = substr($out,0,(length($out)-4));
	my $length1 = $name . ".length1";
    open IN1, "$out" or die "Open IN1 Failed!\n";
    open OUT1, ">>$length1" or die "Open OUT1 Failed!\n";
	while (<IN1>) {
		chomp;
        my @lines = split(/\t/);
		foreach  (@lines) {
			while (/([A-Z]+)/g) {
				$chars += length($1);
			}
			print OUT1 $_ . "\t" . "$chars" . "\n";
			$chars = 0;
		}
	}
}
close IN1;
close OUT1;


#--------------------------------------------------------Change to one line------------------------------------------------------

my @length1 = glob("*.length1");
foreach my $length1 (@length1) {
	my $name = substr($length1,0,(length($length1)-1));
    my $out1 = $name . "2";
	open IN2,"$length1" or die "Open IN2 Failed!\n";
    open OUT2,">>$out1" or die "Open OUT2 Failed!\n";
	while (<IN2>) {
		chomp;
		if (/>/) {
			print OUT2 "\n" . "$_\t";
		}else{
		  s/\n/\t/g;
		  print OUT2 "$_";
		}
	}

}
close IN2;
close OUT2;

my @out1 = glob("*.length2");
foreach my $out1 (@out1) {
	my $name = substr($out1,0,(length($out1)-1));
	my $out2 = $name . "3";
    open READL,"$out1" or die "Open READL Failed!\n";
	open OUTRL,">>$out2" or die "Open OUTRL Failed!\n";
    readline READL;    # skip the first line
    while (<READL>){
		 chomp;
         print OUTRL "$_\n";
	 }
}
close READL;
close OUTRL;
system("rm -rf *.length1 *.length2");


#-----------------Print out the needed columsminus the length of the Capitals in the IDs--------------------------------------

my @length3 = glob("*.length3");
foreach my $length3 (@length3) {
	my $name = substr($length3,0,(length($length3)-8));
	my $out3 = $name . ".length";
    open INPUTL,"$length3" or die "Open INPUTL Failed!\n";
	open OUTPUTL,">>$out3" or die "Open OUTPUTL Failed!\n";
	while (<INPUTL>) {
		chomp;
		print OUTPUTL join "\t",(split(/\t/))[0,2,3],"\n";
	}

}
close INPUTL;
close OUTPUTL;
system("rm -rf *.length3 *.out");

#================================================================================================================================
#23_co_faa_fna_length_v2.pl
# Function: make the fna and faa files to one file.

print "Make the fna and faa files to one file...\n\n";

my @fna1 = glob("*.fna");
foreach my $fna1 (@fna1) {
	my $name = substr($fna1,0,(length($fna1)-8));
	my $fna = $name . "_fna.length";
	my $faa = $name . "_faa.length";
	my $out = $name . "_cry.sequence";
    my %hash;
    open IN3,"$fna" or die "Open IN3 Failed!\n";
    while (<IN3>){
       chomp;
       my ($id,$nucl,$lennl)=split(/\t/);
       $hash{$id}=$nucl . "\t" . ($lennl-3);
    }
       close IN3;
       open INPUT1,"$faa" or die "Open INPUT1 Failed!\n";
       open OUT3, ">>$out" or die "Open OUT3 Failed!\n";

     while (<INPUT1>){
        chomp;
        my ($id,$aa,$lenaa)=split(/\t/);
        if (exists $hash{$id}) {
           print OUT3 $id . "\t" . $hash{$id} . "\t" . $aa . "\t" .$lenaa . "\n";
        }
     }
}
close INPUT1;
close OUT3;
system("rm -rf *.length");

#================================================================================================================================
#24_id_pure.pl
# Function: divide the ids and the hits.

my @sequence = glob("*.sequence");
foreach  (@sequence) {
	my $name = substr($_,0,(length($_)-13));
	my $out = $name . ".id_seqs";
    open IN4,"$_" or die "Open IN4 Faild!\n";
	open OUT4,">>$out" or die "Open OUT4 Failed!\n";
	while (<IN4>) {
		chomp;
		if (/>(\S+)\s+([\d+\D+]+?)\t([\d+\D+]+)/) {
			print OUT4 $1 . "\t" . $2 . "\t" . $3 . "\n";

		}
	}
}
close IN4;
close OUT4;
system("rm -rf *.sequence");

#===============================================================================================================================
#25_add_sequence_to_blast_v2.pl
# Function: combine the blast file with the "id_seqs" file to one line.

print "Combing the blast file with the 'id_seqs' file to one line...\n\n";
my @fna = glob("*.fna");
foreach my $fna (@fna) {
	my $name = substr($fna,0,(length($fna)-8));
	my $blast = $name . ".blast";
	my $sequence = $name . ".id_seqs";
	my $out = $name . "_cry.final";
    my %hash;
    open IN5,"$blast" or die "Open IN5 Failed!\n";
    while (<IN5>){
       chomp;
       my ($id,$a,$b,$c,$d,$e,$f,$g,$h,$i,$j,$k)=split(/\t/);
       my $idg = $name. "_$id";
       $hash{$idg}=$a."\t".$b."\t".$c."\t".$d."\t".$e."\t".$f."\t".$g."\t".$h."\t".$i."\t".$j."\t".$k;
    }
       close IN5;
       open INPUT2,"$sequence" or die "Open INPUT2 Failed!\n";
       open OUT5, ">>$out" or die "Open OUT5 Failed!\n";

     while (<INPUT2>){
        chomp;
        my ($id,$l,$m,$n,$o)=split(/\t/);
        if (exists $hash{$id}) {
           print OUT5 $id . "\t" . $hash{$id} . "\t" . $m."\t".$o."\t".$l."\t".$n."\n";
        }
     }
}
close INPUT2;
close OUT5;
system("rm -rf *.id_seqs");

#================================================================================================================================
#26_co_all_genes.pl
# Function: combine all genes to one line(All_genes.txt).

my @final = glob("*.final");
foreach  (@final) {
	open IN6,"$_" or die "Open IN6 Failed!\n";
	open OUT6,">>All_genes.txt" or die "Open OUT6 Failed!\n";
	print OUT6 join("\t", "Gene Ids", "Target", "Percent identity", "Alignment length", "Number of mismatches", "Number of gap opens", "1-based position of start in query", "1-based position of end in query", "1-based position of start in target", "1-based position of end in target", "E-value", "Bit score", "nucl length", "aa length", "nucl sequence", "prot sequence") . "\n";
	while (<IN6>) {
		chomp;
		my @lines = split(/\t/);
		my $cover = ($lines[3]/$lines[13])*100;
		if ($cover >= $coverage) {
           print OUT6 "$_\n";
		}
	}
}
close IN6;
close OUT6;
system ("mkdir faa fna blast");
system ("mv *.faa faa");
system ("mv *.fna fna");
system ("mv *.blast blast");
system ("rm -rf *.final");

#===============================================================================================================================
#27All_genes_lt95.pl
#-------------find new genes whose identities lt 95% and the length of aa gt 115aa----------------------------------------------

open IN7, "All_genes.txt" or die "Open IN7 Failed!\n";
open OUTPUT3,">>All_genes_real.txt" or die "Open OUTPUT3 Failed!\n";

<IN7>;
while (<IN7>) {
	chomp;
	if (/\d+\t(\d+)\t[A-Z]+/) {
		my $length_aa = $1;
		if ($length_aa > 115) {
			print OUTPUT3 "$_\n";
		}
	}
}
close IN7;
close OUTPUT3;


open INPUT4, "All_genes_real.txt" or die "Open INPUT4 Failed!\n";
open OUT8, ">>All_genes_real_lt95.txt" or die "Open OUT8 Failed!\n";

while (<INPUT4>) {
	chomp;
	if (/^\S+\t\S+\t(\d+)/) {
		my $identity = $1;
		if ($identity < 95.00) {
			print OUT8 "$_\n";
		}
	}
}

close INPUT4;
close OUT8;

open INPUTX, "All_genes_real.txt" or die "Open INPUTX Failed!\n";
open OUTX, ">>All_genes_real_gt95.txt" or die "Open OUTX Failed!\n";
print OUTX join("\t", "Gene Ids", "Target", "Percent identity", "Alignment length", "Number of mismatches", "Number of gap opens", "1-based position of start in query", "1-based position of end in query", "1-based position of start in target", "1-based position of end in target", "E-value", "Bit score", "nucl length", "aa length", "nucl sequence", "prot sequence") . "\n";
while (<INPUTX>) {
	chomp;
	if (/^\S+\t\S+\t(\d+)/) {
		my $identity = $1;
		if ($identity > 95.00) {
			print OUTX "$_\n";
		}
	}
}

close INPUTX;
close OUTX;

#----------------------Print Old Genes Table--------------------------------------------------------------------------------

open AGT,"All_genes_real_gt95.txt" || die "Open AGT Failed !\n";
open AOLD,">>Old_genes.table" || die "Open AOLD Failed !\n";
print AOLD join ("\t", "Protein_ID", "Protein_Length", "Rank", "Best_hit", "Hit_Length", "Coverage", "Identity", "Integrity", "Protein_Sequence", "Nucleotide_Sequence") . "\n";
#print AOLD join ("\t", "Protein_ID", "Protein_Length", "Rank", "Best_hit", "Hit_Length", "Coverage", "Identity", "Protein_Sequence", "Nucleotide_Sequence") . "\n";
<AGT>;
while (<AGT>) {
	chomp;
	my @line4 = split(/\t/);
	$line4[1]=~/(\S+)_len(\d+)/;
    my $besthit= $1;
	my $hitlen1 = $2;
	my $integrity = ($line4[3]/$hitlen1)*100;
     my $sintegrity = sprintf "%.2f",$integrity;
	my $covera1 = ($line4[3]/$line4[13])*100;
	my $scovera1 = sprintf "%.2f",$covera1;
	if ($line4[2] >= 95) {
		print AOLD join ("\t", $line4[0], $line4[13], "Rank4", $besthit, $hitlen1, $scovera1, $line4[2], $sintegrity, $line4[15], $line4[14]) . "\n";
#       print AOLD join ("\t", $line4[0], $line4[14], "Rank4", $line4[1], $hitlen1, $scovera1, $line4[2], $line4[16], $line4[15]) . "\n";
	}
}

#-------------------If there are new genes----------------------------------------------------------------------------------

my @All_genes_real_lt95 = <All_genes_real_lt95.txt>;
foreach (@All_genes_real_lt95) {
     if (-f $_ and -z _){
        print "No new genes found !!!\n\n";
        exit;
       }
}

#===========================================================================================================================
#28All_genes_unique_v3.pl
#Function : delete the rows whose has the same aa sequences from "All_genes_real.txt" .IN the file "All_genes_unique.txt",there may have the genes with only a litte of difference which could be think the same .


my %hash;
open IN9,"All_genes_real_lt95.txt" or die "Open IN9 Failed!\n";
open OUT9,">>All_genes_real_lt95_unique.txt" or die "Open OUT9 Failed!\n";
print OUT9 join("\t", "Gene Ids", "Target", "Percent identity", "Alignment length", "Number of mismatches", "Number of gap opens", "1-based position of start in query", "1-based position of end in query", "1-based position of start in target", "1-based position of end in target", "E-value", "Bit score", "nucl length", "aa length", "nucl sequence", "prot sequence") . "\n";
while (<IN9>) {
	chomp;
	my @lines = split(/\t/);
	my $keys = pop @lines;
	next if(exists $hash{$keys});
	$hash{$keys} = "$_";
    print OUT9  $_ . "\n";
}
close IN9;
close OUT9;

#===========================================================================================================================
#Function extract ids and sequences of genes,get names of the strains who have new genes.
#-----------------extract ids and sequences of genes from "All_genes_unique.txt" to "genes_nucl.fa"-------------------------

open IN10,"All_genes_real_lt95_unique.txt" or die "Open IN10 Failed!\n";
open OUT10,">>genes_nucl.fa" or die "Open OUT10 Failed!\n";
<IN10>;
while (<IN10>) {
	chomp;
	print OUT10 ">" . join "\t",(split/\t/)[0]."\n".(split/\t/)[14],"\n",;
}
close IN10;
close OUT10;

#------------get names of the strains who have new genes to file "name_genes_scaf.txt"--------------------------------------

open IN11,"genes_nucl.fa" or die "Open IN11 File Failed!\n";
open OUT11,">>genes_scaf.txt" or die "Open OUT11 Failed!\n";

while (<IN11>) {
	chomp;
	if (/\>(\S+)_\d+/) {##################################
		print OUT11 $1 . "\n";
	}
}
close IN11;
close OUT11;

my %hashf;
open IN12,"genes_scaf.txt" or die "Open IN12 Failed!\n";
open OUT12,">>name_genes_scaf.txt" or die "Open OUT12 Failed!\n";
while (<IN12>) {
	chomp;
	my @lines = split;
	my $keys = shift @lines;
	next if(exists $hashf{$keys});
	$hashf{$keys} = "$_";
	print OUT12 $keys. "\n";
}
close IN12;
close OUT12;
system("rm -rf genes_scaf.txt");


#==========================================================================================================================
# Function: Given name_genes_scaf.txt, move the -8.fa to another place.

if (($type eq "reads") or ($type eq "assembled")) {
    my $count = 0; # Number of IDs
    my $hits  = 0; # Number of matched IDs
    local $|  = 1; # Refreshing every time to improve the speed of display progress.
    my %scafs;

#-----------------------------------------------[reads ids]----------------------------------------------------------------

    open SCA,"name_genes_scaf.txt" || die "Open SCA Failed!\n";
       while(<SCA>) {
          my @lines = split(/\s+/);
          my $key=shift @lines;
          $key =~/(\S+)_\d+/;
          next if (exists $scafs{$1});
          $scafs{$1} = 1;
       }
    close SCA;

# -----------------------------------------show number of ids--------------------------------------------------------------

    my @ids = keys %scafs;#Reading IDs from a file and extract the sequences from another file.
    my $num = @ids;
    print "\nRead $num ids.\n\n";

#--------------------------------------------[ searching ]-----------------------------------------------------------------

    chdir "../scaf";
    my @gbk = glob("*-8.fa");
    foreach (@gbk) {
	    $_=~/(\S+)-8.fa/;#####################2-YBT1518-sp-2-11_L2_I350-8.fa
	    my $name = $1;
	    if (exists $scafs{$name}) {
		    system("cp $_ ../cry ");
	    }
    }

    chdir "../cry";

#==========================================================================================================================
# 211co_scaf.pl
# Function: place scaffolds in one file(All_scaf.txt).

    my @fa = glob("*-8.fa");
    foreach  (@fa) {
	    $_=~/(\S+)-8.fa/;
	    my $name = $1;
	    open IN13,"$_" or die "Open IN13 Failed!\n";
	    open OUT13,">>All_scaf.txt" or die "Open OUT13 Failed!\n";
#	local $/ = ">";
#	<IN13>;
	    while (<IN13>) {
		    chomp;
                if(/\>(\S+)/){
                   print OUT13 ">$name"."-scaf"."$1" . "\n";
                }else{
		           print OUT13 "$_\n";
                }
	    }
    }
    close IN13;
    close OUT13;

#==========================================================================================================================
#212gene_blastn_scaf_v2.pl
#blastn to find the scaffolds where the gene is located.

    system("makeblastdb -in All_scaf.txt -out All_scaf.db -dbtype nucl");
    system("blastn -query genes_nucl.fa -out genes_nucl.blastn -db All_scaf.db -evalue 0.001 -outfmt 6 -perc_identity 100.00 -max_target_seqs 1 -num_threads $cpus");

#-------------------extract the names of the hit scaffolds(hit_scaf_name.txt)---------------------------------------------

    open IN14,"genes_nucl.blastn" or die "Open IN14 Failed!\n";
    open OUT14,">>hit_scaf_name.txt" or die "Open OUT14 Failed!\n";
    while (<IN14>) {
	    chomp;
	    print OUT14 join "\t",(split/\t/)[1]."\t".(split/\t/)[0]."\t".(split/\t/)[8]."\t".(split/\t/)[9]."\n";
    }
    close IN14;
    close OUT14;

#----------------------------extract sequences of scaffolds to file "gene_scaf_seqs.txt"----------------------------------

    my $count2 = 0;
    my $hits2  = 0;
    local $|  = 1;
    my %ids_hash;

#---------------------------------------------[read ids]------------------------------------------------------------------

    open ID,"hit_scaf_name.txt" or die "Can't open ID file!\n";
    while(<ID>) {
       my @lines = split(/\s+/);
       my $key=shift @lines;
       next if (exists $ids_hash{$key});
       $ids_hash{$key} = $_;
    }

# ------------------------------------------show number of ids-------------------------------------------------------------

    my @ids1 = keys %ids_hash;
    my $num1 = @ids1;
    print "\nRead $num1 ids.\n\n";

#----------------------------------------------[ searching ]----------------------------------------------------------------

    open IN15, "All_scaf.txt" or die "Failed to open IN15 file!\n";
    open OUT15, ">>gene_scaf_seqs.txt" or die "Failed to open OUT15 file!\n";

    local $/ = ">";
    <IN15>;                 # ignore the first >.
    my ( $head, $seq );
    while (<IN15>) {
        $count2++;
        s/\r?\n>//;
        ( $head, $seq ) = split "\n", $_, 2;
        $seq =~ s/\s+//g;
        next unless $head =~ /(.+-scaf\d+)/;
        #next unless $head =~/(\d*[a-zA-Z]+\d+\S+)/;
        if (exists $ids_hash{$1}){
            print OUT15 "$head\t$seq\n";
        }
        $ids_hash{$1}++;
        $hits2++;
    }
    print "\rProcessing ${count2} th record. hits: $hits2";

# -----------------------------Show the numbers of IDs which had no match to any of the record------------------------------

    my @ids2 = grep {$ids_hash{$_} == 0} keys %ids_hash;
    my $num2 = @ids2;
    print "\n\n$num2 ids did not match any record in all_seqs.fa\n";
    print "@ids2\n";
    $/ = "\n";
    close IN15;
    close OUT15;

#===========================================================================================================================
#213gene_scaf_seqs_v2.pl
# Function:genes' ids with scaffolds in a line(gene_scaf_seqs3_2.txt).

    open INPUT5,"gene_scaf_seqs.txt" or die "Open INPUT5 Failed!\n";
    open OUTPUT5,">>gene_scaf_seqs2.txt" or die "Open OUTPUT5 Failed!\n";
    while (<INPUT5>) {
	    chomp;
	    if (/^(\S+)\t([A-Z]+)/) {
	       print OUTPUT5 "$1\t$2\n";
	    }
    }
    close INPUT5;
    close OUTPUT5;

    my %hashs;
    open IN16,"gene_scaf_seqs2.txt" or die "Open IN16 Failed!\n";
    while (<IN16>){
         chomp;
         my ($id,$seqs)=split(/\t/);
         $hashs{$id}= "$_";
    }
    open INPUT6,"hit_scaf_name.txt" or die "Open INPUT6 Failed!\n";
    open OUT16, ">>gene_scaf_seqs3.txt" or die "Open OUT16 Failed!\n";
    print OUT16 join("\t", "Gene Ids", "1-based position of start in scaffold", "1-based position of end in scaffold", "Scaffolds Ids", "Scaffolds Sequences") . "\n";
    while (<INPUT6>){
       chomp;
       my ($ids,$idg,$scafl,$scafr)=split(/\t/);
       if (exists $hashs{$ids}) {
          print OUT16 "$idg\t$scafl\t$scafr\t$hashs{$ids}\n";
       }
    }

    close IN16;
    close INPUT6;
    close OUT16;
}


#======================================Print New Genes Table=====================================================================================

open IN17,"All_genes_real_lt95_unique.txt" || die "Open IN17 Failed !\n";
open OUT17,">>New_genes_unique.table" || die "Open OUT17 Failed !\n";
print OUT17 join ("\t", "Protein_ID", "Protein_Length", "Rank", "Best_hit", "Hit_Length", "Coverage", "Identity", "Integrity", "Protein_Sequence", "Nucleotide_Sequence") . "\n";
#print OUT17 join ("\t", "Protein_ID", "Protein_Length", "Rank", "Best_hit", "Hit_Length", "Coverage", "Identity", "Protein_Sequence", "Nucleotide_Sequence") . "\n";
<IN17>;
while (<IN17>) {
	chomp;
	my @line3 = split(/\t/);
	$line3[1]=~/(\S+)_len(\d+)/;
    my $besthit2 = $1;
	my $hitlen = $2;
	my $covera = ($line3[3]/$line3[13])*100;
    my $scovera = sprintf "%.2f",$covera;
	my $Integrity = ($line3[3]/$hitlen)*100;
	my $sIntegrity = sprintf "%.2f",$Integrity;
	if ($line3[2] < 45) {
		print OUT17 join ("\t", $line3[0], $line3[13], "Rank1", $besthit2, $hitlen, $scovera, $line3[2], $sIntegrity, $line3[15], $line3[14]) . "\n";
#		print OUT17 join ("\t", $line3[0], $line3[14], "Rank1", $line3[1], $hitlen, $scovera, $line3[2], $line3[16], $line3[15]) . "\n";
	}elsif ($line3[2] >= 45 and $line3[2] < 78) {
		print OUT17 join ("\t", $line3[0], $line3[13], "Rank2", $besthit2, $hitlen, $scovera, $line3[2], $sIntegrity, $line3[15], $line3[14]) . "\n";
#		print OUT17 join ("\t", $line3[0], $line3[14], "Rank2", $line3[1], $hitlen, $scovera, $line3[2], $line3[16], $line3[15]) . "\n";
	}else{
	    print OUT17 join ("\t", $line3[0], $line3[13], "Rank3", $besthit2, $hitlen, $scovera, $line3[2], $sIntegrity, $line3[15], $line3[14]) . "\n";
#		print OUT17 join ("\t", $line3[0], $line3[14], "Rank3", $line3[1], $hitlen, $scovera, $line3[2], $line3[16], $line3[15]) . "\n";
	}
}


system("rm -rf gene_scaf_seqs2.txt gene_scaf_seqs.txt All_scaf.db.nhr All_scaf.db.nin All_scaf.db.nsq All_scaf.txt genes_nucl.blastn hit_scaf_name.txt");
system("mkdir new_genes old_genes other_files scaffolds");
system("mv New_genes_unique.table All_genes_real_lt95_unique.txt new_genes/");
system("mv All_genes.txt All_genes_real_lt95.txt name_genes_scaf.txt genes_nucl.fa All_genes_real.txt other_files/");
system("mv  Old_genes.table All_genes_real_gt95.txt old_genes/");

if(($type eq "reads") or ($type eq "assembled")){
   system("mv gene_scaf_seqs3.txt new_genes/");
   system("mv *-8.fa scaffolds/");
}

print "Congratulations, programm completing...\n\n";


sub  showhelp{
print STDERR<<EOF;

BtToxin_scanner2.pl (v1.0)

Usage and example:

         for reads: BtToxin_scanner2.pl -s reads -1 .R1.clean.fastq.gz -2 .R2.clean.fastq.gz

         for contigs/scaffolds: BtToxin_scanner2.pl -s assembled -3 "-8.fa"

         for CDs: BtToxin_scanner2.pl -s cds -p CDs -4 ".faa" -5 ".ffn"

    Essential:

	         -1          The suffix name of reads 1 (for example: if the name of reads 1 is "YBT-1520_L1_I050.R1.clean.fastq.gz",
                         "YBT-1520" is the strain same, so the suffix name should be ".R1.clean.fastq.gz")

	         -2          The suffix name of reads 2(for example: if the name of reads 2 is "YBT-1520_2.fq", the suffix name should be _2.fq")

			 -3          The suffix name of contigs or scaffolds(default -8.fa)

			 -s          Input type for analysis(3 choices: reads, assembled, cds. default is reads)

			 -p          The path of CDs contain both amino acid sequences and nucleotide sequences(default CDs)

			 -4          The suffix name of files contain amino acid sequences(default .faa)

			 -5          The suffix name of files contain nucleotide sequences(default .ffn)


    Options:

	         -h        Show help

	         -k	       k-mer size for genome assembly (default 81)

	         -n	       number of cpus to use (default 6)

	         -e	       evalue for blast against the Bt toxin database (default 1e-5)

	         -c        the minimum size of blast coverage(default 50)

			 -g	       Specify a translation table to use for prodigal (default 11)

			 -d        Database for blastp (default Bt_toxin)

EOF
            }