#!/usr/bin/perl -w
use strict;
use File::Basename;
use Bio::SeqIO;
use Cwd;


## Command-line entry point.
##   $ARGV[0]  peptide list file (one peptide per line)
##   $ARGV[1]  reference protein database (FASTA)
##   $ARGV[2]  output FASTA file to create
## Fail early with a usage message instead of passing undef paths down,
## which would only surface later as warnings and an obscure I/O error.
## Extra arguments are still tolerated, as before.
if(@ARGV < 3){
	die "Usage: $0 <peptide_file> <reference_db.fasta> <output_db.fasta>\n";
}

my $pep = $ARGV[0];
my $db = $ARGV[1];
my $outdb = $ARGV[2];

generate_ref_db($pep,$db,$outdb);



## generate_ref_db($in_pep_file, $db, $outdb)
##
## Build a reduced reference database by masking peptides out of a protein
## FASTA file.
##
## All protein sequences from $db are joined into one string with "_" placed
## in front of each sequence, and every "I" is converted to "L". Each peptide
## listed in $in_pep_file (one per line) is then replaced by "_" wherever it
## occurs literally. Finally the string is split on "_" and every fragment
## that contains at least one upper-case letter is written to $outdb as a
## FASTA record named ">new<index>", where <index> is the fragment's position
## in the split list (empty fragments are skipped but still consume an index).
##
## Parameters:
##   $in_pep_file  path of the peptide list (target and decoy peptides)
##   $db           path of the reference protein FASTA file
##   $outdb        path of the FASTA file to create (overwritten if present)
##
## Returns 1 on success. Dies if the peptide file cannot be opened or the
## output file cannot be opened/closed. Reading $db is delegated to
## Bio::SeqIO, which is expected to raise its own error on failure.
sub generate_ref_db{
	## A file containing both target and decoy peptides
	my $in_pep_file = shift;
	my $db = shift;
	my $outdb = shift;

	print "Read protein reference db: $db\n";
	my $db_reader = Bio::SeqIO->new(-file=>"$db", -format=>'fasta');
	my $n_pro = 0;

	## Append in place; rebuilding the whole string for every protein
	## copies the accumulated proteome again and again.
	my $one_seq_for_all_pros = "";
	while(my $seq_obj = $db_reader->next_seq){
		my $seq = $seq_obj->seq();
		## A record without residues contributes only its separator.
		$seq = "" unless defined $seq;
		$n_pro++;
		$one_seq_for_all_pros .= "_" . $seq;
	}

	print "Total number of proteins: $n_pro\n";
	## Note: this length includes the "_" separators.
	print "Length of all proteins: ",length($one_seq_for_all_pros),"\n";

	print "I 2 L ...\n";
	$one_seq_for_all_pros =~ tr/I/L/;

	## NOTE(review): only the database is converted I->L; peptides are used
	## as given, so a peptide that still contains "I" can never match.
	## Presumably the peptide file is already I->L converted - confirm.
	open(my $pep_fh, '<', $in_pep_file) or die "Cannot open $in_pep_file: $!\n";
	while(my $line = <$pep_fh>){
		## Strip the line ending (including "\r" from CRLF files) and
		## surrounding blanks, none of which can occur in a sequence.
		$line =~ s/\A\s+//;
		$line =~ s/\s+\z//;
		## An empty pattern would not mean "match nothing": Perl reuses the
		## last successful pattern or matches at every position.
		next if $line eq "";
		## \Q...\E: match the peptide literally, never as a regex.
		$one_seq_for_all_pros =~ s/\Q$line\E/_/g;
	}
	close($pep_fh);

	open(my $out_fh, '>', $outdb) or die "Cannot write $outdb: $!\n";
	print "Generate new reference database: $outdb\n";
	my @dd = split(/_/,$one_seq_for_all_pros);

	my $n_pro_new = 0;
	for my $i (0 .. $#dd){
		## Skip empty fragments left between adjacent separators.
		next unless $dd[$i] =~ m/[A-Z]/;
		$n_pro_new++;
		print $out_fh ">new$i\n$dd[$i]\n";
	}
	print "New reference proteins: $n_pro_new\n";
	## Buffered write errors (e.g. disk full) only surface at close.
	close($out_fh) or die "Cannot close $outdb: $!\n";
	return 1;
}




