#!/usr/bin/perl
use strict;
use warnings;
use Cwd;
use Getopt::Long;
use Data::Dumper;
use File::Basename qw(basename dirname);
use FindBin qw($Bin $Script);
use PerlIO::gzip;


# ---------------------------------------------------------------------------
# Command-line configuration and file-scope working variables.
# ---------------------------------------------------------------------------
my $key1_column = 1;     # 1-based index of the key/ID column in --in1
my $key2_column = 1;     # 1-based index of the key/ID column in --in2
my $fIn1;                # path given with --in1
my $fIn2;                # path given with --in2
my $fOut;                # path given with --out
my $field_sep = "\t";    # column separator, used for reading and for writing

# Working variables declared at file scope: under `use strict` every use of
# these names further down the file needs a declaration in scope.
my @line1;
my @line2;
my %target;
my %exp;
my $ID1;
my $ID2;
my @selected_exp;

# Every value-taking option uses "=" (mandatory argument).  With the optional
# form ":" a bare "-k1" silently became column 0 and a bare "-f" became the
# empty string, so a typo produced a wrong merge instead of an error message.
GetOptions(
    "help|h"    => \&USAGE,
    "key1|k1=i" => \$key1_column,
    "key2|k2=i" => \$key2_column,
    "field|f=s" => \$field_sep,
    "in1|i1=s"  => \$fIn1,
    "in2|i2=s"  => \$fIn2,
    "out|o=s"   => \$fOut,
) or USAGE();

# All three file arguments are required.  Test for "defined and non-empty"
# rather than truthiness so that a file literally named "0" is accepted.
if ( grep { !defined $_ || !length $_ } ( $fIn1, $fIn2, $fOut ) ) {
    USAGE();
}

# Key columns are 1-based.  A value below 1 would be turned into a negative
# array index later on and silently select a column counted from the end.
for my $column ( $key1_column, $key2_column ) {
    die "Error: key columns are 1-based and must be >= 1 (got $column)\n"
        if $column < 1;
}

# The help text shows "-f TAB"; accept that spelling (and a literal
# backslash-t) as a request for a real tab character instead of splitting on
# the three-letter string "TAB".
if ( $field_sep eq 'TAB' or $field_sep eq '\t' ) {
    $field_sep = "\t";
}

# USAGE
#
# Print the help text to STDOUT and terminate the program with `exit`.
# Takes no arguments of its own; it is installed as the --help handler and is
# also called when option parsing fails or a required file option is missing.
#
# NOTE(review): the example output in the help text lists the file1 columns
# first, whereas the merge loop at the end of this file writes the file2 line
# first, followed by the file1 columns without the file1 key column.  Confirm
# which file is meant to be passed as --in1 and adjust either the text or the
# loop accordingly.
sub USAGE {
        # Non-interpolating heredoc: the help text is emitted verbatim, so
        # "$" or "@" inside the example data can never be interpolated.
        my $usage=<<'USAGE';
        
Copyright (C) - belong to Ma et al., 2021, Genome research. If you use or modify it, please cite the paper.
        
Description: Extract information of ID (input file1) from input file2, and then merge info in file1 and file2 in output file.

Usage:

perl extract_merge_info.pl -k1 N1 -k2 N2 -f TAB --in1 file1 --in2 file2 --out out_file.txt 

  Options:
  
  --key1 N1
  -k1 N1 		column N1 as key/ID column for FILE1 (default: 1st column);
    
  --key2 N2
  -k2 N2 		column N2 as key/ID column for FILE2 (default: 1st column);
  
  --field TAB
  -f	  TAB   field separator among columns of files (default: TAB) 
  
  --in1 <FILE>
  -i1 	<FILE> 	input file1: miRNA info;
  
  --in2 <FILE2>  
  -i2 	<FILE> 	input file2: target gene expression info
  
  --out <FILE>
  -o	<FILE> 	output file
            
  --help
  -h	        Help

Example files: 

### * file1: miRNA file * ###########################

miRNA	GeneID	Score
miR167	GRMZM2G073750	5
miR167	GRMZM2G028980	5
miR319	GRMZM2G120954	3.5
miR390ab	GRMZM2G020468	4.5
miR393	GRMZM2G137451	3
miR396	GRMZM5G850129	3

### * file2: gene expression file * #################

GeneID	Description	Chr	Start	End	Strand	Meristem	L1	L2	Tip	P0	Internode	P1	P2	P3	Vasculature	Adaxial	Abaxial
GRMZM2G073750	'highly similar to ( 741) AT1G30330 | Symbols: ARF6...' 	chr3	123862422	123868621	-	37.7	57.6	41.75	74.15	22.95	34.1	14.6	16.7	19.8	76.15	21.05	19.35
GRMZM2G028980	'weakly similar to ( 103) loc_os02g06910 ... ARF6...'	chr4	236472247	236478195	-	63.5	48	62.55	58.45	41	77.35	35.9	46.5	51.7	80.85	47.45	34.4
GRMZM2G120954	'moderately similar to ( 493) AT2G26770 plectin-related...'	chr7	169451699	169456757	+	44.15	46.05	38.75	36.1	32.35	31.55	27.4	33.85	40.3	71.15	35.75	46.8
GRMZM2G137451	'highly similar to ( 768) AT3G26810 | Symbols: AFB2...'	chr10	110996613	111000922	-	104.45	108.3	108.75	88.05	105.15	102.7	115.3	105.1	118.5	119.6	103.4	103.15
GRMZM5G850129	'weakly similar to ( 145) AT3G13960 | Symbols: AtGRF5...'	chr6	108305466	108307341	+	10.95	11	11.45	4.75	24.35	15.85	24.7	25.05	28.6	5.8	18.55	50.3
GRMZM2G067743	'weakly similar to ( 149) AT3G13960 | Symbols: AtGRF5...'	chr9	9812369	9814204	-	34	32.7	29.35	11.95	53.8	23.9	50.1	34.75	53.65	57.25	109.85	44.05
....

### * output file * #################################

miRNA	GeneID	Score	Description	Chr	Start	End	Strand	Meristem	L1	L2	Tip	P0	Internode	P1	P2	P3	Vasculature	Adaxial	Abaxial
miR167	GRMZM2G073750	5	'highly similar to ( 741) AT1G30330 | Symbols: ARF6...' 	chr3	123862422	123868621	-	37.7	57.6	41.75	74.15	22.95	34.1	14.6	16.7	19.8	76.15	21.05	19.35
miR167	GRMZM2G028980	5	'weakly similar to ( 103) loc_os02g06910 ... ARF6...'	chr4	236472247	236478195	-	63.5	48	62.55	58.45	41	77.35	35.9	46.5	51.7	80.85	47.45	34.4
miR319	GRMZM2G120954	3.5	'moderately similar to ( 493) AT2G26770 plectin-related...'	chr7	169451699	169456757	+	44.15	46.05	38.75	36.1	32.35	31.55	27.4	33.85	40.3	71.15	35.75	46.8
miR390ab	GRMZM2G020468	4.5	tas3b 	61.9	181.35	72.8	119.2	69.2	32.5	27.05	98.15	200.05	102.35	321.25	519.45	4.5
miR393	GRMZM2G137451	3	'highly similar to ( 768) AT3G26810 | Symbols: AFB2...'	chr10	110996613	111000922	-	104.45	108.3	108.75	88.05	105.15	102.7	115.3	105.1	118.5	119.6	103.4	103.15
miR396	GRMZM5G850129	3	'weakly similar to ( 145) AT3G13960 | Symbols: AtGRF5...'	chr6	108305466	108307341	+	10.95	11	11.45	4.75	24.35	15.85	24.7	25.05	28.6	5.8	18.55	50.3


USAGE
        print $usage;
        exit;
}

# ---------------------------------------------------------------------------
# Merge step.
#
#   1. Index every line of --in1 by its whitespace-trimmed key column.
#   2. Stream --in2; for each line whose trimmed key is present in the index,
#      write the --in2 line followed by the columns of the matching --in1
#      line, leaving out the --in1 key column.  Lines of --in2 without a
#      match are dropped, so the output follows the row order of --in2.
#
# If a key occurs more than once in --in1, the last occurrence wins.
# ---------------------------------------------------------------------------

# Match the separator literally.  Used as a raw pattern, separators such as
# "|" or "." are regex metacharacters and would split at the wrong places.
my $sep_re = qr/\Q$field_sep\E/;

# Three-argument open with lexical handles: the file name is never parsed for
# mode characters, and the error message names the file that failed.
open my $in1_fh, '<', $fIn1
    or die "Error: cannot open the file '$fIn1': $!";
open my $in2_fh, '<', $fIn2
    or die "Error: cannot open the file '$fIn2': $!";
open my $out_fh, '>', $fOut
    or die "Error: cannot open the file '$fOut': $!";

# Pass 1: remember each --in1 line under its key.
while ( my $row = <$in1_fh> ) {
    chomp $row;

    # LIMIT -1 keeps trailing empty columns instead of silently dropping them.
    my @cols = split $sep_re, $row, -1;
    my $key  = $cols[ $key1_column - 1 ];

    # Blank line, or fewer columns than the key column: nothing to index.
    next unless defined $key;

    $key =~ s/^\s+//;
    $key =~ s/\s+$//;
    $exp{$key} = $row;
}

# Pass 2: emit one merged row per matching --in2 line.
while ( my $row = <$in2_fh> ) {
    chomp $row;

    my @cols = split $sep_re, $row, -1;
    my $key  = $cols[ $key2_column - 1 ];
    next unless defined $key;

    $key =~ s/^\s+//;
    $key =~ s/\s+$//;

    # `exists`, not truthiness: a stored line such as "0" is still a match.
    next unless exists $exp{$key};

    # Columns of the --in1 line without its key column (the key is already
    # part of the --in2 line that is written first).
    my @extra = split $sep_re, $exp{$key}, -1;
    splice @extra, $key1_column - 1, 1;

    # Join the --in2 line and the extra columns in one parenthesised call so
    # the newline is not pulled into the joined list; this avoids a dangling
    # separator at the end of each row.  Writing $row directly also avoids
    # looking the line up again under an untrimmed key.
    print {$out_fh} join( $field_sep, $row, @extra ), "\n";
}

close $in1_fh;
close $in2_fh;

# Buffered write errors (e.g. disk full) only surface when the handle is closed.
close $out_fh or die "Error: cannot finish writing the file '$fOut': $!";