#!/usr/local/bin/perl

use Statistics::Basic::Mean;  
use Statistics::Basic::StdDev;
use IO::File; 
use strict;


################################################################################################
################################################################################################
# SFPscanV10.pl
#
# Author: Hans van Leeuwen
#         Dept. of Plant Sciences
#         University of California, Davis
#         Mail Stop 3
#         One Shields Ave
#         Davis, CA  95616
#         E-mail: hvanleeuwen@ucdavis.edu
#         Web: http://elp.ucdavis.edu/data/analysis/sfp_map/SFPscan.html
#              
#
# Date:   September 2005
#
################################################################################################
################################################################################################





################################################################################################
################################################################################################
################################# Program usage ################################################
#
# >SFPscanV10.pl [CONVERSION_FILE] [SETUP_FILE] [CHIP_LIST_FILE] [PARENT_A_CHIP_LIST_FILE] 
#  [PARENT_B_CHIP_LIST_FILE] [GAP_SIZE] [ALLELE_DISTORTION] [UNASSIGNED_SCORES] [OUTPUT] [USE_CD]
#
#  (all input files should be in the same directory as the SFPscanV10.pl file)
#  (all arguments must be entered! even if you use default values; see below)
#
#
# [CONVERSION_FILE] 
#    The conversion file with the probeset and gene code data, having the format:
#    Probe Set ID	Sequence Derived From
#    245016_at	accD
#    261585_at	At1g01010
#    261568_at	At1g01030
#    261584_at	At1g01040
#    ...etc
#
#    This file is provided with this script: "ATH1_conv-probeset-genecode.txt"
#    Note: Positions in the Col-0 genome  of the probeset sequences from the Affymetrix ATH1 GeneChip
#          were obtained from the Arabidopsis annotation Version 4, TIGR release May 2003.
#
#
#
# [SETUP_FILE] 
#    A file with the experimental setup, first column is RIL code, next columns, rep chip codes.
#    First line should be a header line starting with ; (no space between ';' and 'RILcode')
#    For example: 
#    ;RILcode	SW1	SW2
#    4	R5	R620
#    5	R85	R687
#    8	R221	R495b
#    13	R30	R666
#
#
#
#
# [CHIP_LIST_FILE] 
#    A file with a list of chip files that will be used in this study.
#    this file should contain both the RIL chip file names and the parental chip file names.
#    File should have following format:
#    R5.CEL
#    R85.CEL
#    R221.CEL
#    R30.CEL
#    R82.CEL
#    R270.CEL
#    R76.CEL
# 
#    etc.
#
#
#
# [PARENT_A_CHIP_LIST_FILE] 
# [PARENT_B_CHIP_LIST_FILE] 
#    Two files with the chip file names of the parental chips.
#    File should have following format:
#    R5.CEL
#    R85.CEL
#    R221.CEL
#    R30.CEL
# 
#    etc.
#
#
#
#
# [GAP_SIZE]
#    The minimum distance between the two peaks in the bimodal distribution.
#    This value is calculated as follows: min_SFPdev SFP parent / max_SFPdev nonSFP parent.
#    The default value is 2.0
#    A value greater than 1.0 means that the distributions are seperated.
#    This value should be chosen carefully; a too low value may increase the number of markers,
#    but can also increase the number of unassigned alleles; a too high value may give higher
#    higher quality markers, but the number of markers obtained decreases.
#    
#
#
# [ALLELE_DISTORTION]
#    In a RIL population a allele segregation of 50%-50% is expected, 
#    but this can vary in certain populations
#    The default value is 80
#    This means that allele segregations from 80%-20% to 20%-80% are allowed.
#
#
#
# [UNASSIGNED_SCORES]
#    This sets a limit to the number of unassigned allele scores a marker may have.
#    The default value is 10
#    This means that a marker is discarded if more than 10% of the allele calls are unassigned.
#
#
#
# [OUTPUT]     
#    This is a short text (please use numbers, letters and underscores only)
#    that will be used in the output file names.         
#
#
#
# [USE_CD]
#    You have the option to use 'C' and 'D' scores in genotyping (C = NOT A, D = NOT B).
#    If you select this option the following genotyping rules will be used:
#          If half or more than half of the rep chips is genotyped as one parental accession,
#          and the other rep chips as unassigned: use C or D.
#       Type 'YES' to select this option.
#    If you do not select this option the above mentioned chips will be scored as '-'.
#       Type 'NO' if you do not want the 'C' and 'D' scoring to be used.
#    
#
#
#
################################################################################################
################################################################################################






################################################################################################
################################################################################################
################################### Genotyping rules ###########################################
#
#
#            The following rules are used when determining a genotype for a RIL,
#            based on the genotypes of the individual reps:
#            A = parent A, B = parent B, C = NOT A, D = NOT B, - = unassigned
#            If all rep chips are genotyped as one accession: use A or B
#            If half of the rep chips is genotyped for one accession,
#                 and the other half of the rep chips for the other accession: use -
#            If half or more than half of the rep chips is genotyped as one accession,
#                 and the other rep chips as unassigned: use C or D
#            Else, use: - 
#
#
################################################################################################
################################################################################################






################################################################################################
################################################################################################
################################# Preparation .CEL files #######################################
# 
# The .CEL files need to be background corrected seperately under the R programming environment,
# with the Bioconductor package installed. Use the following R-script (copy and paste
# this code (all lines between START SCRIPT and END SCRIPT in a text file
# and then load it into the R-programming environment).
# Make sure to remove the first '#' (only the first!!) in all code lines
# between the START and END lines. 
#
#
################################# START R-SCRIPT ###########################
# ##########################################################################
# #
# # Author: Hans van Leeuwen
# #         Department of Plant Sciences
# #         University of California, Davis
# #         Mail Stop 3
# #         One Shields Ave
# #         Davis, CA  95616
# #         
# #         Email: hvanleeuwen@ucdavis.edu
# #
# # Date:   JUNE 2005
# #
# ##########################################################################
# #
# ##########################################################################
# ######################### IMPORTANT ######################################
# ##########################################################################
# ########### THE LINES MARKED WITH 'CHANGE' MUST  #########################
# ########### BE MODIFIED USING YOUR PATH          #########################
# ##########################################################################
# ######################### IMPORTANT ######################################
# ##########################################################################
#
# ##########################################################################
# # Set work directory to the directory where your original .CEL files are located
# setwd("C:/rw1081/hans_temp") ######### CHANGE ########
# #
# ##########################################################################
# # Read in file with chip ID's for which the CEL files will be read later
# # This file should have format:
# # R5.CEL
# # R85.CEL
# # R221.CEL
# # R277.CEL
# # R30.CEL
# # R82.CEL
# # R270.CEL
# #
# # etc...
# #
# ##### This file should have the filename 'chips.txt' #####################
# chips <- read.table("chips.txt")
# #
# ##########################################################################
# # Load affy package
# library(affy)
# 
# ##########################################################################
# # Open cel files one by one
# for (i in 1:length(chips$V1)){
#   # Set work directory to the directory where your .CEL files are located
#   setwd("C:/rw1081/hans_temp") ######### CHANGE ########
#   data <- ReadAffy(filenames = as.character(chips$V1[i]))
#   data.bg <- bg.correct.rma(data)
#   data.bg.matrix <- pm(data.bg)
#   # Set work directory to the directory where the background corrected files
#   # will be saved. This can not be the same work directory that contains the original .CEL files!!
#   setwd("C:/rw1081/hans_temp/bg_files") ######### CHANGE ########
#   write.table(data.bg.matrix, file = as.character(chips$V1[i]), append = FALSE, quote = TRUE, sep = ",", eol = "\n", na = "NA", dec = ".", row.names = TRUE, col.names = TRUE, qmethod = c("escape", "double"))
# } # end for
################################# END R-SCRIPT #############################
#
#
# The obtained files should have a format similar to:
#"471.CEL"
#"244901_at1",363.582352941176
#"244901_at2",124.111764705882
# ...etc
#
#
################################################################################################
################################################################################################





################################################################################################
################################################################################################
################################# Output files  ################################################
#
# The following output files will be created:
#    (the asterisk indicates the text string from the OUTPUT argument provided by the user)
#
# 1. *_SFPscanV10_log.txt
#
#            Logfile, contains possible error messages.
#
#
# 
# 2. *_SFPscanV10_marker_data.txt
#
#            column 1: SFP marker ID
#            column 2: Parental line with the SFP (A/B)
#            column 3: max_SFPdev nonSFP parent
#                      This is the right boundary of the distribution of the
#                      chips that have the genotype of the parent that does 
#                      not contain the SFP.
#            column 4: min_SFPdev SFP parent
#                      This is the left boundary of the distribution of the
#                      chips that have the genotype of the parent that
#                      contains the SFP.
#            column 5: spread
#                      This is the result of: min_SFPdev SFP parent/max_SFPdev nonSFP parent
#                      The user sets the minimum allowed spread.
#                      A spread greater than 1 means that the distributions are seperated.
#            column 6: A frequency
#                      This is the frequency of the 'A' allele
#            column 7: B frequency
#                      This is the frequency of the '-' allele
#            column 8: missing data frequency
#                      This is the frequency of unassigned allele calls.
#
#            NOTE: The data in columns 6, 7, 8 is based on data from all chips,
#                  before a genotype is assigned to a RIL based on the individual
#                  genotypes of the reps for that RIL. In other words, these frequencies
#                  can be slightly different in the final marker data, because then
#                  the data for the RILs (and not the individual reps) is used.
#
# 
#
#
# 3. *_SFPscanV10_genotypes_all_chips.txt
#            
#            Contains genotypes for all chips for all markers in following format:
#
#            SFPmarker      RIL        chip     rep	genotype
#            At5g52440-9    Bay-0_2    R473b    1     B
#            At5g52440-9    Bay-0_2    R628     2     B
#            At1g64900-6    Bay-0_2    R473b    1     B
#            At1g64900-6    Bay-0_2    R628     2     B
#            At3g25610-10   30         R493     2     B
#            At3g25610-10   30         R260     1     B
#            At1g54030-7    30         R260     1     B
#            At1g54030-7    30         R493     2     B
# 
#
#
#
# 4. *_SFPscanV10_genotypes_jm.txt
#
#            Contains genotypes for all RILs for all markers, in JoinMap format:
#            (RILs are sorted in numerical order; the actual RIL codes 
#             can be seen in the Excel format file (see below))
#
#            marker14
#            DABAAADAADBADBAADBAABBDAAADDBAADAAABAAABABDAAABBAA
#            marker30
#            AABAABBBAABBAAAABABBAAABBAAAABAAABAAAAAAAAAABBAAA-A
#            marker35
#            BAADAAABADAABAAABABBAAAABABAADAABBDCADADAABAABA-DA
#            etc.
#
#
#
# 
# 5. *_SFPscanV10_genotypes_ex.txt
#
#           Contains genotypes for all RILs for all markers, in Excel format (tabulated):
#           (RILs are sorted in numerical order on first row)
#
#                      4	5	8	13	14	16
#           marker14   D	A	B	A	A	A
#           marker30   A      A     B     A     A     B
#           et. 
#
#
#
# 
# 6. *_SFPscanV10_genotypes_clean_jm.txt
#
#           Contains genotypes for all RILs for markers that passed the post-genotyping
#           filtering process. Data is in JoinMap format.
#
#
#
#
# 
# 7. *_SFPscanV10_genotypes_clean_ex.txt
#
#           Contains genotypes for all RILs for markers that passed the post-genotyping
#           filtering process. Data is in Excel format (tabulated).
#
#
#
################################################################################################
################################################################################################





################################################################################################
#
# Read in command line arguments, check for correct usage
#
# Remember usage:
# >SFPscanV10.pl [CONVERSION_FILE] [SETUP_FILE] [CHIP_LIST_FILE] [PARENT_A_CHIP_LIST_FILE] 
#  [PARENT_B_CHIP_LIST_FILE] [GAP_SIZE] [ALLELE_DISTORTION] [UNASSIGNED_SCORES] [OUTPUT] [USE_CD]
#
if(($#ARGV + 1) != 10){
  print_usage();
} # end if
my $probeset_gene_file = $ARGV[0];
my $expfile = $ARGV[1];
my $chipidfile = $ARGV[2];
my $parental_A_chips_file = $ARGV[3];
my $parental_B_chips_file = $ARGV[4];
my $spread = $ARGV[5];
my $distortion = $ARGV[6];
my $unassigned = $ARGV[7];
my $output = $ARGV[8];
my $use_cd = $ARGV[9];
chomp($use_cd);
# Check some values
if($use_cd ne "YES" and $use_cd ne "NO"){
  print_usage();
} # end if
if($spread < 1){
  print_warning("Are you sure you want to use a GAP_SIZE smaller than 1.0?\nThis will give overlapping distributions!\n");
} # end if
if($distortion > 80){
  print_warning("Are you sure you want to allow an ALLELE_DISTORTION greater than 80%?\n");
} # end if
if($unassigned > 10){
  print_warning("Are you sure you want to allow more than 10% UNASSIGNED_SCORES?\n");
} # end if




################################################################################################
# Define genotypes
my $genotype_A = "A"; 
my $genotype_B = "B";




################################################################################################
# Open logfile 
my $logfile = $output."_SFPscanV10_log.txt";
open(LOGFILE, ">$logfile") or die "Can't open '$logfile'!\n";
print "Program parameters:\n\tgap size: $spread\n\tallele distortion: $distortion\n\tunassigned scores: $unassigned\n\n";
print LOGFILE "Program parameters:\n\tgap size: $spread\n\tallele distortion: $distortion\n\tunassigned scores: $unassigned\n\n";



################################################################################################
# Open conversion file with probeset and gene id 
open(PGFILE, "$probeset_gene_file") or die "Can't open '$probeset_gene_file'!\n";
my @pg_data = <PGFILE>;
# close input file
close(PGFILE);
# remove header line
shift(@pg_data);
# Store in hash for easy lookup later
my %pg_hash;
# File has format: probeset    gene_id
# Store in hash with probeset as key, and gene_id as value
# $1 is probe set, $2 is gene_id
foreach my $line(@pg_data){
  my @split_line = split(/\t/, $line);
  chomp($split_line[1]);
  $pg_hash{$split_line[0]} = $split_line[1];
} # end foreach
# Let user know
print "probeset-gene_id file read: $probeset_gene_file\n";
print LOGFILE "probeset-gene_id file read: $probeset_gene_file\n";






################################################################################################
# Open inputfile with RIL experiment setup.
open(EXPFILE, "$expfile") or die "Can't open '$expfile'!\n";
my @exp_setup = <EXPFILE>;
# close input file
close(EXPFILE);
# Store in hash for easy lookup later
my %exp_setup_hash;
# Get IDs for reps
my @rep_IDs = split(/\t/, shift(@exp_setup));
# Check if file header line starting with ';'
my $check_header = shift(@rep_IDs);
if(substr($check_header, 0, 1) ne ";"){
  print "RIL experiment setup file $expfile does not have the correct format!\n";
  print LOGFILE "RIL experiment setup file $expfile does not have the correct format!\n";
  exit(1);
} # end if 
# remove newline chars
foreach(@rep_IDs){
  chomp;
} # end foreach
# Determine number of reps
my $number_reps = scalar @rep_IDs;
# Loop through file content and store in hash
foreach my $line(@exp_setup){
  # remove newline char
  chomp($line);
  my @current_line_array = split(/\t/, $line);
  # get RIL code
  my $current_RIL_code = shift(@current_line_array);
  # get rep codes
  for(my $i = 0; $i < $number_reps; $i++){
    $exp_setup_hash{$current_line_array[$i]}->{'rep'} = $i + 1;
    $exp_setup_hash{$current_line_array[$i]}->{'treatment'} = $rep_IDs[$i];
    $exp_setup_hash{$current_line_array[$i]}->{'RIL'} = $current_RIL_code;
  } # end for  
} # end foreach
# Let user know this is done
print "RIL experiment setup file read: $expfile\n";
print LOGFILE "RIL experiment setup file read: $expfile\n";
print "Number of reps determined: $number_reps.\n";
print LOGFILE "Number of reps determined: $number_reps.\n";






################################################################################################
# Open file with parental chip IDs
open(PARENTA, "$parental_A_chips_file") or die "Can't open '$parental_A_chips_file'!\n";
my @parental_A_chips = <PARENTA>;
# close file
close(PARENTA);
# remove end of line chars
foreach my $line(@parental_A_chips){
  chomp($line);
} # end foreach
open(PARENTB, "$parental_B_chips_file") or die "Can't open '$parental_B_chips_file'!\n";
my @parental_B_chips = <PARENTB>;
# close file
close(PARENTB);
# remove end of line chars
foreach my $line(@parental_B_chips){
  chomp($line);
} # end foreach





################################################################################################
# Open file with chip IDs
open(CHIPIDFILE, "$chipidfile") or die "Can't open '$chipidfile'!\n";
my @chipids = <CHIPIDFILE>;
# close file
close(CHIPIDFILE);
# remove end of line chars
foreach my $line(@chipids){
  chomp($line);
} # end foreach
# Let user know
print "ChipID file read: $chipidfile\n";
print LOGFILE "ChipID file read: $chipidfile\n";






################################################################################################
# Open one chip file to get list of probeset codes.
open(FILE, "$chipids[0]") or die "Can't open '$chipids[0]'!\n";
my @data = <FILE>;
# close file
close(FILE);
# remove header line
shift(@data);
# Get list of probeset names and store in array
my @temp;
my $previous_probeset;
my $current_probeset;
my $probe_number = 0;
my $first_line_done = "FALSE";
foreach my $line(@data){
  if($line =~ /\"(.+?_at)/g){
    $current_probeset = $1;
    # First line case
    if($first_line_done eq "FALSE"){
      $previous_probeset = $current_probeset;  
      $first_line_done = "TRUE"; 
    } # end if
    # Evaluate
    if($current_probeset eq $previous_probeset){ 
      $probe_number++;
    }
    else{
      my @to_add = ($previous_probeset, $probe_number);
      push(@temp, \@to_add);
      $probe_number = 1;
    } # end else
  } # end if
  $previous_probeset = $current_probeset;
} # end foreach
# Sort list
my @probesets = sort {lc($a->[0]) cmp lc($b->[0])} @temp;






################################################################################################
# Open all chip files
# Create array with filehandles for all files
my @chip_fh;
# Create filehandles
foreach my $file(@chipids){
  my $fh = IO::File->new("< $file") or die "Error opening file: $!";
  my @temp = ($fh, $file);
  push(@chip_fh, \@temp);
} # end foreach





################################################################################################
# Open output file to store for each marker: which parent has the SFP, 
# max_SFPdev nonSFP parent, min_SFPdev SFP parent, spread,
# frequency A allele, frequency B allele, frequency missing data.
my $output_0 = $output."_SFPscanV10_marker_data.txt";
open(OUTPUT0, ">$output_0");
# Print header
print OUTPUT0 "SFPmarker\tParental line with SFP\tmax_SFPdev nonSFP parent\tmin_SFPdev SFP parent\tSpread\tA frequency\tB frequency\tmissing data frequency\n"; 






################################################################################################
# Loop through probeset. 
# For each probeset, open all BG corrected RIL files and get expression values for 
# all probes belonging to that probeset.
# Then calculate SFPdev for all probes in the probesets, for all chips.
# Store SFPdev values in an array, sort the array, the loop through
# the array and test if there are gaps with a spread > 2. 
# If such a gap is found, check if all chips of one parent fall on the left of the gap,
# and all chips of the other parent on the right of the gap.
# Thus, if a gap with spread > 2 is found AND the distribution of the parental chips is okay,
# then this probe is an SFP marker.
# Genotype all the chips falling to the left of the gap as the corresponding parent,
# and all the chips falling to the right of the gap as the other parent.
#
# Result of all detected SFP markers
my %result;
# Result of all post-genotyping filtered SFP markers
my %result_clean;
my $line_count = 0;
my $number_of_probesets = scalar (@probesets);
my $probeset_counter = 0;
for (my $i = 0; $i < scalar (@probesets); $i++){
  my $probeset = $probesets[$i]->[0];
  my $number_probes = $probesets[$i]->[1];
  # loop through all chip files and get data for this probeset
  my @SFPdev_current_probeset_all_chips;
  foreach my $file(@chip_fh){
    my @expression_temp;
    for(my $j = 0; $j < $number_probes; $j++){
      *FILE = $file->[0];
      my $temp_line = <FILE>;
      # Skip header line in file
      if($temp_line =~ /.+?\.CEL/g){
        $temp_line = <FILE>;
      } # end if
      my $probeset_file;
      my $probe;
      my $expression;
      if($temp_line =~ /\"(.+?_at)(\d{1,2})\",(\d+?\.\d+)/g){
        $probeset_file = $1;
        if($probeset_file != $probeset){
          close(LOGFILE);
          die "Data not consistent! Probeset in list: $probeset, probeset in file $file->[1]: $probeset_file\n";
        } # end if
        $probe = $2;
        $expression = $3;
      } # end if
      else{
        die "Error reading data from file $file->[1] near line $line_count.\nData not in expected format!\n";
      } # end else
      push(@expression_temp, $expression);
    } # end for
    #
    # Calculate SFPdev for this chip for all the probes in this probeset
    my @SFPdev_current_probeset;
    for(my $q = 0; $q < $number_probes; $q++){
      my $SFPprobe_exp = $expression_temp[$q];
      my @nonSFP_expression;
      for(my $r = 0; $r < $number_probes; $r++){
        if($r == $q){
          next;
        } # end if
        else{
          push(@nonSFP_expression, $expression_temp[$r]);
        } # end else
      } # end for
      # Calculate SFPdev
      #
      #                Abs(SFP expression - Average expression other probes)
      # SFPdev value = -----------------------------------------------------
      #                              SFP expression
      # 
      my $nonSFPexp_mean = Statistics::Basic::Mean->new(\@nonSFP_expression)->query; 
      my $SFPdev = abs($SFPprobe_exp - $nonSFPexp_mean)/$SFPprobe_exp; 
      my @to_add = ($SFPdev, $file->[1]); 
      push(@SFPdev_current_probeset, \@to_add);
    } # end for
    push(@SFPdev_current_probeset_all_chips, \@SFPdev_current_probeset);
  } # end foreach
  # Now I have an array with pointers (one for each chip) to arrays
  # with pointers (one for each probe) to arrays with SFPdev values and corresponding filenames
  # for all probes of current probeset
  # Analyze SFPdevs for each probe seperate
  # Create a new array for one probe with SFPdev values from all chips
  for(my $s = 0; $s < $number_probes; $s++){
    my @SFPdev_current_probe;
    foreach my $item(@SFPdev_current_probeset_all_chips){
      my @to_add = ($item->[$s]->[0], $item->[$s]->[1]);
      push(@SFPdev_current_probe, \@to_add);
    } # end foreach
    # Sort SFPdev array
    my @SFPdev_sorted = sort {$a->[0] <=> $b->[0]} @SFPdev_current_probe; 
    # Loop through sorted SFPdev array to see if a gap with spread > user defined spread can be found
    ####################
    my $left_pos;
    my $right_pos;
    my $gap_count = 0;
    my $gap_valid = "FALSE";
    my $left_pos_set = "FALSE";
    my $genotype_left;
    my $genotype_right;
    my $SFP_parent;
    my $marker_spread;
    my $number_of_chips = scalar @SFPdev_sorted;
    my $A_freq;
    my $B_freq;
    my $nodata_freq;
    my $number_parental_chips;
    my $bad_marker = "FALSE";
    for(my $k = 0; $k < (scalar @SFPdev_sorted - 1); $k++){
      ################
      next if($SFPdev_sorted[$k]->[0] == 0);
      my $current_spread = ($SFPdev_sorted[$k+1]->[0])/($SFPdev_sorted[$k]->[0]);
      if($current_spread > $spread){
        #################
        #print "Spread greater than $spread!\n";
        $marker_spread = $current_spread;
        # Check if all chips of one parent fall on one side,
        # and all chips of the other parent on the other side
        my $all_A_left = 0;
        my $all_A_right = 0;
        my $all_B_left = 0;
        my $all_B_right = 0;
        my $number_A_chips = scalar @parental_A_chips;
        my $number_B_chips = scalar @parental_B_chips;
        ###### store number of parental chips for later use ##################
        $number_parental_chips = $number_A_chips + $number_B_chips;
        ########### check parental chips that fall on left of gap #############
        for(my $l = 0; $l < $k + 1; $l++){
          foreach my $parental_A_chip(@parental_A_chips){
             if($SFPdev_sorted[$l]->[1] eq $parental_A_chip){
                 $all_A_left++;
             } # end if
          } # end foreach
          foreach my $parental_B_chip(@parental_B_chips){
            if($SFPdev_sorted[$l]->[1] eq $parental_B_chip){
              $all_B_left++;
            } # end if
          } # end foreach
        } # end for
        # check parental chips that fall on right of gap
        for(my $m = $k + 1; $m < scalar @chipids; $m++){
          foreach my $parental_A_chip(@parental_A_chips){
            if($SFPdev_sorted[$m]->[1] eq $parental_A_chip){
              $all_A_right++;
            } # end if
          } # end foreach
          foreach my $parental_B_chip(@parental_B_chips){
            if($SFPdev_sorted[$m]->[1] eq $parental_B_chip){
              $all_B_right++;
            } # end if
          } # end foreach          
        } # end for
        #
        # Check if parental distributions allow this gap to be valid
        my $probenumber;
        if(($all_A_left == $number_A_chips) and ($all_B_right == $number_B_chips)){
          $gap_valid = "TRUE";
          $genotype_left = "A";
          $genotype_right = "B";
          $SFP_parent = "B";
          # Update gap counter
          $gap_count++;
        } # end if 
        elsif(($all_B_left == $number_B_chips) and ($all_A_right == $number_A_chips)){
          $gap_valid = "TRUE";
          $genotype_left = "B";
          $genotype_right = "A";
          $SFP_parent = "A";
          # Update gap counter
          $gap_count++;
        } # end elsif
        else{
          $gap_valid = "FALSE";
        } # end else
        #
        # Set left position of gap
        # Only do this if the gap is valid and the left position has not been set yet
        if(($gap_valid eq "TRUE") and ($left_pos_set eq "FALSE")){
          $left_pos = $k;
          $left_pos_set = "TRUE";
        } # end if
        # Set right position of gap
        # Only do this if the gap is valid 
        if($gap_valid eq "TRUE"){
          $right_pos = $k + 1;
        } # end if
      } # end if 
    } # end for
    ############ Do genotyping if a valid gap was found ##############
    if($left_pos_set eq "TRUE"){
      ############# calculations for allele frequencies #################
      if($genotype_left eq "A"){
        $A_freq = (($left_pos + 1) - ($number_parental_chips/2))/($number_of_chips - $number_parental_chips);
        $B_freq = (($number_of_chips - $right_pos) - ($number_parental_chips/2))/($number_of_chips - $number_parental_chips);
      } # end if
      else{
        $B_freq = (($left_pos + 1) - ($number_parental_chips/2))/($number_of_chips - $number_parental_chips);
        $A_freq = (($number_of_chips - $right_pos) - ($number_parental_chips/2))/($number_of_chips - $number_parental_chips);
      } # end else
      ############ calculate no data frequency ########################
      $nodata_freq = ($right_pos - $left_pos - 1)/($number_of_chips - $number_parental_chips);
      ############ Determine if marker should be discarded in post-genotyping filtering ################
      if( ($A_freq > ($distortion/100)) or ($B_freq > ($distortion/100)) or ($nodata_freq > ($unassigned/100))){
        $bad_marker = "TRUE";
      } # end if
      ############ Report SFP marker ##################
      my $probenumber = $s + 1;
      print "SFPmarker found: $probeset, $pg_hash{$probeset}-$probenumber\n";
      print LOGFILE "SFPmarker found: $probeset, $pg_hash{$probeset}-$probenumber\n";
      LOGFILE->autoflush(1);
      # Write to result file which parent has the SFP for this marker, max_SFPdev nonSFP parent,
      # min_SFPdev SFP parent, frequency A allele, frequency B allele, frequency missing data.
      print OUTPUT0 "$pg_hash{$probeset}-$probenumber\t$SFP_parent\t$SFPdev_sorted[$left_pos]->[0]\t$SFPdev_sorted[$right_pos]->[0]\t$marker_spread\t$A_freq\t$B_freq\t$nodata_freq\n";
      OUTPUT0->autoflush(1);
      # Genotype on left of gap 
      for(my $l = 0; $l < $left_pos + 1; $l++){
         # Store result in %result hash
         # Get chip code from file code for storing genotyping results
         my $chip_code = $SFPdev_sorted[$l]->[1];
         chomp($chip_code);
         $chip_code =~ s/.CEL//g;
         my $RIL = $exp_setup_hash{$chip_code}->{'RIL'};
         my $SFPmarker = $pg_hash{$probeset}."-".$probenumber;
         $result{$RIL}->{$SFPmarker}->{$chip_code} = $genotype_left;
         # Store result in post-genotyping filtered result hash if marker is not bad
         if($bad_marker eq "FALSE"){
           $result_clean{$RIL}->{$SFPmarker}->{$chip_code} = $genotype_left;
         } # end if
      } # end for 
      # Genotype on right of gap     
      for(my $m = $right_pos; $m < scalar @chipids; $m++){
         # Store result in %result hash
         # Get chip code from file code for storing genotyping results
         my $chip_code = $SFPdev_sorted[$m]->[1];
         chomp($chip_code);
         $chip_code =~ s/.CEL//g;
         my $RIL = $exp_setup_hash{$chip_code}->{'RIL'}; 
         my $SFPmarker = $pg_hash{$probeset}."-".$probenumber;
         $result{$RIL}->{$SFPmarker}->{$chip_code} = $genotype_right;
         # Store result in post-genotyping filtered result hash if marker is not bad
         if($bad_marker eq "FALSE"){
           $result_clean{$RIL}->{$SFPmarker}->{$chip_code} = $genotype_right;
         } # end if
      } # end for
      # Genotype within gap as '-'
      if($gap_count > 1){
        for(my $n = $left_pos + 1; $n < $right_pos; $n++){
           # Store result in %result hash
           # Get chip code from file code for storing genotyping results
           my $chip_code = $SFPdev_sorted[$n]->[1];
           chomp($chip_code);
           $chip_code =~ s/.CEL//g;
           my $RIL = $exp_setup_hash{$chip_code}->{'RIL'}; 
           my $SFPmarker = $pg_hash{$probeset}."-".$probenumber;
           $result{$RIL}->{$SFPmarker}->{$chip_code} = "-";
           # Store result in post-genotyping filtered result hash if marker is not bad
           if($bad_marker eq "FALSE"){
             $result_clean{$RIL}->{$SFPmarker}->{$chip_code} = "-";
           } # end if
        } # end for
      } # end if 
    } # end if   
  } # end for 
  # update line counter
  $line_count = $line_count + $number_probes;
  # update probeset counter
  $probeset_counter++; 
  print "probeset $probeset_counter of $number_of_probesets.\n";
} # end foreach
# Close output file
close(OUTPUT0);

 



################################################################################################
# Write this result to a file in format:
# SFPmarker     RIL    chip    rep    treatment   genotype
my $output_1 = $output."_SFPscanV10_genotypes_all_chips.txt";
open(OUTPUT1, ">$output_1");
# Print header
print OUTPUT1 "SFPmarker\tRIL\tchip\trep\tgenotype\n"; 
# Print values
# Remember format of result hash:
# %result -> RIL1 -> SFPmarker1 -> chip1 = genotype A or B
#                               -> chip2 = genotype A or B
#                               -> chip3 = genotype A or B
#                               -> chip4 = genotype A or B
#                 -> SFPmarker2 -> chip1 = genotype A or B
#                                   etc
#                       etc
#             etc
while(my ($RIL, $SFPmarker) = each (%result)){ 
  while(my ($SFPmarker, $chip) = each %$SFPmarker){ 
    while(my ($chip, $genotype) = each %$chip){ 
      print OUTPUT1 "$SFPmarker\t";
      print OUTPUT1 "$RIL\t";  
      print OUTPUT1 "$chip\t";
      print OUTPUT1 "$exp_setup_hash{$chip}->{'rep'}\t";
      print OUTPUT1 "$genotype\n";
    } # end while
  } # end while
} # end while
close(OUTPUT1);
print "Output file '$output_1' written...\n";
print LOGFILE "Output file '$output_1' written...\n";





################################################################################################
# Do genotyping for all RILs with all detected SFP markers
my %SFPmarker_calls;
genotype(\%result, $genotype_A, $genotype_B, $number_reps, \%SFPmarker_calls, \*LOGFILE, $use_cd);
# Do genotyping for all RILs with post-genotyping filtered SFP markers
my %SFPmarker_calls_clean;
genotype(\%result_clean, $genotype_A, $genotype_B, $number_reps, \%SFPmarker_calls_clean, \*LOGFILE, $use_cd);






################################################################################################
# Write genotyping data for RILs done with all markers to files.
# Open files
my $output_2_jm = $output."_SFPscanV10_genotypes_jm.txt";
open(OUTPUT2JM, ">$output_2_jm") or die "Can't open '$output_2_jm'!\n";
my $output_2_ex = $output."_SFPscanV10_genotypes_ex.txt";
open(OUTPUT2EX, ">$output_2_ex") or die "Can't open '$output_2_ex'!\n";
write_genotypes(\%SFPmarker_calls, \*OUTPUT2JM, \*OUTPUT2EX, $output_2_jm, $output_2_ex, \*LOGFILE);
# close files
close(OUTPUT2JM);
close(OUTPUT2EX);





################################################################################################
# Write genotyping data for RILs done with post-genotyping filtered markers to files.
# Open files
my $output_2_jm_clean = $output."_SFPscanV10_genotypes_clean_jm.txt";
open(OUTPUT2JMCLEAN, ">$output_2_jm_clean") or die "Can't open '$output_2_jm_clean'!\n";
my $output_2_ex_clean = $output."_SFPscanV10_genotypes_clean_ex.txt";
open(OUTPUT2EXCLEAN, ">$output_2_ex_clean") or die "Can't open '$output_2_ex_clean'!\n";
write_genotypes(\%SFPmarker_calls_clean, \*OUTPUT2JMCLEAN, \*OUTPUT2EXCLEAN, $output_2_jm_clean, $output_2_ex_clean, \*LOGFILE);
# close files
close(OUTPUT2JMCLEAN);
close(OUTPUT2EXCLEAN);



################################################################################################
# Say bye and close log file
print LOGFILE "Done. Enjoy!\n";
close(LOGFILE);

# Tell user to check logfile
print "Check log: $logfile.\n";
       
die "Done. Enjoy!\n";











#######################################################################################
############################ Sub routine print_usage ##################################
#######################################################################################
sub print_usage
{
  print "\nUsage:\n\n";
  print "\t\>SFPscanV10.pl [CONVERSION_FILE] [SETUP_FILE] [CHIP_LIST_FILE]\n\t\t[PARENT_A_CHIP_LIST_FILE] [PARENT_B_CHIP_LIST_FILE] [GAP_SIZE]\n\t\t[ALLELE_DISTORTION] [UNASSIGNED_SCORES] [OUTPUT] [USE_CD]\n";              
  print "\n\tALL arguments must be entered.\n";
  print "\tDefault values you might want to use:\n";
  print "\t[GAP_SIZE] 2\n\t[ALLELE_DISTORTION] 80\n\t[UNASSIGNED_SCORES] 10.\n";
  print "\n\t[USE_CD] must be YES or NO.\n";
  exit(1);
} # end sub routine print_usage




#######################################################################################
############################ Sub routine print_warning ################################
#######################################################################################
sub print_warning
{
  my $text = $_[0];
  print "\n$text\n";
  my $answer = "";
  while($answer ne "Y" and $answer ne "N" and $answer ne "n" and $answer ne "y"){
    print "\tContinue? (Y/N)";
    $answer = <STDIN>;
    chomp($answer); 
  } # end while
  if($answer eq "Y" or $answer eq "y"){
    return;
  } # end if
  else{
    exit(1);
  } # end else             
} # end sub routine print_warning








#######################################################################################
############################ Sub routine genotype #####################################
#######################################################################################
sub genotype
{
  ######################################################################
  # Now evaluate the genotyping for the specified number of reps of each RIL
  # Create new hash with new genotyping codes, one for each RIL
  # A = SHA, B = BAY, C = NOT A, D = NOT B, - = A+B, - = no data
  # If all rep chips are genotyped as one accession: use A or B
  # If half of the rep chips is genotyped for one accession,
  #    and the other half of the rep chips for the other accession: use -
  # If half or more than half of the rep chips is genotyped as one accession,
  #    and the other rep chips as no data: use C or D
  #  NOTE: Use the C and D  scores only if the user wants to use it !!!
  # Else, use: - 
  #
  # Arguments to receive from sub routine call:
  #   1. reference to result hash with format:
  #        %result -> RIL1 -> SFPmarker1 -> chip1 = genotype A or B
  #                                      -> chip2 = genotype A or B
  #                                      -> chip3 = genotype A or B
  #                                      -> chip4 = genotype A or B
  #                        -> SFPmarker2 -> chip1 = genotype A or B
  #                                         etc
  #                           etc
  #                   etc
  #
  #  2. genotype codes ($genotype_A and $genotype_B)
  #
  #  3. number of reps ($number_reps)
  #  
  #  4. reference to hash to store result
  #
  #  5. reference to logfile FILE handle
  #
  #  6. use_cd option: yes or no.
  #
  #
  # Return variable:
  #     hash %SFPmarker_calls with format:
  #        SFPmarker_calls -> SFPmarker -> $RIL = $genotype;
  #
  #
  # Retrieve arguments
  my $inputresult_ref = $_[0];
  my $genotype_A = $_[1];
  my $genotype_B = $_[2];
  my $number_reps = $_[3];
  my $outputresult_ref = $_[4];
  my $log_ref = $_[5];
  my $use_cd = $_[6];
  #
  # Loop through RILs and store calls
  while(my ($RIL, $SFPmarker) = each (%{$inputresult_ref})){ 
    # Loop through SFP markers
    while(my ($SFPmarker, $chip) = each (%$SFPmarker)){ 
      my $genotype_to_add; 
      # use a hash as counter, key is genotype from previous step
      my %genotype;
      $genotype{$genotype_A} = 0;
      $genotype{$genotype_B} = 0;
      $genotype{'-'} = 0;
      my $chipcount = 0;
      # Loop through chips
      while(my ($chip, $call) = each %$chip){ 
        $genotype{$call}++;
        $chipcount++;
      } # end while
      if($chipcount != $number_reps){
        $genotype_to_add = '-';
        print "RIL $RIL only has $chipcount chips! Check input files! Genotyped as '-' (no data).\n"; 
        print {$log_ref} "RIL $RIL only has $chipcount chips! Check input files! Genotyped as '-' (no data).\n"; 
        next;
      } # end if
      elsif($genotype{$genotype_A} == $number_reps){
        $genotype_to_add = 'A';
      } # end elsif
      elsif($genotype{$genotype_B} == $number_reps){
        $genotype_to_add = 'B';
      } # end elsif
      elsif(($genotype{$genotype_A} == ($number_reps/2)) and ($genotype{$genotype_B} == ($number_reps/2))){
        $genotype_to_add = '-';
      } # end elsif
      elsif(($genotype{$genotype_A} >= ($number_reps/2)) and ($genotype{$genotype_B} == 0)){
        if($use_cd eq "YES"){
          $genotype_to_add = 'D';
        } # end if
        else{
          $genotype_to_add = '-';
        } # end else
      } # end elsif
      elsif(($genotype{$genotype_B} >= ($number_reps/2)) and ($genotype{$genotype_A} == 0)){
        if($use_cd eq "YES"){
          $genotype_to_add = 'C';
        } # end if
        else{
          $genotype_to_add = '-';
        } # end else
      } # end elsif
      else{
        $genotype_to_add = '-';
      } # end elsif
      $outputresult_ref->{$SFPmarker}->{$RIL} = $genotype_to_add;
    } # end while
    print "RIL '$RIL' genotyped...\n";
    print {$log_ref}  "RIL '$RIL' genotyped...\n";
  } # end while
  print "All RILs genotyped...\n";
  print {$log_ref} "All RILs genotyped...\n";
} # end sub routine genotype




#######################################################################################
############################ Sub routine write_genotypes ##############################
#######################################################################################
sub write_genotypes
{
  #######################################################################
  # Write result of genotyping to files in two different formats:
  # 1) for JoinMap
  #    marker14
  #    DHBHHADAADBADBHHDBHHBBDHHADDBAHDAHHBHAABABDAHHBBHH
  #    marker30
  #    AHBAHBBBHABBAHAABHBBHHHBBHHHHBAHABHHHHAHHHAHBBAHA-H
  #    marker35
  #    BHHDHAABHDHHBHHABABBAAAABHBHHDHHBBDCHDHDHHBAHBA-DA
  #    etc
  #
  # 2) For visualization in Excel
  #    marker14   D  H  B  H  H  A  D  A  A  D  B  etc 
  #    etc
  #    This should be tabulated.
  # 
  # Arguments to receive:
  #   1. reference to SFPmarker_calls hash
  #        Remember format SFPmarker_calls hash:
  #           SFPmarker_calls -> SFPmarker -> $RIL = $genotype;
  #   
  #   2. reference to output file handle for joinmap format
  #
  #   3. reference to output file handle for Excel format
  #
  #   4. name file joinmap format
  #
  #   5. name file excel format
  # 
  #   6. reference to logfile
  #
  #
  # Retrieve arguments
  my $hash_ref = $_[0];
  my $file_jm_ref = $_[1];
  my $file_ex_ref = $_[2];
  my $jm_file_name = $_[3];
  my $ex_file_name = $_[4];
  my $log_ref = $_[5];
  # Print header
  print {$file_jm_ref} "\;RIL genotyping with SFP markers\n\;$number_reps chips per RIL\n"; 
  print {$file_ex_ref} "\;RIL genotyping with SFP markers\n\;$number_reps chips per RIL\n"; 
  # Loop through markers
  # Arrays for excel output
  my @markers;
  my @calls;
  my @RILs_list;
  while(my ($SFPmarker, $RIL) = each (%{$hash_ref})){
    # Marker calls for all RILs
    my $genotypes_string = "";
    # Print marker name
    print {$file_jm_ref} "$SFPmarker\n";
    push(@markers, $SFPmarker);
    # Get keys containing RILs
    my @RILs = keys %$RIL;
    # Sort  RILs
    my @RILs_sorted = sort { $a <=> $b } @RILs;
    @RILs_list = @RILs_sorted;
    # Get genotype call for all RILs
    my @current_calls;
    foreach my $item(@RILs_sorted){
      $genotypes_string = $genotypes_string.$hash_ref->{$SFPmarker}->{$item};
      push(@current_calls, $hash_ref->{$SFPmarker}->{$item});
    } # end foreach
    # Print string
    print {$file_jm_ref} "$genotypes_string\n";
    push(@calls, \@current_calls);
  } # end while
  # Output header line with RIL numbers to excel format file
  print {$file_ex_ref} "\t";
  foreach my $item(@RILs_list){
    print {$file_ex_ref} "$item\t";
  } # end foreach
  print {$file_ex_ref} "\n";
  # Print info for each marker
  for(my $i = 0; $i < scalar @markers; $i++){
    print {$file_ex_ref} "$markers[$i]\t";
    my $current_calls = @calls[$i];
    foreach my $call(@$current_calls){
      print {$file_ex_ref} "$call\t";
    } # end foreach
    print {$file_ex_ref} "\n";
  } # end for
  print "Output file '$jm_file_name' written...\n";
  print {$log_ref} "Output file '$jm_file_name' written...\n";
  print "Output file '$ex_file_name' written...\n";
  print {$log_ref} "Output file '$ex_file_name' written...\n";
} # end sub routine write_genotypes