#!/usr/local/bin/perl use Statistics::Basic::Mean; use Statistics::Basic::StdDev; use IO::File; use strict; ################################################################################################ ################################################################################################ # SFPscanV10.pl # # Author: Hans van Leeuwen # Dept. of Plant Sciences # University of California, Davis # Mail Stop 3 # One Shields Ave # Davis, CA 95616 # E-mail: hvanleeuwen@ucdavis.edu # Web: http://elp.ucdavis.edu/data/analysis/sfp_map/SFPscan.html # # # Date: September 2005 # ################################################################################################ ################################################################################################ ################################################################################################ ################################################################################################ ################################# Program usage ################################################ # # >SFPscanV10.pl [CONVERSION_FILE] [SETUP_FILE] [CHIP_LIST_FILE] [PARENT_A_CHIP_LIST_FILE] # [PARENT_B_CHIP_LIST_FILE] [GAP_SIZE] [ALLELE_DISTORTION] [UNASSIGNED_SCORES] [OUTPUT] [USE_CD] # # (all input files should be in the same directory as the SFPscanV10.pl file) # (all arguments must be entered! even if you use default values; see below) # # # [CONVERSION_FILE] # The conversion file with the probeset and gene code data, having the format: # Probe Set ID Sequence Derived From # 245016_at accD # 261585_at At1g01010 # 261568_at At1g01030 # 261584_at At1g01040 # ...etc # # This file is provided with this script: "ATH1_conv-probeset-genecode.txt" # Note: Positions in the Col-0 genome of the probeset sequences from the Affymetrix ATH1 GeneChip # were obtained from the Arabidopsis annotation Version 4, TIGR release May 2003. # # # # [SETUP_FILE] # A file with the experimental setup, first column is RIL code, next columns, rep chip codes. # First line should be a header line starting with ; (no space between ';' and 'RILcode') # For example: # ;RILcode SW1 SW2 # 4 R5 R620 # 5 R85 R687 # 8 R221 R495b # 13 R30 R666 # # # # # [CHIP_LIST_FILE] # A file with a list of chip files that will be used in this study. # this file should contain both the RIL chip file names and the parental chip file names. # File should have following format: # R5.CEL # R85.CEL # R221.CEL # R30.CEL # R82.CEL # R270.CEL # R76.CEL # # etc. # # # # [PARENT_A_CHIP_LIST_FILE] # [PARENT_B_CHIP_LIST_FILE] # Two files with the chip file names of the parental chips. # File should have following format: # R5.CEL # R85.CEL # R221.CEL # R30.CEL # # etc. # # # # # [GAP_SIZE] # The minimum distance between the two peaks in the bimodal distribution. # This value is calculated as follows: min_SFPdev SFP parent / max_SFPdev nonSFP parent. # The default value is 2.0 # A value greater than 1.0 means that the distributions are seperated. # This value should be chosen carefully; a too low value may increase the number of markers, # but can also increase the number of unassigned alleles; a too high value may give higher # higher quality markers, but the number of markers obtained decreases. # # # # [ALLELE_DISTORTION] # In a RIL population a allele segregation of 50%-50% is expected, # but this can vary in certain populations # The default value is 80 # This means that allele segregations from 80%-20% to 20%-80% are allowed. # # # # [UNASSIGNED_SCORES] # This sets a limit to the number of unassigned allele scores a marker may have. # The default value is 10 # This means that a marker is discarded if more than 10% of the allele calls are unassigned. # # # # [OUTPUT] # This is a short text (please use numbers, letters and underscores only) # that will be used in the output file names. # # # # [USE_CD] # You have the option to use 'C' and 'D' scores in genotyping (C = NOT A, D = NOT B). # If you select this option the following genotyping rules will be used: # If half or more than half of the rep chips is genotyped as one parental accession, # and the other rep chips as unassigned: use C or D. # Type 'YES' to select this option. # If you do not select this option the above mentioned chips will be scored as '-'. # Type 'NO' if you do not want the 'C' and 'D' scoring to be used. # # # # ################################################################################################ ################################################################################################ ################################################################################################ ################################################################################################ ################################### Genotyping rules ########################################### # # # The following rules are used when determining a genotype for a RIL, # based on the genotypes of the individual reps: # A = parent A, B = parent B, C = NOT A, D = NOT B, - = unassigned # If all rep chips are genotyped as one accession: use A or B # If half of the rep chips is genotyped for one accession, # and the other half of the rep chips for the other accession: use - # If half or more than half of the rep chips is genotyped as one accession, # and the other rep chips as unassigned: use C or D # Else, use: - # # ################################################################################################ ################################################################################################ ################################################################################################ ################################################################################################ ################################# Preparation .CEL files ####################################### # # The .CEL files need to be background corrected seperately under the R programming environment, # with the Bioconductor package installed. Use the following R-script (copy and paste # this code (all lines between START SCRIPT and END SCRIPT in a text file # and then load it into the R-programming environment). # Make sure to remove the first '#' (only the first!!) in all code lines # between the START and END lines. # # ################################# START R-SCRIPT ########################### # ########################################################################## # # # # Author: Hans van Leeuwen # # Department of Plant Sciences # # University of California, Davis # # Mail Stop 3 # # One Shields Ave # # Davis, CA 95616 # # # # Email: hvanleeuwen@ucdavis.edu # # # # Date: JUNE 2005 # # # ########################################################################## # # # ########################################################################## # ######################### IMPORTANT ###################################### # ########################################################################## # ########### THE LINES MARKED WITH 'CHANGE' MUST ######################### # ########### BE MODIFIED USING YOUR PATH ######################### # ########################################################################## # ######################### IMPORTANT ###################################### # ########################################################################## # # ########################################################################## # # Set work directory to the directory where your original .CEL files are located # setwd("C:/rw1081/hans_temp") ######### CHANGE ######## # # # ########################################################################## # # Read in file with chip ID's for which the CEL files will be read later # # This file should have format: # # R5.CEL # # R85.CEL # # R221.CEL # # R277.CEL # # R30.CEL # # R82.CEL # # R270.CEL # # # # etc... # # # ##### This file should have the filename 'chips.txt' ##################### # chips <- read.table("chips.txt") # # # ########################################################################## # # Load affy package # library(affy) # # ########################################################################## # # Open cel files one by one # for (i in 1:length(chips$V1)){ # # Set work directory to the directory where your .CEL files are located # setwd("C:/rw1081/hans_temp") ######### CHANGE ######## # data <- ReadAffy(filenames = as.character(chips$V1[i])) # data.bg <- bg.correct.rma(data) # data.bg.matrix <- pm(data.bg) # # Set work directory to the directory where the background corrected files # # will be saved. This can not be the same work directory that contains the original .CEL files!! # setwd("C:/rw1081/hans_temp/bg_files") ######### CHANGE ######## # write.table(data.bg.matrix, file = as.character(chips$V1[i]), append = FALSE, quote = TRUE, sep = ",", eol = "\n", na = "NA", dec = ".", row.names = TRUE, col.names = TRUE, qmethod = c("escape", "double")) # } # end for ################################# END R-SCRIPT ############################# # # # The obtained files should have a format similar to: #"471.CEL" #"244901_at1",363.582352941176 #"244901_at2",124.111764705882 # ...etc # # ################################################################################################ ################################################################################################ ################################################################################################ ################################################################################################ ################################# Output files ################################################ # # The following output files will be created: # (the asterisk indicates the text string from the OUTPUT argument provided by the user) # # 1. *_SFPscanV10_log.txt # # Logfile, contains possible error messages. # # # # 2. *_SFPscanV10_marker_data.txt # # column 1: SFP marker ID # column 2: Parental line with the SFP (A/B) # column 3: max_SFPdev nonSFP parent # This is the right boundary of the distribution of the # chips that have the genotype of the parent that does # not contain the SFP. # column 4: min_SFPdev SFP parent # This is the left boundary of the distribution of the # chips that have the genotype of the parent that # contains the SFP. # column 5: spread # This is the result of: min_SFPdev SFP parent/max_SFPdev nonSFP parent # The user sets the minimum allowed spread. # A spread greater than 1 means that the distributions are seperated. # column 6: A frequency # This is the frequency of the 'A' allele # column 7: B frequency # This is the frequency of the '-' allele # column 8: missing data frequency # This is the frequency of unassigned allele calls. # # NOTE: The data in columns 6, 7, 8 is based on data from all chips, # before a genotype is assigned to a RIL based on the individual # genotypes of the reps for that RIL. In other words, these frequencies # can be slightly different in the final marker data, because then # the data for the RILs (and not the individual reps) is used. # # # # # 3. *_SFPscanV10_genotypes_all_chips.txt # # Contains genotypes for all chips for all markers in following format: # # SFPmarker RIL chip rep genotype # At5g52440-9 Bay-0_2 R473b 1 B # At5g52440-9 Bay-0_2 R628 2 B # At1g64900-6 Bay-0_2 R473b 1 B # At1g64900-6 Bay-0_2 R628 2 B # At3g25610-10 30 R493 2 B # At3g25610-10 30 R260 1 B # At1g54030-7 30 R260 1 B # At1g54030-7 30 R493 2 B # # # # # 4. *_SFPscanV10_genotypes_jm.txt # # Contains genotypes for all RILs for all markers, in JoinMap format: # (RILs are sorted in numerical order; the actual RIL codes # can be seen in the Excel format file (see below)) # # marker14 # DABAAADAADBADBAADBAABBDAAADDBAADAAABAAABABDAAABBAA # marker30 # AABAABBBAABBAAAABABBAAABBAAAABAAABAAAAAAAAAABBAAA-A # marker35 # BAADAAABADAABAAABABBAAAABABAADAABBDCADADAABAABA-DA # etc. # # # # # 5. *_SFPscanV10_genotypes_ex.txt # # Contains genotypes for all RILs for all markers, in Excel format (tabulated): # (RILs are sorted in numerical order on first row) # # 4 5 8 13 14 16 # marker14 D A B A A A # marker30 A A B A A B # et. # # # # # 6. *_SFPscanV10_genotypes_clean_jm.txt # # Contains genotypes for all RILs for markers that passed the post-genotyping # filtering process. Data is in JoinMap format. # # # # # # 7. *_SFPscanV10_genotypes_clean_ex.txt # # Contains genotypes for all RILs for markers that passed the post-genotyping # filtering process. Data is in Excel format (tabulated). # # # ################################################################################################ ################################################################################################ ################################################################################################ # # Read in command line arguments, check for correct usage # # Remember usage: # >SFPscanV10.pl [CONVERSION_FILE] [SETUP_FILE] [CHIP_LIST_FILE] [PARENT_A_CHIP_LIST_FILE] # [PARENT_B_CHIP_LIST_FILE] [GAP_SIZE] [ALLELE_DISTORTION] [UNASSIGNED_SCORES] [OUTPUT] [USE_CD] # if(($#ARGV + 1) != 10){ print_usage(); } # end if my $probeset_gene_file = $ARGV[0]; my $expfile = $ARGV[1]; my $chipidfile = $ARGV[2]; my $parental_A_chips_file = $ARGV[3]; my $parental_B_chips_file = $ARGV[4]; my $spread = $ARGV[5]; my $distortion = $ARGV[6]; my $unassigned = $ARGV[7]; my $output = $ARGV[8]; my $use_cd = $ARGV[9]; chomp($use_cd); # Check some values if($use_cd ne "YES" and $use_cd ne "NO"){ print_usage(); } # end if if($spread < 1){ print_warning("Are you sure you want to use a GAP_SIZE smaller than 1.0?\nThis will give overlapping distributions!\n"); } # end if if($distortion > 80){ print_warning("Are you sure you want to allow an ALLELE_DISTORTION greater than 80%?\n"); } # end if if($unassigned > 10){ print_warning("Are you sure you want to allow more than 10% UNASSIGNED_SCORES?\n"); } # end if ################################################################################################ # Define genotypes my $genotype_A = "A"; my $genotype_B = "B"; ################################################################################################ # Open logfile my $logfile = $output."_SFPscanV10_log.txt"; open(LOGFILE, ">$logfile") or die "Can't open '$logfile'!\n"; print "Program parameters:\n\tgap size: $spread\n\tallele distortion: $distortion\n\tunassigned scores: $unassigned\n\n"; print LOGFILE "Program parameters:\n\tgap size: $spread\n\tallele distortion: $distortion\n\tunassigned scores: $unassigned\n\n"; ################################################################################################ # Open conversion file with probeset and gene id open(PGFILE, "$probeset_gene_file") or die "Can't open '$probeset_gene_file'!\n"; my @pg_data = ; # close input file close(PGFILE); # remove header line shift(@pg_data); # Store in hash for easy lookup later my %pg_hash; # File has format: probeset gene_id # Store in hash with probeset as key, and gene_id as value # $1 is probe set, $2 is gene_id foreach my $line(@pg_data){ my @split_line = split(/\t/, $line); chomp($split_line[1]); $pg_hash{$split_line[0]} = $split_line[1]; } # end foreach # Let user know print "probeset-gene_id file read: $probeset_gene_file\n"; print LOGFILE "probeset-gene_id file read: $probeset_gene_file\n"; ################################################################################################ # Open inputfile with RIL experiment setup. open(EXPFILE, "$expfile") or die "Can't open '$expfile'!\n"; my @exp_setup = ; # close input file close(EXPFILE); # Store in hash for easy lookup later my %exp_setup_hash; # Get IDs for reps my @rep_IDs = split(/\t/, shift(@exp_setup)); # Check if file header line starting with ';' my $check_header = shift(@rep_IDs); if(substr($check_header, 0, 1) ne ";"){ print "RIL experiment setup file $expfile does not have the correct format!\n"; print LOGFILE "RIL experiment setup file $expfile does not have the correct format!\n"; exit(1); } # end if # remove newline chars foreach(@rep_IDs){ chomp; } # end foreach # Determine number of reps my $number_reps = scalar @rep_IDs; # Loop through file content and store in hash foreach my $line(@exp_setup){ # remove newline char chomp($line); my @current_line_array = split(/\t/, $line); # get RIL code my $current_RIL_code = shift(@current_line_array); # get rep codes for(my $i = 0; $i < $number_reps; $i++){ $exp_setup_hash{$current_line_array[$i]}->{'rep'} = $i + 1; $exp_setup_hash{$current_line_array[$i]}->{'treatment'} = $rep_IDs[$i]; $exp_setup_hash{$current_line_array[$i]}->{'RIL'} = $current_RIL_code; } # end for } # end foreach # Let user know this is done print "RIL experiment setup file read: $expfile\n"; print LOGFILE "RIL experiment setup file read: $expfile\n"; print "Number of reps determined: $number_reps.\n"; print LOGFILE "Number of reps determined: $number_reps.\n"; ################################################################################################ # Open file with parental chip IDs open(PARENTA, "$parental_A_chips_file") or die "Can't open '$parental_A_chips_file'!\n"; my @parental_A_chips = ; # close file close(PARENTA); # remove end of line chars foreach my $line(@parental_A_chips){ chomp($line); } # end foreach open(PARENTB, "$parental_B_chips_file") or die "Can't open '$parental_B_chips_file'!\n"; my @parental_B_chips = ; # close file close(PARENTB); # remove end of line chars foreach my $line(@parental_B_chips){ chomp($line); } # end foreach ################################################################################################ # Open file with chip IDs open(CHIPIDFILE, "$chipidfile") or die "Can't open '$chipidfile'!\n"; my @chipids = ; # close file close(CHIPIDFILE); # remove end of line chars foreach my $line(@chipids){ chomp($line); } # end foreach # Let user know print "ChipID file read: $chipidfile\n"; print LOGFILE "ChipID file read: $chipidfile\n"; ################################################################################################ # Open one chip file to get list of probeset codes. open(FILE, "$chipids[0]") or die "Can't open '$chipids[0]'!\n"; my @data = ; # close file close(FILE); # remove header line shift(@data); # Get list of probeset names and store in array my @temp; my $previous_probeset; my $current_probeset; my $probe_number = 0; my $first_line_done = "FALSE"; foreach my $line(@data){ if($line =~ /\"(.+?_at)/g){ $current_probeset = $1; # First line case if($first_line_done eq "FALSE"){ $previous_probeset = $current_probeset; $first_line_done = "TRUE"; } # end if # Evaluate if($current_probeset eq $previous_probeset){ $probe_number++; } else{ my @to_add = ($previous_probeset, $probe_number); push(@temp, \@to_add); $probe_number = 1; } # end else } # end if $previous_probeset = $current_probeset; } # end foreach # Sort list my @probesets = sort {lc($a->[0]) cmp lc($b->[0])} @temp; ################################################################################################ # Open all chip files # Create array with filehandles for all files my @chip_fh; # Create filehandles foreach my $file(@chipids){ my $fh = IO::File->new("< $file") or die "Error opening file: $!"; my @temp = ($fh, $file); push(@chip_fh, \@temp); } # end foreach ################################################################################################ # Open output file to store for each marker: which parent has the SFP, # max_SFPdev nonSFP parent, min_SFPdev SFP parent, spread, # frequency A allele, frequency B allele, frequency missing data. my $output_0 = $output."_SFPscanV10_marker_data.txt"; open(OUTPUT0, ">$output_0"); # Print header print OUTPUT0 "SFPmarker\tParental line with SFP\tmax_SFPdev nonSFP parent\tmin_SFPdev SFP parent\tSpread\tA frequency\tB frequency\tmissing data frequency\n"; ################################################################################################ # Loop through probeset. # For each probeset, open all BG corrected RIL files and get expression values for # all probes belonging to that probeset. # Then calculate SFPdev for all probes in the probesets, for all chips. # Store SFPdev values in an array, sort the array, the loop through # the array and test if there are gaps with a spread > 2. # If such a gap is found, check if all chips of one parent fall on the left of the gap, # and all chips of the other parent on the right of the gap. # Thus, if a gap with spread > 2 is found AND the distribution of the parental chips is okay, # then this probe is an SFP marker. # Genotype all the chips falling to the left of the gap as the corresponding parent, # and all the chips falling to the right of the gap as the other parent. # # Result of all detected SFP markers my %result; # Result of all post-genotyping filtered SFP markers my %result_clean; my $line_count = 0; my $number_of_probesets = scalar (@probesets); my $probeset_counter = 0; for (my $i = 0; $i < scalar (@probesets); $i++){ my $probeset = $probesets[$i]->[0]; my $number_probes = $probesets[$i]->[1]; # loop through all chip files and get data for this probeset my @SFPdev_current_probeset_all_chips; foreach my $file(@chip_fh){ my @expression_temp; for(my $j = 0; $j < $number_probes; $j++){ *FILE = $file->[0]; my $temp_line = ; # Skip header line in file if($temp_line =~ /.+?\.CEL/g){ $temp_line = ; } # end if my $probeset_file; my $probe; my $expression; if($temp_line =~ /\"(.+?_at)(\d{1,2})\",(\d+?\.\d+)/g){ $probeset_file = $1; if($probeset_file != $probeset){ close(LOGFILE); die "Data not consistent! Probeset in list: $probeset, probeset in file $file->[1]: $probeset_file\n"; } # end if $probe = $2; $expression = $3; } # end if else{ die "Error reading data from file $file->[1] near line $line_count.\nData not in expected format!\n"; } # end else push(@expression_temp, $expression); } # end for # # Calculate SFPdev for this chip for all the probes in this probeset my @SFPdev_current_probeset; for(my $q = 0; $q < $number_probes; $q++){ my $SFPprobe_exp = $expression_temp[$q]; my @nonSFP_expression; for(my $r = 0; $r < $number_probes; $r++){ if($r == $q){ next; } # end if else{ push(@nonSFP_expression, $expression_temp[$r]); } # end else } # end for # Calculate SFPdev # # Abs(SFP expression - Average expression other probes) # SFPdev value = ----------------------------------------------------- # SFP expression # my $nonSFPexp_mean = Statistics::Basic::Mean->new(\@nonSFP_expression)->query; my $SFPdev = abs($SFPprobe_exp - $nonSFPexp_mean)/$SFPprobe_exp; my @to_add = ($SFPdev, $file->[1]); push(@SFPdev_current_probeset, \@to_add); } # end for push(@SFPdev_current_probeset_all_chips, \@SFPdev_current_probeset); } # end foreach # Now I have an array with pointers (one for each chip) to arrays # with pointers (one for each probe) to arrays with SFPdev values and corresponding filenames # for all probes of current probeset # Analyze SFPdevs for each probe seperate # Create a new array for one probe with SFPdev values from all chips for(my $s = 0; $s < $number_probes; $s++){ my @SFPdev_current_probe; foreach my $item(@SFPdev_current_probeset_all_chips){ my @to_add = ($item->[$s]->[0], $item->[$s]->[1]); push(@SFPdev_current_probe, \@to_add); } # end foreach # Sort SFPdev array my @SFPdev_sorted = sort {$a->[0] <=> $b->[0]} @SFPdev_current_probe; # Loop through sorted SFPdev array to see if a gap with spread > user defined spread can be found #################### my $left_pos; my $right_pos; my $gap_count = 0; my $gap_valid = "FALSE"; my $left_pos_set = "FALSE"; my $genotype_left; my $genotype_right; my $SFP_parent; my $marker_spread; my $number_of_chips = scalar @SFPdev_sorted; my $A_freq; my $B_freq; my $nodata_freq; my $number_parental_chips; my $bad_marker = "FALSE"; for(my $k = 0; $k < (scalar @SFPdev_sorted - 1); $k++){ ################ next if($SFPdev_sorted[$k]->[0] == 0); my $current_spread = ($SFPdev_sorted[$k+1]->[0])/($SFPdev_sorted[$k]->[0]); if($current_spread > $spread){ ################# #print "Spread greater than $spread!\n"; $marker_spread = $current_spread; # Check if all chips of one parent fall on one side, # and all chips of the other parent on the other side my $all_A_left = 0; my $all_A_right = 0; my $all_B_left = 0; my $all_B_right = 0; my $number_A_chips = scalar @parental_A_chips; my $number_B_chips = scalar @parental_B_chips; ###### store number of parental chips for later use ################## $number_parental_chips = $number_A_chips + $number_B_chips; ########### check parental chips that fall on left of gap ############# for(my $l = 0; $l < $k + 1; $l++){ foreach my $parental_A_chip(@parental_A_chips){ if($SFPdev_sorted[$l]->[1] eq $parental_A_chip){ $all_A_left++; } # end if } # end foreach foreach my $parental_B_chip(@parental_B_chips){ if($SFPdev_sorted[$l]->[1] eq $parental_B_chip){ $all_B_left++; } # end if } # end foreach } # end for # check parental chips that fall on right of gap for(my $m = $k + 1; $m < scalar @chipids; $m++){ foreach my $parental_A_chip(@parental_A_chips){ if($SFPdev_sorted[$m]->[1] eq $parental_A_chip){ $all_A_right++; } # end if } # end foreach foreach my $parental_B_chip(@parental_B_chips){ if($SFPdev_sorted[$m]->[1] eq $parental_B_chip){ $all_B_right++; } # end if } # end foreach } # end for # # Check if parental distributions allow this gap to be valid my $probenumber; if(($all_A_left == $number_A_chips) and ($all_B_right == $number_B_chips)){ $gap_valid = "TRUE"; $genotype_left = "A"; $genotype_right = "B"; $SFP_parent = "B"; # Update gap counter $gap_count++; } # end if elsif(($all_B_left == $number_B_chips) and ($all_A_right == $number_A_chips)){ $gap_valid = "TRUE"; $genotype_left = "B"; $genotype_right = "A"; $SFP_parent = "A"; # Update gap counter $gap_count++; } # end elsif else{ $gap_valid = "FALSE"; } # end else # # Set left position of gap # Only do this if the gap is valid and the left position has not been set yet if(($gap_valid eq "TRUE") and ($left_pos_set eq "FALSE")){ $left_pos = $k; $left_pos_set = "TRUE"; } # end if # Set right position of gap # Only do this if the gap is valid if($gap_valid eq "TRUE"){ $right_pos = $k + 1; } # end if } # end if } # end for ############ Do genotyping if a valid gap was found ############## if($left_pos_set eq "TRUE"){ ############# calculations for allele frequencies ################# if($genotype_left eq "A"){ $A_freq = (($left_pos + 1) - ($number_parental_chips/2))/($number_of_chips - $number_parental_chips); $B_freq = (($number_of_chips - $right_pos) - ($number_parental_chips/2))/($number_of_chips - $number_parental_chips); } # end if else{ $B_freq = (($left_pos + 1) - ($number_parental_chips/2))/($number_of_chips - $number_parental_chips); $A_freq = (($number_of_chips - $right_pos) - ($number_parental_chips/2))/($number_of_chips - $number_parental_chips); } # end else ############ calculate no data frequency ######################## $nodata_freq = ($right_pos - $left_pos - 1)/($number_of_chips - $number_parental_chips); ############ Determine if marker should be discarded in post-genotyping filtering ################ if( ($A_freq > ($distortion/100)) or ($B_freq > ($distortion/100)) or ($nodata_freq > ($unassigned/100))){ $bad_marker = "TRUE"; } # end if ############ Report SFP marker ################## my $probenumber = $s + 1; print "SFPmarker found: $probeset, $pg_hash{$probeset}-$probenumber\n"; print LOGFILE "SFPmarker found: $probeset, $pg_hash{$probeset}-$probenumber\n"; LOGFILE->autoflush(1); # Write to result file which parent has the SFP for this marker, max_SFPdev nonSFP parent, # min_SFPdev SFP parent, frequency A allele, frequency B allele, frequency missing data. print OUTPUT0 "$pg_hash{$probeset}-$probenumber\t$SFP_parent\t$SFPdev_sorted[$left_pos]->[0]\t$SFPdev_sorted[$right_pos]->[0]\t$marker_spread\t$A_freq\t$B_freq\t$nodata_freq\n"; OUTPUT0->autoflush(1); # Genotype on left of gap for(my $l = 0; $l < $left_pos + 1; $l++){ # Store result in %result hash # Get chip code from file code for storing genotyping results my $chip_code = $SFPdev_sorted[$l]->[1]; chomp($chip_code); $chip_code =~ s/.CEL//g; my $RIL = $exp_setup_hash{$chip_code}->{'RIL'}; my $SFPmarker = $pg_hash{$probeset}."-".$probenumber; $result{$RIL}->{$SFPmarker}->{$chip_code} = $genotype_left; # Store result in post-genotyping filtered result hash if marker is not bad if($bad_marker eq "FALSE"){ $result_clean{$RIL}->{$SFPmarker}->{$chip_code} = $genotype_left; } # end if } # end for # Genotype on right of gap for(my $m = $right_pos; $m < scalar @chipids; $m++){ # Store result in %result hash # Get chip code from file code for storing genotyping results my $chip_code = $SFPdev_sorted[$m]->[1]; chomp($chip_code); $chip_code =~ s/.CEL//g; my $RIL = $exp_setup_hash{$chip_code}->{'RIL'}; my $SFPmarker = $pg_hash{$probeset}."-".$probenumber; $result{$RIL}->{$SFPmarker}->{$chip_code} = $genotype_right; # Store result in post-genotyping filtered result hash if marker is not bad if($bad_marker eq "FALSE"){ $result_clean{$RIL}->{$SFPmarker}->{$chip_code} = $genotype_right; } # end if } # end for # Genotype within gap as '-' if($gap_count > 1){ for(my $n = $left_pos + 1; $n < $right_pos; $n++){ # Store result in %result hash # Get chip code from file code for storing genotyping results my $chip_code = $SFPdev_sorted[$n]->[1]; chomp($chip_code); $chip_code =~ s/.CEL//g; my $RIL = $exp_setup_hash{$chip_code}->{'RIL'}; my $SFPmarker = $pg_hash{$probeset}."-".$probenumber; $result{$RIL}->{$SFPmarker}->{$chip_code} = "-"; # Store result in post-genotyping filtered result hash if marker is not bad if($bad_marker eq "FALSE"){ $result_clean{$RIL}->{$SFPmarker}->{$chip_code} = "-"; } # end if } # end for } # end if } # end if } # end for # update line counter $line_count = $line_count + $number_probes; # update probeset counter $probeset_counter++; print "probeset $probeset_counter of $number_of_probesets.\n"; } # end foreach # Close output file close(OUTPUT0); ################################################################################################ # Write this result to a file in format: # SFPmarker RIL chip rep treatment genotype my $output_1 = $output."_SFPscanV10_genotypes_all_chips.txt"; open(OUTPUT1, ">$output_1"); # Print header print OUTPUT1 "SFPmarker\tRIL\tchip\trep\tgenotype\n"; # Print values # Remember format of result hash: # %result -> RIL1 -> SFPmarker1 -> chip1 = genotype A or B # -> chip2 = genotype A or B # -> chip3 = genotype A or B # -> chip4 = genotype A or B # -> SFPmarker2 -> chip1 = genotype A or B # etc # etc # etc while(my ($RIL, $SFPmarker) = each (%result)){ while(my ($SFPmarker, $chip) = each %$SFPmarker){ while(my ($chip, $genotype) = each %$chip){ print OUTPUT1 "$SFPmarker\t"; print OUTPUT1 "$RIL\t"; print OUTPUT1 "$chip\t"; print OUTPUT1 "$exp_setup_hash{$chip}->{'rep'}\t"; print OUTPUT1 "$genotype\n"; } # end while } # end while } # end while close(OUTPUT1); print "Output file '$output_1' written...\n"; print LOGFILE "Output file '$output_1' written...\n"; ################################################################################################ # Do genotyping for all RILs with all detected SFP markers my %SFPmarker_calls; genotype(\%result, $genotype_A, $genotype_B, $number_reps, \%SFPmarker_calls, \*LOGFILE, $use_cd); # Do genotyping for all RILs with post-genotyping filtered SFP markers my %SFPmarker_calls_clean; genotype(\%result_clean, $genotype_A, $genotype_B, $number_reps, \%SFPmarker_calls_clean, \*LOGFILE, $use_cd); ################################################################################################ # Write genotyping data for RILs done with all markers to files. # Open files my $output_2_jm = $output."_SFPscanV10_genotypes_jm.txt"; open(OUTPUT2JM, ">$output_2_jm") or die "Can't open '$output_2_jm'!\n"; my $output_2_ex = $output."_SFPscanV10_genotypes_ex.txt"; open(OUTPUT2EX, ">$output_2_ex") or die "Can't open '$output_2_ex'!\n"; write_genotypes(\%SFPmarker_calls, \*OUTPUT2JM, \*OUTPUT2EX, $output_2_jm, $output_2_ex, \*LOGFILE); # close files close(OUTPUT2JM); close(OUTPUT2EX); ################################################################################################ # Write genotyping data for RILs done with post-genotyping filtered markers to files. # Open files my $output_2_jm_clean = $output."_SFPscanV10_genotypes_clean_jm.txt"; open(OUTPUT2JMCLEAN, ">$output_2_jm_clean") or die "Can't open '$output_2_jm_clean'!\n"; my $output_2_ex_clean = $output."_SFPscanV10_genotypes_clean_ex.txt"; open(OUTPUT2EXCLEAN, ">$output_2_ex_clean") or die "Can't open '$output_2_ex_clean'!\n"; write_genotypes(\%SFPmarker_calls_clean, \*OUTPUT2JMCLEAN, \*OUTPUT2EXCLEAN, $output_2_jm_clean, $output_2_ex_clean, \*LOGFILE); # close files close(OUTPUT2JMCLEAN); close(OUTPUT2EXCLEAN); ################################################################################################ # Say bye and close log file print LOGFILE "Done. Enjoy!\n"; close(LOGFILE); # Tell user to check logfile print "Check log: $logfile.\n"; die "Done. Enjoy!\n"; ####################################################################################### ############################ Sub routine print_usage ################################## ####################################################################################### sub print_usage { print "\nUsage:\n\n"; print "\t\>SFPscanV10.pl [CONVERSION_FILE] [SETUP_FILE] [CHIP_LIST_FILE]\n\t\t[PARENT_A_CHIP_LIST_FILE] [PARENT_B_CHIP_LIST_FILE] [GAP_SIZE]\n\t\t[ALLELE_DISTORTION] [UNASSIGNED_SCORES] [OUTPUT] [USE_CD]\n"; print "\n\tALL arguments must be entered.\n"; print "\tDefault values you might want to use:\n"; print "\t[GAP_SIZE] 2\n\t[ALLELE_DISTORTION] 80\n\t[UNASSIGNED_SCORES] 10.\n"; print "\n\t[USE_CD] must be YES or NO.\n"; exit(1); } # end sub routine print_usage ####################################################################################### ############################ Sub routine print_warning ################################ ####################################################################################### sub print_warning { my $text = $_[0]; print "\n$text\n"; my $answer = ""; while($answer ne "Y" and $answer ne "N" and $answer ne "n" and $answer ne "y"){ print "\tContinue? (Y/N)"; $answer = ; chomp($answer); } # end while if($answer eq "Y" or $answer eq "y"){ return; } # end if else{ exit(1); } # end else } # end sub routine print_warning ####################################################################################### ############################ Sub routine genotype ##################################### ####################################################################################### sub genotype { ###################################################################### # Now evaluate the genotyping for the specified number of reps of each RIL # Create new hash with new genotyping codes, one for each RIL # A = SHA, B = BAY, C = NOT A, D = NOT B, - = A+B, - = no data # If all rep chips are genotyped as one accession: use A or B # If half of the rep chips is genotyped for one accession, # and the other half of the rep chips for the other accession: use - # If half or more than half of the rep chips is genotyped as one accession, # and the other rep chips as no data: use C or D # NOTE: Use the C and D scores only if the user wants to use it !!! # Else, use: - # # Arguments to receive from sub routine call: # 1. reference to result hash with format: # %result -> RIL1 -> SFPmarker1 -> chip1 = genotype A or B # -> chip2 = genotype A or B # -> chip3 = genotype A or B # -> chip4 = genotype A or B # -> SFPmarker2 -> chip1 = genotype A or B # etc # etc # etc # # 2. genotype codes ($genotype_A and $genotype_B) # # 3. number of reps ($number_reps) # # 4. reference to hash to store result # # 5. reference to logfile FILE handle # # 6. use_cd option: yes or no. # # # Return variable: # hash %SFPmarker_calls with format: # SFPmarker_calls -> SFPmarker -> $RIL = $genotype; # # # Retrieve arguments my $inputresult_ref = $_[0]; my $genotype_A = $_[1]; my $genotype_B = $_[2]; my $number_reps = $_[3]; my $outputresult_ref = $_[4]; my $log_ref = $_[5]; my $use_cd = $_[6]; # # Loop through RILs and store calls while(my ($RIL, $SFPmarker) = each (%{$inputresult_ref})){ # Loop through SFP markers while(my ($SFPmarker, $chip) = each (%$SFPmarker)){ my $genotype_to_add; # use a hash as counter, key is genotype from previous step my %genotype; $genotype{$genotype_A} = 0; $genotype{$genotype_B} = 0; $genotype{'-'} = 0; my $chipcount = 0; # Loop through chips while(my ($chip, $call) = each %$chip){ $genotype{$call}++; $chipcount++; } # end while if($chipcount != $number_reps){ $genotype_to_add = '-'; print "RIL $RIL only has $chipcount chips! Check input files! Genotyped as '-' (no data).\n"; print {$log_ref} "RIL $RIL only has $chipcount chips! Check input files! Genotyped as '-' (no data).\n"; next; } # end if elsif($genotype{$genotype_A} == $number_reps){ $genotype_to_add = 'A'; } # end elsif elsif($genotype{$genotype_B} == $number_reps){ $genotype_to_add = 'B'; } # end elsif elsif(($genotype{$genotype_A} == ($number_reps/2)) and ($genotype{$genotype_B} == ($number_reps/2))){ $genotype_to_add = '-'; } # end elsif elsif(($genotype{$genotype_A} >= ($number_reps/2)) and ($genotype{$genotype_B} == 0)){ if($use_cd eq "YES"){ $genotype_to_add = 'D'; } # end if else{ $genotype_to_add = '-'; } # end else } # end elsif elsif(($genotype{$genotype_B} >= ($number_reps/2)) and ($genotype{$genotype_A} == 0)){ if($use_cd eq "YES"){ $genotype_to_add = 'C'; } # end if else{ $genotype_to_add = '-'; } # end else } # end elsif else{ $genotype_to_add = '-'; } # end elsif $outputresult_ref->{$SFPmarker}->{$RIL} = $genotype_to_add; } # end while print "RIL '$RIL' genotyped...\n"; print {$log_ref} "RIL '$RIL' genotyped...\n"; } # end while print "All RILs genotyped...\n"; print {$log_ref} "All RILs genotyped...\n"; } # end sub routine genotype ####################################################################################### ############################ Sub routine write_genotypes ############################## ####################################################################################### sub write_genotypes { ####################################################################### # Write result of genotyping to files in two different formats: # 1) for JoinMap # marker14 # DHBHHADAADBADBHHDBHHBBDHHADDBAHDAHHBHAABABDAHHBBHH # marker30 # AHBAHBBBHABBAHAABHBBHHHBBHHHHBAHABHHHHAHHHAHBBAHA-H # marker35 # BHHDHAABHDHHBHHABABBAAAABHBHHDHHBBDCHDHDHHBAHBA-DA # etc # # 2) For visualization in Excel # marker14 D H B H H A D A A D B etc # etc # This should be tabulated. # # Arguments to receive: # 1. reference to SFPmarker_calls hash # Remember format SFPmarker_calls hash: # SFPmarker_calls -> SFPmarker -> $RIL = $genotype; # # 2. reference to output file handle for joinmap format # # 3. reference to output file handle for Excel format # # 4. name file joinmap format # # 5. name file excel format # # 6. reference to logfile # # # Retrieve arguments my $hash_ref = $_[0]; my $file_jm_ref = $_[1]; my $file_ex_ref = $_[2]; my $jm_file_name = $_[3]; my $ex_file_name = $_[4]; my $log_ref = $_[5]; # Print header print {$file_jm_ref} "\;RIL genotyping with SFP markers\n\;$number_reps chips per RIL\n"; print {$file_ex_ref} "\;RIL genotyping with SFP markers\n\;$number_reps chips per RIL\n"; # Loop through markers # Arrays for excel output my @markers; my @calls; my @RILs_list; while(my ($SFPmarker, $RIL) = each (%{$hash_ref})){ # Marker calls for all RILs my $genotypes_string = ""; # Print marker name print {$file_jm_ref} "$SFPmarker\n"; push(@markers, $SFPmarker); # Get keys containing RILs my @RILs = keys %$RIL; # Sort RILs my @RILs_sorted = sort { $a <=> $b } @RILs; @RILs_list = @RILs_sorted; # Get genotype call for all RILs my @current_calls; foreach my $item(@RILs_sorted){ $genotypes_string = $genotypes_string.$hash_ref->{$SFPmarker}->{$item}; push(@current_calls, $hash_ref->{$SFPmarker}->{$item}); } # end foreach # Print string print {$file_jm_ref} "$genotypes_string\n"; push(@calls, \@current_calls); } # end while # Output header line with RIL numbers to excel format file print {$file_ex_ref} "\t"; foreach my $item(@RILs_list){ print {$file_ex_ref} "$item\t"; } # end foreach print {$file_ex_ref} "\n"; # Print info for each marker for(my $i = 0; $i < scalar @markers; $i++){ print {$file_ex_ref} "$markers[$i]\t"; my $current_calls = @calls[$i]; foreach my $call(@$current_calls){ print {$file_ex_ref} "$call\t"; } # end foreach print {$file_ex_ref} "\n"; } # end for print "Output file '$jm_file_name' written...\n"; print {$log_ref} "Output file '$jm_file_name' written...\n"; print "Output file '$ex_file_name' written...\n"; print {$log_ref} "Output file '$ex_file_name' written...\n"; } # end sub routine write_genotypes