# Laura Tibbs Cortes
# Aug 21, 2020

# See https://github.com/LTibbs/MaizeNAMplasticity for more info
# Subfunctions are included at the end of this script.

# based on CERIS-JGRA code: https://github.com/jmyu/CERIS_JGRA
# as well as SimpleM https://github.com/LTibbs/SimpleM/
# and PET extraction code https://github.com/LTibbs/PET_extraction 

# set working directory with setwd()

#Start off printing the version information so you know what versions were used to generate the results
log_file <- paste('NAM_CERES.run.', format(Sys.time(), "%Y-%m-%d.%R:%S"), '.log',  sep = '');
sink(log_file);

#print start time:
print(paste0("Start time: ", Sys.time()))

# load libraries
library(R.utils)
library(rnoaa);
library(dplyr);
library(RCurl);
library(htmltab);
library(data.table);
# library(corrgram);
library(geosphere);
library(lubridate);
# library(animation);
library("colorspace");
library(RColorBrewer);
library(tidyverse)

source('subfunctions/sub_funcs_20200707_LTC.R');

print(sessionInfo())

# set working directories:
Dir <- ""
sp_dir <- ""
exp_dir <- "NAM_BLUE/"

# set filtering thresholds
line.outlier.filter <- 3 # remove lines before CERIS if there are LESS THAN 3; keep if there are AT LEAST 3 
filter.less.than <- 4 # remove data from predictions if there are LESS THAN X obs of the genotype for this trait, keep if there are AT LEAST (>=) X 
min.window.size <- 6 # set minimum window size: don't even look at windows less than X days when you subtract start from end day (windows are >= X). # Using window size with i < j-5, which gives min window size of 6, from Multicrop paper methods
last.FT <- 46 # set last day to check for FT traits (last day before any flower)
last.harvest <- last.FT + 60 # set last day to check for harvest traits
max.window <- 60 # don't consider any windows larger than 60 days 

# set up which traits must be decided by FT and which by harvest, based on when they were measured in original papers
FT.traits <- c("EH", "LL", "LW", "PH", "TPBN", "TL",
               "ULA", "ASI", "DTA", "DTS")
harvest.traits <- c("CD", "CL", "CM", "EM", "ERN",
                    "KN", "KPR", "TKW", "T20KW")

# Should we include precipitation-based indices in the search?
incl.precip="precip" # or "no.precip"

# Set up GDD calculations
## for maize 
t_base <- 50; t_max1 <- 86; t_max2 <- 1000; Haun_threshold <- 0;
#  this is for GDD. If the temp is below 50, it's set to 50 for GDD. If the temp is above 86, it's set to 86.
# Haun threshold is used in Barley etc for GDD and is also the reason for t_max2; not used here (see https://ndawn.ndsu.nodak.edu/help-barley-growing-degree-days.html)

# read in environmental metadata:
env_meta_file <- paste("11", 'Env_meta_table', sep = ''); 

env_meta_info_0 <- read.table(env_meta_file, header = T, sep = "\t", stringsAsFactors = F);

# set start and end years of the experiments
exp_s_year <- min(env_meta_info_0$TrialYear) 
exp_e_year <- max(env_meta_info_0$TrialYear) + 1

searching_daps <- 150 # how many days after planting do we care about?

# make name for ptt ptr file (holds env index values) out of : directory, number of environments, DAP to search up until
ptt_ptr_file <- paste(exp_dir, nrow(env_meta_info_0), 'Envs_envParas_DAP',  searching_daps, sep = '');

# read in the environmental data: 
# this was made from data from ftp://ftp.ncdc.noaa.gov/pub/data/ghcn/daily/by_year/ and https://earlywarning.usgs.gov/fews/product/81
# based on https://github.com/jmyu/CERIS_JGRA and https://github.com/LTibbs/PET_extraction
PTT_PTR <- read.table(ptt_ptr_file, header = T , sep = "\t"); 

# choose desired parameters:
PTT_PTR <- PTT_PTR %>%
  dplyr::select(-TMAX, -TMIN)

# if want to exclude precip-based env variables:
if(incl.precip=="no.precip") {
  PTT_PTR <- PTT_PTR %>%
    select(-PRECIP, -PET, -H20.balance)
}

# TO DO: add files to GitHub------------------------
# Also: whatever "data.join" is that was then used to remove outliers; NAM.population.list, sub_funcs ...

exp_traits_file <- paste(exp_dir, 'traits_ori_NAM', sep = ''); 
exp_fnd_file <- paste(exp_dir, 'traits_ori_FND', sep='')

pop.table <- fread("NAM.population.list.txt", sep="\t", data.table=F)


# Data pre-processing ---------------------------------------------------------

# remove outliers
  data.join <- fread("data/traits_ori_full") %>% # this is the full phenotype data compiled from publicly available sources, see paper
    filter(!is.na(ril_code)) %>%
    filter(ril_code != "") # remove entries that we don't know the genotype of
  # Now, using Aaron's Nature Plants paper on NAM as guide:
  # ". Outliers were removed as follows...
  # the interquartile ranges (IQRs) were calculated for each RIL
  # across environments and for each environment across RILs within a phenotype.
  # Any trait measurements of RILs that were more than 1.5 times larger or smaller
  # than either of the IQRs was removed."
  outlier.detect <- data.join %>%
    gather(key="Trait", value = "Value",
           -c(env_code, pop_code, Entry_id,
           ril_code, field, rep, pblock, block, Rangeposition, Rowposition)) %>%
    group_by(env_code, Trait) %>%
    mutate(Q1=quantile(Value, na.rm=T)[[2]],
           Q3=quantile(Value, na.rm=T)[[4]]) %>%
    distinct() %>%
    mutate(IQR=Q3-Q1) %>%
    mutate(lo.bound=Q1-1.5*IQR) %>%
    mutate(hi.bound=Q3+1.5*IQR) %>%
    mutate(to.keep.env=!(Value<lo.bound | Value > hi.bound))
  outlier.detect <- outlier.detect %>%
    ungroup() %>%
    group_by(ril_code, Trait) %>%
    mutate(ril.Q1=quantile(Value, na.rm=T)[[2]],
           ril.Q3=quantile(Value, na.rm=T)[[4]]) %>%
    distinct() %>%
    mutate(ril.IQR=ril.Q3-ril.Q1) %>%
    mutate(ril.lo.bound=ril.Q1-1.5*ril.IQR) %>%
    mutate(ril.hi.bound=ril.Q3+1.5*ril.IQR) %>%
    mutate(to.keep.ril=!(Value<ril.lo.bound | Value > ril.hi.bound)) %>%
    ungroup()
  outlier.detect <- outlier.detect %>% # keep observations that do NOT have to.keep==F
    filter(to.keep.env == T) %>% # (I'm ok with keeping to.keep==NA because those are just missing anyway)
    filter(to.keep.ril == T) %>%
    select(env_code, pop_code, Entry_id,
           field, rep, pblock, block, Rangeposition, Rowposition,
           ril_code, Trait, Value) %>%
    distinct()%>%
    pivot_wider(names_from=Trait, values_from=Value, values_fill=NA)

  # output data after outlier removal
  fwrite(outlier.detect, "traits_ori_NAM_no.outliers", sep="\t")
  
# Calculate BLUEs - - - - - - - - - -
# following Hung et al Heredity BUT
# excluding "env" and "family" from the model because
# I'm interested in those things and don't want to average them out!
library(lme4)
BLUE.data  <- fread("NAM/traits_ori_NAM_no.outliers") %>% 
  mutate(rep=paste(field, rep, sep="_"), # make rep etc unique within each field so no need to nest terms in model (for efficiency)
         pblock=paste(rep, pblock, sep="_"),
         block=paste(pblock, block, sep="_"),
         Rowposition=paste(rep, Rowposition, sep="_"),
         Rangeposition=paste(rep, Rangeposition, sep="_"))

# Ideal BLUE model: other things are just a simplification of this
# lmer(data=data[[n]], Value ~ Grp + Grp:ril_code +
#                             (1|field) + (1|field:rep) + (1|field:rep:pblock) +
#                             (1|field:rep:pblock:block) +(1|field:rep:Rowposition)+
#                             (1|field:rep:Rangeposition)

# make things factors:
BLUE.data$field <- factor(BLUE.data$field)
BLUE.data$rep <- factor(BLUE.data$rep)
BLUE.data$pblock <- factor(BLUE.data$pblock)
BLUE.data$ril_code <- factor(BLUE.data$ril_code)
BLUE.data$block <- factor(BLUE.data$block)
BLUE.data$Rangeposition <- factor(BLUE.data$Rangeposition)
BLUE.data$Rowposition <- factor(BLUE.data$Rowposition)

# make function to pull BLUEs (efficiently):
pheno.efficient <- function(predf, original_data, BLUEs.from.file=F) {
if(BLUEs.from.file) { # if reading in BLUEs from a file from a previous run, don't need to make the tibble
  df <- predf
} else {df <- tibble(Gen=names(predf), Estimate=predf)}
df$Gen <- gsub("ril_code", "", df$Gen) # remove extra characters

# pull intercept value
df.b1 <- df$Estimate[df$Gen=="(Intercept)"]

# calculate BLUEs:
df <- mutate(df, BLUE=ifelse(Gen=="(Intercept)", Estimate, # if intercept, pheno=estimate
                             Estimate + df.b1)) # otherwise, add estimate + intercept

# but the very last Gen (missing from named Gens) will have BLUE=intercept
thing <- original_data %>%
  filter(!is.na(Value))
stopifnot(length(setdiff(thing$ril_code, df$Gen))==1)
df$Gen <- gsub("\\(Intercept\\)", setdiff(thing$ril_code, df$Gen), df$Gen) # set the intercept as the missing genotype's value

return(df)

}

# Calculate the BLUEs:
R.BLUEs  <- vector("list", length(unique(BLUE.data$env_code))*length(c(FT.traits, harvest.traits)))
data <- vector("list", length(unique(BLUE.data$env_code))*length(c(FT.traits, harvest.traits)))

n <- 1
for(my.trait in c(FT.traits, harvest.traits)) {
# for(my.trait in c(FT.traits, harvest.traits)[1:2]) {
  print(paste0("beginning trait ", my.trait))
  print(Sys.time())

    for(my.env in unique(BLUE.data$env_code)) {
# for(my.env in unique(BLUE.data$env_code)[1:2]) { # make mini
  print(paste0("beginning env ", my.env))
  print(Sys.time())


  # filter out trait and env of interest
  data[[n]] <- BLUE.data %>%
    filter(env_code==my.env) %>%
    # filter(pblock %in% c("1_1_1", "1_1_2")) %>% # make mini
    select(env_code, pop_code, Entry_id, field, rep, pblock, block, Rangeposition, Rowposition, ril_code, Value=(my.trait))


  # skip if no data for trait in env
  if(nrow(data[[n]] %>% filter(!(is.na(Value))))==0) {print(paste("No data for", my.trait, "in", my.env))
    next}
  # comment the following out if only want to read in the fixef to feed into function:

  # Some envs have multiple fields, others multiple reps
  if(length(unique(data[[n]]$field))==1) {
    if(length(unique(data[[n]]$rep))==1) { # 1 rep, 1 field
      BLUE.model <- lmer(data=data[[n]], Value ~ ril_code +
                           (1|pblock) +
                           (1|block) +(1|Rowposition)+
                           (1|Rangeposition),
                         na.action = "na.exclude", control=lmerControl(calc.derivs=FALSE))
    } else { # >1 rep, 1 field
      BLUE.model <- lmer(data=data[[n]], Value ~ ril_code +
                           (1|rep) + (1|pblock) +
                           (1|block) +(1|Rowposition)+
                           (1|Rangeposition),
                         na.action = "na.exclude", control=lmerControl(calc.derivs=FALSE))
    }

  } else {
    # if(length(unique(data[[n]]$rep))==1) { # 1 rep, >1 field. It will look like >1 rep because "field" is now included in "rep" but it's really just one. No environments exist with really >1 field AND >1 rep
    BLUE.model <- lmer(data=data[[n]], Value ~ ril_code +
                         (1|field) + (1|pblock) +
                         (1|block) +(1|Rowposition)+
                         (1|Rangeposition),
                       na.action = "na.exclude", control=lmerControl(calc.derivs=FALSE))
    # } else { # full model when rep>1 AND field >1
    # warning(print("no environment has >1 rep AND >1 field!"))
    # BLUE.model <- lmer(data=data[[n]], Value ~ Grp + Grp:ril_code +
    #                      (1|field) + (1|field:rep) + (1|field:rep:pblock) +
    #                      (1|field:rep:pblock:block) +(1|field:rep:Rowposition)+
    #                      (1|field:rep:Rangeposition),
    #                    na.action = "na.exclude")
    # }
  }


  # pull BLUEs
  R.BLUEs.temp <- fixef(BLUE.model)
  # output in case this fails:
  fwrite(tibble(Gen=names(R.BLUEs.temp), Estimate=R.BLUEs.temp),
         paste0("BLUEs.", my.env, ".", my.trait, ".csv"))

  # read in BLUEs:
  R.BLUEs.temp <- fread(paste0("BLUEs.", my.env, ".", my.trait, ".csv"))

  R.BLUEs[[n]] <- pheno.efficient(predf=R.BLUEs.temp, original_data=data[[n]], BLUEs.from.file=T)
  R.BLUEs[[n]] <- R.BLUEs[[n]] %>%
    mutate(env_code=my.env) %>%
    mutate(Trait=my.trait)

  n <- n+1
}

}
out.BLUEs <- do.call("rbind", R.BLUEs) %>% select(-Estimate)# rbind resuls together
out.wide <- out.BLUEs %>%
  pivot_wider(names_from = Trait, values_from="BLUE") %>%
  rename(ril_code=Gen) %>%
  left_join(BLUE.data %>% select(env_code, pop_code, Entry_id, ril_code) %>% distinct,
          by = c("ril_code", "env_code"))%>%
  select(colnames(mean.data))

# output BLUEs
fwrite(out.wide, "NAM/traits_BLUE_full", sep="\t") # output ALL BLUEs, including diversity panel
fwrite(out.wide %>% filter(pop_code != 27 | ril_code %in% c("B73","HP301", pop.table$parent)), exp_traits_file, sep="\t") # output BLUEs for RILs and parents
fwrite(out.wide %>% filter(ril_code %in% c("B73","HP301", pop.table$parent)), exp_fnd_file, sep="\t") # output only parents


# Read in processed data --------------------------------------------------

    # read in the trait data:
    exp_traits <- read.table(exp_traits_file, sep = "\t", header = T, stringsAsFactors = F) %>% # read in multi-environment phenotype
      filter(pop_code !=17) %>% # remove IBM, it's not actually in NAM and has no genotype
      filter(ril_code != "Mo17") # remove IBM parent
    total_n <- length(unique(exp_traits$ril_code)) # find total number of lines
    
    fnd_traits <-  read.table(exp_fnd_file, sep="\t", header=T, stringsAsFactors = F)
    
    # every time -- make exp_traits match genotype
    mytaxa <- fread("data/NAM.matchY.txt")
    exp_traits <- exp_traits %>%
      filter(ril_code %in% mytaxa$Taxa)
    stopifnot(setdiff(mytaxa$Taxa, exp_traits$ril_code)==0)
    stopifnot(setdiff(exp_traits$ril_code,mytaxa$Taxa)==0)
    exp_traits$ril_code <- factor(exp_traits$ril_code,levels=mytaxa$Taxa)
    exp_traits <- exp_traits %>%
      arrange(ril_code)
    exp_traits$ril_code <- as.character(exp_traits$ril_code)
    stopifnot(all.equal(unique(exp_traits$ril_code), mytaxa$Taxa))
    
    all_env_codes <- sort(unique(exp_traits$env_code)); # pull all environment codes
    env_cols <- colorspace::rainbow_hcl(length(all_env_codes), c = 80, l = 60, start = 0, end = 300, fixup = TRUE, alpha = 0.75); # make colors
    
    # read in geno with ril names in order:
    taxa <- mytaxa$Taxa # for the folds from CERIS to work, taxa needs to contain EVERY taxa, not just those with phenotypes
    
    # read in data formatted for FR-gBLUP
    fr.exp_traits <- fread("NAM_BLUE/traits_ori_NAM_FR", data.table = F)
    
# CERES for whole NAM population: -----------------------------------------
    # based on CERIS-JGRA code: https://github.com/jmyu/CERIS_JGRA
    
    print(paste0("CERES for the whole population, with parameters: line.outlier.filter=", line.outlier.filter,
                 ", filter.less.than=",filter.less.than,", min.window.size=", min.window.size, ", last.FT=",last.FT,
                 ", last.harvest=", last.harvest, ", max.window.size=", max.window))
    output.results <- vector("list", ncol(exp_traits)-2)
    for (trait_num in c(5:ncol(exp_traits))) {
      # for (trait_num in c(23)) {
      trait <- colnames(exp_traits)[trait_num];
      current.data <- exp_traits[,trait_num,]
      exp_trait_dir <- paste(exp_dir, trait, '/', sep = ''); if (!dir.exists(exp_trait_dir))  { dir.create(exp_trait_dir)};
      exp_pop_dir <- paste0(exp_trait_dir, "whole_pop", "/") # make directory
      if(incl.precip=="no.precip") {exp_pop_dir <- paste0(exp_trait_dir, "whole_pop_no_precip", "/")} # make no precip directory
      if (!dir.exists(exp_pop_dir))  { dir.create(exp_pop_dir)};# make directory
      
      # set indices for which column name is which
      lInd <- which(colnames(exp_traits) == 'ril_code'); # assumint that each ril is essentially the "line" from before
      eInd <- which(colnames(exp_traits) == 'env_code');
      tInd <- which(colnames(exp_traits) == trait);
      
      exp_trait <- exp_traits[,c(lInd, eInd, tInd)]; ### make sure the colname order is line env trait
      colnames(exp_trait)[ncol(exp_trait)] <- 'Yobs'; # rename phenotype data column as Yobs
      
      # remove missing values--negative is ok though
      exp_trait <- exp_trait[!is.na(exp_trait$Yobs),];
      # exp_trait <- exp_trait[exp_trait$Yobs > 0,]
      
      # How many observations are there per pedigree?
      obs_lne_n <- aggregate(Yobs ~ ril_code, data = exp_trait, length);
      # hist(obs_lne_n$Yobs)
      
      # find lines with only one or two observations and remove them:
      line_outlier <- obs_lne_n$ril_code[obs_lne_n$Yobs < line.outlier.filter]
      exp_trait <- exp_trait[!(exp_trait$ril_code %in% line_outlier),]
      
      line_codes <- unique(exp_trait$ril_code); # pull unique line codes
      
      # find mean value for each environment:
      env_mean_trait_0 <- na.omit(aggregate(x = exp_trait$Yobs, by = list(env_code = exp_trait$env_code),
                                            mean, na.rm = T));
      colnames(env_mean_trait_0)[2] <- 'meanY';
      env_mean_trait <- env_mean_trait_0[order(env_mean_trait_0$meanY),]; # sort by mean phenotype value
      
      #### Do you want to leave one environment out while calculating correlations?
      LOO_env <- 0; ### for maize diversity panel, it is 0
      
      # set param for graphing:
      col_wdw <- 25;
      col_palette <- diverge_hcl(col_wdw + 1, h = c(260, 0), c = 100, l = c(50, 90), power = 1) # this function makes a diverging color palette
      
      # Perform exhaustive search in WHOLE NAM pop for optimal window and environmental parameter to use
      Exhaustive_search_full(env_mean_trait, PTT_PTR, searching_daps, exp_pop_dir,
                             current.data,
                             trait,
                             searching_daps, searching_daps, LOO_env, min.window.size = min.window.size)
      
      # plot the exhaustive search triangles:
      pop_cor_file <- paste(exp_pop_dir, trait, '_', nrow(env_mean_trait), 'Envs_PTTPTR_', LOO_env, 'LOO_cor_whole_pop', sep = '');
      pop_pval_file <- paste(exp_pop_dir, trait, '_', nrow(env_mean_trait), 'Envs_PTTPTR_', LOO_env, 'LOO_P_whole_pop', sep = '');
      
      exhaustive_plot(pop_cor_file, pop_pval_file, PTT_PTR, searching_daps, searching_daps,
                      current.data, type="r.only")
      exhaustive_plot(pop_cor_file, pop_pval_file, PTT_PTR, searching_daps, searching_daps,
                      current.data, type="p.only")
      exhaustive_plot(pop_cor_file, pop_pval_file, PTT_PTR, searching_daps, searching_daps,
                      current.data, type="both")
      
      # filter windows by established criteria:
      search.results <- fread(paste(exp_pop_dir, trait, '_', nrow(env_mean_trait),
                                    'Envs_PTTPTR_', LOO_env, 'LOO_cor_whole_pop', sep = ''))%>%
        select(-starts_with("nR")) # don't need the negative version
      search.results <- search.results %>%
        tidyr::gather(key="Parameter", value="Corr", -Day_x, -Day_y, -window)
      
      search.results <- search.results %>%
        filter(window >= min.window.size) %>% # i < j-5
        filter(window <= max.window)
      
      if(trait %in% FT.traits) {
        search.results <- search.results %>%
          filter(Day_y <= last.FT) # last possible day for FT traits = 55
      } else if (trait %in% harvest.traits) {
        search.results <- search.results %>%
          filter(Day_y <= last.harvest) # last possible day for harvest traits
      } else (warning("trait not in FT or harvest traits"))
      
      # Only now choose the remaining window with the best corr:
      search.results  <- search.results %>%
        filter(abs(Corr)==max(abs(Corr), na.rm = T))
      ties <- nrow(search.results) # record if there are ties
      # save.search.results <- search.results # output all results
      fwrite(x = search.results, file=paste0(exp_pop_dir,trait, "_optimalparameters_", LOO_env, "LOO_",
                                             searching_daps, "DAP.txt"), sep="\t")
      # if there are ties, just pick the top one in the table.
      search.results <- search.results[1,]%>%
        mutate(trait=trait)
      output.results[[trait_num-2]] <- search.results
      print(output.results[[trait_num-2]])
      
      maxR_dap1 <- search.results$Day_x;
      maxR_dap2 <- search.results$Day_y;
      kPara_Name <- search.results$Parameter;
      kPara_Name <- gsub("R_", "", kPara_Name)
      PTT_PTR_ind <-  which(colnames(PTT_PTR) == kPara_Name); ## DL -> 5, GDD -> 6; PTT -> 7; PTR -> 8; PTD1 -> 8, PTD2 -> 9, PTS -> 10
      
      
      # need to SAVE the optimal window chosen, the associated correlation, and whether there were ties, for each trait.
      
      # Plot the environmental parameter means vs trait mean for each environment
      Plot_Trait_mean_envParas(env_mean_trait, PTT_PTR, maxR_dap1, maxR_dap2, trait, exp_pop_dir, env_cols);
      
      # Make output file containing the slopes and intercepts from the linear models for each line across environments
      Slope_Intercept(maxR_dap1, maxR_dap2, env_mean_trait, PTT_PTR, exp_trait, line_codes, exp_pop_dir, PTT_PTR_ind, filter.less.than = filter.less.than);
      
      #### LOOCV function
      # aka: leave one out cross validation:
      obs_prd_file <- paste(exp_pop_dir, trait, '_', nrow(env_mean_trait), 'Env_LOO_by_Lines_', kPara_Name, maxR_dap1, '_', maxR_dap2, sep = '');
      LOO_pdf_file <- paste(exp_pop_dir, trait, '_', nrow(env_mean_trait), 'Env_LOO_by_Lines_', kPara_Name, maxR_dap1, '_', maxR_dap2, '.png', sep = '');
      
      # Output data file with results of leave-one-out cross validation using both environ mean and param as predictors
      # if (!file.exists(obs_prd_file)) {
      if(length(unique(exp_trait$env_code))>=filter.less.than){ # only do this if there's enough environments to actually work
        prdM <- LOOCV(maxR_dap1, maxR_dap2, env_mean_trait, PTT_PTR, PTT_PTR_ind, exp_trait,  obs_prd_file, filter.less.than=filter.less.than, line_codes)
      }
      
      # Plot the observed vs predicted results using several prediction methods (see subfunction notes)
      Plot_prediction_result(obs_prd_file, all_env_code, prdM, kPara_Name, LOO_pdf_file, trait=trait);
    }
    
    # output the auto-chosen optimal parameter and window for each trait
    if (incl.precip=="no.precip") {
      fwrite(do.call("rbind", output.results), paste0(exp_dir, "CERES.auto.optimal.parameters.whole_pop.no_precip.txt"), sep="\t")
    } else {
      fwrite(do.call("rbind", output.results), paste0(exp_dir, "CERES.auto.optimal.parameters.whole_pop.txt"), sep="\t")
    }
    

#  FR-gBLUP 1->3 --------------------------------------------------
    # Goal: 1 ->3 prediction using windows from each training set
    # So:
    #  training set = randomly-selected 50% of genotypes from each family (all environments)
    #  testing set = the other 50% of genotypes 
    # 1. Get optimal window from CERIS, 
    # 2. Use asreml to get slope and intercept parameters for ALL lines (training and testing),
    #      which will give us the final pheno
    # 4. Repeat 50x
    
    source("subfunctions/genomic_prediction_functions.R")
    
    # This is designed for 2 folds-- split 50/50 training/testing
    nfolds <- 2
    rep.num <- c(1:10) # what rep to run -- eventually, want 1:50
    CERIS.done <- TRUE # use to determine if CERIS already run to prevent re-running it
    
    # make the folders for the .pop version (equivalent to .v4 in rrBLUP)
    exp_prepop_dir <- paste0(exp_dir, "FR.TEUG.1to3.pop/"); if (!dir.exists(exp_prepop_dir))  { dir.create(exp_prepop_dir, recursive=T)};
    exp_pop_dir <- paste0(exp_prepop_dir, "whole_pop/"); if (!dir.exists(exp_pop_dir))  { dir.create(exp_pop_dir, recursive = T)};
    if(incl.precip=="no.precip") {exp_pop_dir <- paste0(exp_prepop_dir, "whole_pop_no_precip", "/")} # make no precip directory
    if (!dir.exists(exp_pop_dir))  { dir.create(exp_pop_dir)};# make directory
    if(!dir.exists(paste0(exp_pop_dir, "output"))) {dir.create(paste0(exp_pop_dir, "output")) }
    
    #### Do you want to leave one environment out while calculating correlations?
    LOO_env <- 0; ### leave as 0 here because I already manually left one out
    
    # set param for graphing:
    col_wdw <- 25;
    col_palette <- colorspace::diverge_hcl(col_wdw + 1, h = c(260, 0), c = 100, l = c(50, 90), power = 1) # this function makes a diverging color palette
    
    print(paste0("FR-gBLUP 1 to 3, with parameters: line.outlier.filter=", line.outlier.filter,
                 ", filter.less.than=",filter.less.than,", min.window.size=", min.window.size, ", last.FT=",last.FT,
                 ", last.harvest=", last.harvest, ", max.window.size=", max.window))
    
    # First time only: make grm files (sparse matrix of lower triangle of kinship):
    # read in kinship:
    # kinship was calculated in GAPIT (maf filter 0.01) based on public genotype data from 
    # ZeaGBSv27_publicSamples_imputedV5_AGPv4-181023.vcf.gz (https://datacommons.cyverse.org/browse/iplant/home/shared/panzea/genotypes/GBS/v27)
    
    # First, need to re-combine file parts in command line with:
    # cat NAM.GAPIT.Kin.VanRaden.csv.tar.gz.part* > NAM.GAPIT.Kin.VanRaden.csv.tar.gz
    # tar -xzvf NAM.GAPIT.Kin.VanRaden.csv.tar.gz
    
    preK <- fread("data/NAM.GAPIT.Kin.VanRaden.csv", 
                  data.table=F, stringsAsFactors = F) %>%
      rename(taxa=Taxa)
    for(p in unique(pop.table$number)) {
      if(p==17) {next} # pop 17 has no genotype data
    # prove it
        # temp <- exp_traits %>% filter(pop_code==17)
        # sum(temp$ril_code %in% taxa)
      pop.taxa <- unique(fr.exp_traits %>% filter(pop_code==p) %>% select(ril_code))
      popK <- preK[,-1][preK$taxa %in% pop.taxa$ril_code, preK$taxa %in% pop.taxa$ril_code] 
      stopifnot(dim(popK)==nrow(pop.taxa))

      sa.K <- matrix(ncol=3, nrow=(.5*nrow(popK)^2)+nrow(popK))
      m <- 1 # make iterator (row in new sparse matrix)
      for (i in 1:nrow(popK)) { # go through all of K
        for(j in 1:ncol(popK)) {
          if(isZero(popK[i,j])) {next}
          if(j>i) {next}
          # get the info for the sparse matrix:
          sa.K[m,1] <- i
          sa.K[m,2] <- j
          sa.K[m,3] <- popK[i,j]
          if(is.na(sa.K[m,3])) {break}
          m <- m+1
        }
        print(i)
      }
      sa.K <- sa.K[rowSums(is.na(sa.K)) != ncol(sa.K),]
      fwrite(sa.K,paste0("asreml.pop",p,".grm"), col.names =F, sep="\t")
      rm(sa.K, popK, pop.taxa)
    }
    
    # run CERIS first:
if(!CERIS.done) {
  # read in geno with ril names in order:
  geno <- fread("data/NAM.matchY.txt")
  taxa <- geno$Taxa
  rm(geno)
  
  
  lInd <- which(colnames(exp_traits) == 'ril_code'); 
  eInd <- which(colnames(exp_traits) == 'env_code');
  exp_prepop_dir <- paste0(exp_dir, "TEUG.1to3.v4/"); if (!dir.exists(exp_prepop_dir))  { dir.create(exp_prepop_dir, recursive=T)};
  exp_pop_dir <- paste0(exp_prepop_dir, "whole_pop/"); if (!dir.exists(exp_pop_dir))  { dir.create(exp_pop_dir, recursive = T)};
  if(incl.precip=="no.precip") {exp_pop_dir <- paste0(exp_prepop_dir, "whole_pop_no_precip", "/")} # make no precip directory
  if (!dir.exists(exp_pop_dir))  { dir.create(exp_pop_dir)};# make directory
  
  #### Do you want to leave one environment out while calculating correlations?
  LOO_env <- 0; ### leave as 0 here because I already manually left one out
  
  # set param for graphing:
  col_wdw <- 25;
  col_palette <- diverge_hcl(col_wdw + 1, h = c(260, 0), c = 100, l = c(50, 90), power = 1) # this function makes a diverging color palette
  
  print(paste0("1 to 3 for the whole population, rrBLUP within families, with parameters: line.outlier.filter=", line.outlier.filter,
               ", filter.less.than=",filter.less.than,", min.window.size=", min.window.size, ", last.FT=",last.FT,
               ", last.harvest=", last.harvest, ", max.window.size=", max.window))
  
  for (i in rep.num) {
    # first, make folds:
    
    # make random folds for this rep, across all subpops:
    taxa.table <- as.tibble(taxa)
    colnames(taxa.table) <- c("value")
    taxa.table$obs.id <- 1:length(taxa)
    taxa.table <- left_join(taxa.table, exp_traits, by=c("value"="ril_code")) %>%
      select(value, obs.id, pop_code) %>%
      distinct
    #make the folds--
    # for any number of folds:
    myF <- vector("list", nfolds)
    # preF <- vector("list", nfolds)
    for(subpop in unique(exp_traits$pop_code)) {
      # get the obs id's for this pop:
      pop.ids <- taxa.table %>%
        filter(pop_code==subpop) %>%
        select(obs.id)
      
      preF <- fold.maker.2(pop.ids$obs.id, nfolds)
      for(j in c(1:nfolds)) {
        myF[[j]] <- c(myF[[j]], preF[[j]])
      }
    }
    
    output.results <- vector("list", 
                             length(c(FT.traits, harvest.traits))*nrow(pop.table))
    n <- 1  
    # get optimal windows and slope + intercept:
    for(trait in c(FT.traits, harvest.traits)) {
      tInd <- which(colnames(exp_traits) == trait);
      
      for(fold in c(1:nfolds)) {
        # pull training data:
        exp_trait_trn <- exp_traits[!(exp_traits$ril_code %in% taxa[myF[[fold]]]),c(lInd, eInd, tInd)]
        colnames(exp_trait_trn)[ncol(exp_trait_trn)] <- 'Yobs'; # rename phenotype data column as Yobs
        # and pull the training data as a vector only:
        current.data <- exp_trait_trn[,c(colnames(exp_trait_trn)=="Yobs")]
        
        # remove missing values
        exp_trait_trn <- exp_trait_trn[!is.na(exp_trait_trn$Yobs),];
        # How many observations are there per pedigree?
        obs_lne_n <- aggregate(Yobs ~ ril_code, data = exp_trait_trn, length);
        # find lines with only one or two observations and remove them:
        line_outlier <- obs_lne_n$ril_code[obs_lne_n$Yobs < line.outlier.filter]
        exp_trait_trn <- exp_trait_trn[!(exp_trait_trn$ril_code %in% line_outlier),]
        line_codes <- unique(exp_trait_trn$ril_code); # pull unique line codes
        # also pull testing data:
        exp_trait_test <- exp_traits[(exp_traits$ril_code %in% taxa[myF[[fold]]]),c(lInd, eInd, tInd)]; ### make sure the colname order is line env trait
        exp_trait_test <- exp_trait_test[!(exp_trait_test$ril_code %in% line_outlier),] # also remove the outlier lines from your testing set to not mess up the training means
        colnames(exp_trait_test)[ncol(exp_trait_test)] <- 'Yobs'; # rename phenotype data column as Yobs
        
        
        # find mean value for each environment:
        env_mean_trait_0_trn <- na.omit(aggregate(x = exp_trait_trn$Yobs, by = list(env_code = exp_trait_trn$env_code),
                                                  mean, na.rm = T));
        colnames(env_mean_trait_0_trn)[2] <- 'meanY';
        env_mean_trait_trn <- env_mean_trait_0_trn[order(env_mean_trait_0_trn$meanY),]; # sort by mean phenotype value
        
        env_mean_trait_0_test <- na.omit(aggregate(x = exp_trait_test$Yobs, by = list(env_code = exp_trait_test$env_code),
                                                   mean, na.rm = T));
        colnames(env_mean_trait_0_test)[2] <- 'meanY';
        env_mean_trait_test <- env_mean_trait_0_test
        
        # Perform exhaustive search in training set from WHOLE NAM pop, with 1 environment left out, for optimal window and environmental parameter to use
        Exhaustive_search_full(env_mean_trait_trn, PTT_PTR, searching_daps, exp_pop_dir,
                               current.data,
                               trait,
                               searching_daps, searching_daps, LOO_env, min.window.size = min.window.size)
        
        # need to read in default output and rename them:
        pop_cor_file.old <- paste(exp_pop_dir, trait, '_', nrow(env_mean_trait_trn), 'Envs_PTTPTR_', LOO_env, 'LOO_cor_whole_pop', sep = '');
        pop_pval_file.old <- paste(exp_pop_dir, trait, '_', nrow(env_mean_trait_trn), 'Envs_PTTPTR_', LOO_env, 'LOO_P_whole_pop', sep = '');
        
        pop_cor <- fread(pop_cor_file.old)
        pop_pval <- fread(pop_pval_file.old) 
        
        pop_cor_file <- paste(exp_pop_dir, trait, '_TEUG.1to3.fold', fold, ".rep", i, '_cor_whole_pop', sep = '')
        fwrite(pop_cor, pop_cor_file, sep="\t")
        
        pop_pval_file <-paste(exp_pop_dir, trait, '_TEUG.1to3.fold.', fold, ".rep", i, '_P_whole_pop', sep = '')
        fwrite(pop_pval, pop_pval_file, sep="\t")
        
        # tidy up:
        file.remove(pop_cor_file.old, pop_pval_file.old) 
        rm(pop_cor, pop_pval)
        
        # filter windows by established criteria:
        search.results <- fread(pop_cor_file)%>%
          select(-starts_with("nR")) # don't need the negative version
        search.results <- search.results %>%
          tidyr::gather(key="Parameter", value="Corr", -Day_x, -Day_y, -window)
        
        search.results <- search.results %>%
          filter(window >= min.window.size) %>% # i < j-5
          filter(window <= max.window)
        
        if(trait %in% FT.traits) {
          search.results <- search.results %>%
            filter(Day_y <= last.FT) # last possible day for FT traits = 55
        } else if (trait %in% harvest.traits) {
          search.results <- search.results %>%
            filter(Day_y <= last.harvest) # last possible day for harvest traits
        } else (warning("trait not in FT or harvest traits"))
        
        # Only now choose the remaining window with the best corr:
        search.results <- search.results <- search.results %>%
          filter(abs(Corr)==max(abs(Corr), na.rm = T))
        
        # if there are ties, just pick the top one in the table.
        search.results <- search.results[1,]%>%
          mutate(trait=trait)
        
        # output results:
        output.results[[n]] <- cbind(rep=i, fold=fold, search.results)
        print(output.results[[n]])
        
        maxR_dap1 <- search.results$Day_x;
        maxR_dap2 <- search.results$Day_y;
        kPara_Name <- search.results$Parameter;
        kPara_Name <- gsub("R_", "", kPara_Name)
        PTT_PTR_ind <-  which(colnames(PTT_PTR) == kPara_Name); ## DL -> 5, GDD -> 6; PTT -> 7; PTR -> 8; PTD1 -> 8, PTD2 -> 9, PTS -> 10
        
        # Make output file containing the slopes and intercepts from the linear models for each line across environments
        Slope_Intercept(maxR_dap1, maxR_dap2, env_mean_trait_trn, PTT_PTR, exp_trait_trn, line_codes, exp_pop_dir, PTT_PTR_ind, 
                        filter.less.than); # do filtering for number of observations here
        
        # read file in and rename it:
        slope_file.old <-paste0(exp_pop_dir,"Intcp_Slope")
        pop_slope <- fread(slope_file.old)
        slope_file <- paste0(exp_pop_dir, trait, "_TEUG.1to3.fold", fold, ".rep", i, "_Intcp_Slope")
        fwrite(pop_slope,slope_file, sep="\t")
        
        # tidy up:
        file.remove(slope_file.old) 
        rm(pop_slope)
        rm(exp_trait_trn, exp_trait_test)
        
        n <- n+1 # increase iterator
        
      }
    }
    # output windows:
    opt.windows <- do.call("rbind", output.results)
    fwrite(opt.windows, paste0(exp_pop_dir, "TEUG.1to3.auto.optimal.parameters.whole_pop.rep", i, ".csv"))
    
  } 
  
}
    
    # Now prep for and run FR-gBLUP in ASREML
    
    # make the folders for the .pop version (equivalent to .v4 in rrBLUP)
    exp_prepop_dir <- paste0(exp_dir, "FR.TEUG.1to3.pop/"); if (!dir.exists(exp_prepop_dir))  { dir.create(exp_prepop_dir, recursive=T)};
    exp_pop_dir <- paste0(exp_prepop_dir, "whole_pop/"); if (!dir.exists(exp_pop_dir))  { dir.create(exp_pop_dir, recursive = T)};
    if(incl.precip=="no.precip") {exp_pop_dir <- paste0(exp_prepop_dir, "whole_pop_no_precip", "/")} # make no precip directory
    if (!dir.exists(exp_pop_dir))  { dir.create(exp_pop_dir)};# make directory
    if(!dir.exists(paste0(exp_pop_dir, "output"))) {dir.create(paste0(exp_pop_dir, "output")) }
    
    for (i in rep.num){
      if(CERIS.done) { # if CERIS-JGRA already run for 1->3, need to read in: optimal window, which line in which fold
        # first, read in folds:
        myF <-vector("list", nfolds)
        for (j in c(1:nfolds)) {
          fold.long <- vector()
          for(my.trait in c(FT.traits, harvest.traits)){ # need to loop through traits because some had missing data
            fold <- fread(paste0(exp_dir, "TEUG.1to3.v4/whole_pop/",my.trait,"_TEUG.1to3.fold",j,".rep", i, "_Intcp_Slope")) %>%
              select(line_codes)
            fold.long <- unique(c(fold.long, fold$line_codes)) 
          }
          myF[[j]] <- which(taxa %in% fold.long)
        }
        stopifnot(sum(taxa[setdiff(1:length(taxa), c(myF[[1]], myF[[2]]))] %in% exp_traits$ril_code)==0) # check that everything with a phenotype is in here
        stopifnot(length(intersect(myF[[1]], myF[[2]]))==0) # check that no lines are in both folds
        
        
      }  
      # Now, read in optimal windows:
      opt.windows <- fread(paste0(exp_dir, "TEUG.1to3.v4/whole_pop/TEUG.1to3.auto.optimal.parameters.whole_pop.rep", i, ".csv"))
      if(incl.precip=="no.precip") {    
        opt.windows <- fread(paste0(exp_dir, "TEUG.1to3.v4/whole_pop_no_precip/TEUG.1to3.auto.optimal.parameters.whole_pop.rep", i, ".csv"))
      }
      
      # prep the input files for asreml:
      
      # get the EC (environmental covariate) value for each trait x fold combination:
      EC.table <- tibble(rep=NA, fold=NA, trait=NA, env_code=NA, EC=NA)
      for(my.trait in c(FT.traits, harvest.traits)) {
        for(my.fold in 1:nfolds) {
          current.window <- opt.windows %>% filter(fold==my.fold, trait==my.trait)
          maxR_dap1 <- current.window$Day_x;
          maxR_dap2 <- current.window$Day_y;
          kPara_Name <- current.window$Parameter;
          kPara_Name <- gsub("R_", "", kPara_Name)
          
          current.EC <- PTT_PTR %>% 
            group_by(env_code) %>% 
            mutate(day.num=1:150) %>% 
            filter(day.num %in% c(maxR_dap1:maxR_dap2)) %>%
            select(env_code, kPara_Name, day.num) %>%
            mutate(EC=mean(get(kPara_Name))) %>%
            mutate(trait=my.trait,
                   rep=i,
                   fold=my.fold
            ) %>%
            select(rep, fold, trait, env_code, EC) %>%distinct
          current.EC$EC <- scale(current.EC$EC,center = T,scale = T) # scale the environmental covariate as required for FR-gBLUP
          
          EC.table <- rbind(EC.table, current.EC)
          rm(current.EC)
        }
      }
      EC.table <- EC.table %>% filter(!is.na(rep))
      wide.EC <- EC.table %>%
        group_by(trait) %>%
        mutate(row = row_number()) %>%
        pivot_wider(names_from=trait, values_from=EC) %>%
        select(-row) %>%
        rename_with(.cols=-c(rep, fold, env_code), ~ paste0(.x, ".EC"))
      
      # add the EC to the phenotype data:
      
      # make asreml input files, with different training sets set to NAs:
        for(my.pop in unique(fr.exp_traits$pop_code)) {
          if(is.na(my.pop)) {next}
          for(my.fold in 1:nfolds) {
            pheno.fold <- fr.exp_traits %>%
              rename(Genotype=ril_code) %>%
              filter(pop_code ==my.pop) %>%
              mutate(
                ASI=ifelse(Genotype %in% taxa[myF[[my.fold]]], NA, ASI),
                CD=ifelse(Genotype %in% taxa[myF[[my.fold]]], NA, CD),
                CL=ifelse(Genotype %in% taxa[myF[[my.fold]]], NA, CL),
                CM=ifelse(Genotype %in% taxa[myF[[my.fold]]], NA, CM),
                DTA=ifelse(Genotype %in% taxa[myF[[my.fold]]], NA, DTA),
                DTS=ifelse(Genotype %in% taxa[myF[[my.fold]]], NA, DTS),
                EH=ifelse(Genotype %in% taxa[myF[[my.fold]]], NA, EH),
                EM=ifelse(Genotype %in% taxa[myF[[my.fold]]], NA, EM),
                ERN=ifelse(Genotype %in% taxa[myF[[my.fold]]], NA, ERN),
                KN=ifelse(Genotype %in% taxa[myF[[my.fold]]], NA, KN),
                KPR=ifelse(Genotype %in% taxa[myF[[my.fold]]], NA, KPR),
                TKW=ifelse(Genotype %in% taxa[myF[[my.fold]]], NA, TKW),
                LL=ifelse(Genotype %in% taxa[myF[[my.fold]]], NA, LL),
                LW=ifelse(Genotype %in% taxa[myF[[my.fold]]], NA, LW),
                PH=ifelse(Genotype %in% taxa[myF[[my.fold]]], NA, PH),
                T20KW=ifelse(Genotype %in% taxa[myF[[my.fold]]], NA, T20KW),
                TPBN=ifelse(Genotype %in% taxa[myF[[my.fold]]], NA, TPBN),
                TL=ifelse(Genotype %in% taxa[myF[[my.fold]]], NA, TL),
                ULA=ifelse(Genotype %in% taxa[myF[[my.fold]]], NA, ULA)) %>%
              mutate(rep=i) %>%
              mutate(fold=my.fold) %>%
              select(Genotype, env_code, pop_code, rep, fold, everything(), -Entry_id)
            for(my.trait in c(FT.traits, harvest.traits)) { # filter out lines without enough observations
              temp.pheno.fold <- pheno.fold[,c(1,which(colnames(pheno.fold)==my.trait))]
              colnames(temp.pheno.fold)[2] <- "Yobs" 
              obs_lne_n <- aggregate(Yobs~Genotype, data=temp.pheno.fold, length)
              line_outlier <- obs_lne_n$Genotype[obs_lne_n$Yobs < (filter.less.than)] # want to make sure have AT LEAST filter.less.than-1 training obs, so need at least filter.less.than total obs
              
              temp.pheno.fold$Yobs[temp.pheno.fold$Genotype %in% line_outlier] <- NA
              pheno.fold[,which(colnames(pheno.fold)==my.trait)] <- temp.pheno.fold$Yobs
            }
            pheno.fold <- left_join(pheno.fold, wide.EC)
            colnames(pheno.fold) <- gsub("\\.", "_", colnames(pheno.fold))
            
            fwrite(pheno.fold, paste0(exp_pop_dir, paste0("asreml_pop",my.pop, "_fold", my.fold, "_rep", i, ".csv")))
            rm(pheno.fold)
          }
          
        }

    }
    

    # Run ASREML:
    # file is asreml_1to3.as
    
    # general ASREML code is:
    # ../../../asreml.pop$J.grm !NSD 
    # asreml_pop$J_fold$K_rep$1.csv !SKIP 1 !MAXIT 10000 # should be 10000 # run job as: -R asreml_pop_rep.as repnum
    # !ROWFAC env_code !COLFAC Genotype
    # # !AISINGULARITIES
    # $I !SIGMAP ~ mu $I_EC mv !r  str(Genotype Genotype.$I_EC us(2).grm(Genotype))    
    # residual idv(env_code).id(Genotype)
    
    # Compile ASREML results - - -
    
      n <- 1
      result.id <- vector()
      for(my.trait in c(FT.traits, harvest.traits)){
        for(my.pop in unique(fr.exp_traits$pop_code)) {
          result.id[n] <- paste(my.trait, my.pop, sep="_")
          print(paste(my.trait, my.pop, sep=";"))
          n <- n+1
        }
        
      }

      sln <- fread(paste0(temp.dir, "asreml_full.pop.sln"), data.table=F) %>%
        mutate(Set = result.id[cumsum(Model_Term == "Model_Term")+1]) %>% # set the cycle name used for these table rows
        filter(Model_Term != "Model_Term") %>%
        mutate(tempcol=str_split(Set, '_')) %>% # split the string
        rowwise() %>%
        mutate(trait=unlist(tempcol)[1], pop=unlist(tempcol)[2]) %>%
        dplyr::select(-tempcol) %>%
        ungroup() 
      
      thing <- sln %>%
        filter(str_detect(Model_Term, '_EC')) %>%
        filter(!(str_detect(Model_Term, "Genotype"))) %>%
        mutate(tempcol=str_split(Model_Term, '_')) %>% # split the string
        rowwise() %>%
        mutate(check.trait=unlist(tempcol)[1]) %>%
        dplyr::select(-tempcol) %>%
        ungroup()
      stopifnot(all.equal(thing$trait, thing$check.trait))
      
      sln.full <- sln
      sln.full$pop <- as.numeric(sln.full$pop)
      sln.full$Effect <- as.numeric(sln.full$Effect)
      
      # calculate predictions:
      for (my.trait in c(FT.traits, harvest.traits)){
        pre.combine <- vector("list", length(unique(fr.exp_traits$pop_code)))
        n <- 1
        
        for (my.pop in sort(unique(fr.exp_traits$pop_code))){
          if (is.na(my.pop)) {next}
          
          # pull current data from sln file
          current.sln <- sln.full %>%
            filter(Set %in% sln.full$Set[which(sln.full$Model_Term==paste0(my.trait,"_EC"))]) %>%
            filter(pop==my.pop) %>%
            distinct
          if(nrow(current.sln)==0) {next} # skip if no data
          
          # pull input data for this pop
          current.pop <- fread(paste0(temp.dir, "asreml_pop",my.pop,".csv"), data.table=F) %>%
            dplyr::select(Genotype, env_code, paste0(my.trait), paste0(my.trait, "_EC"))%>%
            # filter(is.na(UQ(rlang::sym(my.trait)))) %>% # commenting this out because need to predict ALL of them, not just the unknowns
            # filter(env_code==my.env) %>% # only need the testing environ
            distinct
          
          current.pop$y.hat <- rep(NA, nrow(current.pop))
          current.pop$intcp.hat <- rep(NA, nrow(current.pop))
          current.pop$slope.hat <- rep(NA, nrow(current.pop))
          
          
          for (j in 1:nrow(current.pop)) {
            # calculate the prediction for each line
            current.genotype <- current.pop$Genotype[j]
            current.env <- current.pop$env_code[j]
            current.EC <- current.pop[j,4][[1]] # this only works because I select the column order above!
            current.intcp <- current.sln$Effect[current.sln$Level==current.genotype]
            current.slope <- current.sln$Effect[current.sln$Level==paste0(current.genotype, ".001")]
            
            current.pop$y.hat[j] <- current.sln$Effect[current.sln$Model_Term=="mu"]+ #mu
              current.sln$Effect[current.sln$Model_Term==paste0(my.trait, "_EC")]*current.EC + # + BXe
              current.intcp + # + uG
              current.slope*current.EC # + bG*xE
            current.pop$intcp.hat[j] <- current.intcp
            current.pop$slope.hat[j] <- current.slope
          }
          pre.combine[[n]] <- current.pop
          print(paste(my.trait, " pop ", my.pop, ": cor ", cor(pre.combine[[n]][,7], pre.combine[[n]]$y.hat, use="complete")))
          
          rm(current.pop)
          n <- n+1
          
        }
        
        combined <- do.call(what = "rbind", pre.combine) %>%
          mutate(obs.id=NA) %>%
          select(env_code, ril_code=Genotype, paste0(my.trait), obs.id, intcp.hat, slope.hat, y.hat)
        # plot(combined$PH, combined$y.hat)
        # cor(combined$PH, combined$y.hat, use="complete")
        
        kPara_Name <- "Env Param"
        trait <- my.trait
        
        Plot_TEUG_result(obs_prd_file = combined, all_env_codes, kPara_Name, trait=trait, 
                         forecast_png_file = paste0(temp.dir, "output/FR_", trait, "_wholepop_obs.prd.png"),
                         path = F, save.output = T)
        fwrite(combined, paste0(temp.dir, "output/FR_", trait, "_wholepop_obs.prd.txt"))
        
        
      }
    
      
      # FR-gBLUP 1->2 -----------------------------------------------------------
      
      # training set = n-1 environments (all genotypes)
      # testing set = the other environment (all genotypes, predict families one at a time)
      
      # use training set to find optimal window and to get slope and intercept
      # parameters for each genotype in CERIS. Then, use that slope and intercept, plus
      # the environmental index of the other environment, to get predicted pheno of
      # testing set
      
      # Repeat until each environment has been in the testing set
      #set current trait: 
      trait <- "DTA" # provide desired trait for CERIS; can also make loop
      
      # run CERIS to get optimal windows
      temp.dir <- paste0(exp_dir, "UETG.1to2/")
      if (!dir.exists(temp.dir))  { dir.create(temp.dir)};
      exp_trait_dir <- paste0(temp.dir, trait, '/', sep = ''); if (!dir.exists(exp_trait_dir))  { dir.create(exp_trait_dir)};
      exp_pop_dir <- paste0(exp_trait_dir, "whole_pop", "/"); 
      if(incl.precip=="no.precip") {exp_pop_dir <- paste0(exp_trait_dir, "whole_pop_no_precip", "/")} # make no precip directory
      if (!dir.exists(exp_pop_dir))  { dir.create(exp_pop_dir)};# make directory
      
      print(paste0("1 to 2 for ", trait, ", with parameters: line.outlier.filter=", line.outlier.filter,
                   ", filter.less.than=",filter.less.than,", min.window.size=", min.window.size, ", last.FT=",last.FT,
                   ", last.harvest=", last.harvest, ", max.window.size=", max.window))
      
      # set indices for which column name is which
      lInd <- which(colnames(exp_traits) == 'ril_code'); # assumint that each ril is essentially the "line" from before
      eInd <- which(colnames(exp_traits) == 'env_code');
      tInd <- which(colnames(exp_traits) == trait);
      
      output.results <- vector("list", length(all_env_codes))
      UETG.results <- vector("list", length(all_env_codes))
      for (env_num in seq_along(all_env_codes)) {
        # for (env_num in 1:2) {
        current.env <- all_env_codes[env_num] # set current environment
        
        # skip if testing data is blank:
        if (sum(!(is.na(exp_traits[exp_traits$env_code==current.env,c(tInd)])))==0) {next}
        
        # pull training data
        exp_trait_trn <- exp_traits[exp_traits$env_code!=current.env,c(lInd, eInd, tInd)]; ### make sure the colname order is line env trait
        colnames(exp_trait_trn)[ncol(exp_trait_trn)] <- 'Yobs'; # rename phenotype data column as Yobs
        
        
        # remove missing values--negative is ok though
        exp_trait_trn <- exp_trait_trn[!is.na(exp_trait_trn$Yobs),];
        # exp_trait <- exp_trait[exp_trait$Yobs > 0,]
        
        # How many observations are there per pedigree?
        obs_lne_n <- aggregate(Yobs ~ ril_code, data = exp_trait_trn, length);
        # hist(obs_lne_n$Yobs)
        
        # find lines with only one or two observations and remove them:
        line_outlier <- obs_lne_n$ril_code[obs_lne_n$Yobs < line.outlier.filter]
        exp_trait_trn <- exp_trait_trn[!(exp_trait_trn$ril_code %in% line_outlier),]
        
        line_codes <- unique(exp_trait_trn$ril_code); # pull unique line codes
        
        # also pull testing data:
        exp_trait_test <- exp_traits[exp_traits$env_code==current.env,c(lInd, eInd, tInd)]; ### make sure the colname order is line env trait
        exp_trait_test <- exp_trait_test[!(exp_trait_test$ril_code %in% line_outlier),] # also remove the outlier lines from your testing set to not mess up the training means
        colnames(exp_trait_test)[ncol(exp_trait_test)] <- 'Yobs'; # rename phenotype data column as Yobs
        
        # and pull the training data as a vector only:
        current.data <- exp_trait_trn[,c(colnames(exp_trait_trn)=="Yobs")]
        
        # find mean value for each environment:
        env_mean_trait_0_trn <- na.omit(aggregate(x = exp_trait_trn$Yobs, by = list(env_code = exp_trait_trn$env_code),
                                                  mean, na.rm = T));
        colnames(env_mean_trait_0_trn)[2] <- 'meanY';
        env_mean_trait_trn <- env_mean_trait_0_trn[order(env_mean_trait_0_trn$meanY),]; # sort by mean phenotype value
        
        env_mean_trait_0_test <- na.omit(aggregate(x = exp_trait_test$Yobs, by = list(env_code = exp_trait_test$env_code),
                                                   mean, na.rm = T));
        colnames(env_mean_trait_0_test)[2] <- 'meanY';
        env_mean_trait_test <- env_mean_trait_0_test
        
        #### Do you want to leave one environment out while calculating correlations?
        LOO_env <- 0; ### leave as 0 here because I already manually left one out
        
        # set param for graphing:
        col_wdw <- 25;
        col_palette <- diverge_hcl(col_wdw + 1, h = c(260, 0), c = 100, l = c(50, 90), power = 1) # this function makes a diverging color palette
        
        # Perform exhaustive search in WHOLE NAM pop, with 1 environment left out, for optimal window and environmental parameter to use
        Exhaustive_search_full(env_mean_trait_trn, PTT_PTR, searching_daps, exp_pop_dir,
                               current.data,
                               trait,
                               searching_daps, searching_daps, LOO_env, min.window.size = min.window.size)
        
        # need to read in default output and rename them:
        pop_cor_file.old <- paste(exp_pop_dir, trait, '_', nrow(env_mean_trait_trn), 'Envs_PTTPTR_', LOO_env, 'LOO_cor_whole_pop', sep = '');
        pop_pval_file.old <- paste(exp_pop_dir, trait, '_', nrow(env_mean_trait_trn), 'Envs_PTTPTR_', LOO_env, 'LOO_P_whole_pop', sep = '');
        
        pop_cor <- fread(pop_cor_file.old)
        pop_pval <- fread(pop_pval_file.old) 
        
        pop_cor_file <- paste(exp_pop_dir, trait, '_UETG.1to2.whole.no', current.env, '_cor_whole_pop', sep = '')
        fwrite(pop_cor, pop_cor_file, sep="\t")
        
        pop_pval_file <- paste(exp_pop_dir, trait, '_UETG.1to2.whole.no', current.env, '_P_whole_pop', sep = '')
        fwrite(pop_pval, pop_pval_file, sep="\t")
        
        # tidy up:
        file.remove(pop_cor_file.old, pop_pval_file.old) 
        rm(pop_cor, pop_pval)
        
        # filter windows by established criteria:
        search.results <- fread(pop_cor_file)%>%
          select(-starts_with("nR")) # don't need the negative version
        search.results <- search.results %>%
          tidyr::gather(key="Parameter", value="Corr", -Day_x, -Day_y, -window)
        
        search.results <- search.results %>%
          filter(window >= min.window.size) %>% # i < j-5
          filter(window <= max.window)
        
        if(trait %in% FT.traits) {
          search.results <- search.results %>%
            filter(Day_y <= last.FT) # last possible day for FT traits = 55
        } else if (trait %in% harvest.traits) {
          search.results <- search.results %>%
            filter(Day_y <= last.harvest) # last possible day for harvest traits
        } else (warning("trait not in FT or harvest traits"))
        
        # Only now choose the remaining window with the best corr:
        search.results <- search.results <- search.results %>%
          filter(abs(Corr)==max(abs(Corr), na.rm = T))
        
        # if there are ties, just pick the top one in the table.
        search.results <- search.results[1,]%>%
          mutate(trait=trait)
        output.results[[env_num]] <- cbind(testing.env=current.env, search.results)
        print(output.results[[env_num]])
        
        maxR_dap1 <- search.results$Day_x;
        maxR_dap2 <- search.results$Day_y;
        kPara_Name <- search.results$Parameter;
        kPara_Name <- gsub("R_", "", kPara_Name)
        PTT_PTR_ind <-  which(colnames(PTT_PTR) == kPara_Name); ## DL -> 5, GDD -> 6; PTT -> 7; PTR -> 8; PTD1 -> 8, PTD2 -> 9, PTS -> 10
        
        # Make output file containing the slopes and intercepts from the linear models for each line across environments
        Slope_Intercept(maxR_dap1, maxR_dap2, env_mean_trait_trn, PTT_PTR, 
                        exp_trait_trn, line_codes, exp_pop_dir, PTT_PTR_ind,filter.less.than-1);
        
        
        # read file in and rename it:
        slope_file.old <-paste0(exp_pop_dir,"Intcp_Slope")
        pop_slope <- fread(slope_file.old)
        slope_file <- paste0(exp_pop_dir, trait, "_UETG.1to2.whole.no", current.env, "_Intcp_Slope")
        fwrite(pop_slope,slope_file, sep="\t")
        
        # tidy up:
        file.remove(slope_file.old) 
        rm(pop_slope)
        
        exp_trait <- rbind(exp_trait_trn, exp_trait_test) # combine training and testing data
        
        if(max(obs_lne_n$Yobs, na.rm = T)>= (filter.less.than-1) & sum(!(is.na(exp_trait_test$Yobs)))>=1) { # only run if enough obs:
          # Use slopes and intercepts to predict new environment:
          function.output <- UETG.1to2.function(maxR_dap1, maxR_dap2, PTT_PTR_ind, env_mean_trait_trn, env_mean_trait_test,
                                                PTT_PTR, exp_trait, test_env=current.env, filter.less.than)
          # and save the output:
          prdM <- function.output[[2]]
          UETG.results[[env_num]] <- function.output[[1]]
        }
        
        rm(search.results, exp_trait_test, exp_trait_trn, kPara_Name)
      }
      
      # Now, run ASREML
      # using code asreml_1to2.as
      
      CERIS.done <- TRUE # use to determine if CERIS already run before FRgBLUP
      
      temp.dir <- paste(exp_dir, "FR.UETG.1to2.pop", sep=""); if (!dir.exists(temp.dir))  { dir.create(temp.dir)};
      if(incl.precip=="no.precip") {
        temp.dir <- paste(exp_dir, "FR.UETG.1to2.pop_no_precip", sep=""); if (!dir.exists(temp.dir))  { dir.create(temp.dir)};
      }
      
      print(paste0("FR-gBLUP 1 to 2 for the whole population, with parameters: line.outlier.filter=", line.outlier.filter,
                   ", filter.less.than=",filter.less.than,", min.window.size=", min.window.size, ", last.FT=",last.FT,
                   ", last.harvest=", last.harvest, ", max.window.size=", max.window))
      
      #read in optimal windows:
      window.list <- vector("list", length=length(c(harvest.traits, FT.traits)))
      for (j in 1:length(c(harvest.traits, FT.traits))) {
        my.trait <- c(harvest.traits, FT.traits)[j]
        if(incl.precip=="no.precip") {
          window.list[[j]] <- fread(paste0(exp_dir, "UETG.1to2/", my.trait, "/whole_pop_no_precip/", my.trait, "_UETG.1to2.auto.optimal.parameters.whole_pop.txt"))
        } else{
          window.list[[j]] <- fread(paste0(exp_dir, "UETG.1to2/", my.trait, "/whole_pop/", my.trait, "_UETG.1to2.auto.optimal.parameters.whole_pop.txt"))
        }
      }
      opt.windows <- do.call("rbind", window.list)
      
      # prep the input files for asreml:
      
      # get the EC (environmental covariate) value for each trait x left-out env combination:
      EC.table <- tibble(trait=NA, env_code=NA, testing.env=NA, EC=NA)
      
      for(my.trait in c(FT.traits, harvest.traits)) {
        for(my.env in env_meta_info_0$env_code) {
          current.window <- opt.windows %>% filter(testing.env==my.env, trait==my.trait)
          if(nrow(current.window)==0) {
            print(paste0("No window found for ", my.trait, " in training env ", my.env))
            next
          }
          
          maxR_dap1 <- current.window$Day_x;
          maxR_dap2 <- current.window$Day_y;
          kPara_Name <- current.window$Parameter;
          kPara_Name <- gsub("R_", "", kPara_Name)
          
          current.EC <- PTT_PTR %>% 
            group_by(env_code) %>% 
            mutate(day.num=1:150) %>% 
            filter(day.num %in% c(maxR_dap1:maxR_dap2)) %>%
            select(env_code, kPara_Name, day.num) %>%
            mutate(EC=mean(get(kPara_Name))) %>%
            mutate(trait=my.trait,
                   testing.env=my.env) %>%
            select(trait, env_code, testing.env, EC) %>%distinct
          current.EC$EC <- scale(current.EC$EC,center = T,scale = T) # scale the environmental covariate as required for FR-gBLUP
          
          EC.table <- rbind(EC.table, current.EC)
          rm(current.EC)
        }
      }
      EC.table <- EC.table %>% filter(!is.na(trait))
      wide.EC <- EC.table %>%
        pivot_wider(names_from=trait, values_from=EC)%>%
        rename_with(.cols=-c(env_code, testing.env), ~ paste0(.x, ".EC"))
      
      # add EC to the phenotype data:
      pheno.full <- left_join(fr.exp_traits, wide.EC, by="env_code") %>%
        select(-Entry_id) %>%
        arrange(env_code, ril_code)
      
      # make asreml input files, with different training sets set to NAs:
      
      for(my.env in env_meta_info_0$env_code) {
        for(my.pop in unique(fr.exp_traits$pop_code)){
          pheno.part <- pheno.full %>%
            filter(testing.env==my.env) %>%
            filter(pop_code==my.pop) %>%
            mutate(
              ASI=ifelse(env_code==my.env, NA, ASI),
              CD=ifelse(env_code==my.env, NA, CD),
              CL=ifelse(env_code==my.env, NA, CL),
              CM=ifelse(env_code==my.env, NA, CM),
              DTA=ifelse(env_code==my.env, NA, DTA),
              DTS=ifelse(env_code==my.env, NA, DTS),
              EH=ifelse(env_code==my.env, NA, EH),
              EM=ifelse(env_code==my.env, NA, EM),
              ERN=ifelse(env_code==my.env, NA, ERN),
              KN=ifelse(env_code==my.env, NA, KN),
              KPR=ifelse(env_code==my.env, NA, KPR),
              TKW=ifelse(env_code==my.env, NA, TKW),
              LL=ifelse(env_code==my.env, NA, LL),
              LW=ifelse(env_code==my.env, NA, LW),
              PH=ifelse(env_code==my.env, NA, PH),
              T20KW=ifelse(env_code==my.env, NA, T20KW),
              TPBN=ifelse(env_code==my.env, NA, TPBN),
              TL=ifelse(env_code==my.env, NA, TL),
              ULA=ifelse(env_code==my.env, NA, ULA)) %>%
            select(Genotype=ril_code, env_code, pop_code, testing.env, everything()) 
          for(my.trait in c(FT.traits, harvest.traits)) { # filter out lines without enough observations
            temp.pheno.part <- pheno.part[,c(1,which(colnames(pheno.part)==my.trait))]
            colnames(temp.pheno.part)[2] <- "Yobs" 
            obs_lne_n <- aggregate(Yobs~Genotype, data=temp.pheno.part, length)
            line_outlier <- obs_lne_n$Genotype[obs_lne_n$Yobs < (filter.less.than)] # want to make sure have AT LEAST filter.less.than-1 training obs, so need at least filter.less.than total obs
            
            temp.pheno.part$Yobs[temp.pheno.part$Genotype %in% line_outlier] <- NA
            pheno.part[,which(colnames(pheno.part)==my.trait)] <- temp.pheno.part$Yobs
          }
          
          colnames(pheno.part) <- gsub("\\.", "_", colnames(pheno.part))
          
          fwrite(pheno.part, paste0(temp.dir, paste0("/asreml_1to2_no",my.env,"_pop", my.pop, ".csv")))
          rm(pheno.part)
        }
        
      }
      
      
      # compile FRgBLUP1->2 --- 
      
      # get the CYCLE arguments for asreml_rep.as:
      n <- 1
      result.id <- vector()
      for(my.trait in c(FT.traits, harvest.traits)) {
        for(my.env in env_meta_info_0$env_code) {
          current.window <- opt.windows %>% filter(testing.env==my.env, trait==my.trait)
          if(nrow(current.window)==0) {
            # print(paste0("No window found for ", my.trait, " in training env ", my.env))
            next
          }
          for(my.pop in unique(fr.exp_traits$pop_code)) {
            result.id[n] <- paste(my.trait, my.env, my.pop, sep="_")
            # print(paste(my.trait, my.env, my.pop, sep=";"))
            n <- n+1
            
          }
          
        }
      }
      # fwrite(as.tibble(result.id), "output_1to2_pop.txt")
      
      
      sln <- fread(paste0(temp.dir, "/asreml_1to2.sln"), data.table=F) %>%
        mutate(Set = result.id[cumsum(Model_Term == "Model_Term")+1]) %>% # set the cycle name used for these table rows
        filter(Model_Term != "Model_Term") %>%
        mutate(tempcol=str_split(Set, '_')) %>% # split the string
        rowwise() %>%
        mutate(trait=unlist(tempcol)[1], testing.env=unlist(tempcol)[2], pop=unlist(tempcol)[3]) %>%
        dplyr::select(-tempcol) %>%
        ungroup() 
      
      thing <- sln %>%
        filter(str_detect(Model_Term, '_EC')) %>%
        filter(!(str_detect(Model_Term, "Genotype"))) %>%
        mutate(tempcol=str_split(Model_Term, '_')) %>% # split the string
        rowwise() %>%
        mutate(check.trait=unlist(tempcol)[1]) %>%
        dplyr::select(-tempcol) %>%
        ungroup()
      stopifnot(all.equal(thing$trait, thing$check.trait))
      
      # tidy
      sln.full <- sln
      sln.full$pop <- as.numeric(sln.full$pop)
      sln.full$Effect <- as.numeric(sln.full$Effect)
      
      # calculate predictions:
      
      for (my.trait in c(FT.traits, harvest.traits)){
        pre.combine <- vector("list", length(unique(fr.exp_traits$pop_code))*nfolds*nrow(env_meta_info_0))
        n <- 1
        for(my.env in env_meta_info_0$env_code) {
          for (my.pop in sort(unique(fr.exp_traits$pop_code))){
            if (is.na(my.pop)) {next}
            
            # pull current data from sln file
            current.sln <- sln.full %>%
              filter(Set %in% sln.full$Set[which(sln.full$Model_Term==paste0(my.trait,"_EC"))]) %>%
              filter(pop==my.pop) %>%
              filter(testing.env==my.env) %>%
              distinct
            if(nrow(current.sln)==0) {next} # skip if no data
            
            # pull input data for this pop
            current.pop <- fread(paste0(temp.dir, "/asreml_1to2_no", my.env, "_pop",my.pop,".csv"), data.table=F) %>%
              dplyr::select(Genotype, env_code, paste0(my.trait), paste0(my.trait, "_EC"))%>%
              filter(is.na(UQ(rlang::sym(my.trait)))) %>% # only need to predict the ones that weren't known
              filter(env_code==my.env) %>% # only need the testing environ
              distinct
            
            current.pop$y.hat <- rep(NA, nrow(current.pop))
            current.pop$intcp.hat <- rep(NA, nrow(current.pop))
            current.pop$slope.hat <- rep(NA, nrow(current.pop))
            
            # and we'll need it for the opposite fold too; need to get it from some other testing environ too!
            other.pop <- fread(paste0(temp.dir, "/asreml_1to2_no",
                                      env_meta_info_0$env_code[!(env_meta_info_0$env_code == my.env)][1],
                                      "_pop",my.pop, ".csv"), data.table=F) %>%
              dplyr::select(Genotype, env_code, paste0(my.trait), paste0(my.trait, "_EC")) %>%
              filter(env_code==my.env) # only need the testing environ
            
            
            for (j in 1:nrow(current.pop)) {
              # calculate the prediction for each line
              current.genotype <- current.pop$Genotype[j]
              current.env <- current.pop$env_code[j]
              current.EC <- current.pop[j,4][[1]] # this only works because I select the column order above!
              current.intcp <- current.sln$Effect[current.sln$Level==current.genotype]
              current.slope <- current.sln$Effect[current.sln$Level==paste0(current.genotype, ".001")]
              
              current.pop$y.hat[j] <- current.sln$Effect[current.sln$Model_Term=="mu"]+ #mu
                current.sln$Effect[current.sln$Model_Term==paste0(my.trait, "_EC")]*current.EC + # + BXe
                current.intcp + # + uG
                current.slope*current.EC # + bG*xE
              # sln1b$Effect[sln1b$Level==current.env] # take out + uE
              current.pop$intcp.hat[j] <- current.intcp
              current.pop$slope.hat[j] <- current.slope
            }
            pre.combine[[n]] <- left_join(current.pop %>% select(-paste0(my.trait)),
                                          other.pop %>% select(-paste0(my.trait, "_EC")))
            # print(paste("Fold ", my.fold, ", pop ", my.pop, ": cor ", cor(pre.combine[[n]][,7], pre.combine[[n]]$y.hat, use="complete")))
            
            rm(current.pop)
            n <- n+1
            
          }
        }
        
        combined <- do.call(what = "rbind", pre.combine) %>%
          mutate(obs.id=NA) %>%
          select(env_code, ril_code=Genotype, paste0(my.trait), obs.id, intcp.hat, slope.hat, y.hat)
        
        # plot(combined$EH, combined$y.hat)
        # cor(combined$EH, combined$y.hat, use="complete")
        kPara_Name <- "Env Param"
        trait <- my.trait
        
        Plot_TEUG_result(obs_prd_file = combined, all_env_codes, kPara_Name, trait=trait, 
                         forecast_png_file = paste0(temp.dir, "/output/FR_", trait, "_UETG.1to2.pop_obs.prd.png"),
                         # forecast_png_file = paste0("NAM/FR.TEUG.1to3/no_covar", "FR_", trait, "_rep",i, "_TEUG.1to3_obs.prd.png"),
                         path = F, save.output = T)
        fwrite(combined, paste0(temp.dir, "/output/FR_", trait, "_UETG.1to2.pop_obs.prd.txt"))
      }
      
      
# FR-gBLUP 1->4 -----------------------------------------------------------
      
      
      # Goal: 1 ->4 prediction using windows from each training set
      
      # CERIS training set: n-1 environment, 50% of genotypes from each family
      # testing set: other 1 environment, other 50% of all genotypes
       # 1.	Use CERIS to find optimal window for training set, plus Slope and Intcp
      # 2. Use asreml to predict slope and intercept for other genotypes in testing environ
      # 3. Use environmental indices plus estimated slope and intercept to predict phenotype in testing set.
      # 4. Repeat until each environment has been in the testing set.
      # 5. Repeat 50x for each environment left out.
      
      source("subfunctions/genomic_prediction_functions.R")
      
      # This is designed for 2 folds-- split 50/50 training/testing
      nfolds <- 2
      rep.num <- c(12) # what rep to run -- eventually, want 1:50
      CERIS.done <- TRUE # use to determine if CERIS already run before FRgBLUP

      #### Do you want to leave one environment out while calculating correlations?
      LOO_env <- 0; ### leave as 0 here because I already manually left one out
      
      # set param for graphing:
      col_wdw <- 25;
      col_palette <- colorspace::diverge_hcl(col_wdw + 1, h = c(260, 0), c = 100, l = c(50, 90), power = 1) # this function makes a diverging color palette
      
      # Run CERIS itself- - - -
      # read in geno with ril names in order:
      geno <- fread("data/NAM.matchY.txt")
      taxa <- geno$Taxa
      rm(geno)
      
      lInd <- which(colnames(exp_traits) == 'ril_code'); 
      eInd <- which(colnames(exp_traits) == 'env_code');
      exp_prepop_dir <- paste0(exp_dir, "UEUG.1to4.v4/"); if (!dir.exists(exp_prepop_dir))  { dir.create(exp_prepop_dir, recursive=T)};
      # exp_pop_dir <- paste0(exp_prepop_dir, "whole_pop/"); if (!dir.exists(exp_pop_dir))  { dir.create(exp_pop_dir, recursive = T)};
      if(incl.precip=="no.precip") {exp_prepop_dir <- paste0(exp_dir, "UEUG.1to4.v4_no_precip", "/")} # make no precip directory
      if (!dir.exists(exp_prepop_dir))  { dir.create(exp_prepop_dir, recursive=T)};
      
      #### Do you want to leave one environment out while calculating correlations?
      LOO_env <- 0; ### leave as 0 here because I already manually left one out
      
      # set param for graphing:
      col_wdw <- 25;
      col_palette <- diverge_hcl(col_wdw + 1, h = c(260, 0), c = 100, l = c(50, 90), power = 1) # this function makes a diverging color palette
      
      print(paste0("1 to 4 v4 for the whole population, rrBLUP within families, with parameters: line.outlier.filter=", line.outlier.filter,
                   ", filter.less.than=",filter.less.than,", min.window.size=", min.window.size, ", last.FT=",last.FT,
                   ", last.harvest=", last.harvest, ", max.window.size=", max.window))
      
      
      
      for (i in rep.num) {
        
        # first, make folds:
        # make random folds for this rep, across all subpops:
        taxa.table <- as.tibble(taxa)
        colnames(taxa.table) <- c("value")
        taxa.table$obs.id <- 1:length(taxa)
        taxa.table <- left_join(taxa.table, exp_traits, by=c("value"="ril_code")) %>%
          select(value, obs.id, pop_code) %>%
          distinct
        #make the folds--
        # for any number of folds:
        myF <- vector("list", nfolds)
        # preF <- vector("list", nfolds)
        for(subpop in unique(exp_traits$pop_code)) {
          # for(subpop in pop.table$number[c(1,6)]) {
          # get the obs id's for this pop:
          pop.ids <- taxa.table %>%
            filter(pop_code==subpop) %>%
            select(obs.id)
          
          preF <- fold.maker.2(pop.ids$obs.id, nfolds)
          # preF[[1]] <- pop.ids$obs.id[1:90]
          # preF[[2]] <- pop.ids$obs.id[-c(1:90)]
          for(j in c(1:nfolds)) {
            myF[[j]] <- c(myF[[j]], preF[[j]])
          }
        }
        
        # get optimal windows and slope + intercept, for each trait x fold x environment:
        output.results <- vector("list", 
                                 length(c(FT.traits, harvest.traits))*nrow(pop.table)*length(all_env_codes))
        n <- 1
        for(trait in c(FT.traits, harvest.traits)) {
          # for(trait in c("ERN")) {
          tInd <- which(colnames(exp_traits) == trait);
          exp_pop_dir <- paste0(exp_prepop_dir, trait, "/"); if (!(dir.exists(exp_pop_dir))) { dir.create(exp_pop_dir, recursive=T)};
          if(!dir.exists(paste0(exp_pop_dir, "/whole_pop/"))) {dir.create(paste0(exp_pop_dir, "/whole_pop/"), recursive=T)}
          current.trait <- trait
          
          for(env_num in seq_along(all_env_codes)) {
            current.env <- all_env_codes[env_num] # set current environment
            
            for(fold in c(1:nfolds)) {
              
              # pull training data
              exp_trait_trn <- exp_traits[exp_traits$env_code!=current.env,c(lInd, eInd, tInd)]; ### make sure the colname order is line env trait
              exp_trait_trn <- exp_trait_trn[!(exp_trait_trn$ril_code %in% taxa[myF[[fold]]]),]
              colnames(exp_trait_trn)[ncol(exp_trait_trn)] <- 'Yobs'; # rename phenotype data column as Yobs
              
              # also pull testing data:
              exp_trait_test <- exp_traits[exp_traits$env_code==current.env,c(lInd, eInd, tInd)]; ### make sure the colname order is line env trait
              exp_trait_test <- exp_trait_test[(exp_trait_test$ril_code %in% taxa[myF[[fold]]]),]; # filter by fold
              colnames(exp_trait_test)[ncol(exp_trait_test)] <- 'Yobs'; # rename phenotype data column as Yobs
              
              # skip this environment if there is no testing data in this fold:
              if(sum(!(is.na(exp_trait_test$Yobs)))==0) {next}
              
              # remove missing values--negative is ok though
              exp_trait_trn <- exp_trait_trn[!is.na(exp_trait_trn$Yobs),];
              # How many observations are there per pedigree?
              obs_lne_n <- aggregate(Yobs ~ ril_code, data = exp_trait_trn, length);
              # find lines with only one or two observations and remove them:
              line_outlier <- obs_lne_n$ril_code[obs_lne_n$Yobs < line.outlier.filter]
              exp_trait_trn <- exp_trait_trn[!(exp_trait_trn$ril_code %in% line_outlier),]
              line_codes <- unique(exp_trait_trn$ril_code); # pull unique line codes
              
              # and pull the training data as a vector only:
              current.data <- exp_trait_trn[,c(colnames(exp_trait_trn)=="Yobs")]
              
              # find mean value for each environment: 
              env_mean_trait_0_trn <- na.omit(aggregate(x = exp_trait_trn$Yobs, by = list(env_code = exp_trait_trn$env_code),
                                                        mean, na.rm = T));
              colnames(env_mean_trait_0_trn)[2] <- 'meanY';
              env_mean_trait_trn <- env_mean_trait_0_trn[order(env_mean_trait_0_trn$meanY),]; # sort by mean phenotype value
              
              # don't need this section for 1->4:
              # env_mean_trait_0_test <- na.omit(aggregate(x = exp_trait_test$Yobs, by = list(env_code = exp_trait_test$env_code),
              #                                       mean, na.rm = T));
              # colnames(env_mean_trait_0_test)[2] <- 'meanY';
              # env_mean_trait_test <- env_mean_trait_0_test
              
              # Perform exhaustive search in training set from WHOLE NAM pop, with 1 environment left out, for optimal window and environmental parameter to use
              Exhaustive_search_full(env_mean_trait_trn, PTT_PTR, searching_daps, 
                                     exp_pop_dir=paste0(exp_pop_dir,"whole_pop/"),
                                     current.data,
                                     trait,
                                     searching_daps, searching_daps, LOO_env, min.window.size = min.window.size)
              
              # need to read in default output and rename them:
              pop_cor_file.old <- paste(exp_pop_dir,'whole_pop/', trait, '_', nrow(env_mean_trait_trn), 'Envs_PTTPTR_', LOO_env, 'LOO_cor_whole_pop', sep = '');
              pop_pval_file.old <- paste(exp_pop_dir, 'whole_pop/',trait, '_', nrow(env_mean_trait_trn), 'Envs_PTTPTR_', LOO_env, 'LOO_P_whole_pop', sep = '');
              
              pop_cor <- fread(pop_cor_file.old)
              pop_pval <- fread(pop_pval_file.old) 
              
              pop_cor_file <- paste(exp_pop_dir, 'whole_pop/', trait, '_UEUG.1to4.no', current.env,'.fold', fold, ".rep", i, '_cor_whole_pop', sep = '')
              fwrite(pop_cor, pop_cor_file, sep="\t")
              
              pop_pval_file <-paste(exp_pop_dir, 'whole_pop/', trait, '_UEUG.1to4.no',current.env,'.fold', fold, ".rep", i, '_P_whole_pop', sep = '')
              fwrite(pop_pval, pop_pval_file, sep="\t")
              
              # tidy up:
              file.remove(pop_cor_file.old, pop_pval_file.old) 
              rm(pop_cor, pop_pval)
              
              # filter windows by established criteria:
              search.results <- fread(pop_cor_file)%>%
                select(-starts_with("nR")) # don't need the negative version
              search.results <- search.results %>%
                tidyr::gather(key="Parameter", value="Corr", -Day_x, -Day_y, -window)
              
              search.results <- search.results %>%
                filter(window >= min.window.size) %>% # i < j-5
                filter(window <= max.window)
              
              if(trait %in% FT.traits) {
                search.results <- search.results %>%
                  filter(Day_y <= last.FT) # last possible day for FT traits = 55
              } else if (trait %in% harvest.traits) {
                search.results <- search.results %>%
                  filter(Day_y <= last.harvest) # last possible day for harvest traits
              } else (warning("trait not in FT or harvest traits"))
              
              # Only now choose the remaining window with the best corr:
              search.results <- search.results <- search.results %>%
                filter(abs(Corr)==max(abs(Corr), na.rm = T))
              
              # if there are ties, just pick the top one in the table.
              search.results <- search.results[1,]%>%
                mutate(trait=trait)
              
              # output results:
              output.results[[n]] <- cbind(testing.env=current.env,rep=i, fold=fold, search.results)
              print(output.results[[n]])
              
              maxR_dap1 <- search.results$Day_x;
              maxR_dap2 <- search.results$Day_y;
              kPara_Name <- search.results$Parameter;
              kPara_Name <- gsub("R_", "", kPara_Name)
              PTT_PTR_ind <-  which(colnames(PTT_PTR) == kPara_Name); ## DL -> 5, GDD -> 6; PTT -> 7; PTR -> 8; PTD1 -> 8, PTD2 -> 9, PTS -> 10
              
              # Make output file containing the slopes and intercepts from the linear models for each line across environments
              Slope_Intercept(maxR_dap1, maxR_dap2, env_mean_trait_trn, PTT_PTR, exp_trait_trn, line_codes, paste0(exp_pop_dir,"whole_pop/"), PTT_PTR_ind, 
                              filter.less.than-1); # do filtering for number of observations here
              # read file in and rename it:
              slope_file.old <-paste0(exp_pop_dir,"whole_pop/Intcp_Slope")
              pop_slope <- fread(slope_file.old)
              slope_file <- paste0(exp_pop_dir,"whole_pop/", trait, "_UEUG.1to4.no", current.env,".fold", fold, ".rep", i, "_Intcp_Slope")
              fwrite(pop_slope,slope_file, sep="\t")
              
              # tidy up:
              file.remove(slope_file.old) 
              rm(pop_slope)
              
              n <- n+1 # increase iterator
            }
            
          }
        }
        # output windows: 
        opt.windows <- do.call("rbind", output.results)
        if (incl.precip=="no.precip"){
          fwrite(opt.windows, paste0("NAM_BLUE/UEUG.1to4.v4_no_precip/UEUG.1to4_auto.optimal.parameters.whole_pop.rep",i, ".csv"))
          
        } else {
          fwrite(opt.windows, paste0("NAM_BLUE/UEUG.1to4.v4/UEUG.1to4_auto.optimal.parameters.whole_pop.rep",i, ".csv"))
          
        }
      }
      
  # Prep and run ASREML
        if(incl.precip=="no.precip") {
          temp.dir <- paste(exp_dir, "FR.UEUG.1to4.pop_no_precip", sep=""); if (!dir.exists(temp.dir))  { dir.create(temp.dir)};
        } else {
          temp.dir <- paste(exp_dir, "FR.UEUG.1to4.pop", sep=""); if (!dir.exists(temp.dir))  { dir.create(temp.dir)};
        }
       

      for (i in rep.num){
        if(CERIS.done) { # if CERIS-JGRA already run for 1->4, need to read in: optimal window, which line in which fold
          # first, read in folds:
          myF <-vector("list", nfolds)
          for (j in c(1:nfolds)) {
            fold.long <- vector()
            for(my.trait in c(FT.traits, harvest.traits)){ # need to loop through traits because some had missing data
              for(my.env in env_meta_info_0$env_code) {
                my.file <- paste0(exp_dir, "UEUG.1to4.v4/",my.trait,"/whole_pop/", my.trait, "_UEUG.1to4.no", my.env, ".fold",j,".rep", i, "_Intcp_Slope")
                
                if(file.exists(my.file)) {
                  fold <- fread(my.file) %>%
                    select(line_codes)
                  fold.long <- unique(c(fold.long, fold$line_codes))
                }
              }
            }
            myF[[j]] <- which(taxa %in% fold.long)
          }
          stopifnot(sum(taxa[setdiff(1:length(taxa), c(myF[[1]], myF[[2]]))] %in% exp_traits$ril_code)==0) # check that everything with a phenotype is in here
          stopifnot(length(intersect(myF[[1]], myF[[2]]))==0) # check that no lines are in both folds
          
        }   
        # Now, read in optimal windows:
        opt.windows <- fread(paste0(exp_dir, "UEUG.1to4.v4/UEUG.1to4_auto.optimal.parameters.whole_pop.rep", i, ".csv"))
        if(incl.precip=="no.precip") {
          opt.windows <- fread(paste0(exp_dir, "UEUG.1to4.v4_no_precip/UEUG.1to4_auto.optimal.parameters.whole_pop.rep", i, ".csv"))
        }
        # prep the input files for asreml:
        # get the EC (environmental covariate) value for each trait x fold x testing env combination:
        # EC.table <- tibble(rep=NA, fold=NA, trait=NA, env_code=NA, testing.env=NA, EC=NA)
        # sink("outfile.txt")
        for(my.trait in c(FT.traits, harvest.traits)) {
          # for(my.trait in c(FT.traits)[1:2]) {
          for(my.env in env_meta_info_0$env_code) {
            for(my.fold in 1:nfolds) {
              current.window <- opt.windows %>% filter(testing.env==my.env, trait==my.trait, fold==my.fold)
              if(nrow(current.window)==0) {
                # print(paste0("No window found for ", my.trait, " in training env ", my.env))
                next
              }
              # if(my.fold==1) { # only print them for one fold
              #   for(my.pop in unique(fr.exp_traits$pop_code)) {
              #     print(paste(my.trait, my.pop, my.env, sep="_"))
              #   }
              # }
              
              
              maxR_dap1 <- current.window$Day_x;
              maxR_dap2 <- current.window$Day_y;
              kPara_Name <- current.window$Parameter;
              kPara_Name <- gsub("R_", "", kPara_Name)
              
              current.EC <- PTT_PTR %>%
                group_by(env_code) %>%
                mutate(day.num=1:150) %>%
                filter(day.num %in% c(maxR_dap1:maxR_dap2)) %>%
                select(env_code, kPara_Name, day.num) %>%
                mutate(EC=mean(get(kPara_Name))) %>%
                mutate(trait=my.trait,
                       testing.env=my.env,
                       fold=my.fold,
                       rep=i) %>%
                select(rep, fold, trait, env_code, testing.env, EC) %>%distinct
              current.EC$EC <- scale(current.EC$EC,center = T,scale = T) # scale the environmental covariate as required for FR-gBLUP
              
              if(exists("EC.table")) {
                EC.table <- rbind(EC.table, current.EC)
              } else {
                EC.table <- current.EC
              }
              rm(current.EC)
            }
            
          }
        }
        # sink()
        EC.table <- EC.table %>% filter(!is.na(rep))
         wide.EC <- EC.table %>%
          spread(key=trait, value=EC)
        colnames(wide.EC)[-c(1:4)] <- paste0(  colnames(wide.EC)[-c(1:4)], ".EC")
        
        # add EC to the phenotype data:
        pheno.full <- left_join(fr.exp_traits, wide.EC, by="env_code") %>%
          select(-Entry_id)
        
        # make asreml input files, with different training sets set to NAs:
        for(my.env in env_meta_info_0$env_code) {
          pheno.env <- pheno.full %>%
            filter(testing.env==my.env) %>%
            mutate(
              ASI=ifelse(env_code==my.env, NA, ASI),
              CD=ifelse(env_code==my.env, NA, CD),
              CL=ifelse(env_code==my.env, NA, CL),
              CM=ifelse(env_code==my.env, NA, CM),
              DTA=ifelse(env_code==my.env, NA, DTA),
              DTS=ifelse(env_code==my.env, NA, DTS),
              EH=ifelse(env_code==my.env, NA, EH),
              EM=ifelse(env_code==my.env, NA, EM),
              ERN=ifelse(env_code==my.env, NA, ERN),
              KN=ifelse(env_code==my.env, NA, KN),
              KPR=ifelse(env_code==my.env, NA, KPR),
              TKW=ifelse(env_code==my.env, NA, TKW),
              LL=ifelse(env_code==my.env, NA, LL),
              LW=ifelse(env_code==my.env, NA, LW),
              PH=ifelse(env_code==my.env, NA, PH),
              T20KW=ifelse(env_code==my.env, NA, T20KW),
              TPBN=ifelse(env_code==my.env, NA, TPBN),
              TL=ifelse(env_code==my.env, NA, TL),
              ULA=ifelse(env_code==my.env, NA, ULA)) %>%
            select(Genotype=ril_code, env_code, pop_code, rep, fold, testing.env, everything())
          
          colnames(pheno.env) <- gsub("\\.", "_", colnames(pheno.env))
          
          for(my.pop in unique(pheno.env$pop_code)) {
            pheno.pop <- pheno.env %>%
              filter(pop_code==my.pop)
            for(my.fold in 1:nfolds) {
              fold.taxa <- taxa[myF[[my.fold]]]
              pheno.part <- pheno.pop %>%
                filter(fold==my.fold) %>%
                mutate(
                  ASI=ifelse(Genotype %in% fold.taxa, NA, ASI),
                  CD=ifelse(Genotype %in% fold.taxa, NA, CD),
                  CL=ifelse(Genotype %in% fold.taxa, NA, CL),
                  CM=ifelse(Genotype %in% fold.taxa, NA, CM),
                  DTA=ifelse(Genotype %in% fold.taxa, NA, DTA),
                  DTS=ifelse(Genotype %in% fold.taxa, NA, DTS),
                  EH=ifelse(Genotype %in% fold.taxa, NA, EH),
                  EM=ifelse(Genotype %in% fold.taxa, NA, EM),
                  ERN=ifelse(Genotype %in% fold.taxa, NA, ERN),
                  KN=ifelse(Genotype %in% fold.taxa, NA, KN),
                  KPR=ifelse(Genotype %in% fold.taxa, NA, KPR),
                  TKW=ifelse(Genotype %in% fold.taxa, NA, TKW),
                  LL=ifelse(Genotype %in% fold.taxa, NA, LL),
                  LW=ifelse(Genotype %in% fold.taxa, NA, LW),
                  PH=ifelse(Genotype %in% fold.taxa, NA, PH),
                  T20KW=ifelse(Genotype %in% fold.taxa, NA, T20KW),
                  TPBN=ifelse(Genotype %in% fold.taxa, NA, TPBN),
                  TL=ifelse(Genotype %in% fold.taxa, NA, TL),
                  ULA=ifelse(Genotype %in% fold.taxa, NA, ULA)) 
              for(my.trait in c(FT.traits, harvest.traits)) { # filter out lines without enough observations
                temp.pheno.part <- pheno.part[,c(1,which(colnames(pheno.part)==my.trait))]
                colnames(temp.pheno.part)[2] <- "Yobs" 
                obs_lne_n <- aggregate(Yobs~Genotype, data=temp.pheno.part, length)
                line_outlier <- obs_lne_n$Genotype[obs_lne_n$Yobs < (filter.less.than)] # want to make sure have AT LEAST filter.less.than-1 training obs, so need at least filter.less.than total obs
                
                temp.pheno.part$Yobs[temp.pheno.part$Genotype %in% line_outlier] <- NA
                pheno.part[,which(colnames(pheno.part)==my.trait)] <- temp.pheno.part$Yobs
              }
              pheno.part <- pheno.part %>% 
                select(Genotype,env_code,pop_code,rep,fold,testing_env,
                       ASI,CD,CL,CM,DTA,DTS,EH,EM,ERN,KN,KPR,TKW,LL,LW,
                       PH,T20KW,TPBN,TL,ULA,EH_EC,LL_EC,LW_EC,PH_EC,TPBN_EC,
                       TL_EC,ULA_EC,ASI_EC,DTA_EC,DTS_EC,CD_EC,CL_EC,CM_EC,
                       EM_EC,ERN_EC,KN_EC,KPR_EC,TKW_EC,T20KW_EC)
              fwrite(pheno.part, paste0(temp.dir, paste0("/asreml_1to4_no",my.env,"_pop", my.pop, "_fold", my.fold, "_rep", i, ".csv")))
              rm(pheno.part)
            }
          }
          
          
        }
        rm(EC.table)
      }
      
      
      # Run ASREML with asreml_1to4_fold1_full_rep.as and asreml_1to4_fold2_full_rep.as
      
      # Compile FR 1->4 -- -- -- - --- -
      
      for (i in rep.num) {
        if(CERIS.done) {
          # first, read in folds:
          myF <-vector("list", nfolds)
          for (j in c(1:nfolds)) {
            fold.long <- vector()
            for(my.trait in c(FT.traits, harvest.traits)){ # need to loop through traits because some had missing data
              for(my.env in env_meta_info_0$env_code) {
                my.file <- paste0(exp_dir, "UEUG.1to4.v4/",my.trait,"/whole_pop/", my.trait, "_UEUG.1to4.no", my.env, ".fold",j,".rep", i, "_Intcp_Slope")
                
                if(file.exists(my.file)) {
                  fold <- fread(my.file) %>%
                    select(line_codes)
                  fold.long <- unique(c(fold.long, fold$line_codes))
                }
              }
            }
            myF[[j]] <- which(taxa %in% fold.long)
          }
          stopifnot(sum(taxa[setdiff(1:length(taxa), c(myF[[1]], myF[[2]]))] %in% exp_traits$ril_code)==0) # check that everything with a phenotype is in here
          stopifnot(length(intersect(myF[[1]], myF[[2]]))==0) # check that no lines are in both folds
          
        } 
        # Now, read in optimal windows:
        opt.windows <- fread(paste0(exp_dir, "UEUG.1to4.v4/UEUG.1to4_auto.optimal.parameters.whole_pop.rep", 1, ".csv"))
        
        # if predicting within each pop one at a time:
          # get the CYCLE arguments for asreml_rep.as:
          n <- 1
          result.id <- vector()
          for(my.trait in c(FT.traits, harvest.traits)) {
            for(my.env in env_meta_info_0$env_code) {
              for(my.fold in 1:nfolds) {
                current.window <- opt.windows %>% filter(testing.env==my.env, trait==my.trait, fold==my.fold)
                if(nrow(current.window)==0) {
                  # print(paste0("No window found for ", my.trait, " in training env ", my.env, " fold", my.fold))
                  next
                }
                if(my.fold==1) { # only print them for one fold
                  for(my.pop in unique(fr.exp_traits$pop_code)) {
                    result.id[n] <- paste(my.trait, my.pop, my.env, sep="_")
                    n <- n+1
                  }
                }
                
              }
              
            }
          }
        
        
         # Need to read in each fold separately:
        sln.1 <- fread(paste0(temp.dir, "/asreml_1to4_fold1_full_rep", i, ".sln"), data.table=F) %>%
          # sln.1 <- fread(paste0(temp.dir, "/asreml_1to4_fold1_full_rep", i, "_begin1.sln"), data.table=F) %>%
          mutate(Set = result.id[cumsum(Model_Term == "Model_Term")+1]) %>% # set the cycle name used for these table rows
          filter(Model_Term != "Model_Term") %>%
          mutate(tempcol=str_split(Set, '_')) %>% # split the string
          rowwise() %>%
          mutate(trait=unlist(tempcol)[1], pop=unlist(tempcol)[2], testing.env=unlist(tempcol)[3]) %>%
          dplyr::select(-tempcol) %>%
          ungroup() %>%
          mutate(fold=1)
        
        # double check:
        thing <- sln.1 %>%
          filter(str_detect(Model_Term, '_EC')) %>%
          filter(!(str_detect(Model_Term, "Genotype"))) %>%
          mutate(tempcol=str_split(Model_Term, '_')) %>% # split the string
          rowwise() %>%
          mutate(check.trait=unlist(tempcol)[1]) %>%
          dplyr::select(-tempcol) %>%
          ungroup()
        stopifnot(all.equal(thing$trait, thing$check.trait))
        tail(unique(sln.1$Set))
        
        sln.2 <- fread(paste0(temp.dir, "/asreml_1to4_fold2_full_rep", i, ".sln"), data.table=F) %>%
          # sln.2 <- fread(paste0(temp.dir, "/asreml_1to4_fold2_full_rep", i, "_begin.sln"), data.table=F) %>%
          mutate(Set = result.id[cumsum(Model_Term == "Model_Term")+1]) %>% # set the cycle name used for these table rows
          filter(Model_Term != "Model_Term") %>%
          mutate(tempcol=str_split(Set, '_')) %>% # split the string
          rowwise() %>%
          mutate(trait=unlist(tempcol)[1], pop=unlist(tempcol)[2], testing.env=unlist(tempcol)[3]) %>%
          dplyr::select(-tempcol) %>%
          ungroup() %>%
          mutate(fold=2)
        
        # double check:
        thing <- sln.2 %>%
          filter(str_detect(Model_Term, '_EC')) %>%
          filter(!(str_detect(Model_Term, "Genotype"))) %>%
          mutate(tempcol=str_split(Model_Term, '_')) %>% # split the string
          rowwise() %>%
          mutate(check.trait=unlist(tempcol)[1]) %>%
          dplyr::select(-tempcol) %>%
          ungroup()
        stopifnot(all.equal(thing$trait, thing$check.trait))
        tail(unique(sln.2$Set))
        
        # combine the folds, tidy formatting:
        sln.full <- rbind(sln.1, sln.2)
        sln.full$pop <- as.numeric(sln.full$pop)
        sln.full$fold <- as.numeric(sln.full$fold)
        sln.full$Effect <- as.numeric(sln.full$Effect)
        
        # calculate predictions:
        for (my.trait in c(FT.traits, harvest.traits)){
          # for (my.trait in c(FT.traits, harvest.traits)[-c(15:19)]){
          pre.combine <- vector("list", length(unique(fr.exp_traits$pop_code))*nfolds*nrow(env_meta_info_0))
          n <- 1
          for(my.env in env_meta_info_0$env_code) {
            for (my.pop in sort(unique(fr.exp_traits$pop_code))){
              if (is.na(my.pop)) {next}
              for(my.fold in 1:nfolds) {
                # pull current data from sln file
                current.sln <- sln.full %>%
                  filter(Set %in% sln.full$Set[which(sln.full$Model_Term==paste0(my.trait,"_EC"))]) %>%
                  filter(pop==my.pop) %>%
                  filter(fold==my.fold) %>%
                  filter(testing.env==my.env) %>%
                  distinct
                if(nrow(current.sln)==0) {next} # skip if no data
                
                # pull input data for this fold
                current.fold <- fread(paste0(temp.dir, "/asreml_1to4_no", my.env, "_pop",my.pop, "_fold",my.fold,"_rep", i,".csv"), data.table=F) %>%
                  dplyr::select(Genotype, env_code, paste0(my.trait), paste0(my.trait, "_EC"))%>%
                  filter(is.na(UQ(rlang::sym(my.trait)))) %>% # only need to predict the ones that weren't known
                  filter(env_code==my.env) %>% # only need the testing environ
                  filter((Genotype %in% taxa[myF[[my.fold]]])) %>% # only need the testing genotypes
                  distinct
                
                current.fold$y.hat <- rep(NA, nrow(current.fold))
                current.fold$intcp.hat <- rep(NA, nrow(current.fold))
                current.fold$slope.hat <- rep(NA, nrow(current.fold))
                
                # and we'll need it for the opposite fold too; need to get it from some other testing environ too!
                other.fold <- fread(paste0(temp.dir, "/asreml_1to4_no",
                                           env_meta_info_0$env_code[!(env_meta_info_0$env_code == my.env)][1],
                                           "_pop",my.pop, "_fold",c(1:nfolds)[-my.fold],"_rep", i,".csv"), data.table=F) %>%
                  dplyr::select(Genotype, env_code, paste0(my.trait), paste0(my.trait, "_EC")) %>%
                  filter((Genotype %in% taxa[myF[[my.fold]]])) %>% # only need testing genotypes
                  filter(env_code==my.env) # only need the testing environ
                
                
                for (j in 1:nrow(current.fold)) {
                  # calculate the prediction for each line
                  current.genotype <- current.fold$Genotype[j]
                  current.env <- current.fold$env_code[j]
                  current.EC <- current.fold[j,4][[1]] # this only works because I select the column order above!
                  current.intcp <- current.sln$Effect[current.sln$Level==current.genotype]
                  current.slope <- current.sln$Effect[current.sln$Level==paste0(current.genotype, ".001")]
                  
                  current.fold$y.hat[j] <- current.sln$Effect[current.sln$Model_Term=="mu"]+ #mu
                    current.sln$Effect[current.sln$Model_Term==paste0(my.trait, "_EC")]*current.EC + # + BXe
                    current.intcp + # + uG
                    current.slope*current.EC # + bG*xE
                  # sln1b$Effect[sln1b$Level==current.env] # take out + uE
                  current.fold$intcp.hat[j] <- current.intcp
                  current.fold$slope.hat[j] <- current.slope
                }
                # pre.combine[[n]] <- left_join(current.fold %>% select(-paste0(my.trait)),
                #                               other.fold %>% select(-paste0(my.trait, "_EC")))
                # print(paste("Fold ", my.fold, ", pop ", my.pop, ": cor ", cor(pre.combine[[n]][,7], pre.combine[[n]]$y.hat, use="complete")))
                
                # to use on Nova:
                my.EC <- paste0(my.trait, "_EC")
                pre.combine[[n]] <- left_join(current.fold %>% select(-my.trait),
                                              other.fold %>% select(-my.EC))
                
                rm(current.fold)
                n <- n+1
              }
            }
          }
          
          combined <- do.call(what = "rbind", pre.combine) %>%
            mutate(obs.id=NA) %>%
            select(env_code, ril_code=Genotype, paste0(my.trait), obs.id, intcp.hat, slope.hat, y.hat)
          
          # plot(combined$EH, combined$y.hat)
          # cor(combined$EH, combined$y.hat, use="complete")
          kPara_Name <- "Env Param"
          trait <- my.trait
          
          Plot_TEUG_result(obs_prd_file = combined, all_env_codes, kPara_Name, trait=trait, 
                           forecast_png_file = paste0(temp.dir, "/output/FR_", trait, "_rep",i, "_UEUG.1to4.pop_obs.prd.png"),
                           # forecast_png_file = paste0("NAM/FR.TEUG.1to3/no_covar", "FR_", trait, "_rep",i, "_TEUG.1to3_obs.prd.png"),
                           path = F, save.output = T)
          fwrite(combined, paste0(temp.dir, "/output/FR_", trait, "_rep",i, "_UEUG.1to4.pop_obs.prd.txt"))
          
        }
        
      }
      
 
    
    
# GCTA GWAS --------------------------------------------------------------------

# Use GCTA to get high-density GWAS results

# on server, run the following: 
# NOTE: the following lines are to be run on the command line, NOT R
# Then, the results can be used in R
module load gcta/1.91.2beta-q2ognx7
module load plink

# merge the SV and SNP data
plink --merge-list sv_and_snp_mb --memory 40000 --out merge # memory says to limit to 40GB, sv_and_snp_mb has the list of all files to combine (SNP files plus SV ones)

# make files to run SV+SNP GWAS:
# make GRM (K), took ~11 hr
nohup gcta64 --bfile merge --make-grm --maf 0.01 --out sv.snp.grm &
  
  # make it sparse: took 2 seconds
  gcta64 --grm sv.snp.grm --make-bK-sparse 0.05 --out sparse.grm

# need to make maf-filtered version: took ~25 min
# nohup gcta64 --bfile merge --maf 0.01 --out merge.maf &
plink --bfile merge --memory 40000 --maf 0.01 --make-bed --out merge.maf01

# and run GWAS:
gcta.format <- fread("merge.maf01.fam") %>% select(V1, V2)
gcta.env <- left_join(gcta.format, fr.exp_traits, by=c("V1"="ril_code"))
env.list <- sort(unique(fr.exp_traits$env_code))
for(current.trait in c(FT.traits, harvest.traits)) {
  
  # read in phenotypes
  myY <- fread(paste0(exp_dir, current.trait, "/whole_pop/Intcp_Slope"), header = T, stringsAsFactors = F, data.table=F)
  
  # format for GCTA GWAS
  gcta.y <- left_join(gcta.format, 
                      myY %>% 
                        select(line_codes, Intcp_para_adj, Slope_para) %>% 
                        distinct,
                      by=c("V1"="line_codes"))
  
  # write slope pheno file
  fwrite(gcta.y %>% select(V1,V2,slope.hat=Slope_para),
         paste0(exp_dir, "GCTA_fixed/", current.trait, "_slope_fixed.phen"),
         sep="\t", col.names = F, na="NA", quote=F)
  
  # write intcp pheno file
  fwrite(gcta.y %>% select(V1,V2,intcp.hat=Intcp_para_adj),
         paste0(exp_dir, "GCTA_fixed/", current.trait, "_intcp_fixed.phen"),
         sep="\t", col.names = F, na="NA", quote = F)
  
  for(env_num in seq_along(env.list)){ # make phenotype in each environment
    my.env <- env.list[env_num]
    fwrite(gcta.env %>% filter(env_code==my.env) %>% select(V1, V2, current.trait),
           paste0(exp_dir, "GCTA_fixed/", current.trait, "_BLUE_", env_num, ".phen"),
           sep="\t", col.names=F, na="NA", quote=F)
  }
}

# Calculate SimpleM threshold for high-density:
# use https://github.com/LTibbs/SimpleM/blob/main/simpleM_efficient.R
# Result was:
threshold <- 4.750729e-07

# Run GCTA GWAS: on server (this is command line code, NOT R code)
# need to run for each trait; example (DTA) shown here
# This uses slurm array numbers to iterate through environments
module load gcta/1.94.0b
gcta64 --mlma --bfile merge.maf01 --grm sv.snp.grm --pheno DTA_intcp.phen --out geno_assoc_DTA_intcp --thread-num 28
gcta64 --mlma --bfile merge.maf01 --grm sv.snp.grm --pheno DTA_BLUE_$SLURM_ARRAY_TASK_ID.phen --out geno_assoc_DTA_BLUE_$SLURM_ARRAY_TASK_ID --thread-num 28

# Make Manhattan plots and P value plots for 20M snps/svs
for(my.trait in c(FT.traits, harvest.traits)) {
  
  untar(paste0("GCTA_fixed/results_tar/",my.trait,".mlma.fixed.tar.gz"),
        files=paste0("geno_assoc_", my.trait, "_slope_fixed.mlma"),
        list=F, exdir="GCTA_fixed/results_tar/temp")
  slope.snp <- fread(paste0("/GCTA_fixed/results_tar/temp/geno_assoc_",
                            my.trait, "_slope_fixed.mlma"),
                     select=c(1:3,9))
  
  # untar intcp, then read in:
  untar(paste0("GCTA_fixed/results_tar/",my.trait,".mlma.fixed.tar.gz"),
        files=paste0("geno_assoc_", my.trait, "_intcp_fixed.mlma"),
        list=F, exdir="GCTA_fixed/results_tar/temp")
  int.snp <- fread(paste0("GCTA_fixed/results_tar/temp/geno_assoc_",
                          my.trait, "_intcp_fixed.mlma"),
                   select=c(1:3,9))
  min.p <- min(c(int.snp$p, slope.snp$p), na.rm=T)
  
  png(paste0('Figures/', my.trait, '_fixed_int_slope_fastman.png'), width =6.5, height = 8, units = 'in', res = 300)
  layout(matrix(c(1:2),2,1))
  fastman(int.snp,
          suggestiveline=F,
          genomewideline=-log10(threshold),
          chr="Chr", bp = "bp", p = "p",
          maxP=-log10(min.p)
  )
  fastman(slope.snp,
          suggestiveline=F,
          genomewideline=-log10(threshold),
          chr="Chr", bp = "bp", p = "p",
          maxP=-log10(min.p)
  )
  dev.off()
  
  # Make plots of intercept vs slope P values
  stopifnot(all.equal(slope.snp$SNP, int.snp$SNP))
  pdat <- tibble(trait=my.trait,
                 neg.log.int.p=-log10(int.snp$p),
                 neg.log.slope.p=-log10(slope.snp$p))
  max.p <- max(c(pdat$neg.log.int.p, pdat$neg.log.slope.p),
               na.rm = T)
  pplot <- ggplot(pdat) +
    scattermore::geom_scattermore(aes(x=neg.log.int.p, y=neg.log.slope.p),
                                  pointsize=2.5)+
    facet_wrap(.~trait) +
    geom_abline(slope=1, intercept=0) +
    geom_vline(xintercept=-log10(threshold), color="red") +
    geom_hline(yintercept=-log10(threshold), color="red") +
    xlab("Intercept (-log P)")+
    ylab("Slope (-log P)")+
    xlim(0,max.p)+
    ylim(0,max.p)
  ggsave(paste0("Figures/",my.trait,"pplot.jpg"), plot=pplot, device="jpeg", width=2.5, height=2.5, units="in", dpi=500)
  
  
  rm(slope.snp, int.snp, min.p)
}

# PCA ---------------------------------------------------------------------

      # env mean:
      # For PC of the traits, we want to have the environments as column names plus have a "Trait" column
      pc.mean <- fr.exp_traits %>%
        select(-c(pop_code, Entry_id)) %>%
        pivot_longer(-c(env_code, ril_code), names_to="Trait", values_to="BLUE") %>%
        group_by(env_code, Trait) %>%
        mutate(env.mean=mean(BLUE, na.rm = T)) %>%
        select(-ril_code, -BLUE) %>%
        distinct
      pc.mean.traits <- pc.mean %>% # for the PCs of the traits using the env mean:
        pivot_wider(names_from=Trait, values_from=env.mean)
      plot.pc.mean.traits <- prcomp(na.omit(pc.mean.traits[,-1]), center=T, scale=T)$rotation

# Subfunctions:  ----------------------------------------------------------

      # Genomic prediction functions --------------------------------------------
      
      # Laura Tibbs Cortes
      # June 9, 2020
      
      # make FUNCTIONS to do the repetitive parts of genomic prediction (not just giant loops anymore)
      
      
      # function to split observations into folds for kfold cross-validation:
      fold.maker <- function(num.obs, kfold) { # provide the total number of observations you are splitting and the number of folds
        new.order <- sample(1:num.obs, num.obs) # randomly permute observation order
        Q <- num.obs%/%kfold # find whole-number quotient (number of obs in some folds)
        R <- num.obs%%kfold # find remainder (how many folds need an extra obs)
        
        folds <- vector("list", length = kfold) # make list to hold folds
        for (i in 1:kfold) {
          if (i <=R) { # for the larger folds
            # print(((i-1)*(Q+1)+1):(i*(Q+1)))
            folds[[i]] <- new.order[c(((i-1)*(Q+1)+1):(i*(Q+1)))]
          }
          else if (i>R) { # for the smaller folds
            # print(((R*(Q+1))+(i-R-1)*Q+1):((R*(Q+1))+(i-R)*Q))
            folds[[i]] <- new.order[c((R*(Q+1))+(i-R-1)*Q+1):((R*(Q+1))+(i-R)*Q)] # identify the next fold
          } else {warning("Error in fold creation.")}
        }
        
        return(folds)
      }
      
      # make folds given observation ids rather than number of obs
      fold.maker.2 <- function(obs.ids, kfold) { # provide the ids of observations you are splitting
        new.order <- sample(obs.ids, length(obs.ids))
        Q <- length(obs.ids)%/%kfold # find whole-number quotient (number of obs in some folds)
        R <- length(obs.ids)%%kfold # find remainder (how many folds need an extra obs)  
        
        folds <- vector("list", length = kfold) # make list to hold folds
        for (i in 1:kfold) {
          if (i <=R) { # for the larger folds
            # print(((i-1)*(Q+1)+1):(i*(Q+1)))
            folds[[i]] <- new.order[c(((i-1)*(Q+1)+1):(i*(Q+1)))]
          }
          else if (i>R) { # for the smaller folds
            # print(((R*(Q+1))+(i-R-1)*Q+1):((R*(Q+1))+(i-R)*Q))
            folds[[i]] <- new.order[c((R*(Q+1))+(i-R-1)*Q+1):((R*(Q+1))+(i-R)*Q)] # identify the next fold
          } else {warning("Error in fold creation.")}
        }
        
        return(folds)
      }

# sub_funcs_202000707_LTC -----------------------------------------------------

      
      # Laura Tibbs Cortes
      # Aug 21, 2020
      
      # Sub-functions for running CERES, based on Xianran's code (sub_funcs_20200707.r) but also based on my own previous
      # work with running my vitamins through CERES
      
      # Updated July 14 2020 to allow for more environmental indices
      
      # Make ptt_ptr file using function from source:
      # this function calls sub-functions to calculate the day length for each day the plants were growing, up until our searching_daps limit,
      # then, find the weather from the 6 nearest weather stations for those dates and extract average high and low daily temperature
      # and use to calculate GDDs and other environmental parameters
      Compile_PTT_PTR_local_GHCN <-  function(exp_dir, env_meta_info, exp_s_year, exp_e_year, searching_daps,ptt_ptr_file, t_base, t_max1, t_max2) {
        
        # make directories:
        sp_env_dir <- paste(exp_dir, 'envs/', sep = '');     if (!dir.exists(sp_env_dir))  { dir.create(sp_env_dir)};
        # GHCN is global historical climatology network
        sp_ghcn_dir <- paste(sp_env_dir, 'ghcn/', sep = ''); if (!dir.exists(sp_ghcn_dir)) { dir.create(sp_ghcn_dir)};
        sp_navy_dir <- paste(sp_env_dir, 'Geoshpere/', sep = ''); if (!dir.exists(sp_navy_dir)) { dir.create(sp_navy_dir)};
        
        # set latitude and longitude range of data
        lat_range <- range(env_meta_info$lat, na.rm = T); lon_range <- range(env_meta_info$lon, na.rm = T);
        
        # pull the weather stations with info relevant to your environments
        local_ghcn_st_file <- paste('all_ghcn_stations', sep = '');  
        local_ghcn_target_st_file <- paste(sp_ghcn_dir, '0target_ghcn_stations', sep = '');
        if (!file.exists(local_ghcn_target_st_file)) {
          
          # if you don't have a pre-made list of all ghcn stations, then get that info
          if (!file.exists(local_ghcn_st_file)) { ghcn_all_sts <- ghcnd_stations() # this function returns data for all GHCN weather stations available
          } else { ghcn_all_sts <- read.csv(local_ghcn_st_file);}
          
          # filter the ghcn info to only include those within your begin and end year, latitude, and longitude
          # only keep the max and min temperature
          STS_ghcn <- dplyr::filter(ghcn_all_sts, first_year <= exp_s_year & last_year >= exp_e_year 
                                    & between(latitude, lat_range[1] - 2, lat_range[2] + 2) 
                                    & between(longitude,lon_range[1] - 2, lon_range[2] + 2)
                                    & element %in% c("TMAX", "TMIN")
          );
          
          write.csv(STS_ghcn, local_ghcn_target_st_file  ) # write out file
        }
        STS_ghcn <- read.csv(local_ghcn_target_st_file); 
        
        st_ids <- c();
        for (e_i in 1:nrow(env_meta_info)) { 
          
          # for each environment, pull the metadata:
          env_code <- env_meta_info$env_code[e_i];
          field_lat <- env_meta_info$lat[e_i]; field_lon <- env_meta_info$lon[e_i]; 
          planting_date <- env_meta_info$PlantingDate[e_i];
          planting_year <- year(planting_date);
          
          # set the dates to inspect for this environment (from planting until your set end date)
          DAPs <- seq(as.Date(planting_date), length.out = searching_daps, by = "day"); # make a list of all the dates this field was growing, up until the specified searching_daps limit
          ending_date <- DAPs[searching_daps]; 
          ending_year <- year(ending_date);
          
          # Make file containing daylength if it does not already exist
          # Using DayLength_from_Equation function
          local_dl_file <- paste(sp_navy_dir, env_code, '_DL_', searching_daps, 'DAPs', sep = '')
          if (!file.exists(local_dl_file)) {
            DayLength_from_Equation(planting_year, ending_year, 
                                    field_lat, field_lon, 
                                    local_dl_file, DAPs ) 
          }
          
          # find closest 6 weather stations within 50 km of the field location
          field_sts <- meteo_distance(STS_ghcn, lat = field_lat, lon = field_lon, radius = 50, limit = 6);
          
          # save the IDs of those weather stations
          st_ids <- append(st_ids, unique(as.vector(field_sts$id)));
        }
        
        # keep only unique weather stations (in case environments near one another)
        st_ids <- unique(st_ids);
        
        # work through all years of the overall experiment to read in weather data:
        for (e_y in 1:(exp_e_year - exp_s_year + 1)) {
          gz_ghcn_file <- paste(ghcn_year_dir, exp_s_year + e_y - 1, '.csv.gz', sep = ''); # set weather file name
          
          # read in weather data, and filter to keep only the weather stations we want AND only keep the max and min temp data from them
          gz_ghcn <- fread(gz_ghcn_file, select = c(1:4),showProgress = FALSE);
          gz_ghcn <- gz_ghcn[gz_ghcn$V1 %in% st_ids & gz_ghcn$V3 %in% c('TMAX', 'TMIN', 'PRCP')][,1:4] # keep precip plus temp 
          
          #    # definitions of weather units:
          #               PRCP = Precipitation (tenths of mm)
          #    	   SNOW = Snowfall (mm)
          # 	   SNWD = Snow depth (mm)
          #            TMAX = Maximum temperature (tenths of degrees C)
          #            TMIN = Minimum temperature (tenths of degrees C)
          # 	   
          # 	   The other elements are:
          # 	   
          # 	   ACMC = Average cloudiness midnight to midnight from 30-second 
          # 	          ceilometer data (percent)
          # 	   ACMH = Average cloudiness midnight to midnight from 
          # 	          manual observations (percent)
          #            ACSC = Average cloudiness sunrise to sunset from 30-second 
          # 	          ceilometer data (percent)
          # 	   ACSH = Average cloudiness sunrise to sunset from manual 
          # 	          observations (percent)
          #            AWDR = Average daily wind direction (degrees)
          # 	   AWND = Average daily wind speed (tenths of meters per second)
          # 	   DAEV = Number of days included in the multiday evaporation
          # 	          total (MDEV)
          # 	   DAPR = Number of days included in the multiday precipiation 
          # 	          total (MDPR)
          #            DASF = Number of days included in the multiday snowfall 
          # 	          total (MDSF)		  
          # 	   DATN = Number of days included in the multiday minimum temperature 
          # 	         (MDTN)
          # 	   DATX = Number of days included in the multiday maximum temperature 
          # 	          (MDTX)
          #            DAWM = Number of days included in the multiday wind movement
          # 	          (MDWM)
          # 	   DWPR = Number of days with non-zero precipitation included in 
          # 	          multiday precipitation total (MDPR)
          # 	   EVAP = Evaporation of water from evaporation pan (tenths of mm)
          # 	   FMTM = Time of fastest mile or fastest 1-minute wind 
          # 	          (hours and minutes, i.e., HHMM)
          # 	   FRGB = Base of frozen ground layer (cm)
          # 	   FRGT = Top of frozen ground layer (cm)
          # 	   FRTH = Thickness of frozen ground layer (cm)
          # 	   GAHT = Difference between river and gauge height (cm)
          # 	   MDEV = Multiday evaporation total (tenths of mm; use with DAEV)
          # 	   MDPR = Multiday precipitation total (tenths of mm; use with DAPR and 
          # 	          DWPR, if available)
          # 	   MDSF = Multiday snowfall total 
          # 	   MDTN = Multiday minimum temperature (tenths of degrees C; use with 
          # 	          DATN)
          # 	   MDTX = Multiday maximum temperature (tenths of degress C; use with 
          # 	          DATX)
          # 	   MDWM = Multiday wind movement (km)
          #            MNPN = Daily minimum temperature of water in an evaporation pan 
          # 	         (tenths of degrees C)
          #            MXPN = Daily maximum temperature of water in an evaporation pan 
          # 	         (tenths of degrees C)
          # 	   PGTM = Peak gust time (hours and minutes, i.e., HHMM)
          # 	   PSUN = Daily percent of possible sunshine (percent)
          # 	   SN*# = Minimum soil temperature (tenths of degrees C)
          # 	          where * corresponds to a code
          # 	          for ground cover and # corresponds to a code for soil 
          # 		  depth.  
          # 		  
          # 		  Ground cover codes include the following:
          # 		  0 = unknown
          # 		  1 = grass
          # 		  2 = fallow
          # 		  3 = bare ground
          # 		  4 = brome grass
          # 		  5 = sod
          # 		  6 = straw multch
          # 		  7 = grass muck
          # 		  8 = bare muck
          # 		  
          # 		  Depth codes include the following:
          # 		  1 = 5 cm
          # 		  2 = 10 cm
          # 		  3 = 20 cm
          # 		  4 = 50 cm
          # 		  5 = 100 cm
          # 		  6 = 150 cm
          # 		  7 = 180 cm
          # 		  
          # 	   SX*# = Maximum soil temperature (tenths of degrees C) 
          # 	          where * corresponds to a code for ground cover 
          # 		  and # corresponds to a code for soil depth. 
          # 		  See SN*# for ground cover and depth codes. 
          #            TAVG = Average temperature (tenths of degrees C)
          # 	          [Note that TAVG from source 'S' corresponds
          # 		   to an average for the period ending at
          # 		   2400 UTC rather than local midnight]
          #            THIC = Thickness of ice on water (tenths of mm)	
          #  	   TOBS = Temperature at the time of observation (tenths of degrees C)
          # 	   TSUN = Daily total sunshine (minutes)
          # 	   WDF1 = Direction of fastest 1-minute wind (degrees)
          # 	   WDF2 = Direction of fastest 2-minute wind (degrees)
          # 	   WDF5 = Direction of fastest 5-second wind (degrees)
          # 	   WDFG = Direction of peak wind gust (degrees)
          # 	   WDFI = Direction of highest instantaneous wind (degrees)
          # 	   WDFM = Fastest mile wind direction (degrees)
          #            WDMV = 24-hour wind movement (km)	   
          #            WESD = Water equivalent of snow on the ground (tenths of mm)
          # 	   WESF = Water equivalent of snowfall (tenths of mm)
          # 	   WSF1 = Fastest 1-minute wind speed (tenths of meters per second)
          # 	   WSF2 = Fastest 2-minute wind speed (tenths of meters per second)
          # 	   WSF5 = Fastest 5-second wind speed (tenths of meters per second)
          # 	   WSFG = Peak gust wind speed (tenths of meters per second)
          # 	   WSFI = Highest instantaneous wind speed (tenths of meters per second)
          # 	   WSFM = Fastest mile wind speed (tenths of meters per second)
          # 	   WT** = Weather Type where ** has one of the following values:
          # 	   
          #                   01 = Fog, ice fog, or freezing fog (may include heavy fog)
          #                   02 = Heavy fog or heaving freezing fog (not always 
          # 		       distinquished from fog)
          #                   03 = Thunder
          #                   04 = Ice pellets, sleet, snow pellets, or small hail 
          #                   05 = Hail (may include small hail)
          #                   06 = Glaze or rime 
          #                   07 = Dust, volcanic ash, blowing dust, blowing sand, or 
          # 		       blowing obstruction
          #                   08 = Smoke or haze 
          #                   09 = Blowing or drifting snow
          #                   10 = Tornado, waterspout, or funnel cloud 
          #                   11 = High or damaging winds
          #                   12 = Blowing spray
          #                   13 = Mist
          #                   14 = Drizzle
          #                   15 = Freezing drizzle 
          #                   16 = Rain (may include freezing rain, drizzle, and
          # 		       freezing drizzle) 
          #                   17 = Freezing rain 
          #                   18 = Snow, snow pellets, snow grains, or ice crystals
          #                   19 = Unknown source of precipitation 
          #                   21 = Ground fog 
          #                   22 = Ice fog or freezing fog
          # 		  
          #             WV** = Weather in the Vicinity where ** has one of the following 
          # 	           values:
          # 		   
          # 		   01 = Fog, ice fog, or freezing fog (may include heavy fog)
          # 		   03 = Thunder
          # 		   07 = Ash, dust, sand, or other blowing obstruction
          # 		   18 = Snow or ice crystals
          # 		   20 = Rain or snow shower
          
          # save this data
          if (e_y == 1) {ghcn_TM <- gz_ghcn} else {ghcn_TM <- rbind(ghcn_TM, gz_ghcn)}
        }
        # ghcn_TM <- ghcn_TM[,V4:=ifelse(V4==9999|V4==-9999, NA, round(32 + 9 * V4/50, 3))]; ## convert to degrees F; noaa record is tenths of degrees C
        ghcn_TM <- ghcn_TM %>%
          mutate(V4=ifelse(V3=="PRCP", V4, ifelse(V4==9999|V4==-9999, NA, round(32 + 9 * V4/50, 3))))
        ghcn_TM[, V2 := ymd(V2)]; # convert charcter date to ymd format
        
        PTT_PTR <- matrix(ncol = 8, nrow = 0); 
        for (e_i in 1:nrow(env_meta_info)) { 
          
          # for each environment, pull the metadata:
          env_code <- env_meta_info$env_code[e_i];
          field_lat <- env_meta_info$lat[e_i]; field_lon <- env_meta_info$lon[e_i]; 
          planting_date <- env_meta_info$PlantingDate[e_i];
          planting_year <- year(planting_date);
          
          # set the dates to inspect for this environment (from planting until your set end date); see above for details
          DAPs <- seq(as.Date(planting_date), length.out = searching_daps, by = "day");
          ending_date <- DAPs[searching_daps]; 
          ending_year <- year(ending_date);
          
          # read in day length file made earlier
          local_dl_file <- paste(sp_navy_dir, env_code, '_DL_', searching_daps, 'DAPs', sep = '')
          DL <- read.table(local_dl_file, header = T, sep = "\t", stringsAsFactors = F);
          
          # set name of temperature file and make it
          local_tm_file <- paste(sp_ghcn_dir, env_code, '_TM_', searching_daps, 'DAPs', sep = '' );
          
          # this subfunction calculates the daily max and min temperatures, averaged over the nearest weather stations, for each environment
          # if (!file.exists(local_tm_file)) {
          
          TM_from_local_NOAA(ghcn_TM %>% filter(V3 %in% c("TMAX", "TMIN")), 
                             field_lat, field_lon, 
                             local_tm_file, STS_ghcn, DAPs)
          # } ;
          
          TM_0 <- read.table(local_tm_file, header = T, sep = "\t", stringsAsFactors = F); # read in temperature file created by above function
          
          PRCP_0 <- PRCP_from_local_NOAA(ghcn_TM %>% filter(V3 == "PRCP"), field_lat, field_lon, 
                                         STS_ghcn, DAPs)
          PRCP_0$date <- as.character(PRCP_0$date)
          
          # merge day length with temperature data AND precip--
          DL_TM <- merge(DL, TM_0, all.x = T) ;
          DL_TM <- left_join(DL_TM, PRCP_0, by="date")
          
          # fill in missing temperatures
          DL_TM$TMAX <- Fill_Missning_TM(DL_TM$TMAX, env);
          DL_TM$TMIN <- Fill_Missning_TM(DL_TM$TMIN, env);
          DL_TM$precip <- Fill_Missning_TM(DL_TM$precip, env);
          
          # save max and min temperatures as vectors
          Tmax <- DL_TM$TMAX; Tmin <- DL_TM$TMIN; precip.vector <- DL_TM$precip
          
          # calculate GDD with sub-function:
          GDDs <- Adjusting_TM_4_GDD(Tmax, Tmin, Haun_threshold, t_base, t_max1, t_max2);
          
          # Calculate other environmental parameters:
          PTTs <- round(GDDs * DL_TM$DL, 4); # PTT=GDDxDL
          PTRs <- round(GDDs / DL_TM$DL, 4); # PTR=GDD/DL
          PTD1s <- round((DL_TM$TMAX - DL_TM$TMIN) * DL_TM$DL, 4); #PTD1=(TMAX-TMIN)xDL
          PTD2s <- round((DL_TM$TMAX - DL_TM$TMIN) / DL_TM$DL, 4); #PTD2=(TMAX-TMIN)/DL
          PTSs <- round(((DL_TM$TMAX^2) - (DL_TM$TMIN^2)) * (DL_TM$DL^2), 4); #PTS=(TMAX^2 - TMIN^2)*(DL^2)
          avg.temp <- (Tmax+Tmin)/2
          DTR <- Tmax-Tmin
          
          
          env_codes <- rep(env_code, searching_daps); # make vector of the environmental code of the length of number of days of interest
          
          # make data frame to hold this data, and output
          t_df <- data.frame(env_code = env_codes, date = DAPs, TMAX = DL_TM$TMAX, TMIN = DL_TM$TMIN, DL = DL_TM$DL, GDD = GDDs, PTT = PTTs, 
                             PTR = PTRs, PTD1 = PTD1s, PTD2 = PTD2s, PTS = PTSs, PRECIP=precip.vector, avg.temp=avg.temp)
          PTT_PTR <- rbind(PTT_PTR, t_df);
        }
        
        write.table(PTT_PTR, file = ptt_ptr_file, sep = "\t", quote = F, row.name = F)
        invisible(gc());
      }
      
      # Expand to include more environment parameters! Don't want to break the old one in the process.
      Compile_PTT_PTR_local_GHCN_extend <-  function(exp_dir, env_meta_info, exp_s_year, exp_e_year, searching_daps,ptt_ptr_file, t_base, t_max1, t_max2, petfile) {
        
        # make directories:
        sp_env_dir <- paste(exp_dir, 'envs/', sep = '');     if (!dir.exists(sp_env_dir))  { dir.create(sp_env_dir)};
        # GHCN is global historical climatology network
        sp_ghcn_dir <- paste(sp_env_dir, 'ghcn/', sep = ''); if (!dir.exists(sp_ghcn_dir)) { dir.create(sp_ghcn_dir)};
        sp_navy_dir <- paste(sp_env_dir, 'Geoshpere/', sep = ''); if (!dir.exists(sp_navy_dir)) { dir.create(sp_navy_dir)};
        
        # set latitude and longitude range of data
        lat_range <- range(env_meta_info$lat, na.rm = T); lon_range <- range(env_meta_info$lon, na.rm = T);
        
        # pull the weather stations with info relevant to your environments
        local_ghcn_st_file <- paste('all_ghcn_stations', sep = '');  
        local_ghcn_target_st_file <- paste(sp_ghcn_dir, '0target_ghcn_stations', sep = '');
        if (!file.exists(local_ghcn_target_st_file)) {
          
          # if you don't have a pre-made list of all ghcn stations, then get that info
          if (!file.exists(local_ghcn_st_file)) { 
            ghcn_all_sts <- ghcnd_stations() # this function returns data for all GHCN weather stations available
          } else { 
            ghcn_all_sts <- read.csv(local_ghcn_st_file);
          }
          
          # filter the ghcn info to only include those within your begin and end year, latitude, and longitude
          # only keep the max and min temperature
          STS_ghcn <- dplyr::filter(ghcn_all_sts, first_year <= exp_s_year & last_year >= exp_e_year 
                                    & between(latitude, lat_range[1] - 2, lat_range[2] + 2) 
                                    & between(longitude,lon_range[1] - 2, lon_range[2] + 2)
                                    & element %in% c("TMAX", "TMIN", "PRCP")
          );
          
          write.csv(STS_ghcn, local_ghcn_target_st_file  ) # write out file
        }
        STS_ghcn <- read.csv(local_ghcn_target_st_file); 
        
        st_ids <- c();
        for (e_i in 1:nrow(env_meta_info)) { 
          
          # for each environment, pull the metadata:
          env_code <- env_meta_info$env_code[e_i];
          field_lat <- env_meta_info$lat[e_i]; field_lon <- env_meta_info$lon[e_i]; 
          planting_date <- env_meta_info$PlantingDate[e_i];
          planting_year <- year(planting_date);
          
          # set the dates to inspect for this environment (from planting until your set end date)
          DAPs <- seq(as.Date(planting_date), length.out = searching_daps, by = "day"); # make a list of all the dates this field was growing, up until the specified searching_daps limit
          ending_date <- DAPs[searching_daps]; 
          ending_year <- year(ending_date);
          
          # Make file containing daylength if it does not already exist
          # Using DayLength_from_Equation function
          local_dl_file <- paste(sp_navy_dir, env_code, '_DL_', searching_daps, 'DAPs', sep = '')
          # if (!file.exists(local_dl_file)) {
          DayLength_from_Equation(planting_year, ending_year, 
                                  field_lat, field_lon, 
                                  local_dl_file, DAPs ) 
          # }
          
          # find closest 6 weather stations within 50 km of the field location
          field_sts <- meteo_distance(STS_ghcn, lat = field_lat, lon = field_lon, radius = 50, limit = 6);
          
          # save the IDs of those weather stations
          st_ids <- append(st_ids, unique(as.vector(field_sts$id)));
        }
        
        # keep only unique weather stations (in case environments near one another)
        st_ids <- unique(st_ids);
        
        # work through all years of the overall experiment to read in weather data:
        for (e_y in 1:(exp_e_year - exp_s_year + 1)) {
          gz_ghcn_file <- paste(ghcn_year_dir, exp_s_year + e_y - 1, '.csv.gz', sep = ''); # set weather file name
          
          # read in weather data, and filter to keep only the weather stations we want AND only keep the max and min temp data from them
          gz_ghcn <- fread(gz_ghcn_file, select = c(1:4),showProgress = FALSE);
          gz_ghcn <- gz_ghcn[gz_ghcn$V1 %in% st_ids & gz_ghcn$V3 %in% c('TMAX', 'TMIN', 'PRCP')][,1:4] # keep precip plus temp 
          
          #    # definitions of weather units:
          #               PRCP = Precipitation (tenths of mm)
          #    	   SNOW = Snowfall (mm)
          # 	   SNWD = Snow depth (mm)
          #            TMAX = Maximum temperature (tenths of degrees C)
          #            TMIN = Minimum temperature (tenths of degrees C)
          # 	   
          # 	   The other elements are:
          # 	   
          # 	   ACMC = Average cloudiness midnight to midnight from 30-second 
          # 	          ceilometer data (percent)
          # 	   ACMH = Average cloudiness midnight to midnight from 
          # 	          manual observations (percent)
          #            ACSC = Average cloudiness sunrise to sunset from 30-second 
          # 	          ceilometer data (percent)
          # 	   ACSH = Average cloudiness sunrise to sunset from manual 
          # 	          observations (percent)
          #            AWDR = Average daily wind direction (degrees)
          # 	   AWND = Average daily wind speed (tenths of meters per second)
          # 	   DAEV = Number of days included in the multiday evaporation
          # 	          total (MDEV)
          # 	   DAPR = Number of days included in the multiday precipiation 
          # 	          total (MDPR)
          #            DASF = Number of days included in the multiday snowfall 
          # 	          total (MDSF)		  
          # 	   DATN = Number of days included in the multiday minimum temperature 
          # 	         (MDTN)
          # 	   DATX = Number of days included in the multiday maximum temperature 
          # 	          (MDTX)
          #            DAWM = Number of days included in the multiday wind movement
          # 	          (MDWM)
          # 	   DWPR = Number of days with non-zero precipitation included in 
          # 	          multiday precipitation total (MDPR)
          # 	   EVAP = Evaporation of water from evaporation pan (tenths of mm)
          # 	   FMTM = Time of fastest mile or fastest 1-minute wind 
          # 	          (hours and minutes, i.e., HHMM)
          # 	   FRGB = Base of frozen ground layer (cm)
          # 	   FRGT = Top of frozen ground layer (cm)
          # 	   FRTH = Thickness of frozen ground layer (cm)
          # 	   GAHT = Difference between river and gauge height (cm)
          # 	   MDEV = Multiday evaporation total (tenths of mm; use with DAEV)
          # 	   MDPR = Multiday precipitation total (tenths of mm; use with DAPR and 
          # 	          DWPR, if available)
          # 	   MDSF = Multiday snowfall total 
          # 	   MDTN = Multiday minimum temperature (tenths of degrees C; use with 
          # 	          DATN)
          # 	   MDTX = Multiday maximum temperature (tenths of degress C; use with 
          # 	          DATX)
          # 	   MDWM = Multiday wind movement (km)
          #            MNPN = Daily minimum temperature of water in an evaporation pan 
          # 	         (tenths of degrees C)
          #            MXPN = Daily maximum temperature of water in an evaporation pan 
          # 	         (tenths of degrees C)
          # 	   PGTM = Peak gust time (hours and minutes, i.e., HHMM)
          # 	   PSUN = Daily percent of possible sunshine (percent)
          # 	   SN*# = Minimum soil temperature (tenths of degrees C)
          # 	          where * corresponds to a code
          # 	          for ground cover and # corresponds to a code for soil 
          # 		  depth.  
          # 		  
          # 		  Ground cover codes include the following:
          # 		  0 = unknown
          # 		  1 = grass
          # 		  2 = fallow
          # 		  3 = bare ground
          # 		  4 = brome grass
          # 		  5 = sod
          # 		  6 = straw multch
          # 		  7 = grass muck
          # 		  8 = bare muck
          # 		  
          # 		  Depth codes include the following:
          # 		  1 = 5 cm
          # 		  2 = 10 cm
          # 		  3 = 20 cm
          # 		  4 = 50 cm
          # 		  5 = 100 cm
          # 		  6 = 150 cm
          # 		  7 = 180 cm
          # 		  
          # 	   SX*# = Maximum soil temperature (tenths of degrees C) 
          # 	          where * corresponds to a code for ground cover 
          # 		  and # corresponds to a code for soil depth. 
          # 		  See SN*# for ground cover and depth codes. 
          #            TAVG = Average temperature (tenths of degrees C)
          # 	          [Note that TAVG from source 'S' corresponds
          # 		   to an average for the period ending at
          # 		   2400 UTC rather than local midnight]
          #            THIC = Thickness of ice on water (tenths of mm)	
          #  	   TOBS = Temperature at the time of observation (tenths of degrees C)
          # 	   TSUN = Daily total sunshine (minutes)
          # 	   WDF1 = Direction of fastest 1-minute wind (degrees)
          # 	   WDF2 = Direction of fastest 2-minute wind (degrees)
          # 	   WDF5 = Direction of fastest 5-second wind (degrees)
          # 	   WDFG = Direction of peak wind gust (degrees)
          # 	   WDFI = Direction of highest instantaneous wind (degrees)
          # 	   WDFM = Fastest mile wind direction (degrees)
          #            WDMV = 24-hour wind movement (km)	   
          #            WESD = Water equivalent of snow on the ground (tenths of mm)
          # 	   WESF = Water equivalent of snowfall (tenths of mm)
          # 	   WSF1 = Fastest 1-minute wind speed (tenths of meters per second)
          # 	   WSF2 = Fastest 2-minute wind speed (tenths of meters per second)
          # 	   WSF5 = Fastest 5-second wind speed (tenths of meters per second)
          # 	   WSFG = Peak gust wind speed (tenths of meters per second)
          # 	   WSFI = Highest instantaneous wind speed (tenths of meters per second)
          # 	   WSFM = Fastest mile wind speed (tenths of meters per second)
          # 	   WT** = Weather Type where ** has one of the following values:
          # 	   
          #                   01 = Fog, ice fog, or freezing fog (may include heavy fog)
          #                   02 = Heavy fog or heaving freezing fog (not always 
          # 		       distinquished from fog)
          #                   03 = Thunder
          #                   04 = Ice pellets, sleet, snow pellets, or small hail 
          #                   05 = Hail (may include small hail)
          #                   06 = Glaze or rime 
          #                   07 = Dust, volcanic ash, blowing dust, blowing sand, or 
          # 		       blowing obstruction
          #                   08 = Smoke or haze 
          #                   09 = Blowing or drifting snow
          #                   10 = Tornado, waterspout, or funnel cloud 
          #                   11 = High or damaging winds
          #                   12 = Blowing spray
          #                   13 = Mist
          #                   14 = Drizzle
          #                   15 = Freezing drizzle 
          #                   16 = Rain (may include freezing rain, drizzle, and
          # 		       freezing drizzle) 
          #                   17 = Freezing rain 
          #                   18 = Snow, snow pellets, snow grains, or ice crystals
          #                   19 = Unknown source of precipitation 
          #                   21 = Ground fog 
          #                   22 = Ice fog or freezing fog
          # 		  
          #             WV** = Weather in the Vicinity where ** has one of the following 
          # 	           values:
          # 		   
          # 		   01 = Fog, ice fog, or freezing fog (may include heavy fog)
          # 		   03 = Thunder
          # 		   07 = Ash, dust, sand, or other blowing obstruction
          # 		   18 = Snow or ice crystals
          # 		   20 = Rain or snow shower
          
          # save this data
          if (e_y == 1) {ghcn_TM <- gz_ghcn} else {ghcn_TM <- rbind(ghcn_TM, gz_ghcn)}
        }
        # ghcn_TM <- ghcn_TM[,V4:=ifelse(V4==9999|V4==-9999, NA, round(32 + 9 * V4/50, 3))]; ## convert to degrees F; noaa record is tenths of degrees C
        ghcn_TM <- ghcn_TM %>%
          mutate(V4=ifelse(V3=="PRCP", V4, ifelse(V4==9999|V4==-9999, NA, round(32 + 9 * V4/50, 3))))
        ghcn_TM[, V2 := ymd(V2)]; # convert charcter date to ymd format
        
        # read in PET (potential evapotranspiration) data:
        pet <- read_csv(petfile)
        
        PTT_PTR <- matrix(ncol = 8, nrow = 0); 
        for (e_i in 1:nrow(env_meta_info)) { 
          
          # for each environment, pull the metadata:
          env_code <- env_meta_info$env_code[e_i];
          field_lat <- env_meta_info$lat[e_i]; field_lon <- env_meta_info$lon[e_i]; 
          planting_date <- env_meta_info$PlantingDate[e_i];
          planting_year <- year(planting_date);
          
          # set the dates to inspect for this environment (from planting until your set end date); see above for details
          DAPs <- seq(as.Date(planting_date), length.out = searching_daps, by = "day");
          ending_date <- DAPs[searching_daps]; 
          ending_year <- year(ending_date);
          
          # read in day length file made earlier
          local_dl_file <- paste(sp_navy_dir, env_code, '_DL_', searching_daps, 'DAPs', sep = '')
          DL <- read.table(local_dl_file, header = T, sep = "\t", stringsAsFactors = F);
          
          # set name of temperature file and make it
          local_tm_file <- paste(sp_ghcn_dir, env_code, '_TM_', searching_daps, 'DAPs', sep = '' );
          #  local_tm_file <- paste(sp_isd_dir, env_code, '_TM_', searching_daps, 'DAPs', sep = '' );
          
          # this subfunction calculates the daily max and min temperatures, averaged over the nearest weather stations, for each environment
          # if (!file.exists(local_tm_file)) {
          
          TM_from_local_NOAA(ghcn_TM %>% filter(V3 %in% c("TMAX", "TMIN")), 
                             field_lat, field_lon, 
                             local_tm_file, STS_ghcn, DAPs)
          # } ;
          
          TM_0 <- read.table(local_tm_file, header = T, sep = "\t", stringsAsFactors = F); # read in temperature file created by above function
          
          PRCP_0 <- PRCP_from_local_NOAA(ghcn_TM %>% filter(V3 == "PRCP"), field_lat, field_lon, 
                                         STS_ghcn, DAPs)
          PRCP_0$date <- as.character(PRCP_0$date)
          
          # merge day length with temperature data AND precip--
          DL_TM <- merge(DL, TM_0, all.x = T) ;
          DL_TM <- left_join(DL_TM, PRCP_0, by="date")
          
          # fill in missing temperatures
          DL_TM$TMAX <- Fill_Missning_TM(DL_TM$TMAX, env);
          DL_TM$TMIN <- Fill_Missning_TM(DL_TM$TMIN, env);
          DL_TM$precip <- Fill_Missning_TM(DL_TM$precip, env);
          
          # pull PET and join:
          current.env <- env_code
          current.pet <- pet%>%
            filter(env_code==current.env) %>%
            dplyr::select(-env_code)
          current.pet$date <- as.character(current.pet$date)
          DL_TM <- left_join(DL_TM, current.pet, by="date")
          
          # save max and min temperatures as vectors
          Tmax <- DL_TM$TMAX; Tmin <- DL_TM$TMIN; precip.vector <- DL_TM$precip
          
          # calculate GDD with sub-function:
          GDDs <- Adjusting_TM_4_GDD(Tmax, Tmin, Haun_threshold, t_base, t_max1, t_max2);
          
          # Calculate other environmental parameters:
          PTTs <- round(GDDs * DL_TM$DL, 4); # PTT=GDDxDL
          PTRs <- round(GDDs / DL_TM$DL, 4); # PTR=GDD/DL
          PTD1s <- round((DL_TM$TMAX - DL_TM$TMIN) * DL_TM$DL, 4); #PTD1=(TMAX-TMIN)xDL
          PTD2s <- round((DL_TM$TMAX - DL_TM$TMIN) / DL_TM$DL, 4); #PTD2=(TMAX-TMIN)/DL
          PTSs <- round(((DL_TM$TMAX^2) - (DL_TM$TMIN^2)) * (DL_TM$DL^2), 4); #PTS=(TMAX^2 - TMIN^2)*(DL^2)
          avg.temp <- (Tmax+Tmin)/2
          DTR <- Tmax-Tmin # diurnal temperature range
          H20.balance <- (DL_TM$precip)/10 - (DL_TM$PET)/100 # precip in 10th of mm, PET in 100th of mm so correct here
          env_codes <- rep(env_code, searching_daps); # make vector of the environmental code of the length of number of days of interest
          
          # make data frame to hold this data, and output
          t_df <- data.frame(env_code = env_codes, date = DAPs, TMAX = DL_TM$TMAX, TMIN = DL_TM$TMIN, DL = DL_TM$DL, GDD = GDDs, PTT = PTTs, 
                             PTR = PTRs, PTD1 = PTD1s, PTD2 = PTD2s, PTS = PTSs, PRECIP=precip.vector, DTR=DTR, PET=DL_TM$PET, H20.balance=H20.balance)
          PTT_PTR <- rbind(PTT_PTR, t_df);
        }
        
        write.table(PTT_PTR, file = ptt_ptr_file, sep = "\t", quote = F, row.name = F)
        invisible(gc());
      }
      
      # Function used in Compile_PTT_PTR_local_GHCN:
      # Function to calculate day length for each day of the growing season
      DayLength_from_Equation <- function(Y1, Y2, lat, lon, local_file, days) {
        DLs <- wholeyear_daylength_from_function(Y1, lat, lon); # actually go calculate the day lengths
        
        # if ending year was later than planting year (that is, if planted in fall/winter and harvested next spring),
        # you need to get the day length for that next year too
        if (Y2 > Y1) {
          DL_2 <- wholeyear_daylength_from_function(Y2, lat, lon); 
          DLs <- rbind(DLs, DL_2);
        }
        
        # keep only the day lengths in the dates you're interested in (planting to DAP limit), and output
        DL_window <- DLs[DLs$date %in% days, ];
        write.table(DL_window, file = local_file, sep = "\t", row.name = F, quote = F)
      }
      
      # sub-function used in Compile_PTT_PTR_local_GHCN: 
      # calculate day lengths for a given location in a given year
      wholeyear_daylength_from_function <- function(Y, lat_dec, lon_dec) {
        # set beginning and ending dates
        d1 <- paste(Y, '-1-1', sep = ''); 
        dL <- paste(Y, '-12-31', sep = '');
        
        # make a vector of the names of all the dates of that year:
        Ds <- seq(as.Date(d1), as.Date(dL), by = "days");
        
        # calculate the daylength for a given latitude and DAY of year (not date)
        DL <- round(daylength(lat_dec, 1:length(Ds)), 3);   
        
        # make dataframe with day length by date, and output it
        DL_df <- data.frame(date = Ds, DL = DL)
        return (DL_df);
      }
      
      # calculate GDD, including adjusting for the limit max and min temp and for the Haun threshold
      # used in both Compile_PTT_PTR_local_GHCN and Compile_PTT_PTR
      Adjusting_TM_4_GDD <- function(Tmax, Tmin, threshold, t_base, t_max1, t_max2) {
        Tmax[Tmax < t_base] <- t_base; Tmin[Tmin < t_base] <- t_base; # if the temperatures are BELOW the base temp, set it as the base temp
        if (Tmax[1] > t_max2) {Tmax[1] <- t_max2}; # if starting temperatures are above the second (Haun) threshold,
        if (Tmin[1] > t_max2) {Tmin[1] <- t_max2}; # set them as that threshold
        if (threshold > 0) { # IF the Haun threshold exists for this species
          gdd_cum <- (Tmax[1] + Tmin[1]) / 2 - t_base; # GDD formula, for first day of season
          for (i in 2:length(Tmax)) { # for each day of the season:
            if (gdd_cum < threshold) { t_max0 <- t_max2} else {t_max0 <- t_max1}; # set the max temp as appropriate, 
            if (Tmax[i] > t_max0) { Tmax[i] <- t_max0 }; #depending on whether cumulative GDD have surpassed Haun threshold
            if (Tmin[i] > t_max0) { Tmin[i] <- t_max0 };
            gdd_cum <- gdd_cum + (Tmax[i] + Tmin[i]) / 2 - t_base; # add day's results to cumulative GDD
          }    
        } else { # or without the Haun threshold, limit the daily temps 
          Tmax[Tmax > t_max1] <- t_max1; Tmin[Tmin > t_max1] <- t_max1;
        }
        gdds <- round((Tmax + Tmin) / 2 - t_base, 4) # calculate GDD!
        return (gdds)
      } 
      
      # used in Compile_PTT_PTR_local_GHCN
      TM_from_local_NOAA <- function(TM_0,  lat, lon,  local_file, sts_ghcn, daps) {
        
        # pull nearest 6 weather stations within 50km of field location
        field_sts <- meteo_distance(sts_ghcn, lat = lat, lon = lon, radius = 50, limit = 6);
        
        # pull the weather station IDs:
        st_ids <- unique(as.vector(field_sts$id));
        T_mean <-  data.frame(date = daps); # make data frame currently holding all dates in growing season through DAPs limit
        
        TM <- TM_0[TM_0$V2 %in% daps][,1:4]; # pull temperature data for the dates of interest
        
        # extract daily max temperature data for each weather station
        TMAX <- TM[TM$V1 %in% st_ids & TM$V3 == 'TMAX'];
        # and find the average daily max temp data for each day, averaged over the different weather stations
        TMAX_mean <- TMAX[, .(TMAX = mean(V4)), by = V2];
        # format and add info to T_mean dataframe:
        colnames(TMAX_mean)[1] <- 'date'; 
        T_mean  <- merge(T_mean , TMAX_mean, all.x = T)
        
        # repeat for the minimum temperatures:
        TMIN <- TM[TM$V1 %in% st_ids & TM$V3 == 'TMIN'];
        TMIN_mean <- TMIN[, .(TMIN = mean(V4)), by = V2];
        colnames(TMIN_mean)[1] <- 'date';
        T_mean <- merge(T_mean, TMIN_mean, all.x = T);
        
        # write out result
        write.table(T_mean, file = local_file, sep = "\t", row.name = F, quote = F) ;
      }
      
      # used in Compile_PTT_PTR_local_GHCN
      # make this work for Precip also
      PRCP_from_local_NOAA <- function(PRCP_0, lat, lon, sts_ghcn, daps) {
        # pull nearest 6 weather stations within 50km of field location
        field_sts <- meteo_distance(sts_ghcn, lat = lat, lon = lon, radius = 50, limit = 6);
        
        # pull the weather station IDs:
        st_ids <- unique(as.vector(field_sts$id));
        T_mean <-  data.frame(date = daps); # make data frame currently holding all dates in growing season through DAPs limit
        
        PRCP <- PRCP_0[PRCP_0$V2 %in% daps][,1:4]; # pull temperature data for the dates of interest
        
        # extract daily precip data for each weather station
        precip <- PRCP[PRCP$V1 %in% st_ids & PRCP$V3 %in% c('PRCP')];
        # and find the average daily max temp data for each day, averaged over the different weather stations
        precip_mean <- precip[, .(precip = mean(V4)), by = V2];
        # format and add info to T_mean dataframe:
        colnames(precip_mean)[1] <- 'date'; 
        T_mean  <- merge(T_mean , precip_mean, all.x = T)
        return(T_mean)
      }
      
      # used in Compile_PTT_PTR_local_GHCN
      Fill_Missning_TM <- function(Tx, env) {
        NA_inds <- which(is.na(Tx)); # identify dates with missing max temp data
        A_inds <- which(!is.na(Tx)) # and all other dates
        for (NA_ind in NA_inds ) {
          if (NA_ind == 1) {  # if you are missing the first day's max temp data,
            Tx[NA_ind] <- Tx[A_inds[1]] # just assign the next day's data to it
          } else if (NA_ind >= max(A_inds)) { # or, if you are missing the last day's temp data, 
            Tx[NA_ind] <- Tx[max(A_inds)] # just assign the previous day's data to it
          } else { # otherwise, 
            pre_ind <- A_inds[max(which(A_inds < NA_ind))]; # find the nearest previous non-missing day
            suff_ind <- A_inds[min(which(A_inds > NA_ind))]; # and the nearest following non-missing day
            Tx[NA_ind] <- mean(Tx[c(pre_ind, suff_ind)] ); # and average the two
          }
        }
        return(Tx);
      }
      
      # Function to search for the best environmental parameter and timeframe to predict phenotype
      Exhaustive_search_full <- function(env_mean_trait, env_paras, searching_daps, exp_pop_dir,
                                         FTdaps, trait, dap_x, dap_y, LOO, min.window.size) {
        
        # make file names
        pop_cor_file <- paste(exp_pop_dir, trait, '_', nrow(env_mean_trait), 'Envs_PTTPTR_', LOO, 'LOO_cor_whole_pop', sep = '');
        pop_pval_file <- paste(exp_pop_dir, trait, '_', nrow(env_mean_trait), 'Envs_PTTPTR_', LOO, 'LOO_P_whole_pop', sep = '');
        
        
        # if (!file.exists(pop_cor_file)) {
        dap_win <- (searching_daps) * (searching_daps)  / 2; # how many search windows do you need to iterate over? It's half of a square, so days^2 /2
        
        # make matrix to hold correlations and their p values
        pop_cors_matrix <- matrix(ncol = ((ncol(env_paras)-2)*2)+3, nrow = dap_win * 1);
        pop_pval_matrix <- matrix(ncol = ((ncol(env_paras)-2))+3, nrow = dap_win * 1);
        colnames(pop_cors_matrix) <-c("Day_x", "Day_y", "window",
                                      paste0("R_", colnames(env_paras)[-c(1:2)]),
                                      paste0("nR_", colnames(env_paras)[-c(1:2)]))
        colnames(pop_pval_matrix)<-c("Day_x", "Day_y", "window",
                                     paste0("pval_", colnames(env_paras)[-c(1:2)]))
        
        # # make matrix to hold correlations and their p values, with no negative versions of the param!
        #   pop_cors_matrix <- matrix(ncol = ((ncol(env_paras)-2))+3, nrow = dap_win * 1);
        #   pop_pval_matrix <- matrix(ncol = ((ncol(env_paras)-2))+3, nrow = dap_win * 1);
        #   colnames(pop_cors_matrix) <-c("Day_x", "Day_y", "window",
        #       paste0("R_", colnames(env_paras)[-c(1:2)]))
        #       # ,
        #       # paste0("nR_", colnames(env_paras)[-c(1:2)]))
        #   colnames(pop_pval_matrix)<-c("Day_x", "Day_y", "window",
        #       paste0("pval_", colnames(env_paras)[-c(1:2)]))
        #     # c("pop_code", 'Day_x', 'Day_y', 'window', 'R_DL', 'R_GDD', 'R_PTT', 'R_PTR', 'R_PTD', 'R_PTD2', 'R_PTS', 'nR_DL', 'nR_GDD', 'nR_PTT', 'nR_PTR', 'nR_PTD', 'nR_PTD2', 'nR_PTS');
        
        # loop through the season in different window sizes:
        n <- 0; # initialize iterator variable
        for (d1 in 1:(dap_y - min.window.size)) { # the +/- X here prevents it from choosing the very last day as the start date,
          for (d2 in (d1 + min.window.size):dap_y) { # or from choosing a day too close to the start date as the window end date
            days <- c(d1:d2); # make list of days in current window
            env_facts_matrix <- matrix(nrow = nrow(env_mean_trait), ncol = ncol(env_paras)-2);
            for (e_i in 1:nrow(env_mean_trait)) { # loop through the environments
              e <- env_mean_trait$env_code[e_i]; # set current environment code
              env_para <- subset(env_paras, env_paras$env_code == e); # pull all environmental info for this environ
              # env_mean <- colMeans(env_para[days, (c(8, 9, 10, 11, 12, 13, 14) - 3)]); # find mean for these values in this environ: ### DL, GDD, PTT, PTR, PTD, PTD2, PTS
              env_mean <- colMeans(env_para[days, c(3:ncol(env_para))])
              env_facts_matrix[e_i,] <- env_mean; # put the mean values of the parameters in this environment into a matrix
            }
            n <- n + 1; # increment iterator
            ### leave one environment out and get the median correlation
            Ymean_envPara <- cbind(env_facts_matrix, env_mean_trait$meanY); # add mean observed trait value column to your environmental parameters matrix
            rs <- c(); # save r (correlation) values
            ps <- c(); # need to keep p values  now too
            if (LOO == 0) { # if you are NOT leaving-one-out when calculating correlation:
              for (k in c(1:(ncol(env_paras)-2))) {
                rs[k] <- round(cor(Ymean_envPara[,ncol(Ymean_envPara)], Ymean_envPara[,k]), digits = 4) # find correlation between each environmental parameter and the environment mean
                ps[k] <- round(-log(cor.test(Ymean_envPara[,ncol(Ymean_envPara)], Ymean_envPara[,k])$p.value, 10), digits = 4) # save the -log10 p values for the correlation
              }
              
            } else { # if leaving one out when calculating correlation:
              loo_rs_matrix <- matrix(nrow = nrow(Ymean_envPara)+ 0, ncol = ncol(env_paras)-2); # make matrix to hold correlations
              loo_ps_matrix <- matrix(nrow = nrow(Ymean_envPara)+ 0, ncol = ncol(env_paras)-2); # make matrix to hold p val
              for (k in c(1:(ncol(env_paras)-2))) { ## loop through environment parameters
                for (e_x in c(1:nrow(Ymean_envPara))) { # loop through the environments
                  t_matrix <- Ymean_envPara[-e_x,]; # drop one environment from your matrix
                  loo_rs_matrix[e_x, k] <- round(cor(t_matrix[,ncol(Ymean_envPara)], t_matrix[,k]), digits = 4) # find correlation between trait value and environmental param, leaving out one environ
                  loo_ps_matrix[e_x, k] <- round(-log(cor.test(Ymean_envPara[,ncol(Ymean_envPara)], t_matrix[,k])$p.value, 10), digits = 4) # find p value of correlation between trait value and environmental param, leaving out one environ
                }
              }
              rs <- apply(loo_rs_matrix, 2, median); # get median of the correlations for leaving one out to get final correlation between environmental param and trait, by environ param
              ps <- apply(loo_ps_matrix, 2, median); # get median of the pval for leaving one out to get final pval between environmental param and trait, by environ param
            }
            pop_cors_matrix[n, ] <- c(d1, d2, d2 - d1, rs, 0 - rs); # add window's info to the matrix AND add the NEGATIVE correlations too (using 0-rs)
            # pop_cors_matrix[n, ] <- c(d1, d2, d2 - d1, rs); # add window's info to the matrix WITHOUT the NEGATIVE correlations 
            pop_pval_matrix[n,] <- c(d1,d2,d2-d1, ps) # keep the p values
            
          }
        }
        pop_cors_matrix <- pop_cors_matrix[1:n,]
        pop_pval_matrix <- pop_pval_matrix[1:n,]
        write.table(pop_cors_matrix, file = pop_cor_file, sep = "\t", row.names = F, quote = F);
        write.table(pop_pval_matrix, file = pop_pval_file, sep = "\t", row.names = F, quote = F);
        
        # }
      }
      
      # Plot results of exhaustive search
      exhaustive_plot <- function(pop_cor_file, pop_pval_file, env_paras, dap_x, dap_y,
                                  FTdaps, type=c("p.only", "r.only", "both") # use type to choose which type of plot to print
      ) {
        # read data back in
        pop_cors <- read.table(pop_cor_file, header = T, sep = "\t");
        pop_pvals <- read.table(pop_pval_file, header=T, sep="\t")
        
        if (type=="r.only") {
          # make a pdf to show all of the environmental parameters and their optimal windows:
          pdf(paste0(exp_pop_dir, "/MaxR_", trait, ".pdf"),
              width= (ncol(env_paras)-2),height= 2,
              pointsize=6)
          layout(matrix(c(1:((ncol(env_paras)-2)*2)), 2, (ncol(env_paras)-2), byrow = T))
          
          for (k in c(1:((ncol(env_paras)-2)*2))) { # loop through the environmental parameters, plus all of them again but with color scale switched
            # pop_cor_0 <- subset(pop_cors, pop_cors$pop_code == p); # keep only the window correlations for those in the desired pop
            pop_cor_0 <- pop_cors
            pop_cor <- pop_cor_0[,c(1:3, k + 3)]; # pull correlations for the environ parameter of interest
            colnames(pop_cor)[4] <- 'R';
            pop_cor <- pop_cor[order(pop_cor$R),]; # sort by correlation
            
            xs <- pop_cor$Day_x;  ys <-  pop_cor$Day_y; # save window start and end dates, in order of correlation
            mid_R <- median(pop_cor$R); # find median correlation across windows
            
            # set colors: 
            cell_col <- floor(pop_cor$R * 12) + 13; ### the same color scale
            pop_cor$cell_col <- cell_col; 
            
            max_R <- pop_cor[which.max(abs(pop_cor$R))[1], ]; # pull the maximum absolute value of correlation found
            
            # set parameters--
            par(mar = c(0.5, 1.0, 1, 0.5) , # set margins on all sides
                mgp = c(0.05, 0.1, 0), # set margin lines for axis title, axis labels, axis line
                tck = -0.01, # length of tick marks as fraction of plotting region
                bty = "n"); # character string, suppressing box drawing
            plot(-50, -50, xlim = c(0, dap_x), ylim = c(0, dap_y), col = "white",  xlab = '', 
                 xaxt = "n", yaxt = "n", ylab = 'Days after planting', bty = "n", cex.lab = 1);
            arrows(-1, 10, -1, dap_y - 10, length = 0.05, angle = 15, lwd = .5,  col = "grey59");
            mtext(c(1, 50, 100, dap_y), side = 2, at = c(1,50, 100, dap_y), line = -1, cex = .6)
            
            rect(xs - 0.5, ys - 0.5, xs + 0.5, ys + 0.5, col = col_palette[pop_cor$cell_col], border = "NA")
            rect(max(pop_cor$Day_x) - 0.5, max(pop_cor$Day_y) - 0.5, max(pop_cor$Day_x) + 0.5, max(pop_cor$Day_y) + 0.5, border = "NA", col = "white", lwd = 0.001)
            
            arrows(10, dap_y + 4, dap_x - 10, dap_y + 4, angle = 15, length = 0.05, lwd = .5, col = "grey59")
            mtext("Days after planting", side = 3, at = dap_x / 2, line = -0.1, cex = .6)
            mtext(c(1, 50, 100, dap_y), side = 3, at = c(1, 50, 100, dap_y), line = -1.1, cex = .6)
            arrows(max_R$Day_x + 4,  max_R$Day_y - 4,  max_R$Day_x,  max_R$Day_y, length = 0.05, angle = 15, lwd = .5, col = "grey59")
            
            # Make legend:
            box_ys <- seq(1, 50, by = 2); box_xs <- rep(dap_x - 15, 25); 
            rect(box_xs - .5 * 2, box_ys - 0.5 * 2, box_xs + 0.5 * 2, box_ys + 0.5 * 2, border = "NA", col = col_palette)
            text(dap_x - 10 - 5, 52, 'r', cex = .5);
            r_lab_top <- 1; r_lab_mid <- 0; r_lab_bottom <- -1; max_r_lab <- paste( 'r = ', sprintf( "%.3f", max_R$R), sep = '');
            # if (k >(ncol(env_paras)-2)) { # because we want the first SEVEN charts to have 1 at the top, and next 7 at the BOTTOM
            #   r_lab_top <- -1; r_lab_bottom <- 1; max_r_lab <- paste( 'r = ', sprintf( "%.3f", 0 - max_R$R), sep = '');
            #   }
            legend(max_R$Day_x - 4 , max_R$Day_y - 4 , c(paste( max_R$Day_x, ' to ', max_R$Day_y, ' DAP ', colnames(pop_cor_0)[k+3], sep = ''), max_r_lab),  cex = .6, bty = "n");
            text(dap_x - 10 + 3, 50, r_lab_top, cex = .5)
            text(dap_x - 10 + 3, 27, r_lab_mid, cex = .5);
            text(dap_x - 10 + 3, 1,  r_lab_bottom, cex = .5)
            
            # Make box-and-whisker plot of the phenotypes
            boxplot(FTdaps,   at = 145,  add = TRUE, xaxt = "n", yaxt = "n", xlab = '', ylab = '', width = 10, pch = 19, cex = .3, boxwex = 4, lwd = .4, col = "gold", border = "grey");
            boxplot(FTdaps,   at = 1, horizontal = T, add = TRUE, xaxt = "n", yaxt = "n", xlab = '', ylab = '', width = 10, pch = 19, cex = .3, boxwex = 4, lwd = .4, col = "gold", border = "grey");
            # text(mean(FTdaps), 5, 'Days to anthesis', cex = .5)
            # text(mean(FTdaps, na.rm = T), 10, paste('Trait: ', trait, sep = ''), cex = 2)
            # title(paste('Trait: ', trait, sep = ''), adj=0)
          }
          dev.off()
        }
        else if (type=="p.only") {
          # make a pdf to show all of the environmental parameters and their optimal windows:
          pdf(paste0(exp_pop_dir, "/MaxP_", trait, ".pdf"),
              width= (ncol(env_paras)-2),height= 1,
              pointsize=6)
          layout(matrix(c(1:((ncol(env_paras)-2))), 1, (ncol(env_paras)-2), byrow = T))
          
          for (k in c(1:((ncol(env_paras)-2)))) { # loop through the environmental parameters
            # pop_cor_0 <- subset(pop_cors, pop_cors$pop_code == p); # keep only the window correlations for those in the desired pop
            pop_pval_0 <- pop_pvals
            pop_pval <- pop_pval_0[,c(1:3, k + 3)]; # pull correlations for the environ parameter of interest
            colnames(pop_pval)[4] <- 'P';
            pop_pval <- pop_pval[order(pop_pval$P),]; # sort by correlation
            
            xs <- pop_pval$Day_x;  ys <-  pop_pval$Day_y; # save window start and end dates, in order of correlation
            mid_pval <- median(pop_pval$P); # find median correlation across windows
            max_P <- pop_pval[which.max(abs(pop_pval$P))[1], ]; # pull the maximum absolute value of correlation found
            
            # set colors: 
            cell_col <-(floor(pop_pval$P/(max_P$P/12)))+13
            # unique((floor(pop_pval$P/(max_P$P/12)))+13)
            # cell_col <- floor(pop_pval$P * 12) +13; ### the same color scale
            pop_pval$cell_col <- cell_col; 
            
            
            # set parameters--
            par(mar = c(0.5, 1.0, 1, 0.5) , # set margins on all sides
                mgp = c(0.05, 0.1, 0), # set margin lines for axis title, axis labels, axis line
                tck = -0.01, # length of tick marks as fraction of plotting region
                bty = "n"); # character string, suppressing box drawing
            plot(-50, -50, xlim = c(0, dap_x), ylim = c(0, dap_y), col = "white",  xlab = '', 
                 xaxt = "n", yaxt = "n", ylab = 'Days after planting', bty = "n", cex.lab = 1);
            arrows(-1, 10, -1, dap_y - 10, length = 0.05, angle = 15, lwd = .5,  col = "grey59");
            mtext(c(1, 50, 100, dap_y), side = 2, at = c(1,50, 100, dap_y), line = -1, cex = .6)
            
            rect(xs - 0.5, ys - 0.5, xs + 0.5, ys + 0.5, col = col_palette[pop_pval$cell_col], border = "NA")
            rect(max(pop_pval$Day_x) - 0.5, max(pop_pval$Day_y) - 0.5,
                 max(pop_pval$Day_x) + 0.5, max(pop_pval$Day_y) + 0.5, border = "NA", col = "white", lwd = 0.001)
            
            arrows(10, dap_y + 4, dap_x - 10, dap_y + 4, angle = 15, length = 0.05, lwd = .5, col = "grey59")
            mtext("Days after planting", side = 3, at = dap_x / 2, line = -0.1, cex = .6)
            mtext(c(1, 50, 100, dap_y), side = 3, at = c(1, 50, 100, dap_y), line = -1.1, cex = .6)
            arrows(max_P$Day_x + 4,  max_P$Day_y - 4,  max_P$Day_x,  max_P$Day_y, length = 0.05, angle = 15, lwd = .5, col = "grey59")
            
            # Make legend:
            box_ys <- seq(1, 50, by = 2); box_xs <- rep(dap_x - 15, 25); 
            rect(box_xs - .5 * 2, box_ys - 0.5 * 2, box_xs + 0.5 * 2, box_ys + 0.5 * 2, border = "NA", col = col_palette)
            text(dap_x - 10 - 5, 52, '-logP', cex = .5);
            p_lab_top <- max_P$P;  p_lab_bottom <- min(pop_pval$P); p_lab_mid <- (p_lab_top + p_lab_bottom)/2; max_p_lab <- paste( '-logP = ', sprintf( "%.3f", max_P$P), sep = '');
            # if (k >(ncol(env_paras)-2)) { # because we want the first SEVEN charts to have 1 at the top, and next 7 at the BOTTOM
            #   r_lab_top <- -1; r_lab_bottom <- 1; max_r_lab <- paste( 'r = ', sprintf( "%.3f", 0 - max_R$R), sep = '');
            #   }
            legend(max_P$Day_x - 4 , max_P$Day_y - 4 , c(paste( max_P$Day_x, ' to ', max_P$Day_y, ' DAP ',  
                                                                colnames(pop_pval_0)[k+3],sep = ''), max_p_lab),  cex = .6, bty = "n");
            text(dap_x - 10 + 3, 50, p_lab_top, cex = .5)
            text(dap_x - 10 + 3, 27, p_lab_mid, cex = .5);
            text(dap_x - 10 + 3, 1,  p_lab_bottom, cex = .5)
            
            # Make box-and-whisker plot of the phenotypes
            boxplot(FTdaps,   at = 145,  add = TRUE, xaxt = "n", yaxt = "n", xlab = '', ylab = '', width = 10, pch = 19, cex = .3, boxwex = 4, lwd = .4, col = "gold", border = "grey");
            boxplot(FTdaps,   at = 1, horizontal = T, add = TRUE, xaxt = "n", yaxt = "n", xlab = '', ylab = '', width = 10, pch = 19, cex = .3, boxwex = 4, lwd = .4, col = "gold", border = "grey");
            # text(mean(FTdaps), 5, 'Days to anthesis', cex = .5)
            # text(mean(FTdaps, na.rm = T), 10, paste('Trait: ', trait, sep = ''), cex = 2)
            # title(paste('Trait: ', trait, sep = ''), adj=0)
          }
          dev.off()
        }
        else if (type == "both") {
          pdf(paste0(exp_pop_dir, "/MaxRandP_", trait, ".pdf"),
              width= (ncol(env_paras)-2),height= 2,
              pointsize=6)
          layout(matrix(c(1:((ncol(env_paras)-2)*2)), 2, (ncol(env_paras)-2), byrow = T))
          
          for (k in c(1:((ncol(env_paras)-2)*2))) { # loop through the environmental parameters, plus all of them again but with color scale switched
            
            # First, pull r plot info:
            
            # pop_cor_0 <- subset(pop_cors, pop_cors$pop_code == p); # keep only the window correlations for those in the desired pop
            pop_cor_0 <- pop_cors
            pop_cor <- pop_cor_0[,c(1:3, k + 3)]; # pull correlations for the environ parameter of interest
            colnames(pop_cor)[4] <- 'R';
            pop_cor <- pop_cor[order(pop_cor$R),]; # sort by correlation
            
            xs <- pop_cor$Day_x;  ys <-  pop_cor$Day_y; # save window start and end dates, in order of correlation
            mid_R <- median(pop_cor$R); # find median correlation across windows
            
            # set colors: 
            cell_col <- floor(pop_cor$R * 12) + 13; ### the same color scale
            pop_cor$cell_col <- cell_col; 
            
            max_R <- pop_cor[which.max(abs(pop_cor$R))[1], ]; # pull the maximum absolute value of correlation found
            
            # Now, pull P plot info:
            
            # make new iterator for pval info
            if (k %in% c(1:((ncol(env_paras)-2)))) {
              l <- k 
            } else {
              l <- k-(ncol(env_paras)-2)
            }
            
            pop_pval_0 <- pop_pvals
            pop_pval <- pop_pval_0[,c(1:3, l + 3)]; # pull correlations for the environ parameter of interest
            colnames(pop_pval)[4] <- 'P';
            pop_pval <- pop_pval[order(pop_pval$P),]; # sort by correlation
            
            pxs <- pop_pval$Day_x;  pys <-  pop_pval$Day_y; # save window start and end dates, in order of correlation
            mid_pval <- median(pop_pval$P); # find median correlation across windows
            max_P <- pop_pval[which.max(abs(pop_pval$P))[1], ]; # pull the maximum absolute value of correlation found
            
            # set colors: 
            pcell_col <-(floor(pop_pval$P/(max_P$P/12)))+13
            # unique((floor(pop_pval$P/(max_P$P/12)))+13)
            # cell_col <- floor(pop_pval$P * 12) +13; ### the same color scale
            pop_pval$pcell_col <- pcell_col; 
            
            # Make the figure:
            
            # set parameters--
            par(mar = c(0.5, 1.0, 1, 0.5) , # set margins on all sides
                mgp = c(0.05, 0.1, 0), # set margin lines for axis title, axis labels, axis line
                tck = -0.01, # length of tick marks as fraction of plotting region
                bty = "n"); # character string, suppressing box drawing
            plot(-50, -50, xlim = c(0, dap_x), ylim = c(0, dap_y), col = "white",  xlab = '', 
                 xaxt = "n", yaxt = "n", ylab = 'Days after planting', bty = "n", cex.lab = 1);
            arrows(-1, 10, -1, dap_y - 10, length = 0.05, angle = 15, lwd = .5,  col = "grey59");
            mtext(c(1, 50, 100, dap_y), side = 2, at = c(1,50, 100, dap_y), line = -1, cex = .6)
            
            rect(xs - 0.5, ys - 0.5, xs + 0.5, ys + 0.5, col = col_palette[pop_cor$cell_col], border = "NA") # r values
            rect(pys-0.5, pxs-0.5, pys+0.5, pxs+0.5, col=col_palette[pop_pval$pcell_col], border="NA") # p values
            
            rect(max(pop_cor$Day_x) - 0.5, max(pop_cor$Day_y) - 0.5, max(pop_cor$Day_x) + 0.5, max(pop_cor$Day_y) + 0.5, border = "NA", col = "white", lwd = 0.001)
            
            arrows(10, dap_y + 4, dap_x - 10, dap_y + 4, angle = 15, length = 0.05, lwd = .5, col = "grey59")
            mtext("Days after planting", side = 3, at = dap_x / 2, line = -0.1, cex = .6)
            mtext(c(1, 50, 100, dap_y), side = 3, at = c(1, 50, 100, dap_y), line = -1.1, cex = .6)
            
            # make R arrow, then P arrow
            arrows(max_R$Day_x + 4,  max_R$Day_y - 4,  max_R$Day_x,  max_R$Day_y, length = 0.05, angle = 15, lwd = .5, col = "grey59")
            arrows(max_P$Day_y + 4,  max_P$Day_x - 4,  max_P$Day_y,  max_P$Day_x, length = 0.05, angle = 15, lwd = .5, col = "grey59")
            
            # Make legend (r):
            box_ys <- seq(1, 50, by = 2); box_xs <- rep(dap_x - 15, 25); 
            rect(box_xs - .5 * 2, box_ys - 0.5 * 2, box_xs + 0.5 * 2, box_ys + 0.5 * 2, border = "NA", col = col_palette)
            text(dap_x - 20, 55, 'r', cex = .5);
            r_lab_top <- 1; r_lab_mid <- 0; r_lab_bottom <- -1; max_r_lab <- paste( 'r = ', sprintf( "%.3f", max_R$R), sep = '');
            # if (k >(ncol(env_paras)-2)) { # because we want the first SEVEN charts to have 1 at the top, and next 7 at the BOTTOM
            #   r_lab_top <- -1; r_lab_bottom <- 1; max_r_lab <- paste( 'r = ', sprintf( "%.3f", 0 - max_R$R), sep = '');
            #   }
            legend(max_R$Day_x - 4 , max_R$Day_y - 4 , c(paste( max_R$Day_x, ' to ', max_R$Day_y, ' DAP ', colnames(pop_cor_0)[k+3], sep = ''), max_r_lab),  cex = .6, bty = "n");
            text(dap_x - 20, 50, r_lab_top, cex = .5)
            text(dap_x - 20, 27, r_lab_mid, cex = .5);
            text(dap_x - 20, 1,  r_lab_bottom, cex = .5)
            
            # Make legend (p)
            # box_pys <- seq(1, 50, by = 2); box_pxs <- rep(dap_x, 25); 
            # rect(box_pxs - .5 * 2, box_pys - 0.5 * 2, box_pxs + 0.5 * 2, box_pys + 0.5 * 2, border = "NA", col = col_palette)
            text(dap_x - 10, 55, '-logP', cex = .5);
            p_lab_top <- max_P$P;  p_lab_bottom <- min(pop_pval$P); p_lab_mid <- (p_lab_top + p_lab_bottom)/2; max_p_lab <- paste( '-logP = ', sprintf( "%.3f", max_P$P), sep = '');
            legend(max_P$Day_y - 4 , max_P$Day_x - 4 , c(paste( max_P$Day_x, ' to ', max_P$Day_y, ' DAP ',  
                                                                colnames(pop_pval_0)[l+3],sep = ''), max_p_lab),  cex = .6, bty = "n");
            text(dap_x - 7 , 50, p_lab_top, cex = .5)
            text(dap_x - 7 , 27, p_lab_mid, cex = .5);
            text(dap_x - 7, 1,  p_lab_bottom, cex = .5)
            # Make box-and-whisker plot of the phenotypes
            # boxplot(FTdaps,   at = 145,  add = TRUE, xaxt = "n", yaxt = "n", xlab = '', ylab = '', width = 10, pch = 19, cex = .3, boxwex = 4, lwd = .4, col = "gold", border = "grey");
            # boxplot(FTdaps,   at = 1, horizontal = T, add = TRUE, xaxt = "n", yaxt = "n", xlab = '', ylab = '', width = 10, pch = 19, cex = .3, boxwex = 4, lwd = .4, col = "gold", border = "grey");
            # text(mean(FTdaps), 5, 'Days to anthesis', cex = .5)
            # text(mean(FTdaps, na.rm = T), 10, paste('Trait: ', trait, sep = ''), cex = 2)
            # title(paste('Trait: ', trait, sep = ''), adj=0)
          }
          dev.off()
        }
      }
      
      # Plot environmental parameter means vs trait mean 
      Plot_Trait_mean_envParas <- function( env_mean_trait, env_paras, d1, d2, trait, exp_trait_dir, env_cols){
        days <- c(d1:d2); # make window
        env_facts_matrix <- matrix(nrow = nrow(env_mean_trait), ncol = ncol(env_paras)-1); # make matrix to hold info for each environment
        for (e_i in 1:nrow(env_mean_trait)) { # loop through the environments
          e <- env_mean_trait$env_code[e_i]; # pull current environment code
          env_para <- subset(env_paras, env_paras$env_code == e); # pull the environmental parameters for that environment
          # env_mean <- colMeans(env_para[days, c(8, 9, 10, 11, 12, 13, 14) - 3]); # find mean parameter values for across environments ### DL, GDD, PTT, PTR, PTS
          env_mean <- colMeans(env_para[days, c(3:ncol(env_para))])
          
          env_facts_matrix[e_i,] <- c(env_mean_trait$meanY[e_i], round(env_mean, 4) ); # put the mean Y and the environmental parameter means into a matrix
        }
        colnames(env_facts_matrix) <- c( 'meanY', colnames(env_paras)[-c(1:2)])
        # 'DL', 'GDD', 'PTT', 'PTR', 'PTD', 'PTD2', 'PTS'); # add column names
        envMeanPara_file <- paste(exp_trait_dir, trait, '_envMean', kPara_Name, d1, '_', d2, sep = ''); # make file name
        envMeanPara <- merge(env_mean_trait, env_facts_matrix); # combine mean Y with environmental param means into one table
        write.table(envMeanPara, file = envMeanPara_file, sep = "\t", row.names = F, quote = F); # and output the table
        
        # Make a pdf figure:
        pdf_file <- paste(exp_trait_dir, trait, 'Mean_', nrow(env_mean_trait), 'Env', kPara_Name, '.pdf', sep = ''); 
        pdf(pdf_file,
            width= ncol(env_paras)-2,height= 1,
            pointsize=6)
        
        layout(matrix(c(1:(ncol(env_paras)-2)), ncol = (ncol(env_paras)-2)));
        
        for (i in 1:(ncol(env_paras)-2)) { # loop through environmental parameters:
          par(mar = c(2.5, 2.0, 1, 0.5) , mgp = c(0.7, 0.01, 0), tck = -0.01, family = "mono");
          
          #plot environmental parameter (x) vs phenotype (Y), using mean of each for the environment
          plot(env_facts_matrix[, i + 1], env_facts_matrix[,1],
               xlab = colnames(env_paras)[i + 2], ylab = paste(trait, ' mean', sep = ''),  pch = 19, col = env_cols);
          abline(lm(env_facts_matrix[,1] ~ env_facts_matrix[, i + 1]), lty = 2); # add regression line
          
          # add correlation label
          r1 <- round(cor(env_facts_matrix[,1] , env_facts_matrix[, i + 1]), 3);
          legend("bottom", paste('r = ', r1, sep = ''), bty = "n")
          legend_p <- "topleft";  if (r1 < 0) { legend_p <- "topright"};
          
          # add legend with environmental codes
          if (i == 1) { legend(legend_p, env_mean_trait$env_code, pch = 19, col = env_cols, bty = "n" )};
        }
        dev.off()
      }
      
      # regress each line/genotype to the environmental mean and parameter
      Slope_Intercept <- function(maxR_dap1, maxR_dap2, env_mean_trait, PTT_PTR, exp_trait, line_codes, exp_pop_dir, PTT_PTR_ind, filter.less.than) {
        maxR_win <- c(maxR_dap1:maxR_dap2); # pull the window with the max R value
        prdM <- env_mean_trait;
        kPara <- c();
        for (e_i in 1:nrow(env_mean_trait)) { # loop through the environments
          envParas <- subset(PTT_PTR, PTT_PTR$env_code == env_mean_trait$env_code[e_i]); # pull the PTT etc info for this environment
          envPara <- mean(envParas[maxR_win, PTT_PTR_ind]); # find mean PTT or other info for the environment over the course of the growing season
          kPara[e_i] <- envPara; # save mean environmental parameter for this environment
        }
        prdM$kPara <- kPara;
        lm_ab_matrix <- matrix(ncol = 6,
                               nrow = length(line_codes));
        for (l in 1: length(line_codes)) { # for each line/taxa:
          l_trait <- subset(exp_trait, exp_trait$ril_code == line_codes[l]); # pull observed values for this line in all environments
          if(nrow(l_trait) >= filter.less.than) { # if this line was grown in at least 5 environments (number based on Xianran's code):
            l_trait <- merge(l_trait, prdM); # add optimal environmental parameter and meanY info
            lm_Mean <- lm(Yobs ~ meanY, data = l_trait); # linear model with Y=observedY and X = meanY
            lm_Para <- lm(Yobs ~ kPara, data = l_trait); # linear model with Y=observedY and X = environmental mean of chosen parameter
            
            # Get the predicted Y (what we're calling "intercept") from the models with meanY (lm_Mean) and environmental param (lm_Para) as the X values:
            a_Mean <- as.vector(round(predict(lm_Mean, # use `predict` to predict the expected Y=observedY given this X=meanY
                                              data.frame(meanY = mean(prdM$meanY))), # make a dataframe holding the overall mean phenotype (Y) value
                                      4)); 
            a_Para <- as.vector(round(predict(lm_Para, data.frame(kPara = mean(prdM$kPara))), 4)); ## adjusted by the population mean
            
            # Pull slopes (b)
            b_Mean <- as.vector(round(lm_Mean$coefficient[2], 4)); # pull slope from the lm_Mean model
            # b_Para <- as.vector(round(lm_Para$coefficient[2],4)); # pull slope from the lm_Para model
            # EDITED Nov 1 2022 -- for trait KPR, slopes are < 0.0001 so getting all shrunk to zero!!
            # So, DON'T ROUND
            b_Para <- as.vector(lm_Para$coefficient[2]); # pull slope from the lm_Para model
            
            a_Para_ori <- as.vector(round(lm_Para$coefficient[1],4)); # pull intercept from lm_Para model
            
            # store data:
            lm_ab_matrix[l,] <- c(line_codes[l], a_Mean, b_Mean, a_Para, a_Para_ori, b_Para);
          }
        }
        lm_ab_matrix <- lm_ab_matrix[!is.na(lm_ab_matrix[,2]),];
        colnames(lm_ab_matrix) <- c('line_codes',
                                    'Intcp_mean', # = a_Mean = predicted Y ("intercept") when using pheno mean as predictor
                                    'Slope_mean', # = b_Mean = slope when using pheno mean as predictor
                                    'Intcp_para_adj', # = a_Para = predicted Y ("intercept") when using env param as predictor
                                    'Intcp_para', # = a_Para_ori = mathematical intercept when using env param as predictor
                                    'Slope_para'); # = b_Para = slope when using env param as predictor
        out_file <- paste(exp_pop_dir, 'Intcp_Slope', sep = '');
        write.table(lm_ab_matrix, file = out_file, sep = "\t", quote = F, row.names = F)
        
      } 
      
      # leave-one-out cross-validation
      LOOCV <- function(maxR1, maxR2, env_mean_trait, PTT_PTR, PTT_PTR_ind, exp_trait, obs_prd_file, filter.less.than, line_codes, output=TRUE) {
        maxR_win <- c(maxR1:maxR2); # create optimal window
        prdM <- env_mean_trait; # assign mean phenotype by environment to new object
        maxR_envPara <- matrix(ncol = 2, nrow = nrow(env_mean_trait));
        kPara <- c();
        for (e_i in 1:nrow(env_mean_trait)) { # loop through the environments
          envParas <- subset(PTT_PTR, PTT_PTR$env_code == env_mean_trait$env_code[e_i]); # pull PTT etc info for relevant environ
          envPara <- mean(envParas[maxR_win, PTT_PTR_ind]); # find mean environmental parameter for optimal window + optimal environ param
          kPara[e_i] <- envPara; # save this -- the mean value of the chosen parameter in this environment
        }
        prdM$kPara <- kPara; 
        obs_prd_m <- matrix(0, ncol = 6 ,nrow = nrow(exp_trait));
        
        n <- 0; 
        
        for (l in line_codes) { # loop through the taxa/lines:
          l_trait <- subset(exp_trait, exp_trait$ril_code == l); # pull phenotype in all environs for given line
          ril_data <- merge(prdM, l_trait,  all.x = T); # add this pheno info to the table with environmental param and mean Y per environ
          if (length(which(!is.na(ril_data$Yobs))) >= filter.less.than) { # make sure at least 5 observations of this line
            for (e in 1:nrow(ril_data)) { # loop through environments
              obs_trait <- ril_data$Yobs[e]; # pull observed phenotype for current environment, for current line
              if (!is.na(obs_trait)) {
                trn <- ril_data[-e,]; # drop current environment
                l_mean <- mean(trn$Yobs, na.rm = T); # find mean of remaining observations
                
                # Predict the current environment using model based on other environments
                prd_trait_mean  <- round(predict( lm(Yobs ~ meanY, data = trn), ril_data[e,]), digit = 3); # model with Y=observedY and X=meanY
                prd_trait_kpara <- round(predict( lm(Yobs ~ kPara, data = trn), ril_data[e,]), digit = 3); # model with Y=observedY and X=environmental parameter
                n <- n + 1; # increment iterator
                obs_prd_m[n,] <- c(ril_data$env_code[e], 
                                   # ril_data$pop_code[e], 
                                   l, prd_trait_mean, prd_trait_kpara, obs_trait, l_mean);
                
              }
            }
          }
        }
        
        
        
        obs_prd_m <- obs_prd_m[1:n,]
        colnames(obs_prd_m) <- c('env_code', 
                                 # 'pop_code', 
                                 'ril_code', # aka line name
                                 'Prd_trait_mean', # predicted value for trait in this environment (using model based on other environments' data) using environmental mean as predictor
                                 'Prd_trait_kPara', # same as above, but with environmental parameter as predictor
                                 'Obs_trait', # true observed value
                                 'Line_mean'); # mean of all the other environments
        
        if(output) {write.table(obs_prd_m, file = obs_prd_file, sep = "\t", quote = F);
          return(prdM)
        } else {return(list(prdM=prdM, obs_prd=obs_prd_m))}
      }
      
      # Plot the observed vs predicted results using several prediction methods
      Plot_prediction_result <- function(obs_prd_file, all_env_code, prdM, kPara_Name, forecast_png_file, trait=trait,
                                         print.legend=T, plot.A=T, plot.B=T, plot.C=T, plot.D=T, path=T, save.output=T) {
        if (path==T) { # if you were provided a path, read it in
          Obs_Prd_m <- read.table(obs_prd_file, sep = "\t", header = T); # pull in LOO prediction results
        } else {Obs_Prd_m <- obs_prd_file} #otherwise, just use the file as the obs_prd_m
        
        Obs_Prd_m <- Obs_Prd_m[!is.na(Obs_Prd_m$Obs_trait),]; # drop missing observations
        prd_env <- as.vector(unique(Obs_Prd_m$env_code)); # pull environment names
        env_rs <- matrix(ncol = 3, nrow = length(prd_env));
        
        # # redudant--
        # for (e_i in 1:length(prd_env)) { # loop through environments
        #   env_obs_prd <- subset(Obs_Prd_m, Obs_Prd_m$env_code == prd_env[e_i]); # pull data for current environment
        #   if (nrow(env_obs_prd) > 0) { # make sure data is available
        #    env_rs[e_i,] <- c( sprintf( "%.2f", cor(env_obs_prd$Prd_trait_mean, env_obs_prd$Obs_trait, 
        #                                            use = "complete.obs")), # corr btw value predicted with pheno mean, and observed
        #                       sprintf( "%.2f", cor(env_obs_prd$Prd_trait_kPara, env_obs_prd$Obs_trait,
        #                                            use = "complete.obs")), # corr btw value predicted with PTT etc, and observed
        #                       sprintf( "%.2f", cor(env_obs_prd$Line_mean, env_obs_prd$Obs_trait, use = "complete.obs"))); # corr btw mean value for line in all other environs, and observed
        #   }
        #    
        # }
        
        # xy_lim <- range(Obs_Prd_m[,3:5],na.rm = T) # set limits for figure
        
        # Make figure:
        if(save.output==T) {
          png(forecast_png_file, width = 4/1.5, height= 4/1.5,pointsize=6, units = "in", res = 600)
        }
        if(sum(plot.A, plot.B, plot.C, plot.D)>1){
          layout(matrix(c(1:4), 2, 2, byrow = T)); # only need layout if plotting multiple panels
        }
        obs_prd_m <- Obs_Prd_m
        
        if(plot.A==T){
          # Plot A: observed vs predicted by environmental mean
          # obs_prd_m <- subset(Obs_Prd_m, Obs_Prd_m$pop_code == p); # keep only desired pop
          xy_limA <- range(obs_prd_m[,c(3,5)], na.rm=T)
          par(mar = c(2.5, 2.5, 1.0, 0.5) , mgp = c(1, 0.25, 0), tck = -0.005, family = "mono");
          # plot(obs_prd_m[,5], obs_prd_m[,3], # plot value predicted with pheno mean, vs observed
          plot(obs_prd_m[,3], obs_prd_m[,5], # plot value predicted with pheno mean, vs observed, with obs on y axis
               col = env_cols[match(obs_prd_m[,1], all_env_codes)], pch = 19, cex = .4, 
               xlab = paste('Predicted ', trait, ' by envMean', sep = ''), 
               ylab = paste('Observed ', trait, '', sep = ''), 
               xlim = xy_limA, ylim = xy_limA);
          abline(a = 0, b = 1, lty = 2, col = "gray59"); # add y=x line
          r1 <- sprintf("%.2f", cor(obs_prd_m[,3], obs_prd_m[,5], use = "complete.obs")); # add correlation to plot
          legend("top", legend= substitute(paste(italic('r'), " = ", R1), list(R1 = r1)), bty = "n")
          mtext('A', side = 3, at = xy_limA[1], line = 0.1, cex = .8); # label panel A
        }
        
        if(plot.B==T) {
          # Plot B: value predicted by environmental parameter (PTT etc) vs observed
          xy_limB <- range(obs_prd_m[,c(4,5)], na.rm=T)
          
          all_env_r <- obs_prd_m %>% # pull the correlations for each environment individually
            group_by(env_code) %>%
            mutate(env.cor=sprintf("%.2f", cor(Prd_trait_kPara, Obs_trait, use = "complete.obs"))) %>%
            dplyr::select(env_code, env.cor) %>%
            distinct%>%
            mutate(legend=paste0(env_code, " r=", env.cor))
          
          par(mar = c(2.5, 2.5, 1.0, 0.5) , mgp = c(1, 0.25, 0), tck = -0.005, family = "mono");
          plot(obs_prd_m$Prd_trait_kPara, obs_prd_m$Obs_trait, # plot value predicted with environmental parameter (PTT etc), vs observed
               col = env_cols[match(obs_prd_m$env_code, all_env_codes)], pch = 19, cex = .4, 
               xlab = paste('Predicted ', trait, ' by ', kPara_Name, sep = ''), 
               ylab = paste('Observed ', trait, '', sep = ''), 
               xlim = xy_limB, ylim = xy_limB);
          abline(a = 0, b = 1, lty = 2, col = "gray59");
          mtext('B', side = 3, at = xy_limB[1], line = 0.1, cex = .8);
          r2 <- sprintf("%.2f", cor(obs_prd_m[,4], obs_prd_m[,5], use = "complete.obs")); # add correlation
          legend("top", legend= substitute(paste("overall ", italic('r'), " = ", R1), list(R1 = r2)), bty = "n")
          # legend("top", legend= substitute(paste(italic('r'), " = ", R1), list(R1 = r2)), bty = "n")
          # add legend with environmental codes
          if (print.legend) {
            legend(x=min(xy_limB), y=max(xy_limB), all_env_r$legend, pch = 19, col = env_cols, bty = "n" )
          }
        }
        if(plot.C==T) {
          # Plot C: environmental parameter vs mean observation in each environment
          xy_limC <- range(c(prdM$meanY), na.rm=T)
          par(mar = c(2.5, 2.5, 1.0, 0.5) , mgp = c(1, 0.25, 0), tck = -0.005, family = "mono");
          plot(
            # prdM$PTR,
            prdM$kPara,
            prdM$meanY, # environmental parameter vs mean observation in each environment
            col = env_cols[match(prdM$env_code, all_env_codes)],  
            ylim = xy_limC, pch = 19, cex = .65, xlab = kPara_Name, ylab = 'Observed population mean');
          mtext(prdM$env_code, side = 1,
                at = prdM$kPara,
                # at=prdM$PTR,
                las = 2, line = -2, cex = .6 )
          abline(lm(prdM$meanY ~ prdM$kPara)) # add regression line
          # abline(lm(prdM$meanY ~ prdM$PTR)) # add regression line
          r2 <- sprintf("%.2f", cor(prdM$meanY, 
                                    prdM$kPara)); # calculate correlation
          # prdM$PTR)); # calculate correlation
          legend("top", legend= substitute(paste(italic('r'), " = ", R1), list(R1 = r2)), bty = "n")
          mtext('C', side = 3, at = min(prdM$kPara), line = 0.1, cex = .8);
        }
        
        if(plot.D==T) {
          # Plot D: mean value for line in all other environs, vs observed
          xy_limD <- range(obs_prd_m[,c(5,6)], na.rm=T)
          par(mar = c(2.5, 2.5, 1.0, 0.5) , mgp = c(1, 0.25, 0), tck = -0.005, family = "mono");
          plot(obs_prd_m[,6], obs_prd_m[,5],
               col = env_cols[match(obs_prd_m[,1], all_env_codes)], pch = 19, cex = .4, 
               ylab = paste('Observed ', trait, sep = ''), xlab = paste('Predicted ', trait, ' by BLUE', sep = ''), 
               xlim = xy_limD, ylim = xy_limD);
          abline(a = 0, b = 1, lty = 2, col = "gray59");
          r1 <- sprintf("%.2f", cor(obs_prd_m[,6], obs_prd_m[,5], use = "complete.obs"));
          legend("top", legend= substitute(paste(italic('r'), " = ", R1), list(R1 = r1)), bty = "n")
          mtext('D', side = 3, at = xy_limD[1], line = 0.1, cex = .8);
        }
        if(save.output==T) {dev.off()}
        
      }
      
      
      
      # function for UETG (Untested Environment, Tested Genotype) 1 to 2 prediction:
      UETG.1to2.function <- function(maxR_dap1, maxR_dap2, PTT_PTR_ind, env_mean_trait_trn, 
                                     env_mean_trait_test, PTT_PTR, exp_trait, test_env, filter.less.than) {
        maxR_win <- c(maxR_dap1:maxR_dap2); # set window
        prdM <- rbind(env_mean_trait_trn, env_mean_trait_test); 
        kPara <- c();
        
        # pull environmental parameter values for each environment
        for (e_i in 1:nrow(prdM)) {
          envParas <- subset(PTT_PTR, PTT_PTR$env_code == prdM$env_code[e_i]);
          envPara <- mean(envParas[maxR_win, PTT_PTR_ind]);
          kPara[e_i] <- envPara;
        }
        prdM$kPara <- kPara;
        
        obs_prd_m <- matrix(0, ncol = 6,nrow = nrow(exp_trait));
        n1 <- 1; n2 <- 0;
        for (l in line_codes) { # loop through lines
          l_trait <- subset(exp_trait, exp_trait$ril_code == l);# pull phenotype in all environs for given line
          ril_data <- merge(prdM, l_trait,  all.x = T); # add this pheno info to the table with environmental param and mean Y per environ
          trn <- ril_data[!(ril_data$env_code %in% test_env), ]; # pull environments for training (not your testing env)
          prd <- ril_data[(ril_data$env_code %in% test_env),]; # pull the  testing environment
          if (sum(!is.na(trn$Yobs)) >= filter.less.than-1 & sum(!is.na(prd$Yobs)) >= 1 ) { # make sure at least filter.less.than-1 training observations (filter.less.than-1), and one to predict
            obs_trait <- prd$Yobs; # pull observed value in testing set
            
            prd_trait_mean  <- round(predict( lm(Yobs ~ meanY, data = trn), prd), digit = 3); # predict the value in the testing set, using the testing set data. AND--use mean Y value as predictor
            prd_trait_kpara <- round(predict( lm(Yobs ~ kPara, data = trn), prd), digit = 3); # same as above, but use the environmental parameter as the predictor
            
            # save output
            obs_prd_t <- cbind(prd$env_code, rep(l, nrow(prd)), 
                               prd_trait_mean, prd_trait_kpara, prd$Yobs,
                               rep(mean(trn$Yobs, na.rm = T), nrow(prd)));
            n2 <- n1 + nrow(prd) - 1;
            obs_prd_m[n1:n2,] <- obs_prd_t;
            n1 <- n2 + 1;
          }
        }
        # output data: see LOOCV for more details
        obs_prd_m <- obs_prd_m[1:n2,]
        colnames(obs_prd_m) <- c('env_code', 'ril_code', 'Prd_trait_mean', 'Prd_trait_kPara', 'Obs_trait', 'Line_mean');
        # write.table(obs_prd_m, file = obs_prd_file, sep = "\t", quote = F, row.name = F);
        return(list(obs_prd_m, prdM));
        
      }
      
      # Plot result of 1 to 3 (TEUG = Tested Environment, Untested Genotype) prediction; 
      # can also apply to other prediction scenarios
      Plot_TEUG_result <- function(obs_prd_file, all_env_codes, kPara_Name, forecast_png_file, trait=trait, print.legend=T, path=T, save.output=T) {
        if (path==T) { # if you were provided a path, read it in
          Obs_Prd_m <- read.table(obs_prd_file, sep = "\t", header = T); # pull in LOO prediction results
        } else {Obs_Prd_m <- obs_prd_file} #otherwise, just use the file as the obs_prd_m
        
        # Obs_Prd_m <- Obs_Prd_m[!is.na(Obs_Prd_m$Obs_trait),]; # drop missing observations
        Obs_Prd_m <- Obs_Prd_m[!is.na(Obs_Prd_m[,3]),]; # drop missing observations
        prd_env <- as.vector(sort(unique(Obs_Prd_m$env_code))); # pull environment names
        env_rs <- matrix(ncol = 1, nrow = length(prd_env));
        
        for (e_i in 1:length(prd_env)) { # loop through environments
          # for (e_i in 1:2) { # loop through environments
          env_obs_prd <- subset(Obs_Prd_m, Obs_Prd_m$env_code == prd_env[e_i]); # pull data for current environment
          if (nrow(env_obs_prd) > 0) { # make sure data is available
            env_rs[e_i,] <- c( 
              # sprintf( "%.2f", cor(env_obs_prd[,3], env_obs_prd[,5], 
              #                                       use = "complete.obs")), # corr btw value predicted with pheno mean, and observed
              sprintf( "%.2f", cor(env_obs_prd[,3], env_obs_prd[,7],
                                   use = "complete.obs")))
            # sprintf( "%.2f", cor(env_obs_prd[,5], env_obs_prd[,4],
            #                  use = "complete.obs")))
            # , # corr btw value predicted with PTT etc, and observed
            # sprintf( "%.2f", cor(env_obs_prd[,6], env_obs_prd[,5], use = "complete.obs"))); # corr btw mean value for line in all other environs, and observed
          }
          
        }
        
        # xy_lim <- range(Obs_Prd_m[,3:5],na.rm = T) # set limits for figure
        obs_prd_m <- Obs_Prd_m
        
        # Make figure:
        if (save.output==T) {
          png(forecast_png_file, width = 4/1.5, height= 4/1.5,pointsize=6, units = "in", res = 600)
          # layout(matrix(c(1:4), 2, 2, byrow = T));
        }
        
        
        # Plot B: value predicted by environmental parameter (PTT etc) vs observed
        xy_limB <- range(obs_prd_m[,c(3,7)], na.rm=T)
        
        all_env_r <- obs_prd_m %>% # pull the correlations for each environment individually
          group_by(env_code) %>%
          mutate(env.cor=sprintf("%.2f", cor(y.hat, get(colnames(obs_prd_m)[3]), use = "complete.obs"))) %>%
          select(env_code, env.cor) %>%
          distinct%>%
          mutate(legend=paste0(env_code, " r=", env.cor)) %>%
          arrange(env_code)
        
        par(mar = c(2.5, 2.5, 1.0, 0.5) , mgp = c(1, 0.25, 0), tck = -0.005, family = "mono");
        plot(obs_prd_m[[7]], obs_prd_m[[3]], # plot value predicted with environmental parameter (PTT etc), vs observed
             # plot(obs_prd_m[,7], obs_prd_m[,3], # NOTE: this is original version for CERIS-JGRA, I changed to above for FR-gBLUP
             # col = env_cols[match(obs_prd_m[,1], all_env_codes)], pch = 19, cex = .4,  # also for CERIS-JGRA
             col = env_cols[match(obs_prd_m[[1]], all_env_codes)], pch = 19, cex = .4, # for FR-gBLUP
             xlab = paste('Predicted ', trait, ' by ', kPara_Name, sep = ''), 
             ylab = paste('Observed ', trait, '', sep = ''), 
             xlim = xy_limB, ylim = xy_limB);
        abline(a = 0, b = 1, lty = 2, col = "gray59");
        # mtext('B', side = 3, at = xy_limB[1], line = 0.1, cex = .8);
        r2 <- sprintf("%.2f", cor(obs_prd_m[,3], obs_prd_m[,7], use = "complete.obs")); # add correlation
        # r2 <- sprintf("%.2f", cor(obs_prd_m[[3]], obs_prd_m[[7]], use = "complete.obs")); # add correlation
        legend("top", legend= substitute(paste("overall ", italic('r'), " = ", R1), list(R1 = r2)), bty = "n")
        # legend("top", legend= substitute(paste(italic('r'), " = ", R1), list(R1 = r2)), bty = "n")
        # add legend with environmental codes
        if (print.legend) {
          legend(x=min(xy_limB), y=max(xy_limB), all_env_r$legend, pch = 19, col = env_cols, bty = "n" )
        }
        
        if (save.output==T) {
          dev.off()
        }
      }