############################## ArgPArse (Python) ############################## import sys, argparse, os class anceSTR(object): def __init__(self): parser = argparse.ArgumentParser( description='Analysis to determine inheritance patterens for repeat genotypes generated by ExpansionHunter and proccessed by STRparse', usage='''anceSTR [] The avaialble commands are: trio_analysis Analyse the inhertance patterns of STR genotypes within a set of trios. Input data format: Repeat_ID Trio_ID proband1 proband2 mother1 mother2 father1 father2 build_graphs Build a set of graphs to illustrate trio_analysis data. ''') parser.add_argument('command', help='Subcommand to run') args = parser.parse_args(sys.argv[1:2]) if not hasattr(self, args.command): print('Unrecognized command') parser.print_help() exit(1) getattr(self, args.command)() def trio_analysis(self): sys.path.insert(1, os.path.join(os.path.dirname(__file__), "anceSTR", "trio_analysis")) parser = argparse.ArgumentParser(description='Select VCF directory and STR genotyper.') parser.add_argument('-i', '--input', type=str, metavar="", required=True, help='Input table file. Tab seperation. Format: Repeat_ID Trio_ID proband1 proband2 mother1 mother2 father1 father2') parser.add_argument('-o', '--output', type=str, metavar="", required=True, help='Output Path') parser.add_argument('-b', '--build', type=int, metavar="", nargs="?", const=19, required=False, help='Specify reference genome') args = parser.parse_args(sys.argv[2:]) from trios_main import main main(args.input, args.output, args.build) def build_graphs(self): sys.path.insert(1, os.path.join(os.path.dirname(__file__), "anceSTR", "anceSTR")) parser = argparse.ArgumentParser(description='Merge CSV files containnig ExpansionHunter and GangSTR data') parser.add_argument('-i', '--input', type=str, metavar="", required=True, help='Path to the results of the trio analysis') parser.add_argument('-o', '--output', type=str, metavar="", required=True, help='Output directory path for graphs') args = parser.parse_args(sys.argv[2:]) from csvmerge_main import main main(args.input, args.output) if __name__ == '__main__': anceSTR() ############################## Functions (R) ############################## # Function to get packages Package_up = function(){ list.of.packages <- c("stringr", "reshape2", "dplyr", "matrixStats", "zeallot", "ggplot2", "plyr", "ggsignif", "tidyr", "gridExtra", "data.table") new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])] if (length(new.packages)) install.packages(new.package) lapply(list.of.packages, require, character.only = TRUE) } # Function to get mode getmode = function(v) { uniqv <- unique(v) uniqv[which.max(tabulate(match(v, uniqv)))] } # Function to determine largest/smallest expansion events (contraction or Expansion) Get_Diff = function(x, minmax){ if (minmax == "min"){ largest <- 10000 for (n in 1:length(x)){ if (x[n] == 0) {next} if (abs(x[n]) <= abs(largest)){largest <- x[n]} } } if (minmax == "max"){ largest <- 0 for (n in 1:length(x)){ if (abs(x[n]) >= abs(largest)){largest <- x[n]} } } return(largest) } # Function to calculate the most likely combination of repeat inheritence when two events occour at the same locus calculate = function(sr){ # Format input data sr <- as.data.frame(t(sr)) for (x in (1:length(sr[1,]))){ if (x > 2){sr[,x] <- as.numeric(as.character(sr[,x]))}} # Test the different combinations of parents and proband alleles trio <- sr[3:8] names(trio) <- c("p1", "p2", "m1", "m2", "f1", "f2") y <- data.frame(mother=c(trio$m1, trio$m1, trio$m2, trio$m2, trio$m1, trio$m1, trio$m2, trio$m2), father=c(trio$f1, trio$f2, trio$f1, trio$f2, trio$f1, trio$f2, trio$f1, trio$f2), proband1=c(trio$p1, trio$p1, trio$p1, trio$p1, trio$p2, trio$p2, trio$p2, trio$p2), proband2=c(trio$p2, trio$p2, trio$p2, trio$p2, trio$p1, trio$p1, trio$p1, trio$p1)) y$diff_m <- y$proband1 - y$mother y$diff_f <- y$proband2 - y$father # Select for the most conservative change in repeat length y$sum <- apply(abs(y[c("diff_m", "diff_f")]), 1, sum) result <- y[y$sum==min(y$sum),] # In the case that there is more than one outcome with the same expansion/contraction sizes, prioratize expansions. result$sum <- apply(result[c("diff_m", "diff_f")], 1, sum) result <- result[result$sum==max(result$sum),] # Remove duplicated outputs result <- unique(result) # If still more than 1 outcome parents share an allele of same size and which parent cannot be deterined. Else if only 1 out parent of orgin can be called. if (nrow(result) > 1){parent_m <- parent_f <- "Undetermined"} else { parent_m <- "Mother" parent_f <- "Father" } # Detemine if result is a expansion or contration event if (result$diff_m[1] > 0){event_type_m <- "Expansion"} else {event_type_m <- "Contraction"} if (result$diff_f[1] > 0){event_type_f <- "Expansion"} else {event_type_f <- "Contraction"} # Return results return(list(result$proband1[1], result$diff_m[1], event_type_m, parent_m, result$proband2[1], result$diff_f[1], event_type_f, parent_f)) } # Function to import data Import_Data = function(OS, build){ if (build %in% c(19, 37)){if (OS == "Windows"){ ngc <- read.csv(file="D:/Work_Storage/Cambridge_Data/NGC_37/CGG_Repeats_NGC_37_filtered.csv", header=TRUE, sep="\t") loci <- read.csv("D:/Work_Storage/Cambridge_Data/NGC_37/CGG_Repeats_NGC_37_filtered_Locus_summary.csv", header = TRUE, sep="\t") } else if (OS == "Linux") { ngc <- read.csv("/home/dannear/clusterhome/Cambridge_Data/raw_data/NGC_37/CGG_Repeats_NGC_37_filtered.csv", header=TRUE, sep="\t") loci <- read.csv("/home/dannear/clusterhome/Cambridge_Data/raw_data/NGC_37/NGC_37_filtered_Locus_summary.csv", header = TRUE, sep="\t") }} else if (build %in% c(38)){if (OS == "Windows"){ ngc <- read.csv(file="D:/Work_Storage/Cambridge_Data/NGC_38/CGG_Repeats_NGC_38_filtered.csv", header=TRUE, sep="\t") loci <- read.csv("D:/Work_Storage/Cambridge_Data/NGC_38/NGC_38_Locus_summary.csv", header = TRUE, sep="\t") } else if (OS == "Linux") { ngc <- read.csv("/home/dannear/clusterhome/Cambridge_Data/Summaries/NGC_38/CGG_Repeats_NGC_38_filtered.csv", header=TRUE, sep="\t") loci <- read.csv("/home/dannear/clusterhome/Cambridge_Data/Summaries/NGC_38/NGC_38_Locus_summary.csv", header = TRUE, sep="\t") }} return(list(ngc, loci)) } # Function to organise data into trios Get_Trios = function(raw_df){ # IDENTIFY PATIENT TYPE (01 = Proband, 02 = Mother, 03 = Father, 04+ = Sibling) if ("Member" %in% names(raw_df)) raw_df <- raw_df[c("Call_ID", "Member", "Sample_ID", "Allele1_Units", "Allele2_Units")] else { raw_df$Member <- as.numeric(str_split_fixed(raw_df$Sample_ID, "_0", 2)[,2]) raw_df$Sample_ID <- str_split_fixed(raw_df$Sample_ID, "_0", 2)[,1] raw_df <- raw_df[c("Call_ID", "Member", "Sample_ID", "Allele1_Units", "Allele2_Units")] } # REMOVE INCOMPLETE CASES auto_df <- raw_df[str_split_fixed(raw_df$Call_ID, "\\.", 2)[,1] != "X",] auto_df[auto_df == 0] <- NA sex_df <- raw_df[str_split_fixed(raw_df$Call_ID, "\\.", 2)[,1] == "X",] sex_df$Allele1_Units <- apply(sex_df, 1, function(x){ if (as.numeric(x[4]) == 0) NA else x[4]}) raw_df <- rbind(auto_df, sex_df) raw_df <- raw_df[complete.cases(raw_df), ] # GROUP DATA BY FAMILIES fam <- melt(data = raw_df, id.vars = c("Call_ID", "Sample_ID", "Member")) fam$variable <- str_c(fam$variable, fam$Member, sep = "_") if (4 %in% unique(fam$member)){type <- 4} else {type <- 3} fam <- select(fam, -Member) fam <- dcast(fam, Call_ID + Sample_ID ~ variable, value.var = "value", drop=TRUE) if (type == 4) fam <- fam[,c("Call_ID", "Sample_ID", "Allele1_Units_1", "Allele2_Units_1", "Allele1_Units_2", "Allele2_Units_2", "Allele1_Units_3", "Allele2_Units_3", "Allele1_Units_4", "Allele2_Units_4")] if (type == 3) fam <- fam[,c("Call_ID", "Sample_ID", "Allele1_Units_1", "Allele2_Units_1", "Allele1_Units_2", "Allele2_Units_2", "Allele1_Units_3", "Allele2_Units_3")] fam <- fam[complete.cases(fam), ] # FILTER OUT SINGLES fam_filtered <- fam[!(rowSums(is.na(fam[,3:(type*2)+2])) >= type*2),] # FILTER OUT READS MISSING PROBANDS fam_filtered <- fam_filtered[!(rowSums(is.na(fam_filtered[,3:4])) == 2),] # SELECT ALL TRIOS (Remove Siblings) and DUOs (Proband and parent) duos_trios <- fam_filtered[,c(1:8)] trios <- duos_trios[rowSums(is.na(duos_trios[,c(3:8)])) == 0,] return(list(trios, fam)) } # Function to collect data on expansion events and split data into stable, poly, and event loci # Set sizes (Trios, Quartets, Quintets) Collect_Expansion_Events = function(df, set_size){ if (set_size == "trios"){ proband <- df[c("Call_ID", "Sample_ID", "Allele1_Units_1", "Allele2_Units_1", "Allele1_Units_2", "Allele2_Units_2", "Allele1_Units_3", "Allele2_Units_3")] ps1s2 <- proband} else if (set_size == "quartets"){ proband <- df[c("Call_ID", "Sample_ID", "Allele1_Units_1", "Allele2_Units_1", "Allele1_Units_2", "Allele2_Units_2", "Allele1_Units_3", "Allele2_Units_3")] sibling1 <- df[c("Call_ID", "Sample_ID", "Allele1_Units_4", "Allele2_Units_4", "Allele1_Units_2", "Allele2_Units_2", "Allele1_Units_3", "Allele2_Units_3")] sibling1$Sample_ID <- paste0(sibling1$Sample_ID, "_2") names(sibling1) <- names(proband) ps1s2 <- rbind(proband, sibling1) } else if (set_size == "quintets"){ proband <- df[c("Call_ID", "Sample_ID", "Allele1_Units_1", "Allele2_Units_1", "Allele1_Units_2", "Allele2_Units_2", "Allele1_Units_3", "Allele2_Units_3")] sibling1 <- df[c("Call_ID", "Sample_ID", "Allele1_Units_4", "Allele2_Units_4", "Allele1_Units_2", "Allele2_Units_2", "Allele1_Units_3", "Allele2_Units_3")] sibling2 <- df[c("Call_ID", "Sample_ID", "Allele1_Units_5", "Allele2_Units_5", "Allele1_Units_2", "Allele2_Units_2", "Allele1_Units_3", "Allele2_Units_3")] sibling1$Sample_ID <- paste0(sibling1$Sample_ID, "_2") sibling2$Sample_ID <- paste0(sibling1$Sample_ID, "_3") names(sibling1) <- names(proband) names(sibling2) <- names(proband) ps1s2 <- rbind(proband, sibling1, sibling2) } ps1s2 <- ps1s2[complete.cases(ps1s2),] ps1s2$check_1_mother <- ifelse(ps1s2$Allele1_Units_1 == ps1s2$Allele1_Units_2, 1, 0) ps1s2$check_2_mother <- ifelse(ps1s2$Allele1_Units_1 == ps1s2$Allele2_Units_2, 1, 0) ps1s2$check_3_mother <- ifelse(ps1s2$Allele2_Units_1 == ps1s2$Allele1_Units_2, 1, 0) ps1s2$check_4_mother <- ifelse(ps1s2$Allele2_Units_1 == ps1s2$Allele2_Units_2, 1, 0) ps1s2$check_1_father <- ifelse(ps1s2$Allele1_Units_1 == ps1s2$Allele1_Units_3, 1, 0) ps1s2$check_2_father <- ifelse(ps1s2$Allele1_Units_1 == ps1s2$Allele2_Units_3, 1, 0) ps1s2$check_3_father <- ifelse(ps1s2$Allele2_Units_1 == ps1s2$Allele1_Units_3, 1, 0) ps1s2$check_4_father <- ifelse(ps1s2$Allele2_Units_1 == ps1s2$Allele2_Units_3, 1, 0) ps1s2$check_sum <- rowSums(ps1s2[,9:16]) ps1s2$check_sum_1 <- rowSums(ps1s2[,c(9, 10, 13, 14)]) ps1s2$check_sum_2 <- rowSums(ps1s2[,c(11, 12, 15, 16)]) ps1s2$check_sum_mother <- rowSums(ps1s2[,c(9, 10, 11, 12)]) ps1s2$check_sum_father <- rowSums(ps1s2[,c(13, 14, 15, 16)]) stable_reps <- ps1s2[ps1s2$check_sum == 8, c(1:8, 17)] poly_reps <- ps1s2[ps1s2$check_sum %in% c(0, 1, 2, 3, 4, 6), c(1:8, 17)] event_reps <- ps1s2[(ps1s2$check_sum %in% c(0, 1)) | (ps1s2$check_sum == 3 & ps1s2$check_sum_1 %in% c(0,3) & ps1s2$check_sum_2 %in% c(0,3)) | (ps1s2$check_sum == 4 & ps1s2$check_sum_1 %in% c(0,4) & ps1s2$check_sum_2 %in% c(0,4)) | (ps1s2$check_sum == 4 & ps1s2$check_sum_mother %in% c(0,4) & ps1s2$check_sum_father %in% c(0,4)) | (ps1s2$check_sum == 2 & ps1s2$check_sum_1 == 1 & ps1s2$check_sum_2 == 1 & ps1s2$Allele1_Units_1 == ps1s2$Allele2_Units_1) | (ps1s2$check_sum == 2 & ps1s2$check_sum_1 %in% c(0,2) & ps1s2$check_sum_2 %in% c(0,2)), c(1:8, 17)] poly_reps <- anti_join(poly_reps, event_reps) #meta_data <- data.frame("No_Trios"=length(unique(ps1s2$Sample_ID)), "Total_Reads"=length(ps1s2$Call_ID), "Stable_Reads"=length(stable_reps$Call_ID), "Poly_Reads"=length(poly_reps$Call_ID), "Event_Reads"=length(event_reps$Call_ID)) meta_data <- data.frame("Subset"=c("No_Trios", "Total_Reads", "Stable_Reads", "Poly_Reads", "Event_Reads"), "Count"=c(length(unique(ps1s2$Sample_ID)), length(ps1s2$Call_ID), length(stable_reps$Call_ID), length(poly_reps$Call_ID), length(event_reps$Call_ID))) return(list(event_reps, poly_reps, stable_reps, meta_data)) } testCollect_Expansion_Events = function(df, set_size){ if (set_size == "trios"){ proband <- df[c("Call_ID", "Sample_ID", "Allele1_Units_1", "Allele2_Units_1", "Allele1_Units_2", "Allele2_Units_2", "Allele1_Units_3", "Allele2_Units_3")] ps1s2 <- proband} else if (set_size == "quartets"){ proband <- df[c("Call_ID", "Sample_ID", "Allele1_Units_1", "Allele2_Units_1", "Allele1_Units_2", "Allele2_Units_2", "Allele1_Units_3", "Allele2_Units_3")] sibling1 <- df[c("Call_ID", "Sample_ID", "Allele1_Units_4", "Allele2_Units_4", "Allele1_Units_2", "Allele2_Units_2", "Allele1_Units_3", "Allele2_Units_3")] sibling1$Sample_ID <- paste0(sibling1$Sample_ID, "_2") names(sibling1) <- names(proband) ps1s2 <- rbind(proband, sibling1) } else if (set_size == "quintets"){ proband <- df[c("Call_ID", "Sample_ID", "Allele1_Units_1", "Allele2_Units_1", "Allele1_Units_2", "Allele2_Units_2", "Allele1_Units_3", "Allele2_Units_3")] sibling1 <- df[c("Call_ID", "Sample_ID", "Allele1_Units_4", "Allele2_Units_4", "Allele1_Units_2", "Allele2_Units_2", "Allele1_Units_3", "Allele2_Units_3")] sibling2 <- df[c("Call_ID", "Sample_ID", "Allele1_Units_5", "Allele2_Units_5", "Allele1_Units_2", "Allele2_Units_2", "Allele1_Units_3", "Allele2_Units_3")] sibling1$Sample_ID <- paste0(sibling1$Sample_ID, "_2") sibling2$Sample_ID <- paste0(sibling1$Sample_ID, "_3") names(sibling1) <- names(proband) names(sibling2) <- names(proband) ps1s2 <- rbind(proband, sibling1, sibling2) } ps1s2 <- ps1s2[complete.cases(ps1s2),] ps1s2$check_1_mother <- ifelse(ps1s2$Allele1_Units_1 == ps1s2$Allele1_Units_2, 1, 0) ps1s2$check_2_mother <- ifelse(ps1s2$Allele1_Units_1 == ps1s2$Allele2_Units_2, 1, 0) ps1s2$check_3_mother <- ifelse(ps1s2$Allele2_Units_1 == ps1s2$Allele1_Units_2, 1, 0) ps1s2$check_4_mother <- ifelse(ps1s2$Allele2_Units_1 == ps1s2$Allele2_Units_2, 1, 0) ps1s2$check_1_father <- ifelse(ps1s2$Allele1_Units_1 == ps1s2$Allele1_Units_3, 1, 0) ps1s2$check_2_father <- ifelse(ps1s2$Allele1_Units_1 == ps1s2$Allele2_Units_3, 1, 0) ps1s2$check_3_father <- ifelse(ps1s2$Allele2_Units_1 == ps1s2$Allele1_Units_3, 1, 0) ps1s2$check_4_father <- ifelse(ps1s2$Allele2_Units_1 == ps1s2$Allele2_Units_3, 1, 0) ps1s2$check_X <- ifelse(as.character(str_split_fixed(ps1s2$Call_ID, "\\.", 2)[,1]) == "X", "X", "A") ps1s2$check_h <- ifelse(ps1s2$Allele1_Units_1 != ps1s2$Allele2_Units_1, 1, 0) ps1s2$check_sum <- rowSums(ps1s2[,9:16]) ps1s2$check_sum_1 <- rowSums(ps1s2[,c(9, 10, 13, 14)]) ps1s2$check_sum_2 <- rowSums(ps1s2[,c(11, 12, 15, 16)]) ps1s2$check_sum_mother <- rowSums(ps1s2[,c(9, 10, 11, 12)]) ps1s2$check_sum_father <- rowSums(ps1s2[,c(13, 14, 15, 16)]) ps1s2$check_all <- paste0(ps1s2$check_X, ps1s2$check_h, ps1s2$check_sum_1, ps1s2$check_sum_2, ps1s2$check_sum_mother, ps1s2$check_sum_father) stable_reps <- ps1s2[ps1s2$check_all %in% c("A04444", "X13122", "X03342"),] poly_reps <- ps1s2[ps1s2$check_all %in% c("A03342", "A03324", "A12112", "A11212", "A12121", "A11221", "A11111", "A13122", "A11322", "A12222", "A02222", "X12121", "X12112", "X11111", "X02222", "X11221"),] event_reps <- ps1s2[ps1s2$check_all %in% c("A10422", "A14022", "A13021", "A13012", "A10321", "A10312", "A12011", "A10101", "A12002", "A11010", "A11001", "A12020", "A10202", "A10220", "A10211", "A11120", "A10110", "A11102", "A10000", "A01120", "A01102", "A02240", "A02204", "A00000", "X13021", "X10101", "X10321", "X12011", "X01120", "X11010", "X10000", "X11001", "X10110", "X10211", "X00000", "X12020", "X02240", "X11102", "X01102", "X10220"),] meta_data <- data.frame("Subset"=c("No_Trios", "Total_Reads", "Stable_Reads", "Poly_Reads", "Event_Reads"), "Count"=c(length(unique(ps1s2$Sample_ID)), length(ps1s2$Call_ID), length(stable_reps$Call_ID), length(poly_reps$Call_ID), length(event_reps$Call_ID))) return(list(event_reps, poly_reps, stable_reps, meta_data)) } # Function to analyse single expansion/contraction event loci single_events = function(singles){ # Deterime which is the event allele singles$Event_Allele <- apply(singles, 1, function(x){ getmode(as.numeric(na.omit(x[25:28])))}) singles <- singles[c(1:8, 29)] # Compare event allele to parental alleles singles$Mother1 <- as.numeric(singles$Event_Allele) - as.numeric(singles$Allele1_Units_2) singles$Mother2 <- as.numeric(singles$Event_Allele) - as.numeric(singles$Allele2_Units_2) singles$Father1 <- as.numeric(singles$Event_Allele) - as.numeric(singles$Allele1_Units_3) singles$Father2 <- as.numeric(singles$Event_Allele) - as.numeric(singles$Allele2_Units_3) # Determine the largest / smallest Expansion / Contraction Event singles$Event_Size_Max <- apply(singles[c("Mother1", "Mother2", "Father1", "Father2")], 1, function(x) Get_Diff(x, "max")) singles$Event_Size <- apply(singles[c("Mother1", "Mother2", "Father1", "Father2")], 1, function(x) Get_Diff(x, "min")) # Classify Expansion or Contration singles$Event_Type <- sapply(singles$Event_Size, function(x) if (x < 0) {return("Contraction")} else if (x > 0) {return("Expansion")}) singles$Parent <- ifelse((singles$Event_Size == singles$Mother1 | singles$Event_Size == singles$Mother2) & (singles$Event_Size == singles$Father1 | singles$Event_Size == singles$Father2), "Undetermined", ifelse((singles$Event_Size == singles$Mother1 | singles$Event_Size == singles$Mother2), "Mother", "Father")) singles <- singles[c("Call_ID", "Sample_ID", "Allele1_Units_1", "Allele2_Units_1", "Allele1_Units_2", "Allele2_Units_2", "Allele1_Units_3", "Allele2_Units_3", "Event_Allele", "Event_Size", "Event_Type", "Parent")] # Check for X chromosome for when proband is male x_singles <- filter(singles, Allele2_Units_1 == 0) singles <- filter(singles, Allele2_Units_1 != 0) x_singles$Parent <- "Mother" x_singles$Event_Size <- apply(x_singles[c("Allele1_Units_1", "Allele1_Units_2", "Allele2_Units_2")], 1, function(x) Get_Diff(c(as.numeric(x[1])-as.numeric(x[2]), as.numeric(x[1])-as.numeric(x[3])), "min")) singles <- rbind(singles, x_singles) return(singles) } # Function to analyse double expansion/contraction event loci double_events = function(doubles){ doubles$result <- apply(doubles, 1, calculate) doubles <- suppressMessages(doubles %>% unnest_wider(result, names_sep=as.character(c(1:8)), names_repair = "minimal")) doubles_m <- doubles[c(1:8, 14:17)] doubles_f <- doubles[c(1:8, 18:21)] names(doubles_m) <- names(doubles_f) <- c("Call_ID", "Sample_ID", "Allele1_Units_1", "Allele2_Units_1", "Allele1_Units_2", "Allele2_Units_2", "Allele1_Units_3", "Allele2_Units_3", "Event_Allele", "Event_Size", "Event_Type", "Parent") doubles_final <- rbind(doubles_m, doubles_f) doubles_final <- doubles_final[doubles_final$Event_Size != 0,] return(doubles_final) } # Function to deterime bias for smaller repeat sizes heterozygous_parents = function(hetero, chr){ # Define function to dfetermine if larger repeat allele is refelected in proband check_fun = function(x){ if (x[3] != x[4] & x[5] == x[6]){ if (max(x[3:4]) %in% x[1:2]) {"1_Mo"} else {"0_Mo"} } else if (x[3] == x[4] & x[5] != x[6]){ if (max(x[5:6]) %in% x[1:2]) {"1_Fa"} else {"0_Fa"} } else if (x[3] != x[4] & x[5] != x[6]){ if (max(x[3:4]) %in% x[1:2] & min(x[5:6]) %in% x[1:2]) {"1_Mo-0_Fa"} else if (min(x[3:4]) %in% x[1:2] & max(x[5:6]) %in% x[1:2]) {"0_Mo-1_Fa"} else if (max(x[3:4]) %in% x[1:2] & max(x[5:6]) %in% x[1:2]) {"2_dbl"} else if (min(x[3:4]) %in% x[1:2] & min(x[5:6]) %in% x[1:2]) {"0_dbl"} else {NA} } } # Format data if (chr == "X") {hetero <- hetero[!( gsub("\\..*", "", hetero$Call_ID) == "X"),]} hetero <- hetero[hetero$Allele1_Units_2 != hetero$Allele2_Units_2 | hetero$Allele1_Units_3 != hetero$Allele2_Units_3, 1:8] col_names <- c("Call_ID", "Sample_ID", "proband1", "proband2", "parent1", "parent2") names(hetero) <- c("Call_ID", "Sample_ID", "proband1", "proband2", "mother1", "mother2", "father1", "father2") # Apply check function to data hetero$output<- apply(hetero[3:8], 1, function(x) check_fun(x)) # Split data into destinct catagories, single events, double events where mother is recorded, and double events where father is recorded single <- hetero[complete.cases(hetero) & !(hetero$output %in% c("0_dbl", "2_dbl", "1_Mo-0_Fa", "0_Mo-1_Fa")),] double_Mo <- hetero[complete.cases(hetero) & hetero$output %in% c("0_dbl", "2_dbl", "1_Mo-0_Fa", "0_Mo-1_Fa"),] double_Fa <- hetero[complete.cases(hetero) & hetero$output %in% c("0_dbl", "2_dbl", "1_Mo-0_Fa", "0_Mo-1_Fa"),] double_Mo$output <- apply( double_Mo[9], 1, function(x) if (x %in% c("0_dbl", "0_Mo-1_Fa")) {"0_Mo"} else if (x %in% c("2_dbl", "1_Mo-0_Fa")){"1_Mo"} ) double_Fa$output <- apply( double_Fa[9], 1, function(x) if (x %in% c("0_dbl", "1_Mo-0_Fa")) {"0_Fa"} else if (x %in% c("2_dbl", "0_Mo-1_Fa")){"1_Fa"} ) # Bind organised events back together and spilt check and p_ID intop seperate columns hetero <- rbind(single, double_Mo, double_Fa) hetero <- hetero %>% separate(output, c("check", "p_ID"), "_") # Deterime size of maximun repeat size present in each call hetero$max_mo <- apply(hetero[5:10], 1, function(x) if (x[6]=="Mo"){max(as.numeric(x[1:2]))} else {NA} ) hetero$max_fa <- apply(hetero[5:10], 1, function(x) if (x[6]=="Fa"){max(as.numeric(x[3:4]))} else {NA} ) # Melt data mhetero <- melt(hetero[c("p_ID", "check", "max_mo", "max_fa")], id=c("p_ID", "check")) mhetero <- mhetero[complete.cases(mhetero),] # Get the total number of times the larger allele was or was not transfered to child for both mother and father summ1 <- ddply(mhetero, .(p_ID, check), summarise, MAX = max(value), COUNT = length(value)) # Get the total number of times the larger allele was or was not transfered to child for both mother and father by repeat size summ2 <- ddply(mhetero, .(p_ID, check, value), summarise, COUNT = length(value)) # Get the total number of times the larger allele was or was not transfered to child ny repeat size summ3 <- ddply(mhetero, .(value, check), summarise, COUNT = length(value)) return(list(summ1, summ2, summ3, mhetero)) } # Function to save raw data generated by trio analysis trios_save_data = function(dir_path, save_name, All_Events_df, Poly_df, Stable_df, mdata){ write.table(All_Events_df, paste0(dir_path, "/ancSTR_Trio_analysis_MutationReads_", save_name, ".tab"), sep="\t", row.names = F, col.names = T, quote=F) write.table(Poly_df, paste0(dir_path, "/ancSTR_Trio_analysis_InformativeReads_", save_name, ".tab"), sep="\t", row.names = F, col.names = T, quote=F) write.table(Stable_df, paste0(dir_path, "/ancSTR_Trio_analysis_MonogenicReads_", save_name, ".tab"), sep="\t", row.names = F, col.names = T, quote=F) write.table(mdata, paste0(dir_path, "/ancSTR_Trio_analysis_MetaData_", save_name, ".tab"), sep="\t", row.names = F, col.names = T, quote=F) } # Function to format MSSNG input data (Proband = 1, Mother = 2, Father = 3, Sibling = 4) assign_structure = function(reads_path, family_path, type){ reads_df <- rename(read.csv(reads_path, sep="\t"), c("Sample_ID"="Index.ID")) family_manifest <- rename(read.csv(family_path, sep="\t"), c("Family.ID"="Sample_ID")) family_manifest <- family_manifest[family_manifest$Size==type,] samples <- unique(reads_df[c("Index.ID")]) merge_df <- merge(x=samples, y=family_manifest, by="Index.ID") merge_df$Member <- apply(merge_df, 1, function(x){ check <- toupper(as.character(x[3])) if (check %in% c("PROBAND", "AFFECTED_PROBAND", "PROBAND_AFFECTED")) 1 else if (check %in% c("MOTHER", "M", "MO")) 2 else if (check %in% c("FATHER", "F", "FA")) 3 else if (check %in% c("SIBLING", "UNAFFECTED_SIBLING", "AFFECTED_SIBLING", "SIBLING_UNAFFECTED", "SIBLING_AFFECTED")) 4}) if (type == 4){ check <- aggregate(list("sum"=merge_df$Member), by=list("Sample_ID"=merge_df$Sample_ID), sum) check <- merge(check[check$sum == 10,], merge_df, by="Sample_ID") } else if (type == 3){ check <- aggregate(list("sum"=merge_df$Member), by=list("Sample_ID"=merge_df$Sample_ID), sum) check <- merge(check[check$sum == 6,], merge_df, by="Sample_ID") } reads_df <- reads_df[reads_df$Index.ID %in% check$Index.ID,] reads_df <- merge(reads_df, merge_df[c("Index.ID", "Sample_ID", "Member")], by="Index.ID") return(unique(reads_df[c("Sample_ID", "Member", "Call_ID","Chr","Start","End","GT","Ref_Units","Allele1_Units","Allele2_Units")])) } # Function to analyse informative reads informative_analysis <- function(data){ #filter out same mother == father genotypes data$s_check <- apply(data, 1, function(x) if (all(unique(x[c(5, 6)]) == unique(x[c(7, 8)]))){1} else {0} ) data <- data[data$s_check != 1, c(1:8)] data_con <- data[data$s_check == 1, c(1:8)] #check which parents are heterozygous data$Chr <- str_split_fixed(data$Call_ID, "\\.", 2)[,1] # <- merge(data, loci[c("Call_ID", "Chr")], by="Call_ID") data$m_check <- apply(data[5:6], 1, function(x){ if (x[1] == x[2]){1} else 0}) data$f_check <- apply(data[7:8], 1, function(x){ if (x[1] == x[2]){1} else 0}) #sort event types data_b <- data[data$m_check == 0 & data$f_check == 0,] data_m <- data[data$m_check == 0 & data$f_check == 1,] data_m <- rbind(data_m, data_b) data_m$lists <- apply(data_m[c(3:8)], 1 , function(x){ if (setequal(x[1:2], x[5:6])) {rem <- x[1:2]} else if (x[1] != x[2]){rem <- setdiff(x[1:2], x[5:6])} else {rem <- x[1:2]} if (min(x[3:4]) %in% rem){list(parent = "Mother", Allele_Size = min(x[3:4]), Transfer_Type = "S", Allele_Difference = abs(as.numeric(x[3])-as.numeric(x[4])))} else {list(parent = "Mother", Allele_Size = max(x[3:4]), Transfer_Type ="L", Allele_Difference = abs(as.numeric(x[3])-as.numeric(x[4])))} }) data_m <- cbind(data_m[-c(9:12)], do.call(rbind.data.frame, data_m$lists)) data_f <- data[data$m_check == 1 & data$f_check == 0,] data_f <- rbind(data_f, data_b) data_f <- data_f[data_f$Chr != "X",] data_f$lists <- apply(data_f[c(3:8)], 1 , function(x){ if (setequal(x[1:2], x[3:4])) {rem <- x[1:2]} else if (x[1] != x[2]){rem <- setdiff(x[1:2], x[3:4])} else {rem <- x[1:2]} if (min(x[5:6]) %in% rem){list(parent = "Father", Allele_Size = min(x[5:6]), Transfer_Type = "S", Allele_Difference = abs(as.numeric(x[5])-as.numeric(x[6])))} else {list(parent = "Father", Allele_Size = max(x[5:6]), Transfer_Type ="L", Allele_Difference = abs(as.numeric(x[5])-as.numeric(x[6])))} }) data_f <- cbind(data_f[-c(9:12)], do.call(rbind.data.frame, data_f$lists)) data <- rbind(data_m, data_f) return(data) } # Analysis functions hetero_an <- function(Poly_df){ c(data1, data2, data3, print_df) %<-% heterozygous_parents(Poly_df, "X") data3$Repeat_Bias <- apply(data3, 1, function(x) as.numeric(x[3])/sum(data3$COUNT[data3$value == as.numeric(x[1])])*100) data3$p_ID <- "All" data2$Repeat_Bias <- apply(data2, 1, function(x) as.numeric(x[4])/sum(data2$COUNT[data2$value == as.numeric(x[3]) & data2$p_ID == x[1]])*100) plot1 <- rbind(data2, data3) plot1$tot_Count <- apply(plot1, 1, function(x) sum(plot1$COUNT[plot1$value == as.numeric(x[3]) & plot1$p_ID == x[1]])) plot1 <- plot1[plot1$check==1,] b1 <- ggplot(data1, aes(x=p_ID, y=COUNT, fill=check)) + geom_bar(stat="identity", position=position_dodge()) b1 <- b1 + theme(legend.position="top", legend.title=element_blank(), axis.text.x = element_text(size = 13, face = "bold", colour = "midnightblue"), axis.text.y = element_text(size = 14, face = "plain", colour = "midnightblue"), axis.title.x = element_text(size = 20, face = "bold", colour = "midnightblue"), axis.title.y = element_text(size = 20, face = "bold", colour = "midnightblue"), panel.background = element_blank(), legend.text = element_text(colour="midnightblue", size = 14, face = "bold")) b1 <- b1 + scale_x_discrete(name ="\n Parent") b1 <- b1 + scale_y_continuous(name="") + scale_fill_manual(values = c("deepskyblue", "red2"), labels=c("Smaller Allele", "Larger Allele")) #b2_plot <- aggregate(list("Total"=data2$COUNT), by=list("Larger.Repeat"=data2$value), FUN=sum) #b2 <- ggplot(b2_plot[b2_plot$Larger.Repeat > 19,], aes(x=Larger.Repeat, y=Total)) + geom_point() + geom_line() + theme_classic() #b2 <- b2 + scale_y_continuous(name="Total Count\n", breaks=seq(0,7000,1000), limits=c(0,7000)) #print(b2) b3 <- ggplot(plot1[plot1$value>3,], aes(x=value, y=Repeat_Bias, group=p_ID)) + geom_line(aes(color=p_ID), size=1) + geom_point(aes(color=p_ID)) + theme_classic() b3 <- b3 + scale_x_continuous(name ="\n Larger Allele Repeat Size", breaks=seq(0,130, 10)) b3 <- b3 + scale_y_continuous(name="% Selection Bias\n", breaks=seq(0,100,10), limits=c(0,100)) b3 <- b3 + scale_color_manual(values=c('black', 'blue', 'deeppink')) b3 <- b3 + geom_hline(yintercept=50, linetype="dashed", color = "red") b3 <- b3 + geom_vline(xintercept=50, linetype="dashed", color = "blue") b3 <- b3 + theme(legend.position="top", legend.title=element_blank(), axis.text.x = element_text(size = 13, face = "bold", colour = "midnightblue"), axis.text.y = element_text(size = 14, face = "plain", colour = "midnightblue"), axis.title.x = element_text(size = 20, face = "bold", colour = "midnightblue"), axis.title.y = element_text(size = 20, face = "bold", colour = "midnightblue"), panel.background = element_blank(), legend.text = element_text(colour="midnightblue", size = 14, face = "bold")) b3 <- b3 + scale_colour_manual(labels=c("Total", "Father", "Mother"), values=c("midnightblue", "deepskyblue", "red")) print(b3) #apply(data2, 1, function(x) as.numeric(x[4])/sum(data2$COUNT[data2$value == as.numeric(x[3])])*100) } gene_an = function(All_Events_df, loci){ genes <- All_Events_df#merge(All_Events_df, loci[c("Call_ID", "Gene", "Region")], by="Call_ID") mdata <- melt(genes[c("Call_ID", "Event_Type", "Gene", "Region", "Event_Size")], id=c("Call_ID", "Event_Type", "Gene", "Region")) agg <- ddply(mdata, .(Call_ID, Gene, Region, Event_Type), summarise, AVE = mean(value), SD = sd(value), VAR = var(value), MIN = min(value), MAX = max(value), LEN = length(value)) agg$RANGE <- apply(agg[c("MIN","MAX")], 1, function(x) diff(x)) g1 <- ggplot(agg[agg$Call_ID != "12.7781294",], aes(x=LEN, y=AVE)) + geom_point() print(g1) g2 <- ggplot(agg[agg$Event_Type == "Expansion" & agg$Call_ID != "12.7781294",], aes(x=LEN, y=RANGE)) + geom_point() print(g2) g3 <- ggplot(agg[agg$Event_Type == "Contraction" & agg$Call_ID != "12.7781294",], aes(x=LEN, y=RANGE)) + geom_point() print(g3) g4 <- ggplot(agg[agg$Call_ID != "12.7781294",], aes(x=LEN, y=RANGE)) + geom_point() print(g4) return(list(genes, mdata)) } region_an = function(mdata, loci, genes){ mdata <- mdata[abs(mdata$value) != 1,] genes$Event_Size <- apply(genes[c("Event_Allele", "Event_Size", "Event_Type")], 1, function(x) if (x[3] == "Contraction"){as.numeric(x[2])*-1} else if (x[3] == "Expansion"){as.numeric(x[2])} ) region <- merge( ddply(mdata, .(Region, Event_Type), summarise, amount= length(value) ), aggregate(list("Total_length"=loci$Region), by=list("Region"=loci$Region), FUN=length), by="Region") region$Normalised <- region$amount/region$Total_length r1 <- ggplot(data=region, aes(x=Event_Type, y=Normalised, fill=Region)) + geom_bar(stat="identity", position="dodge") r1 <- r1 + scale_y_continuous(name="Ave. Devations\nper Repeat Locus\n", breaks=seq(0,120,20), limits=c(0,120)) + scale_x_discrete(name="") r1 <- r1 + theme(legend.position="top", axis.text.x = element_text(size = 13, face = "bold", colour = "midnightblue"), axis.text.y = element_text(size = 14, face = "plain", colour = "midnightblue"), axis.title.x = element_text(size = 20, face = "bold", colour = "midnightblue"), axis.title.y = element_text(size = 20, face = "bold", colour = "midnightblue"), panel.background = element_blank(), legend.title = element_text(colour="midnightblue", size=16, face="bold"), legend.text = element_text(colour="black", size = 13, face = "bold")) r1 <- r1 + scale_fill_discrete(labels=c("Downstream", "Exonic", "Intergenic", "Intronic", "ncRNA", "Upstream", "3'-UTR", "5'-UTR")) r3 <- ggplot(data=genes[abs(genes$Event_Size) != 1,], aes(x=Region, y=Event_Size, fill=Region)) + geom_boxplot(outlier.shape = NA) r3 <- r3 + scale_y_continuous(name ="Mutation Size\n\n", breaks=seq(-30,40, 10), limits=c(-30,40)) r3 <- r3 + theme(axis.text.x=element_blank(), axis.ticks.x=element_blank()) + geom_hline(yintercept=0, linetype="dashed", color = "Black") r3 <- r3 + theme(legend.position='none', axis.text.x = element_text(size = 13, face = "bold", colour = "midnightblue"), axis.text.y = element_text(size = 14, face = "plain", colour = "midnightblue"), axis.title.x = element_blank(), axis.title.y = element_text(size = 20, face = "bold", colour = "midnightblue"), panel.background = element_blank(), legend.title = element_text(colour="midnightblue", size=16, face="bold"), legend.text = element_text(colour="black", size = 13, face = "bold")) r3 <- r3 + scale_fill_discrete(labels=c("Downstream", "Exonic", "Intergenic", "Intronic", "ncRNA", "Upstream", "3'-UTR", "5'-UTR")) print(grid.arrange(r1, r3)) } event_an = function(All_Events_df, genes){ All_Events_df$Event_Size <- apply(All_Events_df, 1, function(x) if (x[11] == "Expansion") as.numeric(x[10]) else if (x[11] == "Contraction") as.numeric(x[10])*-1 ) agg1 <- aggregate(All_Events_df$Event_Type, by = list(All_Events_df$Event_Type), length) agg2 <- aggregate(All_Events_df$Event_Size, by = list(All_Events_df$Event_Size), length) e1 <- ggplot(data=agg1, aes(x=Group.1, y=x)) + geom_bar(stat="identity") + geom_text(aes(label = x)) e2 <- ggplot(data=agg2, aes(x=Group.1, y=x)) + geom_bar(stat="identity", colour="Dark red", fill="Red") e2 <- e2 + theme(axis.title.x = element_text(size = 15), axis.title.y = element_text(size = 15), legend.position="top", axis.line = element_line(size=1, colour="Black"), axis.ticks = element_line(size = 1), legend.title = element_text(face = "bold"), panel.background = element_blank(), panel.grid.minor.y = element_blank(), panel.grid.major.x = element_blank(), panel.grid.minor.x = element_blank() ) + scale_y_continuous(name="Deviation Count\n", limits=c(0,150000), labels = function(x) format(x, scientific = TRUE)) + scale_x_continuous(name="\nChange in repeat length") # #write.table(agg2, "C:/Users/DLE/Dropbox/University/PhD/Bioinfo/agg2.tab", quote=F, col.names = T, row.names = F, sep = "\t") genes$X <- apply(genes[c("Event_Allele", "Event_Size", "Event_Type")], 1, function(x) if (x[3] == "Contraction"){as.numeric(x[1])+as.numeric(x[2])} else if (x[3] == "Expansion"){as.numeric(x[1])-as.numeric(x[2])} ) genes$Event_Size <- apply(genes[c("Event_Allele", "Event_Size", "Event_Type")], 1, function(x) if (x[3] == "Contraction"){as.numeric(x[2])*-1} else if (x[3] == "Expansion"){as.numeric(x[2])} ) e3 <- ggplot(data=genes[genes$Call_ID != "12.7781294" & genes$X > 0,], aes(x=X, y=Event_Size, group=as.numeric(X))) + geom_boxplot(outlier.size = 0.3) e3 <- e3 + scale_x_continuous(name ="Parental Repeat Length\n " , breaks=seq(0,170, 10), limits=c(0, 170)) e3 <- e3 + scale_y_continuous(name ="\nDeviation Size (CGG Units)", breaks=seq(-100,250, 50), limits=c(-100,200)) e3 <- e3 + theme(axis.text.x = element_text(size = 13, face = "bold", colour = "Black"), axis.text.y = element_text(size = 14, face = "plain", colour = "Black"), axis.title.x = element_text(size = 20, face = "bold", colour = "Black"), axis.title.y = element_text(size = 20, face = "bold", colour = "Black"), panel.background = element_blank()) + coord_flip() print(e1) print(e2) print(e3) } parent_an =function(All_Events_df){ p_agg <- aggregate(All_Events_df$Parent, by = list(All_Events_df$Parent), length) p1 <- ggplot(p_agg, aes(x=Group.1, y=x, fill=Group.1))+geom_bar(width = 1, stat = "identity") + theme(legend.position="top") print(p1) } # Function to get name of myfunc <- function(v1) { deparse(substitute(v1)) } Process_Trios = function(ngc){ c(trios, families) %<-% Get_Trios(ngc) # ORGANISE AND CLASSIFY TRIO READS c(Events_df, Poly_df, Stable_df, mdata) %<-% testCollect_Expansion_Events(trios, "trios") # ANALYSE REPEAT LENGTH CHANGES col_names <- c("Allele1_Units_1", "Allele2_Units_1", "Allele1_Units_2", "Allele2_Units_2", "Allele1_Units_3", "Allele2_Units_3") Events_df$Check_and_1 <- apply(Events_df[col_names], 1, function(x) if (x[1] %in% x[c(3:4)] & x[1] %in% x[c(5:6)]) {NA} else {x[1]}) Events_df$Check_and_2 <- apply(Events_df[col_names], 1, function(x) if (x[2] %in% x[c(3:4)] & x[2] %in% x[c(5:6)]) {NA} else {x[2]}) Events_df$Check_or_1 <- apply(Events_df[col_names], 1, function(x) if (x[1] %in% x[c(3:4)] | x[1] %in% x[c(5:6)]) {NA} else {x[1]}) Events_df$Check_or_2 <- apply(Events_df[col_names], 1, function(x) if (x[2] %in% x[c(3:4)] | x[2] %in% x[c(5:6)]) {NA} else {x[2]}) # Analsyse the single events Single_Evenets_df <- single_events(Events_df[Events_df$check_all %in% c("A10422", "A14022", "A13021", "A13012", "A10321", "A10312", "A12011", "A10101", "A12002", "A11010", "A11001", "A12020", "A10202","A10220","A10211", "A11120","A10110", "A11102", "A01120", "A01102", "A02240", "A02204", "X13021", "X10101", "X10321", "X12011", "X01120", "X11010", "X10000", "X11001", "X10110", "X10211", "X02240", "X11102", "X01102", "X10220" ),]) # Analsyse the dounle events Double_Events_df <- double_events(Events_df[Events_df$check_all %in% c("X00000", "X12020", "A00000", "A10000"),c("Call_ID","Sample_ID","Allele1_Units_1","Allele2_Units_1","Allele1_Units_2","Allele2_Units_2","Allele1_Units_3","Allele2_Units_3","check_sum","Check_and_1","Check_and_2","Check_or_1","Check_or_2")]) # Combine all Expansion/Contraction Events All_Events_df <- rbind(Single_Evenets_df, Double_Events_df) All_Events_df <- merge(All_Events_df, loci[c("Call_ID", "Chr", "Gene", "Region")], by="Call_ID") All_Events_df$Event_Size <- abs(All_Events_df$Event_Size) All_Reads_df <- rbind(Stable_df[c("Call_ID","Sample_ID", "Allele1_Units_1", "Allele2_Units_1", "Allele1_Units_2", "Allele2_Units_2", "Allele1_Units_3", "Allele2_Units_3")], Poly_df[c("Call_ID","Sample_ID", "Allele1_Units_1", "Allele2_Units_1", "Allele1_Units_2", "Allele2_Units_2", "Allele1_Units_3", "Allele2_Units_3")], Events_df[c("Call_ID","Sample_ID", "Allele1_Units_1", "Allele2_Units_1", "Allele1_Units_2", "Allele2_Units_2", "Allele1_Units_3", "Allele2_Units_3")]) return(list(Events_df, Poly_df, Stable_df, mdata, All_Events_df, All_Reads_df)) } ############################## Circos (R) ############################## # Import data library(circlize) library(data.table) library(plyr) library(stringr) library(reshape2) loci <- data.frame(fread("/home/dannear/Dropbox/University/PhD/Bioinfo/MSSNG/Sample Data/Quads_Full_20220215_by_locus.csv")) all_trios <- data.frame(fread("/mnt/Storage/Manuscript2_Data/All_Data_TRIOFORMAT.tab")) all_trios <- all_trios[,2:9] names(all_trios) <- c("Call_ID", "Sample_ID", "Allele1_Units_1", "Allele2_Units_1", "Allele1_Units_2", "Allele2_Units_2", "Allele1_Units_3", "Allele2_Units_3") all_mutations <- data.frame(fread("/mnt/Storage/Manuscript2_Data/ancSTR_Trio_analysis_MutationReads_All_Data.tab")) # Set data plot_data <- melt(all_trios[c("Call_ID", "Allele1_Units_2", "Allele2_Units_2", "Allele1_Units_3", "Allele2_Units_3")], id="Call_ID") plot_data <- plot_data[plot_data$value != 0,] event_data <- melt(all_mutations[c("Call_ID", "Event_Size", "Event_Type")], id=c("Call_ID", "Event_Type")) event_data <- event_data[event_data$value != 0,] rm(all_trios) rm(all_mutations) gc() ave_1 <- aggreagte(plot_data) ave_size <- ddply(plot_data, .(Call_ID), summarise, AVE = mean(value), MED = median(value), Len = length(value)/2) ave_size_2 <- ddply(event_data, .(Call_ID, Event_Type), summarise, AVE = mean(value), MED = median(value), Len = length(value)) ave_size_2$AVE <- apply(ave_size_2[c("Event_Type", "AVE")], 1, function(x) if (x[1] == "Contraction") -1*as.numeric(x[2]) else if (x[1] == "Expansion") as.numeric(x[2]) ) ave_size_2$start <- as.numeric(str_split_fixed(ave_size_2$Call_ID, "\\.", 2)[,2]) ave_size_2$col <- apply(ave_size_2[c("Event_Type", "AVE")], 1, function(x) if (x[1] == "Contraction") "royalblue2" else if (x[1] == "Expansion") "darkorange" ) ave_size_2 <- ave_size_2[ave_size_2$AVE > -20 & ave_size_2$AVE < 50,] size_len <- ddply(event_data, .(Call_ID), summarise, Ave_Mut_Size = mean(abs(value)), total = length(value)) type_len <- ddply(event_data, .(Call_ID, Event_Type), summarise, event_count = length(value)) all_data <- Reduce(function(x, y) merge(x, y, all=TRUE, by="Call_ID"), list(ave_size, size_len, type_len)) all_data$Rate <- all_data$total/all_data$Len*100 all_data$Rate2 <- all_data$total/all_data$Len/2 all_data$chr <- paste0("chr", str_split_fixed(all_data$Call_ID, "\\.", 2)[,1]) all_data$start <- as.numeric(str_split_fixed(all_data$Call_ID, "\\.", 2)[,2]) mut_count <- melt(data = all_data[c("Call_ID", "Event_Type", "event_count")], id.vars = c("Call_ID", "Event_Type")) mut_count <- select(mut_count, -variable) mut_count <- dcast(mut_count, Call_ID ~ Event_Type, value.var = "value", drop=TRUE)[c("Call_ID", "Expansion", "Contraction")] mut_count[is.na(mut_count)] <- 0 final <- merge(unique(all_data[c("Call_ID", "chr", "start", "AVE", "MED", "Len", "Ave_Mut_Size", "total", "Rate")]), mut_count, by="Call_ID") final[is.na(final)] <- 0 final$colour_rate <- apply(final[c("Rate")], 1, function(x) if (x[1] > 30) "darkorange" else if (x[1] > 1) "royalblue2" else if (x[1] > 0) "green" else if (x[1] == 0) "green") final$Contraction <- final$Contraction*-1 ave_size_2$chr <- paste0("chr", str_split_fixed(ave_size_2$Call_ID, "\\.", 2)[,1]) bed_labels <- merge(final[c("Call_ID", "chr", "start", "Rate")], loci[c("Call_ID", "Gene")]) bed_labels$end <- bed_labels$start+1 plot_labels <- bed_labels[bed_labels$Rate > 20,] plot_labels <- plot_labels[c("chr", "start", "end", "Gene")] plot_labels <- plot_labels[!duplicated(plot_labels$Gene), ] clean_genes = function(pl){ pl$Gene_1 <- str_split_fixed(pl$Gene, "->", 2)[,1] pl$Gene_2 <- str_split_fixed(pl$Gene, "->", 2)[,2] pl1 <- pl[c("chr","start","end","Gene")] pl1$Gene <- pl$Gene_1 pl2 <- pl[c("chr","start","end","Gene")] pl2$Gene <- pl$Gene_2 pl2$start <- pl2$start + 100 pl2$end <- pl2$start + 1 pl <- rbind(pl1, pl2) pl <- pl[pl$Gene != "" & pl$Gene != "NONE",] pl <- pl[!duplicated(pl$Gene), ] pl$Gene <- apply(pl[c("Gene")], 1, function(x) if (nchar(x[1])>10) substr(x[1],1,nchar(x[1])-3) else x[1]) pl$cols <- apply(pl[c("chr")], 1, function(x){ chr <- gsub("chr", "", x[1]) if (chr == "X") "royalblue2" else if ((as.numeric(chr) %% 3) == 0) "royalblue2" else if ((as.numeric(chr) %% 2) == 0) "darkorange" else "azure4"} ) return(pl) } plot_labels <- clean_genes(plot_labels) # PLOT LABELS tiff("/home/dannear/Dropbox/University/PhD/Writing/Manuscript 2/Data analysis/graphs/circos/test3.tiff", units="in", width=8, height=8, res=300) circos.clear() circos.par("start.degree" = 90) circos.initializeWithIdeogram(plotType = NULL, chromosome.index = paste0("chr", c(1:22, "X")), species = "hg38") #species = "hg38", chromosome.index = paste0("chr", c(1:22, "X")) # Add gene labels circos.genomicLabels(plot_labels, labels.column = 4, side = "outside", cex = 0.60, col = plot_labels$cols, line_col = plot_labels$cols, font = par("serif")) # Add ideogram circos.genomicIdeogram() # Add Chr labels circos.trackPlotRegion(track.index = 1, panel.fun = function(x, y) { xlim = get.cell.meta.data("xlim") ylim = get.cell.meta.data("ylim") sector.name = gsub("chr", "", get.cell.meta.data("sector.index")) if (sector.name == "X") cols <- "royalblue2" else if ((as.numeric(sector.name) %% 3) == 0) cols <- "royalblue2" else if ((as.numeric(sector.name) %% 2) == 0) cols <- "darkorange" else cols <- "azure4" circos.text(CELL_META$xcenter, ylim[1] + cm_h(11), sector.name, facing = "clockwise", niceFacing = TRUE, adj = c(0, 2), cex = 1, font = 2, col = cols) }, bg.border = NA) # Mutation Rate circos.track(ylim = c(0, 90), panel.fun = function(x, y) { chr = CELL_META$sector.index xlim = CELL_META$xlim ylim = CELL_META$ylim y = seq(0, 90, by = 30) circos.segments(0, y, xlim[c("max.data")], y) }, track.height=0.20, bg.border = NA) circos.trackLines(track.index=4,final$chr, final$start, final$Rate, type="h", col = final$colour_rate) # Ave Mutation Size circos.track(ylim = c(-20, 40), panel.fun = function(x, y) { chr = CELL_META$sector.index xlim = CELL_META$xlim ylim = CELL_META$ylim y = seq(-20, 40, by = 10) circos.segments(0, y, xlim[c("max.data")], y, col="gray") }, track.height=0.35, bg.border = NA) circos.trackPoints(track.index=5, ave_size_2$chr, ave_size_2$start, ave_size_2$AVE, cex = 0.2, col = ave_size_2$col)#ave_size_2$Event_Type[ave_size_2$AVE > -20] ) dev.off() ############################## Gene Analysis (R) ############################## # Import gene lists of dissease-associated genes nddgenes <- read.csv("C:/Users/DLE/Dropbox/University/PhD/Bioinfo/anceSTR/anceSTR/Data/Repeats_of_interest_NDD_ID.txt", header = T, sep="\t") nddgeneslist <- unique(read.csv("C:/Users/DLE/Dropbox/University/PhD/Bioinfo/anceSTR/anceSTR/Data/Repeats_of_interest_NDD_ID_2.txt", header = T, sep="\t")[1]) fs_37_sites <- data.frame(FragileSite=c("FRA2A", "FRA7A", "FRA10A", "FRA11A", "FRA11B", "FRA12A", "FRA16A", "FRA19B", "FRAXA", "FRAXE", "FRAXF"), ID=c("2.100721250", "7.55955533", "10.95462280", "11.66512291", "11.119077000", "12.50898785", "16.17564765", "19.2308194", "X.146993568", "X.147582126", "X.148713408") ) fs_38_sites <- data.frame(FragileSite=c("FRA2A", "FRA7A", "FRA10A", "FRA11A", "FRA11B", "FRA12A", "FRA16A", "FRA19B", "FRAXA", "FRAXE", "FRAXF"), ID=c("2.100104473", "7.55887840", "10.93702523", "11.66744820", "11.119206290", "12.50505002", "16.17470908", "19.2308195", "X.147912050", "X.148500606", "X.149631714") ) # Melt data present in trio format trio_melt <- melt(all_trios[c("Call_ID", "Allele1_Units_2", "Allele2_Units_2", "Allele1_Units_3", "Allele2_Units_3")], id="Call_ID") # Remove all 0 values, i.e. missing X chromosome alleles in males trio_melt <- trio_melt[trio_melt$value != 0,] # Melt data of all event reads event_melt <- melt(all_mutations[c("Call_ID", "Event_Size", "Event_Type", "Parent")], id=c("Call_ID", "Event_Type", "Parent")) event_melt <- event_melt[event_melt$value != 0,] # Calculate event metrics per locus ave_size_event <- ddply(trio_melt, .(Call_ID), summarise, AVE = mean(value), MED = median(value), Len = length(value)/2) size_len_event <- ddply(event_melt, .(Call_ID), summarise, Size = mean(abs(value)), total = length(value)) type_len_event <- ddply(event_melt, .(Call_ID, Event_Type), summarise, total = length(value)) parent_len_event <- ddply(event_melt, .(Call_ID, Parent), summarise, total = length(value)) # Merge data for plotting tplot <- Reduce(function(x,y) merge(x = x, y = y, by = "Call_ID"), list(ave_size_event, size_len_event, loci[c("Call_ID", "Gene", "Region")])) tplot$rate <- tplot$total/tplot$Len*100 tplot$rate2 <- tplot$total/tplot$Len tplot$Gene_1 <- str_split_fixed(tplot$Gene, "->", 2)[,1] tplot$Gene_2 <- str_split_fixed(tplot$Gene, "->", 2)[,2] labplot <- tplot[tplot$Call_ID %in% fs_38_sites$ID,] # Build plot of mutation rate verses mean parental repeat length aveplot <- ggplot(tplot, aes(x=AVE, y=rate2, fill=Region, color=Region, label=Gene)) + geom_point() + theme(legend.position="top", axis.line = element_line(size=1.5, colour="Black"), axis.ticks = element_line(size = 1), legend.title = element_text(face = "bold"), panel.grid.minor.y = element_blank(), panel.grid.major.x = element_blank(), panel.grid.minor.x = element_blank()) + scale_y_continuous(labels = function(x) format(x, scientific = TRUE), name="Deviation Rate\n(Change per gamete per generation)\n", breaks=seq(0,1,0.1), limits=c(0,1)) + scale_x_continuous(name="\nMean Parental Repeat Length", breaks=seq(0,50,5), limits=c(0,50)) + theme(legend.title = element_text(size = 13, face = "bold", colour = "Black"), axis.text.x = element_text(size = 13, face = "bold", colour = "Black"), axis.text.y = element_text(size = 14, face = "plain", colour = "Black"), axis.title.x = element_text(size = 20, face = "bold", colour = "Black"), axis.title.y = element_text(size = 20, face = "bold", colour = "Black"), panel.background = element_blank(), legend.text = element_text(colour="Black", size = 13), axis.line = element_line(size = 1, colour = "Black", linetype=1)) aveplot # Select repeats that overlap with disease-associated genes geneplot <- tplot[(tplot$Gene_1 %in% nddgenes$Gene | tplot$Gene_2 %in% nddgenes$Gene)& tplot$AVE<60,] aveplot2 <- aveplot + geom_point(data=geneplot, shape=25, color="red", size=3)+ geom_smooth(method=lm, se=F, aes(group=1)) + geom_text(data=subset(tplot, Call_ID %in% fs_38_sites$ID & Gene != "LINGO3" & Gene != "CBL"), position=position_jitter(width=0.1,height=0.1), aes(label=Gene), hjust=-0.2, vjust=0, colour="Black", fontface=2) + geom_point(data=subset(tplot, Call_ID %in% fs_38_sites$ID), shape=21, color="Black", size=2) aveplot2 # Build plot of Ave mutation size verses mean parental repeat length tplot$col1 <- "black" sizeplot <- ggplot(tplot, aes(x=AVE, y=Size, fill=Region, color=Region, label=Gene)) + geom_point() + stat_function(fun = function(x) {-0.3*x + 12}, size = 1, colour = "blue") + geom_point(data=subset(tplot, Call_ID %in% fs_38_sites$ID), shape=21, color="Black", size=2) + geom_text(data=subset(tplot, Call_ID %in% fs_38_sites$ID & Gene != "LINGO3" & Gene != "CBL" ), position=position_jitter(width=2,height=2), aes(label=Gene), hjust=-0.2, vjust=0, colour="Black", fontface=2) + theme(legend.position="top", axis.line = element_line(size=1.5, colour="Black"), axis.ticks = element_line(size = 1), legend.title = element_text(face = "bold"), panel.grid.minor.y = element_blank(), panel.grid.major.x = element_blank(), panel.grid.minor.x = element_blank()) + theme(legend.title = element_text(size = 13, face = "bold", colour = "Black"), axis.text.x = element_text(size = 13, face = "bold", colour = "Black"), axis.text.y = element_text(size = 14, face = "plain", colour = "Black"), axis.title.x = element_text(size = 20, face = "bold", colour = "Black"), axis.title.y = element_text(size = 20, face = "bold", colour = "Black"), panel.background = element_blank(), legend.text = element_text(colour="Black", size = 13), axis.line = element_line(size = 1, colour = "Black", linetype=1)) + scale_y_continuous(name="Mean Deviation Size\n (CGG Units)\n", breaks=seq(0,40,5), limits=c(0,40)) + scale_x_continuous(name="\nMean Parental Repeat Length", breaks=seq(0,50,5), limits=c(0,50)) + scale_fill_discrete(labels=c("downstream"="Downstream", "exonic"="Exonic", "intergenic"="Intergenic", "intronic"="Intronic", "ncRNA"="ncRNA", "upstream"="Upstream", "UTR3"="3'-UTR", "UTR5"="5'-UTR")) sizeplot # Graphs for larger mutations outside the normal range large <- tplot[tplot$rate2 > 0.01,] rplot_rate <- ggplot(large, aes(x=Region, y=rate2, fill=Region)) + geom_boxplot() + scale_y_continuous(labels = function(x) format(x, scientific = TRUE),name ="Deviation Rate\n(Events per gamete per generation)\n", breaks=seq(0, 1, 0.1), limits=c(0,1)) + theme(axis.text.x=element_blank(), axis.ticks.x=element_blank()) + theme(legend.position='top', axis.text.x = element_text(size = 13, face = "bold", colour = "Black"), axis.text.y = element_text(size = 14, face = "plain", colour = "Black"), axis.title.x = element_text(size = 20, face = "bold", colour = "Black"), axis.title.y = element_text(size = 20, face = "bold", colour = "Black"), panel.background = element_blank(), legend.title = element_text(colour="Black", size=16, face="bold"), legend.text = element_text(colour="Black", size = 13, face = "bold")) + scale_fill_discrete(labels=c("downstream"="Downstream", "exonic"="Exonic", "intergenic"="Intergenic", "intronic"="Intronic", "ncRNA"="ncRNA", "upstream"="Upstream", "UTR3"="3'-UTR", "UTR5"="5'-UTR")) + scale_x_discrete(name = "\nGenetic Region", labels=c("downstream"="Downstream", "exonic"="Exonic", "intergenic"="Intergenic", "intronic"="Intronic", "ncRNA"="ncRNA", "upstream"="Upstream", "UTR3"="3'-UTR", "UTR5"="5'-UTR")) rplot_rate rplot_size <- ggplot(large, aes(x=Region, y=Size, fill=Region)) + geom_boxplot() + scale_y_continuous(name ="Average Deviation Size\n(CGG Units)\n", breaks=seq(0, 35, 5), limits=c(0,35)) + theme(axis.text.x=element_blank(), axis.ticks.x=element_blank()) + theme(legend.position='top', axis.text.x = element_text(size = 13, face = "bold", colour = "Black"), axis.text.y = element_text(size = 14, face = "plain", colour = "Black"), axis.title.x = element_text(size = 20, face = "bold", colour = "Black"), axis.title.y = element_text(size = 20, face = "bold", colour = "Black"), panel.background = element_blank(), legend.title = element_text(colour="Black", size=16, face="bold"), legend.text = element_text(colour="Black", size = 13, face = "bold")) + scale_fill_discrete(labels=c("downstream"="Downstream", "exonic"="Exonic", "intergenic"="Intergenic", "intronic"="Intronic", "ncRNA"="ncRNA", "upstream"="Upstream", "UTR3"="3'-UTR", "UTR5"="5'-UTR")) + scale_x_discrete(name = "\nGenetic Region", labels=c("downstream"="Downstream", "exonic"="Exonic", "intergenic"="Intergenic", "intronic"="Intronic", "ncRNA"="ncRNA", "upstream"="Upstream", "UTR3"="3'-UTR", "UTR5"="5'-UTR")) rplot_rate sig_genes <- read.csv("C:/Users/DLE/Dropbox/University/PhD/Writing/Manuscript 2/Data analysis/scripts/repeat_stats.txt", sep="t") siglist <- data.frame("genes"=unique(c(str_split_fixed(sig_genes$Gene, "->", 2)[,1], str_split_fixed(sig_genes$Gene, "->", 2)[,2]))) siglist <- siglist[siglist$genes %in% nddgeneslist$Gene,] names(vari) <- c("x", "y") lm_eqn <- function(df){ m <- lm(y ~ x, df); eq <- substitute(italic(y) == a + b %.% italic(x)*","~~italic(r)^2~"="~r2, list(a = format(unname(coef(m)[1]), digits = 2), b = format(unname(coef(m)[2]), digits = 2), r2 = format(summary(m)$r.squared, digits = 3))) as.character(as.expression(eq)); } p1 <- ggplot(vari, aes(x=x, y=y)) + geom_point() + geom_smooth(method = "lm", se=FALSE, color="black", formula = y ~ x) p1 <- p1 + geom_text(label = lm_eqn(vari), parse = TRUE) p1 vari2 <- tplot[c("rate2", "AVE")] names(vari2) <- c("y", "x") p2 <- ggplot(vari2, aes(x=x, y=y)) + geom_smooth(method = "lm", se=FALSE, color="black", formula = y ~ x) p2 <- p2 + geom_text(label = lm_eqn(vari), parse = TRUE) p2 ############################## Trio Analysis (R) ############################## # INTERGENERATIONAL REPEATS ANALYSIS # IMPORT FUNCTIONS source("C:/Users/DLE/Dropbox/University/PhD/Bioinfo/anceSTR/anceSTR/functions/functions.R") # LIBRARIES Package_up() # IMPORT BASE DATA 1 c(ngc, loci) %<-% Import_Data(as.character(Sys.info()[['sysname']]), 38) # IMPORT BASE DATA 2 ngc <- read.csv(tcltk::tk_choose.files(caption = "Select Reads File"), header = T, sep = "\t") loci <- read.csv(tcltk::tk_choose.files(caption = "Select Corresonding Loci File"), header = T, sep = "\t") # IMPORT BASE DATA 3 ngc <- assign_structure("C:/Users/DLE/Dropbox/University/PhD/Bioinfo/MSSNG/Analysis/Quads/CGG_Repeats_Quads_Full_20220215_filtered.csv","C:/Users/DLE/Dropbox/University/PhD/Bioinfo/MSSNG/Sample Data/MSSNG_family_manifest.tab", 4) # ORGANISE RAW DATA INTO TRIOS c(trios_df, families_df) %<-% Get_Trios(ngc) # ORGANISE AND CLASSIFY TRIO READS c(Events_df, Poly_df, Stable_df, mdata) %<-% Collect_Expansion_Events(all_trios, "trios") c(Events_df, Poly_df, Stable_df, mdata) %<-% testCollect_Expansion_Events(trio_t, "trios") # ANALYSE REPEAT LENGTH CHANGES col_names <- c("Allele1_Units_1", "Allele2_Units_1", "Allele1_Units_2", "Allele2_Units_2", "Allele1_Units_3", "Allele2_Units_3") Events_df$Check_and_1 <- apply(Events_df[col_names], 1, function(x) if (x[1] %in% x[c(3:4)] & x[1] %in% x[c(5:6)]) {NA} else {x[1]}) Events_df$Check_and_2 <- apply(Events_df[col_names], 1, function(x) if (x[2] %in% x[c(3:4)] & x[2] %in% x[c(5:6)]) {NA} else {x[2]}) Events_df$Check_or_1 <- apply(Events_df[col_names], 1, function(x) if (x[1] %in% x[c(3:4)] | x[1] %in% x[c(5:6)]) {NA} else {x[1]}) Events_df$Check_or_2 <- apply(Events_df[col_names], 1, function(x) if (x[2] %in% x[c(3:4)] | x[2] %in% x[c(5:6)]) {NA} else {x[2]}) # Analsyse the single events Single_Evenets_df <- single_events(Events_df[Events_df$check_all %in% c("A10422", "A14022", "A13021", "A13012", "A10321", "A10312", "A12011", "A10101", "A12002", "A11010", "A11001", "A12020", "A10202","A10220","A10211", "A11120","A10110", "A11102", "A01120", "A01102", "A02240", "A02204", "X13021", "X10101", "X10321", "X12011", "X01120", "X11010", "X10000", "X11001", "X10110", "X10211", "X02240", "X11102", "X01102", "X10220" ),]) # Analsyse the dounle events Double_Events_df <- double_events(Events_df[Events_df$check_all %in% c("X00000", "X12020", "A00000", "A10000"),c("Call_ID","Sample_ID","Allele1_Units_1","Allele2_Units_1","Allele1_Units_2","Allele2_Units_2","Allele1_Units_3","Allele2_Units_3","check_sum","Check_and_1","Check_and_2","Check_or_1","Check_or_2")]) # Combine all Expansion/Contraction Events All_Events_df <- rbind(Single_Evenets_df, Double_Events_df) All_Events_df <- merge(All_Events_df, loci[c("Call_ID", "Chr", "Gene", "Region")], by="Call_ID") All_Events_df$Event_Size <- abs(All_Events_df$Event_Size) All_Reads_df <- rbind(Stable_df[c("Call_ID","Sample_ID", "Allele1_Units_1", "Allele2_Units_1", "Allele1_Units_2", "Allele2_Units_2", "Allele1_Units_3", "Allele2_Units_3")], Poly_df[c("Call_ID","Sample_ID", "Allele1_Units_1", "Allele2_Units_1", "Allele1_Units_2", "Allele2_Units_2", "Allele1_Units_3", "Allele2_Units_3")], Events_df[c("Call_ID","Sample_ID", "Allele1_Units_1", "Allele2_Units_1", "Allele1_Units_2", "Allele2_Units_2", "Allele1_Units_3", "Allele2_Units_3")]) # Save files trios_save_data(tcltk::tk_choose.dir(caption = "Select Save directory"), "MSSNG_Trios", All_Events_df, Poly_df, Stable_df, mdata) "XYLT1", "FMR1","AFF2","HOXD1","LOC642361","LRP12","GIPC1", "NOTCH2NLC", "DIP2B", "AFF3", "FRA10AC1" ############################## Quad Analysis (R) ############################## ngc <- assign_structure("E:/MSSNG/Quads_MSSNG/CGG_Repeats_Quads_Full_20220215_filtered.csv", "C:/Users/DLE/Dropbox/University/PhD/Bioinfo/MSSNG/Sample Data/MSSNG_family_manifest.tab", 4) ngc <- assign_structure("/mnt/Storage/MSSNG/CGG_Repeats_Quads_Full_20220215_filtered.csv", "/home/dannear/Dropbox/University/PhD/Bioinfo/MSSNG/Sample Data/MSSNG_family_manifest.tab", 4) probands <- ngc[ngc$Member != 4,] sibling <- ngc[ngc$Member != 1,] sibling["Member"][sibling["Member"] == 4] <- 1 c(pro_Events, pro_Info, pro_mono, pro_mdata, pro_All_Event, pro_All_Reads) %<-% Process_Trios(probands) c(sib_Events, sib_Info, sib_mono, sib_mdata, sib_All_Event, sib_All_Reads) %<-% Process_Trios(sibling) event_counts <- aggregate(list("count"=quad_All_Events$Event_Size), by=list("Gene"=quad_All_Events$Gene, "Type"=quad_All_Events$Type), length) test <- aov(count ~ Type * Gene, data = event_counts) ttt <- TukeyHSD(test) summary(quad_All_Events) one.way <- aov(Event_Size ~ Type, data = quad_All_Events) summary(one.way) two.way <- aov(Event_Size ~ Gene + Type, data = quad_All_Events) summary(two.way) two.way2 <- aov(Event_Size ~ Gene * Type, data = quad_All_Events) summary(two.way2) tt <- TukeyHSD(two.way2) length(pro_All_Event$Gene[pro_All_Event$Gene %in% genelist]) length(sib_All_Event$Gene[sib_All_Event$Gene %in% genelist]) test <- merge(aggregate(list("count"=pro_All_Event$Gene), by=list("gene"=pro_All_Event$Gene), length), aggregate(list("count"=sib_All_Event$Gene), by=list("gene"=sib_All_Event$Gene), length), by="gene") test$diff <- test$count.x-test$count.y test$ration <- test$count.x/test$count.y test$ration2 <- apply(test, 1, function(x) if (as.numeric(x[2]) > as.numeric(x[3])) as.numeric(x[2])/as.numeric(x[3]) else if (as.numeric(x[2]) < as.numeric(x[3])) -1*as.numeric(x[3])/as.numeric(x[2]) else 0 ) test$length <- test$count.x+test$count.y test$percentage.diff <- abs(test$diff)/test$length*100 test2 <- test[test$diff != 0,] p<-ggplot(data=test, aes(x=gene, y=ration2)) + geom_bar(stat="identity") p p <- ggplot(quad_All_Events[quad_All_Events$Event_Size != 1,], aes(x=Type, y=Event_Size, fill=Gene)) + geom_boxplot() p <- p + theme(legend.position="none") p lencheck <- merge(aggregate(list("total"=as.numeric(pro_All_Event$Event_Size)), by=list("gene"=pro_All_Event$Gene), sum), aggregate(list("total"=as.numeric(sib_All_Event$Event_Size)), by=list("gene"=sib_All_Event$Gene), sum), by="gene") # IMPORT DATA library(stringi) All_Events <- all_mutations_quads All_Events$Type <- str_split_fixed(stri_reverse(All_Events$Sample_ID), "_", 2)[,1] pro_All_Event <- All_Events[All_Events$Type != "2",] sib_All_Event <- All_Events[All_Events$Type == "2",] pro_All_Event$Type <- "Proband" sib_All_Event$Type <- "Sibling" quad_All_Events <- rbind(pro_All_Event, sib_All_Event) # AGGREGATE DATA BY GENE AND DEFINE POROBANDS AND SIBLINGS com_melt1 <- melt(pro_All_Event[pro_All_Event$Event_Type == "Expansion", c("Gene", "Type", "Event_Size")], id=c("Gene", "Type")) test1 <- ddply(com_melt1, .(Gene, Type), summarise, AVE = mean(value, na.rm=TRUE), LEN = length(value)) com_melt2 <- melt(sib_All_Event[sib_All_Event$Event_Type == "Expansion", c("Gene", "Type", "Event_Size")], id=c("Gene", "Type")) test2 <- ddply(com_melt2, .(Gene, Type), summarise, AVE = mean(value, na.rm=TRUE), LEN = length(value)) test_merge <- merge(test1, test2, by = "Gene", all=T) test_merge$Type.x <- 1 #proband test_merge$Type.y <- 0 #sibling test_bind <- rbind(setNames(test_merge[c("Gene", "Type.x", "AVE.x", "LEN.x")], c("Gene", "Type", "AVE", "LEN")), setNames(test_merge[c("Gene", "Type.y", "AVE.y", "LEN.y")], c("Gene", "Type", "AVE", "LEN"))) test_bind[is.na(test_bind)] <- 0 # RUN MODEL model <- glm(Type ~ LEN + Gene, data = test_bind, family = binomial) summary(model) event_stats <- data.frame(summary(model)$coefficients) names(event_stats) <- c("Estimate", "Std.Error", "z.value", "p.value") event_stats$Gene <- str_replace(as.character(row.names(event_stats)), "Gene", "") event_stats$sig.Size <- apply(event_stats, 1, function(x) if (as.numeric(x[4]) < 0.001) "***" else if (as.numeric(x[4]) < 0.01) "**" else if (as.numeric(x[4]) < 0.05) "*" else if (as.numeric(x[4]) <= 0.1) "." else "") model_size <- glm(Type ~ LEN + AVE + Gene, data = test_bind, family = binomial) size_stats <- data.frame(summary(model_size)$coefficients) names(size_stats) <- c("Estimate", "Std.Error", "z.value", "p.value") size_stats$Gene <- str_replace(as.character(row.names(size_stats)), "Gene", "") size_stats$sig.Size <- apply(size_stats, 1, function(x) if (as.numeric(x[4]) < 0.001) "***" else if (as.numeric(x[4]) < 0.01) "**" else if (as.numeric(x[4]) < 0.05) "*" else if (as.numeric(x[4]) <= 0.1) "." else "") # CORRECT FOR MULTIPLR TESTING library(FSA) input <- data.frame("Gene"=event_stats$Gene, "Raw.p"=event_stats$p.value) input <- input[order(input$Raw.p),] input <- input[!(input$Gene %in% c("LEN", "(Intercept)")),] headtail(input) input$Bonferroni = p.adjust(input$Raw.p, method = "bonferroni") input$BH = p.adjust(input$Raw.p, method = "BH") View(input) input ############################## Informative Analysis (R) ############################## all_data <- informative_analysis(all_inform) ngc_data <- informative_analysis(ngc_i) trios_data <- informative_analysis(trio_i) quads_data <- informative_analysis(quad_i) mvf <- aggregate(list("count"=all_data$Transfer_Type), by=list("parent"=all_data$parent, "type"=all_data$Transfer_Type), length) d1 <- ggplot(mvf, aes(x=parent, y=count)) + geom_bar(stat='identity', aes(fill=type), position = position_dodge(), width=.5) d1 <- d1 + theme(legend.position="top", axis.text.x = element_text(size = 13, face = "bold", colour = "midnightblue"), axis.text.y = element_text(size = 14, face = "plain", colour = "midnightblue"), axis.title.x = element_text(size = 20, face = "bold", colour = "midnightblue"), axis.title.y = element_text(size = 20, face = "bold", colour = "midnightblue"), panel.background = element_blank(), legend.text = element_text(colour="midnightblue", size = 14, face = "bold")) d1 <- d1 + xlab("") + ylab("No. of Transfers\n") + scale_fill_discrete(name = "Allele Type", labels = c("Larger", "Smaller")) d1 new_data <- trios_data bias_diff <- aggregate(list("count"=new_data$Allele_Difference), by=list("type"=new_data$Transfer_Type, "diff"=new_data$Allele_Difference), length) bias_diff <- merge(bias_diff[bias_diff$type == "S",], bias_diff[bias_diff$type == "L",], by="diff", all=T) bias_diff$type.x <- "S" bias_diff$type.y <- "L" bias_diff[is.na(bias_diff)] <- 0 bias_diff$bias <- bias_diff$count.x/(bias_diff$count.x+bias_diff$count.y)*100 bias_diff$bias_t <- bias_diff$bias-50 bias_diff$ratio <- bias_diff$count.x/bias_diff$count.y d2 <- ggplot(bias_diff, aes(x=diff, y=bias, label=bias)) + geom_point(stat = "identity", shape =2) + geom_line(stat = "identity") d2 <- d2 + scale_x_continuous(name ="\n Allele Size Difference")#, breaks=seq(0,60, 10), limits = c(0, 60)) d2 <- d2 + scale_y_continuous(name ="Proportion Smaller Allele Selected (%)\n", breaks = seq(50, 100, 10), limits = c(50, 100)) d2 <- d2 + theme(legend.position="top", legend.title=element_blank(), axis.text.x = element_text(size = 13, face = "bold", colour = "midnightblue"), axis.text.y = element_text(size = 14, face = "plain", colour = "midnightblue"), axis.title.x = element_text(size = 20, face = "bold", colour = "midnightblue"), axis.title.y = element_text(size = 20, face = "bold", colour = "midnightblue"), panel.background = element_blank(), legend.text = element_text(colour="midnightblue", size = 14, face = "bold")) d2 d3 <- ggplot(bias_diff, aes(x=diff, y=ratio, label=ratio)) + geom_bar(stat = "identity") d3 <- d3 + scale_x_continuous(name ="\n Allele Size Difference", breaks=seq(0,100, 10), limits = c(0, 100)) d3 <- d3 + scale_y_continuous(name ="Selection Ratio (Smaller Allele : Larger Allele) \n", breaks = seq(0, 20, 1), limits = c(0, 20)) d3 <- d3 + theme(legend.position="top", legend.title=element_blank(), axis.text.x = element_text(size = 13, face = "bold", colour = "midnightblue"), axis.text.y = element_text(size = 14, face = "plain", colour = "midnightblue"), axis.title.x = element_text(size = 20, face = "bold", colour = "midnightblue"), axis.title.y = element_text(size = 20, face = "bold", colour = "midnightblue"), panel.background = element_blank(), legend.text = element_text(colour="midnightblue", size = 14, face = "bold")) d3 build_d1 = function(all, ngc, trios, quads){ for (x in c(myfunc(all), myfunc(ngc), myfunc(trios), myfunc(quads))){ data <- get(x) assign(paste0("mvf_", x), aggregate(list("count"=data$Transfer_Type), by=list("parent"=data$parent, "type"=data$Transfer_Type), length)) assign(paste0("d1_", x), ggplot(get(paste0("mvf_", x)), aes(x=parent, y=count)) + geom_bar(stat='identity', aes(fill=type), position = position_dodge(), width=.5) + theme(legend.position="top", axis.text.x = element_text(size = 13, face = "bold", colour = "black"), axis.text.y = element_text(size = 14, face = "plain", colour = "black"), axis.title.x = element_blank(), axis.title.y = element_blank(), panel.background = element_blank(), legend.text = element_text(colour="black", size = 14, face = "bold")) + xlab("") + ylab("No. of Transfers\n") + scale_fill_manual(name = "Allele Type", labels = c("Larger", "Smaller"), values = c("L" = "steelblue", "S" = "darkorange")) ) } grid.arrange(d1_all, d1_ngc, d1_trios, d1_quads) } build_d1(all_data, ngc_data, trios_data, quads_data) build_d2 = function(all, ngc, trios, quads){ for (x in c(myfunc(all), myfunc(ngc), myfunc(trios), myfunc(quads))){ data <- get(x) bias_diff <- aggregate(list("count"=data$Allele_Difference), by=list("type"=data$Transfer_Type, "diff"=data$Allele_Difference), length) bias_diff <- merge(bias_diff[bias_diff$type == "S",], bias_diff[bias_diff$type == "L",], by="diff", all=T) bias_diff$type.x <- "S" bias_diff$type.y <- "L" bias_diff[is.na(bias_diff)] <- 0 bias_diff$bias <- bias_diff$count.x/(bias_diff$count.x+bias_diff$count.y)*100 bias_diff$bias_t <- bias_diff$bias-50 bias_diff$ratio <- bias_diff$count.x/bias_diff$count.y assign(paste0("d2_", x), ggplot(bias_diff, aes(x=diff, y=bias, label=bias)) + geom_point(stat = "identity", shape=23, fill="black", color="red") + geom_text(label="Mendelian\nExpectation", x=15, y=45 , color = "Red") + #geom_line(stat = "identity") + scale_x_continuous(name ="\n Allele Size Difference", breaks=seq(0,100, 10), limits = c(0, 100)) + scale_y_continuous(name ="Proportion Smaller Allele Selected (%)\n", breaks = seq(40, 100, 10), limits = c(40, 100)) + theme(axis.line = element_line(size = 0.8, colour = "black", linetype=1), legend.position="top", legend.title=element_blank(), axis.text.x = element_text(size = 13, face = "bold", colour = "black"), axis.text.y = element_text(size = 14, face = "bold", colour = "black"), axis.title.x = element_blank(), axis.title.y = element_blank(), panel.background = element_blank(), legend.text = element_text(colour="black", size = 14, face = "bold")) + geom_hline(yintercept=50, linetype="dashed", color = "red") ) } grid.arrange(d2_all, d2_ngc, d2_trios, d2_quads) } build_d2(all_data, ngc_data, trios_data, quads_data) build_d3 = function(all, ngc, trios, quads){ for (x in c(myfunc(all), myfunc(ngc), myfunc(trios), myfunc(quads))){ data <- get(x) bias_diff <- aggregate(list("count"=data$Allele_Difference), by=list("type"=data$Transfer_Type, "diff"=data$Allele_Difference), length) bias_diff <- merge(bias_diff[bias_diff$type == "S",], bias_diff[bias_diff$type == "L",], by="diff", all=T) bias_diff$type.x <- "S" bias_diff$type.y <- "L" bias_diff[is.na(bias_diff)] <- 0 bias_diff$bias <- bias_diff$count.x/(bias_diff$count.x+bias_diff$count.y)*100 bias_diff$bias_t <- bias_diff$bias-50 bias_diff$ratio <- bias_diff$count.x/bias_diff$count.y bias_diff <- bias_diff[bias_diff$ratio != Inf,] assign(paste0("d3_", x),ggplot(bias_diff, aes(x=diff, y=ratio, label=ratio)) + geom_bar(stat = "identity", fill="Dark blue", width=1) + geom_text(label="Mendelian\nExpectation", x=95, y=2.5 ,label.size = 0.5, color = "Red") + scale_x_continuous(name ="\n Allele Size Difference", breaks=seq(0,100, 10), limits = c(0, 100)) + scale_y_continuous(name ="Selection Ratio (Smaller Allele : Larger Allele) \n", breaks = seq(0, 18, 2), limits = c(0, 18)) + theme(axis.line = element_line(size = 0.8, colour = "black", linetype=1), legend.position="top", legend.title=element_blank(), axis.text.x = element_text(size = 13, face = "bold", colour = "black"), axis.text.y = element_text(size = 14, face = "plain", colour = "black"), axis.title.x = element_blank(), axis.title.y = element_blank(), panel.background = element_blank()) + geom_hline(yintercept = 1, colour="red", linetype = "longdash") ) } grid.arrange(d3_all, d3_ngc, d3_trios, d3_quads) } build_d3(all_data, ngc_data, trios_data, quads_data) bias_diff <- aggregate(list("count"=new_data$Allele_Difference), by=list("type"=new_data$Transfer_Type, "diff"=new_data$Allele_Difference, "Parent"=new_data$parent), length) bias_diff <- merge(bias_diff[bias_diff$type == "S",], bias_diff[bias_diff$type == "L",], by=c("diff", "Parent"), all=T) bias_diff <- bias_diff[c(1,2,4,6)] colnames(bias_diff)[3] <- "smaller" colnames(bias_diff)[4] <- "larger" bias_diff$selection <- bias_diff$smaller/(bias_diff$smaller+bias_diff$larger)*100 d2 <- ggplot(bias_diff, aes(x=diff, y=selection, fill=Parent)) + geom_point(stat = "identity", shape =2) + geom_line(stat = "identity") d2 <- d2 + scale_x_continuous(name ="\n Allele Size Difference")#, breaks=seq(0,60, 10), limits = c(0, 60)) d2 <- d2 + scale_y_continuous(name ="Proportion Smaller Allele Selected (%)\n", breaks = seq(50, 100, 10), limits = c(50, 100)) d2 <- d2 + theme(legend.position="top", legend.title=element_blank(), axis.text.x = element_text(size = 13, face = "bold", colour = "midnightblue"), axis.text.y = element_text(size = 14, face = "plain", colour = "midnightblue"), axis.title.x = element_text(size = 20, face = "bold", colour = "midnightblue"), axis.title.y = element_text(size = 20, face = "bold", colour = "midnightblue"), panel.background = element_blank(), legend.text = element_text(colour="midnightblue", size = 14, face = "bold")) d2 ############################## FMR1 Analysis (R) ############################## FMR1 <- all_trios[all_trios$Call_ID == unique(loci$Call_ID[loci$Gene=="FMR1"]),] fmr1_comparison = function(fmr1){ fmr1$proband_sex <- ifelse(fmr1$Allele2_Units_1 == 0, "MALE", "FEMALE") fmr1$F_check <- apply(fmr1[c("Allele1_Units_1", "Allele2_Units_1", "Allele1_Units_2", "Allele2_Units_2", "Allele1_Units_3", "Allele2_Units_3", "proband_sex")], 1, function(x){ if (x["proband_sex"] == "MALE"){NA} else if (x["proband_sex"] == "FEMALE"){ if (as.numeric(x["Allele1_Units_3"]) %in% as.numeric(x[c("Allele1_Units_1", "Allele2_Units_1")])) "Present" else "Deviation" } }) fmr1$M_check <- apply(fmr1[c("Allele1_Units_1", "Allele2_Units_1", "Allele1_Units_2", "Allele2_Units_2", "Allele1_Units_3", "Allele2_Units_3", "proband_sex")], 1, function(x){ if (x["proband_sex"] == "MALE"){ if (unique(max(as.numeric(x[c("Allele1_Units_2", "Allele2_Units_2")]))) == as.numeric(x["Allele1_Units_1"])) "L Transfer" else if (unique(min(as.numeric(x[c("Allele1_Units_2", "Allele2_Units_2")]))) == as.numeric(x["Allele1_Units_1"])) "S Transfer" else "Deviation" } else if (x["proband_sex"] == "FEMALE"){ vals <- as.numeric(x[c("Allele1_Units_2", "Allele2_Units_2" )]) if (length(vals[vals != as.numeric(x["Allele1_Units_3"])]) == 0) vals <- unique(vals) else vals <- vals[vals != as.numeric(x["Allele1_Units_3"])] if (unique(max(vals)) %in% as.numeric(x[c("Allele1_Units_1", "Allele2_Units_1")])) "L Transfer" else if (unique(min(vals)) %in% as.numeric(x[c("Allele1_Units_1", "Allele2_Units_1")])) "S Transfer" else "Deviation" } }) fmr1$M_diff <- abs(fmr1$Allele1_Units_2 - fmr1$Allele2_Units_2) fmr1$M_check <- apply(fmr1, 1, function(x) if (as.numeric(x["M_diff"]) == 0 ) "Homozygous" else x["M_check"]) return(fmr1) } FMR1_comp <- fmr1_comparison(FMR1) FMR1_agg <- aggregate(list("count"=FMR1_comp$M_check), by=list("transfer_type"=FMR1_comp$M_check), length) FMR1_diff_agg <- aggregate(list("count"=FMR1_comp$M_diff), by=list("Size_Diff"=FMR1_comp$M_diff, "transfer_type"=FMR1_comp$M_check), length) FMR1_diff_agg <- FMR1_diff_agg[!(FMR1_diff_agg$transfer_type %in% c("Homozygous", "Deviation")),] FMR1_L <- FMR1_diff_agg[FMR1_diff_agg$transfer_type == "L Transfer",] names(FMR1_L) <- c("Size_Diff", "transfer_type", "Large_Count") FMR1_S <- FMR1_diff_agg[FMR1_diff_agg$transfer_type == "S Transfer",] names(FMR1_S) <- c("Size_Diff", "transfer_type", "Small_Count") FMR1_T <- merge(FMR1_L, FMR1_S, by="Size_Diff", all=T) FMR1_T[is.na(FMR1_T)] <- 0 FMR1_T$prop <- FMR1_T$Small_Count/(FMR1_T$Small_Count+FMR1_T$Large_Count)*100 FMR1_comp_fe <- FMR1_comp[FMR1_comp$proband_sex == "FEMALE",] FMR1_comp_ma <- FMR1_comp[FMR1_comp$proband_sex == "MALE",] FMR1_comp_fe <- FMR1_comp[FMR1_comp$proband_sex == "FEMALE" & FMR1_comp$M_check %in% c("L Transfer", "S Transfer"),] FMR1_comp_ma <- FMR1_comp[FMR1_comp$proband_sex == "MALE" & FMR1_comp$M_check %in% c("L Transfer", "S Transfer"),] length(FMR1_comp_fe$Allele1_Units_1[FMR1_comp_fe$Allele1_Units_1 > 50]) + length(FMR1_comp_fe$Allele2_Units_1[FMR1_comp_fe$Allele2_Units_1 > 50]) length(FMR1_comp_fe$Allele1_Units_2[FMR1_comp_fe$Allele1_Units_2 > 50]) + length(FMR1_comp_fe$Allele2_Units_2[FMR1_comp_fe$Allele2_Units_2 > 50]) + length(FMR1_comp_fe$Allele1_Units_3[FMR1_comp_fe$Allele1_Units_3 > 50]) length(FMR1_comp_ma$Allele1_Units_1[FMR1_comp_ma$Allele1_Units_1 > 50]) length(FMR1_comp_ma$Allele1_Units_2[FMR1_comp_ma$Allele1_Units_2 > 50]) + length(FMR1_comp_ma$Allele2_Units_2[FMR1_comp_fe$Allele2_Units_2 > 50]) gg <- ggplot(data=FMR1_agg, aes(x=transfer_type, y=count)) + geom_bar(stat="identity", width=0.5) gg gz <- ggplot(FMR1_T, aes(x=Size_Diff, y=prop)) + geom_bar(stat = "identity", fill="Dark blue", width=1) + geom_text(label="Mendelian\nExpectation", x=95, y=2.5 , color = "Red") + scale_x_continuous(name ="\n Allele Size Difference") + scale_y_continuous(name ="Smaller Allele Selection (%) \n") + theme(axis.line = element_line(size = 0.8, colour = "black", linetype=1), legend.position="top", legend.title=element_blank(), axis.text.x = element_text(size = 13, face = "bold", colour = "black"), axis.text.y = element_text(size = 14, face = "plain", colour = "black"), panel.background = element_blank()) + geom_hline(yintercept = 50, colour="red", linetype = "longdash") gz ############################## Post-Hoc Testing (R) ############################## # Check Distribution of Gene Expression Data library(stringr) library(ggplot2) library(gridExtra) library(reshape2) library(dplyr) library(plyr) library(ggpubr) library(car) library(fitdistrplus) library(data.table) library(FSA) large ggdensity(large$rate2, xlab = "Description") ggqqplot(large$rate2) qqPlot(large$rate2) shapiro.test(large$rate2) descdist(large$rate2, discrete = FALSE) fit.weibull <- fitdist(large$rate2[is.na(large$rate2)==F], "weibull") fit.norm <- fitdist(large$rate2, "norm") plot(fit.norm) ######## RATE kruskal.test(rate2 ~ Region, data = large) # Kruskal-Wallis chi-squared = 321.36, df = 53, p-value < 2.2e-16 # There is a significant difference between the tissue types and their exprssion levels # Post Hoc Testing # Bonferroni Correction model <- aov(rate2 ~ Region, data = large) summary(model) View(data.frame(pairwise.t.test(large$rate2, large$Region, p.adjust.method="bonferroni")$p.value)) # Pairwise Wilcoxon tests with multiple testing correction, p.adjust.method is Benjamini-Hochberg wil <- pairwise.wilcox.test(large$rate2, large$Region, p.adjust.method = 'BH') wil_pvalue <- data.frame(wil$p.value) wil_pvalue summary(wil) wil #Dunn Test View(data.frame(dunnTest(rate2 ~ Region,data=large,method="bh")$res)) ######## SIZE shapiro.test(large$AVE) kruskal.test(AVE ~ Region, data = large) qqPlot(large$AVE) fit.weibull <- fitdist(large$AVE[is.na(large$AVE)==F], "weibull") fit.norm <- fitdist(large$AVE, "norm") plot(fit.norm) data.frame(dunnTest(AVE ~ Region,data=large,method="bh")$res)