# Run from the GTEx download directory so the relative file names used for
# reading and writing below resolve there.
setwd("/Volumes/MyBook_3/BD_aging_project/Public_datasets/GTex/")
# Keep text columns as character vectors (not factors) when tables are read.
options(stringsAsFactors = FALSE)

# 2016-11-30
# get GTex data to compare mouse and human aging patterns

# 2016-12-01
# need to include genotyping PCA as covariates

# 2016-12-15
# get only max transcript count per gene (to mirror mouse data)


# 2016-12-16
# filter only genes with protein coding annotation + known mouse ortholog

# Gene-level read counts (tab-separated GCT file). The first two lines of the
# file are skipped and no line is treated as a header, so row 1 of the
# resulting table holds the column labels as data.
my.gene.counts <- read.delim("GTEx_Analysis_v6p_RNA-seq_RNA-SeQCv1.1.8_gene_reads.gct",
                             header = FALSE, skip = 2)

# Per-subject phenotype table (tab-separated, with a header line).
my.meta.data.patient <- read.delim("GTEx_Data_V6_Annotations_SubjectPhenotypesDS.txt",
                                   header = TRUE)


##################################################################################################################
# Per-sample attribute table (tab-separated, with a header line).
my.meta.data.sample <- read.delim("GTEx_Data_V6_Annotations_SampleAttributesDS.txt",
                                  header = TRUE)

# GTEx attribute code -> readable column name, listed in output column order.
my.sample.columns <- c(SAMPID   = "Sample_ID",
                       SMTSD    = "Tissue_Type",
                       SMATSSCR = "AutolysisScore",
                       SMNABTCH = "RNA_batch",
                       SMGEBTCH = "Exp_Batch",
                       SMRIN    = "RIN",
                       SMTSISCH = "Ischemic_time",
                       SMTSPAX  = "Fixation_time",
                       SMAFRZE  = "GTex_freeze",
                       SMMNCPB  = "Cov_per_base",
                       SMMNCV   = "MeanCV",
                       SMNTRNRT = "Intronic_rate",
                       SMEXPEFF = "Efficiency",
                       SMMPPDUN = "Unique",
                       SMRRNART = "rRNA_rate",
                       SMDPMPRT = "duplication_rate")

# Keep only the attributes of interest, then give them the readable names.
my.meta.data.sample.v2 <- my.meta.data.sample[, names(my.sample.columns)]
colnames(my.meta.data.sample.v2) <- unname(my.sample.columns)

# SAMPID  Sample ID, GTEx Public Sample ID
# SMATSSCR  Autolysis Score
# SMNABTCH	Nucleic Acid Isolation Batch ID
# SMGEBTCH	Genotype or Expression Batch ID
# SMRIN	RIN Number
# SMTSD	Tissue Type, more specific detail of tissue type
# SMTSISCH	Total Ischemic time for a sample in 4 hour intervals
# SMTSPAX	Time a sample spent in the PAXgene fixative
# SMAFRZE	Samples included in the GTEx Analysis Freeze
# SMMNCPB	Mean Coverage Per Base: Coverage is averaged per base across each transcript, and averaged again across all transcripts.
# SMMNCV	mean coefficient of variation: standard deviation in base coverage divided by mean coverage
# SMNTRNRT	Intronic Rate: The fraction of reads that map within introns
# SMEXPEFF	Expression Profiling Efficiency: Ratio of exon reads to total reads
# SMMPPDUN	Mapped Unique: Number of reads that were aligned and did not have duplicate flags
# SMRRNART	rRNA Rate: Ratio of all reads aligned to rRNA regions to total reads
# SMDPMPRT	Duplication Rate of Mapped: Duplicate reads divided by total mapped reads

# List the tissue labels present in the sample table (interactive inspection).
unique(my.meta.data.sample.v2[["Tissue_Type"]])
# [1] "Whole Blood"                               "Adipose - Subcutaneous"                    "Muscle - Skeletal"                         "Artery - Tibial"                          
# [5] "Artery - Coronary"                         "Heart - Atrial Appendage"                  "Adipose - Visceral (Omentum)"              "Ovary"                                    
# [9] "Uterus"                                    "Vagina"                                    "Breast - Mammary Tissue"                   "Skin - Not Sun Exposed (Suprapubic)"      
# [13] "Minor Salivary Gland"                      "Brain - Cortex"                            "Adrenal Gland"                             "Thyroid"                                  
# [17] "Lung"                                      "Spleen"                                    "Pancreas"                                  "Esophagus - Muscularis"                   
# [21] "Esophagus - Mucosa"                        "Esophagus - Gastroesophageal Junction"     "Stomach"                                   "Colon - Sigmoid"                          
# [25] "Small Intestine - Terminal Ileum"          "Colon - Transverse"                        "Prostate"                                  "Testis"                                   
# [29] "Skin - Sun Exposed (Lower leg)"            "Nerve - Tibial"                            "Heart - Left Ventricle"                    "Pituitary"                                
# [33] "Brain - Cerebellum"                        "Cells - Transformed fibroblasts"           "Artery - Aorta"                            "Cells - EBV-transformed lymphocytes"      
# [37] "Liver"                                     "Kidney - Cortex"                           "Brain - Hippocampus"                       "Brain - Substantia nigra"                 
# [41] "Brain - Anterior cingulate cortex (BA24)"  "Brain - Frontal Cortex (BA9)"              "Brain - Cerebellar Hemisphere"             "Brain - Caudate (basal ganglia)"          
# [45] "Brain - Nucleus accumbens (basal ganglia)" "Brain - Putamen (basal ganglia)"           "Brain - Hypothalamus"                      "Brain - Spinal cord (cervical c-1)"       
# [49] "Brain - Amygdala"                          "Fallopian Tube"                            "Bladder"                                   "Cervix - Ectocervix"                      
# [53] "Cervix - Endocervix"                       "Cells - Leukemia cell line (CML)"          ""

# Tissues retained for the analysis.
my.tissues.of.interest <- c("Heart - Left Ventricle",
                            "Brain - Cerebellum",
                            "Liver")

# Logical mask over the sample table: TRUE where the tissue is one of the above.
my.samples.of.interest <- my.meta.data.sample.v2$Tissue_Type %in% my.tissues.of.interest

# Sample metadata restricted to the tissues of interest.
my.meta.data.sample.v3 <- my.meta.data.sample.v2[my.samples.of.interest, ]
##################################################################################################################

##################################################################################################################
my.meta.data.patient <- read.csv('GTEx_Data_V6_Annotations_SubjectPhenotypesDS.txt', header = TRUE, sep = "\t")

# Annotate every selected sample with the phenotype of its donor.
# SUBJID  Subject ID, GTEx Public Donor ID
# AGE	Age in 10 year categories
# GENDER	Gender (1=Male  2=Female)
# DTHHRDY	Hardy Scale (0=Ventilator Case  1=Violent and fast death	2=Fast death of natural causes	3=Intermediate death	4=Slow death)

# Donor ID of each sample = the first two dash-separated fields of the sample
# ID (e.g. "GTEX-XXXX-0226-SM-YYYY" -> "GTEX-XXXX").
# NOTE(review): relies on the GTEx sample ID layout -- confirm if other ID
# schemes are ever added to the sample table.
my.sample.donor <- sub("^([^-]+-[^-]+)-.*$", "\\1", my.meta.data.sample.v3$Sample_ID)

# Exact lookup of each sample's donor in the subject table (NA when absent).
# An exact match avoids treating the donor ID as an unanchored regular
# expression, which could also hit samples of a donor whose ID merely
# contains this one.
my.patient.row <- match(my.sample.donor, my.meta.data.patient$SUBJID)

my.meta.data.sample.v4 <- my.meta.data.sample.v3
my.meta.data.sample.v4$Age <- my.meta.data.patient$AGE[my.patient.row]
my.meta.data.sample.v4$Gender <- my.meta.data.patient$GENDER[my.patient.row]
my.meta.data.sample.v4$Hardy <- my.meta.data.patient$DTHHRDY[my.patient.row]
my.meta.data.sample.v4$Individual <- my.meta.data.patient$SUBJID[my.patient.row]

dim(my.meta.data.sample.v4)
# [1] 642  20

# Reorder the rows so that samples of the same tissue are contiguous.
my.sorted <- sort(my.meta.data.sample.v4$Tissue_Type, index.return = TRUE)
my.meta.data.sample.v5 <- my.meta.data.sample.v4[my.sorted$ix, ]

# Recode the numeric gender (1 = Male, 2 = Female) as a labelled factor;
# any other value is kept as its text form and missing values stay NA.
my.gender.code <- my.meta.data.sample.v5$Gender
my.gender.label <- as.character(my.gender.code)
my.gender.label[my.gender.code %in% 1] <- "Male"
my.gender.label[my.gender.code %in% 2] <- "Female"
my.meta.data.sample.v5$Gender <- factor(my.gender.label)
##################################################################################################################


##################################################################################################################
# get computed covariates - 2016-12-01

# Read one eQTL covariate table without a header and keep rows 1-4 with the
# first column dropped: as used further down, that is the row of IDs followed
# by the first three genotyping PCs, one column per individual.
my.read.genotype.pcs <- function(my.file) {
  my.table <- read.delim(my.file, header = FALSE)
  my.table[1:4, -1]
}

Cerebellum.v6p.covariates <- my.read.genotype.pcs('GTEx_Analysis_v6p_eQTL_covariates/Brain_Cerebellum_Analysis.v6p.covariates.txt')
Heart.v6p.covariates <- my.read.genotype.pcs('GTEx_Analysis_v6p_eQTL_covariates/Heart_Left_Ventricle_Analysis.v6p.covariates.txt')
Liver.v6p.covariates <- my.read.genotype.pcs('GTEx_Analysis_v6p_eQTL_covariates/Liver_Analysis.v6p.covariates.txt')

# Put the three tissues side by side, then transpose so that each row is one
# individual (ID, PC1, PC2, PC3).
my.pc.covariates <- t(cbind(Cerebellum.v6p.covariates,
                            Heart.v6p.covariates,
                            Liver.v6p.covariates))

# Individuals present in several tissues yield identical rows; keep one copy.
my.pc.covariates.unique <- data.frame(unique(my.pc.covariates))


# Add the first three genotyping PCs to the sample metadata, matched on the
# donor ID. In my.pc.covariates.unique, X1 is the donor ID and X2-X4 are the
# three PCs.
my.meta.data.sample.v6 <- my.meta.data.sample.v5

# Should a donor occur on several rows (differing PC values), keep the last
# occurrence so that later rows take precedence.
my.pc.table <- my.pc.covariates.unique[!duplicated(my.pc.covariates.unique$X1, fromLast = TRUE), ]

# Row of the PC table for each sample; NA when the donor has no covariates,
# which leaves that sample's PCs as NA.
my.pc.row <- match(my.meta.data.sample.v6$Individual, my.pc.table$X1)

# NOTE(review): the PC values are copied exactly as stored in the covariate
# table (presumably text, since they share a matrix with the IDs) -- apply
# as.numeric() if they are needed as numbers in this session.
my.meta.data.sample.v6$PC1 <- my.pc.table$X2[my.pc.row]
my.meta.data.sample.v6$PC2 <- my.pc.table$X3[my.pc.row]
my.meta.data.sample.v6$PC3 <- my.pc.table$X4[my.pc.row]
##################################################################################################################


##################################################################################################################
# Row 1 of the count table holds the column labels (read with header = FALSE).
my.count.header <- as.character(unlist(my.gene.counts[1, ], use.names = FALSE))

# Count columns whose label is one of the annotated samples.
my.samples.forstudy <- my.count.header %in% my.meta.data.sample.v6$Sample_ID

# Keep the first two columns plus the selected sample columns.
my.annotation.columns <- my.gene.counts[, 1:2]
my.study.columns <- my.gene.counts[, my.samples.forstudy]
my.gene.counts.select <- cbind(my.annotation.columns, my.study.columns)

write.table(x = my.gene.counts.select,
            file = "2016-12-01_GTex_Heart_Cerebellum_Liver.txt",
            quote = FALSE, sep = "\t", row.names = FALSE, col.names = FALSE)

# Metadata rows whose sample actually has a column in the count table.
# Rows are written in metadata order, not in count-column order.
my.data.forstudy <- my.meta.data.sample.v6$Sample_ID %in% my.count.header

write.table(x = my.meta.data.sample.v6[my.data.forstudy, ],
            file = "2016-12-1_GTex_Heart_Cerebellum_Liver_METADATA.txt",
            quote = FALSE, sep = "\t", row.names = FALSE, col.names = TRUE)
##################################################################################################################


##################################################################################################################
# 2016-12-15

# # read in matrix with Perl parse gene name
# my.gene.counts <- read.csv('2016-11-30_GTex_Heart_Cerebellum_Liver.GNAME.txt', header = F, sep="\t")
# 
# my.gene.counts.2 <- my.gene.counts[-1,-1]
# 
# my.gene.counts.3 <- aggregate(. ~ V2, data = my.gene.counts.2, max)


##################################################################################################################
# 2016-12-16

# read in matrix and Biomart data
# Re-read the count subset written earlier (no header: row 1 holds the labels).
my.gene.counts <- read.delim("2016-12-01_GTex_Heart_Cerebellum_Liver.txt", header = FALSE)

# Biomart / annotation exports (tab-separated, with header lines).
my.mouse.gnames <- read.delim("2016-12-16_en75_mouse_geneNames.txt", header = TRUE)
my.gencode.Names <- read.delim("2016-12-16_GTEx_EnsGeneID_list.txt", header = TRUE)
my.mouse.orth <- read.delim("2016-12-16_en75_human_mouse_orthologs.txt", header = TRUE)

# Join the GTEx gene ID list to the ortholog table (minus its columns 3 and 5),
# keyed on column 'X' of the former and 'Ensembl.Gene.ID' of the latter.
my.merge1 <- merge(x = my.gencode.Names,
                   y = my.mouse.orth[, -c(3, 5)],
                   by.x = "X",
                   by.y = "Ensembl.Gene.ID")

# Join the result to the mouse gene names on the mouse Ensembl gene ID.
my.merge2 <- merge(x = my.merge1,
                   y = my.mouse.gnames,
                   by.x = "Mouse.Ensembl.Gene.ID",
                   by.y = "Ensembl.Gene.ID")

# Rename the five resulting columns (positional).
colnames(my.merge2) <- c("Mouse.Ensembl.Gene.ID",
                         "Human.Ensembl.Gene.ID",
                         "Gencode19",
                         "Human_Symbol",
                         "Mouse_Symbol")

head(my.merge2)
# Mouse.Ensembl.Gene.ID Human.Ensembl.Gene.ID          Gencode19 Human_Symbol Mouse_Symbol
# 1    ENSMUSG00000000001       ENSG00000065135  ENSG00000065135.7        GNAI3        Gnai3
# 2    ENSMUSG00000000028       ENSG00000093009  ENSG00000093009.5        CDC45        Cdc45
# 3    ENSMUSG00000000037       ENSG00000102098 ENSG00000102098.13        SCML2        Scml2
# 4    ENSMUSG00000000049       ENSG00000091583  ENSG00000091583.6         APOH         Apoh
# 5    ENSMUSG00000000056       ENSG00000141562 ENSG00000141562.13         NARF         Narf
# 6    ENSMUSG00000000058       ENSG00000105971 ENSG00000105971.10         CAV2         Cav2

# Attach the counts (all rows but the label row) to the ortholog table,
# keyed on the versioned gene ID.
my.merge.3 <- merge(my.merge2, my.gene.counts[-1, ], by.x = "Gencode19", by.y = "V1")

# Sample labels sit in row 1 of the count table, from column 3 to the last
# column. The range follows the table width instead of a fixed column count,
# so the script keeps working when the number of selected samples changes.
stopifnot(ncol(my.gene.counts) >= 3)
my.sample.labels <- as.character(unlist(my.gene.counts[1, 3:ncol(my.gene.counts)],
                                        use.names = FALSE))

# Label row (five annotation column names + sample labels) on top of the
# merged table; column 6 (the first non-key column coming from the count
# table, V2) is dropped.
my.final <- rbind(c(colnames(my.merge.3)[1:5], my.sample.labels), my.merge.3[, -6])

# get all protein coding(even with duplicates)
write.table(my.merge2, file = "2016-12-16_Correspondence_GeneName_Human_Mouse_Orthologs.txt",
            quote = FALSE, sep = "\t", row.names = FALSE, col.names = TRUE)


# Keep the label row plus one count row per gene listed in the ortholog
# correspondence table (each gene reported once).
rownames(my.gene.counts) <- my.gene.counts$V1

my.ortholog.ids <- unique(my.merge2$Gencode19)

# Indexing by a row name that does not exist yields an all-NA row, which would
# be written to the output; drop such IDs and report how many were dropped.
my.has.counts <- my.ortholog.ids %in% rownames(my.gene.counts)
if (!all(my.has.counts)) {
  warning(sum(!my.has.counts),
          " gene ID(s) from the ortholog table have no row in the count table and were skipped",
          call. = FALSE)
}
my.ortholog.ids <- my.ortholog.ids[my.has.counts]

my.gene.counts.2 <- rbind(my.gene.counts[1, ],
                          my.gene.counts[my.ortholog.ids, ])

write.table(my.gene.counts.2, file = "2016-12-16_GTex_Heart_Cerebellum_Liver_FILTERED.txt",
            quote = FALSE, sep = "\t", row.names = FALSE, col.names = FALSE)




