####################
### 1. Gene analysis
####################
org <- 'Hs'

### Meta data from dbGap
# Sample sheet, keyed by its 'Run' column so that it can be indexed with the
# column names of the count tables.
meta <- as.data.frame(fread('../metadata/brainAtlas_metadata.csv'))
rownames(meta) <- meta[, 'Run']

# Load the raw gene counts of both batches (each file is expected to provide
# an object named `countTable`) and bind them column-wise.
load("../data/rawData_genes_b01.RData")
ct1 <- countTable
load("../data/rawData_genes_b02.RData")
ct2 <- countTable
# identical() also fails on a length mismatch, where `==` would silently
# recycle the shorter vector.
stopifnot(identical(rownames(ct1), rownames(ct2)))
countTable <- cbind(ct1, ct2)
Batch <- NA
Group <- as.factor(meta[colnames(countTable), 'body_site'])

# Remove the features annotated on chromosome MT
tmp <- annotateEnsPos(rownames(countTable), org=org)
sel <- !(tmp$chr %in% 'MT')
countTable <- countTable[sel, ]

# Remove low counts: keep rows whose total count exceeds the number of samples
sel.r <- rowSums(countTable) > ncol(countTable)
countTable <- countTable[sel.r, ]

# Remove samples with at most 1e7 counts (summed over the rows kept above);
# meta and Group are subset the same way to stay aligned with the columns.
sel.c <- colSums(countTable) > 1e7
countTable <- countTable[, sel.c]
meta <- meta[colnames(countTable), ]
Group <- Group[sel.c]

# Get norm.counts: voom output `E`, fitted with one column per body site
# (no intercept)
dge <- DGEList(counts=countTable)
dge <- calcNormFactors(dge)
design <- model.matrix(~ -1 + Group)

v <- voom(dge, design, plot=FALSE)
norm.counts <- v$E
anno <- annotateEns(rownames(norm.counts), org, level='gene')

# Save the per-sample library sizes stored by voom; the TE sections reuse
# them (as `lib.size`) together with their mean expressed in millions.
genes.libSize <- v$targets$lib.size
names(genes.libSize) <- rownames(v$targets)
genes.avg.norm <- mean(genes.libSize)/1e06
save(genes.libSize, genes.avg.norm, file='../data/genes.libSize.RData')

# Export norm.counts, back-transformed from log2 and scaled by the mean
# library size (in millions)
xout <- data.frame(anno, 2^(norm.counts) * genes.avg.norm)
write.table(xout, file='../results/tables/norm.counts.xls', row.names=FALSE,
            col.names=TRUE, quote=FALSE, sep='\t')

save(countTable, meta, norm.counts, v, Group, Batch, file='../data/countTable.RData', compress='gzip')

###################################################
### 2. Construct mean tables
################################################

## Per struct
##########################
# For every body site, fit a no-intercept model with one column per stage on
# the voom object and export the fitted coefficients (xp_*) and their 2^x
# back-transform (cpm_*), each prefixed with the gene annotation.
load('../data/countTable.RData')
load('../data/genes.libSize.RData')
Group.site <- as.factor(meta[colnames(countTable), 'body_site'])
Group.stage <- as.factor(meta[colnames(countTable), 'stage'])
anno <- annotateEns(rownames(norm.counts), org, level='gene')
mydircreate('../results/tables/per_struct')
# Fixed column order of the exported tables; every listed stage must be a
# level of Group.stage, otherwise the column selection below fails.
stage.cols <- c("mean.2A", "mean.2B", "mean.3A", "mean.3B", "mean.4", "mean.5",
                "mean.6", "mean.7", "mean.8", "mean.9", "mean.10", "mean.11")
for (l in levels(Group.site)) {
  sel <- Group.site == l
  # The design column names are built from the literal text
  # 'Group.stage[sel]', which the gsub() calls below rely on: do not rename
  # these two variables.
  design <- model.matrix(~ -1 + Group.stage[sel])
  fit1 <- lmFit(v[, sel], design)
  m <- coef(fit1)
  colnames(m) <- gsub('\\[sel\\]', '', gsub('Group.stage', 'mean.', colnames(m)))
  m <- m[, stage.cols]
  # Back-transform from log2 and zero out values below 0.4
  cpm <- 2^m
  cpm[cpm < 0.4] <- 0
  write.table(data.frame(anno, m), file=gsub(' ', '_', paste0('../results/tables/per_struct/xp_', l, '_all.xls')),
              row.names=FALSE, col.names=TRUE, quote=FALSE, sep='\t')
  write.table(data.frame(anno, cpm), file=gsub(' ', '_', paste0('../results/tables/per_struct/cpm_', l, '_all.xls')),
              row.names=FALSE, col.names=TRUE, quote=FALSE, sep='\t')
}

## Per stage
##########################
# For every stage, fit a no-intercept model with one column per body site on
# the voom object and export the fitted coefficients (xp_stage_*) and their
# 2^x back-transform (cpm_*), each prefixed with the gene annotation.
load('../data/countTable.RData')
mydircreate('../results/tables/per_stage')
Group.site <- as.factor(meta[colnames(countTable), 'body_site'])
Group.stage <- as.factor(meta[colnames(countTable), 'stage'])
anno <- annotateEns(rownames(norm.counts), org, level='gene')
for (l in levels(Group.stage)) {
  sel <- Group.stage == l
  # The design column names are built from the literal text
  # 'Group.site[sel]', which the gsub() call below relies on: do not rename
  # these two variables.
  design <- model.matrix(~ -1 + Group.site[sel])
  fit1 <- lmFit(v[, sel], design)
  m <- coef(fit1)
  colnames(m) <- gsub('\\[sel\\]', '', gsub('Group.site', 'mean.', colnames(m)))
  # Back-transform from log2 and zero out values below 0.4
  cpm <- 2^m
  cpm[cpm < 0.4] <- 0
  write.table(data.frame(anno, m), file=gsub(' ', '_', paste0('../results/tables/per_stage/xp_stage_', l, '_all.xls')),
              row.names=FALSE, col.names=TRUE, quote=FALSE, sep='\t')
  write.table(data.frame(anno, cpm), file=gsub(' ', '_', paste0('../results/tables/per_stage/cpm_', l, '_all.xls')),
              row.names=FALSE, col.names=TRUE, quote=FALSE, sep='\t')
}



####################
### 3. TE analysis
####################
# Load the raw TE counts of both batches (each file is expected to provide an
# object named `x`) and bind them column-wise.
load("../data/rawData_TEs_b01.RData")
x1 <- x
load("../data/rawData_TEs_b02.RData")
x2 <- x
# identical() also fails on a length mismatch, where `==` would recycle.
stopifnot(identical(rownames(x1), rownames(x2)))
x <- cbind(x1, x2)
rm(x1, x2)
gc()

# Get the list of samples to keep: the columns of the filtered gene table.
# This load() also brings in the matching (already filtered) `meta`.
load("../data/countTable.RData")
keep <- colnames(countTable)
maptable <- as.data.frame(fread('gzip -dc hg19_TE_repmask_LTRm_s_20140131.maptable.gz')) #
stopifnot(all(rownames(x) ==  maptable$V1))
ereinfo <- maptable[, -1]
colnames(ereinfo) <- c("chr", "start", "end", "repName", "strand", "class.fam")
ereinfo$unMerged <- unMergeERE(ereinfo$repName)
countTable <- as.matrix(x)
rm(x)
gc()

Batch <- NA

# Remove low counts (computed over all samples, before the sample filter)
sel <- rowSums(countTable) > ncol(countTable)
countTable <- countTable[sel, ]
ereinfo <- ereinfo[sel, ]

# Remove samples with low coverage. Indexing by `keep` puts the columns in
# the order of the gene table, so Group is derived from the re-ordered
# columns; subsetting a Group built in the original column order would
# mislabel samples whenever the two orders differ.
sel.c <- colnames(countTable) %in% keep
countTable <- countTable[, keep]
meta <- meta[keep, ]
Group <- as.factor(meta[colnames(countTable), 'body_site'])

# Remove the elements overlapping exons (dist2exon == 0)
tmp <- addDist2exon(ereinfo, org=org, exonbed="1811_hg19_ens_coding_genes_exons_symbol.bed")
sel.r <- tmp$dist2exon!=0
countTable <- countTable[sel.r, ]
ereinfo <- ereinfo[sel.r, ]

# Get norm.counts
dge <- DGEList(counts=countTable)
dge <- calcNormFactors(dge)
design <- model.matrix(~ -1 + Group)

# voom is run with the library sizes saved by the gene analysis
load('../data/genes.libSize.RData')
stopifnot(identical(names(genes.libSize), colnames(countTable)))
# pdf() does not create missing directories
dir.create('../results_ere_stringent/figs', recursive=TRUE, showWarnings=FALSE)
pdf('../results_ere_stringent/figs/mean_variance_trend_ere.pdf')
v <- voom(dge, design, plot=TRUE, lib.size=genes.libSize)
dev.off()
norm.counts <- v$E
gc()

# Select TEs of the wanted classes whose back-transformed, scaled value
# exceeds 50 in at least one sample
sel.ere <- grepl("LTR|SINE|SVA|LINE", ereinfo$class.fam)
sel.dnat <- grepl("DNA|^RC", ereinfo$class.fam)
norm.counts.l <- 2^(norm.counts) * genes.avg.norm
# Vectorised equivalent of a row-wise any(x > 50)
sel.50 <- rowSums(norm.counts.l > 50) > 0
sel <- (sel.ere | sel.dnat) & sel.50

ereinfo <- ereinfo[sel, ]
countTable <- countTable[sel, ]
norm.counts <- norm.counts[sel, ]
v <- v[sel, ]

# Save data
save(ereinfo, countTable, norm.counts, v, Group, Batch,
     file='../data/countTable_ere_str.RData', compress='gzip')


###################################################
### 4. Construct mean tables for TE
###################################################
## Per struct
##########################
# For every body site, fit a no-intercept model with one column per stage on
# the TE voom object and export the fitted coefficients (xp_*) and their 2^x
# back-transform (cpm_*), each prefixed with the TE annotation.
load('../data/countTable_ere_str.RData')
load('../data/genes.libSize.RData')
meta <- as.data.frame(fread('../metadata/brainAtlas_metadata.csv'))
rownames(meta) <- meta[, 'Run']

Group.site <- as.factor(meta[colnames(countTable), 'body_site'])
Group.stage <- as.factor(meta[colnames(countTable), 'stage'])
# write.table() does not create missing directories
dir.create('../results_ere_stringent/tables/per_struct', recursive=TRUE, showWarnings=FALSE)
# Fixed column order of the exported tables; every listed stage must be a
# level of Group.stage, otherwise the column selection below fails.
stage.cols <- c("mean.2A", "mean.2B", "mean.3A", "mean.3B", "mean.4", "mean.5",
                "mean.6", "mean.7", "mean.8", "mean.9", "mean.10", "mean.11")
for (l in levels(Group.site)) {
  sel <- Group.site == l
  # The design column names are built from the literal text
  # 'Group.stage[sel]', which the gsub() call below relies on: do not rename
  # these two variables.
  design <- model.matrix(~ -1 + Group.stage[sel])
  fit1 <- lmFit(v[, sel], design)
  m <- coef(fit1)
  colnames(m) <- gsub('\\[sel\\]', '', gsub('Group.stage', 'mean.', colnames(m)))
  m <- m[, stage.cols]
  # Back-transform from log2 and zero out values below 0.4
  cpm <- 2^m
  cpm[cpm < 0.4] <- 0
  write.table(data.frame(ereinfo, m), file=gsub(' ', '_', paste0('../results_ere_stringent/tables/per_struct/xp_', l, '_all.xls')),
              row.names=FALSE, col.names=TRUE, quote=FALSE, sep='\t')
  write.table(data.frame(ereinfo, cpm), file=gsub(' ', '_', paste0('../results_ere_stringent/tables/per_struct/cpm_', l, '_all.xls')),
              row.names=FALSE, col.names=TRUE, quote=FALSE, sep='\t')
}

## Per stage
##########################
# For every stage, fit a no-intercept model with one column per body site on
# the TE voom object and export the fitted coefficients (xp_stage_*) and
# their 2^x back-transform (cpm_stage_*), each prefixed with the TE
# annotation.
load('../data/countTable_ere_str.RData')
meta <- as.data.frame(fread('../metadata/brainAtlas_metadata.csv'))
rownames(meta) <- meta[, 'Run']
Group.site <- as.factor(meta[colnames(countTable), 'body_site'])
Group.stage <- as.factor(meta[colnames(countTable), 'stage'])
# write.table() does not create missing directories
dir.create('../results_ere_stringent/tables/per_stage', recursive=TRUE, showWarnings=FALSE)
for (l in levels(Group.stage)) {
  sel <- Group.stage == l
  # The design column names are built from the literal text
  # 'Group.site[sel]', which the gsub() call below relies on: do not rename
  # these two variables.
  design <- model.matrix(~ -1 + Group.site[sel])
  fit1 <- lmFit(v[, sel], design)
  m <- coef(fit1)
  colnames(m) <- gsub('\\[sel\\]', '', gsub('Group.site', 'mean.', colnames(m)))
  # Back-transform from log2 and zero out values below 0.4
  cpm <- 2^m
  cpm[cpm < 0.4] <- 0
  write.table(data.frame(ereinfo, m), file=gsub(' ', '_', paste0('../results_ere_stringent/tables/per_stage/xp_stage_', l, '_all.xls')),
              row.names=FALSE, col.names=TRUE, quote=FALSE, sep='\t')
  write.table(data.frame(ereinfo, cpm), file=gsub(' ', '_', paste0('../results_ere_stringent/tables/per_stage/cpm_stage_', l, '_all.xls')),
              row.names=FALSE, col.names=TRUE, quote=FALSE, sep='\t')
}


########################
### 5. Multimap TE analysis
########################

# Load the TE maptable and derive the per-locus annotation. It is the same
# for both batches, so it is built once.
maptable <- as.data.frame(fread('gzip -dc hg19_TE_repmask_LTRm_s_20140131.maptable.gz'))
ereinfo <- maptable[, -1]
colnames(ereinfo) <- c("chr", "start", "end", "repName", "strand", "class.fam")
ereinfo$unMerged <- unMergeERE(ereinfo$repName)

# Sum the multimap counts of one batch over the `unMerged` label of each row.
#   rdata.file : .RData file expected to provide the count object `x`
#   loci       : expected row names of `x` (checked, execution stops otherwise)
#   fam        : grouping label of each row of `x`
# Returns a data.frame with one row per distinct label (as row names) and one
# column per sample.
sumByUnMerged <- function(rdata.file, loci, fam) {
  batch <- new.env()
  load(rdata.file, envir=batch)
  stopifnot(all(rownames(batch$x) == loci))
  # The data.frame must be called `countTable`: the SQL statement below
  # refers to it by that name and sqldf() looks it up in the calling frame.
  countTable <- as.matrix(batch$x)
  rm(batch)
  gc()
  countTable <- data.frame(unMerged=fam, countTable)
  snames <- colnames(countTable)[-1]
  # Add up fam: one sum(<sample>) term per sample column
  cmd <- paste0('select unMerged, sum(', paste(snames, collapse='), sum('), ') from countTable group by unMerged')
  tmp2 <- sqldf(cmd)
  res <- tmp2[, -1]
  rownames(res) <- tmp2[, 1]
  colnames(res) <- snames
  res
}

# batch 1
countTable <- sumByUnMerged("../data/rawData_multi_TEs_b01.RData", maptable$V1, ereinfo$unMerged)
save(countTable, file="../data/rawData_multi_TEs_CT_1.RData", compress='gzip')

# batch 2
countTable <- sumByUnMerged("../data/rawData_multi_TEs_b02.RData", maptable$V1, ereinfo$unMerged)
save(countTable, file="../data/rawData_multi_TEs_CT_2.RData", compress='gzip')


###################################################
### 01b load the data
###################################################
# get list of samples to keep
# Samples to keep: the columns of the filtered gene table
load("../data/countTable.RData")
keep <- colnames(countTable)
# Per-family count tables of both batches (each file provides `countTable`)
load('../data/rawData_multi_TEs_CT_1.RData')
ct1 <- countTable
load('../data/rawData_multi_TEs_CT_2.RData')
ct2 <- countTable
# cbind() pairs rows by position, so both batches must list the families in
# the same order.
stopifnot(identical(rownames(ct1), rownames(ct2)))
countTable <- cbind(ct1, ct2)
countTable <- countTable[, keep]
# All TRUE at this point, because the columns were just restricted to `keep`
sel.c <- colnames(countTable) %in% keep

dge <- DGEList(counts=countTable)
dge <- calcNormFactors(dge)
# Intercept-only design
design <- as.matrix(rep(1, ncol(dge)))
# voom is run with the library sizes saved by the gene analysis
load('../data/genes.libSize.RData')
stopifnot(identical(names(genes.libSize), colnames(countTable)))
v <- voom(dge, design, plot=FALSE, lib.size=genes.libSize)
norm.counts <- v$E
# Back-transform from log2; entries with a raw count of zero are set to 0
cpm.counts <- 2^(norm.counts)
cpm.counts[countTable == 0] <- 0

## Sample annotation, aligned with the columns of countTable
meta <- as.data.frame(fread('../metadata/brainAtlas_metadata.csv'))
rownames(meta) <- meta[, 'Run']
Batch <- NA
Group <- as.factor(meta[colnames(countTable), 'body_site'])

meta <- meta[keep, ]
save(Batch, Group, v, meta, countTable, norm.counts, cpm.counts, file='../data/countTables_TEfam.RData')

### Save means per stage and structure 
#\begin{rbatch}
# For every body site, fit a no-intercept model with one column per stage on
# the TE-family voom object and export the scaled coefficients (xp_*) and the
# 2^x back-transform of the unscaled coefficients (cpm_*).
load('../data/countTables_TEfam.RData')
load('../data/genes.libSize.RData')
Group.site <- as.factor(meta[colnames(countTable), 'body_site'])
Group.stage <- as.factor(meta[colnames(countTable), 'stage'])
# write.table() does not create missing directories
dir.create('../results_fam/tables/per_struct', recursive=TRUE, showWarnings=FALSE)
# Fixed column order of the exported tables; every listed stage must be a
# level of Group.stage, otherwise the column selection below fails.
stage.cols <- c("mean.2A", "mean.2B", "mean.3A", "mean.3B", "mean.4", "mean.5",
                "mean.6", "mean.7", "mean.8", "mean.9", "mean.10", "mean.11")

for (l in levels(Group.site)) {
  sel <- Group.site == l
  # The design column names are built from the literal text
  # 'Group.stage[sel]', which the gsub() call below relies on: do not rename
  # these two variables.
  design <- model.matrix(~ -1 + Group.stage[sel])
  fit1 <- lmFit(v[, sel], design)
  b <- coef(fit1)
  colnames(b) <- gsub('\\[sel\\]', '', gsub('Group.stage', 'mean.', colnames(b)))
  b <- b[, stage.cols]
  # NOTE(review): this multiplies the fitted coefficients themselves (not
  # their 2^x back-transform) by genes.avg.norm, unlike the gene export which
  # scales 2^(norm.counts). Kept as is to leave the output unchanged --
  # confirm that this is intended.
  m <- b * genes.avg.norm
  # Back-transform from log2 and zero out values below 0.4
  cpm <- 2^b
  cpm[cpm < 0.4] <- 0
  write.table(m, file=gsub(' ', '_', paste0('../results_fam/tables/per_struct/xp_', l, '.xls')),
              row.names=TRUE, col.names=TRUE, quote=FALSE, sep='\t')
  write.table(cpm, file=gsub(' ', '_', paste0('../results_fam/tables/per_struct/cpm_', l, '.xls')),
              row.names=TRUE, col.names=TRUE, quote=FALSE, sep='\t')
}

## Per stage
##########################
# For every stage, fit a no-intercept model with one column per body site on
# the TE-family voom object and export the scaled coefficients (xp_stage_*)
# and the 2^x back-transform of the unscaled coefficients (cpm_*).
load('../data/countTables_TEfam.RData')
# Provides genes.avg.norm, used in the loop below; loaded here so that this
# section does not depend on an earlier one having been run.
load('../data/genes.libSize.RData')
Group.site <- as.factor(meta[colnames(countTable), 'body_site'])
Group.stage <- as.factor(meta[colnames(countTable), 'stage'])
# write.table() does not create missing directories
dir.create('../results_fam/tables/per_stage', recursive=TRUE, showWarnings=FALSE)
for (l in levels(Group.stage)) {
  sel <- Group.stage == l
  # The design column names are built from the literal text
  # 'Group.site[sel]', which the gsub() call below relies on: do not rename
  # these two variables.
  design <- model.matrix(~ -1 + Group.site[sel])
  fit1 <- lmFit(v[, sel], design)
  b <- coef(fit1)
  colnames(b) <- gsub('\\[sel\\]', '', gsub('Group.site', 'mean.', colnames(b)))
  # NOTE(review): this multiplies the fitted coefficients themselves (not
  # their 2^x back-transform) by genes.avg.norm, unlike the gene export which
  # scales 2^(norm.counts). Kept as is to leave the output unchanged --
  # confirm that this is intended.
  m <- b * genes.avg.norm
  # Back-transform from log2 and zero out values below 0.4
  cpm <- 2^b
  cpm[cpm < 0.4] <- 0
  write.table(m, file=gsub(' ', '_', paste0('../results_fam/tables/per_stage/xp_stage_', l, '.xls')),
              row.names=TRUE, col.names=TRUE, quote=FALSE, sep='\t')
  write.table(cpm, file=gsub(' ', '_', paste0('../results_fam/tables/per_stage/cpm_', l, '.xls')),
              row.names=TRUE, col.names=TRUE, quote=FALSE, sep='\t')
}
