# Interactive exploration of the contact-frequency cutoff applied to `pairs`;
# the table is printed after each change so the result can be inspected.
pairs <- pairs[cType == 'base' & frac >= 0.1]  # Consider positions that contact base in >= 10% of weighted structures
pairs
pairs <- pairs[cType == 'base' & frac >= 0.05]  # Cutoff lowered to 5%; applied to the table already filtered at >= 10% above
pairs
allBpos
# Read in the aligned contact files and subset to base-contacts only (no backbone)
# Consider all pairs that contact a base in >= 5% of weighted structures,
# restricted to the mStates present in allBpos
pairs <- fread('zf-C2H2_weightedSum_distCut3.6.txt')
pairs <- pairs[cType == 'base' & frac >= 0.05 & mState %in% allBpos$mState]
# Repeated re-runs of makeContactMap.R, each followed by printing the tables
# it leaves in the workspace (presumably after editing the script — TODO confirm)
source("~/research/jointInterfaceLearning/rCLAMPS/structuralAlignmentFiles/makeContactMap.R")
pairs
source("~/research/jointInterfaceLearning/rCLAMPS/structuralAlignmentFiles/makeContactMap.R")
pairs
source("~/research/jointInterfaceLearning/rCLAMPS/structuralAlignmentFiles/makeContactMap.R")
pairs
allBpos
source("~/research/jointInterfaceLearning/rCLAMPS/structuralAlignmentFiles/makeContactMap.R")
pairs
source("~/research/jointInterfaceLearning/rCLAMPS/structuralAlignmentFiles/makeContactMap.R")
pairs
source("~/research/jointInterfaceLearning/rCLAMPS/structuralAlignmentFiles/makeContactMap.R")
pairs
pairs
source("~/research/jointInterfaceLearning/rCLAMPS/structuralAlignmentFiles/makeContactMap.R")
source("~/research/jointInterfaceLearning/rCLAMPS/structuralAlignmentFiles/makeContactMap.R")
pairs
source("~/research/jointInterfaceLearning/rCLAMPS/structuralAlignmentFiles/makeContactMap.R")
pairs
source("~/research/jointInterfaceLearning/rCLAMPS/structuralAlignmentFiles/makeContactMap.R")
source("~/research/jointInterfaceLearning/rCLAMPS/structuralAlignmentFiles/makeContactMap.R")
pairs
source("~/research/jointInterfaceLearning/rCLAMPS/structuralAlignmentFiles/makeContactMap.R")
source("~/research/jointInterfaceLearning/rCLAMPS/structuralAlignmentFiles/makeContactMap.R")
pairs
source("~/research/jointInterfaceLearning/rCLAMPS/structuralAlignmentFiles/makeContactMap.R")
pairs
allBpos
# Read in the aligned contact files and subset to base-contacts only (no backbone)
# Show the positions that contact any base; no frequency cutoff is applied
# here and the subset is only printed, not assigned
allBpos <- fread('zf-C2H2_weightedSum_distCut3.6_unionBases.txt')
allBpos[bType == 'base']
# Read in the aligned contact files and subset to base-contacts only (no backbone)
# Same read again, this time printing the full table before the base-only subset
allBpos <- fread('zf-C2H2_weightedSum_distCut3.6_unionBases.txt')
allBpos
allBpos[bType == 'base']
# Re-run makeContactMap.R and inspect the resulting tables
source("~/research/jointInterfaceLearning/rCLAMPS/structuralAlignmentFiles/makeContactMap.R")
pairs
source("~/research/jointInterfaceLearning/rCLAMPS/structuralAlignmentFiles/makeContactMap.R")
pairs
source("~/research/jointInterfaceLearning/rCLAMPS/structuralAlignmentFiles/makeContactMap.R")
doms
# Gather clustered ZFs based on the output of findDomains.py and subset
# tables appropriately
library(data.table)
# Clear the workspace objects so the run starts from a clean state
rm(list = ls())
inDoms <- fread('prot_seq_fewZFs_hmmerOut.txt')
inMotifs <- fread('motifTable_mostRecent_fewZFs.txt')
# Keep only clusters containing between 2 and 4 rows (ZFs); clustLen is the
# number of rows per (prot, clustNum) group
clustInfo <- inDoms[,.(clustLen = .N),by = c('prot','clustNum')]
doms <- merge(inDoms, clustInfo[clustLen >= 2 & clustLen <= 4], by = c('prot','clustNum'))
# Remove proteins with more than one cluster (counted among the clusters
# that survived the length filter above)
nClusts <- doms[,.(nClusts = length(unique(clustNum))),by = c('prot')]
doms <- merge(doms, nClusts[nClusts == 1], by = c('prot'))
# Remove proteins for which we have no motif
doms <- doms[prot %in% inMotifs$TF_ID]
motifs <- inMotifs[TF_ID %in% doms$prot]
# Append the appropriate motif ID and TF name to the domain table
doms <- merge(doms, motifs[,c('TF_ID','Motif_ID','TF_Name','TF_Species'),with=FALSE],
by.x = 'prot', by.y = 'TF_ID', all.x = TRUE)
# Order rows by protein and then by targStart
doms <- doms[order(prot, targStart)]
# Switch to the C2H2-ZF directory; the same script is re-entered right after
# this, so presumably the reads above ran from the wrong directory — TODO confirm
setwd('../cis_bp/C2H2-ZF/')
# Second pass of the clustered-ZF gathering script, followed by interactive
# checks of motif length against cluster length
library(data.table)
# Clear the workspace objects so the run starts from a clean state
rm(list = ls())
inDoms <- fread('prot_seq_fewZFs_hmmerOut.txt')
inMotifs <- fread('motifTable_mostRecent_fewZFs.txt')
# Keep only clusters containing between 2 and 4 rows (ZFs)
clustInfo <- inDoms[,.(clustLen = .N),by = c('prot','clustNum')]
doms <- merge(inDoms, clustInfo[clustLen >= 2 & clustLen <= 4], by = c('prot','clustNum'))
# Remove proteins with more than one cluster
nClusts <- doms[,.(nClusts = length(unique(clustNum))),by = c('prot')]
doms <- merge(doms, nClusts[nClusts == 1], by = c('prot'))
# Remove proteins for which we have no motif
doms <- doms[prot %in% inMotifs$TF_ID]
motifs <- inMotifs[TF_ID %in% doms$prot]
# Append the appropriate motif ID and TF name to the domain table
doms <- merge(doms, motifs[,c('TF_ID','Motif_ID','TF_Name','TF_Species'),with=FALSE],
by.x = 'prot', by.y = 'TF_ID', all.x = TRUE)
doms <- doms[order(prot, targStart)]
doms
# Ensure the motif is at least nZFs*3 + 1 in length
# (PWM_lengths.txt is joined on Motif_ID; presumably it supplies motifLen — TODO confirm)
doms <- merge(doms, fread('PWM_lengths.txt'), by = 'Motif_ID')
doms
# Restore the protein/targStart ordering after the merge
doms <- doms[order(prot, targStart)]
doms
doms <- doms[motifLen >= 3*clustLen+1]
doms
source("~/research/jointInterfaceLearning/rCLAMPS/cis_bp/C2H2-ZF/gatherClusteredZFInfo.R")
# Cross-tabulate motif length against cluster length (counts rows, not proteins)
table(doms$motifLen, doms$clustLen)
# First row per protein. NOTE(review): the 'TF_ID' grouping is immediately
# retried with 'prot', presumably because doms has no TF_ID column — confirm
doms[,.SD[1], by = 'TF_ID']
doms[,.SD[1], by = 'prot']
library(ggplot2)
# Plot motif length against cluster length, using one row per protein
g <- ggplot(doms[,.SD[1], by = 'prot'], aes(x = clustLen, y = motifLen)) +
geom_boxplot(outlier.colour = 'white') +
geom_jitter(height = 0, width = 0.1, size = 1, alpha = 0.4)
g
# Gather clustered ZFs based on the output of findDomains.py and subset
# tables appropriately
library(data.table)
library(ggplot2)
# Clear the workspace objects so the run starts from a clean state
rm(list = ls())
inDoms <- fread('prot_seq_fewZFs_hmmerOut.txt')
inMotifs <- fread('motifTable_mostRecent_fewZFs.txt')
# Keep only clusters containing between 2 and 4 rows (ZFs)
clustInfo <- inDoms[,.(clustLen = .N),by = c('prot','clustNum')]
doms <- merge(inDoms, clustInfo[clustLen >= 2 & clustLen <= 4], by = c('prot','clustNum'))
# Remove proteins with more than one cluster
nClusts <- doms[,.(nClusts = length(unique(clustNum))),by = c('prot')]
doms <- merge(doms, nClusts[nClusts == 1], by = c('prot'))
# Remove proteins for which we have no motif
doms <- doms[prot %in% inMotifs$TF_ID]
motifs <- inMotifs[TF_ID %in% doms$prot]
# Append the appropriate motif ID and TF name to the domain table
doms <- merge(doms, motifs[,c('TF_ID','Motif_ID','TF_Name','TF_Species'),with=FALSE],
by.x = 'prot', by.y = 'TF_ID', all.x = TRUE)
doms <- doms[order(prot, targStart)]
# How long are the motifs relative to the cluster length?
doms <- merge(doms, fread('PWM_lengths.txt'), by = 'Motif_ID')
doms <- doms[order(prot, targStart)]
# Plot motif length against cluster length, using one row per protein.
# clustLen is converted to a factor so each cluster length gets its own box;
# note that this permanently changes the column type in doms
doms$clustLen <- factor(doms$clustLen)
g <- ggplot(doms[,.SD[1], by = 'prot'], aes(x = clustLen, y = motifLen)) +
geom_boxplot(outlier.colour = 'white') +
geom_jitter(height = 0, width = 0.1, size = 1, alpha = 0.4)
# Gather clustered ZFs based on the output of findDomains.py and subset
# tables appropriately
library(data.table)
library(ggplot2)
# Clear the workspace objects so the run starts from a clean state
rm(list = ls())
inDoms <- fread('prot_seq_fewZFs_hmmerOut.txt')
inMotifs <- fread('motifTable_mostRecent_fewZFs.txt')
# Keep only clusters containing between 2 and 4 rows (ZFs)
clustInfo <- inDoms[,.(clustLen = .N),by = c('prot','clustNum')]
doms <- merge(inDoms, clustInfo[clustLen >= 2 & clustLen <= 4], by = c('prot','clustNum'))
# Remove proteins with more than one cluster
nClusts <- doms[,.(nClusts = length(unique(clustNum))),by = c('prot')]
doms <- merge(doms, nClusts[nClusts == 1], by = c('prot'))
# Remove proteins for which we have no motif
doms <- doms[prot %in% inMotifs$TF_ID]
motifs <- inMotifs[TF_ID %in% doms$prot]
# Append the appropriate motif ID and TF name to the domain table
doms <- merge(doms, motifs[,c('TF_ID','Motif_ID','TF_Name','TF_Species'),with=FALSE],
by.x = 'prot', by.y = 'TF_ID', all.x = TRUE)
doms <- doms[order(prot, targStart)]
# How long are the motifs relative to the cluster length?
doms <- merge(doms, fread('PWM_lengths.txt'), by = 'Motif_ID')
doms <- doms[order(prot, targStart)]
# Plot motif length against cluster length, using one row per protein.
# The group aesthetic gives one box per cluster length while leaving
# clustLen itself unconverted
g <- ggplot(doms[,.SD[1], by = 'prot'], aes(x = clustLen, y = motifLen, group = clustLen)) +
geom_boxplot(outlier.colour = 'white') +
geom_jitter(height = 0, width = 0.1, size = 1, alpha = 0.4)
g
# Repeated re-runs of gatherClusteredZFInfo.R, then inspection of proteins
# whose motifs are long relative to their cluster length
source("~/research/jointInterfaceLearning/rCLAMPS/cis_bp/C2H2-ZF/gatherClusteredZFInfo.R")
source("~/research/jointInterfaceLearning/rCLAMPS/cis_bp/C2H2-ZF/gatherClusteredZFInfo.R")
source("~/research/jointInterfaceLearning/rCLAMPS/cis_bp/C2H2-ZF/gatherClusteredZFInfo.R")
source("~/research/jointInterfaceLearning/rCLAMPS/cis_bp/C2H2-ZF/gatherClusteredZFInfo.R")
doms[motifLen >= 20]
# Three successive attempts at joining motif lengths onto the motif table
# (inner join, then keeping all motifs rows, then keeping all rows of both)
motifs <- merge(motifs, fread('PWM_lengths.txt'), by = 'Motif_ID')
motifs <- merge(motifs, fread('PWM_lengths.txt'), by = 'Motif_ID', all.x = TRUE)
motifs
motifs <- merge(motifs, fread('PWM_lengths.txt'), by = 'Motif_ID', all.x = TRUE, all.y = TRUE)
source("~/research/jointInterfaceLearning/rCLAMPS/cis_bp/C2H2-ZF/gatherClusteredZFInfo.R")
# Which are the really long motifs??
# (first attempt matches TF_ID against a whole table rather than its prot column)
motifs[TF_ID %in% doms[motifLen > 5*clustLen]]
# Which are the really long motifs??
# (corrected: match TF_ID against the prot column of the long-motif rows)
motifs[TF_ID %in% doms[motifLen > 5*clustLen]$prot]
doms[motifLen > 5*clustLen]
# Which are the really long motifs??
longMotifs <- motifs[TF_ID %in% doms[motifLen > 5*clustLen]$prot]
longMotifs.doms <- doms[motifLen > 5*clustLen][]
# Which are the really long motifs??
longMotifs <- motifs[TF_ID %in% doms[motifLen > 5*clustLen]$prot]
longMotifs.doms <- doms[motifLen > 5*clustLen]
longMotifs
doms[TF_Name == 'REST']
source("~/research/jointInterfaceLearning/rCLAMPS/cis_bp/C2H2-ZF/gatherClusteredZFInfo.R")
# Move to the cisBP code directory for the motif-gathering script
setwd('../../../../cisBP/code/')
# Gather pertinent info for C2H2-ZF proteins with PWMs in cis-bp
library(data.table)
library(ggplot2)
library(seqLogo)
# Clear the workspace objects so the run starts from a clean state
rm(list = ls())
# Read in the protein/motif info from the zf-only cisbp download
prots <- fread('../../jointInterfaceLearning/rCLAMPS/cis_bp/C2H2-ZF/prot_seq.txt')
# Read in the protein/motif info from the zf-only cisbp download
prots <- fread('../../jointInterfaceLearning/rCLAMPS/cis_bp/C2H2-ZF/prot_seq.txt')
prots
# Step-by-step trial of keeping the longest sequence per TF_ID:
# compute sequence lengths, sort longest-first within TF_ID, take the first row
prots$protLen <- sapply(prots$Protein_seq, nchar)
names(prots)
prots <- prots[order(TF_ID, -protLen)]
prots$protLen
prots <- prots[,.SD[1],by = 'TF_ID']
prots
# Reset the workspace before running the full script below
rm(list = ls())
# Read in the protein/motif info from the zf-only cisbp download
prots <- fread('../../jointInterfaceLearning/rCLAMPS/cis_bp/C2H2-ZF/prot_seq.txt')
# Keep the longest version only for each protein
# (sort longest-first within TF_ID, then take the first row per TF_ID)
prots$protLen <- sapply(prots$Protein_seq, nchar)
prots <- prots[order(TF_ID, -protLen)]
prots <- prots[,.SD[1],by = 'TF_ID']
# How many ZFs does each protein have?
# Split the comma-separated Pfam_DBDs field and keep only proteins whose
# listed domains are all "zf-C2H2"; nZFs is the number of listed domains
doms <- lapply(prots$Pfam_DBDs, function(x) strsplit(x, split = ",")[[1]])
zfOnly <- sapply(doms, function(x) all(x == "zf-C2H2"))
doms <- doms[which(zfOnly)]
prots <- prots[which(zfOnly)]
prots$nZFs <- sapply(doms, length)
# Which proteins have between 2 and 4 ZF domains (inclusive)?
prots.fewZFs <- prots[nZFs >= 2 & nZFs <= 4]
length(unique(prots.fewZFs$TF_ID))
# Motif metadata: keep non-Transfac motifs whose DBDs field is exactly
# 'zf-C2H2' for the retained proteins
tfInfo <- fread('../TF_Information_all_motifs.txt')
tfInfo <- tfInfo[TF_ID %in% prots$TF_ID & Motif_Type != 'Transfac' & DBDs == 'zf-C2H2']
# Keep one motif per TF: the first row after sorting by descending MSource_Year
tfInfo <- tfInfo[order(TF_ID, -MSource_Year)]
tfInfo.mostRecent <- tfInfo[,.SD[1],TF_ID]
write.table(tfInfo.mostRecent, file = '../zfOnlyInfo_mostRecent.txt',
quote = FALSE, sep = '\t', row.names = FALSE)
# Read the table back from the file that was just written
tfInfo.mostRecent <- fread('../zfOnlyInfo_mostRecent.txt')
# Restrict the motif table to TFs with 2-4 ZFs and write it to both locations
tfInfo.mostRecent.fewZFs <- tfInfo.mostRecent[TF_ID %in% intersect(tfInfo.mostRecent$TF_ID,prots.fewZFs$TF_ID)]
write.table(tfInfo.mostRecent.fewZFs, file = '../zfOnlyInfo_mostRecent_fewZFs.txt',
quote = FALSE, sep = '\t', row.names = FALSE)
write.table(tfInfo.mostRecent.fewZFs, file = '../../jointInterfaceLearning/rCLAMPS/cis_bp/C2H2-ZF/motifTable_mostRecent_fewZFs.txt',
quote = FALSE, sep = '\t', row.names = FALSE)
# Restrict the protein table to TFs that also have a retained motif
prots.fewZFs <- prots.fewZFs[TF_ID %in% intersect(tfInfo.mostRecent.fewZFs$TF_ID,prots.fewZFs$TF_ID)]
write.table(prots.fewZFs, file = '../../jointInterfaceLearning/rCLAMPS/cis_bp/C2H2-ZF/prot_seq_fewZFs.txt',
quote = FALSE, sep = '\t', row.names = FALSE)
prots.fewZFs
# Gather pertinent info for C2H2-ZF proteins with PWMs in cis-bp
# (same script as before, re-entered with intermediate tables and histograms
# printed between steps)
library(data.table)
library(ggplot2)
library(seqLogo)
# Clear the workspace objects so the run starts from a clean state
rm(list = ls())
# Read in the protein/motif info from the zf-only cisbp download
prots <- fread('../../jointInterfaceLearning/rCLAMPS/cis_bp/C2H2-ZF/prot_seq.txt')
# Keep the longest version only for each protein
prots$protLen <- sapply(prots$Protein_seq, nchar)
prots <- prots[order(TF_ID, -protLen)]
prots <- prots[,.SD[1],by = 'TF_ID']
# Inspect the protein-length distribution at a few bin settings
hist(prots$protLen)
hist(prots$protLen, breaks = 1)
hist(prots$protLen, breaks = 100)
# How many ZFs does each protein have?
# Split the comma-separated Pfam_DBDs field and keep only proteins whose
# listed domains are all "zf-C2H2"
doms <- lapply(prots$Pfam_DBDs, function(x) strsplit(x, split = ",")[[1]])
doms
zfOnly <- sapply(doms, function(x) all(x == "zf-C2H2"))
doms
doms <- doms[which(zfOnly)]
doms
prots <- prots[which(zfOnly)]
prots
hist(prots$protLen, breaks = 100)
prots$nZFs <- sapply(doms, length)
prots
table(prots$nZFs)
# Which proteins have between 2 and 4 ZF domains (inclusive)?
prots.fewZFs <- prots[nZFs >= 2 & nZFs <= 4]
length(unique(prots.fewZFs$TF_ID))
# Motif metadata: keep non-Transfac motifs whose DBDs field is exactly
# 'zf-C2H2' for the retained proteins
tfInfo <- fread('../TF_Information_all_motifs.txt')
tfInfo <- tfInfo[TF_ID %in% prots$TF_ID & Motif_Type != 'Transfac' & DBDs == 'zf-C2H2']
# Keep one motif per TF: the first row after sorting by descending MSource_Year
tfInfo <- tfInfo[order(TF_ID, -MSource_Year)]
tfInfo.mostRecent <- tfInfo[,.SD[1],TF_ID]
write.table(tfInfo.mostRecent, file = '../zfOnlyInfo_mostRecent.txt',
quote = FALSE, sep = '\t', row.names = FALSE)
# Read the table back from the file that was just written
tfInfo.mostRecent <- fread('../zfOnlyInfo_mostRecent.txt')
tfInfo.mostRecent
# Restrict the motif table to TFs with 2-4 ZFs and write it to both locations
tfInfo.mostRecent.fewZFs <- tfInfo.mostRecent[TF_ID %in% intersect(tfInfo.mostRecent$TF_ID,prots.fewZFs$TF_ID)]
write.table(tfInfo.mostRecent.fewZFs, file = '../zfOnlyInfo_mostRecent_fewZFs.txt',
quote = FALSE, sep = '\t', row.names = FALSE)
tfInfo.mostRecent.fewZFs
write.table(tfInfo.mostRecent.fewZFs, file = '../../jointInterfaceLearning/rCLAMPS/cis_bp/C2H2-ZF/motifTable_mostRecent_fewZFs.txt',
quote = FALSE, sep = '\t', row.names = FALSE)
# Restrict the protein table to TFs that also have a retained motif
prots.fewZFs <- prots.fewZFs[TF_ID %in% intersect(tfInfo.mostRecent.fewZFs$TF_ID,prots.fewZFs$TF_ID)]
prots.fewZFs
write.table(prots.fewZFs, file = '../../jointInterfaceLearning/rCLAMPS/cis_bp/C2H2-ZF/prot_seq_fewZFs.txt',
quote = FALSE, sep = '\t', row.names = FALSE)
# Run the saved script version end to end
source("~/research/cisBP/code/gatherC2H2ZFmotifs.R")
# Return to the C2H2-ZF directory and re-run only the first filtering steps
# of the clustered-ZF script, printing the intermediate table
setwd('../../jointInterfaceLearning/rCLAMPS/cis_bp/C2H2-ZF/')
library(data.table)
library(ggplot2)
# Clear the workspace objects so the run starts from a clean state
rm(list = ls())
inDoms <- fread('prot_seq_fewZFs_hmmerOut.txt')
inMotifs <- fread('motifTable_mostRecent_fewZFs.txt')
# Keep only clusters containing between 2 and 4 rows (ZFs)
clustInfo <- inDoms[,.(clustLen = .N),by = c('prot','clustNum')]
doms <- merge(inDoms, clustInfo[clustLen >= 2 & clustLen <= 4], by = c('prot','clustNum'))
# Remove proteins with more than one cluster
nClusts <- doms[,.(nClusts = length(unique(clustNum))),by = c('prot')]
doms <- merge(doms, nClusts[nClusts == 1], by = c('prot'))
doms
doms
# Load the hmmer domain hits and motif table, then reduce them to proteins
# with a single 2-4 ZF cluster and an associated motif
library(data.table)
library(ggplot2)
# Clear the workspace objects so the run starts from a clean state
rm(list = ls())
inDoms <- fread('prot_seq_fewZFs_hmmerOut.txt')
inMotifs <- fread('motifTable_mostRecent_fewZFs.txt')
# Keep only clusters containing between 2 and 4 rows (ZFs); clustLen is the
# number of rows per (prot, clustNum) group
clustInfo <- inDoms[,.(clustLen = .N),by = c('prot','clustNum')]
doms <- merge(inDoms, clustInfo[clustLen >= 2 & clustLen <= 4], by = c('prot','clustNum'))
# Remove proteins with more than one cluster (counted among the clusters
# that survived the length filter above)
nClusts <- doms[,.(nClusts = length(unique(clustNum))),by = c('prot')]
doms <- merge(doms, nClusts[nClusts == 1], by = c('prot'))
# Remove proteins for which we have no motif
doms <- doms[prot %in% inMotifs$TF_ID]
motifs <- inMotifs[TF_ID %in% doms$prot]
# Append the appropriate motif ID and TF name to the domain table
doms <- merge(doms, motifs[,c('TF_ID','Motif_ID','TF_Name','TF_Species'),with=FALSE],
by.x = 'prot', by.y = 'TF_ID', all.x = TRUE)
doms <- doms[order(prot, targStart)]
# How long are the motifs relative to the cluster length?
# (inner join on Motif_ID, so rows without an entry in PWM_lengths.txt are dropped)
doms <- merge(doms, fread('PWM_lengths.txt'), by = 'Motif_ID')
doms <- doms[order(prot, targStart)]
doms
# Sanity check: both lines print TRUE when the domain table and the motif
# table cover exactly the same set of proteins
print(length(unique(doms$prot)) == length(intersect(unique(motifs$TF_ID), unique(doms$prot))))
print(length(unique(motifs$TF_ID)) == length(intersect(unique(motifs$TF_ID), unique(doms$prot))))
# Keep the subsetted domain and motif tables for organizing inputs to gibbs_GLM.py
write.table(doms, 'prot_seq_fewZFs_hmmerOut_clusteredOnly.txt',sep = '\t', quote = FALSE,row.names = FALSE)
write.table(motifs, 'motifTable_mostRecent_fewZFs_clusteredOnly.txt',sep = '\t', quote = FALSE,row.names = FALSE)
# Overall properties of dataset
print(paste("There are",length(unique(doms$prot)),"distinct C2H2-ZFs proteins with a cis-bp",
"motif and a single cluster of between 2 to 4 ZFs linked by at most 10 AAs b/w ZFs."))
print(paste("Thes proteins span:"))
# length(table(x)) counts the distinct non-NA values of x
print(paste(length(table(motifs$TF_Species)), "distinct species across", length(table(motifs$MSource_Identifier)), "distinct sources."))
print(paste("The distribution of cluster lengths is:"))
# Rows per protein, tabulated: how many proteins have each cluster size
print(table(doms[,.(clustLen=.N), by = 'prot']$clustLen))
# Plot motif length against the number of ZFs in the cluster, using one row
# per protein (the first row of each 'prot' group), and save it as a PDF.
# The y-axis label is passed by name: an unnamed string given to labs() is
# not attached to any axis, which left the default "motifLen" label in place.
g <- ggplot(doms[,.SD[1], by = 'prot'], aes(x = clustLen, y = motifLen, group = clustLen)) +
  geom_boxplot(outlier.colour = 'white') +
  geom_jitter(height = 0, width = 0.1, size = 1, alpha = 0.4) +
  labs(x = "Number of ZFs in cluster", y = "Motif length") +
  theme_classic()
# Spell out `filename` instead of relying on partial matching of `file`
ggsave(plot = g, filename = '0_nZFs_V_motifLen.pdf', width = 5, height = 5)
# Keep only motifs that are at least 3*clustLen long, and keep the motif
# table in sync with the remaining proteins
doms <- doms[motifLen >= 3*clustLen]
motifs <- motifs[TF_ID %in% doms$prot]
# Sanity check: both lines print TRUE when the domain table and the motif
# table cover exactly the same set of proteins
print(length(unique(doms$prot)) == length(intersect(unique(motifs$TF_ID), unique(doms$prot))))
print(length(unique(motifs$TF_ID)) == length(intersect(unique(motifs$TF_ID), unique(doms$prot))))
# Overall properties of dataset
print("Considering only motifs where motifLen >= 3*clustLen:")
print(paste("There are",length(unique(doms$prot)),"distinct C2H2-ZFs proteins with a cis-bp",
"motif and a single cluster of between 2 to 4 ZFs linked by at most 10 AAs b/w ZFs."))
print(paste("Thes proteins span:"))
# length(table(x)) counts the distinct non-NA values of x
print(paste(length(table(motifs$TF_Species)), "distinct species across", length(table(motifs$MSource_Identifier)), "distinct sources."))
print(paste("The distribution of cluster lengths is:"))
# Rows per protein, tabulated: how many proteins have each cluster size
print(table(doms[,.(clustLen=.N), by = 'prot']$clustLen))
# Plot motif length against the number of ZFs in the cluster for the current
# contents of doms, using one row per protein (the first row of each 'prot'
# group), and save it as a PDF.
# The y-axis label is passed by name: an unnamed string given to labs() is
# not attached to any axis, which left the default "motifLen" label in place.
g <- ggplot(doms[,.SD[1], by = 'prot'], aes(x = clustLen, y = motifLen, group = clustLen)) +
  geom_boxplot(outlier.colour = 'white') +
  geom_jitter(height = 0, width = 0.1, size = 1, alpha = 0.4) +
  labs(x = "Number of ZFs in cluster", y = "Motif length") +
  theme_classic()
# Spell out `filename` instead of relying on partial matching of `file`
ggsave(plot = g, filename = '0_nZFs_V_motifLen_removeTooShort.pdf', width = 5, height = 5)
# Keep the subsetted domain and motif tables for organizing inputs to gibbs_GLM.py
write.table(doms, 'prot_seq_fewZFs_hmmerOut_clusteredOnly_removeTooShort.txt',sep = '\t', quote = FALSE,row.names = FALSE)
write.table(motifs, 'motifTable_mostRecent_fewZFs_clusteredOnly_removeTooShort.txt',sep = '\t', quote = FALSE,row.names = FALSE)
# Which are the really long motifs??
# (motifs longer than 5x the cluster length, and their domain rows)
longMotifs <- motifs[TF_ID %in% doms[motifLen > 5*clustLen]$prot]
longMotifs.doms <- doms[motifLen > 5*clustLen]
longMotifs
# Spot checks on individual TFs
longMotifs.doms[TF_Name == 'YY1']
longMotifs.doms[TF_Name == 'PRDM6']
motifs
motifs[TF_Name == 'PRDM6']
table(motifs$MSource_Identifier)
source("~/research/jointInterfaceLearning/rCLAMPS/cis_bp/C2H2-ZF/gatherClusteredZFInfo.R")
# Total number of domain rows = sum over cluster sizes of (size x number of
# proteins with that size). The first attempt multiplies the character names
# of the table directly; the next lines convert them with as.numeric() first
names(table(doms[,.(clustLen=.N), by = 'prot']$clustLen)) * table(doms[,.(clustLen=.N), by = 'prot']$clustLen)
as.numeric(names(table(doms[,.(clustLen=.N), by = 'prot']$clustLen))) * table(doms[,.(clustLen=.N), by = 'prot']$clustLen)
sum(as.numeric(names(table(doms[,.(clustLen=.N), by = 'prot']$clustLen))) * table(doms[,.(clustLen=.N), by = 'prot']$clustLen))
longMotifs
longMotifs$TF_Name
source("~/research/jointInterfaceLearning/rCLAMPS/cis_bp/C2H2-ZF/gatherClusteredZFInfo.R")
longMotifs$TF_Name
longMotifs.doms[TF_Name == 'EGR1']
# Relax the "long motif" cutoff from 5x to 4x the cluster length
longMotifs.doms <- doms[motifLen > 5*clustLen]
longMotifs.doms <- doms[motifLen > 4*clustLen]
longMotifs.doms[TF_Name == 'EGR1']
motifs[TF_Name == 'EGR1']$MSource_Identifier
doms
source("~/research/jointInterfaceLearning/rCLAMPS/cis_bp/C2H2-ZF/gatherClusteredZFInfo.R")
source("~/research/jointInterfaceLearning/rCLAMPS/cis_bp/C2H2-ZF/gatherClusteredZFInfo.R")
source("~/research/jointInterfaceLearning/rCLAMPS/cis_bp/C2H2-ZF/gatherClusteredZFInfo.R")
# How many distinct arrays are there (in terms of base-contacting positions)?
# Successive attempts at building one concatenated coreSeq string per protein.
# NOTE(review): the 'TF_ID' grouping is retried with 'prot' below, presumably
# because doms has no TF_ID column — confirm
doms[,.(arraySeq = paste(coreSeq)), by = 'TF_ID']
doms
# How many distinct arrays are there (in terms of base-contacting positions)?
# (without `collapse`, paste/paste0 return one string per row, not one per protein)
doms[,.(arraySeq = paste(coreSeq)), by = 'prot']
# How many distinct arrays are there (in terms of base-contacting positions)?
doms[,.(arraySeq = paste0(coreSeq)), by = 'prot']
# How many distinct arrays are there (in terms of base-contacting positions)?
# (`collapse` takes a separator string; TRUE is replaced by '' in the next attempt)
doms[,.(arraySeq = paste0(coreSeq, collapse = TRUE)), by = 'prot']
# How many distinct arrays are there (in terms of base-contacting positions)?
doms[,.(arraySeq = paste0(coreSeq, collapse = '')), by = 'prot']
# How many distinct arrays are there (in terms of base-contacting positions)?
doms[,.(arraySeq = paste(coreSeq, collapse = '')), by = 'prot']
# How many distinct arrays are there (in terms of base-contacting positions)?
arraySeqs <- doms[,.(arraySeq = paste(coreSeq, collapse = '')), by = 'prot']
# Keep the subsetted domain and motif tables for organizing inputs to gibbs_GLM.py
arraySeqs <- doms[,.(arraySeq = paste(coreSeq, collapse = '')), by = 'prot']
# arraySeqs has columns prot and arraySeq, so the join key is 'prot';
# the 'TF_ID' attempt is followed by the corrected call
doms <- merge(doms, arraySeqs, by = 'TF_ID')
doms <- merge(doms, arraySeqs, by = 'prot')
doms <- doms[order(prot, targStart)]
write.table(doms, 'prot_seq_fewZFs_hmmerOut_clusteredOnly_removeTooShort.txt',sep = '\t', quote = FALSE,row.names = FALSE)
write.table(motifs, 'motifTable_mostRecent_fewZFs_clusteredOnly_removeTooShort.txt',sep = '\t', quote = FALSE,row.names = FALSE)
# Which are the really long motifs??
# (motifs longer than 4x the cluster length, and their domain rows)
longMotifs <- motifs[TF_ID %in% doms[motifLen > 4*clustLen]$prot]
longMotifs.doms <- doms[motifLen > 4*clustLen]
# How many distinct arrays are there (in terms of base-contacting positions)?
# (sapply over a character vector names each result by its input string)
sapply(doms$arraySeq, nchar)
# How many distinct arrays are there (in terms of base-contacting positions)?
# (tabulating those names counts the rows of doms per array sequence)
sort(table(names(sapply(doms$arraySeq, nchar))))
# How many distinct arrays are there (in terms of base-contacting positions)?
length(sort(table(names(sapply(doms$arraySeq, nchar)))))
doms[arraySeq == 'KSHARDERRDHK']
# How many distinct arrays are there (in terms of base-contacting positions)?
# (doms$prot refers to the whole table, so every group gets the same count;
# the next attempt uses the per-group `prot` column instead)
doms[,.(nProts = length(unique(doms$prot))), by = 'arraySeq']
# How many distinct arrays are there (in terms of base-contacting positions)?
doms[,.(nProts = length(unique(prot))), by = 'arraySeq']
# How many distinct arrays are there (in terms of base-contacting positions)?
# One row per distinct array sequence with the number of proteins sharing it,
# most frequently shared first
distinctArrays <- doms[,.(nProts = length(unique(prot))), by = 'arraySeq']
distinctArrays <- distinctArrays[order(-nProts)]
distinctArrays
# `hits` is a typo for `hist`, corrected on the next line
hits(distinctArrays$nProts)
hist(distinctArrays$nProts)
hist(distinctArrays$nProts, breaks = 1)
# Attempts at integer-width bins; arraySeqs (built above) has only the prot
# and arraySeq columns, so arraySeqs$nProts is not the intended vector —
# the final call takes the maximum from distinctArrays instead
hist(distinctArrays$nProts, breaks = 1:nrow(arraySeqs))
hist(distinctArrays$nProts, breaks = 1:max(arraySeqs$nProts))
hist(distinctArrays$nProts, breaks = 1:max(arraySeqs$nProts))
max(arraySeqs$nProts)
hist(distinctArrays$nProts, breaks = 1:max(distinctArrays$nProts))
print(paste("There are", nrow(distinctArrays), "distinct ZF arrays (considering only base-contacting positions)."))
source("~/research/jointInterfaceLearning/rCLAMPS/cis_bp/C2H2-ZF/gatherClusteredZFInfo.R")
# Build the ZF contact map from the weighted structural-contact tables
setwd('../../structuralAlignmentFiles/')
library(data.table)
# Clear the workspace objects so the run starts from a clean state
rm(list = ls())
# Read in the aligned contact files and subset to base-contacts only (no backbone)
# Consider positions that contact any base in >= 25% of weighted structures
allBpos <- fread('zf-C2H2_weightedSum_distCut3.6_unionBases.txt')
allBpos <- allBpos[bType == 'base' & wFrac >= .25]
# Read in the aligned contact files and subset to base-contacts only (no backbone)
# Consider all pairs that contact in >= 25% of weighted structures,
# restricted to the mStates retained in allBpos
pairs <- fread('zf-C2H2_weightedSum_distCut3.6.txt')
pairs <- pairs[cType == 'base' & frac >= 0.25 & mState %in% allBpos$mState]
pairs
# The contact map is the (bPos, mState) columns of the retained pairs,
# sorted by bPos and then mState
contactMap <- pairs[,c('bPos','mState'),with=FALSE]
contactMap <- contactMap[order(bPos, mState)]
contactMap
write.table(contactMap, 'zf-C2H2_contactMap.txt', quote = FALSE, sep = '\t', row.names = FALSE)
source("~/research/jointInterfaceLearning/rCLAMPS/structuralAlignmentFiles/makeContactMap.R")
getwd()
source("~/research/jointInterfaceLearning/rCLAMPS/structuralAlignmentFiles/makeContactMap.R")
# Copy the contact map into the precomputed-inputs directory via the shell
system('cp zf-C2H2_contactMap.txt ../precomputedInputs/zf-C2H2/')
source("~/research/jointInterfaceLearning/rCLAMPS/structuralAlignmentFiles/makeContactMap.R")
source("~/research/jointInterfaceLearning/rCLAMPS/structuralAlignmentFiles/makeContactMap.R")
source("~/research/jointInterfaceLearning/rCLAMPS/structuralAlignmentFiles/makeContactMap.R")
source("~/research/jointInterfaceLearning/rCLAMPS/structuralAlignmentFiles/makeContactMap.R")
source("~/research/jointInterfaceLearning/rCLAMPS/structuralAlignmentFiles/makeContactMap.R")
# `pais` is a typo for `pairs`, corrected on the next line
pais
pairs
# Read in the aligned contact files and subset to base-contacts only (no backbone)
# Only the raw table is re-read here; no cutoff is applied on this line.
# NOTE(review): this comment previously cited a 30% cutoff — presumably the
# value then set inside makeContactMap.R; confirm against the script
pairs <- fread('zf-C2H2_weightedSum_distCut3.6.txt')
source("~/research/jointInterfaceLearning/rCLAMPS/structuralAlignmentFiles/makeContactMap.R")
pairs
source("~/research/jointInterfaceLearning/rCLAMPS/structuralAlignmentFiles/makeContactMap.R")
source("~/research/jointInterfaceLearning/rCLAMPS/structuralAlignmentFiles/makeContactMap.R")
pairs
# Back in the C2H2-ZF directory: re-run the clustered-ZF script and inspect
# motifs whose length is exactly 3*clustLen or 3*clustLen + 1
setwd('../cis_bp/C2H2-ZF/')
source("~/research/jointInterfaceLearning/rCLAMPS/cis_bp/C2H2-ZF/gatherClusteredZFInfo.R")
doms
doms[motifLen == clustLen*3+1]
doms[motifLen == clustLen*3]
doms[motifLen == clustLen*3+1]
doms[motifLen == clustLen*3+1]$prot
# Which motifs have length exactly 3*nZFs + 1?
motifs[TF_ID %in% unique(doms[motifLen == clustLen*3+1]$prot)]$TF_Name
motifs[TF_ID %in% unique(doms[motifLen == clustLen*3+1]$prot)]
# Spot checks on individual proteins
doms[prot == 'T094828_2.00']
doms[prot == 'T094954_2.00']
source("~/research/jointInterfaceLearning/rCLAMPS/cis_bp/C2H2-ZF/gatherClusteredZFInfo.R")
doms[prot == 'T094954_2.00']
# makeContactMap.R is sourced once from this directory, then again after
# switching to structuralAlignmentFiles (presumably the first call failed on
# its relative input paths — TODO confirm)
source("~/research/jointInterfaceLearning/rCLAMPS/structuralAlignmentFiles/makeContactMap.R")
setwd('../../structuralAlignmentFiles/')
source("~/research/jointInterfaceLearning/rCLAMPS/structuralAlignmentFiles/makeContactMap.R")
source("~/research/jointInterfaceLearning/rCLAMPS/structuralAlignmentFiles/makeContactMap.R")
# Scratch arithmetic typed at the console
575*4
source("~/research/jointInterfaceLearning/rCLAMPS/structuralAlignmentFiles/makeContactMap.R")
source("~/research/jointInterfaceLearning/rCLAMPS/structuralAlignmentFiles/makeContactMap.R")
source("~/research/jointInterfaceLearning/rCLAMPS/structuralAlignmentFiles/makeContactMap.R")
source("~/research/jointInterfaceLearning/rCLAMPS/structuralAlignmentFiles/makeContactMap.R")
source("~/research/jointInterfaceLearning/rCLAMPS/structuralAlignmentFiles/makeContactMap.R")
contactMap
# gatherClusteredZFInfo.R is sourced before and after moving to its data
# directory (absolute path this time)
source("~/research/jointInterfaceLearning/rCLAMPS/cis_bp/C2H2-ZF/gatherClusteredZFInfo.R")
setwd('~/research/jointInterfaceLearning/rCLAMPS/cis_bp/C2H2-ZF/')
source("~/research/jointInterfaceLearning/rCLAMPS/cis_bp/C2H2-ZF/gatherClusteredZFInfo.R")
doms
source("~/research/jointInterfaceLearning/rCLAMPS/cis_bp/C2H2-ZF/gatherClusteredZFInfo.R")
doms[clustLen == 2 & motifLen >= 12]
source("~/research/jointInterfaceLearning/rCLAMPS/cis_bp/C2H2-ZF/gatherClusteredZFInfo.R")
table(motifs$MSource_Identifier)
# TF names of the proteins whose motif comes from FlyFactorSurvey.
# The first attempt uses `==` against a vector of IDs, which compares
# element-by-element with recycling; the following lines use %in% for set
# membership and then de-duplicate the names
doms[prot == motifs[MSource_Identifier == 'FlyFactorSurvey']$TF_ID]$TF_Name
doms[prot %in% motifs[MSource_Identifier == 'FlyFactorSurvey']$TF_ID]$TF_Name
unique(doms[prot %in% motifs[MSource_Identifier == 'FlyFactorSurvey']$TF_ID]$TF_Name)
