# Want to check our HEK293 sequence SNPs with publicly available data.
# Got our HEK293.bam from Arshad.



# ~~~~~~~~~~~~~ R ~~~~~~~~~~~~~~~~~~~~~~~~~

# Download Supplementary Data 2 from https://www.nature.com/articles/ncomms5767
# file name:
# 41467_2014_BFncomms5767_MOESM763_ESM.xls
# Spreadsheet: Known common homozygous SNP


# Get names of dbSNP rs values from 41467_2014_BFncomms5767_MOESM763_ESM.xls spreadsheet: Known common homozygous SNP found in coding regions of genes and present in all 6 sequenced HEK293 cell lines. 
# Total of 1366 SNPs (see below)
# Paste the rs IDs into the UCSC Table Browser to convert their coordinates to hg38. Resulting file is HEK293_paper_SNP_hg38.txt
# Total of 1295 SNPs after transfer to hg38.


# Load the headerless, tab-separated hg38 SNP table and keep only columns
# 2, 4 and 5, labelled chromosome, position and dbSNP rs id.
HEK_snp_paper_hg38 <- setNames(
  read.table(
    "HEK293_paper_SNP_hg38.txt",
    header = FALSE,
    sep = "\t",
    stringsAsFactors = FALSE
  )[, c(2, 4, 5)],
  c("chr", "pos", "dbSNP.id")
)

# Single-base region string of the form "chr:pos-pos".
HEK_snp_paper_hg38$coord <- with(
  HEK_snp_paper_hg38,
  paste0(chr, ":", pos, "-", pos)
)

dim(HEK_snp_paper_hg38)
# [1] 1295    4

head(HEK_snp_paper_hg38)
   # chr       pos   dbSNP.id                    coord
# 1 chr1  56956811  rs1013579   chr1:56956811-56956811
# 2 chr1 109202996  rs1052878 chr1:109202996-109202996
# 3 chr1  61914626  rs1056513   chr1:61914626-61914626
# 4 chr1  21247362  rs1076669   chr1:21247362-21247362
# 5 chr1 115033402 rs10776792 chr1:115033402-115033402
# 6 chr1 216421964 rs10779261 chr1:216421964-216421964

# write.table(HEK_snp_paper_hg38$coord,"HEK_snp_paper_hg38_simple.txt",quote=FALSE,row.names=FALSE,col.names=FALSE)


# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# ~~~~~~~~~~~~~ jump to START HERE to save time and disk space ~~~~~~~~~~~~~~~~~~
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~



# ~~~~~~~~~~~~~ In Mac OS X terminal: ~~~~~~~~~~~~~~~~

# Got our HEK293.bam from Arshad.


# Sort the alignments into a new BAM (options given before the input file).
samtools sort -o HEK293_aligned.sorted.bam HEK293.bam

# Index the sorted BAM (writes the .bai next to it).
samtools index HEK293_aligned.sorted.bam



# bash script. Note "our_data" is always hg38, from now and until end of script.
# HEK293_snp_our_data.txt is produced by the script

# Start from a clean output file. -f keeps this from failing on the first
# run, when HEK293_snp_our_data.txt does not exist yet.
rm -f HEK293_snp_our_data.txt
# One mpileup call per region line ("chr:pos-pos"); output is appended.
# IFS= and -r stop `read` from trimming whitespace or interpreting
# backslashes; quoting "$line" prevents word splitting and globbing.
while IFS= read -r line; do
    samtools mpileup -C 0 -r "$line" HEK293_aligned.sorted.bam >> HEK293_snp_our_data.txt
done < HEK_snp_paper_hg38_simple.txt

./script

# ~~~~~~~~~~~~~ R ~~~~~~~~~~~~~~~~~~~~~~~~~



# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# ~~~~~ START HERE TO SAVE TIME AND DISK SPACE ~~~~~~~~~~~~~~~
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# Load the per-position pileup written by the mpileup loop (no header row).
# quote = "" and comment.char = "" are needed because pileup base and quality
# strings are arbitrary printable ASCII and may contain ', " or #; with the
# read.table defaults those characters are treated as quote / comment markers
# and records get truncated or merged. colClasses stops type guessing (a
# column holding only "T" would otherwise be read as logical TRUE).
# NOTE(review): samtools mpileup emits chrom, pos, reference base, depth,
# read bases, base qualities -- so "N" looks like the reference base and
# "ref" like the base-quality string. Names are kept unchanged because later
# code and saved files use them; confirm the intended meaning.
HEK_our_data <- read.table(
  "HEK293_snp_our_data.txt",
  header = FALSE,
  sep = "\t",
  quote = "",
  comment.char = "",
  stringsAsFactors = FALSE,
  col.names = c("chr", "pos", "N", "num.nts", "obs", "ref"),
  colClasses = c(
    "character", "integer", "character",
    "integer", "character", "character"
  )
)


dim(HEK_our_data)
# [1] 638   6

head(HEK_our_data)
   # chr       pos N num.nts obs ref
# 1 chr1 109202996 N       1   c   <
# 2 chr1 216421964 N       1   T   G
# 3 chr1 112712767 N       2  tt  II
# 4 chr1  15770439 N       2  Tt  II
# 5 chr1  18481459 N       1   A   I
# 6 chr1  29216125 N       1   g   G



# Download list of snps in 41467_2014_BFncomms5767_MOESM763_ESM.xls spreadsheet: Known common homozygous SNP. Some ambiguities in dbSNP entries were arbitrated. Save as txt and upload to R:
# Tab-separated export of the paper's SNP sheet; the first row supplies the
# column names.
HEK_snp_paper_hg18 <- read.table(
  file = "HEK293_paper_known_SNP_hg18.txt",
  sep = "\t",
  header = TRUE,
  stringsAsFactors = FALSE
)


dim(HEK_snp_paper_hg18)
# [1] 1366    7 <<<<<<<<<<<< use in paper


head(HEK_snp_paper_hg18)
  # Gene.symbol  dbSNP.id GeneID ref alt GenomeRefSequence sampleSequence
# 1      CDC2L2 rs7531938 728642   A   G                R*             R*
# 2        NADK    rs4751  65220   G   T                 N              K
# 3       PANK4 rs7535528  55229   G   A                 A              V
# 4    KIAA0562 rs2275824   9731   A   T                 L              I
# 5       AJAP1  rs242056  55966   G   A                 G              R
# 6      RNF207  rs846111 388591   G   C                 G              A


# merge HEK_snp_paper_hg38 to incorporate dbSNP.id with HEK_our_data:
# Inner join on chromosome + position: only pileup rows whose coordinates
# occur in HEK_snp_paper_hg38 are kept, and each gains that table's columns.
HEK_our_data <- merge(
  x = HEK_our_data,
  y = HEK_snp_paper_hg38,
  by = c("chr", "pos")
)


dim(HEK_our_data)
# [1] 638   8


head(HEK_our_data)
   # chr       pos N num.nts obs ref   dbSNP.id                    coord
# 1 chr1 107765105 N       1   a   I  rs7528153 chr1:107765105-107765105
# 2 chr1 109202996 N       1   c   <  rs1052878 chr1:109202996-109202996
# 3 chr1  11030859 N       1   a   < rs12711521   chr1:11030859-11030859
# 4 chr1 111319219 N       1   g   I  rs2275253 chr1:111319219-111319219
# 5 chr1 112712767 N       2  tt  II rs10857971 chr1:112712767-112712767
# 6 chr1  11501536 N       2  aa  GI  rs2817580   chr1:11501536-11501536


# Attach the paper's per-SNP annotation to each of our pileup rows by rs id.
# Both inputs carry a "ref" column, so merge() disambiguates them as "ref.x"
# (from HEK_our_data) and "ref.y" (from HEK_snp_paper_hg18).
HEK_merge_known_SNP <- merge(HEK_our_data, HEK_snp_paper_hg18, by = "dbSNP.id")


# Rename by column NAME rather than by hard-coded position, so a change in
# either input's column order cannot silently relabel the wrong column.
old_names <- c("obs", "ref.x", "ref.y", "alt")
new_names <- c("our_obs", "our_ref", "paper_ref", "paper_alt")
col_idx <- match(old_names, colnames(HEK_merge_known_SNP))
if (anyNA(col_idx)) {
  stop(
    "Expected column(s) missing after merge: ",
    paste(old_names[is.na(col_idx)], collapse = ", "),
    call. = FALSE
  )
}
colnames(HEK_merge_known_SNP)[col_idx] <- new_names


# Reduce each pileup string to the first read's base, upper-cased.
# A read starting at this position is prefixed by "^" plus ONE mapping-quality
# character; that character can be any printable character (not only ":"),
# so strip "^" followed by any single character.
HEK_merge_known_SNP$our_obs <- sub("^\\^.", "", HEK_merge_known_SNP$our_obs)
HEK_merge_known_SNP$our_obs <- toupper(substr(HEK_merge_known_SNP$our_obs, 1, 1))


dim(HEK_merge_known_SNP)
# [1] 640  14


head(HEK_merge_known_SNP)
    # dbSNP.id   chr       pos N num.nts our_obs our_ref                    coord Gene.symbol GeneID paper_ref paper_alt
# 1  rs1000952  chr3 184038034 N       1       A       . chr3:184038034-184038034       HTR3D 200909         G         A
# 2 rs10030708  chr4  68129811 N       1       T       I   chr4:68129811-68129811   TMPRSS11F 389208         C         T
# 3  rs1017219 chr19  49070181 N       1       G       I  chr19:49070181-49070181       KCNA7   3743         A         G
# 4 rs10230120  chr7  51029366 N       1       G       I   chr7:51029366-51029366        COBL  23242         T         G
# 5 rs10233232  chr7  48046564 N       1       C       G   chr7:48046564-48046564     C7orf57 136288         T         C
# 6 rs10265083  chr7  65398479 N       2       T      IG   chr7:65398479-65398479       ZNF92 168374         C         T
  # GenomeRefSequence sampleSequence
# 1                 R              H
# 2                 A              T
# 3                 M              T
# 4                 D              A
# 5                 M              T
# 6                 A              V


# Agreement between our obs and paper obs is 637/640    
# Will calculate this agreement again (getting same answer) at end of this document but using different dataframe.
# Number of rows where our called base equals the paper's alt allele.
sum(with(HEK_merge_known_SNP, our_obs == paper_alt))
# [1] 637 <<<<<<<<<<<< use in paper


dim(HEK_merge_known_SNP)
# [1] 640  14 <<<<<<<<<<<< use in paper


# write.table(HEK_merge_known_SNP,"HEK_merge_known_SNP.txt",quote=FALSE,sep="\t",row.names=FALSE)


# ~~~~~~~~~~~ LLR random error model ~~~~~~~~~~~~~~~~~~~

# To get alleles and allele frequencies, read again:
# Re-read the hg38 SNP table, this time keeping columns 2-5, 8-9, 23 and 25,
# which are labelled below (coordinates, rs id, reference bases, allele list
# and allele-frequency list).
HEK_snp_paper_hg38 <- setNames(
  read.table(
    "HEK293_paper_SNP_hg38.txt",
    header = FALSE,
    sep = "\t",
    stringsAsFactors = FALSE
  )[, c(2:5, 8:9, 23, 25)],
  c(
    "chr", "start", "end", "dbSNP.id",
    "refNCBI", "refUCSC", "alleles", "alleleFreqs"
  )
)

dim(HEK_snp_paper_hg38)
# [1] 1295    8

head(HEK_snp_paper_hg38)
   # chr     start       end   dbSNP.id refNCBI refUCSC alleles        alleleFreqs
# 1 chr1  56956810  56956811  rs1013579       C       C    A,G, 0.977701,0.022299,
# 2 chr1 109202995 109202996  rs1052878       T       T    C,T, 0.905036,0.094964,
# 3 chr1  61914625  61914626  rs1056513       G       G    A,G, 0.559604,0.440396,
# 4 chr1  21247361  21247362  rs1076669       G       G    C,T, 0.945594,0.054406,
# 5 chr1 115033401 115033402 rs10776792       A       A    A,G, 0.018795,0.981205,
# 6 chr1 216421963 216421964 rs10779261       C       C    C,T, 0.269442,0.730558,


# Separate out allele frequencies
# `alleles` and `alleleFreqs` are comma-separated lists with a trailing comma
# (e.g. "A,G," and "0.977701,0.022299,"); strsplit() drops the trailing empty
# field. Split each column once instead of once per row and per output column.
allele_fields <- strsplit(HEK_snp_paper_hg38$alleles, ",")
freq_fields <- strsplit(HEK_snp_paper_hg38$alleleFreqs, ",")

# The first two alleles / frequencies are mandatory; fail with a clear
# message rather than a "subscript out of bounds" deep inside an lapply().
stopifnot(
  all(lengths(allele_fields) >= 2L),
  all(lengths(freq_fields) >= 2L)
)

# k-th field of every record, or `fill` for records with fewer than k fields.
nth_field <- function(fields, k, fill) {
  vapply(
    fields,
    function(f) if (length(f) >= k) f[[k]] else fill,
    character(1)
  )
}

# Frequencies: absent 3rd / 4th alleles get frequency 0.
for (k in 1:4) {
  HEK_snp_paper_hg38[[paste0("alleleFreq", k)]] <-
    as.numeric(nth_field(freq_fields, k, "0"))
}

# Alleles: absent 3rd / 4th alleles are the literal string "NA" (not a
# missing value), so that later `==` comparisons against these columns never
# yield NA and the row masks built from them stay valid.
for (k in 1:4) {
  HEK_snp_paper_hg38[[paste0("allele", k)]] <-
    nth_field(allele_fields, k, "NA")
}


dim(HEK_snp_paper_hg38)
# [1] 1295   16


head(HEK_snp_paper_hg38)
   # chr     start       end   dbSNP.id refNCBI refUCSC alleles        alleleFreqs alleleFreq1 alleleFreq2 alleleFreq3
# 1 chr1  56956810  56956811  rs1013579       C       C    A,G, 0.977701,0.022299,    0.977701    0.022299           0
# 2 chr1 109202995 109202996  rs1052878       T       T    C,T, 0.905036,0.094964,    0.905036    0.094964           0
# 3 chr1  61914625  61914626  rs1056513       G       G    A,G, 0.559604,0.440396,    0.559604    0.440396           0
# 4 chr1  21247361  21247362  rs1076669       G       G    C,T, 0.945594,0.054406,    0.945594    0.054406           0
# 5 chr1 115033401 115033402 rs10776792       A       A    A,G, 0.018795,0.981205,    0.018795    0.981205           0
# 6 chr1 216421963 216421964 rs10779261       C       C    C,T, 0.269442,0.730558,    0.269442    0.730558           0
  # alleleFreq4 allele1 allele2 allele3 allele4
# 1           0       A       G      NA      NA
# 2           0       C       T      NA      NA
# 3           0       A       G      NA      NA
# 4           0       C       T      NA      NA
# 5           0       A       G      NA      NA
# 6           0       C       T      NA      NA



# Read again:
# The write.table() call that produces this file uses quote = FALSE, so read
# it back with quoting and comment handling disabled; otherwise a ', " or #
# inside a field (e.g. in the pileup-derived our_ref column) would truncate
# or merge records.
HEK_merge_known_SNP <- read.table(
  "HEK_merge_known_SNP.txt",
  header = TRUE,
  sep = "\t",
  quote = "",
  comment.char = "",
  stringsAsFactors = FALSE
)

dim(HEK_merge_known_SNP)
# [1] 640  14

# merge hg38 dbSNP alleles and allele freqs (HEK_snp_paper_hg38) with paper alleles and our alleles:
HEK_snp_paper_hg38 <- merge(
  HEK_snp_paper_hg38,
  HEK_merge_known_SNP[, c("dbSNP.id", "paper_alt", "our_obs")],
  by = "dbSNP.id"
)

dim(HEK_snp_paper_hg38)
# [1] 668  18

head(HEK_snp_paper_hg38)
     # dbSNP.id   chr     start       end refNCBI refUCSC alleles                 alleleFreqs alleleFreq1 alleleFreq2 alleleFreq3
# 1  rs1000952  chr3 184038033 184038034       G       G  A,C,T, 0.000008,0.333621,0.666371,    0.000008    0.333621    0.666371
# 2 rs10030708  chr4  68129810  68129811       C       C    C,T,          0.700667,0.299333,    0.700667    0.299333    0.000000
# 3  rs1017219 chr19  49070180  49070181       A       A    A,G,          0.632284,0.367716,    0.632284    0.367716    0.000000
# 4 rs10230120  chr7  51029365  51029366       T       T    G,T,          0.966337,0.033663,    0.966337    0.033663    0.000000
# 5 rs10233232  chr7  48046563  48046564       T       T    C,T,          0.870176,0.129824,    0.870176    0.129824    0.000000
# 6 rs10265083  chr7  65398478  65398479       C       C    C,T,          0.766166,0.233834,    0.766166    0.233834    0.000000
  # alleleFreq4 allele1 allele2 allele3 allele4 paper_alt our_obs
# 1           0       A       C       T      NA         A       A
# 2           0       C       T      NA      NA         T       T
# 3           0       A       G      NA      NA         G       G
# 4           0       G       T      NA      NA         G       G
# 5           0       C       T      NA      NA         C       C
# 6           0       C       T      NA      NA         T       T




# Calculate LLR for agreement between our data and paper data
# Use allele frequencies for match because low pass sequencing only gives n = 1
# Alt hypothesis is p = 1, cf https://www.ncbi.nlm.nih.gov/books/NBK232615/
# eg do not care if pattern of matches is statistically significantly opposite to index case

# Rows where our base call agrees with the paper's alt allele. An NA here
# would make every row mask below invalid, so fail early if any are present.
is_match <- HEK_snp_paper_hg38$our_obs == HEK_snp_paper_hg38$paper_alt
stopifnot(!anyNA(is_match))

# dbSNP frequency of the agreed allele: for each matching row, the
# alleleFreq<k> whose allele<k> equals paper_alt. Stays NA for mismatches and
# for matches whose allele is not among allele1..allele4. Later k overrides
# earlier k, as in a sequence of per-allele assignments.
matched_freq <- rep(NA_real_, nrow(HEK_snp_paper_hg38))
for (k in 1:4) {
  hit <- which(
    is_match &
      HEK_snp_paper_hg38$paper_alt == HEK_snp_paper_hg38[[paste0("allele", k)]]
  )
  matched_freq[hit] <- HEK_snp_paper_hg38[[paste0("alleleFreq", k)]][hit]
}

# LLR for agreement between our data and paper data
HEK_snp_paper_hg38$LLR <- -log(matched_freq)

# Calculate chi square for agreement between our data and paper data
HEK_snp_paper_hg38$chisq <- (1 - matched_freq)^2 / matched_freq

# find p of success (hit)
HEK_snp_paper_hg38$p_hit <- matched_freq


# Private alleles for HEK293 do not have measurable allele frequencies in dbSNP, leaving NAs in LLR column. Replace with highest generic SNP rate, which is conservative

colMeans(HEK_snp_paper_hg38[, c("alleleFreq1", "alleleFreq2", "alleleFreq3", "alleleFreq4")])
# alleleFreq1 alleleFreq2 alleleFreq3 alleleFreq4 
# 0.494006448 0.459056774 0.045772130 0.001164647


# Highest rate is alleleFreq1, hence max imputed allele frequency:
imputed_max_allele_freq <- mean(HEK_snp_paper_hg38$alleleFreq1, na.rm = TRUE)

# Fill each statistic that is still NA on a matching row.
fill <- is_match & is.na(HEK_snp_paper_hg38$LLR)
HEK_snp_paper_hg38$LLR[fill] <- -log(imputed_max_allele_freq)

fill <- is_match & is.na(HEK_snp_paper_hg38$chisq)
HEK_snp_paper_hg38$chisq[fill] <-
  (1 - imputed_max_allele_freq)^2 / imputed_max_allele_freq

fill <- is_match & is.na(HEK_snp_paper_hg38$p_hit)
HEK_snp_paper_hg38$p_hit[fill] <- imputed_max_allele_freq


# Provide mismatches (there are four mismatches) with correct LLR == log(1) cf https://www.ncbi.nlm.nih.gov/books/NBK232615/
# together with the corresponding chisq (0) and p_hit (1).
is_mismatch <- !is_match
HEK_snp_paper_hg38$LLR[is_mismatch] <- -log(1)
HEK_snp_paper_hg38$chisq[is_mismatch] <- 0
HEK_snp_paper_hg38$p_hit[is_mismatch] <- 1


# Minimum allele freq is alleleFreq4. Hence min imputed allele frequency:
imputed_min_allele_freq <- mean(HEK_snp_paper_hg38$alleleFreq4, na.rm = TRUE)

# NOTE(review): this script previously went on to impute LLR / chisq / p_hit
# for mismatching rows that were still NA (LLR from imputed_min_allele_freq,
# chisq and p_hit from imputed_max_allele_freq). Those statements could never
# change anything, because every mismatching row has already been given
# LLR = -log(1), chisq = 0 and p_hit = 1 just above. They are therefore
# omitted; the results are unchanged. If imputation was meant to take
# precedence for mismatches, it has to run BEFORE the fixed values are set.


# df == 668, cf https://newonlinecourses.science.psu.edu/stat414/node/170/
dim(HEK_snp_paper_hg38)
# [1] 668  21

head(HEK_snp_paper_hg38)
    # dbSNP.id   chr     start       end refNCBI refUCSC alleles                 alleleFreqs alleleFreq1 alleleFreq2 alleleFreq3 alleleFreq4 allele1 allele2 allele3 allele4
# 1  rs1000952  chr3 184038033 184038034       G       G  A,C,T, 0.000008,0.333621,0.666371,    0.000008    0.333621    0.666371           0       A       C       T      NA
# 2 rs10030708  chr4  68129810  68129811       C       C    C,T,          0.700667,0.299333,    0.700667    0.299333    0.000000           0       C       T      NA      NA
# 3  rs1017219 chr19  49070180  49070181       A       A    A,G,          0.632284,0.367716,    0.632284    0.367716    0.000000           0       A       G      NA      NA
# 4 rs10230120  chr7  51029365  51029366       T       T    G,T,          0.966337,0.033663,    0.966337    0.033663    0.000000           0       G       T      NA      NA
# 5 rs10233232  chr7  48046563  48046564       T       T    C,T,          0.870176,0.129824,    0.870176    0.129824    0.000000           0       C       T      NA      NA
# 6 rs10265083  chr7  65398478  65398479       C       C    C,T,          0.766166,0.233834,    0.766166    0.233834    0.000000           0       C       T      NA      NA
  # paper_alt our_obs         LLR        chisq    p_hit
# 1         A       A 11.73606902 1.249980e+05 0.000008
# 2         T       T  1.20619861 1.640094e+00 0.299333
# 3         G       G  1.00044438 1.087206e+00 0.367716
# 4         G       G  0.03424264 1.172673e-03 0.966337
# 5         C       C  0.13905979 1.936881e-02 0.870176
# 6         T       T  1.45314382 2.510372e+00 0.233834




# LLR (chisq is 2*LLR)
sum(HEK_snp_paper_hg38$LLR)
# [1] 503.511

# chisq
2 * sum(HEK_snp_paper_hg38$LLR)
# [1] 1007.022

# Upper-tail p-value, with one degree of freedom per row.
pchisq(
  2 * sum(HEK_snp_paper_hg38$LLR),
  df = length(HEK_snp_paper_hg38$LLR),
  lower.tail = FALSE
)
# [1] 3.532839e-16


# df == 668, cf https://newonlinecourses.science.psu.edu/stat414/node/170/
length(HEK_snp_paper_hg38$LLR)
# [1] 668

# Highly significant!




# Alternate approach to LLR, from the per-row hit probabilities.
# Computed as -sum(log(p)) instead of log(1 / prod(p)): the two are
# mathematically identical, but prod() over many probabilities can underflow
# to 0 (giving Inf), whereas the sum of logs cannot.

# LLR
-sum(log(HEK_snp_paper_hg38$p_hit))
# [1] 503.511


# chi square
-2 * sum(log(HEK_snp_paper_hg38$p_hit))
# [1] 1007.022


# This p-value is identical to the LLR p-value above, as expected
pchisq(
  -2 * sum(log(HEK_snp_paper_hg38$p_hit)),
  df = length(HEK_snp_paper_hg38$p_hit),
  lower.tail = FALSE
)
# [1] 3.532839e-16





# Agreement between our obs and paper obs is 664/668  
sum(HEK_snp_paper_hg38$our_obs == HEK_snp_paper_hg38$paper_alt)
# [1] 664




# Repeat calculation using summed chisq statistic
# Highly significant
pchisq(
  sum(HEK_snp_paper_hg38$chisq),
  df = length(HEK_snp_paper_hg38$chisq),
  lower.tail = FALSE
)
# [1] 0







# >>>>>>>>>>>>>>>>>>>>>>>>> use in paper  >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> 

# >>>> Repeat analysis, getting rid of unanchored chromosome contigs <<<<<


# df == 640, cf https://newonlinecourses.science.psu.edu/stat414/node/170/
# Rows whose chromosome name does not contain "_alt", computed once.
# !grepl() is used instead of -grep(): when nothing matches, grep() returns
# integer(0), and indexing rows with -integer(0) selects ZERO rows rather
# than all of them.
HEK_no_alt <- HEK_snp_paper_hg38[!grepl("_alt", HEK_snp_paper_hg38$chr), ]

dim(HEK_no_alt)
# [1] 640  21

# LLR (chisq is 2*LLR)
sum(HEK_no_alt$LLR)
# [1] 491.1294

# chisq
2 * sum(HEK_no_alt$LLR)
# [1] 982.2588  <<<<<<<<<<<<<<<<<<<<<<<<< use in paper

# Upper-tail p-value, with one degree of freedom per retained row.
pchisq(2 * sum(HEK_no_alt$LLR), df = length(HEK_no_alt$LLR), lower.tail = FALSE)

# [1] 6.716719e-17    <<<<<<<<<<<<<<<<<<<<<<<<< use in paper

# Once again, agreement between our data and paper data:
# Agreement between our obs and paper obs is 637/640      <<<<<<<<<<<<<<<<<<<<<<<<< use in paper
sum(HEK_no_alt$our_obs == HEK_no_alt$paper_alt)
# [1] 637


dim(HEK_no_alt)
# [1] 640  21


# Fraction of retained rows where our call equals the paper's alt allele.
sum(HEK_no_alt$our_obs == HEK_no_alt$paper_alt) / dim(HEK_no_alt)[1]
# [1] 0.9953125 <<<<<<<<<<<<<<<<<<<<<<<<< use in paper


# df == 640, cf https://newonlinecourses.science.psu.edu/stat414/node/170/
length(HEK_no_alt$LLR)
# [1] 640 <<<<<<<<<<<<<<<<<<<<<<<<< use in paper





# Alternate approach to LLR, from the per-row hit probabilities.
# Computed as -sum(log(p)) instead of log(1 / prod(p)): mathematically
# identical, but immune to prod() underflowing to 0 on long vectors.

# LLR
-sum(log(HEK_no_alt$p_hit))
# [1] 491.1294


# chi square
-2 * sum(log(HEK_no_alt$p_hit))
# [1] 982.2588


# This p-value is identical to the LLR p-value above, as expected
pchisq(
  -2 * sum(log(HEK_no_alt$p_hit)),
  df = length(HEK_no_alt$p_hit),
  lower.tail = FALSE
)
# [1] 6.716719e-17







# Repeat calculation using summed chisq statistic
# Highly significant
pchisq(sum(HEK_no_alt$chisq), df = length(HEK_no_alt$chisq), lower.tail = FALSE)
# [1] 0


































