# Want to check our HEK293 sequence SNPs with publicly available data.
# Got our HEK293.bam from Arshad.




# ~~~~~~~~~~~~~ R ~~~~~~~~~~~~~~~~~~~~~~~~~

# Download Supplementary Data 2 from https://www.nature.com/articles/ncomms5767
# file name:
# 41467_2014_BFncomms5767_MOESM763_ESM.xls
# Spreadsheet: Novel common homozygous SNP


# Get SNPs from the 41467_2014_BFncomms5767_MOESM763_ESM.xls spreadsheet, sheet "Novel common homozygous SNP". Save columns chr, start, end, Type of SNP as HEK_novel_snp_paper_hg18.txt.
# Then simplified HEK_novel_snp_paper_hg18.txt a bit using Excel. Because these are novel SNPs that are not in dbSNP, the arbitrary SNP numbers provided in the paper's spreadsheet are used as identifiers. Then read in:


# Load the novel-SNP table saved from the paper's supplementary spreadsheet
# (tab-separated, with a header row; coordinates are hg18).
HEK_novel_snp_paper_hg18 <- read.table(
  "HEK_novel_snp_paper_hg18.txt",
  header = TRUE,
  sep = "\t",
  stringsAsFactors = FALSE
)

head(HEK_novel_snp_paper_hg18)
  # SNP.number                 Specifics chr    start      end
# 1          -                   G iso A  11 55617873       NA
# 2        679  A iso G, missense R to Q   1   970730   970731
# 3      30818   A isoG, missense A to T   1 19056661 19056662
# 4      35620  A iso G, missense S to L   1 22049229 22049230
# 5      42857 T iso C , missense P to L   1 26007847 26007848
# 6      62487  G iso A, missense Q to R   1 40701555 40701556

# Provide missing SNP.number (row 1 has "-" in the spreadsheet):
HEK_novel_snp_paper_hg18[1, "SNP.number"] <- 1

# Repair value missing from spreadsheet (row 1 has no end coordinate;
# every other row shown has end == start + 1):
HEK_novel_snp_paper_hg18[1, "end"] <- HEK_novel_snp_paper_hg18[1, "start"] + 1

# Trim Specifics col down to the single allele written after "iso":
# drop everything up to and including "iso", drop any ", missense ..." suffix,
# then strip ALL surrounding whitespace. trimws() is used so that double
# spaces or tabs cannot survive and later cause spurious allele mismatches.
# NOTE(review): this keeps the allele AFTER "iso". In the rows shown above the
# missense annotation is consistent with "X iso Y" meaning "X instead of Y"
# (e.g. "A iso G, missense R to Q": CGA -> CAA is a G>A change), which would
# make the kept allele the reference base rather than the HEK293 variant.
# Confirm against the paper before interpreting matches with paper_SNP.
paper_allele <- sub(".*iso", "", HEK_novel_snp_paper_hg18$Specifics)
paper_allele <- sub(",.*", "", paper_allele)
HEK_novel_snp_paper_hg18$Specifics <- trimws(paper_allele)

# Re-name and reorder cols (BED-like order: chr, start, end, then annotations):
colnames(HEK_novel_snp_paper_hg18)[c(1, 2)] <- c("SNP.id", "paper_SNP")
HEK_novel_snp_paper_hg18 <- HEK_novel_snp_paper_hg18[
  , c("chr", "start", "end", "SNP.id", "paper_SNP")
]
HEK_novel_snp_paper_hg18$chr <- paste0("chr", HEK_novel_snp_paper_hg18$chr)

dim(HEK_novel_snp_paper_hg18)
# [1] 107   5 <<<<<<<<<< use in paper

head(HEK_novel_snp_paper_hg18)
    # chr    start      end SNP.id paper_SNP
# 1 chr11 55617873 55617874      1         A
# 2  chr1   970730   970731    679         G
# 3  chr1 19056661 19056662  30818         G
# 4  chr1 22049229 22049230  35620         G
# 5  chr1 26007847 26007848  42857         C
# 6  chr1 40701555 40701556  62487         A

# write.table(HEK_novel_snp_paper_hg18,"HEK_novel_snp_paper_hg18.bed",quote=FALSE,sep="\t",row.names=FALSE,col.names=TRUE)

# Load bed file into https://uswest.ensembl.org/Homo_sapiens/Tools/AssemblyConverter/View?db=core;tl=vY3sQNx8DO8g69j7-4695306 to convert the coordinates to hg38. Resulting file is HEK_novel_snp_paper_hg38.txt


# Read the hg38 assembly-converter result (tab-separated, no header row).
HEK_novel_snp_paper_hg38 <- read.table(
  file = "HEK_novel_snp_paper_hg38.txt",
  header = FALSE,
  sep = "\t",
  stringsAsFactors = FALSE
)

dim(HEK_novel_snp_paper_hg38)
# [1] 107   5

# Name the five columns, keep the interval end as the single SNP position
# ("pos"), and prepend "chr" to the chromosome names.
names(HEK_novel_snp_paper_hg38) <- c(
  "chr", "start", "end", "SNP.id", "paper_SNP"
)
HEK_novel_snp_paper_hg38 <- HEK_novel_snp_paper_hg38[
  , c("chr", "end", "SNP.id", "paper_SNP")
]
names(HEK_novel_snp_paper_hg38)[2] <- "pos"
HEK_novel_snp_paper_hg38$chr <- paste0("chr", HEK_novel_snp_paper_hg38$chr)

# Region string "chr:pos-pos", used later as the samtools -r argument.
HEK_novel_snp_paper_hg38$coord <- with(
  HEK_novel_snp_paper_hg38,
  paste0(chr, ":", pos, "-", pos)
)

dim(HEK_novel_snp_paper_hg38)
# [1] 107   5

head(HEK_novel_snp_paper_hg38)
    # chr      pos SNP.id paper_SNP                   coord
# 1 chr11 56093822      1         A chr11:56093822-56093822
# 2  chr1  1045488    679         G    chr1:1045488-1045488
# 3  chr1 18857581  30818         G  chr1:18857581-18857581
# 4  chr1 21850150  35620         G  chr1:21850150-21850150
# 5  chr1 25808770  42857         C  chr1:25808770-25808770
# 6  chr1 40463297  62487         A  chr1:40463297-40463297


# write.table(HEK_novel_snp_paper_hg38$coord,"HEK_novel_snp_paper_hg38_simple.txt",quote=FALSE,row.names=FALSE,col.names=FALSE)


# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# ~~~~~~~~~~~~~ jump to START HERE to save time and disk space ~~~~~~~~~~~~~~~~~~
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~



# ~~~~~~~~~~~~~ In Mac OS X terminal: ~~~~~~~~~~~~~~~~

# Got our HEK293.bam from Arshad.


# Sort HEK293.bam and write the result to HEK293_aligned.sorted.bam
samtools sort -o HEK293_aligned.sorted.bam HEK293.bam

# Create the .bai index for the sorted file:
samtools index HEK293_aligned.sorted.bam



# bash script_novel. Note "our_data" is always hg38, now and until end of script.
# HEK293_novel_snp_our_data.txt is produced by the script

# Contents of script_novel: run samtools mpileup once per region listed in
# HEK_novel_snp_paper_hg38_simple.txt (one "chr:pos-pos" region per line) and
# collect all pileup lines in HEK293_novel_snp_our_data.txt.
# The redirection on the loop truncates the output file up front, so a stale
# file from an earlier run is replaced and a missing file is not an error
# (the previous plain `rm` failed when the file did not exist).
# `read -r` stops backslashes being interpreted, and `|| [ -n "$line" ]` keeps
# a final line that lacks a trailing newline.
while read -r line || [ -n "$line" ]; do
    # Skip blank lines so samtools is never given an empty region.
    [ -n "$line" ] || continue
    samtools mpileup -C 0 -r "$line" HEK293_aligned.sorted.bam
done < HEK_novel_snp_paper_hg38_simple.txt > HEK293_novel_snp_our_data.txt

./script_novel

# ~~~~~~~~~~~~~ R ~~~~~~~~~~~~~~~~~~~~~~~~~



# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# ~~~~~ START HERE TO SAVE TIME AND DISK SPACE ~~~~~~~~~~~~~~~
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~



# Read the samtools mpileup output written by script_novel (tab-separated,
# no header row).
# mpileup read-base and base-quality strings are free text that may contain
# ', " and # characters, or the literal text "NA". read.table's defaults would
# treat those as quotes, comment starts and missing values and could silently
# corrupt or truncate rows, so all three interpretations are switched off.
HEK293_novel_snp_our_data <- read.table(
  "HEK293_novel_snp_our_data.txt",
  header = FALSE,
  sep = "\t",
  quote = "",
  comment.char = "",
  na.strings = character(0),
  stringsAsFactors = FALSE
)
# NOTE(review): per the samtools mpileup format the six columns are chromosome,
# position, reference base, read depth, read bases and base qualities. The
# sixth column therefore appears to hold base qualities rather than a
# reference allele; the name "our_ref" is kept so existing outputs keep their
# column names -- confirm.
colnames(HEK293_novel_snp_our_data) <- c(
  "chr", "pos", "N", "num.nts", "our_obs", "our_ref"
)


dim(HEK293_novel_snp_our_data)
# [1] 57  6

head(HEK293_novel_snp_our_data)
   # chr       pos N num.nts our_obs our_ref
# 1 chr1   1045488 N       2      gg      GG
# 2 chr1  21850150 N       2      aa      GI
# 3 chr1  40463297 N       1       A       I
# 4 chr1  62450619 N       1       g       G
# 5 chr1 150444287 N       1       t       G
# 6 chr1 151286637 N       1       a       I


# Join our pileup calls (HEK293_novel_snp_our_data) with the paper's hg38 SNP
# table (HEK_novel_snp_paper_hg38) on chromosome and position, so that SNP.id,
# paper_SNP and coord sit alongside our observations. Only sites present in
# both tables are kept (merge's default).
HEK293_novel_snp_our_data_merge <- merge(
  x = HEK293_novel_snp_our_data,
  y = HEK_novel_snp_paper_hg38,
  by = c("chr", "pos")
)


dim(HEK293_novel_snp_our_data_merge)
# [1] 57  9


head(HEK293_novel_snp_our_data_merge)
   # chr       pos N num.nts our_obs our_ref SNP.id paper_SNP                    coord
# 1 chr1   1045488 N       2      gg      GG    679         G     chr1:1045488-1045488
# 2 chr1 150444287 N       1       t       G 189680         C chr1:150444287-150444287
# 3 chr1 151286637 N       1       a       I 191043         G chr1:151286637-151286637
# 4 chr1  21850150 N       2      aa      GI  35620         G   chr1:21850150-21850150
# 5 chr1 233139847 N       2      Ag      IG 318895         G chr1:233139847-233139847
# 6 chr1  40463297 N       1       A       I  62487         A   chr1:40463297-40463297



# Reduce our_obs to one upper-case base call per site: the first read's base.
# In mpileup read-base strings "^" marks the start of a read and is followed
# by ONE character encoding that read's mapping quality. Both characters are
# stripped whatever the quality character is (previously only "^:" was
# handled, so any other mapping quality left "^" as the called base).
HEK293_novel_snp_our_data_merge$our_obs <- sub(
  "^\\^.", "", HEK293_novel_snp_our_data_merge$our_obs
)
HEK293_novel_snp_our_data_merge$our_obs <- toupper(
  substr(HEK293_novel_snp_our_data_merge$our_obs, 1, 1)
)


dim(HEK293_novel_snp_our_data_merge)
# [1] 57  9


head(HEK293_novel_snp_our_data_merge)
   # chr       pos N num.nts our_obs our_ref SNP.id paper_SNP                    coord
# 1 chr1   1045488 N       2       G      GG    679         G     chr1:1045488-1045488
# 2 chr1 150444287 N       1       T       G 189680         C chr1:150444287-150444287
# 3 chr1 151286637 N       1       A       I 191043         G chr1:151286637-151286637
# 4 chr1  21850150 N       2       A      GI  35620         G   chr1:21850150-21850150
# 5 chr1 233139847 N       2       A      IG 318895         G chr1:233139847-233139847
# 6 chr1  40463297 N       1       A       I  62487         A   chr1:40463297-40463297



# write.table(HEK293_novel_snp_our_data_merge,"HEK293_novel_snp_our_data_merge.txt",quote=FALSE,sep="\t",row.names=FALSE)

# ~~~~~~~~~~~~~ Simple chisq test ~~~~~~~~~~~

# Agreement between our obs and paper obs (32/57 when this was last run).
# The counts are derived once from the data and reused below, so the
# chi-square test can never drift out of step with the merged table.
obs_matches_paper <- with(
  HEK293_novel_snp_our_data_merge,
  our_obs == paper_SNP
)
n_match <- sum(obs_matches_paper)
n_sites <- nrow(HEK293_novel_snp_our_data_merge)

n_match
# [1] 32 <<<<<<<<<<< use in paper

dim(HEK293_novel_snp_our_data_merge)
# [1] 57  9 <<<<<<<<<<< use in paper


n_match / n_sites
# [1] 0.5614035 <<<<<<<<<<< use in paper


# Goodness-of-fit test of (matches, mismatches) against the expectation that a
# random base matches with probability 0.25.
agreement_counts <- c(n_match, n_sites - n_match)
chisq.test(agreement_counts, p = c(0.25, 0.75))

	# Chi-squared test for given probabilities

# data:  agreement_counts
# X-squared = 29.48, df = 1, p-value = 5.651e-08


# ~~~~~~~~~~~ LLR random error model ~~~~~~~~~~~~~~~~~~~


# use these results in paper, largely for reasons of consistency with known SNPs.
# actually, simple chi sq test above is more reasonable, but LLR is more conservative anyway.
# Use postulated allele frequencies for match because low pass sequencing only gives n = 1
# Alt hypothesis is p = 1, cf https://www.ncbi.nlm.nih.gov/books/NBK232615/
# eg do not care if pattern of matches is statistically significantly opposite to index case

# Per-SNP log-likelihood-ratio term: -log(0.25) where our call equals the
# paper's allele, -log(1) (i.e. zero) where it does not.
llr_is_match <- with(HEK293_novel_snp_our_data_merge, our_obs == paper_SNP)
HEK293_novel_snp_our_data_merge$LLR <- ifelse(
  llr_is_match, -log(0.25), -log(1)
)



# df == 57, cf https://newonlinecourses.science.psu.edu/stat414/node/170/
# NOTE(review): the degrees of freedom are taken as the number of SNP rows;
# confirm this is the intended reference distribution for 2*LLR.
dim(HEK293_novel_snp_our_data_merge)
# [1] 57  10

head(HEK293_novel_snp_our_data_merge)
   # chr       pos N num.nts our_obs our_ref SNP.id paper_SNP                    coord      LLR
# 1 chr1   1045488 N       2       G      GG    679         G     chr1:1045488-1045488 1.386294
# 2 chr1 150444287 N       1       T       G 189680         C chr1:150444287-150444287 0.000000
# 3 chr1 151286637 N       1       A       I 191043         G chr1:151286637-151286637 0.000000
# 4 chr1  21850150 N       2       A      GI  35620         G   chr1:21850150-21850150 0.000000
# 5 chr1 233139847 N       2       A      IG 318895         G chr1:233139847-233139847 0.000000
# 6 chr1  40463297 N       1       A       I  62487         A   chr1:40463297-40463297 1.386294


# LLR (chisq is 2*LLR): total of the per-SNP terms
llr_total <- sum(HEK293_novel_snp_our_data_merge$LLR)
llr_total
# [1] 44.36142

# chisq
llr_chisq <- 2 * llr_total
llr_chisq
# [1] 88.72284 <<<<<<<<<<<<<<<<<< use in paper

# Upper-tail chi-square probability, using one degree of freedom per SNP row.
llr_df <- length(HEK293_novel_snp_our_data_merge$LLR)
pchisq(llr_chisq, df = llr_df, lower.tail = FALSE)
# [1] 0.004527615  <<<<<<<<<<<<<<<<<<<<<<<<< use this in paper

# Reasonably significant!

# Amount of agreement = 32/57   <<<<<<<<<<<<<<<<<<<<<<<<< use in paper

llr_matching_rows <- with(
  HEK293_novel_snp_our_data_merge,
  our_obs == paper_SNP
)
dim(HEK293_novel_snp_our_data_merge[llr_matching_rows, ])
# [1] 32 10

dim(HEK293_novel_snp_our_data_merge)
# [1] 57 10


# df
llr_df
#. [1] 57 <<<<<<<<<<<<<<<<<<<<<<<<< use in paper





# ~~~~~~~~~~~~~ Alternate approach to LLR ~~~~~~~~~~~~~~~~~~~~~~~~~


# Per-SNP probability under the random-match model: 0.25 where our call equals
# the paper's allele, 1 where it does not.
p_hit_is_match <- with(HEK293_novel_snp_our_data_merge, our_obs == paper_SNP)
HEK293_novel_snp_our_data_merge$p_hit <- ifelse(p_hit_is_match, 0.25, 1)


# LLR, this time from the product of the per-SNP probabilities.
# NOTE(review): prod() of very many 0.25 factors would underflow to zero;
# fine at this table size, but keep in mind if the SNP list grows a lot.
alt_llr <- log(1 / prod(HEK293_novel_snp_our_data_merge$p_hit))
alt_llr
# [1] 44.36142



# chi square
2 * alt_llr
# [1] 88.72284


# This LLR pval calc is identical to LLR pval, as expected
pchisq(
  2 * alt_llr,
  df = length(HEK293_novel_snp_our_data_merge$p_hit),
  lower.tail = FALSE
)
# [1] 0.004527615









# ~~~~~~~~~~~ sum of chi sq model ~~~~~~~~~~~~~~~~~~~


# actually, simple chi sq test above is more reasonable, but explore cumulative chisq since more conservative anyway.

# Per-SNP chi-square term: (1 - 0.25)^2 / 0.25 where our call equals the
# paper's allele, 0 where it does not.
chisq_is_match <- with(HEK293_novel_snp_our_data_merge, our_obs == paper_SNP)
HEK293_novel_snp_our_data_merge$chisq <- ifelse(
  chisq_is_match, (1 - 0.25)^2 / 0.25, 0
)



# df == 57, cf https://newonlinecourses.science.psu.edu/stat414/node/170/
dim(HEK293_novel_snp_our_data_merge)
# [1] 57  11

head(HEK293_novel_snp_our_data_merge)
   # chr       pos N num.nts our_obs our_ref SNP.id paper_SNP                    coord      LLR chisq
# 1 chr1   1045488 N       2       G      GG    679         G     chr1:1045488-1045488 1.386294  2.25
# 2 chr1 150444287 N       1       T       G 189680         C chr1:150444287-150444287 0.000000  0.00
# 3 chr1 151286637 N       1       A       I 191043         G chr1:151286637-151286637 0.000000  0.00
# 4 chr1  21850150 N       2       A      GI  35620         G   chr1:21850150-21850150 0.000000  0.00
# 5 chr1 233139847 N       2       A      IG 318895         G chr1:233139847-233139847 0.000000  0.00
# 6 chr1  40463297 N       1       A       I  62487         A   chr1:40463297-40463297 1.386294  2.25





# Repeat the calculation using the summed per-SNP chi-square statistic
# sum chisq
chisq_total <- sum(HEK293_novel_snp_our_data_merge$chisq)
chisq_total
# [1] 72


# Gives marginal n.s., different from simple chisq above. Simple chisq more correct.
chisq_df <- length(HEK293_novel_snp_our_data_merge$chisq)
pchisq(chisq_total, df = chisq_df, lower.tail = FALSE)
# [1] 0.08708933



# Amount of agreement = 32/57

chisq_matching_rows <- with(
  HEK293_novel_snp_our_data_merge,
  our_obs == paper_SNP
)
dim(HEK293_novel_snp_our_data_merge[chisq_matching_rows, ])
# [1] 32 12

dim(HEK293_novel_snp_our_data_merge)
# [1] 57 12


# df == 57, cf https://newonlinecourses.science.psu.edu/stat414/node/170/
chisq_df
#. [1] 57


































