# hicScaling.py
# Jonathan M. Galazka, Andrew D. Klocko
# 2015
# Usage: python ./hicScaling.py
#
# Use to plot the log/log scaling of HiC datasets.

import numpy as np
import matplotlib.pyplot as plt
from scipy import stats

# datasets to plot
datasets = [
	'NMF39_1',
	'N3944_1',
]

# set-up plots: one figure/axes pair per plot type
#   1: scaling on log/log axes      2: scaling on linear axes
#   3: log10 regression window      4: power-law fit on linear axes
#   5: cumulative contact probability
(f1, ax1), (f2, ax2), (f3, ax3), (f4, ax4), (f5, ax5) = [
	plt.subplots(1) for _ in range(5)
]

# position of the dataset currently being plotted; used to pick its color
counter = 0

# colors to use for respective datasets (same order as `datasets`)
colors = [
	'blue', 'red', 'green', 'red', 'green', 'green',
	'black', 'orange', 'purple',
]

# window for some plots, expressed as a number of 40 kb bins.
# Floor division keeps both bounds as ints: they are used as slice bounds
# further down, and NumPy raises a TypeError for float slice indices
# (5.0 instead of 5).
start = 200000 // 40000 # 200 kb -> bin 5
end = 2000000 // 40000 # 2 mb -> bin 50

# loop through datasets
#
# For each dataset: load the heatmap and the chromosome boundaries, compute
# the mean contact probability per diagonal (genomic distance) over the
# chromosomes, then plot/print the probabilities, their cumulative sum and a
# power-law fit over the start..end window.
for dataset in datasets:

	print(dataset)
	resolution = 40 # bin size in kb
	bin_size = resolution * 1000 # bin size in bp
	resolution_string = str(bin_size)
	n_chrom = 7 # number of chromosomes that are analysed

	# both input files live in the same per-dataset directory
	dataset_dir = '/Volumes/HD/HiC2/' + dataset + '/' + dataset + '_ic-' + \
					resolution_string + '-diag-txt/'
	datasetpath = dataset_dir + 'heatmap'
	chr_starts_path = dataset_dir + 'chromosomeStarts'

	# get chromosome starts and stops (row/column indices into the heatmap):
	# entry i is used as the start of chromosome i, entry i + 1 as its end
	chr_bounds = np.loadtxt(chr_starts_path, delimiter=' ').astype(int)
	chr_starts = np.transpose(chr_bounds[0:n_chrom])
	chr_ends = np.transpose(chr_bounds[1:n_chrom + 1])
	chr_lens = chr_ends - chr_starts
	min_len = np.min(chr_lens)

	array = np.loadtxt(datasetpath, delimiter=' ')

	# loop through the chromosomes
	total_probs = np.zeros(min_len - 1) # array to hold data from all chrom.
	for i in range(n_chrom):

		c_start = chr_starts[i] # get chr. start
		c_end = chr_ends[i] # get chr. end

		# sum of all interactions made by chrom, including with other chrom
		half_chr_total = np.sum(array[c_start:c_end, :])

		# get only intra-chrom interactions
		chr_array = array[c_start:c_end, c_start:c_end]

		# observed[k] = summed interactions along the k-th upper diagonal,
		# i.e. between bins that are k bins apart
		observed = np.array([np.sum(np.diag(chr_array, k))
							for k in range(chr_array.shape[0])])

		# total intra-chrom interactions above the main diagonal
		diag_observed_sums = np.sum(observed[1:])

		# the rows of the chromosome contain every off-diagonal intra-chrom
		# interaction twice (upper and lower triangle), so remove one copy
		corrected_total = half_chr_total - diag_observed_sums

		prob = observed / corrected_total # calculate probability

		# drop the main diagonal (k = 0) and truncate to the length of the
		# shortest chrom so that all chromosomes can be summed element-wise
		total_probs = total_probs + prob[1:min_len]

	total_probs = total_probs / float(n_chrom) # get mean of total probabilities

	# calculate additive (cumulative) probability
	sliding_prob = np.cumsum(total_probs)

	# genomic distance (mb) of every entry: total_probs[j] belongs to a
	# distance of j + 1 bins because the main diagonal was dropped
	scaled_x = (bin_size * np.arange(1, total_probs.size + 1)) / 1000000.0

	# plot additive probability
	ax5.plot(scaled_x, sliding_prob, color=colors[counter])

	# output additive probability
	print('Genomic distance')
	for value in scaled_x:
		print(value)
	print('Sliding probabilities')
	for value in sliding_prob:
		print(value)

	# plot probabilities
	ax1.plot(scaled_x, total_probs, color=colors[counter])
	ax2.plot(scaled_x, total_probs, color=colors[counter])

	# output probabilities
	print('Genomic distance')
	for value in scaled_x:
		print(value)
	print('Probabilities')
	for value in total_probs:
		print(value)

	# take chunk between 200 kb and 2 mb for linear regression.
	# Distances of start..end-1 bins are stored at indices start-1..end-2
	# (see scaled_x above); slicing with the unshifted bounds would pair every
	# probability with a distance that is one bin too short.
	# int() guards against start/end holding float values (e.g. 5.0), which
	# NumPy rejects as slice indices.
	window = slice(int(start) - 1, int(end) - 1)
	for_linreg = total_probs[window]
	for_linreg_log = np.log10(for_linreg)

	# calculate linear regression on the log/log data; x is taken from the
	# same slice so distances and probabilities match element by element
	x = scaled_x[window]
	logx = np.log10(x)
	slope, intercept, r_value, p_value, std_err = stats.linregress(logx, for_linreg_log)
	r_squared = r_value ** 2
	print('slope= ' + str(slope))
	#print('intercept= ' + str(intercept))
	#print('p value= ' + str(p_value))
	print('r squared= ' + str(r_squared))
	# back-transform the intercept to get the prefactor of the power law
	# probability = transformed_intercept * distance ** slope
	transformed_intercept = 10 ** intercept
	#print('transformed intercept= ' + str(transformed_intercept))

	# plot log transformed linear regression
	ax3.plot(logx, (slope * logx) + intercept, '--', lw=2, color=colors[counter])
	ax3.plot(logx, for_linreg_log, 'o', color=colors[counter])

	# plot linear regression (as a power law curve on linear axes)
	ax4.plot(x, for_linreg, 'o', color=colors[counter])
	ax4.plot(x, transformed_intercept * (x ** slope),'--', lw=2, color=colors[counter])

	counter += 1 # next dataset gets the next color


# the first plot shows the scaling on log/log axes
ax1.set_xscale("log")
ax1.set_yscale("log")

# (axes, title, y label, x label) for each of the five plots
plot_labels = [
	(ax1, 'Scaling (log10)',
		'Contact probability (log10)', 'Distance (log10 mb)'),
	(ax2, 'Scaling',
		'Contact probability', 'Distance (mb)'),
	(ax3, 'Scaling (log10) 0.2 - 2 mb & linear regression',
		'Contact probability (log10)', 'Distance (log10 mb)'),
	(ax4, 'Scaling 0.2 - 2 mb & power law curve',
		'Contact probability', 'Distance (mb)'),
	(ax5, 'Probability of contact below distance',
		'Cumulative contact probability', 'Distance (mb)'),
]

for axes, title, y_label, x_label in plot_labels:
	axes.set_title(title)
	axes.set_ylabel(y_label)
	axes.set_xlabel(x_label)

# display all figures
plt.show()