importgzip,os,sysfromIPython.displayimportHTMLimportpandasaspdimportseabornassnsimportmatplotlib.pyplotaspltimportscipy.statsasssimportscipyasspimportnumpyasnpfromcollectionsimportdefaultdictfromscipy.signalimportargrelmin,argrelmaximportmathimportmatplotlib.patchesaspatches#import custom functions for displaying tables, bash commandssys.path.append(os.path.abspath("/home/damian/"))fromdk_ipythonimport*%matplotlibinlineHTML(addToggle())The following FACS datasets were used to calculate FACS proportional expression values for annotated loci.
rna_pearson_x1_na_na_SRR2009674
rna_pearson_x1_na_na_SRR2009675
rna_pearson_x1_na_na_SRR496276
rna_pearson_x2_na_na_SRR2009676
rna_pearson_x2_na_na_SRR2009677
rna_pearson_x2_na_na_SRR496278
rna_pearson_xins_na_na_SRR496280
rna_rajewsky_x1_na_na_oenalx1
rna_rajewsky_x2_na_na_oenalx2
rna_rajewsky_xins_na_na_oenalxins
rna_reddien_x1_na_000h_SRR1302023
rna_reddien_x1_na_000h_SRR1302024
rna_reddien_x2_na_000h_SRR1302025
rna_reddien_x2_na_000h_SRR1302026
rna_sanchez_x1_na_na_SRR2407874
rna_sanchez_x1_na_na_SRR2407875
rna_sanchez_x1_na_na_SRR2407876
rna_sanchez_x1_na_na_SRR2407877
rna_sanchez_xins_na_na_SRR2407878
rna_sanchez_xins_na_na_SRR2407879
rna_sanchez_xins_na_na_SRR2407880
rna_sanchez_xins_na_na_SRR2407881
*The last SRRXXXXXXX number is the Run accession ID
TPM values were generated using Kallisto. Sleuth was used to calculate the normalization factors. TPM can be aggregated by gene by summing the TPM of individual transcripts of a gene.
Underneath is a hierarchical clustering of FACS datasets using normalized transcripts per million (TPM). Dendograms are shown using both Pearson and Spearman correlations. Note how some RNA-seq datasets do not cluster well with other RNA-seq datasets of the same category. This is most likely owing to a difference in FACS gating between experimental replicates and labs.
kalPath='/home/share/projects/smed_neoblast/kallisto/'importscipy.spatial.distanceasdistimportscipy.cluster.hierarchyashierdefcreateDF(kalPath,szPath):libs=[[x.strip().split()[0],float(x.strip().split()[1])]forxin \
open(szPath).read().strip().split('\n')[1:]]libSeries=[]cols=[]forlibinlibs:path=kalPath+lib[0]+'/abundance.tsv'gids=defaultdict(float)d=open(path)d.next()forlineind:meta=line.strip().split()gids['.'.join(meta[0].split('.')[:-1])]+=(float(meta[-1])*lib[1])d.close()libMeta=lib[0].split('_')libName=libMeta[1]+' '+libMeta[2]+' '+libMeta[-1]cols.append(libName)s=pd.Series(gids)libSeries.append(s)df=pd.concat(libSeries,axis=1)df.columns=colsreturndfdefplotCorrelation(df,w,h,t):d=dist.pdist(df.T,'correlation')link=hier.linkage(d,method='complete')plt.figure(figsize=[w,h])hier.dendrogram(link,labels=[' '.join(x.split('_'))forxindf.axes[1]],orientation='left')plt.xlabel('Distance')plt.ylabel('Samples')plt.title(t,fontweight='bold',fontsize=14)defplotSpearmanCorrelation(df,w,h,t):d=dist.pdist(df.rank(1).T,'correlation')link=hier.linkage(d,method='complete')plt.figure(figsize=[w,h])hier.dendrogram(link,labels=[' '.join(x.split('_'))forxindf.axes[1]],orientation='left')plt.xlabel('Distance')plt.ylabel('Samples')plt.title(t,fontweight='bold',fontsize=14)df_nb=createDF('/home/share/projects/smed_neoblast/kallisto/','/home/share/projects/smed_neoblast/neoblast.tpm.sizeFactors')plotCorrelation(df_nb[(df_nb>0).all(axis=1)],12,7,'Hierarchical clustering using Pearson Correlation (normalized TPM)')plotSpearmanCorrelation(df_nb[(df_nb>0).all(axis=1)],12,7,'Hierarchical clustering using Spearman Correlation (ranked TPM)')For each experimental group of FACS libraries, the following proportional values were calculated for each type of comparison (where available):
x1 vs x2
x1 proportion = x1 / (x1 + x2)
x2 proportion = x2 / (x1 + x2)
x1 vs xins
x1 proportion = x1 / (x1 + xins)
xins proportion = xins / (x1 + xins)
x2 vs xins
x2 proportion = x2 / (x2 + xins)
xins proportion = xins / (x2 + xins)
x1 vs x2 vs xins
x1 proportion = x1 / (x1 + x2 + xins)
x2 proportion = x2 / (x1 + x2 + xins)
xins proportion = xins / (x1 + x2 + xins)
We normalized datasets from each lab separately instead of across all RNA-seq datasets.
The following figure shows the correlation of the FACS libraries after this transformation. Only Pearson and Rajewsky have all three X1, X2, and Xins libraries. Reddien has X1 and X2. Sanchez and X1 and Xins.
df_pearson=createDF('/home/share/projects/smed_neoblast/kallisto/','/home/share/projects/smed_neoblast/neoblast.pearson.tpm.sizeFactors')df_reddien=createDF('/home/share/projects/smed_neoblast/kallisto/','/home/share/projects/smed_neoblast/neoblast.reddien.tpm.sizeFactors')df_sanchez=createDF('/home/share/projects/smed_neoblast/kallisto/','/home/share/projects/smed_neoblast/neoblast.sanchez.tpm.sizeFactors')df_rajewsky=createDF('/home/share/projects/smed_neoblast/kallisto/','/home/share/projects/smed_neoblast/neoblast.rajewsky.tpm.sizeFactors')df_pearson['x1']=df_pearson[["pearson x1 SRR2009674",\
"pearson x1 SRR2009675",\
"pearson x1 SRR496276"]].mean(axis=1)df_pearson['x2']=df_pearson[["pearson x2 SRR2009676",\
"pearson x2 SRR2009677",\
"pearson x2 SRR496278"]].mean(axis=1)df_pearson['xins']=df_pearson['pearson xins SRR496280']df_reddien['x1']=df_reddien[["reddien x1 SRR1302023",\
"reddien x1 SRR1302024"]].mean(axis=1)df_reddien['x2']=df_reddien[["reddien x2 SRR1302025",\
"reddien x2 SRR1302026"]].mean(axis=1)df_rajewsky['x1']=df_rajewsky['rajewsky x1 oenalx1']df_rajewsky['x2']=df_rajewsky['rajewsky x2 oenalx2']df_rajewsky['xins']=df_rajewsky['rajewsky xins oenalxins']df_sanchez['x1']=df_sanchez[["sanchez x1 SRR2407874",\
"sanchez x1 SRR2407875",\
"sanchez x1 SRR2407876",\
"sanchez x1 SRR2407877"]].mean(axis=1)df_sanchez['xins']=df_sanchez[["sanchez xins SRR2407878",\
"sanchez xins SRR2407879",\
"sanchez xins SRR2407880",\
"sanchez xins SRR2407881"]].mean(axis=1)deftransformProp(df):pops=[]if'x1'indf:pops.append('x1')if'x2'indf:pops.append('x2')if'xins'indf:pops.append('xins')fromitertoolsimportcombinationscomps=[xforxincombinations(pops,2)]forcompincomps:compName=comp[0]+'-'+comp[1]a=comp[0]b=comp[1]df[compName+'_'+a+'prop']=df[a]/(df[a]+df[b])*100df[compName+'_'+b+'prop']=df[b]/(df[a]+df[b])*100iflen(pops)==3:df['all_x1prop']=df['x1']/(df['x1']+df['x2']+df['xins'])df['all_x2prop']=df['x2']/(df['x1']+df['x2']+df['xins'])df['all_xinsprop']=df['xins']/(df['x1']+df['x2']+df['xins'])transformProp(df_reddien)transformProp(df_rajewsky)transformProp(df_pearson)transformProp(df_sanchez)x1_x2=[]x1_x2.append(df_pearson['x1-x2_x1prop'])x1_x2.append(df_pearson['x1-x2_x2prop'])x1_x2.append(df_rajewsky['x1-x2_x1prop'])x1_x2.append(df_rajewsky['x1-x2_x2prop'])x1_x2.append(df_reddien['x1-x2_x1prop'])x1_x2.append(df_reddien['x1-x2_x2prop'])x1_x2=pd.concat(x1_x2,axis=1)x1_x2.columns=['pearson x1','pearson x2','rajewsky x1','rajewsky x2','reddien x1','reddien x2']x1_xins=[]x1_xins.append(df_pearson['x1-xins_x1prop'])x1_xins.append(df_pearson['x1-xins_xinsprop'])x1_xins.append(df_rajewsky['x1-xins_x1prop'])x1_xins.append(df_rajewsky['x1-xins_xinsprop'])x1_xins.append(df_sanchez['x1-xins_x1prop'])x1_xins.append(df_sanchez['x1-xins_xinsprop'])x1_xins=pd.concat(x1_xins,axis=1)x1_xins.columns=['pearson x1','pearson xins','rajewsky x1','rajewsky xins','sanchez x1','sanchez xins']allProp=[]allProp.append(df_pearson['all_x1prop'])allProp.append(df_pearson['all_x2prop'])allProp.append(df_pearson['all_xinsprop'])allProp.append(df_rajewsky['all_x1prop'])allProp.append(df_rajewsky['all_x2prop'])allProp.append(df_rajewsky['all_xinsprop'])allProp=pd.concat(allProp,axis=1)allProp.columns=['pearson x1','pearson x2','pearson xins','rajewsky x1','rajewsky x2','rajewsky xins']plotCorrelation(x1_x2.dropna(),4,4,'Hierarchical clustering of X1 and X2')plotCorrelation(x1_xins.dropna(),4,4,'Hierarchical clustering of X1 and Xins')plotCorrelation(allProp.dropna(),4,4,'Hierarchical clustering of X1, X2 and Xins')We now calculate three sets of pair-wise ratios (X1 vs X2, X1 vs Xins, X2 vs Xins) and consolidate them into a final set of X1:X2:Xins ratios. Given two of the three pair-wise ratios, we can calculate a predicted ratio for the third. Then we can compare the predicted ratio to the actual observed ratio to see how well they correlate (Spearman correlation). This gives us an idea of how well the datasets agree with each other.
For example, Pearson, Rajewsky, and Reddien have X1 and X2 libraries. We average the X1 proportions from each of the three libraries. We also average the X2 proportions from each of the three libraries.
We divide the average X1 by average X2 to give us the 'observed' X1:X2 ratio.
Next, we calculate the 'predicted' X1:X2 ratio: We have the X2:Xins ratio and X1:Xins ratios. We essentially divide these ratios by each other to give us the predicted X1:X2 ratio as (X1/Xins / X2/Xins) = (X1*Xins / Xins/X2) = X1/X2
We then calculate a correlation between the predicted and observed ratios for each of the three ratios.
x1_x2_x1propmean=x1_x2[['pearson x1','rajewsky x1','reddien x1']].mean(axis=1)x1_x2_x2propmean=x1_x2[['pearson x2','rajewsky x2','reddien x2']].mean(axis=1)x1_xins_x1propmean=x1_xins[['pearson x1','rajewsky x1','sanchez x1']].mean(axis=1)x1_xins_xinspropmean=x1_xins[['pearson xins','rajewsky xins','sanchez xins']].mean(axis=1)x2_xins_x2propmean=(df_pearson['x2-xins_x2prop']+df_rajewsky['x2-xins_x2prop'])/2x2_xins_xinspropmean=(df_pearson['x2-xins_xinsprop']+df_rajewsky['x2-xins_xinsprop'])/2x1_x2_predict=(x1_xins_x1propmean*x2_xins_xinspropmean)/(x1_xins_xinspropmean*x2_xins_x2propmean)x1_x2_observed=x1_x2_x1propmean/x1_x2_x2propmeanx1_xins_predict=(x1_x2_x1propmean*x2_xins_x2propmean)/(x1_x2_x2propmean*x2_xins_xinspropmean)x1_xins_observed=x1_xins_x1propmean/x1_xins_xinspropmeanx2_xins_predict=(x1_x2_x2propmean*x1_xins_x1propmean)/(x1_x2_x1propmean*x1_xins_xinspropmean)x2_xins_observed=x2_xins_x2propmean/x2_xins_xinspropmeandefpred_obs_corr(pred,obs):a=pd.concat([pred,obs],axis=1)a.columns=['pred','obs']a=a.replace([np.inf,-np.inf],np.nan).dropna()returna['pred'].corr(a['obs'],method='spearman')print'Correlation of x1 vs x2 predicted ratio and observed ratio:',pred_obs_corr(x1_x2_predict,x1_x2_observed)print'Correlation of x1 vs xins predicted ratio and observed ratio:',pred_obs_corr(x1_xins_predict,x1_xins_observed)print'Correlation of x2 vs xins predicted ratio and observed ratio:',pred_obs_corr(x2_xins_predict,x2_xins_observed)To generate the final consolidated proportions, we used the utilized the well correlated ratios X2:Xins.
To calculate X1 average TPMs we calculated a scale factor by dividing the X2 TPMs calculated for the X2:Xins ratio by the X2 TPMs calculated for the X1:X2 ratio.
We then used this scale factor to multiply the X1 TPMs calculated for the poorly correlated X1:X2 ratio.
Finally we converted each X1,X2,Xins TPM to a percentage value.
f=x2_xins_x2propmean/x1_x2_x2propmeanx1_scaled=x1_x2_x1propmean*fx2_scaled=x2_xins_x2propmeanxins_scaled=x2_xins_xinspropmeanfinalRatio=pd.concat([x1_scaled,x2_scaled,xins_scaled],axis=1)finalRatio.columns=['x1','x2','xins']finalPercent=pd.DataFrame()finalPercent['x1']=finalRatio['x1']/(finalRatio['x1']+finalRatio['x2']+finalRatio['xins'])*100finalPercent['x2']=finalRatio['x2']/(finalRatio['x1']+finalRatio['x2']+finalRatio['xins'])*100finalPercent['xins']=finalRatio['xins']/(finalRatio['x1']+finalRatio['x2']+finalRatio['xins'])*100Loci were classified by proportion expression in the following way:
X1: X1 proportion >= 50%
X2: X2 proportion >= 50%
Xins: Xins proportion >= 50%
X1/X2: X1 proportion + X2 proportion >= 75% and not in X1, X2, or Xins lists
X2/Xins: X2 proportion + Xins proportion >= 75% and not in X1, X2, or Xins lists
X1/Xins: X1 proportion + Xins proportion >= 75% and not in X1, X2, or Xins lists
Ubiquitous: All leftover loci
Underneath is a ternary plot representing the above categories.
importternarypropFile=open('/home/share/projects/smed_transcriptome/lists/facs_proportions.tab')lids=[]props={}propIndex=[]df_props=[]forlineinpropFile:meta=line.strip().split()propIndex.append(meta[0])props[meta[0]]=map(float,meta[1:])df_props.append(map(float,meta[1:]))propFile.close()df_props=pd.DataFrame(df_props,index=propIndex,columns=['x1','x2','xins'])x1_lids=set([kfork,vinprops.items()ifv[0]>=50.0])x2_lids=set([kfork,vinprops.items()ifv[1]>=50.0])xins_lids=set([kfork,vinprops.items()ifv[2]>=50.0])x12_lids=set([kfork,vinprops.items()ifv[0]+v[1]>=75.0])-x1_lids-x2_lids-xins_lidsx2ins_lids=set([kfork,vinprops.items()ifv[1]+v[2]>=75.0])-x1_lids-x2_lids-xins_lidsx1ins_lids=set([kfork,vinprops.items()ifv[0]+v[2]>=75.0])-x1_lids-x2_lids-xins_lidsx0_lids=set(props.keys())-x1_lids-x2_lids-xins_lids-x12_lids-x2ins_lids-x1ins_lidssns.set_style('white')fig,ax=plt.subplots(figsize=[12,10])tax=ternary.TernaryAxesSubplot(ax=ax,scale=100)### Scatter Plottax.set_title("Ternary plot of proportional FACS expression",fontsize=15)tax.boundary(linewidth=1.0)tax.gridlines(multiple=25,color="black")tax.left_axis_label("xins")tax.right_axis_label("x2")tax.bottom_axis_label("x1")tax.scatter([props[k]forkinx1_lidsifkinprops],color='#0C83AB',s=5,alpha=0.25,marker='s',label='X1 enriched')tax.scatter([props[k]forkinx2_lidsifkinprops],color='#75C5E0',s=5,alpha=0.25,marker='s',label='X2 enriched')tax.scatter([props[k]forkinxins_lidsifkinprops],color='#FA9441',s=5,alpha=0.25,marker='s',label='Xins enriched')tax.scatter([props[k]forkinx12_lidsifkinprops],color='#25BA2F',s=5,alpha=0.25,marker='s',label='X1/X2 enriched')tax.scatter([props[k]forkinx2ins_lidsifkinprops],color='#D42C2C',s=5,alpha=0.25,marker='s',label='X2/Xins enriched')tax.scatter([props[k]forkinx1ins_lidsifkinprops],color='#B429CC',s=5,alpha=0.25,marker='s',label='X1/Xins enriched')tax.scatter([props[k]forkinx0_lidsifkinprops],color='#5E5E5E',s=5,alpha=0.25,marker='s',label='Ubiquitous')tax.ticks(axis='lbr',linewidth=1,multiple=25)sns.set_style('darkgrid')tax.legend()Gene ontology enrichment was performed for all lists. Only X1, Xins, X1/X2 lists had significant enrichments (pvalue < 0.05). Underneath are bar plots for the three FACS categories showing mean expression of the particular GO category.
x1Enrichment=pd.DataFrame([[x.split('\t')[1],float(x.split('\t')[6])] \
forxinopen('/home/share/projects/smed_lpt/figure/x1.enrichment').read().strip().split('\n')],\
columns=['GO Term','Mean'])x1Enrichment=x1Enrichment.sort_values('Mean',ascending=False).head(10)x1Enrichment=pd.Series(list(x1Enrichment['Mean']),index=x1Enrichment['GO Term'])xinsEnrichment=pd.DataFrame([[x.split('\t')[1],float(x.split('\t')[6])] \
forxinopen('/home/share/projects/smed_lpt/figure/xins.enrichment').read().strip().split('\n')],\
columns=['GO Term','Mean'])xinsEnrichment=xinsEnrichment.sort_values('Mean',ascending=False).head(10)xinsEnrichment=pd.Series(list(xinsEnrichment['Mean']),index=xinsEnrichment['GO Term'])x12Enrichment=pd.DataFrame([[x.split('\t')[1],float(x.split('\t')[6])] \
forxinopen('/home/share/projects/smed_lpt/figure/x1_2.enrichment').read().strip().split('\n')],\
columns=['GO Term','Mean'])x12Enrichment=x12Enrichment.sort_values('Mean',ascending=False).head(10)x12Enrichment=pd.Series(list(x12Enrichment['Mean']),index=x12Enrichment['GO Term'])x1Enrichment.plot(kind='barh',xlim=[0,100],color='#1E86A8',figsize=[4,4],fontsize=15,\
title='X1 enriched loci gene ontology (GO) enrichment')\
.set_xlabel('Mean X1 proportional expression')x12Enrichment.plot(kind='barh',xlim=[0,100],color='#1E86A8',figsize=[4,4],fontsize=15,\
title='X1/X2 enriched loci gene ontology (GO) enrichment')\
.set_xlabel('Mean X1/X2 proportional expression')xinsEnrichment.plot(kind='barh',xlim=[0,100],color='#1E86A8',figsize=[4,4],fontsize=15,\
title='Xins enriched loci gene ontology (GO) enrichment')\
.set_xlabel('Mean Xins proportional expression')goFile=open('/home/share/projects/smed_lpt/genome/ox_smed_asx1.1_1.0.goslim.index')go=defaultdict(set)forlineingoFile:lid,goids=line.strip().split()goids=goids.split(',')forgoidingoids:go[goid].add(lid)goFile.close()goIndex={}indexFile=open('/home/share/projects/smed_transcriptome/go.index')forlineinindexFile:meta=line.strip().split('\t')goIndex[meta[1]]=meta[0]indexFile.close()defplotTernaryEnrichment(enrch):sns.set_style('white')fig,ax=plt.subplots(3,3,figsize=[18,15])r=0c=0fori,terminenumerate(x1_enrichment):t_ax=ax[i/3,i%3]tax=ternary.TernaryAxesSubplot(ax=t_ax,scale=100)### Scatter Plottax.set_title(term,fontsize=15)tax.boundary(linewidth=1.0)tax.gridlines(multiple=25,color="black")tax.left_axis_label("xins")tax.right_axis_label("x2")tax.bottom_axis_label("x1")tax.scatter([props[k]forkingo[goIndex[term]]ifkinprops],s=20,alpha=0.2,label='term')sns.set_style('darkgrid')x1_enrichment=[x.strip()forxin'''nucleotidyltransferase activitycell divisioncell cyclemitotic nuclear divisionchromosome segregationhelicase activitynuclease activitychromosomeDNA metabolic process'''.split('\n')]xins_enrichment=[x.strip().split('\t')[0]forxin'''peptidase activity 72.76578125extracellular matrix organization 71.42758205proteinaceous extracellular matrix 71.14856295signal transducer activity 66.51659231cell adhesion 66.51000752lysosome 66.35104029transport 66.0974965aging 66.06736947cell-cell signaling 65.93235818'''.split('\n')]x12_enrichment=[x.strip().split('\t')[1]forxin \
'''GO:0003723 RNA binding 21772 13472 906 680 90.3685491199 6.31477726626e-18GO:0006397 mRNA processing 21772 13472 490 382 90.2115426357 1.00439819809e-14GO:0006412 translation 21772 13472 472 374 89.894450863 1.98182892656e-16GO:0005694 chromosome 21772 13472 467 333 89.8355655991 9.80869577673e-06GO:0003729 mRNA binding 21772 13472 395 287 90.7302588736 3.47050892734e-06GO:0005840 ribosome 21772 13472 251 186 91.0346967918 2.60901025592e-05GO:0003735 structural constituent of ribosome 21772 13472 217 179 90.860150048 2.71392807832e-11GO:0004386 helicase activity 21772 13472 198 154 89.7530516799 1.16866042791e-06GO:0042393 histone binding 21772 13472 198 150 90.2523142869 2.20439825304e-05'''.split('\n')]Underneath are ternary plots showing FACS proportional breakdown for GO terms associated with the X1 FACS category.
plotTernaryEnrichment(x1_enrichment)