# Initialise output formatting, the stderr sink and the default input paths.
#
# refGenomeSeq and refGenomeGTF are defaults only: a value supplied on the
# command line with -v (e.g. -v refGenomeGTF=/path/genes.gtf) is kept.
# Note that only -v assignments are visible inside BEGIN; "var=value"
# operands placed among the input files are processed later.
#
# readLen and allReads are read by the END block but are not assigned anywhere
# in this script, so they have to be supplied externally (e.g. -v readLen=50).
BEGIN {
    OFS="\t";
    split("",EmptyArray);    # makes EmptyArray an (empty) array
    stderr = "/dev/stderr";  # target for progress/diagnostic messages

    if (refGenomeSeq == "")
        refGenomeSeq = "/grid/dobin/home/dobin/Projects/Consensus/ConsensusSimulations/genome_prep/RefGenomeSeq.1line";
    if (refGenomeGTF == "")
        refGenomeGTF = "/grid/dobin/home/dobin/Projects/Consensus/ConsensusSimulations/genomes/h38/genes.gtf";
    # readLen = 50;
}

# Append value b to array a.
#   a : array to extend; if it is not (yet) an array, b is stored at index 1
#   b : value to store
# The new element goes to index length(a)+1, i.e. a is treated as a dense
# 1-based list.
function push_back(a,b) {
    if (!isarray(a)) {
        # a has not been used as an array yet: start it at index 1
        a[1]=b;
        return;
    }
    a[length(a)+1]=b;
}

# Return the absolute value of a: a itself when it is positive,
# otherwise its negation.
function funAbs(a) {
    if (a > 0)
        return a;
    return -a;
}

# Ignore lines whose first field starts with '#'.
$1 ~ /^#/ { next }

# Store every remaining input record in ALT, keyed by $1 and $2
# (presumably VCF CHROM and POS - confirm against the input format):
#   ALT[$1][$2][1]  1st character of $10 (presumably the first genotype allele)
#   ALT[$1][$2][2]  3rd character of $10 (presumably the second genotype allele)
#   ALT[$1][$2][3]  $5, the alternative allele
#   ALT[$1][$2][4]  $4, the reference allele
{
    # Optional variant filters (disabled); enable one "if" together with "next":
    #if ($10!~/1.1/ || length($4)!=1 || length($5)!=1 ) # homoz snp
    #if (length($4)!=1 || length($5)!=1 ) # keep hom+het snp
    #if (length($4)==1 && length($5)==1 ) # keep hom+het indels
    #    next;

    ALT[$1][$2][4] = $4; #ref
    ALT[$1][$2][3] = $5; #alt
    ALT[$1][$2][2] = substr($10,3,1);
    ALT[$1][$2][1] = substr($10,1,1);
}

# Build the "personal" transcript sequences and emit the reads.
#
# Inputs (globals):
#   ALT           variants collected by the per-record rule
#   refGenomeSeq  file with one line per sequence: name in $1, bases in $2
#   refGenomeGTF  GTF annotation; only "exon" lines whose $1 matches
#                 /chr[0-9]/ are used. $10 is taken as the gene id and $12 as
#                 the transcript id (assumes gene_id/transcript_id are the
#                 first two attributes - TODO confirm for other GTF flavours)
#   readLen       read length, must be supplied externally (e.g. -v readLen=50)
#   allReads      if 1, print every read; otherwise only reads overlapping at
#                 least one variant of class 2, 12 or 22
#
# Variant classes stored in varType[] (per transcript position):
#   0 no variant; 1 present on this allele only; 2 present on both alleles;
#   +10 when ALT is longer than REF; +20 when ALT is shorter than REF.
#
# Output, per selected read, a FASTA-like record:
#   >tr_iR_chr_refStart_refEnd_allele_n(1,11,21)_n2_n12_n22_gene
#   <read sequence; bases coming from a variant are lower case>
END {
    # Fail fast: without a positive readLen no meaningful read can be produced.
    if (readLen+0 < 1) {
        print "ERROR: readLen must be a positive number (e.g. -v readLen=50), got \"" readLen "\"" > stderr;
        exit 1;
    }

    print "Loaded VCF: " length(ALT) > stderr;

    # --- reference genome -------------------------------------------------
    # "> 0" is essential: getline returns -1 on an unreadable file, which a
    # bare while(getline < file) treats as true and loops on forever.
    while ((getline < refGenomeSeq) > 0) {
        #if ($1~/^>/) {
        #   chr = substr($1,2);
        #   print chr > stderr;
        #} else {
        #   G[chr]=G[chr] $1;
        #};
        G[$1] = $2;
        printf "%s ", $1 > stderr;   # never use input data as the format
    }
    close(refGenomeSeq);
    print "\nLoaded " refGenomeSeq ": " length(G) > stderr;

    # --- annotation ---------------------------------------------------------
    while ((getline < refGenomeGTF) > 0) {
        if ($1!~/chr[0-9]/ || $3!="exon")
            continue; # keep chr1-22 and exonic lines
        tr = $12;
        gsub(/[";]/,"",tr);
        gene = $10;
        gsub(/[";]/,"",gene);

        trChr[tr]=$1;
        trgene[tr]=gene;
        # next free exon slot; the "in" test avoids touching trS[tr] before
        # it exists as a sub-array
        nEx = (tr in trS) ? length(trS[tr])+1 : 1;
        trS[tr][nEx]=$4;
        trE[tr][nEx]=$5;
    }
    close(refGenomeGTF);
    print "Loaded " refGenomeGTF ": " length(trS) > stderr;

    # --- per transcript, per allele ----------------------------------------
    for (tr in trChr) {
        # starts and ends are sorted independently; this pairs them correctly
        # as long as exons of one transcript do not overlap
        asort(trS[tr]);
        asort(trE[tr]);
        nExon = length(trS[tr]);
        chr = trChr[tr];

        # tcl[i] = 1-based transcript coordinate of the first base of exon i;
        # tcl[nExon+1] = transcript length + 1
        split("1",tcl);
        for (iex=1;iex<=nExon;iex++) {
            tcl[iex+1] = tcl[iex] + trE[tr][iex]-trS[tr][iex] + 1; #start of the next exon
        }

        for (iA=1;iA<=2;iA++) {
            # start from a clean slate so that nothing leaks in from the
            # previous allele or transcript
            delete varType;
            delete refCoord;
            iTx=1;        # next position in the personal transcript
            persSeq="";
            for (iex=2;iex<=nExon+1;iex++) {
                ir=tcl[iex-1];    # position in the reference transcript
                while (ir<tcl[iex]) {
                    pos = trS[tr][iex-1]+ir-tcl[iex-1];   # genomic coordinate
                    refCoord[iTx] = pos;
                    varType[iTx] = 0;

                    if (pos in ALT[chr] && ALT[chr][pos][iA]==1) {
                        altSeq = ALT[chr][pos][3];
                        altLen = length(altSeq);
                        refLen = length(ALT[chr][pos][4]);

                        # 2 = other allele carries it too (homoz var), else 1
                        varType[iTx] = (ALT[chr][pos][3-iA]==1) ? 2 : 1;
                        if (altLen>refLen) {
                            varType[iTx] += 10;
                        } else if (altLen<refLen) {
                            varType[iTx] += 20;
                        }

                        persSeq = persSeq tolower(altSeq);
                        iTxOld = iTx;
                        ir += refLen;
                        iTx += altLen;
                        # extra bases of an insertion: anchor them to the next
                        # reference coordinate and give them an explicit class
                        # (the variant is counted once, at its first base)
                        for (ii=iTxOld+1; ii<iTx; ii++) {
                            refCoord[ii] = pos+1;
                            varType[ii] = 0;
                        }
                    } else {
                        persSeq = persSeq substr(G[chr], pos, 1);
                        iTx++; ir++;
                    }
                }
            }

            # slide a readLen window over the personal transcript
            nReads = length(persSeq)-readLen+1;
            for (iR=1; iR<=nReads; iR++) {
                nVar[0]=nVar[1]=nVar[2]=nVar[11]=nVar[12]=nVar[21]=nVar[22]=0;
                #split("",nVar);
                for (ii=0; ii<readLen; ii++) {
                    nVar[varType[iR+ii]]++;
                }
                #print nVar[1],nVar[2],nVar[12];
                if (allReads==1 || nVar[2]>0 || nVar[12]>0 || nVar[22]>0) {
                    # the last field is the gene id; the original expression
                    # referenced an unset array and therefore printed nothing
                    print ">" tr "_" iR "_" chr "_" refCoord[iR] "_" refCoord[iR+readLen-1] "_" iA "_" (nVar[1]+nVar[11]+nVar[21]) "_" nVar[2] "_" nVar[12] "_" nVar[22] "_" trgene[tr];
                    print substr(persSeq, iR, readLen);
                }
            }
        }
    }
}

