#include<algorithm>
#include<cstdlib>
#include<fstream>
#include<iostream>
#include<memory>
#include<random>
#include<sstream>
#include<string>
#include<vector>

// Compile-time problem dimensions.
// M: number of haplotype rows in the panel matrix. Rows 2k and 2k+1 belong to
//    individual k, so the panel holds M/2 individuals.
#define M 860022
// N: number of site (data) lines read from each VCF file.
#define N 9793
// thinN: number of randomly chosen sites that are kept in the target panel.
#define thinN 979
// testSize: number of haplotypes held out for imputation (testSize/2 individuals).
#define testSize 1000

// Fills panel[haplotype][site] from the genotype columns of the given VCF file.
void ReadVCF(const char*, bool[][N]);
// Shuffles the given number of leading entries of an int array in place,
// using a freshly drawn random seed.
void Randomize(int [], int);

/**
 * Read the phased genotypes of a VCF file into a haplotype-by-site matrix.
 *
 * Every line up to and including the first one that starts with "#C" (the
 * column-header line) is skipped. Each of the next N lines is one site: its
 * first nine whitespace-separated fields are discarded and the remainder of
 * the line is read as one separator character followed by M single-character
 * alleles that are each one separator apart (e.g. "\t0|1\t1|0"). An allele
 * character of '1' is stored as true, anything else as false.
 *
 * @param inFile path of the VCF file to read
 * @param panel  output matrix with at least M rows; panel[i][j] receives the
 *               allele of haplotype i at site j
 *
 * Prints a message and terminates the program with a failure status when the
 * file cannot be opened, contains no column-header line, or a site line is
 * missing or too short to hold M alleles.
 */
void ReadVCF(const char* inFile, bool panel[][N]){
    std::ifstream in(inFile);
    if(!in.is_open()){
        std::cout << "Panel VCF file \"" << inFile << "\" failed to open!\n";
        std::exit(EXIT_FAILURE);
    }

    // Skip the meta-information lines. The stream state is part of the loop
    // condition so that a file without a header line cannot loop forever, and
    // the length is checked before indexing so empty lines are safe.
    std::string line;
    bool sawColumnHeader = false;
    while (!sawColumnHeader && std::getline(in, line))
        sawColumnHeader = line.size() >= 2 && line[0] == '#' && line[1] == 'C';
    if (!sawColumnHeader){
        std::cout << "Panel VCF file \"" << inFile << "\" has no #CHROM header line!\n";
        std::exit(EXIT_FAILURE);
    }

    // Alleles sit at the odd offsets 1, 3, ..., 2*M-1 of the genotype part.
    const std::size_t minLineLength = 2 * static_cast<std::size_t>(M);
    std::string skippedField;
    for(int j = 0; j<N; ++j){
        // Drop the nine fixed VCF columns; the rest of the line is genotypes.
        for (int f = 0; f<9; ++f)
            in >> skippedField;
        std::getline(in, line);
        // A failed read leaves `line` holding the previous site, so the stream
        // state has to be checked as well as the length.
        if (!in || line.size() < minLineLength){
            std::cout << "Panel VCF file \"" << inFile
                << "\" is truncated or malformed at site " << j << "!\n";
            std::exit(EXIT_FAILURE);
        }
        in >> std::ws;

        std::size_t k = 1;
        for (int i = 0; i<M; ++i, k += 2)
            panel[i][j] = (line[k] == '1');
    }
}


/**
 * Shuffle the first `num` entries of `order` in place.
 *
 * A seed is drawn from std::random_device and echoed to standard output as
 * "seed:<value>" before it is used to seed the engine. The array is then
 * walked from its last element down to index 1, swapping each element with
 * one chosen uniformly from the positions at or below it.
 *
 * @param order array to permute
 * @param num   number of leading elements of `order` to permute; values of
 *              one or less leave the array untouched
 */
void Randomize(int order[], int num){
    std::random_device entropy;
    int seedValue = entropy();
    std::cout << "seed:" << seedValue << std::endl;
    std::default_random_engine engine(seedValue);

    int last = num - 1;
    while (last > 0){
        // Candidate positions are 0..last inclusive, so an element may stay put.
        std::uniform_int_distribution<int> pick(0, last);
        const int chosen = pick(engine);
        std::swap(order[last], order[chosen]);
        --last;
    }
}

int main(){
    //choose sites for thin data randomly
    int arraySites[thinN], targetHaplotypes[testSize/2];

    {
        int sites[N], haplotypes[M/2];

        for (int i = 0; i<N; ++i)
            sites[i] = i;

        for (int i = 0; i<M/2; ++i)
            haplotypes[i] = i;

        Randomize(sites, N);//first thinN are array sites
        Randomize(haplotypes, M/2);//first testSize/2 are individuals to be imputed

        for (int i = 0; i<thinN; ++i)
            arraySites[i] = sites[i];
        for (int i = 0; i< testSize/2; ++i)
            targetHaplotypes[i] = haplotypes[i];

        std::sort(arraySites, arraySites+thinN);
        std::sort(targetHaplotypes, targetHaplotypes+(testSize/2));
        std::ofstream a("hapsToImpute");
        std::ofstream a2("sitesToImpute");
        if (!a.is_open() || !a2.is_open()){
            std::cout << "failed to open output files\n";
            return 0;
        }
        for (int i = 0; i<thinN; ++i)
            a2 << arraySites[i] << '\n';
        for (int i = 0; i<testSize/2; ++i)
            a << targetHaplotypes[i]*2 << '\n'
                << (targetHaplotypes[i]*2)+1 << '\n';
        a.close();
        a2.close();
    }


    bool panel[M][N], smoothPanel[M][N];
    ReadVCF("../completeBritPanel.vcf", panel);
    ReadVCF("../completeBritPanel.smooth.vcf", smoothPanel);

    std::vector<std::string> headerLines;
    std::string fields[N][9];
    std::string headFields[9];
    std::string indivIDs[M/2];
    {//get headerLines and fields
        std::ifstream in("../completeBritPanel.smooth.vcf");
        if (!in.is_open()){
            std::cout << "panel failed to open!\n";
            return 0;
        }
        std::string line = "a";
        while (getline(in, line) && line[1] == '#')
            headerLines.push_back(std::move(line));

        std::stringstream l(std::move(line));
        for (int i = 0; i<9; ++i)
            l >> headFields[i];
        for (int i = 0; i<M/2; ++i)
            l >> indivIDs[i];

        for(int j = 0; j<N; ++j){
            for (int i = 0; i<9; ++i)
                in >> fields[j][i];
            getline(in, line);
            in >> std::ws;
        }
        in.close();
    }
    



    //need to output the following
    //1. referencePanel (missing individuals)                               //for Beagle
    //2. smooth referencePanel (missing individuals)                        //for MPSC
    //3. array data panel (only imputed individuals and missing sites)      //for Beagle
    //4. missingIndiv                                                       //for MPSC
    //5. arraySites                                                         //for MPSC


    {
        //1. referencePanel
        //2. smooth referencePanel
        std::ofstream refOut("refPanel.vcf");
        if (!refOut.is_open()){
            std::cout << "refPanel.vcf failed to open!\n";
            return 0;
        }
        std::ofstream sRefOut("refPanel.smooth.vcf");
        if (!sRefOut.is_open()){
            std::cout << "refPanel.smooth.vcf failed to open!\n";
            return 0;
        }
        //copy headers
        for (int i = 0; i<headerLines.size(); ++i){
            refOut << headerLines[i] << '\n';
            sRefOut << headerLines[i] << '\n';
        }

        refOut << headFields[0];
        sRefOut << headFields[0];
        for (int i = 1; i<9; ++i){
            refOut << '\t' << headFields[i];
            sRefOut << '\t' << headFields[i];
        }
        int missingHapIndex = 0;
        for (int i = 0; i<M/2; ++i)
            if ((missingHapIndex >= testSize/2) || 
                    (targetHaplotypes[missingHapIndex] != i)){
                refOut << '\t' << indivIDs[i];
                sRefOut << '\t' << indivIDs[i];
            }
            else
                ++missingHapIndex;

        refOut << '\n';
        sRefOut << '\n';

        for (int j = 0; j < N; ++j){
            refOut << "21";
            sRefOut << "21";
            for (int i = 1; i<9; ++i){
                refOut << '\t' << fields[j][i];
                sRefOut << '\t' << fields[j][i];
            }

            int missingHapIndex = 0;
            for (int i = 0; i<M;){
                if (missingHapIndex < testSize/2 &&
                        targetHaplotypes[missingHapIndex] == i/2){
                    ++missingHapIndex;
                    i += 2;
                    continue;
                }
                refOut << '\t' <<  panel[i][j] << '|' << panel[i+1][j];
                sRefOut << '\t' << smoothPanel[i++][j] << '|' << smoothPanel[i++][j];
            }
            refOut << '\n';
            sRefOut << '\n';
        }
        refOut.close();
        sRefOut.close();
    }
    

    {
        //3. array data panel
        std::ofstream tarOut("tarPanel.vcf");
        if (!tarOut.is_open()){
            std::cout << "tarPanel.vcf failed to open!\n";
            return 0;
        }
        for (int i = 0; i<headerLines.size(); ++i)
            tarOut << headerLines[i] << '\n';

        tarOut << headFields[0];
        for (int i = 1; i < 9; ++i)
            tarOut << '\t' << headFields[i];

        for (int i = 0; i < testSize/2; ++i)
            tarOut << '\t' << indivIDs[targetHaplotypes[i]];

        tarOut << '\n';

        for (int j = 0; j < thinN; ++j){
            tarOut << "21";
            for (int i = 1; i<9; ++i)
                tarOut << '\t' << fields[arraySites[j]][i];

            for (int i = 0; i < testSize/2; ++i)
                tarOut << '\t' << panel[targetHaplotypes[i]*2][arraySites[j]] << '|' << panel[(targetHaplotypes[i]*2)+1][arraySites[j]];
            tarOut << '\n';
        }
        tarOut.close();
    }

    return 0;
}







