/*
Copyright 2007 Daniel Zerbino (zerbino@ebi.ac.uk)

    This file is part of Velvet.

    Velvet is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    Velvet is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with Velvet; if not, write to the Free Software
    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA

*/

/*
Copyright 2007 Daniel Zerbino (zerbino@ebi.ac.uk)

    This file is part of Velvet.

    Velvet is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    Velvet is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with Velvet; if not, write to the Free Software
    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA

*/
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>

#include "globals.h"
#include "tightString.h"
#include "readSet.h"

// Allocates a new read set with every byte zeroed (calloc), so the read
// count starts at 0.
// Returns NULL if the allocation fails.
ReadSet *newReadSet()
{
	ReadSet *readSet = calloc(1, sizeof *readSet);

	return readSet;
}

void concatenateReadSets(ReadSet * A, ReadSet * B)
{
	ReadSet tmp;
	IDnum index;

	// Read count:  
	tmp.readCount = A->readCount + B->readCount;

	// Sequences
	if (A->sequences != NULL || B->sequences != NULL) {
		tmp.sequences = malloc(tmp.readCount * sizeof(char *));

		if (A->sequences != NULL) {
			for (index = 0; index < A->readCount; index++)
				tmp.sequences[index] = A->sequences[index];
			free(A->sequences);
		} else
			for (index = 0; index < A->readCount; index++)
				tmp.sequences[index] = NULL;

		if (B->sequences != NULL) {
			for (index = 0; index < B->readCount; index++)
				tmp.sequences[A->readCount + index] =
				    B->sequences[index];
			free(B->sequences);
		} else
			for (index = 0; index < B->readCount; index++)
				tmp.sequences[A->readCount + index] = NULL;
	} else
		tmp.sequences = NULL;

	// tSequences
	if (A->tSequences != NULL || B->tSequences != NULL) {
		tmp.tSequences =
		    malloc(tmp.readCount * sizeof(TightString *));

		if (A->tSequences != NULL) {
			for (index = 0; index < A->readCount; index++)
				tmp.tSequences[index] =
				    A->tSequences[index];
			free(A->tSequences);
		} else
			for (index = 0; index < A->readCount; index++)
				tmp.tSequences[index] = NULL;

		if (B->tSequences != NULL) {
			for (index = 0; index < B->readCount; index++)
				tmp.tSequences[A->readCount + index] =
				    B->tSequences[index];
			free(B->tSequences);
		} else
			for (index = 0; index < B->readCount; index++)
				tmp.tSequences[A->readCount + index] =
				    NULL;
	} else
		tmp.tSequences = NULL;

	// Labels
	if (A->labels != NULL || B->labels != NULL) {
		tmp.labels = malloc(tmp.readCount * sizeof(char *));

		if (A->labels != NULL) {
			for (index = 0; index < A->readCount; index++)
				tmp.labels[index] = A->labels[index];
			free(A->labels);
		} else
			for (index = 0; index < A->readCount; index++)
				tmp.labels[index] = NULL;

		if (B->labels != NULL) {
			for (index = 0; index < B->readCount; index++)
				tmp.labels[A->readCount + index] =
				    B->labels[index];
			free(B->labels);
		} else
			for (index = 0; index < B->readCount; index++)
				tmp.labels[A->readCount + index] = NULL;
	} else
		tmp.labels = NULL;


	// Confidence scores
	if (A->confidenceScores != NULL || B->confidenceScores != NULL) {
		tmp.confidenceScores =
		    malloc(tmp.readCount * sizeof(Quality *));

		if (A->confidenceScores != NULL) {
			for (index = 0; index < A->readCount; index++)
				tmp.confidenceScores[index] =
				    A->confidenceScores[index];
			free(A->confidenceScores);
		} else
			for (index = 0; index < A->readCount; index++)
				tmp.confidenceScores[index] = NULL;

		if (B->confidenceScores != NULL) {
			for (index = 0; index < B->readCount; index++)
				tmp.confidenceScores[A->readCount +
						     index] =
				    B->confidenceScores[index];
			free(B->confidenceScores);
		} else
			for (index = 0; index < B->readCount; index++)
				tmp.confidenceScores[A->readCount +
						     index] = NULL;
	} else
		tmp.confidenceScores = NULL;

	// Kmer probabilities 
	if (A->kmerProbabilities != NULL || B->kmerProbabilities != NULL) {
		tmp.kmerProbabilities =
		    malloc(tmp.readCount * sizeof(Quality *));

		if (A->kmerProbabilities != NULL) {
			for (index = 0; index < A->readCount; index++)
				tmp.kmerProbabilities[index] =
				    A->kmerProbabilities[index];
			free(A->kmerProbabilities);
		} else
			for (index = 0; index < A->readCount; index++)
				tmp.kmerProbabilities[index] = NULL;

		if (B->kmerProbabilities != NULL) {
			for (index = 0; index < B->readCount; index++)
				tmp.kmerProbabilities[A->readCount +
						      index] =
				    B->kmerProbabilities[index];
			free(B->kmerProbabilities);
		} else
			for (index = 0; index < B->readCount; index++)
				tmp.kmerProbabilities[A->readCount +
						      index] = NULL;
	} else
		tmp.kmerProbabilities = NULL;

	// Mate reads 
	if (A->mateReads != NULL || B->mateReads != NULL) {
		tmp.mateReads = malloc(tmp.readCount * sizeof(IDnum));

		if (A->mateReads != NULL) {
			for (index = 0; index < A->readCount; index++)
				tmp.mateReads[index] = A->mateReads[index];
			free(A->mateReads);
		} else
			for (index = 0; index < A->readCount; index++)
				tmp.mateReads[index] = 0;

		if (B->mateReads != NULL) {
			for (index = 0; index < B->readCount; index++)
				tmp.mateReads[A->readCount + index] =
				    B->mateReads[index];
			free(B->mateReads);
		} else
			for (index = 0; index < B->readCount; index++)
				tmp.mateReads[A->readCount + index] = 0;
	} else
		tmp.mateReads = NULL;

	// Categories
	if (A->categories != NULL || B->categories != NULL) {
		tmp.categories = malloc(tmp.readCount * sizeof(Quality *));

		if (A->categories != NULL) {
			for (index = 0; index < A->readCount; index++)
				tmp.categories[index] =
				    A->categories[index];
			free(A->categories);
		} else
			for (index = 0; index < A->readCount; index++)
				tmp.categories[index] = CATEGORIES;

		if (B->categories != NULL) {
			for (index = 0; index < B->readCount; index++)
				tmp.categories[A->readCount + index] =
				    B->categories[index];
			free(B->categories);
		} else
			for (index = 0; index < B->readCount; index++)
				tmp.categories[A->readCount + index] =
				    CATEGORIES;
	} else
		tmp.categories = NULL;

	// Put everything back into A
	A->readCount = tmp.readCount;
	A->sequences = tmp.sequences;
	A->tSequences = tmp.tSequences;
	A->labels = tmp.labels;
	A->confidenceScores = tmp.confidenceScores;
	A->kmerProbabilities = tmp.kmerProbabilities;
	A->mateReads = tmp.mateReads;
	A->categories = tmp.categories;

	// Deallocate
	free(B);
}

// Replaces the plain character sequences of a read set by the array
// returned by newTightStringArrayFromStringArray().
// The plain-string array pointer is dropped (set to NULL) without being
// freed here.
// NOTE(review): presumably the conversion routine takes ownership of the
// strings - confirm in tightString.c.
void convertSequences(ReadSet * rs)
{
	TightString **packed =
	    newTightStringArrayFromStringArray(rs->sequences,
					       rs->readCount);

	rs->sequences = NULL;
	rs->tSequences = packed;
}

// Maps a quality score Q onto the value 1 - 10^(-Q/10).
static Probability convertQualityScore(Quality score)
{
	double errorRate = pow(10, -score / ((double) 10));

	return (Probability) 1 - errorRate;
}

// Replaces the per-base confidence scores of every read by per-k-mer
// probabilities.
//
// For a read of length L and word length WORDLENGTH, the resulting array
// rs->kmerProbabilities[read] has L - WORDLENGTH + 1 entries; entry j is
// the product of convertQualityScore() over bases j .. j+WORDLENGTH-1,
// maintained as a sliding product. Reads shorter than WORDLENGTH get a
// NULL entry.
//
// The per-base score arrays and rs->confidenceScores itself are freed and
// rs->confidenceScores is reset to NULL.
//
// NOTE(review): a base whose converted score is 0 makes the sliding
// product divide by zero when it leaves the window - confirm whether
// such scores can occur in the input.
void convertConfidenceScores(ReadSet * rs, int WORDLENGTH)
{
	Quality *baseCallerScores;
	Probability *kmerProbabilities;
	IDnum index;
	Coordinate position, length, kmerCount;
	Probability proba;

	rs->kmerProbabilities =
	    malloc(rs->readCount * sizeof(Probability *));
	if (rs->kmerProbabilities == NULL && rs->readCount > 0) {
		fprintf(stderr,
			"Out of memory while converting confidence scores\n");
		exit(1);
	}

	for (index = 0; index < rs->readCount; index++) {
		baseCallerScores = rs->confidenceScores[index];
		length = getLength(rs->tSequences[index]);
		kmerCount = length - WORDLENGTH + 1;

		// A read shorter than the word length contains no k-mer
		if (kmerCount > 0) {
			kmerProbabilities =
			    malloc(kmerCount * sizeof(Probability));
			if (kmerProbabilities == NULL) {
				fprintf(stderr,
					"Out of memory while converting confidence scores\n");
				exit(1);
			}
		} else
			kmerProbabilities = NULL;
		rs->kmerProbabilities[index] = kmerProbabilities;

		proba = 1;
		for (position = 0;
		     kmerProbabilities != NULL && position < length;
		     position++) {
			proba *=
			    convertQualityScore(baseCallerScores
						[position]);

			// The window is not yet WORDLENGTH bases wide
			if (position < WORDLENGTH - 1)
				continue;

			// Drop the base which just left the window; the very
			// first complete window has nothing to drop.
			if (position >= WORDLENGTH)
				proba /=
				    convertQualityScore(baseCallerScores
							[position -
							 WORDLENGTH]);

			kmerProbabilities[position - WORDLENGTH + 1] =
			    proba;
		}

		rs->confidenceScores[index] = NULL;
		free(baseCallerScores);
	}

	free(rs->confidenceScores);
	rs->confidenceScores = NULL;
}

// Assigns the same category to every read of the set.
// The category array is allocated on first use and reused afterwards.
void categorizeReads(ReadSet * readSet, Category category)
{
	IDnum remaining;
	Category *slot;

	if (readSet->categories == NULL) {
		readSet->categories =
		    malloc(readSet->readCount * sizeof(Category));
	}

	slot = readSet->categories;
	for (remaining = readSet->readCount; remaining > 0; remaining--) {
		*slot = category;
		slot++;
	}
}

// Writes one substitution rule per read into `filename`, of the form
//   s/SEQUENCE <1-based read number>/<label>/
// The file is created (or truncated) even when the read set carries no
// labels, in which case it is left empty.
void exportIDMapping(char *filename, ReadSet * reads)
{
	FILE *mapping = fopen(filename, "w");
	IDnum readIndex;

	if (mapping == NULL) {
		puts("Couldn't open file, sorry");
		return;
	}

	puts("Writing into file...");

	if (reads->labels != NULL) {
		for (readIndex = 0; readIndex < reads->readCount;
		     readIndex++) {
			fprintf(mapping, "s/SEQUENCE %li/%s/\n",
				readIndex + 1, reads->labels[readIndex]);
		}
	}

	fclose(mapping);
}

// Imports sequences from a Solexa file.
// Each accepted line is expected to hold four integers separated by tabs,
// a tab, one character which is skipped, then the sequence up to the end
// of the line. Lines containing a '.' are ignored.
// Sequences are stored in 100-byte buffers: at most 99 characters are
// kept, and a line which does not match the format yields an empty string.
// Memory space allocated within this function.
// Exits the program if the file cannot be opened or memory runs out.
ReadSet *readSolexaFile(char *filename)
{
	const int maxline = 500;
	const int maxread = 100;
	FILE *file;
	IDnum readCount = 0;
	IDnum readIndex;
	char *line;
	ReadSet *reads;

	printf("Reading file %s\n", filename);

	file = fopen(filename, "r");
	if (file == NULL) {
		fprintf(stderr, "Could not open file %s\n", filename);
		exit(1);
	}

	line = malloc(sizeof(char) * maxline);
	reads = newReadSet();
	if (line == NULL || reads == NULL) {
		fprintf(stderr, "Out of memory while reading %s\n",
			filename);
		exit(1);
	}

	// Count lines:
	puts("Counting lines...");
	while (fgets(line, maxline, file) != NULL)
		if (strchr(line, '.') == NULL)
			readCount++;

	printf("%li reads found.\n", readCount);

	// Create table:
	reads->readCount = readCount;
	reads->sequences = malloc(readCount * sizeof(char *));
	if (reads->sequences == NULL && readCount > 0) {
		fprintf(stderr, "Out of memory while reading %s\n",
			filename);
		exit(1);
	}
	for (readIndex = 0; readIndex < readCount; readIndex++) {
		reads->sequences[readIndex] =
		    malloc(maxread * sizeof(char));
		if (reads->sequences[readIndex] == NULL) {
			fprintf(stderr, "Out of memory while reading %s\n",
				filename);
			exit(1);
		}
		// Stays an empty string if the line cannot be parsed
		reads->sequences[readIndex][0] = '\0';
	}

	// Go back to the start of the file and memorize lines:
	puts("Writing lines into string array...");
	rewind(file);
	readIndex = 0;
	while (readIndex < readCount
	       && fgets(line, maxline, file) != NULL) {
		if (strchr(line, '.') != NULL)
			continue;

		// The field width keeps sscanf within the maxread-byte buffer
		sscanf(line, "%*i\t%*i\t%*i\t%*i\t%*c%99[^\n]",
		       reads->sequences[readIndex]);
		readIndex++;
	}

	free(line);
	fclose(file);
	puts("Done");
	return reads;
}

// Imports sequences from an Eland file.
// Every line counts as one read; the sequence is taken from the second
// tab-separated field of the line.
// Sequences are stored in 100-byte buffers: at most 99 characters are
// kept, and a line which does not match the format yields an empty string.
// Memory space allocated within this function.
// Exits the program if the file cannot be opened or memory runs out.
ReadSet *readElandFile(char *filename)
{
	const int maxread = 100;
	char line[500];
	FILE *file;
	IDnum readCount = 0;
	IDnum readIndex;
	ReadSet *reads;

	printf("Reading Eland file %s\n", filename);

	file = fopen(filename, "r");
	if (file == NULL) {
		fprintf(stderr, "Could not open file %s\n", filename);
		exit(1);
	}

	reads = newReadSet();
	if (reads == NULL) {
		fprintf(stderr, "Out of memory while reading %s\n",
			filename);
		exit(1);
	}

	// Count lines:
	puts("Counting lines...");
	while (fgets(line, sizeof line, file) != NULL)
		readCount++;

	printf("%li reads found.\n", readCount);

	// Create table:
	reads->readCount = readCount;
	reads->sequences = malloc(readCount * sizeof(char *));
	if (reads->sequences == NULL && readCount > 0) {
		fprintf(stderr, "Out of memory while reading %s\n",
			filename);
		exit(1);
	}
	for (readIndex = 0; readIndex < readCount; readIndex++) {
		reads->sequences[readIndex] =
		    malloc(maxread * sizeof(char));
		if (reads->sequences[readIndex] == NULL) {
			fprintf(stderr, "Out of memory while reading %s\n",
				filename);
			exit(1);
		}
		// Stays an empty string if the line cannot be parsed
		reads->sequences[readIndex][0] = '\0';
	}

	// Go back to the start of the file and memorize lines:
	puts("Writing lines into string array...");
	rewind(file);
	readIndex = 0;
	while (readIndex < readCount
	       && fgets(line, sizeof line, file) != NULL) {
		// The field width keeps sscanf within the maxread-byte buffer
		sscanf(line, "%*[^\t]\t%99[^\t]",
		       reads->sequences[readIndex]);
		readIndex++;
	}

	fclose(file);
	puts("Done");
	return reads;
}

// Imports sequences from a fastq file.
// The file is assumed to consist of records of exactly four lines each;
// the second line of every record is stored as the sequence, stripped of
// its line terminator, and the three other lines are discarded.
// Memory space allocated within this function.
// Exits the program if the file cannot be opened or memory runs out.
ReadSet *readFastQFile(char *filename)
{
	const int maxline = 5000;
	FILE *file;
	IDnum lineCount = 0;
	IDnum readCount, readIndex;
	char *line;
	char *sequence;
	size_t length;
	int skipped;
	ReadSet *reads;

	printf("Reading file %s\n", filename);

	file = fopen(filename, "r");
	if (file == NULL) {
		fprintf(stderr, "Could not open file %s\n", filename);
		exit(1);
	}

	line = malloc(sizeof(char) * maxline);
	reads = newReadSet();
	if (line == NULL || reads == NULL) {
		fprintf(stderr, "Out of memory while reading %s\n",
			filename);
		exit(1);
	}

	// Count lines:
	puts("Counting lines...");
	while (fgets(line, maxline, file) != NULL)
		lineCount++;
	readCount = lineCount / 4;
	printf("%li reads found.\n", readCount);

	// Create table:
	reads->readCount = readCount;
	reads->sequences = malloc(sizeof(char *) * (readCount));
	if (reads->sequences == NULL && readCount > 0) {
		fprintf(stderr, "Out of memory while reading %s\n",
			filename);
		exit(1);
	}

	// Go back to the start of the file and memorize lines:
	puts("Writing lines into string array...");
	rewind(file);
	for (readIndex = 0; readIndex < readCount; readIndex++) {
		// Header line, overwritten straight away by the sequence line
		if (fgets(line, maxline, file) == NULL
		    || fgets(line, maxline, file) == NULL) {
			fprintf(stderr, "Unexpected end of file %s\n",
				filename);
			exit(1);
		}

		// Strip the line terminator, if any
		length = strlen(line);
		while (length > 0
		       && (line[length - 1] == '\n'
			   || line[length - 1] == '\r'))
			length--;

		// One extra byte for the terminating NUL
		sequence = malloc(sizeof(char) * (length + 1));
		if (sequence == NULL) {
			fprintf(stderr, "Out of memory while reading %s\n",
				filename);
			exit(1);
		}
		memcpy(sequence, line, length);
		sequence[length] = '\0';
		reads->sequences[readIndex] = sequence;

		// Skip the separator and quality lines
		for (skipped = 0; skipped < 2; skipped++)
			if (fgets(line, maxline, file) == NULL)
				break;
	}

	free(line);
	fclose(file);
	puts("Done");
	return reads;
}

// Imports sequences from a fasta file 
// Memory is allocated within the function 
ReadSet *readFastAFile(char *filename)
{
	FILE *file;
	char *sequence;
	Coordinate bpCount;
	const int maxline = 100;
	char *line = malloc(sizeof(char) * maxline);
	IDnum sequenceCount, sequenceIndex;
	IDnum index;
	ReadSet *reads = newReadSet();

	printf("Reading file %s;\n", filename);

	// Count number of separate sequences
	file = fopen(filename, "r");
	sequenceCount = 0;
	while (fgets(line, maxline, file) != NULL)
		if (line[0] == '>')
			sequenceCount++;
	fclose(file);
	printf("%li sequences found\n", sequenceCount);

	reads->readCount = sequenceCount;
	reads->sequences = malloc(sequenceCount * sizeof(char *));

	// Counting base pair length of each sequence:
	file = fopen(filename, "r");
	sequenceIndex = -1;
	while (fgets(line, maxline, file) != NULL) {
		if (line[0] == '>') {
			if (sequenceIndex != -1) {
				//printf("Sequence %li has length %li\n",
				//       sequenceIndex, bpCount);
				reads->sequences[sequenceIndex] =
				    malloc(sizeof(char) * (bpCount + 1));
				if (reads->sequences[sequenceIndex] ==
				    NULL)
					puts("Allocation screwed up!");
			}
			sequenceIndex++;
			bpCount = 0;
		} else {
			bpCount += strlen(line) - 1;
		}
	}

	//printf("Sequence %li has length %li\n", sequenceIndex, bpCount);
	reads->sequences[sequenceIndex] =
	    malloc(sizeof(char) * (bpCount + 1));
	fclose(file);

	// Reopen file and memorize line:
	file = fopen(filename, "r");
	sequenceIndex = -1;
	while (fgets(line, maxline, file)) {
		if (line[0] == '>') {
			if (sequenceIndex != -1) {
				sequence[bpCount] = '\0';
			}
			sequenceIndex++;
			bpCount = 0;
			//printf("Starting to read sequence %li\n",
			//       sequenceIndex);
			sequence = reads->sequences[sequenceIndex];
		} else {
			for (index = 0; index < strlen(line) - 1; index++)
				sequence[bpCount + index] = line[index];
			bpCount += (strlen(line) - 1);
		}
	}

	if (sequenceIndex != -1) {
		sequence[bpCount] = '\0';
	}

	fclose(file);
	free(line);

	puts("Done");
	return reads;
}

#define FASTQ 1
#define FASTA 2
#define SOLEXA 3
#define ELAND 4

// General argument parser for most functions
// Basically a reused portion of toplevel code dumped into here
//
// Arguments are processed from left to right. An argument starting with
// '-' is an option which changes either the current file format (-fasta,
// -fastq, -solexa, -eland; fasta by default) or the current read category
// (-short, -shortPaired, -short2, -shortPaired2, -long; -short by
// default). Any other argument is a file name: the file is read in the
// current format, converted to tight strings, tagged with the current
// category and appended to the result.
//
// Returns the read set holding the reads of all the files.
// Exits with status 1 if no argument is given or an option is unknown.
ReadSet *parseDataAndReadFiles(int argc, char **argv)
{
	int argIndex;
	ReadSet *reads;
	ReadSet *allSequences = newReadSet();
	int filetype = FASTA;
	Category cat = 0;

	if (argc < 2) {
		puts("Wrong number of arguments!");
		puts("Correct usage:");
		puts("run -<filetype> <list of files> [...] ");
		puts("Allowed filetypes:");
		puts("\t-fasta");
		puts("\t-fastq");
		puts("\t-solexa");
		puts("\t-eland");
		puts("If reading exclusively fasta file, the -fasta parameter is not necessary");
		exit(1);
	}

	for (argIndex = 1; argIndex < argc; argIndex++) {
		if (argv[argIndex][0] == '-') {

			if (strcmp(argv[argIndex], "-fastq") == 0)
				filetype = FASTQ;
			else if (strcmp(argv[argIndex], "-fasta") == 0)
				filetype = FASTA;
			else if (strcmp(argv[argIndex], "-solexa") == 0)
				filetype = SOLEXA;
			else if (strcmp(argv[argIndex], "-eland") == 0)
				filetype = ELAND;
			else if (strcmp(argv[argIndex], "-short") == 0)
				cat = 0;
			else if (strcmp(argv[argIndex], "-shortPaired") ==
				 0)
				cat = 1;
			else if (strcmp(argv[argIndex], "-short2") == 0)
				cat = 2;
			else if (strcmp(argv[argIndex], "-shortPaired2") ==
				 0)
				cat = 3;
			else if (strcmp(argv[argIndex], "-long") == 0)
				cat = CATEGORIES * 2;
			else {
				puts("Unknown option...");
				// An unknown option is an error: report failure
				exit(1);
			}

			continue;
		}

		switch (filetype) {
		case FASTA:
			reads = readFastAFile(argv[argIndex]);
			break;
		case FASTQ:
			reads = readFastQFile(argv[argIndex]);
			break;
		case SOLEXA:
			reads = readSolexaFile(argv[argIndex]);
			break;
		case ELAND:
			reads = readElandFile(argv[argIndex]);
			break;
		default:
			// Unreachable unless filetype gets corrupted
			puts("Screw up in parser... exiting");
			exit(1);
		}

		convertSequences(reads);
		categorizeReads(reads, cat);
		// Frees `reads`; its content now belongs to allSequences
		concatenateReadSets(allSequences, reads);
	}

	return allSequences;

}

// Applies the clipping information of `filename` to the tight string
// sequences of `reads`, line n of the file applying to read n.
//  - A line starting with 'F' discards the read: its tight string is
//    destroyed and its slot set to NULL.
//  - Any other line must provide the start and finish coordinates passed
//    on to clipTightString(). A line which cannot be parsed leaves the
//    read untouched and triggers a warning.
// Lines beyond the number of reads in the set are ignored.
// Exits the program if the file cannot be opened.
void importClippingData(char *filename, ReadSet * reads)
{
	FILE *file = fopen(filename, "r");
	char line[100];
	IDnum index = 0;
	Coordinate start, finish;
	TightString **sequences = reads->tSequences;

	if (file == NULL) {
		fprintf(stderr, "Could not open clipping file %s\n",
			filename);
		exit(1);
	}

	puts("Importing clip data");

	// One line per read, never running past the end of the read set
	while (index < reads->readCount
	       && fgets(line, sizeof line, file) != NULL) {
		if (line[0] == 'F') {
			destroyTightString(sequences[index]);
			sequences[index] = NULL;
		} else if (sscanf
			   (line, "%*[PASFIL ]%*i  %li %li %*[^\n]",
			    &start, &finish) == 2) {
			clipTightString(sequences[index], start, finish);
		} else {
			// start and finish were not both read: do not clip
			// with undefined coordinates
			fprintf(stderr,
				"Could not parse clipping data for read %li, leaving it unclipped\n",
				(long) index);
		}
		index++;
	}

	puts("Done");

	fclose(file);
}

// Rebuilds the mate table of a read set.
// A read is considered pairable when its category is odd and not greater
// than `cat`. Consecutive pairable reads are matched two by two: the
// first one points to the following read, the second one back to the
// preceding read. Any other read gets -1 and interrupts a pending pair.
// The previous mate table, if any, is freed.
// NOTE(review): a pairable read left without a partner (end of the set,
// or followed by a non pairable read) still points to index + 1 - confirm
// that callers cope with this.
void pairUpReads(ReadSet * reads, Category cat)
{
	IDnum readCount = reads->readCount;
	IDnum *mates = malloc(readCount * sizeof(IDnum));
	int awaitingMate = 0;
	IDnum readIndex;
	Category readCategory;

	for (readIndex = 0; readIndex < readCount; readIndex++) {
		readCategory = reads->categories[readIndex];

		if (!(readCategory % 2 != 0 && readCategory <= cat)) {
			mates[readIndex] = -1;
			awaitingMate = 0;
			continue;
		}

		if (awaitingMate) {
			mates[readIndex] = readIndex - 1;
			awaitingMate = 0;
		} else {
			mates[readIndex] = readIndex + 1;
			awaitingMate = 1;
		}
	}

	free(reads->mateReads);
	reads->mateReads = mates;
}

// Turns every read flagged in `dubiousReads`, and its mate, into an
// unpaired read by clearing the parity of their category (1 -> 0,
// 3 -> 2, even categories unchanged). pairUpReads() only pairs reads
// whose category is odd, so the detached reads are no longer paired,
// while the rest of the category value is kept. The operation is
// idempotent, so a pair whose two reads are both flagged is handled
// correctly.
// Does nothing if the set has no mate table or no categories, or if
// `dubiousReads` is NULL. Reads whose mate index is out of range (such
// as the -1 given to unpaired reads) are processed on their own.
// NOTE(review): assumes that odd categories denote paired reads
// throughout - confirm against the other users of ReadSet.categories.
void detachDubiousReads(ReadSet * reads, boolean * dubiousReads)
{
	IDnum index;
	IDnum pairID;
	IDnum sequenceCount = reads->readCount;
	IDnum *mateReads = reads->mateReads;
	Category *categories = reads->categories;

	if (mateReads == NULL || categories == NULL)
		return;

	if (dubiousReads == NULL)
		return;

	for (index = 0; index < sequenceCount; index++) {
		if (!dubiousReads[index])
			continue;

		categories[index] = (categories[index] / 2) * 2;

		// No valid mate to detach
		pairID = mateReads[index];
		if (pairID < 0 || pairID >= sequenceCount)
			continue;

		printf("Separating %li and %li\n", index, pairID);
		categories[pairID] = (categories[pairID] / 2) * 2;
	}
}

// Writes one read of the set into `outfile`:
//  - a header line ">SEQUENCE_<index>_length_<length>", followed by a tab
//    and the category of the read when the set has categories;
//  - the sequence, requested from the tight string 60 positions at a
//    time, one fragment per line.
// Reads whose tight string is NULL are skipped altogether.
static void exportRead(FILE * outfile, ReadSet * reads, IDnum index)
{
	TightString *tString = reads->tSequences[index];
	char fragment[100];
	Coordinate from, to;

	if (tString == NULL)
		return;

	// Header line
	fprintf(outfile, ">SEQUENCE_%li_length_%li", index,
		getLength(tString));
	if (reads->categories != NULL)
		fprintf(outfile, "\t%i", reads->categories[index]);
	fprintf(outfile, "\n");

	// Sequence lines
	for (from = 0; from <= getLength(tString); from = to) {
		to = from + 60;
		readTightStringFragment(tString, from, to, fragment);
		fprintf(outfile, "%s\n", fragment);
	}

	fflush(outfile);
}

// Writes all the reads of the set into `filename`, one exportRead()
// record after the other. The file is created or truncated.
// Only prints a message and returns if the file cannot be opened.
void exportReadSet(char *filename, ReadSet * reads)
{
	FILE *readSetFile = fopen(filename, "w+");
	IDnum readIndex = 0;

	if (readSetFile == NULL) {
		puts("Couldn't open file, sorry");
		return;
	}

	printf("Writing into readset file: %s\n", filename);

	while (readIndex < reads->readCount) {
		exportRead(readSetFile, reads, readIndex);
		readIndex++;
	}

	fclose(readSetFile);

	puts("Done");
}

ReadSet *importReadSet(char *filename)
{
	FILE *file;
	char *sequence;
	Coordinate bpCount;
	const int maxline = 100;
	char *line = malloc(sizeof(char) * maxline);
	IDnum sequenceCount, sequenceIndex;
	IDnum index;
	ReadSet *reads = newReadSet();

	printf("Reading read set file %s;\n", filename);

	// Count number of separate sequences
	file = fopen(filename, "r");
	sequenceCount = 0;
	while (fgets(line, maxline, file) != NULL)
		if (line[0] == '>')
			sequenceCount++;
	fclose(file);
	printf("%li sequences found\n", sequenceCount);

	reads->readCount = sequenceCount;
	reads->sequences = calloc(sequenceCount, sizeof(char *));
	reads->categories = calloc(sequenceCount, sizeof(Category));

	// Counting base pair length of each sequence:
	file = fopen(filename, "r");
	sequenceIndex = -1;
	while (fgets(line, maxline, file) != NULL) {
		if (line[0] == '>') {

			// Reading category info
			sscanf(line, "%*[^\t]\t%hhi",
			       &(reads->categories[sequenceIndex + 1]));

			if (sequenceIndex != -1) {
				//printf("Sequence %li has length %li\n",
				//       sequenceIndex, bpCount);
				reads->sequences[sequenceIndex] =
				    malloc(sizeof(char) * (bpCount + 1));
				if (reads->sequences[sequenceIndex] ==
				    NULL)
					puts("Allocation screwed up!");
			}
			sequenceIndex++;
			bpCount = 0;
		} else {
			bpCount += strlen(line) - 1;
		}
	}

	//printf("Sequence %li has length %li\n", sequenceIndex, bpCount);
	reads->sequences[sequenceIndex] =
	    malloc(sizeof(char) * (bpCount + 1));
	fclose(file);

	// Reopen file and memorize line:
	file = fopen(filename, "r");
	sequenceIndex = -1;
	while (fgets(line, maxline, file)) {
		if (line[0] == '>') {
			if (sequenceIndex != -1) {
				sequence[bpCount] = '\0';
			}
			sequenceIndex++;
			bpCount = 0;
			//printf("Starting to read sequence %li\n",
			//       sequenceIndex);
			sequence = reads->sequences[sequenceIndex];
		} else {
			for (index = 0; index < strlen(line) - 1; index++)
				sequence[bpCount + index] = line[index];
			bpCount += (strlen(line) - 1);
		}
	}

	sequence[bpCount] = '\0';
	fclose(file);

	puts("Done");
	free(line);
	return reads;

}
