function [alpha_calls,numeric_calls,allele_freqs,error_probs,post_probs] = JGIL_prototype(R,ngen)
% JGIL_PROTOTYPE  Prototype of the JGIL model engine for a single site.
%
% This matlab script implements the JGIL prototype, in other words the
% model engine without the fancy input/output control.
%
% After the manuscript was finalized, I renamed the variables to match
% those used in the text. Hopefully this resulted in increased clarity and
% not decreased working. This version is necessarily static, but please
% check out a dynamic version of JGIL at
%
%               http://www4.ncsu.edu/~eastone2/software/.
%
% The JGIL software available there was written by Chris Smith based on
% this prototype. It is a command line executable that operates directly on
% a set of .bam files.
%
% Inputs:
%   R     4 x m matrix of read counts at a site.  Lines/strains are
%         represented by columns, and counts of covering A,C,G,T are
%         represented by rows.  Entries must be finite and non-negative,
%         and at least one read must be present.
%   ngen  Number of inbreeding generations (non-negative integer scalar).
%         For the DGRP, ngen = 20.
%
% Outputs:
%   alpha_calls    IUPAC-like call for each line (one character per line)
%   numeric_calls  1 x m vector coding the calls as condensed states 1..10
%                  (1-4 = inbred A,C,G,T; 5-10 = segregating A/C, A/G,
%                  A/T, C/G, C/T, G/T)
%   allele_freqs   4 x 1 vector, the estimates of p
%   error_probs    1 x 4 vector, the estimates of epsilon
%   post_probs     m x 10 matrix, posterior probability of each condensed
%                  state for each line
%
% Errors:
%   JGIL:badInput  R is not a numeric 4 x m matrix of finite non-negative
%                  values, or ngen is not a non-negative integer scalar.
%   JGIL:noReads   R contains no reads at all, so nothing can be estimated.
%
% An example of R is
%
%   R = [2 3 4 1 10 ; 0 0 0 1 0 ; 0 0 0 0 0 ; 0 0 0 0 0]
%
% which has count data for 5 lines.  Here, because line 4 (4th column) has
% a single C (second row), and because C is not present in any other lines,
% JGIL reasons that the C is an error (at least when ngen = 20).  As such,
% each line is assigned the "inbred A" state, coded as state 1.

% ---- Input validation ---------------------------------------------------
if ~(isnumeric(R) || islogical(R)) || ndims(R) ~= 2 || size(R,1) ~= 4
    error('JGIL:badInput', ...
        'R must be a numeric 4 x m matrix of A,C,G,T read counts.');
end
% Work in double precision so that integer-typed count matrices behave the
% same as double ones in the divisions and matrix products below.
R = double(R);
if any(~isfinite(R(:))) || any(R(:) < 0)
    error('JGIL:badInput', ...
        'R must contain only finite, non-negative read counts.');
end
if ~isnumeric(ngen) || numel(ngen) ~= 1 || ~isfinite(ngen) ...
        || ngen < 0 || ngen ~= floor(ngen)
    error('JGIL:badInput', ...
        'ngen must be a non-negative integer scalar.');
end
ngen = double(ngen);

% Identify the number of lines/strains from the input matrix.
m = size(R,2);

% This is the total number of reads covering the site across all lines.
Rtot = sum(sum(R));
if Rtot == 0
    % Without any reads the naive allele frequencies are 0/0 and the
    % iteration below could never start.
    error('JGIL:noReads', ...
        'R contains no reads; the site cannot be genotyped.');
end

% ---- Model constants ----------------------------------------------------

% First calculate F from the number of generations ngen.  Fvec(1:2) are the
% two zero starting values of the recursion; Fvec(end) is F after ngen
% generations.
Fvec = zeros(1,ngen+2);
for i = 3:ngen+2
    Fvec(i) = .25 + .5*Fvec(i-1) + .25*Fvec(i-2);
end
F = Fvec(end);

% Initialize genotype matrix M (4x22), rows are A,C,G,T respectively.  This
% is Table 1 from the paper.
M = [   4	0	0	0	3	2	1	3	2	1	3	2	1	0	0	0	0	0	0	0	0	0
        0	4	0	0	1	2	3	0	0	0	0	0	0	3	2	1	3	2	1	0	0	0
        0	0	4	0	0	0	0	1	2	3	0	0	0	1	2	3	0	0	0	3	2	1
        0	0	0	4	0	0	0	0	0	0	1	2	3	0	0	0	1	2	3	1	2	3   ];
nstates = size(M,2);

% Calculate K from M.  K = ceil(M/3) is just a convenient way of mapping
% 0 -> 0, 1 -> 1, 2 -> 1, 3 -> 1, 4 -> 2.
K = ceil(M/3);

% Loop-invariant allele fractions of each genotype: the fraction of the
% four allele copies that are NOT / ARE the base of each row.
frac_other = (4-M)/4;
frac_self = M/4;

% This is set to be some small number.  It is used in several places, both
% to set a convergence threshold and to avoid log(0).
machine_eps = 10^-16;

% Safety net: convergence in parameter space is not guaranteed, so never
% iterate forever.
max_iter = 100000;

% ---- Starting values ----------------------------------------------------

% Initialize error probabilities (current estimate e_1).  This turns out to
% be a robust starting assignment.
e_1 = [.05 .05 .05 .05];

% Use the naive allele frequency estimates as the initial estimate p_1.
p_1 = sum(R,2)/Rtot;

% Initialize vectors to store values from previous iteration.
p_0 = zeros(4,1);
e_0 = zeros(1,4);

% ---- Iterative estimation -----------------------------------------------

% Repeat while the parameters are changing sufficiently over a single
% iteration.  Not guaranteed to converge in parameter space but empirically
% it seems to do so without fail.  The software gives the option for
% convergence to be assessed based on log-likelihood instead.  But it is a
% little quicker to avoid calculating likelihoods at all.
%
% With the starting values above the test is always true on entry, so H is
% guaranteed to be assigned before it is used after the loop.
iter = 0;
while sum((p_1-p_0).^2) + sum((e_1-e_0).^2) > machine_eps

    if iter >= max_iter
        warning('JGIL:noConvergence', ...
            'JGIL_prototype did not converge within %d iterations.', max_iter);
        break;
    end
    iter = iter + 1;

    % Set p_0, e_0 to the values of the previous iteration.
    p_0 = p_1;
    e_0 = e_1;

    % Calculate the 22 genotypic frequencies in the vector v based on p_0.
    v = zeros(nstates,1);
    % First the 4 fully inbred states
    for i = 1:4
        v(i) = p_0(i)^2*(1-F) + p_0(i)*F;
    end
    % And then the 18 segregating states, three (3:1, 2:2, 1:3) per allele
    % pair, visited in the same pair order as the columns of M.
    ct = 5;
    for i = 1:3
        for j = (i+1):4
            v(ct) = .6*p_0(i)*p_0(j)*(1-F);
            v(ct+1) = .8*p_0(i)*p_0(j)*(1-F);
            v(ct+2) = .6*p_0(i)*p_0(j)*(1-F);
            ct = ct+3;
        end
    end

    % Now calculate e_1 and p_1.
    % errvec(j) = 1 - sum(e_0) + e_0(j) is the weight applied to the
    % allele copies that match base j.
    errvec = (ones(4,1)-sum(e_0)) + e_0';
    % A: contribution to observing each base from non-matching allele
    %    copies (i.e. via error); B: contribution from matching copies.
    A = frac_other.*(e_0'*ones(1,nstates));
    B = frac_self.*(errvec*ones(1,nstates));
    totalprobs = A+B;
    % S: share of totalprobs attributable to error; left at 0 where
    % totalprobs is 0 to avoid 0/0.
    S = zeros(4,nstates);
    pos = totalprobs > 0;
    S(pos) = A(pos)./totalprobs(pos);

    % This complicated looking expression is simply an end-around for
    % underflow.  It takes advantage of the fact that both totalprobs and v
    % have entries that are non-negative and sum-constrained.  So adding a
    % tiny bit (i.e. machine_eps) has no effect except to avoid log(0)
    tmp = (R'*log(totalprobs+machine_eps))+(ones(m,1)*log(v+machine_eps)');
    % Subtract each line's maximum log-weight before exponentiating, then
    % normalize each row to obtain the m x 22 posterior matrix H.
    mtmp = max(tmp,[],2);
    J = exp(tmp - mtmp*ones(1,nstates));
    H = J./(sum(J,2)*ones(1,nstates));

    e_1 = sum(S.*(R*H),2)'/Rtot;
    p_1 = K*H'*ones(m,1);
    p_1 = p_1/sum(p_1);
end

% ---- Outputs ------------------------------------------------------------

% Set final estimates of allele frequency and error probability
allele_freqs = p_1;
error_probs = e_1;

% Use final estimates to calculate posterior probabilities.  Note that
% post_probs uses the 10 condensed states.  For the full 22, use H instead
post_probs = zeros(m,10);
post_probs(:,1:4) = H(:,1:4);
for s = 1:6
    % Condensed state 4+s pools the three segregating columns of pair s.
    first_col = 5 + 3*(s-1);
    post_probs(:,4+s) = sum(H(:,first_col:first_col+2),2);
end

% The IUPAC-like codes for the 10 condensed states
OLC = 'ACGTMRWSYK';

% For each line identify the state with highest posterior probability
[~,I] = max(post_probs,[],2);
numeric_calls = I';
alpha_calls = OLC(I);