#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include <stdlib.h>
#include <math.h>
#include "leon.h"

/* Smaller / larger of two values.
   These are function-like macros: each argument may be evaluated twice,
   so never pass an expression with side effects (e.g. i++). */
#define MIN(a,b) ((a)<(b)?(a):(b))
#define MAX(a,b) ((a)>(b)?(a):(b))

/*
 * One core block shared between a sequence and the query sequence.
 * main() creates an entry for every pair of same-colour COREBLOCK features
 * (one from the sequence, one from the query) that overlap.
 */
typedef struct {                /* shared core blocks between 2 groups*/
        sint start;             /* start of the block in the sequence, as returned by col2pos() */
        sint end;               /* end of the block in the sequence, as returned by col2pos() */
        sint refstart;          /* start of the block in the query sequence, as returned by col2pos() */
        sint refend;            /* end of the block in the query sequence, as returned by col2pos() */
        sint start_col;         /* first alignment column of the overlap (larger of the two start columns) */
        sint end_col;           /* last alignment column of the overlap (smaller of the two end columns) */
        float score;            /* score copied from the sequence's COREBLOCK feature */
} SCBLOCK,*SCBLOCKPTR;

/*
 * A region built in main() by chaining consecutive SCBLOCKs of one sequence
 * that lie closer together than max_blockdist.
 */
typedef struct {                /* shared core regions between 2 groups*/
        sint start;             /* 'start' of the first block in the region */
        sint end;               /* 'end' of the last block added to the region */
        sint refstart;          /* 'refstart' of the first block in the region */
        sint refend;            /* 'refend' of the last block added to the region */
        sint start_col;         /* 'start_col' of the first block in the region */
        sint end_col;           /* 'end_col' of the last block added to the region */
	sint len;               /* sum of (end-start) over the chained blocks */
        float score;            /* sum of score*(end-start) over the chained blocks */
} SCREGION,*SCREGIONPTR;

/* Forward declarations of the helpers defined after main().
   sort_seqscores/swap_seqscores have external linkage; the rest are private
   to this file. Note: score_gaps is not called anywhere in this file. */
void sort_seqscores(float *scores,sint *index,int f,int l);
void swap_seqscores(float *scores,sint *index,int s1, int s2);
static sint get_seq_with_index(SEQ *seqs,sint nseqs,sint ii);
static sint score_gaps(SEQ *seqs,int s1,int s2);
static void sort_blocks(SCBLOCK *blocks,sint f,sint l);
static void swap_blocks(SCBLOCK *blocks,sint s1,sint s2);
static void add_ft_entry(ALNPTR mult_aln,sint seq,sint first,sint last,sint type,sint code,float score,char *ctype,char *name,sint is,sint ie);
static void remove_gap_positions(ALNPTR mult_aln);

/*
 * Usage: prog input_aln query_seq output_aln min_blockscore max_blockdist
 *             min_regionscore min_regionlength [-keep]
 *
 * Reads an alignment, locates the query sequence by (case-insensitive) name
 * and classifies every sequence into simgroup 1 (reported as HOMOLOG) or
 * simgroup 2 (reported as REJECT):
 *   - if the query has no COREBLOCK features, the classification uses only
 *     the pairwise distance to the query (tmat);
 *   - otherwise the core blocks shared with the query are collected, sorted,
 *     chained into regions, filtered by the command-line thresholds and the
 *     accumulated region score/length of each sequence is compared with that
 *     of the query.
 * The sequences are then re-ordered (group 1 first), columns that contain
 * only gaps are removed and the alignment is written to output_aln.
 *
 * Diagnostic BLOCK/REGION/HOMOLOG/REJECT lines are printed on stdout.
 * Returns 0 on success; exits with status 1 on any error.
 */
int main(int argc,char **argv)
{
	sint i,ii,j,k,s,e,n;
	sint len1,len2;
	float diff;
	char infile[FILENAMELEN+1];
	char outfile[FILENAMELEN+1];
	char query_name[MAXNAMES+1];
	ALN mult_aln;
	OPT opt;
	sint nseqs,len;
	sint *nblocks;
	sint *nregions;
	sint *nreflen;
	float *nrefscores;
	float *nrefscores_tmp;
	sint *output_index;
	sint *index;
	sint query_seq;
	sint min_blockscore,max_blockdist,min_regionscore,min_regionlen;
	double dscore;
	sint *is, *ie;
	char c;
	Boolean in_region;
	Boolean keep_unrelated;
	float score;
	float query_score;
	float pcid;
	sint tot,count;
	double **tmat;
	SCBLOCK **scblocks;
	SCREGION **scregions;

	if(argc!=8 && argc!=9) {
		fprintf(stdout,"Usage: %s input_aln query_seq output_aln min_blockscore max_blockdist min_regionscore min_regionlength [-keep]\n",argv[0]);
		exit(1);
	}

/* the names are copied into fixed-size buffers: reject anything that would not fit */
	if(strlen(argv[1])>FILENAMELEN || strlen(argv[3])>FILENAMELEN) {
		fprintf(stdout,"Error: file names are limited to %d characters\n",(int)FILENAMELEN);
		exit(1);
	}
	if(strlen(argv[2])>MAXNAMES) {
		fprintf(stdout,"Error: the query sequence name is limited to %d characters\n",(int)MAXNAMES);
		exit(1);
	}
	strcpy(infile,argv[1]);
	strcpy(query_name,argv[2]);
	strcpy(outfile,argv[3]);
	min_blockscore=atoi(argv[4]);
	max_blockdist=atoi(argv[5]);
	min_regionscore=atoi(argv[6]);
	min_regionlen=atoi(argv[7]);

/* NOTE(review): the default is already TRUE, so "-keep" currently changes
   nothing and rejected sequences are always kept in the output. The default
   was probably meant to be FALSE - confirm before changing, as that would
   alter the output of existing runs. */
	keep_unrelated=TRUE;
	if(argc==9) {
		if(strcmp(argv[8],"-keep")!=0) {
			fprintf(stdout,"Usage: %s input_aln query_seq output_aln min_blockscore max_blockdist min_regionscore min_regionlength [-keep]\n",argv[0]);
			exit(1);
		}
		keep_unrelated=TRUE;
	}

	init_options(&opt);

	(*opt.alnout_opt).output_clustal=FALSE;
	(*opt.alnout_opt).output_relacs=TRUE;

/* read in the sequences */
	seq_input(infile,opt.explicit_type,FALSE,&mult_aln);
	if(mult_aln.nseqs<=0) {
		error("No sequences in %s\n",infile);
		exit(1);
	}
	nseqs=mult_aln.nseqs;

/* find the query sequence (if the name occurs more than once, the last match wins) */
	query_seq=(-1);
	for(i=0;i<nseqs;i++) {
		if(strcasecmp(query_name,mult_aln.seqs[i].name)==0) query_seq=i;
	}
	if(query_seq==-1) {
		fprintf(stdout,"Error: the specified reference sequence (%s) was not found in the alignment\n",query_name);
		exit(1);
	}

/* find the start and end positions of each sequence
   (first and last column holding an alphabetic character) */
	is = (sint *)ckalloc((mult_aln.nseqs+1) * sizeof(sint));
	ie = (sint *)ckalloc((mult_aln.nseqs+1) * sizeof(sint));
	for(s=0;s<mult_aln.nseqs;s++) {
		is[s]=0;
		ie[s]=mult_aln.seqs[s].len-1;
		for(i=0;i<mult_aln.seqs[s].len;i++) {
			c=mult_aln.seqs[s].data[i];
			/* cast: <ctype.h> functions are undefined for negative char values */
			if(!isalpha((unsigned char)c))
				is[s]++;
			else
				break;
		}
		for(i=mult_aln.seqs[s].len-1;i>=0;i--) {
			c=mult_aln.seqs[s].data[i];
			if(!isalpha((unsigned char)c))
				ie[s]--;
			else
				break;
		}
	}

/* per-sequence accumulators; zeroed explicitly rather than relying on ckalloc */
	nrefscores=(float *)ckalloc((mult_aln.nseqs+1)*sizeof(float));
	nreflen=(sint *)ckalloc((mult_aln.nseqs+1)*sizeof(sint));
	index=(sint *)ckalloc((mult_aln.nseqs+1)*sizeof(sint));
	nblocks=(sint *)ckalloc((mult_aln.nseqs+1)*sizeof(sint));
	for(i=0;i<mult_aln.nseqs;i++) {
		index[i]=i;
		nblocks[i]=0;
		nrefscores[i]=0.0;
		nreflen[i]=0;
	}

/* pairwise distance matrix: (100 - countid())/100, with a zero diagonal */
	tmat = (double **) ckalloc( (mult_aln.nseqs+1) * sizeof (double *) );
	for(i=0;i<mult_aln.nseqs;i++)
		tmat[i] = (double *)ckalloc( (mult_aln.nseqs+1) * sizeof (double) );

	for(i=0;i<mult_aln.nseqs;i++) {
		tmat[i][i] = 0.0;
		for(j=i+1;j<mult_aln.nseqs;j++) {
			dscore = countid(mult_aln.seqs[i],mult_aln.seqs[j]);
			tmat[j][i] = tmat[i][j] = (100.0 - dscore)/100.0;
		}
	}

/* special case - we've got no groups, so no core blocks */
	if(mult_aln.ft[query_seq].nentries[COREBLOCK]<=0) {
/* classify on the pairwise distance to the query only */
		for(i=0;i<mult_aln.nseqs;i++) {
			if(tmat[i][query_seq]<=0.5)
				mult_aln.seqs[i].simgroup=1;
			else
				mult_aln.seqs[i].simgroup=2;
			nrefscores[i]=(1.0-tmat[i][query_seq])*100.0;
			nreflen[i]=mult_aln.seqs[i].len;
		}
		for(i=0;i<mult_aln.nseqs;i++) {
			mult_aln.seqs[index[i]].output_index=i;
		}
	}
	else {

/* calculate the number of shared blocks between each sequence and the reference sequence */
	for(i=0;i<mult_aln.nseqs;i++) {
		for(j=0;j<mult_aln.ft[i].nentries[COREBLOCK];j++) {
			pos2col(mult_aln.seqs[i].data,mult_aln.ft[i].data[COREBLOCK][j].start,mult_aln.ft[i].data[COREBLOCK][j].end,&s,&e);
			mult_aln.ft[i].data[COREBLOCK][j].start_col=s;
			mult_aln.ft[i].data[COREBLOCK][j].end_col=e;
		}
	}

	scblocks=(SCBLOCK **)ckalloc((mult_aln.nseqs+1) * sizeof(SCBLOCK *));

	for(i=0;i<mult_aln.nseqs;i++) {
/* count the number of blocks needed first, then allocate enough memory to store data */
		n=0;
		for(k=0;k<mult_aln.ft[i].nentries[COREBLOCK];k++) {
			for(j=0;j<mult_aln.ft[query_seq].nentries[COREBLOCK];j++) {
				if(mult_aln.ft[query_seq].data[COREBLOCK][j].color==mult_aln.ft[i].data[COREBLOCK][k].color)
				if((len=overlap(mult_aln.ft[query_seq].data[COREBLOCK][j].start_col,mult_aln.ft[query_seq].data[COREBLOCK][j].end_col,mult_aln.ft[i].data[COREBLOCK][k].start_col,mult_aln.ft[i].data[COREBLOCK][k].end_col))>3) {
					n++;
				}
			}
		}
		scblocks[i]=(SCBLOCK *)ckalloc((n+1) * sizeof(SCBLOCK));

/* second pass with the same test: record the overlapping part of each pair */
		for(k=0;k<mult_aln.ft[i].nentries[COREBLOCK];k++) {
			for(j=0;j<mult_aln.ft[query_seq].nentries[COREBLOCK];j++) {
				if(mult_aln.ft[query_seq].data[COREBLOCK][j].color==mult_aln.ft[i].data[COREBLOCK][k].color)
				if((len=overlap(mult_aln.ft[query_seq].data[COREBLOCK][j].start_col,mult_aln.ft[query_seq].data[COREBLOCK][j].end_col,mult_aln.ft[i].data[COREBLOCK][k].start_col,mult_aln.ft[i].data[COREBLOCK][k].end_col))>3) {
					scblocks[i][nblocks[i]].start_col=MAX(mult_aln.ft[query_seq].data[COREBLOCK][j].start_col,mult_aln.ft[i].data[COREBLOCK][k].start_col);
					scblocks[i][nblocks[i]].end_col=MIN(mult_aln.ft[query_seq].data[COREBLOCK][j].end_col,mult_aln.ft[i].data[COREBLOCK][k].end_col);
					col2pos(mult_aln.seqs[i].data,scblocks[i][nblocks[i]].start_col,scblocks[i][nblocks[i]].end_col,&s,&e);
					scblocks[i][nblocks[i]].start=s;
					scblocks[i][nblocks[i]].end=e;
					col2pos(mult_aln.seqs[query_seq].data,scblocks[i][nblocks[i]].start_col,scblocks[i][nblocks[i]].end_col,&s,&e);
					scblocks[i][nblocks[i]].refstart=s;
					scblocks[i][nblocks[i]].refend=e;
					scblocks[i][nblocks[i]].score=mult_aln.ft[i].data[COREBLOCK][k].score;
					nblocks[i]++;
				}
			}
		}
	}

/* sort the blocks for each sequence into ascending order of sequence position */
	for(i=0;i<mult_aln.nseqs;i++)
		sort_blocks(scblocks[i],0,nblocks[i]-1);

/* chain the blocks together if no large gaps */
	scregions=(SCREGION **)ckalloc((mult_aln.nseqs+1) * sizeof(SCREGION *));
	for(i=0;i<mult_aln.nseqs;i++)
		scregions[i]=(SCREGION *)ckalloc((nblocks[i]+1) * sizeof(SCREGION));
	nregions=(sint *)ckalloc((mult_aln.nseqs+1)*sizeof(sint));
	for(i=0;i<mult_aln.nseqs;i++) {
		n=(-1);			/* index of the region currently being built */
		nregions[i]=0;
		in_region=FALSE;
		for(j=0;j<nblocks[i];j++) {
			if( scblocks[i][j].score<min_blockscore) continue;
fprintf(stdout,"BLOCK %s %d %d %d-%d %d-%d %.2f\n",mult_aln.seqs[i].name,scblocks[i][j].start_col,scblocks[i][j].end_col,
								scblocks[i][j].start,scblocks[i][j].end,
								scblocks[i][j].refstart,scblocks[i][j].refend,
								scblocks[i][j].score);
/* too far from the current region (in either sequence): close the region */
			if(in_region==TRUE) {
				if(scblocks[i][j].start-scregions[i][n].end >= max_blockdist || scblocks[i][j].refstart-scregions[i][n].refend >= max_blockdist) {
					in_region=FALSE;
				}
			}
			if(in_region==FALSE) {
/* start a new region */
				if(scblocks[i][j].score>0) {
					n++;
					scregions[i][n].start_col=scblocks[i][j].start_col;
					scregions[i][n].end_col=scblocks[i][j].end_col;
					scregions[i][n].start=scblocks[i][j].start;
					scregions[i][n].end=scblocks[i][j].end;
					scregions[i][n].refstart=scblocks[i][j].refstart;
					scregions[i][n].refend=scblocks[i][j].refend;
					/* first block of the region: assign, do not accumulate */
					scregions[i][n].score=scblocks[i][j].score*(scblocks[i][j].end-scblocks[i][j].start);
					scregions[i][n].len=scblocks[i][j].end-scblocks[i][j].start;
					in_region=TRUE;
				}
			}
			else {
/* extend the region by adding this block */
				if(scblocks[i][j].score>0) {
					scregions[i][n].end_col=scblocks[i][j].end_col;
					scregions[i][n].end=scblocks[i][j].end;
					scregions[i][n].refend=scblocks[i][j].refend;
					scregions[i][n].score+=scblocks[i][j].score*(scblocks[i][j].end-scblocks[i][j].start);
					scregions[i][n].len+=scblocks[i][j].end-scblocks[i][j].start;
				}
			}
		}
		nregions[i]=n+1;
	}

/* keep the regions that pass the thresholds and add them to the per-sequence totals */
	for(i=0;i<mult_aln.nseqs;i++) {
		for(j=0;j<nregions[i];j++) {
			len1=scregions[i][j].end-scregions[i][j].start;
			len2=scregions[i][j].refend-scregions[i][j].refstart;
			/* ratio of the shorter to the longer extent */
			if(len1<len2)
				diff=(float)len1/(float)len2;
			else
				diff=(float)len2/(float)len1;
			if(diff>0.5 && scregions[i][j].len>min_regionlen && scregions[i][j].score>min_regionscore) {
				/* percent identity with the query over the columns of the region */
				count=tot=0;
				for(k=scregions[i][j].start_col;k<scregions[i][j].end_col;k++) {
					if(isalpha((unsigned char)mult_aln.seqs[i].data[k]) && isalpha((unsigned char)mult_aln.seqs[query_seq].data[k])) {
						tot++;
						if(mult_aln.seqs[query_seq].data[k]==mult_aln.seqs[i].data[k]) count++;
					}
				}
				if(tot==0) pcid=0;
				else pcid=100.0*(float)count/(float)tot;
fprintf(stdout,"REGION %s %d %d %d-%d %d-%d %.2f %.2f\n",mult_aln.seqs[i].name,scregions[i][j].start_col,scregions[i][j].end_col,
								scregions[i][j].start,scregions[i][j].end,
								scregions[i][j].refstart,scregions[i][j].refend,
								scregions[i][j].score,pcid);
				add_ft_entry(&mult_aln,i,scregions[i][j].start_col,scregions[i][j].end_col,REGION,0,scregions[i][j].score,"REGION","REGION",is[i],ie[i]);
				nrefscores[i]+=scregions[i][j].score;
				nreflen[i]+=scregions[i][j].len;
			}
		}
	}

/* copy scores to temporary array for sorting */
	nrefscores_tmp=(float *)ckalloc((mult_aln.nseqs+1)*sizeof(float));
	for(i=0;i<mult_aln.nseqs;i++) {
		nrefscores_tmp[i]=nrefscores[i];
	}

	sort_seqscores(nrefscores_tmp,index,0,mult_aln.nseqs-1);

	for(i=0;i<mult_aln.nseqs;i++) {
		mult_aln.seqs[index[i]].output_index=i;
	}

/* final check of all regions */
	query_score=nrefscores[query_seq];

	if(query_score==0.0) {
/* if we don't have any regions at all, even in the query, just output the query subgroup */
		for(i=0;i<mult_aln.nseqs;i++) {
			if(i==query_seq || tmat[i][query_seq]<0.2) mult_aln.seqs[i].simgroup=1;
			else mult_aln.seqs[i].simgroup=2;
		}
	}
	else {
		for(i=0;i<mult_aln.nseqs;i++) {
			score=nrefscores[i];
			if ((score>query_score/2.0) || (score>query_score/4.0 && nreflen[i]>20) || (score>query_score/7.5 && nreflen[i]>30) || (score>query_score/14.0 && nreflen[i]>45)) {
				mult_aln.seqs[i].simgroup=1;
			}
			/*else if ((nreflen[i]>=70) || (nreflen[i]>=60 && score>800) || (nreflen[i]>=40 && score>1500) || (nreflen[i]>=20 && score>5000)) {*/
			else if ((nreflen[i]>=min_regionlen && score>min_regionscore)) {
				mult_aln.seqs[i].simgroup=1;
			}
			else {
				mult_aln.seqs[i].simgroup=2;
			}
		}
	}
	}


/* re-order sequences according to groups: group 1 first, then group 2 */
	output_index=(sint *)ckalloc((mult_aln.nseqs+1)*sizeof(sint));
	n=0;
	for(ii=0;ii<mult_aln.nseqs;ii++) {
		i=get_seq_with_index(mult_aln.seqs,mult_aln.nseqs,ii);
		if(mult_aln.seqs[i].simgroup==1) {
			output_index[i]=n++;
fprintf(stdout,"HOMOLOG %-10s %d %d %.2f %.2f\n",mult_aln.seqs[i].name,(sint)nblocks[i],nreflen[i],nrefscores[i],(float)(1.0-tmat[i][query_seq])*100.0);
		}
	}
	for(ii=0;ii<mult_aln.nseqs;ii++) {
		i=get_seq_with_index(mult_aln.seqs,mult_aln.nseqs,ii);
		if(mult_aln.seqs[i].simgroup==2) {
			if(keep_unrelated)
				output_index[i]=n++;
			else
				output_index[i]=(-1);
fprintf(stdout,"REJECT %-10s %d %d %.2f %.2f\n",mult_aln.seqs[i].name,(sint)nblocks[i],nreflen[i],nrefscores[i],(float)(1.0-tmat[i][query_seq])*100.0);
		}
	}

	for(i=0;i<mult_aln.nseqs;i++) mult_aln.seqs[i].output_index=output_index[i];
	remove_gap_positions(&mult_aln);

	strcpy((*opt.alnout_opt).relacs_outname, outfile);

	if(!open_alignment_output(infile,opt.alnout_opt)) exit(1);
	create_alignment_output(mult_aln,*opt.alnout_opt);

	return 0;
}

/*
 * Quicksort scores[f..l] into descending order, applying every exchange to
 * index[] as well so that the two arrays stay paired.
 * The middle element is used as the pivot; nothing is done when f>=l.
 */
void sort_seqscores(float *scores,sint *index,int f,int l)
{
	int boundary,scan;

	if(f<l) {
		/* park the pivot (middle element) at the front of the range */
		swap_seqscores(scores,index,f,(f+l)/2);

		/* collect every score larger than the pivot just behind it */
		boundary=f;
		scan=f+1;
		while(scan<=l) {
			if(scores[scan]>scores[f]) {
				boundary++;
				swap_seqscores(scores,index,boundary,scan);
			}
			scan++;
		}

		/* drop the pivot into its final slot, then sort both sides */
		swap_seqscores(scores,index,f,boundary);
		sort_seqscores(scores,index,f,boundary-1);
		sort_seqscores(scores,index,boundary+1,l);
	}
}

/*
 * Exchange elements s1 and s2 of scores[] and of the parallel index[] array.
 * Each array is swapped through a temporary of its own element type, so an
 * index value is never converted to float and back.
 */
void swap_seqscores(float *scores,sint *index,int s1, int s2)
{
	float score_tmp;
	sint index_tmp;

	score_tmp=scores[s1];
	scores[s1]=scores[s2];
	scores[s2]=score_tmp;

	index_tmp=index[s1];
	index[s1]=index[s2];
	index[s2]=index_tmp;
}

/*
 * Quicksort blocks[f..l] into ascending order of their 'start' field.
 * The middle element is used as the pivot; nothing is done when f>=l.
 */
void sort_blocks(SCBLOCK *blocks,sint f,sint l)
{
	int boundary,scan;

	if(f<l) {
		/* park the pivot (middle element) at the front of the range */
		swap_blocks(blocks,f,(f+l)/2);

		/* collect every block starting before the pivot just behind it */
		boundary=f;
		scan=f+1;
		while(scan<=l) {
			if(blocks[scan].start<blocks[f].start) {
				boundary++;
				swap_blocks(blocks,boundary,scan);
			}
			scan++;
		}

		/* drop the pivot into its final slot, then sort both sides */
		swap_blocks(blocks,f,boundary);
		sort_blocks(blocks,f,boundary-1);
		sort_blocks(blocks,boundary+1,l);
	}
}

void swap_blocks(SCBLOCK *blocks,int s1, int s2)
{
	int t;
        float f;

        t=blocks[s1].start;
        blocks[s1].start=blocks[s2].start;
        blocks[s2].start=t;
        t=blocks[s1].end;
        blocks[s1].end=blocks[s2].end;
        blocks[s2].end=t;
        t=blocks[s1].refstart;
        blocks[s1].refstart=blocks[s2].refstart;
        blocks[s2].refstart=t;
        t=blocks[s1].refend;
        blocks[s1].refend=blocks[s2].refend;
        blocks[s2].refend=t;
        t=blocks[s1].start_col;
        blocks[s1].start_col=blocks[s2].start_col;
        blocks[s2].start_col=t;
        t=blocks[s1].end_col;
        blocks[s1].end_col=blocks[s2].end_col;
        blocks[s2].end_col=t;
        f=blocks[s1].score;
        blocks[s1].score=blocks[s2].score;
        blocks[s2].score=f;

}

/*
 * Append a feature of kind 'type' to sequence 'seq' of the alignment.
 *
 * first,last  column range of the feature; it is clipped to [is,ie] and the
 *             feature is dropped silently if it lies completely outside
 * code,score  stored in the entry's color and score fields
 * ctype,name  copied into the entry's type and name fields
 *
 * The stored start/end come from col2pos(); each is moved one position inwards
 * when converting it back with pos2col() does not give the requested column.
 * A warning is printed, and nothing is added, when the sequence already has
 * more than MAXFT entries of this type.
 * NOTE(review): whether index MAXFT itself is a valid slot depends on the
 * feature table declaration, which is not visible here - confirm the '>' test.
 */
static void add_ft_entry(ALNPTR mult_aln,sint seq,sint first,sint last,sint type,sint code,float score,char *ctype,char *name,sint is,sint ie)
{
	sint slot;
	sint res_first,res_last;
	sint col_first,col_last;

	if(mult_aln->ft[seq].nentries[type]>MAXFT) {
		fprintf(stdout,"WARNING: too many features in %s %d (%d)\n",mult_aln->seqs[seq].name,mult_aln->ft[seq].nentries[type],type);
		return;
	}

	/* nothing to do if the feature does not touch the sequence at all */
	if(last<is || first>ie) return;

	/* clip the feature to the extent of the sequence */
	first=(first<is) ? is : first;
	last=(last>ie) ? ie : last;

	slot=mult_aln->ft[seq].nentries[type];
	alloc_ft_entry(&mult_aln->ft[seq].data[type][slot]);
	strcpy(mult_aln->ft[seq].data[type][slot].type,ctype);

	/* columns -> positions, then back again to see whether each end round-trips */
	col2pos(mult_aln->seqs[seq].data,first,last,&res_first,&res_last);
	pos2col(mult_aln->seqs[seq].data,res_first,res_last,&col_first,&col_last);
	mult_aln->ft[seq].data[type][slot].start=(col_first==first) ? res_first : res_first+1;
	mult_aln->ft[seq].data[type][slot].end=(col_last==last) ? res_last : res_last-1;

	strcpy(mult_aln->ft[seq].data[type][slot].name,name);
	mult_aln->ft[seq].data[type][slot].color=code;
	mult_aln->ft[seq].data[type][slot].score=score;
	mult_aln->ft[seq].nentries[type]++;
}

/*
 * Return the position in seqs[0..nseqs-1] of the first sequence whose
 * output_index equals ii. If there is none, the value returned is nseqs
 * (or 0 when nseqs is not positive), so callers must not assume it is a
 * valid index.
 */
static sint get_seq_with_index(SEQ *seqs,sint nseqs,sint ii)
{
	sint pos=0;

	while(pos<nseqs && seqs[pos].output_index!=ii)
		pos++;

	return pos;
}

/*
 * Count the gap positions in the pairwise alignment of seqs[s1] and seqs[s2].
 *
 * Only the columns from the first column in which both sequences hold a
 * letter up to (not including) the last such column are examined; if no such
 * column exists the whole common length is used. A column counts when
 * exactly one of the two sequences has a letter there; columns where neither
 * has a letter (common gaps) are ignored.
 *
 * Returns the number of counted columns. This is the same total the previous
 * implementation produced by copying both sequences and adding up their gap
 * characters; the gap-opening count it also computed was never used and has
 * been dropped together with the temporary copies.
 */
static sint score_gaps(SEQ *seqs,int s1,int s2)
{
	int k,len;
	int is,ie;
	int gaplen;
	int res1,res2;

	/* only the columns present in both sequences can be compared */
	if(seqs[s1].len<seqs[s2].len) len = seqs[s1].len;
	else len = seqs[s2].len;

/* find the start and end of the pairwise alignment */
	is=0;
	ie=len;

	for(k=0;k<len;k++) {
		/* casts: <ctype.h> functions are undefined for negative char values */
		if(isalpha((unsigned char)seqs[s1].data[k]) && isalpha((unsigned char)seqs[s2].data[k])) {
			is=k;
			break;
		}
	}
	for(k=len-1;k>=0;k--) {
		if(isalpha((unsigned char)seqs[s1].data[k]) && isalpha((unsigned char)seqs[s2].data[k])) {
			ie=k;
			break;
		}
	}

/* count the columns where one sequence has a residue and the other a gap */
	gaplen=0;
	for(k=is;k<ie;k++) {
		res1=(isalpha((unsigned char)seqs[s1].data[k])!=0);
		res2=(isalpha((unsigned char)seqs[s2].data[k])!=0);
		if(res1!=res2) gaplen++;
	}

	return gaplen;
}

/*
 * Delete every alignment column in which none of the kept sequences
 * (output_index>=0) has an alphabetic character. Sequences that are not kept
 * are left untouched.
 *
 * The columns examined are those of the first kept sequence; like the
 * previous implementation this reads the same columns of every other kept
 * sequence, i.e. it assumes they are at least as long.
 * Surviving columns are compacted in a single pass, then whatever follows the
 * examined columns (including the element at index len) is moved down and
 * each kept sequence's len is reduced by the number of removed columns.
 */
static void remove_gap_positions(ALNPTR mult_aln)
{
	int s,col,k;
	int nkept,fseq;
	int ncols,out,removed;
	int has_residue;

	/* count the kept sequences and remember the first one */
	nkept=0;
	fseq=(-1);
	for(s=0;s<mult_aln->nseqs;s++) {
		if(mult_aln->seqs[s].output_index>=0) {
			nkept++;
			if(fseq==(-1)) fseq=s;
		}
	}

	if(nkept==0) return;

	ncols=mult_aln->seqs[fseq].len;
	out=0;			/* next column to write to */
	for(col=0;col<ncols;col++) {
		has_residue=0;
		for(s=0;s<mult_aln->nseqs;s++) {
			/* cast: <ctype.h> functions are undefined for negative char values */
			if(mult_aln->seqs[s].output_index>=0 && isalpha((unsigned char)mult_aln->seqs[s].data[col])) {
				has_residue=1;
				break;
			}
		}
		if(!has_residue) continue;	/* gap-only column: drop it */

		if(out!=col) {
			for(s=0;s<mult_aln->nseqs;s++) {
				if(mult_aln->seqs[s].output_index>=0)
					mult_aln->seqs[s].data[out]=mult_aln->seqs[s].data[col];
			}
		}
		out++;
	}

	removed=ncols-out;
	if(removed==0) return;

	/* move the rest of each kept sequence down, up to and including index len */
	for(s=0;s<mult_aln->nseqs;s++) {
		if(mult_aln->seqs[s].output_index<0) continue;
		for(k=ncols;k<=mult_aln->seqs[s].len;k++)
			mult_aln->seqs[s].data[k-removed]=mult_aln->seqs[s].data[k];
		mult_aln->seqs[s].len-=removed;
	}
}

