##########################################################################
# VaRank 1.0                                                             #
#                                                                        #
# VaRank: a simple and powerful tool for ranking genetic variants        #
#                                                                        #
# Copyright (C) 2014 Veronique Geoffroy (veronique.geoffroy@inserm.fr)   # 
#                    Jean Muller (jeanmuller@unistra.fr)                 # 
#                                                                        #
# Please cite the following article:                                     #
#    XXX                                                                 #
#                                                                        #
# This is part of VaRank source code.                                    #
#                                                                        #
# This program is free software; you can redistribute it and/or          #
# modify it under the terms of the GNU General Public License            # 
# as published by the Free Software Foundation; either version 3         # 
# of the License, or (at your option) any later version.                 #
#                                                                        #
# This program is distributed in the hope that it will be useful,        # 
# but WITHOUT ANY WARRANTY; without even the implied warranty of         #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the          #
# GNU General Public License for more details.                           #
#                                                                        #
# You should have received a copy of the GNU General Public License      #
# along with this program; If not, see <http://www.gnu.org/licenses/>.   #
##########################################################################

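##
## Filter all the existing ranking files (those not already filtered).
##
## OUTPUTS: 	g_VaRank(vcfDir)/"family"_"patient"_filteredVariants.rankingByVar.tsv (one per patient)
##          	plus one "filteredVariants" file for every other "allVariants.rankingBy*" file found
##          	(those are re-ranked by gene after filtering).
##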

# executeFilters --
#
#   Applies the user's filters to every "*allVariants.rankingBy*.tsv" file
#   found in g_VaRank(vcfDir) and writes the surviving lines to the matching
#   "*filteredVariants.rankingBy*.tsv" file in the same directory.
#   Previously generated "*filteredVariants*.tsv" files are deleted first.
#
#   Filters applied to each variant line (columns located by header name):
#     - sequencing: TotalReadDepth, VarReadDepth and %Reads_variation against
#       g_VaRank(depthFilter), g_VaRank(readFilter), g_VaRank(readPercentFilter)
#     - frequency : rsMAF, espAllMAF, espEAMAF, espAAMAF against
#       g_VaRank(freqFilter) (skipped when that option is empty)
#     - dbSNP     : when g_VaRank(rsFilter) is "removeNonPathoRS", validated
#       rs entries that are not (probable-)pathogenic are removed
#
#   Files whose output name contains "rankingByVar" are written as is.
#   Any other ranking file is re-ranked by gene after filtering.
#
# Globals read:
#   g_VaRank    array of options (vcfDir, depthFilter, readFilter,
#               readPercentFilter, freqFilter, metrics, rsFilter)
#   g_vcfINFOS  array indexed by variant ID; searched for
#               "<patient>:<zygosity>:" to detect homozygous variants
#
# Arguments: none
# Results  : none (returns an empty string). Exits the interpreter when a
#            required column is missing from a header line.
#
# Relies on LinesFromFile, ReplaceTextInFile and DescendingSortOnElement1,
# which are defined outside this procedure.
#
proc executeFilters {} {

    global g_VaRank
    global g_vcfINFOS

    puts "...writing all filtered ranking files ([clock format [clock seconds] -format "%B %d %Y - %H:%M"])"

    ## Remove all old filtered files
    ################################
    foreach filteredFile [glob -nocomplain $g_VaRank(vcfDir)/*filteredVariants*.tsv] {file delete -force $filteredFile}

    ## Filter every line with the user's options.
    ## Create the filtered files.
    #############################################################
    foreach rankFile [glob -nocomplain $g_VaRank(vcfDir)/*Variants.rankingBy*.tsv] {

        # Build the output name from the file name only, so that a directory
        # called "...allVariants..." is never rewritten. A file without
        # "allVariants" in its name is skipped: otherwise the output path
        # would equal the input path and the source file would be overwritten.
        set rankTail [file tail $rankFile]
        if {![regsub "all(Variants)" $rankTail "filtered\\1" outputTail]} {continue}
        set outputfile [file join [file dirname $rankFile] $outputTail]

        set OutputText ""
        foreach L [LinesFromFile $rankFile] {
            # Empty lines are dropped.
            if {$L == ""} {continue}
            # The barcode line is appended without a leading newline
            # (presumably it is the first line of the file - TODO confirm).
            if {[regexp "^## Barcode:" $L]} {append OutputText     $L; continue}
            if {[regexp "^##"          $L]} {append OutputText "\n$L"; continue}

            if {[regexp "^VariantID" $L]} {
                append OutputText "\n$L"
                set L [split $L "\t"]

                # Locate every column needed by the filters. The reported name
                # is always the name that was actually searched for.
                foreach {idxVar colName} {
                    i_clinic    rsClinicalSignificance
                    i_valid     rsValidation
                    i_cov       TotalReadDepth
                    i_read      VarReadDepth
                    i_percent   %Reads_variation
                    i_rsMAF     rsMAF
                    i_espEAMAF  espEAMAF
                    i_espAAMAF  espAAMAF
                    i_espAllMAF espAllMAF
                } {
                    set $idxVar [lsearch -regexp $L "^$colName"]
                    if {[set $idxVar] == -1} {puts "$colName: column not found - Exit";exit}
                }
                continue
            }

            set L [split $L "\t"]
            regsub -all " " [lindex $L $i_cov]     "" cov
            regsub -all " " [lindex $L $i_read]    "" read
            regsub -all " " [lindex $L $i_percent] "" percent

            #Filtering on sequencing data
            #
            # Only applied when a usable total depth is available.
            if {$cov != "" && $cov != "NA" && $cov != "." && $cov != "0"} {
                if {$cov < $g_VaRank(depthFilter)} {continue}

                if {$read != "" && $read != "NA" && $read != "."} {
                    if {$read    < $g_VaRank(readFilter)}        {continue}
                    if {$percent < $g_VaRank(readPercentFilter)} {continue}
                }
            }

            #Filtering on frequency data
            #
            if {$g_VaRank(freqFilter) != ""} {

                #Global minor allele frequency (MAF)
                #The MAF is actually the second most frequent allele value. In other words, if there are 3 alleles, with frequencies of 0.50, 0.49, and 0.01, the MAF will be reported as 0.49.
                #The current default global population is 1000Genome phase 1 genotype data from 1094 worldwide individuals, released in the May 2011 dataset.
                #http://www.ncbi.nlm.nih.gov/projects/SNP/docs/rs_attributes.html

                regsub -all " " [lindex $L $i_rsMAF]     "" rsMAF
                regsub -all " " [lindex $L $i_espEAMAF]  "" espEAMAF
                regsub -all " " [lindex $L $i_espAAMAF]  "" espAAMAF
                regsub -all " " [lindex $L $i_espAllMAF] "" espAllMAF

                set TooFrequent 0

                foreach freq [list $rsMAF $espAllMAF $espEAMAF $espAAMAF] {
                    # With the "fr" metrics the decimal separator is a comma:
                    # turn it back into a dot before comparing.
                    if {[set g_VaRank(metrics)]=="fr"} {
                        regsub {\,} $freq "." freq
                    }

                    if {$freq != "NA" && $freq != "" && $freq>=$g_VaRank(freqFilter)} {set TooFrequent 1;break}
                }

                if {$TooFrequent} {continue}
            }

            #Filtering on dbSNP data (validation status and clinical significance)
            #
            #Here we need to improve filter and add also frequency
            if {$g_VaRank(rsFilter) == "removeNonPathoRS"} {
                regsub -all " " [lindex $L $i_valid]  "" valid
                regsub -all " " [lindex $L $i_clinic] "" clinic

                #Clinical significance:
                #Assertions of clinical significance for alleles of human sequence variations are reported as provided by the submitter and not interpreted by NCBI.
                #Submissions based on processing data from OMIM were assigned the value of 'probable-pathogenic', based on a personal communication from Ada Hamosh, director of OMIM.
                #http://www.ncbi.nlm.nih.gov/projects/SNP/docs/rs_attributes.html

                #unknown
                #untested
                #non-pathogenic
                #probable-non-pathogenic
                #probable-pathogenic
                #pathogenic
                #drug-response
                #histocompatibility
                #other

                #Keeping the pathogenic ones
                if {![regexp "^pathogenic" $clinic] && ![regexp "^probable-pathogenic" $clinic]} {
                    #Removing the ones with 2 validations or more (at least one
                    #"/" or ";" separator) or flagged "yes"
                    if {[regsub -all "/|;" $valid "" toto] >= 1 || [regexp -nocase "^yes$" $valid]} {continue}
                }
            }

            append OutputText "\n[join $L "\t"]"
        }

        if {[regexp "rankingByVar" $outputTail]} {
            ReplaceTextInFile $OutputText $outputfile
            continue
        }

        ## Ranking by gene have to be done once again after filtering
        #
        # The patient name is extracted from the file name only. It is reset
        # for every file so that a value from a previous file is never reused.
        set patient ""
        regexp {fam[0-9]+_(.+)_allVariants.rankingBy} $rankTail match patient
        # Escape every non-word character so the patient name is matched
        # literally when embedded in a regular expression below.
        regsub -all {\W} $patient {\\&} patientRE

        set OutputText2 ""
        set lGenes ""
        set lVariantsRankingByGene ""
        foreach L [split $OutputText "\n"] {
            if {$L == ""} {continue}
            if {[regexp "^## Barcode:" $L]} {append OutputText2     $L; continue}
            if {[regexp "^##"          $L]} {append OutputText2 "\n$L"; continue}
            if {[regexp "^VariantID" $L]} {
                append OutputText2 "\n$L"
                set Ls [split $L "\t"]
                set i_gene  [lsearch -regexp $Ls "^Gene$"];            if {$i_gene  == -1} {puts "Gene: column number not found - Exit"; exit}
                set i_score [lsearch -regexp $Ls "^VaRank_VarScore$"]; if {$i_score == -1} {puts "VaRank_VarScore: column number not found - Exit"; exit}
                continue
            }
            set Ls [split $L "\t"]
            regsub -all " " [lindex $Ls 0]        "" id
            regsub -all " " [lindex $Ls $i_gene]  "" gene
            if {$gene == "NA"} {continue}

            # Genes are kept in order of first appearance.
            if {[lsearch -exact $lGenes $gene] == -1} {
                lappend lGenes $gene
            }
            # Full text of the line, indexed by variant ID.
            set l($id) $L

            regsub -all " " [lindex $Ls $i_score] "" score

            lappend lVariants($gene) [list $id $score]
        }

        foreach gene $lGenes {
            set liste {}
            foreach duoIDScore $lVariants($gene) {
                lappend liste [lindex $duoIDScore 0]
            }

            # The first variant of a gene is used as its best one
            # (presumably the input is ordered by decreasing score - TODO confirm).
            set maxScore1 [lindex [lindex $lVariants($gene) 0] 1]

            # The best score is doubled when the gene has a single variant or
            # when its first variant is homozygous for this patient.
            set doubleBest 0
            if {[llength $lVariants($gene)] == 1} {
                set doubleBest 1
            } else {
                set bestID [lindex [lindex $lVariants($gene) 0] 0]
                # Zygosity is looked up only when the patient is known and the
                # variant has an entry in g_vcfINFOS.
                if {$patient != "" && [info exists g_vcfINFOS($bestID)]} {
                    if {[regexp "$patientRE:(\[^: \]+):" $g_vcfINFOS($bestID) match homhtz] && [regexp -nocase "hom" $homhtz]} {
                        set doubleBest 1
                    }
                }
            }

            if {$doubleBest} {
                set BestDuoScore [expr {$maxScore1*2}]
            } else {
                set maxScore2 [lindex [lindex $lVariants($gene) 1] 1]
                set BestDuoScore [expr {$maxScore1+$maxScore2}]
            }
            lappend lVariantsRankingByGene [list $liste $BestDuoScore]
        }

        set lVariantsRankingByGene [lsort -command DescendingSortOnElement1 $lVariantsRankingByGene]

        foreach val $lVariantsRankingByGene {
            foreach id [lindex $val 0] {
                append OutputText2 "\n$l($id)"
            }
        }

        ReplaceTextInFile "$OutputText2" $outputfile

        ## Very important: the per-file arrays must not leak into the next file
        if {[info exists l]} {unset l}
        if {[info exists lVariants]} {unset lVariants}
    }
    return
}


