##########################################################################
# VaRank 1.0                                                             #
#                                                                        #
# VaRank: a simple and powerful tool for ranking genetic variants        #
#                                                                        #
# Copyright (C) 2014 Veronique Geoffroy (veronique.geoffroy@inserm.fr)   # 
#                    Jean Muller (jeanmuller@unistra.fr)                 # 
#                                                                        #
# Please cite the following article:                                     #
#    XXX                                                                 #
#                                                                        #
# This is part of VaRank source code.                                    #
#                                                                        #
# This program is free software; you can redistribute it and/or          #
# modify it under the terms of the GNU General Public License            # 
# as published by the Free Software Foundation; either version 3         # 
# of the License, or (at your option) any later version.                 #
#                                                                        #
# This program is distributed in the hope that it will be useful,        # 
# but WITHOUT ANY WARRANTY; without even the implied warranty of         #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the          #
# GNU General Public License for more details.                           #
#                                                                        #
# You should have received a copy of the GNU General Public License      #
# along with this program; If not, see <http://www.gnu.org/licenses/>.   #
##########################################################################


## Return the bonus value to add to the VaRank score.
##
## Arguments:
##   siftPred  - SIFT prediction label; the SIFT bonus requires it to match "Deleterious"
##   siftMed   - SIFT median; the SIFT bonus requires a number within [2.75, 3.5]
##   pph2      - PPH2 class; a bonus is granted when it matches "deleterious"
##   phastcons - phastCons value; a bonus is granted when it is a number > 0.95
##
## Non numeric values (e.g. "NA" or an empty string) never grant a bonus.
##
## Returns the integer bonus: +10 (SIFT), +10 (PPH2), +5 (phastCons), cumulated.
proc addBonus {siftPred siftMed pph2 phastcons} {

    # Without this declaration g_VaRank is a (non existing) local variable
    # and the DEBUG traces below could never be printed.
    global g_VaRank

    if {[info exists g_VaRank(DEBUG)]} {puts "AddBonue values: $siftPred $siftMed $pph2 $phastcons"}

    set bonus 0

    # -strict: an empty string is not a number (plain "string is double" accepts it)
    if {[string is double -strict $siftMed]} {
        if {[regexp "Deleterious" $siftPred] && $siftMed >= 2.75 && $siftMed <= 3.5} {
            incr bonus 10
        }
    }
    if {[regexp "deleterious" $pph2]} {
        incr bonus 10
    }
    if {[string is double -strict $phastcons] && $phastcons > 0.95} {
        incr bonus 5
    }

    if {[info exists g_VaRank(DEBUG)]} {puts "Bonus $bonus"}

    return $bonus
}

## Score every variant stored in g_ALAMUT and fill the g_lScore list.
##
## Globals read:
##   g_ALAMUT(#id)   - tab separated header line, used to locate the columns
##   g_ALAMUT($ID)   - list of tab separated annotation lines (one per isoform)
##   g_VaRank(S_*)   - score of each variant category
##   g_VaRank(SSFcutoff) / (MEScutoff) / (NNScutoff) - splice delta cutoffs (in %)
##   g_VaRank(DEBUG) - when it exists, traces are printed
## Globals written:
##   g_deltaSSF/g_deltaMES/g_deltaNNS($ID,$transcript) - splice score variation
##                     in % ("NA" when the wild type score is empty, "NA" or 0)
##   g_ALAMUT($ID)   - replaced by the single retained isoform line, whose gene
##                     column lists the retained gene first then the other genes,
##                     separated by "/"
##   g_lScore        - list of "$ID $score" elements, sorted at the end with the
##                     DescendingSortOnElement1 command (defined elsewhere)
##
## The isoform retained for a variant is the one with the highest score; for
## equal scores the one with the greatest transLen value is kept.
## The process exits when a mandatory column is missing or when no score at
## all could be computed.
proc scoreAllTheID {} {

    global g_VaRank
    global g_ALAMUT
    global g_lScore
    global g_deltaSSF
    global g_deltaMES
    global g_deltaNNS

    ## Define the "g_lScore" global variable with the genetic variants analysed by alamut
    #####################################################################################
    puts "...scoring each genetic variant ([clock format [clock seconds] -format "%B %d %Y - %H:%M"])"

    # Header columns; the regsub strips the brace quoting found at both ends
    # of the string representation of the list.
    regsub -all "^\\\\?{|\\\\?}$" [split $g_ALAMUT(#id) "\t"] "" L

    # Locate every needed column (index variable / column name pairs).
    # All columns are mandatory except PPH2class, for which only a warning is
    # printed (lindex then returns an empty value for that column).
    foreach {idxVar column} {
        i_gene      gene
        i_trans     transcript
        i_length    transLen
        i_clinic    rsClinicalSignificance
        i_effect    codingEffect
        i_distSS    distNearestSS
        i_nearSS    nearestSSType
        i_intron    varLocation
        i_wtSSF     wtSSFScore
        i_wtMES     wtMaxEntScore
        i_wtNNS     wtNNSScore
        i_varSSF    varSSFScore
        i_varMES    varMaxEntScore
        i_varNNS    varNNSScore
        i_siftPred  SIFTprediction
        i_siftMed   SIFTmedian
        i_pph2      PPH2class
        i_phastcons phastCons
        i_hgmdId    hgmdId
    } {
        set $idxVar [lsearch -exact $L $column]
        if {[set $idxVar] == -1} {
            if {$column eq "PPH2class"} {
                puts "column number not found for PPH2class "
            } else {
                puts "column number not found for $column - Exit"; exit
            }
        }
    }

    foreach ID [array names g_ALAMUT] {
        if {$ID == "#id"} {continue}
        set score      0
        set bestScore  0
        set bestLength 0
        set allGene    {}
        set bestGene   ".."
        # Default to the first isoform of THIS variant, so that the record of
        # a previous variant is never reused when no isoform gets selected.
        set bestL [split [lindex $g_ALAMUT($ID) 0] "\t"]

        if {[info exists g_VaRank(DEBUG)]} {puts "Variant: $ID"}

        #For a single variant, multiple isoforms can give different score. Each is evaluated.
        foreach isoformLine $g_ALAMUT($ID) {

            set fields [split $isoformLine "\t"]

            #Reset score between isoforms
            set score 0

            if {[info exists g_VaRank(DEBUG)]} {puts "Data: $fields"}

            # Column values, with all the spaces removed
            set trans     [string map {" " ""} [lindex $fields $i_trans]]
            set length    [string map {" " ""} [lindex $fields $i_length]]
            set gene      [string map {" " ""} [lindex $fields $i_gene]]
            set effect    [string map {" " ""} [lindex $fields $i_effect]]
            set clinic    [string map {" " ""} [lindex $fields $i_clinic]]
            set hgmdId    [string map {" " ""} [lindex $fields $i_hgmdId]]
            set siftPred  [string map {" " ""} [lindex $fields $i_siftPred]]
            set siftMed   [string map {" " ""} [lindex $fields $i_siftMed]]
            set pph2      [string map {" " ""} [lindex $fields $i_pph2]]
            set phastcons [string map {" " ""} [lindex $fields $i_phastcons]]

            # Remember each gene once (case insensitive, whole name comparison:
            # a gene name is data, not a regular expression).
            if {$gene ne "" && [lsearch -exact [string tolower $allGene] [string tolower $gene]] == -1} {
                lappend allGene $gene
            }

            #General scoring scheme for big categories
            #
            # addBonus called with "NA" "NA" "NA": only the phastCons bonus can apply.
            # Known, frameshift and in-frame variants get no bonus.
            if {$clinic == "probable-pathogenic" || $clinic == "pathogenic" || ($hgmdId != "" && [regexp -nocase "^CM" $hgmdId])} {
                set score $g_VaRank(S_Known)
            } elseif {[regexp "nonsense" $effect]} {
                set score $g_VaRank(S_Nonsense)
                incr score [addBonus "NA" "NA" "NA" $phastcons]
            } elseif {[regexp "Frameshift" $effect]} {
                set score $g_VaRank(S_Fs)
            } elseif {[regexp "startloss" $effect]} {
                set score $g_VaRank(S_StartLoss)
                incr score [addBonus "NA" "NA" "NA" $phastcons]
            } elseif {[regexp "stoploss" $effect]} {
                set score $g_VaRank(S_StopLoss)
                incr score [addBonus "NA" "NA" "NA" $phastcons]
            } elseif {[regexp "missense" $effect]} {
                set score $g_VaRank(S_Missense)
                incr score [addBonus $siftPred $siftMed $pph2 $phastcons]
            } elseif {[regexp "In-frame" $effect]} {
                set score $g_VaRank(S_Inframe)
            } elseif {[regexp "synonymous" $effect]} {
                set score $g_VaRank(S_Synonymous)
                incr score [addBonus "NA" "NA" "NA" $phastcons]
            }

            if {[info exists g_VaRank(DEBUG)]} {puts "Score $score"}

            #Scoring for splice effect
            #
            set distSS [string map {" " ""} [lindex $fields $i_distSS]]
            set nearSS [string map {" " ""} [lindex $fields $i_nearSS]]
            set intron [string map {" " ""} [lindex $fields $i_intron]]
            set wtSSF  [string map {" " ""} [lindex $fields $i_wtSSF]]
            set wtMES  [string map {" " ""} [lindex $fields $i_wtMES]]
            set wtNNS  [string map {" " ""} [lindex $fields $i_wtNNS]]
            set varSSF [string map {" " ""} [lindex $fields $i_varSSF]]
            set varMES [string map {" " ""} [lindex $fields $i_varMES]]
            set varNNS [string map {" " ""} [lindex $fields $i_varNNS]]

            # n   = number of predictors whose delta is below its cutoff
            # tot = number of predictors with a usable wild type score
            set n   0
            set tot 0

            #2013/03/15 bug in alamut when predictions for Variant is null the score is empty and not equal to  0
            #2013/03/26 bug splice scores are not indexed by isoforms/transcript
            #
            if {[info exists g_VaRank(DEBUG)]} {puts "Splices Scores: $wtSSF, $wtMES, $wtNNS, $varSSF, $varMES, $varNNS"}

            if {$wtSSF != "" && $wtSSF != "NA" && $wtSSF != "0"} {
                if {$varSSF == "" || $varSSF == "NA"} {set varSSF 0}

                set g_deltaSSF($ID,$trans) [format "%.1f" [expr {($varSSF-$wtSSF)/$wtSSF*100.0}]]
                if {$g_deltaSSF($ID,$trans) < $g_VaRank(SSFcutoff)} {incr n}
                incr tot
            } else {
                set g_deltaSSF($ID,$trans) "NA"
            }

            if {[info exists g_VaRank(DEBUG)]} {puts "g_deltaSSF $trans [set g_deltaSSF($ID,$trans)]"}

            if {$wtMES != "" && $wtMES != "NA" && $wtMES != "0"} {
                if {$varMES == "" || $varMES == "NA"} {set varMES 0}

                set g_deltaMES($ID,$trans) [format "%.1f" [expr {($varMES-$wtMES)/$wtMES*100.0}]]
                if {$g_deltaMES($ID,$trans) < $g_VaRank(MEScutoff)} {incr n}
                incr tot
            } else {
                set g_deltaMES($ID,$trans) "NA"
            }

            if {[info exists g_VaRank(DEBUG)]} {puts "g_deltaMES $trans [set g_deltaMES($ID,$trans)]"}

            if {$wtNNS != "" && $wtNNS != "NA" && $wtNNS != "0"} {
                if {$varNNS == "" || $varNNS == "NA"} {set varNNS 0}

                set g_deltaNNS($ID,$trans) [format "%.1f" [expr {($varNNS-$wtNNS)/$wtNNS*100.0}]]
                if {$g_deltaNNS($ID,$trans) < $g_VaRank(NNScutoff)} {incr n}
                incr tot
            } else {
                set g_deltaNNS($ID,$trans) "NA"
            }

            if {[info exists g_VaRank(DEBUG)]} {puts "g_deltaNNS $trans [set g_deltaNNS($ID,$trans)]"}

            set splicingEffect 0

            #JEAN Correct when only one program available
            #
            # Splicing effect when at least half of the usable predictors are below their cutoff
            if {$tot != 0 && double($n)/$tot >= 0.5} {set splicingEffect 1}

            if {[info exists g_VaRank(DEBUG)]} {puts "splicingEffect $splicingEffect"}

            # A splice category replaces the current score only when the
            # current score does not exceed the score of that category.
            if {$splicingEffect} {
                if {$score <= $g_VaRank(S_EssentialSplice)} {
                    # Essential splice site
                    if {$nearSS == "5'" && [regexp "^1$|^2$" $distSS]} {
                        set  score $g_VaRank(S_EssentialSplice)
                        incr score [addBonus "NA" "NA" "NA" $phastcons]
                    } elseif {$nearSS == "3'" && [regexp "^-1$|^-2$" $distSS]} {
                        set  score $g_VaRank(S_EssentialSplice)
                        incr score [addBonus "NA" "NA" "NA" $phastcons]
                    }
                }
                if {$score <= $g_VaRank(S_CloseSplice)} {
                    # Intron-Exon boundary
                    if {$nearSS == "5'" && $distSS >= -3 && $distSS <= 6} {
                        set  score $g_VaRank(S_CloseSplice)
                        incr score [addBonus "NA" "NA" "NA" $phastcons]
                    } elseif {$nearSS == "3'" && $distSS >= -12 && $distSS <= 2} {
                        set  score $g_VaRank(S_CloseSplice)
                        incr score [addBonus "NA" "NA" "NA" $phastcons]
                    }
                }
                if {$score <= $g_VaRank(S_DeepSplice)} {
                    # Deep intron-exon boundary
                    if {$intron == "intron"} {
                        set  score $g_VaRank(S_DeepSplice)
                        incr score [addBonus "NA" "NA" "NA" $phastcons]
                    }
                }
            }

            # No category matched: the score is the bonus alone
            if {$score == 0} {incr score [addBonus $siftPred $siftMed $pph2 $phastcons]}

            if {[info exists g_VaRank(DEBUG)]} {puts "Score $ID $trans: $score / Best $bestScore"}

            #Within the different isoforms we keep the one for which the effect is maximum
            #If similar score between isoforms we keep the longest one
            if {$score > $bestScore} {
                set bestGene   $gene
                set bestL      $fields
                set bestScore  $score
                set bestLength $length
            } elseif {$score == $bestScore && $length > $bestLength} {
                set bestGene   $gene
                set bestL      $fields
                set bestLength $length
            }
        }

        # Gene column of the retained line: retained gene first, then the others.
        # The retained gene is removed from the list by exact element match.
        if {$bestGene ne $allGene} {
            set bestGenePos [lsearch -exact $allGene $bestGene]
            if {$bestGenePos != -1} {
                set allGene [lreplace $allGene $bestGenePos $bestGenePos]
            }
            set allGene "$bestGene/[join $allGene "/"]"
        }
        set g_ALAMUT($ID) [join [lreplace $bestL $i_gene $i_gene "$allGene"] "\t"]

        if {[info exists g_VaRank(DEBUG)]} {puts "Best $ID lreplace $bestL $i_gene $i_gene $allGene"}

        lappend g_lScore "$ID $bestScore"
    }


    #	## Add to g_lScore the genetic variants non analysed by alamut, but with a pathogenic rsID into the VCF file (VaRank score = 110)
    #	##############################################################################################################################################
    #        foreach ID [array names g_vcfINFOS] {
    #                if {[info exists g_ALAMUT($ID)]} {continue}
    #                set rsID [lindex $g_vcfINFOS($ID) 4]
    #		 # set clinic ...
    #		 ### no clinical info for the rsID given in the VCF file!!!!!!
    #                if {$clinic == "probable-pathogenic" || $clinic == "pathogenic"} {
    #			lappend g_lScore($ID) "$ID 110"
    #		}
    #        }


    puts "...classifying each genetic variant ([clock format [clock seconds] -format "%B %d %Y - %H:%M"])"

    if {![info exists g_lScore] || $g_lScore == {}} {
        puts "No score could be retrieved something must be wrong with variation identifiers (g_lScore) - Exit"
        # Diagnostics have to be printed BEFORE leaving the process
        if {[info exists g_VaRank(DEBUG)]} {
            puts "List of uploaded variation from alamut: [array names g_ALAMUT]"
            if {[info exists g_lScore]} {puts "g_lScore $g_lScore"}
        }
        exit
    }

    set g_lScore [lsort -command DescendingSortOnElement1 $g_lScore]

    return
}



