##########################################################################
# VaRank 1.0                                                             #
#                                                                        #
# VaRank: a simple and powerful tool for ranking genetic variants        #
#                                                                        #
# Copyright (C) 2014 Veronique Geoffroy (veronique.geoffroy@inserm.fr)   # 
#                    Jean Muller (jeanmuller@unistra.fr)                 # 
#                                                                        #
# Please cite the following article:                                     #
#    XXX                                                                 #
#                                                                        #
# This is part of VaRank source code.                                    #
#                                                                        #
# This program is free software; you can redistribute it and/or          #
# modify it under the terms of the GNU General Public License            # 
# as published by the Free Software Foundation; either version 3         # 
# of the License, or (at your option) any later version.                 #
#                                                                        #
# This program is distributed in the hope that it will be useful,        # 
# but WITHOUT ANY WARRANTY; without even the implied warranty of         #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the          #
# GNU General Public License for more details.                           #
#                                                                        #
# You should have received a copy of the GNU General Public License      #
# along with this program; If not, see <http://www.gnu.org/licenses/>.   #
##########################################################################


proc searchProteicSequence {IDprot database} {

    ## Download the entry $IDprot of the SRS library $database from the EBI SRS server
    ## (with /usr/bin/wget) and return its sequence in upper case, without spaces or digits.
    ##
    ## Arguments:
    ##   IDprot   - protein identifier searched
    ##   database - name of the SRS library queried (REFSEQP or UNIPROT in the calls of this file)
    ##
    ## Returns the sequence, or "" if wget failed or if no sequence section was found.
    ##
    ## Side effects: the files saved by wget and the Wget.log file are created in the current
    ## directory; they are deleted before and after the search.
    ##
    ## NOTE(review): the sequence section is always searched after a " >>refseqp" line,
    ##               whatever the value of $database - confirm this is right for UNIPROT.

    # Files saved by wget for this query ("?" matches any single character in a glob pattern)
    set downloadPattern {wgetz?-id*}

    ## Cleaning:
    foreach f [glob -nocomplain $downloadPattern] {
        file delete -force $f
    }
    file delete -force Wget.log

    ## Searching for a RefSeqP sequence on a regularly updated site.
    # The URL is given to exec as one single word, without eval: special characters
    # coming from the arguments can therefore never be evaluated as Tcl code.
    set url "http://srs.ebi.ac.uk/srsbin/cgi-bin/wgetz?-id+_bIV1er7R7+-e+\[${database}:'${IDprot}'\]+-qnum+2+-enum+1"

    # Seq is always defined, and never contains the output of wget
    set Seq ""
    if {![catch {exec /usr/bin/wget -nv -a Wget.log -N $url} Message]} {
        foreach f [glob -nocomplain $downloadPattern] {
            set Seq ""
            # test = 1 while we are outside of the sequence section (reset for each file)
            set test 1
            foreach L [LinesFromFile $f] {
                if {[regexp "^ >>refseqp" $L]} {set test 0; continue}
                if {!$test && [regexp "^<" $L]} {set test 1; break}
                if {$test} {continue}
                # Removing the spaces and the residue numbering
                regsub -all {[ 0-9]} $L "" L
                append Seq [string toupper $L]
            }
        }
    }

    ## Cleaning:
    foreach f [glob -nocomplain $downloadPattern] {
        file delete -force $f
    }
    file delete -force Wget.log

    # Return the sequence
    return $Seq
}


proc searchOldRefSeqPSequence {IDprot} {

    ## Download the REFSEQP entry $IDprot from the SRS server of bioinfo.ceinge.unina.it
    ## (with /usr/bin/wget) and return its sequence in upper case, without spaces or digits.
    ## This site is not often updated, which permits some old sequences to be found.
    ##
    ## Returns the sequence, or "" if:
    ##   - g_VaRank(nowebsearch) is "yes" (no download is attempted),
    ##   - wget failed,
    ##   - the downloaded page contains "SRS ERROR" or no ORIGIN section.
    ##
    ## Side effects: the files saved by wget and the Wget.log file are created in the current
    ## directory; they are deleted before and after the search.

    global g_VaRank

    if {$g_VaRank(nowebsearch)=="yes"} {return ""}

    # Files saved by wget for this query ("?" matches any single character in a glob pattern)
    set downloadPattern {wgetz?_AUTHS_*}

    ## Cleaning:
    foreach f [glob -nocomplain $downloadPattern] {
        file delete -force $f
    }
    file delete -force Wget.log

    ## Searching for a RefSeqP sequence on a site not often updated.
    # The URL is given to exec as one single word, without eval: special characters
    # coming from the argument can therefore never be evaluated as Tcl code.
    set url "http://bioinfo.ceinge.unina.it/srs7131bin/cgi-bin/wgetz?_AUTHS_-page+LibInfo+-lib+REFSEQP-AUTHE_-page+EntryPage+-id+_bIV1er7R7+-e+\[REFSEQP:${IDprot}\]+-vn+2"

    # Seq is always defined, and never contains the output of wget
    set Seq ""
    if {![catch {exec /usr/bin/wget -nv -a Wget.log -N $url} Message]} {
        foreach f [glob -nocomplain $downloadPattern] {
            set Seq ""
            # test = 1 while we are outside of the ORIGIN ... // section (reset for each file)
            set test 1
            foreach L [LinesFromFile $f] {
                if {[regexp "SRS ERROR" $L]} {set Seq ""; break}
                if {[regexp "^ORIGIN" $L]} {set test 0; continue}
                if {[regexp "^//" $L]} {set test 1; break}
                if {$test} {continue}
                # Removing the spaces and the residue numbering
                regsub -all {[ 0-9]} $L "" L
                append Seq [string toupper $L]
            }
        }
    }

    ## Cleaning:
    foreach f [glob -nocomplain $downloadPattern] {
        file delete -force $f
    }
    file delete -force Wget.log

    # Return the sequence
    return $Seq
}


proc searchBadAA1position {IDprot Seq patientsDir} {

    ## Search into the PPH2 input file ($patientsDir/PPH2/PPH2input_all.txt) the SNV of the protein $IDprot
    ## and check that the sequence $Seq carries the expected wild type amino acid (AA1) at each SNV position.
    ##
    ## Arguments:
    ##   IDprot      - protein identifier, compared with the first column of the input file
    ##   Seq         - protein sequence (first residue = position 1 in the input file)
    ##   patientsDir - directory containing PPH2/PPH2input_all.txt
    ##
    ## Returns 1 if at least one position is not OK, 0 otherwise (also 0 if no SNV is found for $IDprot).
    ##
    ## The identifier is compared exactly (and not as a regular expression searched in the whole line):
    ## NP_000001 must not select the SNV of NP_000001234. A line whose identifier carries a version
    ## number (NP_000001.2) is still selected when $IDprot is given without version.

    foreach L [LinesFromFile $patientsDir/PPH2/PPH2input_all.txt] {
        # Columns: ID position AA1 AA2
        set fields [regexp -all -inline {\S+} $L]
        set lineID [lindex $fields 0]
        regsub {\.[0-9]+$} $lineID "" lineIDnoVersion
        if {$lineID ne $IDprot && $lineIDnoVersion ne $IDprot} {continue}

        # Positions start at 1 in the input file, at 0 in a Tcl string
        set Pos [expr {[lindex $fields 1]-1}]
        set AA1 [lindex $fields 2]
        if {[string index $Seq $Pos] ne $AA1} {
            return 1
        }
    }

    return 0
}

proc FastaSequence_HumanUniProt args {

    ## Store the UniProt identifiers and sequences of the fasta file g_VaRank(uniprot)
    ## (located in the g_VaRank(DB) directory, gzipped or not) into the global array SeqFasta,
    ## and return the information asked by $args.
    ##
    ## Usage:
    ##   FastaSequence_HumanUniProt <ID>  -> sequence of <ID> (second "|" separated field of the headline)
    ##   FastaSequence_HumanUniProt L_ID  -> list of the identifiers, in the order of the file
    ##
    ## Returns "" if the DB directory or the fasta file is not defined or doesn't exist,
    ## or if the information asked is unknown.
    ##
    ## The file is read only once: SeqFasta($InputFile,Loaded) is set at the first call.
    ## If an identifier is present several times, "Spy" is called and the last sequence is kept.

    #ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/proteomes/README
    #ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/proteomes/

    global g_VaRank
    global SeqFasta

    if {$g_VaRank(DB)      == "" || ![file exists $g_VaRank(DB)] } {return ""}
    if {$g_VaRank(uniprot) == ""} {return ""}

    set InputFile [file join $g_VaRank(DB) $g_VaRank(uniprot)]
    if {![file exists $InputFile]} {puts "Inputfile UNIPROT $g_VaRank(uniprot)";return ""}

    # Key of SeqFasta: "<file>,<arg1>,<arg2>...". The file name is inserted as ONE list element,
    # so that the key is the same as the one used for the storage even if the path contains spaces.
    set What [join [linsert $args 0 $InputFile] ","]

    if {[info exists SeqFasta($What)]}             {return $SeqFasta($What)}
    # File already loaded: the information asked is unknown
    if {[info exists SeqFasta($InputFile,Loaded)]} {return ""}

    set SeqFasta($InputFile,Loaded) 1
    set SeqFasta($InputFile,L_ID) {}

    # The pipeline is built as a list: a path with spaces or special characters stays one argument
    if {[file extension $InputFile] eq ".gz"} {
        set F [open "|[list gzip -cd $InputFile]"]
    } else {
        set F [open $InputFile]
    }

    set ID  ""
    set Seq ""
    while {1} {
        set atEOF    [expr {[gets $F Line] < 0}]
        set isHeader [expr {!$atEOF && [string index $Line 0] eq ">"}]

        if {!$atEOF && !$isHeader} {
            append Seq [string trim $Line]
            continue
        }

        # New headline or end of file: storage of the sequence read for the previous headline
        if {$Seq ne ""} {
            if {[info exists SeqFasta($InputFile,$ID)]} {Spy "$ID already seen"}
            regsub -all " " $Seq "" Seq
            set SeqFasta($InputFile,$ID) $Seq
            set ID  ""
            set Seq ""
        }
        if {$atEOF} {break}

        #>tr|A0JP02|A0JP02_HUMAN PLEKHA5 protein OS=Homo sapiens GN=PLEKHA5 PE=2 SV=1
        #>sp|A0M8Q6|LAC7_HUMAN Ig lambda-7 chain C region OS=Homo sapiens GN=IGLC7 PE=1 SV=2
        set ID [string trim [lindex [split $Line "|"] 1]]
        if {$ID==""} {puts "WARNING $Line"}
        lappend SeqFasta($InputFile,L_ID) $ID
    }
    close $F

    if {[info exists SeqFasta($What)]} {return $SeqFasta($What)}
    return ""
}

proc FastaSequence_HumanRefSeq args {

    ## Store the RefSeq identifiers and sequences of the fasta file g_VaRank(refseq)
    ## (located in the g_VaRank(DB) directory, gzipped or not) into the global array SeqFasta,
    ## and return the information asked by $args.
    ##
    ## Usage:
    ##   FastaSequence_HumanRefSeq <ID>  -> sequence of <ID> (fourth "|" separated field of the headline,
    ##                                      with its version number, eg: NP_001005405.1)
    ##   FastaSequence_HumanRefSeq L_ID  -> list of the identifiers, in the order of the file
    ##
    ## Returns "" if the DB directory or the fasta file is not defined or doesn't exist,
    ## or if the information asked is unknown.
    ##
    ## The file is read only once: SeqFasta($InputFile,Loaded) is set at the first call.
    ## If an identifier is present several times, "Spy" is called and the last sequence is kept.

    #ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/mRNA_Prot/human.protein.faa.gz

    global g_VaRank
    global SeqFasta

    if {$g_VaRank(DB) == "" || ![file exists $g_VaRank(DB)] } {return ""}
    if {$g_VaRank(refseq) == ""} {return ""}

    set InputFile [file join $g_VaRank(DB) $g_VaRank(refseq)]
    if {![file exists $InputFile]} {return ""}

    # Key of SeqFasta: "<file>,<arg1>,<arg2>...". The file name is inserted as ONE list element,
    # so that the key is the same as the one used for the storage even if the path contains spaces.
    set What [join [linsert $args 0 $InputFile] ","]

    if {[info exists SeqFasta($What)]}             {return $SeqFasta($What)}
    # File already loaded: the information asked is unknown
    if {[info exists SeqFasta($InputFile,Loaded)]} {return ""}

    set SeqFasta($InputFile,Loaded) 1
    set SeqFasta($InputFile,L_ID) {}

    # The pipeline is built as a list: a path with spaces or special characters stays one argument
    if {[file extension $InputFile] eq ".gz"} {
        set F [open "|[list gzip -cd $InputFile]"]
    } else {
        set F [open $InputFile]
    }

    set ID  ""
    set Seq ""
    while {1} {
        set atEOF    [expr {[gets $F Line] < 0}]
        set isHeader [expr {!$atEOF && [string index $Line 0] eq ">"}]

        if {!$atEOF && !$isHeader} {
            append Seq [string trim $Line]
            continue
        }

        # New headline or end of file: storage of the sequence read for the previous headline
        if {$Seq ne ""} {
            if {[info exists SeqFasta($InputFile,$ID)]} {Spy "$ID already seen"}
            regsub -all " " $Seq "" Seq
            set SeqFasta($InputFile,$ID) $Seq
            set ID  ""
            set Seq ""
        }
        if {$atEOF} {break}

        #>gi|53292629|ref|NP_001005405.1| keratin-associated protein 5-11 [Homo sapiens]
        #>gi|52317162|ref|NP_001004713.1| olfactory receptor 1I1 [Homo sapiens]
        set ID [string trim [lindex [split $Line "|"] 3]]
        if {$ID==""} {puts "WARNING $Line"}
        lappend SeqFasta($InputFile,L_ID) $ID
    }
    close $F

    if {[info exists SeqFasta($What)]} {return $SeqFasta($What)}
    return ""
}


##
## - Create the PPH2 input file from g_ALAMUT
##   ->  Format of the input file created (corresponding to V2.2.2 of PPH2): ID-UniProt/RefSeqP position AA1 AA2
## - Create a unique fasta file (RefSeqPsequences.fasta) for the RefSeqP ID of all the patients.
##
## OUTPUT:
##	- $patientsDir/PPH2/PPH2input_all.txt
## 	- $patientsDir/PPH2/RefSeqPsequences.fasta
##
## Use of the wgetz command at: "http://srs.ebi.ac.uk/srsbin/cgi-bin/wgetz?" which seemed to be regularly updated. This site has been retired since 19/12/2013.
##				"http://bioinfo.ceinge.unina.it/srs7131bin/cgi-bin/wgetz?" which seems not to be regularly updated.
##
## TO KNOW : In the PPH2 input file and in the "RefSeqPsequences.fasta" file:
##            - The RefSeqP ID found with ebi haven't the version number at the end (eg: NP_001185763)
##            - The RefSeqP ID found with bioinfo.ceinge.unina.it have the version number at the end (eg: NP_001185763.1)
##
##	      The SNV without an associated sequence are still written into the PPH2 input file.
##	      => This means that if you do not manually add the sequence to "RefSeqPsequences.fasta", PPH2 will crash on this SNV.
proc createPPH2Input {} {

    ## Create the PPH2 input file ($patientsDir/PPH2/PPH2input_all.txt) from the missense variants
    ## stored in g_ALAMUT, then complete the sequences file ($patientsDir/PPH2/RefSeqPsequences.fasta).
    ##
    ## For each missense variant, the protein identifier written in the input file is:
    ##   - the UniProt ID if its sequence is found and carries AA1 at the variant position,
    ##   - else the RefSeqP ID (tried with, then without its version number).
    ##
    ## Globals read: g_VaRank(pph2Dir), g_VaRank(vcfDir), g_VaRank(DEBUG) (existence only), g_ALAMUT.
    ##
    ## Returns:
    ##   - an empty string if g_VaRank(pph2Dir) is empty (nothing is done),
    ##   - an empty string if the PPH2 input file already exists (no file is written),
    ##   - 0 if no missense variant is found (no file is written),
    ##   - an empty string after the writing of the input file and of the sequences file.
    ## Calls "exit" if a needed column is missing from the g_ALAMUT(#id) headline.

    global g_VaRank
    global g_ALAMUT

    # lRefSeqID(<ID>) = 1 for each RefSeqP ID to write in the fasta file, lRefSeqID(L_IDs) = ordered list of these IDs
    set lRefSeqID(L_IDs) {}

    ## pph2Dir must be given in the config file or in the command line.
    ###################################################################
    if {$g_VaRank(pph2Dir) == ""} {return}

    set patientsDir $g_VaRank(vcfDir)
    if {![file exists "$patientsDir/PPH2"]} {file mkdir "$patientsDir/PPH2"}

    set PPH2InputFile "$patientsDir/PPH2/PPH2input_all.txt"
    puts "...creation of the PPH2 input file ($PPH2InputFile) ([clock format [clock seconds] -format "%B %d %Y - %H:%M"])"

    ## If the RefSeqPsequences.fasta file already exists:
    ## - Load all the sequences of this file (without headline, without "\n") into $Seq($IDprot).
    ## - Flag the IDprot of these sequences (with and without version number) into InfosRefSeqPsequences
    set fastaFile "$patientsDir/PPH2/RefSeqPsequences.fasta"
    if {[file exists $fastaFile]} {
        foreach L [LinesFromFile $fastaFile] {
            if {$L == ""} {continue}
            if {[regexp "^>(.*)" $L match IDprot]} {
                set InfosRefSeqPsequences($IDprot) 1

                #Not sure we need to store the short version see with Vero
                set IDprotShort $IDprot
                regsub {\.[0-9]+$} $IDprotShort "" IDprotShort
                set InfosRefSeqPsequences($IDprotShort) 1

                continue
            }
            # A sequence line located before the first headline can't be attributed to any ID
            if {![info exists IDprot]} {continue}
            append Seq($IDprot) "$L"
        }
    }

    if {[file exists $PPH2InputFile]} {
        puts "\t...PPH2 input file already exists, continue"
        puts "\t...downloading RefSeqP ID from the PPH2 input file"
        foreach L [LinesFromFile $PPH2InputFile] {
            set IDprot [lindex $L 0]

            ## We only load the RefSeqP ID (not the uniprot ID)
            if {![regexp "_" $IDprot]} {continue}

            if {![info exists lRefSeqID($IDprot)] && ![info exists InfosRefSeqPsequences($IDprot)]} {set lRefSeqID($IDprot) 1;lappend lRefSeqID(L_IDs) $IDprot}
        }

        # NOTE(review): the sequences file is not completed in this case, the IDs loaded just above are not used.
        return
    }

    set Info ""
    puts "\t...Extracting identifiers and sequences from Alamut annotation data, Uniprot and RefSeqP fasta files."
    set L [split $g_ALAMUT(#id) "\t"]
    set i_refseq  [lsearch -regexp $L "^protein$"]; if {$i_refseq  == -1} {puts "column number not found for refseq - Exit"; exit}
    set i_uniprot [lsearch -regexp $L "^Uniprot$"]; if {$i_uniprot == -1} {puts "column number not found for uniprot - Exit"; exit}
    set i_pos     [lsearch -regexp $L "^posAA$"];   if {$i_pos     == -1} {puts "column number not found for posAA - Exit"; exit}
    set i_AA      [lsearch -regexp $L "^wtAA_1$"];  if {$i_AA      == -1} {puts "column number not found for wtAA_1 - Exit"; exit}
    set i_var     [lsearch -regexp $L "^varAA_1$"]; if {$i_var     == -1} {puts "column number not found for varAA_1 - Exit"; exit}
    set i_effect  [lsearch -regexp $L "^codingEffect$"]; if {$i_effect  == -1} {puts "column number not found for codingEffect - Exit"; exit}

    set nbTestedUni   0
    set nbNotFoundUni 0
    set nbFoundUni    0
    set L_NotFoundUni {}

    set nbTestedRef   0
    set nbNotFoundRef 0
    set nbFoundRef    0
    set L_NotFoundRef {}

    foreach ID [array names g_ALAMUT] {
        if {$ID == "#id"} {continue}
        foreach L $g_ALAMUT($ID) {

            # The line is split on the tabulations BEFORE any column is read: the column numbers
            # come from the headline split on tabulations, and a raw line read as a Tcl list
            # would shift the columns after an empty field or a field containing a space.
            set L [split $L "\t"]

            # Selection of the lines with "codingEffect" = missense
            if {[lindex $L $i_effect] ne "missense"} {continue}

            # Selection of the UniProt ID
            set SNVid     [lindex $L 0]
            set IDuniprot [lindex $L $i_uniprot]
            set Pos       [expr {[lindex $L $i_pos]-1}]
            set AA1       [lindex $L $i_AA]

            if {$IDuniprot != "" && $IDuniprot != "NA"} {
                # Check the position and the AA1 on the sequence of this UniProt ID.

                incr nbTestedUni

                if {![info exists Seq($IDuniprot)]} {
                    set Seq($IDuniprot) [FastaSequence_HumanUniProt $IDuniprot]

                    if {$Seq($IDuniprot)==""} {
                        lappend L_NotFoundUni $IDuniprot

                        #EBI SRS webserver is retired since 19/12/2013
                        #FIND ANOTHER WAY????

                        #set Seq($IDuniprot) [searchProteicSequence $IDuniprot UNIPROT]
                    }
                }

                if {$Seq($IDuniprot) == "" || [string index $Seq($IDuniprot) $Pos] ne $AA1} {
                    # No sequence, or AA1 not found at this position: the RefSeqP ID will be used
                    set IDuniprot ""
                    incr nbNotFoundUni
                } else {
                    incr nbFoundUni
                }
            }

            ## - No UniProt ID or...
            ## - UniProt ID not found or...
            ## - UniProt ID doesn't match for AA1...
            if {$IDuniprot == "" || $IDuniprot == "NA"} {

                # Selection of the RefSeqP ID (with the number version at the end)
                set IDrefseqp_V [lindex $L $i_refseq]
                regsub "\\..*$" $IDrefseqp_V "" IDrefseqp

                if {$IDrefseqp == "" || $IDrefseqp == "NA"} {
                    puts "\t...ERROR for $SNVid: No UniProt ID or RefSeqP ID !! SNV NOT WRITTEN INTO THE INPUT FILE :"
                    puts "\t$L"
                    continue
                }

                incr nbTestedRef

                # Get the sequence if possible (ID with version first, then without version).
                set NotFound 1
                foreach IDrefseqTmp [list $IDrefseqp_V $IDrefseqp] {

                    if {![info exists Seq($IDrefseqTmp)]} {
                        set Seq($IDrefseqTmp) [FastaSequence_HumanRefSeq $IDrefseqTmp]

                        if {$Seq($IDrefseqTmp)==""} {
                            #set Seq($IDrefseqTmp) [searchProteicSequence $IDrefseqTmp REFSEQP]
                            set Seq($IDrefseqTmp) [searchOldRefSeqPSequence $IDrefseqTmp]
                        }
                    }
                    # Check the position and the AA1 on the sequence of this RefSeqP ID
                    if {$Seq($IDrefseqTmp) != ""} {
                        if {[string index $Seq($IDrefseqTmp) $Pos] ne $AA1} {
                            # NOTE(review): the sequence is forgotten for all the following SNV of this ID
                            set Seq($IDrefseqTmp) ""
                        } else {
                            set NotFound 0
                            incr nbFoundRef
                            break
                        }
                    }
                }
                # ID kept: the one which matches, else the ID without version (last one tried)
                set IDrefseqp $IDrefseqTmp

                if {$NotFound} {
                    incr    nbNotFoundRef
                    lappend L_NotFoundRef $IDrefseqp
                }

                if {![info exists lRefSeqID($IDrefseqp)] && ![info exists InfosRefSeqPsequences($IDrefseqp)]} {set lRefSeqID($IDrefseqp) 1;lappend lRefSeqID(L_IDs) $IDrefseqp}

                set l "$IDrefseqp\t[lindex $L $i_pos]\t$AA1\t[lindex $L $i_var]"
            } else {
                set l "$IDuniprot\t[lindex $L $i_pos]\t$AA1\t[lindex $L $i_var]"
            }

            # Elimination of the redundancy (exact comparison of the lines already written)
            if {[info exists seenLine($l)]} {continue}
            set seenLine($l) 1
            append Info "$l\n"
        }
    }

    if {[info exists g_VaRank(DEBUG)]} {
        puts "\t...UniProt: total tested $nbTestedUni for $nbFoundUni found and $nbNotFoundUni not found ([join $L_NotFoundUni ","])"
        puts "\t...RefSeqP: total tested $nbTestedRef for $nbFoundRef found and $nbNotFoundRef not found ([join $L_NotFoundRef ","])"
    } else {
        puts "\t...UniProt: total tested $nbTestedUni for $nbFoundUni found and $nbNotFoundUni not found"
        puts "\t...RefSeqP: total tested $nbTestedRef for $nbFoundRef found and $nbNotFoundRef not found"
    }

    if {$Info==""} {
        puts "\t...no missense found, skipping PPH2 step."
        return 0
    }
    puts "\t...writing the PPH2 input file"
    WriteTextInFile $Info $PPH2InputFile

    puts "\t...creation of the sequences file for PPH2 ($fastaFile)"

    # We work in $patientsDir. Creation/Complementation of the "RefSeqPsequences.fasta" file.
    foreach IDrefseqp $lRefSeqID(L_IDs) {

        if {[info exists InfosRefSeqPsequences($IDrefseqp)]} {continue}

        ## Load the sequence if it is not.
        if {![info exists Seq($IDrefseqp)]} {
            set Seq($IDrefseqp) [searchProteicSequence $IDrefseqp REFSEQP]
            # Check the position and the AA1 on all the SNV given in input
            if {[searchBadAA1position $IDrefseqp $Seq($IDrefseqp) $patientsDir]} {
                # Search the sequence on a web site
                set Seq($IDrefseqp) [searchOldRefSeqPSequence $IDrefseqp]
            }
        }

        # Search if the sequence is OK or not.
        if {$Seq($IDrefseqp) == "" || [searchBadAA1position $IDrefseqp $Seq($IDrefseqp) $patientsDir]} {
            ## The RefSeqP sequences not found have their SNV written in the PPH2 input files.
            ## Sequences should be added manually (if found on the net) in RefSeqPsequences.fasta file
            ## Sequences should match with the AA1 position.
            ## Else, remove the SNV line in PPH2 inputfile.
            continue
        }

        ## Sequences loaded are not in the good format. No "\n": written with 60 residues per line.
        set fastaOk ">$IDrefseqp"
        set l [string length $Seq($IDrefseqp)]
        for {set i 0} {$i < $l} {incr i 60} {
            append fastaOk "\n[string range $Seq($IDrefseqp) $i [expr {$i+59}]]"
        }

        WriteTextInFile $fastaOk $fastaFile
    }

    return
}

## Return 0 if all the variations of the input file have been analysed or if the input file is empty.
## Else return 1 and create an inputFile.tmp file containing the variations not yet analysed.
proc pph2Step1IsNotCompleted {inputFile featFile errorFile} {

    ## Check if the first step of PPH2 has to be (re)run.
    ##
    ## Arguments:
    ##   inputFile - PPH2 input file (columns: ID position AA1 AA2)
    ##   featFile  - PPH2 features file already created (may not exist)
    ##   errorFile - file of the SNV lines where PPH2 has crashed (may not exist)
    ##
    ## A variation is identified by its first 4 columns. It is "analysed" if it is present
    ## in the features file, or in a line of the error file made of exactly 4 columns and without ":".
    ##
    ## Returns 0 if the input file is empty or if all its variations are analysed.
    ## Else returns 1; $inputFile.tmp then contains the variations not yet analysed
    ## (copy of the input file if nothing has been analysed yet).
    ## A previous $inputFile.tmp is always deleted, except if the input file is empty.
    ##
    ## The lines are cut on the white spaces with a regexp and not read as Tcl lists:
    ## the free text of the error file may contain unbalanced quotes or braces,
    ## which would raise an error with the list commands.

    if {[file size $inputFile] == 0} {
        puts "Empty input file for PPH2: $inputFile"
        return 0
    }

    file delete -force $inputFile.tmp

    if {![file exists $featFile] && ![file exists $errorFile]} {
        file copy -force $inputFile $inputFile.tmp
        return 1
    }

    ## Load in TabIDana the ID already analysed by PPH2.
    if {[file exists $featFile]} {
        foreach L [LinesFromFile $featFile] {
            if {$L == ""} {continue}
            set ID [join [lrange [regexp -all -inline {\S+} $L] 0 3] " "]
            set TabIDana($ID) 1
        }
    }
    if {[file exists $errorFile]} {
        foreach L [LinesFromFile $errorFile] {
            if {$L == ""} {continue}
            # The error messages contain ":" or have not exactly 4 columns
            if {[regexp ":" $L]} {continue}
            set fields [regexp -all -inline {\S+} $L]
            if {[llength $fields] != 4} {continue}
            set TabIDana([join $fields " "]) 1
        }
    }

    # Nothing analysed yet: everything has to be run
    if {[array size TabIDana] == 0} {
        file copy -force $inputFile $inputFile.tmp
        return 1
    }

    set test 0
    foreach L [LinesFromFile $inputFile] {
        if {$L == ""} {continue}
        set ID [join [lrange [regexp -all -inline {\S+} $L] 0 3] " "]
        if {![info exists TabIDana($ID)]} {
            set test 1
            WriteTextInFile $L $inputFile.tmp
        }
    }

    return $test
}

## Return 0 if all the variations of the input file have been analysed or if the input file is empty.
## Else return 1.
proc pph2Step2IsNotCompleted {inputFile HumVarOutput errorFile} {

    ## Check if the second step of PPH2 has to be (re)run.
    ##
    ## Arguments:
    ##   inputFile    - PPH2 input file (columns: ID position AA1 AA2)
    ##   HumVarOutput - output file of the second step of PPH2 (may not exist)
    ##   errorFile    - file of the SNV lines where PPH2 has crashed (may not exist)
    ##
    ## A variation is identified by its first 4 columns. It is "analysed" if it is present
    ## in the HumVar output, or in a line of the error file made of exactly 4 columns and without ":".
    ##
    ## Returns 1 if the HumVar output doesn't exist or if at least one variation
    ## of the input file is not analysed, 0 otherwise. No file is written.
    ##
    ## The lines are cut on the white spaces with a regexp and not read as Tcl lists:
    ## the free text of the error file may contain unbalanced quotes or braces,
    ## which would raise an error with the list commands.

    if {![file exists $HumVarOutput]} {
        return 1
    }

    ## Load in TabIDana the ID already analysed by PPH2.
    foreach L [LinesFromFile $HumVarOutput] {
        set ID [join [lrange [regexp -all -inline {\S+} $L] 0 3] " "]
        set TabIDana($ID) 1
    }

    if {[file exists $errorFile]} {
        foreach L [LinesFromFile $errorFile] {
            if {$L == ""} {continue}
            # The error messages contain ":" or have not exactly 4 columns
            if {[regexp ":" $L]} {continue}
            set fields [regexp -all -inline {\S+} $L]
            if {[llength $fields] != 4} {continue}
            set TabIDana([join $fields " "]) 1
        }
    }

    foreach L [LinesFromFile $inputFile] {
        if {$L == ""} {continue}
        set ID [join [lrange [regexp -all -inline {\S+} $L] 0 3] " "]
        if {![info exists TabIDana($ID)]} {
            return 1
        }
    }

    return 0
}

## CONTEXT:
## With a PPH2 input file containing several SNV, if PPH2 crashes on one SNV line, the whole PPH2 run stops.
## <=> PPH2 doesn't run on all the SNV lines.
##
## USE OF THIS PROC :
## runPPH2-1by1 runs PPH2 independently for each SNV line of the PPH2 input file
## So, if PPH2 crashes, PPH2 will continue to run with the following SNV line.
##
## OUTPUT :
## - $patientsDir/PPH2/PPH2features_all.txt
## - $patientsDir/PPH2/PPH2errors_all.txt (containing all the SNV lines where PPH2 has crashed)
##
proc runPPH2-1by1 {} {

    ## Run PPH2 independently for each SNV line of the PPH2 input file
    ## ($patientsDir/PPH2/PPH2input_all.txt), then run the second step of PPH2 on all the features.
    ##
    ## Globals read: g_VaRank(vcfDir), g_VaRank(pph2Dir).
    ##
    ## Step 1 (run_pph.pl, one run by SNV not yet analysed):
    ##   - the features are added to PPH2/PPH2features_all.txt (headline kept from the first run),
    ##   - if the message returned by the run contains "error" (whatever the case), the SNV line,
    ##     the last line of the features of this run (if it isn't a headline) and the message
    ##     are added to PPH2/PPH2errors_all.txt.
    ## Step 2 (run_weka.pl): creates PPH2/PPH2humVar_all.txt (standard output and standard error).
    ##
    ## Each step is skipped if it is already completed.
    ## Returns an empty string. Calls "exit" if the PPH2 input file doesn't exist.
    ##
    ## The commands are built as Tcl lists (one element = one argument): the paths are given
    ## unchanged to the programs, even if they contain spaces or special characters.

    global g_VaRank

    set patientsDir $g_VaRank(vcfDir)

    ## pph2Dir must be given in the config file or in the command line
    if {$g_VaRank(pph2Dir) == ""} {
        puts "...PPH environment variable not specified, not running PPH2"
        return
    }

    set PPH2inputFile "$patientsDir/PPH2/PPH2input_all.txt"
    if {![file exists $PPH2inputFile]} {puts "$PPH2inputFile doesn't exist. Exit."; exit}

    puts "...running PPH2 ([clock format [clock seconds] -format "%B %d %Y - %H:%M"])"
    set inputUnite        "$patientsDir/PPH2/unite.PPH2input"
    set pph2FeaturesUnite "$patientsDir/PPH2/unite.PPH2features"
    set pph2FeaturesAll   "$patientsDir/PPH2/PPH2features_all.txt"
    set pph2ErrorAll      "$patientsDir/PPH2/PPH2errors_all.txt"

    file delete -force $inputUnite $pph2FeaturesUnite

    ################################
    # First step of PPH2 (long step)
    ################################
    if {![pph2Step1IsNotCompleted $PPH2inputFile $pph2FeaturesAll $pph2ErrorAll]} {
        puts "\t...pph2 step 1 already done and completed, continue"
    } else {
        if {![file exists $pph2FeaturesAll]} {
            puts "\t...running pph2 step 1 ([clock format [clock seconds] -format "%B %d %Y - %H:%M"])"
        } else {
            puts "\t...pph2 step 1 already done but is not completed. Running again ([clock format [clock seconds] -format "%B %d %Y - %H:%M"])"
        }

        # The sequences file is given to PPH2 (-s) only if it exists
        set fastaFile "$patientsDir/PPH2/RefSeqPsequences.fasta"
        set PPH2command [list exec $g_VaRank(pph2Dir)/bin/run_pph.pl]
        if {[file exists $fastaFile]} {
            lappend PPH2command -s $fastaFile
        }
        lappend PPH2command $inputUnite > $pph2FeaturesUnite

        ## $PPH2inputFile.tmp (SNV not yet analysed) has been created by pph2Step1IsNotCompleted.
        ## One single loop for all the SNV: as long as the features file doesn't exist,
        ## the headline of the PPH2 output ("#o_acc  o_acc  o_pos  o_aa1   o_aa2   snp_id ...") is kept.
        foreach L [LinesFromFile $PPH2inputFile.tmp] {
            if {$L == ""} {continue}
            ReplaceTextInFile $L $inputUnite

            catch {eval $PPH2command} Message

            set headlineWritten [file exists $pph2FeaturesAll]

            if {[regexp -nocase "error" $Message]} {
                if {!$headlineWritten} {
                    WriteTextInFile [lindex [LinesFromFile $pph2FeaturesUnite] 0] $pph2FeaturesAll
                }
                WriteTextInFile [ContentFromFile $inputUnite] $pph2ErrorAll
                set lastLine [lindex [LinesFromFile $pph2FeaturesUnite] end]
                if {![regexp "#o_snp_id|#o_acc" $lastLine]} {
                    WriteTextInFile $lastLine $pph2ErrorAll
                }
                WriteTextInFile "$Message\n" $pph2ErrorAll
            } elseif {!$headlineWritten} {
                # Headline + features of the SNV
                WriteTextInFile [ContentFromFile $pph2FeaturesUnite] $pph2FeaturesAll
            } else {
                # Features of the SNV only
                WriteTextInFile [lindex [LinesFromFile $pph2FeaturesUnite] end] $pph2FeaturesAll
            }
        }

        # Cleaning
        file delete -force $inputUnite $pph2FeaturesUnite
    }

    ##############################
    # Second step of PPH2 (fast) :
    ##############################
    set pph2HumVarOutput "$patientsDir/PPH2/PPH2humVar_all.txt"
    if {![pph2Step2IsNotCompleted $PPH2inputFile $pph2HumVarOutput $pph2ErrorAll]} {
        puts "\t...pph2 step 2 already done and completed, continue "
    } else {
        if {![file exists $pph2HumVarOutput]} {
            puts "\t...running pph2 step 2 ([clock format [clock seconds] -format "%B %d %Y - %H:%M"])"
        } else {
            puts "\t...pph2 step 2 already done but is not completed. Running again ([clock format [clock seconds] -format "%B %d %Y - %H:%M"])"
        }
        set PPH2command [list exec $g_VaRank(pph2Dir)/bin/run_weka.pl -l $g_VaRank(pph2Dir)/models/HumVar.UniRef100.NBd.f11.model $pph2FeaturesAll >& $pph2HumVarOutput]
        if {[catch {eval $PPH2command} Message]} {
            puts "ERROR : $Message"
        }
    }

    return
}

## Integrate PPH2 data into the alamut files and in the global variable g_ALAMUT
#
proc integratePPH2dataIntoAlamut {} {

    ## Integrate the PPH2 data into the Alamut file ($patientsDir/Alamut/AlamutAnnotations_all.txt)
    ## and reload the global variable g_ALAMUT.
    ##
    ## Globals read: g_VaRank(pph2Dir), g_VaRank(vcfDir). Global rewritten: g_ALAMUT.
    ##
    ## - The PPH2 classes (neutral or deleterious or unknown) are loaded from PPH2/PPH2humVar_all.txt,
    ##   if this file exists. Key: protein ID without version number, position, AA1, AA2.
    ##   For a key present several times, a class replaces the stored one if the stored one
    ##   is "unknown" or if the new one is "deleterious".
    ## - The Alamut file is rewritten: a "PPH2class" column is added to the headline if needed,
    ##   and filled for the lines containing "missense" which have a PPH2 class
    ##   (searched with the UniProt ID first, then with the RefSeqP ID).
    ## - g_ALAMUT is unset and reloaded with "parseAlamutFile 1".
    ##
    ## Returns an empty string. Nothing is done if g_VaRank(pph2Dir) is empty.
    ## Calls "exit" if a needed column is missing from the PPH2 output or from the Alamut headline.

    global g_VaRank
    global g_ALAMUT

    ## pph2Dir must be given in the config file or in the command line
    ## else it is not run
    if {$g_VaRank(pph2Dir) == ""} {return}

    set patientsDir $g_VaRank(vcfDir)
    set pph2HumVarOutput "$patientsDir/PPH2/PPH2humVar_all.txt"

    if {[file exists $pph2HumVarOutput]} {

        puts "...updating alamut with PPH2 ([clock format [clock seconds] -format "%B %d %Y - %H:%M"])"

        ## Loading PPH2 data (neutral or deleterious or unknown)
        puts "\t...downloading PPH2 data"

        set PPH2Header 0

        foreach L [LinesFromFile $pph2HumVarOutput] {
            # An empty line carries no data (and must not be taken for a data line without headline)
            if {$L == ""} {continue}

            if {[regexp "^#o_acc" $L]} {
                set L [split $L "\t"]
                set i_pos      [lsearch -regexp $L "o_pos"];      if {$i_pos      == -1} {puts "o_pos: column number not found. Something must be wrong with PPH2 run - Exit"; exit}
                set i_aa1      [lsearch -regexp $L "o_aa1"];      if {$i_aa1      == -1} {puts "o_aa1: column number not found. Something must be wrong with PPH2 run - Exit"; exit}
                set i_aa2      [lsearch -regexp $L "o_aa2"];      if {$i_aa2      == -1} {puts "o_aa2: column number not found. Something must be wrong with PPH2 run - Exit"; exit}
                set i_pph2     [lsearch -regexp $L "pph2_class"]; if {$i_pph2     == -1} {puts "pph2class: column number not found. Something must be wrong with PPH2 run - Exit"; exit}

                set PPH2Header 1

                continue
            }
            set L [split $L "\t"]

            if {$PPH2Header!="1"} {puts "PPH2 output header not found. Something must be wrong with PPH2 run - Exit"; exit}

            # The values are padded with spaces in the PPH2 output
            regsub -all " " [lindex $L 0] "" access
            ## We remove the version number at the end of the ID (eg: refseqP ID = NP_001185763.1)
            regsub "\\..*$" $access "" access
            regsub -all " " [lindex $L $i_pos]  "" pos
            regsub -all " " [lindex $L $i_aa1]  "" aa1
            regsub -all " " [lindex $L $i_aa2]  "" aa2
            regsub -all " " [lindex $L $i_pph2] "" actpph2

            if {![info exists pph2($access,$pos,$aa1,$aa2)] || $pph2($access,$pos,$aa1,$aa2) == "unknown" || $actpph2 == "deleterious"} {
                set pph2($access,$pos,$aa1,$aa2) $actpph2
            }
        }
    }

    ## Writing in the AlamutAnnotations_all.txt file
    puts "\t...rewriting completed Alamut file"
    set AlamutFile "$patientsDir/Alamut/AlamutAnnotations_all.txt"
    file delete -force $AlamutFile.tmp
    foreach L [LinesFromFile $AlamutFile] {
        if {[regexp "^#id" $L]} {
            set alamutText "$L"
            set L [split $L "\t"]
            set i_refseq  [lsearch -regexp $L "^protein$"]; if {$i_refseq  == -1} {puts "column number not found for refseq - Exit"; exit}
            set i_uniprot [lsearch -regexp $L "^Uniprot$"]; if {$i_uniprot == -1} {puts "column number not found for uniprot - Exit"; exit}
            set i_pos     [lsearch -regexp $L "^posAA$"];   if {$i_pos     == -1} {puts "column number not found for posAA - Exit"; exit}
            set i_aa1     [lsearch -regexp $L "^wtAA_1$"];  if {$i_aa1     == -1} {puts "column number not found for wtAA_1 - Exit"; exit}
            set i_aa2     [lsearch -regexp $L "^varAA_1$"]; if {$i_aa2     == -1} {puts "column number not found for varAA_1 - Exit"; exit}
            set i_effect  [lsearch -regexp $L "^codingEffect$"]; if {$i_effect  == -1} {puts "column number not found for codingEffect - Exit"; exit}

            # No PPH2class column yet: added at the end of the headline
            set i_ppAl    [lsearch -regexp $L "PPH2class"]; if {$i_ppAl    == -1} {
                regsub -all " $|\t$" [join $L "\t"] "" alamutText
                set alamutText "$alamutText\tPPH2class"
            }
            continue
        }

        # Selection of the lines containing "missense", the other ones are written unchanged
        if {![regexp "missense" $L]} {append alamutText "\n$L";continue}

        set L [split $L "\t"]

        set IDuniprot [lindex $L $i_uniprot]
        ## We remove the version number at the end of the ID (eg: refseqP ID = NP_001185763.1)
        regsub "\\..*$" [lindex $L $i_refseq] "" IDrefseq
        set pos [lindex $L $i_pos]
        set aa1 [lindex $L $i_aa1]
        set aa2 [lindex $L $i_aa2]

        # PPH2 class of the variant: searched with the UniProt ID, else with the RefSeqP ID
        set classFound 0
        foreach proteinID [list $IDuniprot $IDrefseq] {
            if {[info exists pph2($proteinID,$pos,$aa1,$aa2)]} {
                set pph2Class $pph2($proteinID,$pos,$aa1,$aa2)
                set classFound 1
                break
            }
        }

        if {$classFound} {
            if {$i_ppAl == -1} {
                # New column, added at the end of the line
                regsub -all " $|\t$" [join $L "\t"] "" L
                set L [split "$L\t$pph2Class" "\t"]
            } else {
                #See to add the PPH2 probability

                # The lines written without PPH2 class are shorter than the headline:
                # they are completed with empty columns, so that the class is set in the PPH2class column.
                while {[llength $L] <= $i_ppAl} {lappend L ""}
                lset L $i_ppAl $pph2Class
            }
        }
        append alamutText "\n[join $L "\t"]"
    }

    ReplaceTextInFile $alamutText $AlamutFile

    # Updating the g_ALAMUT global variable.
    puts "\t...updating alamut data in memory"
    unset g_ALAMUT
    parseAlamutFile 1

    return
}

