##########################################################################
# VaRank 1.0                                                             #
#                                                                        #
# VaRank: a simple and powerful tool for ranking genetic variants        #
#                                                                        #
# Copyright (C) 2014 Veronique Geoffroy (veronique.geoffroy@inserm.fr)   # 
#                    Jean Muller (jeanmuller@unistra.fr)                 # 
#                                                                        #
# Please cite the following article:                                     #
#    XXX                                                                 #
#                                                                        #
# This is part of VaRank source code.                                    #
#                                                                        #
# This program is free software; you can redistribute it and/or          #
# modify it under the terms of the GNU General Public License            # 
# as published by the Free Software Foundation; either version 3         # 
# of the License, or (at your option) any later version.                 #
#                                                                        #
# This program is distributed in the hope that it will be useful,        # 
# but WITHOUT ANY WARRANTY; without even the implied warranty of         #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the          #
# GNU General Public License for more details.                           #
#                                                                        #
# You should have received a copy of the GNU General Public License      #
# along with this program; If not, see <http://www.gnu.org/licenses/>.   #
##########################################################################


## Build the Alamut-Batch input file (one file for all patients) from the
## g_vcfINFOS global variable.
##
## Globals read:
##    - g_VaRank(vcfDir)  : directory under which the "Alamut" sub-directory is created
##    - g_VaRank(DEBUG)   : when it exists, the number of variations written is reported
##    - g_vcfINFOS(L_IDs) : list of the variation IDs
##    - g_vcfINFOS($ID)   : list of values for one ID; elements 0 to 3 are written after the ID
##
## OUTPUT:
##    - $g_VaRank(vcfDir)/Alamut/AlamutInputFile_all.txt
##      (one tab separated line per ID, no trailing newline in the text handed to WriteTextInFile)
##
## If the input file already exists, it is kept as it is and nothing is written.
proc createAlamutInputFile {} {

    global g_vcfINFOS
    global g_VaRank

    set patientsDir $g_VaRank(vcfDir)
    set AlamutInputFile "$patientsDir/Alamut/AlamutInputFile_all.txt"

    if {![file exists "$patientsDir/Alamut"]} {file mkdir "$patientsDir/Alamut"}

    puts "...creation of the alamut input file ($AlamutInputFile) ([clock format [clock seconds] -format "%B %d %Y - %H:%M"])"

    if {[file exists $AlamutInputFile]} {puts "\t...$AlamutInputFile already exists, continue"; return}

    ## One line per variation: ID followed by the first 4 values stored for this ID.
    set nbSNV 0
    set lLines {}
    foreach ID $g_vcfINFOS(L_IDs) {
        lappend lLines "$ID\t[join [lrange $g_vcfINFOS($ID) 0 3] "\t"]"
        incr nbSNV
    }

    ## "puts" is a command, not a variable: the former "$puts" raised
    ## 'can't read "puts": no such variable' as soon as DEBUG was set.
    if {[info exists g_VaRank(DEBUG)]} {puts "\t...$nbSNV variations created"}

    WriteTextInFile [join $lLines "\n"] $AlamutInputFile
    return
}


## Check whether Alamut-Batch still has variations to analyse.
##
## Return 0 if all the input file variations have been analysed or if the input file is empty.
## Else, return 1 and create an inputFile.tmp with the variations not yet analysed.
##
## Arguments:
##    - inputFile    : Alamut input file (one variation per line, ID in the first column)
##    - annFile      : Alamut annotations file; the first column of each line is taken as an analysed ID
##    - unnanFile    : Alamut unannotated file; the first column of each line is taken as an analysed ID
##    - outputfile   : Alamut log file, scanned for "Cannot load gene" messages
##    - segmentation : (default 0) when true, the first variation not yet analysed is
##                     not put in inputFile.tmp. It is appended to $unnanFile with the
##                     message "child killed: segmentation violation" instead.
##
## Side effects:
##    - $inputFile.tmp is deleted then (re)created if needed
##    - the "...CannotLoadGene.txt" file (name derived from inputFile) is deleted then
##      recreated if some IDs could not be loaded by Alamut
##    - $unnanFile.strange is deleted
##
## NOTE: the header line of the annotations file (#id gene geneId chrom transcript ...)
## seems to be missing when Alamut is rerun from the .tmp files.
proc AlamutIsNotCompleted {inputFile annFile unnanFile outputfile {segmentation 0}} {

    regsub "InputFile_all.txt" $inputFile "CannotLoadGene.txt" cannotloadgeneFile

    if {[file size $inputFile] == 0} {
        puts "\t...Empty input file for Alamut: $inputFile"
        return 0
    }

    file delete -force $inputFile.tmp

    ## Load into TabIDana the ID already analysed by Alamut (annotated or not).
    set nbIDana 0
    foreach resultFile [list $annFile $unnanFile] {
        if {![file exists $resultFile]} {continue}
        foreach L [LinesFromFile $resultFile] {
            set ID [lindex $L 0]
            if {![info exists TabIDana($ID)]} {
                set TabIDana($ID) 1
                incr nbIDana
            }
        }
    }

    ## Nothing analysed yet: everything has to be (re)run.
    if {$nbIDana == 0} {
        if {$segmentation} {
            ## The first variation of the input file is the one removed from the
            ## new input, so it is also the one reported in the unannotated file
            ## (formerly the second line was reported while the first one was removed).
            set lLines [LinesFromFile $inputFile]
            WriteTextInFile [join [lrange $lLines 1 end] "\n"] $inputFile.tmp
            WriteTextInFile "[lindex $lLines 0]: child killed: segmentation violation" $unnanFile
        } else {
            file copy -force $inputFile $inputFile.tmp
        }
        return 1
    }

    ## Load into TabIDinvalid the ID that need to be updated by Alamut developer:
    ## Genes defined with "Invalid data while loading gene" or "Cannot load gene" must be requested by mail to the Alamut developers.
    ## These ID don't work with Alamut, we do not put them in the inputfile.
    if {[file exists $outputfile]} {
        foreach L [LinesFromFile $outputfile] {
            if {[regexp "Cannot load gene (.*)" $L match gene]} {
                ## Remove the first ":" of the line before extracting its first word as the ID
                regsub ":" $L "" L
                set ID [lindex $L 0]
                if {![info exists TabIDinvalid($ID)]} {set TabIDinvalid($ID) $gene}
            }
        }
    }

    set test 0

    ## Some ID (With Knome for example) were too long to be treated by a lsearch:
    ## "couldn't compile regular expression pattern: out of memory"
    ## The array lookups used here do not have this limit, so nothing is written
    ## in $unnanFile.strange anymore; a file left by a previous run is removed.
    file delete -force $unnanFile.strange

    ## inputFile.tmp creation:
    file delete -force $cannotloadgeneFile
    foreach L [LinesFromFile $inputFile] {
        if {$L == ""} {continue}

        set ID [lindex $L 0]

        ## ID that Alamut cannot load: reported, never resubmitted
        if {[info exists TabIDinvalid($ID)]} {
            WriteTextInFile "Cannot load gene [set TabIDinvalid($ID)]!!! $ID not analysed" $cannotloadgeneFile
            continue
        }

        if {![info exists TabIDana($ID)]} {
            if {$segmentation} {
                ## Only the first ID not yet analysed is held responsible for the crash
                set segmentation 0
                WriteTextInFile "$L: child killed: segmentation violation" $unnanFile
                continue
            }
            set test 1
            WriteTextInFile $L $inputFile.tmp
        }
    }

    if {[file exists $cannotloadgeneFile]} {
        WriteTextInFile "\nPlease, ask for an update for these genes to the Alamut-Batch developers." $cannotloadgeneFile
    }

    return $test
}

#################################################################################################
# Build the Alamut-Batch command line as a Tcl list (ready for "eval exec").
#
# Building a list instead of a string keeps each path / HGMD credential as ONE
# argument even if it contains spaces or Tcl special characters.
#
# Arguments:
#    - inFile, annFile, unnanFile : values of the --in, --ann and --unann options
#    - outputFile                 : file receiving the output of Alamut-Batch (exec ">>&" redirection)
#
# The --hgmdUser / --hgmdPasswd options are added only if both g_VaRank(hgmdUser)
# and g_VaRank(hgmdPasswd) exist and are not empty.
#################################################################################################
proc runAlamut_buildCmd {inFile annFile unnanFile outputFile} {

    global g_VaRank

    set cmd [list "$g_VaRank(alamutDir)/alamut-batch" --in $inFile --ann $annFile --unann $unnanFile --alltrans --outputannonly]
    if {([info exists g_VaRank(hgmdUser)] && $g_VaRank(hgmdUser) != "") && ([info exists g_VaRank(hgmdPasswd)] && $g_VaRank(hgmdPasswd) != "")} {
        lappend cmd --hgmdUser $g_VaRank(hgmdUser) --hgmdPasswd $g_VaRank(hgmdPasswd)
    }
    lappend cmd ">>&" $outputFile
    return $cmd
}

#################################################################################################
# Exit VaRank if the Alamut-Batch or the HGMD Pro license problem is reported either
# in the exec error message ($Message) or in the Alamut output ($ErrorLines).
# $hgmdHeadline is the first line displayed for the HGMD Pro case.
# Do nothing if none of these errors is found.
#################################################################################################
proc runAlamut_exitOnLicenseError {Message ErrorLines hgmdHeadline} {

    set What "expired license"
    if {[regexp $What $Message]||[regexp $What $ErrorLines]} {
        #Sorry. Access denied: expired license

        puts "##############################################################"
        puts "\t...Alamut-Batch expired license!"
        puts "\t   Please make sure your Alamut-Batch license is still active."
        puts "\t   EXIT of VaRank!"
        puts "##############################################################"
        exit
    }

    set What "HGMD Pro authentication failed"
    if {[regexp $What $Message]||[regexp $What $ErrorLines]} {
        #HGMD Pro authentication failed: Error: Subscription has expired.

        puts "############################################################################"
        puts $hgmdHeadline
        puts "\t   Please make sure that your HGMD pro license is still active."
        puts "\t   EXIT of VaRank!"
        puts "############################################################################"
        exit
    }
    return
}

#################################################################################################
# Append the results of a rerun ($annFile.tmp, $unnanFile.tmp) to $annFile and $unnanFile,
# then delete both .tmp files.
# The first line of $annFile.tmp is not copied (presumably the header line - to be confirmed).
#################################################################################################
proc runAlamut_mergeTmpResults {annFile unnanFile} {

    if {[file exists $annFile.tmp] && [file size $annFile.tmp] != 0} {
        set infosAnn [lrange [LinesFromFile $annFile.tmp] 1 end]
        if {$infosAnn != ""} {
            WriteTextInFile [join $infosAnn "\n"] $annFile
        }
    }
    if {[file exists $unnanFile.tmp] && [file size $unnanFile.tmp] != 0} {
        WriteTextInFile [join [LinesFromFile $unnanFile.tmp] "\n"] $unnanFile
    }

    file delete -force $annFile.tmp
    file delete -force $unnanFile.tmp
    return
}

#################################################################################################
# Run Alamut-Batch on the input file, and rerun it on the variations not yet analysed
# until everything is done (max 10 reruns).
#
# OUTPUT:
#    - $patientsDir/Alamut/AlamutAnnotations_all.txt
#    - $patientsDir/Alamut/AlamutUnannotated_all.txt
#    - $patientsDir/Alamut/AlamutOutput_all.txt
#
# RETURN :
#    - return "1" if Alamut has been run for all the variations
#    - else "exit" (if alamut has been rerun 10 times and is still not completed)
#     	 	   (or if the alamut / HGMD Pro license expired)
#################################################################################################
proc runAlamut {} {

    global g_VaRank

    set patientsDir $g_VaRank(vcfDir)
    set AlamutInputFile "$patientsDir/Alamut/AlamutInputFile_all.txt"
    regsub "AlamutInputFile"   $AlamutInputFile "AlamutAnnotations"  annFile
    regsub "AlamutInputFile"   $AlamutInputFile "AlamutUnannotated"  unnanFile
    regsub "AlamutInputFile"   $AlamutInputFile "AlamutOutput"       outputFile
    regsub "InputFile_all.txt" $AlamutInputFile "CannotLoadGene.txt" cannotloadgeneFile

    file delete -force $outputFile
    file delete -force $cannotloadgeneFile

    ## Running Alamut for the first time if the output files don't exist.
    #
    puts "...running Alamut-Batch ([clock format [clock seconds] -format "%B %d %Y - %H:%M"])"

    if {![file exists $annFile] && ![file exists $unnanFile]} {
        ReplaceTextInFile "Alamut-Batch started: [clock format [clock seconds] -format "%B %d %Y - %H:%M"]" $outputFile

        set alamutCmd [runAlamut_buildCmd $AlamutInputFile $annFile $unnanFile $outputFile]

        set doitagain 1
        while {$doitagain} {

            if {[catch {eval exec $alamutCmd} Message]} {
                ## Alamut has implemented a token system so that we can run only x instance(s) at once.
                ## Else we get the error message "Sorry. ... Access denied."

                WriteTextInFile "\nAlamut unexpected stop\n$Message" $outputFile

                set ErrorLines ""
                if {[file exists $outputFile]} {
                    set ErrorLines [join [LinesFromFile $outputFile] " "]
                }

                ## NOTE(review): ErrorLines holds the whole output file, including the
                ## messages of the previous attempts. Once "key in use" has been logged,
                ## any later failure also matches and waits again - to be confirmed.
                set What "Sorry. Access denied: key .* in use by"
                if {[regexp $What $Message]||[regexp $What $ErrorLines]} {
                    puts "\t...Alamut-Batch access denied because key in use, waiting."
                    after 600000; # wait 10 min for an instance to be released
                } else {
                    set doitagain 0
                }

                runAlamut_exitOnLicenseError $Message $ErrorLines "\t...HGMD pro expired license!"
            } else {
                set doitagain 0
            }
        }
        WriteTextInFile "Alamut-Batch finished: [clock format [clock seconds] -format "%B %d %Y - %H:%M"]\n\n" $outputFile
    } elseif {![AlamutIsNotCompleted $AlamutInputFile $annFile $unnanFile $outputFile]} {
        puts "\t...Alamut-Batch has already analysed all variations, continue ([clock format [clock seconds] -format "%B %d %Y - %H:%M"])"
        return 1
    }

    if {![info exists g_VaRank(AlamutHeader)]} {
        ## Backup of the header of the annotations file: it can be missing later
        ## (the results of the reruns are appended without their first line).
        if {[file exists $annFile]} {
            set Header [FirstLineFromFile $annFile]
            if {![regexp {^#} $Header]} {puts "\t...WARNING Trying to save the Alamut-Batch header but already missing"} else {set g_VaRank(AlamutHeader) $Header}
        }
    }

    ## Rerun Alamut until all variations have been analysed (max executed 10 times)
    #
    ## The completion status is kept in a variable: testing only the number of runs
    ## made VaRank exit even when the 10th run had analysed the last variations.
    set i 0
    set notCompleted [AlamutIsNotCompleted $AlamutInputFile $annFile $unnanFile $outputFile]

    while {$notCompleted && $i < 10} {
        puts "\t...checking if Alamut-Batch has already run on all ([clock format [clock seconds] -format "%B %d %Y - %H:%M"])"
        puts "\t...Alamut-Batch didn't finish to analyse all variations. New run is launched ([clock format [clock seconds] -format "%B %d %Y - %H:%M"])"
        WriteTextInFile "Alamut-Batch started: [clock format [clock seconds] -format "%B %d %Y - %H:%M"]" $outputFile

        ## The rerun works on the .tmp files only (input created by AlamutIsNotCompleted)
        set alamutCmd [runAlamut_buildCmd $AlamutInputFile.tmp $annFile.tmp $unnanFile.tmp $outputFile]

        set doitagain 1
        set segmentation 0
        while {$doitagain} {

            WriteTextInFile "\n[join $alamutCmd " "]" $outputFile
            if {[catch {eval exec $alamutCmd} Message]} {
                WriteTextInFile "\nAlamut unexpected stop\n$Message" $outputFile

                set ErrorLines ""
                if {[file exists $outputFile]} {
                    set ErrorLines [join [LinesFromFile $outputFile] " "]
                }

                runAlamut_exitOnLicenseError $Message $ErrorLines "\t...HGMD pro license seems to be expired!"

                set What "child killed: segmentation violation"
                if {[regexp $What $Message]||[regexp $What $ErrorLines]} {
                    #Some ID caused "segmentation violation" in the alamut program only on debian. Not successfully debugged by alamut.

                    puts "############################################################################"
                    puts "\t...Alamut-Batch segmentation violation"
                    puts "############################################################################"

                    # We will remove the bad ID from the input file
                    set segmentation 1

                    # Keep what has been analysed before the crash
                    runAlamut_mergeTmpResults $annFile $unnanFile

                    # The output is moved to $outputFile.$i so that the error message is not found again at the next try
                    WriteTextInFile [join [LinesFromFile $outputFile] "\n"] $outputFile.$i
                    file delete -force $outputFile

                    # New $AlamutInputFile.tmp without the ID already analysed and without the bad ID
                    AlamutIsNotCompleted $AlamutInputFile $annFile $unnanFile $outputFile $segmentation
                    continue
                }

                set What "Sorry. Access denied: key .* in use by"
                if {[regexp $What $Message]||[regexp $What $ErrorLines]} {
                    after 600000; # wait 10 min for an instance to be released

                    runAlamut_mergeTmpResults $annFile $unnanFile

                    # The output is moved to $outputFile.$i so that the error message is not found again at the next try
                    WriteTextInFile [join [LinesFromFile $outputFile] "\n"] $outputFile.$i
                    file delete -force $outputFile
                } else {
                    set doitagain 0
                }
            } else {
                WriteTextInFile "\n$Message" $outputFile
                set doitagain 0
            }
        }
        WriteTextInFile "Alamut-Batch finished: [clock format [clock seconds] -format "%B %d %Y - %H:%M"]\n\n" $outputFile

        incr i
        runAlamut_mergeTmpResults $annFile $unnanFile

        set notCompleted [AlamutIsNotCompleted $AlamutInputFile $annFile $unnanFile $outputFile]
    }

    if {$notCompleted} {
        # Alamut did not run on all variations
        if {[file exists $cannotloadgeneFile]} {
            puts "[ContentFromFile $cannotloadgeneFile]\n"
        }

        puts "##############################################################"
        puts "\t   WARNING: Alamut-Batch is not finished after 10 run!"
        puts "\t   EXIT of VaRank!"
        puts "##############################################################"
        exit
    }

    ## Alamut has been run for all the variations
    if {[file exists $cannotloadgeneFile]} {
        puts "[ContentFromFile $cannotloadgeneFile]\n"
    }

    return 1
}

## Load the Alamut-Batch annotations file into the g_ALAMUT global variable.
##
## Arguments:
##    - afterPPH : (default 0) when true, the "...parsing Alamut-Batch results" message is not displayed
##
## Globals:
##    - g_VaRank(vcfDir)       : read, location of Alamut/AlamutAnnotations_all.txt
##    - g_VaRank(AlamutHeader) : read, header saved earlier, used if the file has lost its header
##    - g_vcfINFOS($ID)        : read, only the data lines with a known ID are loaded
##    - g_ALAMUT($ID)          : written, list of the annotation lines of this ID
##                               (the empty columns are replaced by NA)
##
## reminder: backup of the headline into g_ALAMUT(#id)
##
## Exit of VaRank if the first line of the file is not a header (does not begin
## with "#") and no header has been saved in g_VaRank(AlamutHeader).
proc parseAlamutFile {{afterPPH 0}} {

    global g_ALAMUT
    global g_VaRank
    global g_vcfINFOS

    set patientsDir $g_VaRank(vcfDir)
    set annFile "$patientsDir/Alamut/AlamutAnnotations_all.txt"

    if {! $afterPPH} {
        puts "...parsing Alamut-Batch results ([clock format [clock seconds] -format "%B %d %Y - %H:%M"])"
    }

    set First 1
    foreach L [LinesFromFile $annFile] {

        set ID [lindex $L 0]

        ## Lines to store for this iteration (usually only the line just read)
        set lToStore [list $L]

        #Analyzing the alamut header
        #
        if {$First} {
            set First 0
            if {![regexp {^#} $L]} {
                if {[info exists g_VaRank(AlamutHeader)] && [set g_VaRank(AlamutHeader)]!=""} {
                    ## Header missing: the saved header is stored under its own ID, and the
                    ## line just read is handled as any other data line (stored only once,
                    ## and only if its ID is known). It was formerly stored twice and the
                    ## header was not restored.
                    set lToStore [list [set g_VaRank(AlamutHeader)]]
                    if {[info exists g_vcfINFOS($ID)]} {lappend lToStore $L}
                    puts "\t...WARNING Alamut-Batch output file seems to miss the header. Rescue done!"
                } else {
                    puts "\t...WARNING Alamut-Batch output file seems to miss the header. Exit of VaRank!"
                    exit
                }
            }
        } else {
            #Sometimes we compute and remove variants from annotations files here add selectively to load only variants from the patients analyzed
            if {![info exists g_vcfINFOS($ID)]} {continue}
        }

        foreach line $lToStore {
            ## The ID is read before the NA replacement, as for the line read from the file
            set lineID [lindex $line 0]

            #Replacing the empty columns by NA
            #   Note regsub is faster
            #   (several passes are needed for 3 or more consecutive tabulations)
            while {[regexp "\t\t" $line]} {
                regsub -all "\t\t" $line "\tNA\t" line
            }
            regsub "\t$" $line "\tNA" line

            ## ID corresponds to 1 genetic variant.
            ## There may be multiple transcripts per genetic variant (strand +, strand - different splicing).
            ## This need to be handled specifically later to ensure coherence of the specific data for each isoform
            lappend g_ALAMUT($lineID) $line
        }
    }

    return
}

