#rR gscope_yeast.tcl
#rR Here you'll find 2 kinds of projects
#rR   1/ the Joy project with 9 yeast genomes (Johan's Yeasts)
#rR   2/ the YeastGenome with 92 strains + S288C

#rR The Joy project
#rR We have 9 .gbff genbank files containing the chromosomes of 9 yeast genomes.

# JoyCreateTfasDesCopains
#   For every PAB of the current project, reads its "blastpJoy" blast output,
#   keeps the first hit of each Joy species (two-letter Gs code taken from the
#   hit name) and stores the concatenated protein TFA files of these hits in
#   the "tfasdescopains" directory. PABs with fewer than 2 hits kept are skipped.
#   Returns the list of values returned by SauveLesLignes (one per file written).
proc JoyCreateTfasDesCopains {} {
    file mkdir "[RepertoireDuGenome]/tfasdescopains"
    set LesFichiersCrees {}
    foreach Nom [ListeDesPABs] {
        Espionne $Nom
        set FicBlast [GscopeFile $Nom "blastpJoy"]
        if {[FileAbsent $FicBlast]} { continue }
        set NbHits [DecortiqueBlast $FicBlast 1.0e-10 "" Query LesHits "" "" "" LesExpects]
        if {$NbHits==0} { continue }
        # DejaVu remembers the species already represented for this PAB
        if {[info exists DejaVu]} { unset DejaVu }
        set LeTfasDesCopains {}
        foreach Hit $LesHits Expect $LesExpects {
            if { ! [regexp {(Joy([A-Z][a-z])CDS)([0-9]+)$} $Hit Match JoyFriend Gs I]} { continue }
            if {[info exists DejaVu($Gs)]} { continue }
            set DejaVu($Gs) 1
            Espionne =========== $Hit $Gs $I $JoyFriend $Expect
            lappend LeTfasDesCopains [ContenuDuFichier "[JoyDir]/$JoyFriend/prottfa/$Hit"]
        }
        if {[llength $LeTfasDesCopains]<2} { continue }
        lappend LesFichiersCrees [SauveLesLignes $LeTfasDesCopains dans [GscopeFile $Nom "tfasdescopains"]]
    }
    return $LesFichiersCrees
}

# JoyVersusOi
#   Debug helper: prints, for each taxid known to JoyCode, the organism name
#   given by JoyCode and the one given by OiCode, then terminates the process.
proc JoyVersusOi {} {
    foreach Ox [JoyCode ListOf Ox] {
        set Os [JoyCode $Ox OS]
        set OiOs [OiCode $Ox OS]
        Espionne $Ox $Os = $OiOs =
    }
    exit
}

# DeployJoy
#   Documentation-only proc: its body contains no command, only the recipe below.
proc DeployJoy {} {
    #rR Johan's Yeasts
    #rR We start with the list of .gbff files in /genomics/link/Joy/DuNCBI
    #rR JoyCode provides the lists of Files, Os (the full organism name), Gs (Genus species), Ox (taxid)
    #rR    and returns one info from another: i.e. JoyCode $Os Ox returns the taxid, etc.
    #rR JoyCreateGenbankFiles creates directories and stores the genbank files in Joy/JoyGsChr/genbankfiles for Gs in Dh Kc ...
    #rR JoyGenome allows to query all these data (JoyGenome ListOf Strain, JoyGenome JoyScChr ListOfChr, etc.)
    #rR CreateJoyGscopeProjectsPourTous runs
    #rR    CreateJoyChrGscopeProject
    #rR    CreateJoyCDSGscopeProject
    #rR So simply run :   JoyCreateGenbankFiles   CreateJoyGscopeProjectsPourTous
    #rR And you get the 2x9 gscope projects JoyDhChr, JoyDhCDS, ... JoyYlChr, JoyYlCDS
    #rR    i.e. the JoyScChr project contains the chromosomes JoyScChr01 ... JoyScChr17
    #rR         the JoyScCDS project contains the mRNA and proteins JoyScCDS0001 ... JoyScCDS6008
    #rR         and also the tRNA, rRNA and ncRNA
    #rR If all projects are created we need to calculate the alignments.
    #rR   1/ Create all blastpdatabase foreach Gs JoyGsCDS/banques/AllProttfa
    #rR   2/ Create the .pal merging all these blastdatabases in JoyScCDS/banques/
    #rR   3/ Run the 6008 blast for all Sc proteins
    #rR   4/ Create the 6008 tfasdescopains/
    #rR   5/ Run the 6008 DbClustal
    #rR   6/ Create the nucleotide alignments using the protein alignments.
    #rR
    #rR Now you can run the CirCode Statistics as with MAMA : Joy00, Joy01, etc
}

# JoyCreateAllProttfaPourTous
#   Calls JoyCreateAllProttfa for every JoyXxCDS project; returns the list of results.
proc JoyCreateAllProttfaPourTous {} {
    set LesBanquesCreees {}
    foreach Science [JoyGenome ListOf JoyCDS] {
        lappend LesBanquesCreees [JoyCreateAllProttfa $Science]
    }
    return $LesBanquesCreees
}

# JoyCreateAllProttfa
#   Asks the project named Science (through QuestionDeScience) to run
#   "CreeUneBanqueBlast AllProttfa" and returns the answer.
proc JoyCreateAllProttfa Science {
    return [QuestionDeScience $Science "ret CreeUneBanqueBlast AllProttfa"]
}

# JoyCreateAllJoyAllProttfa
#   Writes banques/AllJoyAllProttfa.pal in the current project: an alias file
#   whose DBLIST line enumerates the AllProttfa bank of every JoyXxCDS project.
#   Returns what SauveLesLignes returns.
proc JoyCreateAllJoyAllProttfa {} {
    set AllBanks {}
    foreach JoyCDS [JoyGenome ListOf JoyCDS] {
        lappend AllBanks "[JoyDir]/$JoyCDS/banques/AllProttfa"
    }
    set LePal {}
    lappend LePal "# Alias file created by JoyCreateAllJoyPal"
    lappend LePal "#"
    lappend LePal "TITLE AllJoyAllProttfa"
    lappend LePal "#"
    lappend LePal "DBLIST [join $AllBanks { }]"
    lappend LePal "#"
    return [SauveLesLignes $LePal dans "[RepertoireDuGenome]/banques/AllJoyAllProttfa.pal"]
}

# JoyDir
#   Root directory of the Joy projects.
proc JoyDir {} {
    return "/genomics/link/Joy"
}

# CreateJoyGscopeProjectsPourTous
#   Creates the Chr and CDS projects of every Joy strain.
#   A strain whose CDS project already has beton/adn is skipped; when Ask is
#   "Ask" (case-insensitive) the user is asked whether to keep the existing one.
#   Returns the list of results of CreateJoyGscopeProjects (empty if none was run).
proc CreateJoyGscopeProjectsPourTous {{Ask ""}} {
    set Ask [string equal -nocase $Ask "Ask"]
    # Must exist even when every strain is skipped, otherwise the final return fails
    set LesFaits {}
    foreach Strain [JoyGenome ListOf Strain] {
        set FichierAdn "/genomics/link/Joy/${Strain}CDS/beton/adn"
        if {[file exists $FichierAdn] && (! $Ask || [OuiOuNon "$FichierAdn already exists. Do I keep it" 0])} { continue }
        set Fait [CreateJoyGscopeProjects $Strain]
        lappend LesFaits $Fait
    }
    return $LesFaits
}

# CreateJoyGscopeProjects
#   Creates the ${Strain}Chr then the ${Strain}CDS project; returns "${Strain}CDS".
proc CreateJoyGscopeProjects Strain {
    CreateJoyChrGscopeProject $Strain
    CreateJoyCDSGscopeProject $Strain
    Espionne "${Strain}Chr et ${Strain}CDS ont été créés. Pour les voir :"
    Espionne "setgscoperr ${Strain}Chr ; gscope"
    Espionne "setgscoperr ${Strain}CDS ; gscope"
    return "${Strain}CDS"
}

# CreateJoyCDSGscopeProject
#   Joy flavour of CreateYeastCDSGscopeProject.
proc CreateJoyCDSGscopeProject {Strain} {
    return [CreateYeastCDSGscopeProject $Strain "Joy"]
}

# CreateJoyChrGscopeProject
#   Builds the ${Strain}Chr project under [JoyDir] from the per-chromosome
#   genbank files already present in its genbankfiles/ directory.
#   For each chromosome it writes cds/, rna/, nuctfa/ and infos/ files, then
#   fiches/bornesdespabs (via CreeBornesDesPABsTroisGradins) and beton/miniconfig.
#   Returns the text of the miniconfig.
#   NOTE(review): blank runs inside the regexp patterns below appear as single
#   spaces in this copy of the file; GenBank feature keys are indented by
#   several columns - confirm the intended widths against the real file.
proc CreateJoyChrGscopeProject Strain {
    set StrainChr "${Strain}Chr"
    set RG "[JoyDir]/$StrainChr"
    if { ! [file exists $RG]} { file mkdir $RG }
    set Link "/genomics/link/$StrainChr"
    if { ! [file exists $Link]} { exec ln -sf $RG /genomics/link/$StrainChr }

    set Prefixe $StrainChr
    NousAllonsAuBoulot $RG
    file mkdir beton
    file mkdir fiches
    file mkdir infos
    file mkdir genbankfiles
    file mkdir nuctfa
    file mkdir nucembl
    file mkdir cds
    file mkdir rna

    set N 0
    foreach Chr [JoyGenome $StrainChr ListOfChr] {
        incr N
        set Nom $Strain$Chr
        set FicGb "genbankfiles/$Nom"
        Espionne "$Nom $FicGb"
        set FicTfa nuctfa/$Nom
        set FicInfos infos/$Nom
        set LesLignesGb [LesLignesDuFichier $FicGb]

        set Locus "LocusUnknown"
        set Definition "DefinitionUnknown"
        set OS "OSUnknown"
        set OX "OXUnknown"
        set OnAttendOrigin 1
        set Sequence ""
        set OnEstDansCDS 0
        set OnEstDansRNA 0
        set LesCDSs {}
        set LesRNAs {}
        set CEstFini 0
        foreach Ligne $LesLignesGb {
            if {[regexp "^//" $Ligne]} { break }
            regexp {^LOCUS +([^ ]+)( |$)} $Ligne Match Locus
            regexp {^DEFINITION +(.+)$} $Ligne Match Definition
            regexp {/organism=\"([^\"]+)\"} $Ligne Match OS
            regexp {taxon\:([^\"]+)\"} $Ligne Match OX
            # Start of a CDS feature
            if { ! $OnEstDansCDS && [regexp {^ CDS } $Ligne]} {
                set OnEstDansCDS 1
                set LeCDS {}
                lappend LeCDS $Ligne
                continue
            }
            # Start of an rRNA / tRNA / ncRNA feature
            if { ! $OnEstDansRNA && [regexp {^ (r|t|nc)RNA } $Ligne]} {
                set OnEstDansRNA 1
                set LeRNA {}
                lappend LeRNA $Ligne
                continue
            }
            if {$OnEstDansCDS} {
                if {$OnAttendOrigin && [regexp "^ORIGIN" $Ligne]} { set OnAttendOrigin 0 ; set CEstFini 1 }
                # The feature ends at ORIGIN or at the next feature key
                if {$CEstFini || [regexp -nocase {^ [a-z]} $Ligne]} {
                    set CDS [NiceCDSFromCDSLines $LeCDS]
                    #Espionne $CDS
                    lappend LesCDSs $CDS
                    set OnEstDansCDS 0
                    continue
                }
                lappend LeCDS $Ligne
                continue
            }
            if {$OnEstDansRNA} {
                if {$OnAttendOrigin && [regexp "^ORIGIN" $Ligne]} { set OnAttendOrigin 0 ; set CEstFini 1 }
                if {$CEstFini || [regexp -nocase {^ [a-z]} $Ligne]} {
                    set RNA [NiceRNAFromRNALines $LeRNA]
                    #Espionne $RNA
                    lappend LesRNAs $RNA
                    set OnEstDansRNA 0
                    continue
                }
                lappend LeRNA $Ligne
                continue
            }
            if {$OnAttendOrigin && [regexp "^ORIGIN" $Ligne]} { set OnAttendOrigin 0 ; continue }
            if {$OnAttendOrigin} { continue }
            # After ORIGIN: keep only the letters of the sequence lines
            set Seq $Ligne
            regsub -all -nocase {[^a-z]} $Seq "" Seq
            append Sequence $Seq
        }
        #FaireLire [string length $Sequence]
        Sauve $LesCDSs dans cds/$Nom
        Sauve $LesRNAs dans rna/$Nom
        set Entete ">$Nom $Locus $OS DE=$Definition OX=$OX"
        set Tfa [SequenceFormatTFA $Sequence $Entete "nucbrut"]
        Sauve $Tfa dans $FicTfa
        set LesInfos {}
        set OC [TaxClass $OX Name]
        regsub {cellular organism; } $OC "" OC
        lappend LesInfos "Nom: $Nom"
        lappend LesInfos "Locus: $Locus"
        lappend LesInfos "Definition: $Definition"
        lappend LesInfos "OS: $OS"
        lappend LesInfos "OX: $OX"
        lappend LesInfos "OC: $OC"
        SauveLesLignes $LesInfos dans $FicInfos
        #EspionneL $LesInfos
    }
    #EspionneL $LesCDSs
    set FicBornes "fiches/bornesdespabs"
    # Numbering format aligned with CreateYeastChrGscopeProject ("%2.2d");
    # the Joy version used to pass "2.2d" without the leading percent sign.
    CreeBornesDesPABsTroisGradins 1 $N $Prefixe 901 "" "%2.2d" ./

    # OS, OC and OX are those of the last chromosome processed
    set LeMini {}
    lappend LeMini "#MiniConfig"
    lappend LeMini "FichierSequenceADN"
    lappend LeMini "VersionDeGscopeAuBigBang [VersionDeGscope]"
    lappend LeMini "PrefixeDesORFs $Prefixe"
    lappend LeMini "LongueurMiniDesORFs 300"
    lappend LeMini "NotreOS $OS"
    lappend LeMini "NotreOC $OC"
    lappend LeMini "NotreOX $OX"
    lappend LeMini "GenomeOuCollection Collection"
    set Mini [join $LeMini "\n"]
    Sauve $Mini dans "beton/miniconfig"
    OnRevientDuBoulot
    return $Mini
}

# JoyCreateGenbankFiles
#   Splits each .gbff file known to JoyCode into one genbank file per record
#   (records end with a "//" line), stored as
#   [JoyDir]/Joy<Gs>Chr/genbankfiles/Joy<Gs>ChrNN with NN = rank of the record.
#   Returns the list of files written.
proc JoyCreateGenbankFiles {} {
    set LesFicGb {}
    foreach Gs [JoyCode ListOf Gs] {
        set File [JoyCode $Gs File]
        set JoyName "Joy$Gs"
        set RgDir "[JoyDir]/${JoyName}Chr"
        file mkdir $RgDir
        file mkdir "$RgDir/genbankfiles"
        set LeGb {}
        Espionne $File
        set Chr 0
        set ChrLine ""
        foreach Ligne [LesLignesDuFichier $File] {
            lappend LeGb $Ligne
            if {[regexp "^//" $Ligne]} {
                incr Chr
                Espionne "$Chr $ChrLine"
                set ChrLine ""
                Espionne [llength $LeGb]
                set NiceChr [format "%s%2.2d" Chr $Chr]
                set FicGb "$RgDir/genbankfiles/${JoyName}$NiceChr"
                SauveLesLignes $LeGb dans $FicGb
                lappend LesFicGb $FicGb
                set LeGb {}
            }
            # Only used for the trace above
            if {[regexp {chromosome=\"} $Ligne]} { set ChrLine $Ligne }
        }
    }
    return $LesFicGb
}

# JoyGenome
#   Memoised query on the Joy*Chr directories found in [JoyDir].
#   Keys are upper-cased: (LISTOF,STRAIN) (LISTOF,JOYCHR) (LISTOF,JOYCDS)
#   (<JOYCHR>,LISTOFCHRFILE) (<JOYCHR>,LISTOFCHR).
#   JoyScChr is always placed first. Unknown keys return "".
proc JoyGenome {{Qui ""} {Quoi ""}} {
    global JoyGenome
    set Qui [string toupper $Qui]
    set Quoi [string toupper $Quoi]
    if {[info exists JoyGenome($Qui,$Quoi)]} { return $JoyGenome($Qui,$Quoi) }
    if {[info exists JoyGenome("EstCharge")]} { return "" }
    set JoyGenome("EstCharge") 1

    NousAllonsAuBoulot [JoyDir]
    set LesJoyChr [lsort [glob -nocomplain "Joy*Chr"]]
    # Move the reference genome to the head of the list
    set Reference "JoyScChr"
    set IRef [lsearch $LesJoyChr $Reference]
    set LesJoyChrWithoutReference [lreplace $LesJoyChr $IRef $IRef]
    set LesJoyChr [concat [list $Reference] $LesJoyChrWithoutReference]
    foreach JoyChr $LesJoyChr {
        regsub {Chr$} $JoyChr "" Strain
        regsub {Chr$} $JoyChr "CDS" JoyCDS
        lappend JoyGenome(LISTOF,STRAIN) $Strain
        lappend JoyGenome(LISTOF,JOYCHR) $JoyChr
        lappend JoyGenome(LISTOF,JOYCDS) $JoyCDS
    }
    foreach JoyChr $JoyGenome(LISTOF,JOYCHR) {
        set JOYCHR [string toupper $JoyChr]
        foreach ChrPath [lsort [glob "$JoyChr/genbankfiles/${JoyChr}*"]] {
            set ChrFile [file tail $ChrPath]
            if { ! [regexp {Chr[0-9][0-9]$} $ChrFile Chr]} { FaireLire "Oups something is wrong with $ChrFile" }
            lappend JoyGenome($JOYCHR,LISTOFCHRFILE) $ChrFile
            lappend JoyGenome($JOYCHR,LISTOFCHR) $Chr
        }
    }
    OnRevientDuBoulot
    return [JoyGenome $Qui $Quoi]
}

# JoyCode
#   Memoised cross-reference built from the .gbff files of [JoyDir]/DuNCBI.
#   For each file it records File, Os (first two words after ORGANISM),
#   Gs (initials of these two words) and Ox (first taxon db_xref), each
#   reachable from any of the others (keys are upper-cased), plus the lists
#   (LISTOF,FILE) (LISTOF,OS) (LISTOF,GS) (LISTOF,OX) (LISTOF,OXOS).
#   Unknown keys return "".
proc JoyCode {{Qui ""} {Quoi ""}} {
    global JoyCode
    set Qui [string toupper $Qui]
    set Quoi [string toupper $Quoi]
    if {[info exists JoyCode($Qui,$Quoi)]} { return $JoyCode($Qui,$Quoi) }
    if {[info exists JoyCode("EstCharge")]} { return "" }
    set JoyCode("EstCharge") 1

    set RepJoy [JoyDir]
    foreach Fichier [lsort [glob "$RepJoy/DuNCBI/*.gbff"]] {
        set FICHIER [string toupper $Fichier]
        set Os ""
        set Ox ""
        foreach Ligne [LesLignesDuFichier $Fichier] {
            if { $Os=="" && [regexp {^ *ORGANISM } $Ligne]} {
                # o receives the ORGANISM keyword, G and S the two following words
                scan $Ligne "%s %s %s" o G S
                set Os "$G $S"
                set OS [string toupper $Os]
                set Gs "[string index $G 0][string index $S 0]"
                if {[info exists DejaVu($Gs)]} { FaireLire "Duplicate $Gs $Fichier $DejaVu($Gs)" }
                set DejaVu($Gs) $Fichier
                set GS [string toupper $Gs]
                foreach Cle [list $FICHIER $OS $GS] {
                    set JoyCode($Cle,FILE) $Fichier
                    set JoyCode($Cle,OS) $Os
                    set JoyCode($Cle,GS) $Gs
                }
                lappend JoyCode(LISTOF,FILE) $Fichier
                lappend JoyCode(LISTOF,OS) $Os
                lappend JoyCode(LISTOF,GS) $Gs
                continue
            }
            if { $Ox=="" && [regexp {/db_xref="taxon\:([0-9]+)"} $Ligne Match Ox]} {
                set OX $Ox
                lappend JoyCode(LISTOF,OX) $Ox
                lappend JoyCode(LISTOF,OXOS) "$Ox $Os"
                set JoyCode($OX,FILE) $Fichier
                set JoyCode($OX,OS) $Os
                set JoyCode($OX,GS) $Gs
                set JoyCode($FICHIER,OX) $Ox
                set JoyCode($OS,OX) $Ox
                set JoyCode($GS,OX) $Ox
                set JoyCode($OX,OX) $Ox
                # Nothing more is needed from this file once the taxid is known
                break
            }
        }
    }
    return [JoyCode $Qui $Quoi]
}

#rR The rest concerns YeastGenome
#rR I wrote what is needed to create a Gscope project from a set of Genbank files
#rR For yeasts, for example (the description continues on the following lines):
#rR A yeast strain (e.g. YJM1078) is made of 16 chromosomes + the mitochondrial one (which I number Chr17)
#rR See YeastGenome which manages the 92 strains + S288C (see CreateYeastGenomesFile)
#rR I create two kinds of project per strain
#rR   YJM1078Chr which contains the 17 chromosomes as Gscope boxes and also the CDS texts of the Genbank files
#rR   YJM1078CDS which contains all the CDS of all chromosomes put end to end (with 99 bases in between)

# tmou
#   Debug helper around eSummaryREST / eSearchREST for a taxid: dumps the
#   results with parray/puts and terminates the process.
proc tmou {taxid} {
    if {1} {
        array set Tx [eSummaryREST taxonomy $taxid]
        parray Tx
        set Name $Tx($taxid,ScientificName)
        puts ""
    }
    set out [eSearchREST genome txid$taxid\[Organism\]]
    set Lid [split $out " "]
    puts [llength $Lid]
    array set T [eSummaryREST genome $Lid]
    parray T
    puts ""
    set Lgood [list]
    foreach k [array names T "*,TaxId"] {
        if {$T($k) == $taxid} { lappend Lgood [lindex [split $k ,] 0] }
    }
    puts "Lgood [llength $Lgood]"
    exit
}

# reste
#   Debug helper: genome eSearch + eSummary for TaxId, dumped, then exit.
proc reste TaxId {
    set L [eSearchREST genome "txid$TaxId\[Organism\]"]
    Espionne $L
    array set T [eSummaryREST genome $L]
    parray T
    exit
}

# YeastFile
#   Path of file Cds in sub-directory SubDir of the project owning Cds
#   (project name = Cds without its trailing number), or "" if it does not exist.
proc YeastFile {Cds {SubDir ""}} {
    regsub {CDS[0-9]+$} $Cds "CDS" Scds
    set Fichier "[GscopeDatabaseDir $Scds]/$SubDir/$Cds"
    if {[file exists $Fichier]} { return $Fichier }
    return ""
}

# AdnDesCopainsAlignes
#   Work in progress: for each PAB without a tfasdescopainsnuc file, displays
#   (one friend out of ten) the protein residues next to their codons.
#   The code after the "continue" that follows the display is currently
#   unreachable, so no file is written and the returned list stays empty.
proc AdnDesCopainsAlignes {} {
    set RepTfasDesCopains "[RepertoireDuGenome]/tfasdescopainsnuc"
    file mkdir $RepTfasDesCopains
    set LesFichiersCrees {}
    foreach Nom [ListeDesPABs] {
        set FicCopains "$RepTfasDesCopains/$Nom"
        if {[file exists $FicCopains]} { continue }
        set LesTFAs {}
        set I -1
        set LeTout {}
        foreach Copain [YeastFriends $Nom] {
            incr I
            if {$I%10!=0} { continue }
            set FicNuc [YeastFile $Copain nuctfa]
            # The old message printed the (empty) file name; name the copain instead
            if {$FicNuc==""} { Warne "no nuctfa file for $Copain"; continue }
            set TFA [ContenuDuFichier $FicNuc]
            set FicPro [YeastFile $Copain prottfa]
            if {$FicPro==""} { Warne "no prottfa file for $Copain"; continue }
            set PRO [ContenuDuFichier $FicPro]
            set SeqPro [QueLaSequenceDuTexteTFA $PRO]
            set SeqPro [string range $SeqPro 20 end]
            set Seq [QueLaSequenceDuTexteTFA $TFA]
            set Seq [string range $Seq 60 end]
            set L [string length $Seq]
            set LesCodons {}
            # One residue R goes with three bases a b c
            foreach R [split $SeqPro ""] {a b c} [split $Seq ""] {
                lappend LesCodons "$R-$a$b$c"
            }
            set Codons [format "%-16s %5d %s" $Copain $L [join $LesCodons " "]]
            lappend LeTout [string range $Codons 0 220]
        }
        # NOTE(review): the statement boundaries here were lost in this copy of
        # the file; read as "Espionne" (blank line) then "EspionneL $LeTout" - confirm.
        Espionne
        EspionneL $LeTout
        continue
        if {$LesTFAs=={}} { Warne "$Nom no TFA"; continue }
        Espionne $Nom
        lappend LesFichiersCrees [SauveLesLignes $LesTFAs dans $FicCopains]
    }
    return $LesFichiersCrees
}

# YeastTfasDesCopains
#   For each PAB without a tfasdescopains file, concatenates the protein TFA
#   of all its YeastFriends into tfasdescopains/<Nom>.
#   Returns the list of values returned by SauveLesLignes (one per file written).
proc YeastTfasDesCopains {} {
    set RepTfasDesCopains "[RepertoireDuGenome]/tfasdescopains"
    file mkdir $RepTfasDesCopains
    # The result list used to be initialised and returned under a misspelt
    # name (LesFichierCrees), so the proc always returned an empty list.
    set LesFichiersCrees {}
    foreach Nom [ListeDesPABs] {
        set FicCopains "$RepTfasDesCopains/$Nom"
        if {[file exists $FicCopains]} { continue }
        set LesTFAs {}
        foreach Copain [YeastFriends $Nom] {
            set FicProt [YeastFile $Copain prottfa]
            if {$FicProt==""} { Warne "no prottfa file for $Copain"; continue }
            lappend LesTFAs [ContenuDuFichier $FicProt]
        }
        if {$LesTFAs=={}} { Warne "$Nom no TFA"; continue }
        Espionne $Nom
        lappend LesFichiersCrees [SauveLesLignes $LesTFAs dans $FicCopains]
    }
    return $LesFichiersCrees
}

# YeastFriends
#   Memoised map  S288C reference CDS -> list of CDS (all strains) sharing it.
#   Quoi defaults to "ListOfFriends". For a non-S288C name the answer is that
#   of its reference (YeastReference). The map is stored in, and reloaded from,
#   fiches/YeastFriends.txt of the S288CCDS project.
proc YeastFriends {{Qui ""} {Quoi ""}} {
    global YeastFriends
    set AvecPrecalcul 0
    if {$Quoi==""} { set Quoi "ListOfFriends" }
    if {[info exists YeastFriends($Qui,$Quoi)]} { return $YeastFriends($Qui,$Quoi) }
    if {[info exists YeastFriends("EstCharge")]} {
        if { ! [regexp "^S288C" $Qui] && [set Ref [YeastReference $Qui]]!=""} { return [YeastFriends $Ref $Quoi] }
        return ""
    }
    set YeastFriends("EstCharge") 1

    set FichierMemo "[GscopeDatabaseDir "S288CCDS"]/fiches/YeastFriends.txt"
    if {[file exists $FichierMemo]} {
        array set YeastFriends [ContenuDuFichier $FichierMemo]
        return [YeastFriends $Qui $Quoi]
    }
    foreach Scds [lrange [YeastGenome ListOf Scds] 0 end] {
        foreach Nom [YeastListOfCds $Scds] {
            set Ref [YeastReference $Nom]
            lappend YeastFriends($Ref,ListOfFriends) $Nom
            lappend YeastFriends(ListOf,Ref) $Ref
        }
    }
    set YeastFriends(ListOf,Ref) [lsort -unique $YeastFriends(ListOf,Ref)]
    if {$AvecPrecalcul} {
        # Optional: give every friend direct access to the list of its reference
        foreach Ref $YeastFriends(ListOf,Ref) {
            foreach Nom $YeastFriends($Ref,ListOfFriends) {
                if {$Nom==$Ref} { continue }
                set YeastFriends($Nom,ListOfFriends) $YeastFriends($Ref,ListOfFriends)
            }
        }
    }
    Sauve [array get YeastFriends] dans $FichierMemo
    return [YeastFriends $Qui $Quoi]
}

# YeastListOfCds
#   First word of every line of fiches/bornesdespabs of project Scds.
proc YeastListOfCds Scds {
    set FicBornes "[GscopeDatabaseDir $Scds]/fiches/bornesdespabs"
    set LesNoms {}
    foreach Ligne [LesLignesDuFichier $FicBornes] {
        scan $Ligne "%s" Nom
        lappend LesNoms $Nom
    }
    return $LesNoms
}

# NomDuLocusTag
#   Memoised map  locus_tag -> Gscope name, read from the infos/ files of the
#   S288C CDS project. Also provides the keys ListOf and ListOfNom.
#   Unknown keys return "".
proc NomDuLocusTag {{Qui ""}} {
    global NomDuLocusTag
    if {[info exists NomDuLocusTag($Qui)]} { return $NomDuLocusTag($Qui) }
    if {[info exists NomDuLocusTag("EstCharge")]} { return "" }
    set NomDuLocusTag("EstCharge") 1

    set RepInfos "[GscopeDatabaseDir [YeastGenome S288C CDS]]/infos"
    foreach FicheInfos [lsort [glob -nocomplain "$RepInfos/*"]] {
        set Nom [file tail $FicheInfos]
        set Infos [ContenuDuFichier $FicheInfos]
        set LT [StringApres "locus_tag: " dans $Infos]
        if {$LT==""} { continue }
        lappend NomDuLocusTag(ListOf) $LT
        lappend NomDuLocusTag(ListOfNom) $Nom
        if {[info exists NomDuLocusTag($LT)]} { FaireLire "Doublon" }
        set NomDuLocusTag($LT) $Nom
    }
    return [NomDuLocusTag $Qui]
}

# YeastReference
#   S288C CDS name corresponding to Nom: Nom itself if it is an S288CCDS name,
#   otherwise the name of the locus tag found after "corresponds to s288c " in
#   the infos file of Nom. Returns "" when there is no infos file or no such note.
proc YeastReference {{Nom ""}} {
    if {[regexp {^S288CCDS} $Nom]} { return $Nom }
    regsub {[0-9]+$} $Nom "" Scds
    set Rep [GscopeDatabaseDir $Scds]
    set FicheInfos "$Rep/infos/$Nom"
    if { ! [file exists $FicheInfos]} { return "" }
    set Infos [ContenuDuFichier $FicheInfos]
    if {[regexp {corresponds to s288c ([^\n]+)\n} $Infos Match LocusTag]} {
        return [NomDuLocusTag $LocusTag]
    }
    return ""
}

# YeastGenomesInventory
#   One "Strain NbCds NbTrna NbRna" message per strain, counted as the number
#   of lines of the three bornes files of its CDS project.
proc YeastGenomesInventory {} {
    # Initialised so that an empty strain list yields an empty result
    set LesMessages {}
    foreach Strain [YeastGenome ListOf Strain] {
        set Scds [YeastGenome $Strain "CDS"]
        set Rep [GscopeDatabaseDir $Scds]
        set NbCds  [llength [LesLignesDuFichier "$Rep/fiches/bornesdespabs"]]
        set NbTrna [llength [LesLignesDuFichier "$Rep/fiches/bornesdestrnas"]]
        set NbRna  [llength [LesLignesDuFichier "$Rep/fiches/bornesdesarns"]]
        set Message "$Strain $NbCds $NbTrna $NbRna"
        lappend LesMessages $Message
        #Espionne $Message
    }
    return $LesMessages
}

# CreateYeastGscopeProjectsPourTous
#   Same as CreateJoyGscopeProjectsPourTous for the YeastGenome strains.
#   Returns the list of results of CreateYeastGscopeProjects (empty if none was run).
proc CreateYeastGscopeProjectsPourTous {{Ask ""}} {
    set Ask [string equal -nocase $Ask "Ask"]
    # Must exist even when every strain is skipped, otherwise the final return fails
    set LesFaits {}
    foreach Strain [YeastGenome ListOf Strain] {
        set FichierAdn "/genomics/link/YeastGenomes/${Strain}CDS/beton/adn"
        if {[file exists $FichierAdn] && (! $Ask || [OuiOuNon "$FichierAdn already exists. Do I keep it" 0])} { continue }
        set Fait [CreateYeastGscopeProjects $Strain]
        lappend LesFaits $Fait
    }
    return $LesFaits
}

# CreateYeastGscopeProjects
#   Creates the ${Strain}Chr then the ${Strain}CDS project; returns "${Strain}CDS".
proc CreateYeastGscopeProjects Strain {
    CreateYeastChrGscopeProject $Strain
    CreateYeastCDSGscopeProject $Strain
    Espionne "${Strain}Chr et ${Strain}CDS ont été créés. Pour les voir :"
    Espionne "setgscoperr ${Strain}Chr ; gscope"
    Espionne "setgscoperr ${Strain}CDS ; gscope"
    return "${Strain}CDS"
}

# CreateYeastCDSGscopeProject
#   Builds the ${Strain}CDS project from the cds/ and rna/ files of the
#   ${Strain}Chr project: one nuctfa/prottfa/infos file per CDS, one
#   nuctfa/infos file per RNA, the concatenated chromosomes in beton/, the
#   three bornes files in fiches/ and beton/miniconfig.
#   Joy == "Joy" places the project under [JoyDir]; otherwise it goes under
#   /gstock/YeastGenomes like the Chr project (RG and Link used to be left
#   undefined in that case).
#   Returns "dernier cree : <last name created>".
proc CreateYeastCDSGscopeProject {Strain {Joy ""}} {
    #rR I rewrote it as CreateCDSGscopeProjectFromGenbank to be more general
    set StrainCDS "${Strain}CDS"
    if {$Joy=="Joy"} {
        set RG "[JoyDir]/$StrainCDS"
    } else {
        set RG "/gstock/YeastGenomes/$StrainCDS"
    }
    if { ! [file exists $RG]} { file mkdir $RG }
    set Link "/genomics/link/$StrainCDS"
    if { ! [file exists $Link]} { exec ln -sf $RG /genomics/link/$StrainCDS }

    set Prefixe $StrainCDS
    set FormatOrfNumbering "%4.4d"
    NousAllonsAuBoulot $RG
    file mkdir beton
    file mkdir fiches
    file mkdir infos
    file mkdir nuctfa
    file mkdir nucembl
    file mkdir prottfa
    file mkdir protembl

    set RepCDS "/genomics/link/${Strain}Chr/cds"
    set RepRNA "/genomics/link/${Strain}Chr/rna"

    set N 0
    # SeqTotale starts with 100 n; its first character is dropped when saved,
    # so that string indices are 1-based positions in the saved sequence.
    set SeqTotale "n"
    append SeqTotale [string repeat "n" 99]
    set Offset [string length $SeqTotale]
    set LesBornesDesPABs {}
    foreach FichierCDS [lsort [glob "$RepCDS/${Strain}Chr*"]] {
        set LesChrBornes {}
        set Chr "Chr00"
        if { ! [regexp {Chr[0-9]+$} $FichierCDS Chr]} { FaireLire "no Chr in $FichierCDS" }
        set ChrDir [file dirname [file dirname $FichierCDS]]
        set ChrNuc [QueLaSequenceDuFichierTFA "$ChrDir/nuctfa/${Strain}$Chr"]
        set ChrLen [string length $ChrNuc]
        set LesCDSs [ContenuDuFichier $FichierCDS]
        # First pass: store every /key=value of every CDS in Infos
        foreach CDS $LesCDSs {
            set Bornes ""
            set ItsOk 1
            foreach Ligne [split $CDS "\n"] {
                if { ! [regexp {^/([^=]+)=(.*)$} $Ligne Match K V]} { Warne "Wrong $Ligne" ; set ItsOk 0; break }
                regsub -all {\"} $V "" V
                if {$K=="location"} {
                    regsub -all {[<>]} $V "" V
                    set Bornes $V
                }
                lappend Infos($Chr,$Bornes,ListOfK) $K
                set Infos($Chr,$Bornes,$K) $V
            }
            if {$ItsOk} { lappend LesChrBornes "$Chr $Bornes" }
        }
        set Offset [string length $SeqTotale]
        Espionne "$Chr $ChrLen [llength $LesCDSs] CDS Offset $Offset"
        # Second pass: extract sequences and write the files
        foreach ChrBornes $LesChrBornes {
            lassign [split $ChrBornes " "] Chr Bornes
            if { ! [regexp {([0-9]+)\.\.([0-9]+)} $Bornes Match Deb Fin ]} { Warne "Wrong location $Bornes in $ChrBornes"; continue }
            incr N
            set Nom [format "%s$FormatOrfNumbering" $Prefixe $N]
            set Sens "F"
            if {[regexp "complement" $Bornes]} { set Sens "R" }
            # Keep digits, commas and dots only: "d..f,d..f,..."
            regsub -all {[^0-9\,\.]} $Bornes "" Bo
            set LesDF {}
            foreach DF [split $Bo ","] {
                set D ""
                set F ""
                if {[regexp {^([0-9]+)\.\.([0-9]+)$} $DF Match D F]} {}
                if {[regexp {^([0-9]+)$} $DF D] } {set F $D}
                if {$D=="" || $F==""} { Espionne "$Bo" ; exit }
                lappend LesDF $D $F
            }
            set NucCds ""
            foreach {D F} $LesDF {
                # "n$ChrNuc" makes the 1-based genbank positions usable as indices
                if {[catch {append NucCds [string range "n$ChrNuc" $D $F]} Message]} {
                    Espionne $Message
                    Espionne "=$ChrBornes=$D=$F="
                    exit
                }
            }
            if {$Sens=="R"} { set NucCds [NucToReverseAndComplementNuc $NucCds] }
            set Deb [lindex $LesDF 0]
            set Fin [lindex $LesDF end]
            incr Deb $Offset ; incr Deb -1
            incr Fin $Offset ; incr Fin -1
            lappend LesBornesDesPABs "$Nom $Deb $Fin $Sens"

            set LaFicheInfo {}
            lappend LaFicheInfo "Nom: $Nom"
            lappend LaFicheInfo "Chr: $Chr"
            lappend LaFicheInfo "Bornes: $Bornes"
            set Gn ""
            set ProCds ""
            foreach K $Infos($Chr,$Bornes,ListOfK) {
                set V $Infos($Chr,$Bornes,$K)
                if {$K=="translation"} {
                    set ProCds $V
                    continue
                }
                if {$K=="gene"} { set Gn $V }
                if {$K=="product" && $Gn==""} { set Gn $V }
                lappend LaFicheInfo "$K: $V"
            }
            if {$Gn!=""} { lappend LaFicheInfo "ValiGN: $Gn" }
            SauveLesLignes $LaFicheInfo dans "infos/$Nom"

            set Entete ">$Nom $Gn $Bornes"
            set NucTfa [SequenceFormatTFA $NucCds $Entete "nucbrut"]
            set ProTfa [SequenceFormatTFA $ProCds $Entete "protbrut"]
            Sauve $NucTfa dans "nuctfa/$Nom"
            Sauve $ProTfa dans "prottfa/$Nom"
        }
        # Append the chromosome, then pad with n up to the computed next start
        append SeqTotale $ChrNuc
        set LongTotale [string length $SeqTotale]
        set ProchainStart [expr {(($LongTotale + 999)/3)*3}]
        set Tampon [string repeat "n" [expr {$ProchainStart-$LongTotale}]]
        append SeqTotale $Tampon
    }

    #rR WARNING here we process the tRNA, rRNA and ncRNA
    # NOTE(review): SeqTotaleRNA starts with 1 n whereas SeqTotale starts with
    # 100 n, so the RNA offsets look shifted with respect to the saved
    # sequence - confirm whether this is intended.
    set SeqTotaleRNA "n"
    set LesBornesDesTRNAs {}
    set LesBornesDesARNs {}
    foreach FichierRNA [lsort [glob "$RepRNA/${Strain}Chr*"]] {
        set LesChrBornes {}
        set Chr "Chr00"
        if { ! [regexp {Chr[0-9]+$} $FichierRNA Chr]} { FaireLire "no Chr in $FichierRNA" }
        # Was computed from FichierCDS (left over from the CDS loop)
        set ChrDir [file dirname [file dirname $FichierRNA]]
        set ChrNuc [QueLaSequenceDuFichierTFA "$ChrDir/nuctfa/${Strain}$Chr"]
        set LesRNAs [ContenuDuFichier $FichierRNA]
        foreach RNA $LesRNAs {
            set Bornes ""
            set ItsOk 1
            foreach Ligne [split $RNA "\n"] {
                if { ! [regexp {^/([^=]+)=(.*)$} $Ligne Match K V]} { Warne "Wrong $Ligne" ; set ItsOk 0; break }
                Espionne "$K///$V"
                regsub -all {\"} $V "" V
                # /tRNA_location= etc. carry both the type and the location
                if {[regexp {^([^_]+)_location$} $K Match T]} {
                    #set Type [string toupper $T]
                    set Type $T
                    set K "location"
                }
                if {$K=="location"} {
                    regsub -all {[<>]} $V "" V
                    set Bornes $V
                    set Infos($Chr,$Bornes,Type) $Type
                }
                lappend Infos($Chr,$Bornes,ListOfK) $K
                set Infos($Chr,$Bornes,$K) $V
            }
            if {$ItsOk} { lappend LesChrBornes "$Chr $Bornes" }
        }
        set Offset [string length $SeqTotaleRNA]
        foreach ChrBornes $LesChrBornes {
            lassign [split $ChrBornes " "] Chr Bornes
            if { ! [regexp {([0-9]+)\.\.([0-9]+)} $Bornes Match Deb Fin ]} { Warne "Wrong Bornes $Bornes in $ChrBornes" ; continue }
            set Type $Infos($Chr,$Bornes,Type)
            Espionne "$Chr $Bornes $Type"
            set Sens "F"
            if {[regexp "complement" $Bornes]} { set Sens "R" }
            regsub -all {[^0-9\,\.]} $Bornes "" Bo
            set LesDF {}
            foreach DF [split $Bo ","] {
                set D ""
                set F ""
                if {[regexp {^([0-9]+)\.\.([0-9]+)$} $DF Match D F]} {}
                if {[regexp {^([0-9]+)$} $DF D] } {set F $D}
                if {$D=="" || $F==""} { Espionne "$Bo" ; exit }
                lappend LesDF $D $F
            }
            set NucRna ""
            foreach {D F} $LesDF {
                if {[catch {append NucRna [string range "n$ChrNuc" $D $F]} Message]} {
                    Espionne $Message
                    Espionne "=$ChrBornes=$D=$F="
                    exit
                }
            }
            if {$Sens=="R"} { set NucRna [NucToReverseAndComplementNuc $NucRna] }
            set Deb [lindex $LesDF 0]
            set Fin [lindex $LesDF end]
            incr Deb $Offset ; incr Deb -1
            incr Fin $Offset ; incr Fin -1

            set Gn ""
            set AA ""
            set AntiCodon ""
            # The "K: V" lines are collected here and written after the header;
            # they used to be appended before LaFicheInfo was reset and were lost.
            set LesKV {}
            foreach K $Infos($Chr,$Bornes,ListOfK) {
                set V $Infos($Chr,$Bornes,$K)
                if {$K=="product"} { set Gn [lindex [split $V " "] 0]}
                if {[string equal -nocase $Type "tRNA"]} {
                    if {$K=="note" && [regexp {corresponds to s288c [^\(]+\(([^\)]+)\)} $V Match AntiCodon]} {
                        regsub -all "U" $AntiCodon "T" AntiCodon
                        Espionne $AntiCodon
                    }
                }
                lappend LesKV "$K: $V"
            }
            if {[string equal -nocase $Type "tRNA"]} {
                # tRNA<AA><AntiCodon>-<rank among those with the same name>
                set AA $Gn
                regsub -nocase {tRNA\-} $AA "" AA
                set Nom "tRNA$AA$AntiCodon"
                if { ! [info exists NiemeDuNom($Nom)]} { set NiemeDuNom($Nom) 0 }
                incr NiemeDuNom($Nom)
                set Nom $Nom-$NiemeDuNom($Nom)
            } else {
                set Nom "$Type-$Gn"
            }
            if {[string equal -nocase $Type "tRNA"]} {
                lappend LesBornesDesTRNAs "$Nom $Deb $Fin $Sens"
                Espionne "$Nom $Deb $Fin $Sens"
            } else {
                lappend LesBornesDesARNs "$Nom $Deb $Fin $Sens"
            }

            set LaFicheInfo {}
            lappend LaFicheInfo "Nom: $Nom"
            lappend LaFicheInfo "Chr: $Chr"
            lappend LaFicheInfo "Bornes: $Bornes"
            lappend LaFicheInfo {*}$LesKV
            if {$Gn!=""} { lappend LaFicheInfo "ValiGN: $Gn" }
            SauveLesLignes $LaFicheInfo dans "infos/$Nom"

            set Entete ">$Nom $Gn $Bornes"
            set NucTfa [SequenceFormatTFA $NucRna $Entete "nucbrut"]
            Sauve $NucTfa dans "nuctfa/$Nom"
        }
        append SeqTotaleRNA $ChrNuc
        set LongTotale [string length $SeqTotaleRNA]
        set ProchainStart [expr {(($LongTotale + 999)/3)*3}]
        set Tampon [string repeat "n" [expr {$ProchainStart-$LongTotale}]]
        append SeqTotaleRNA $Tampon
    }

    set FichierSequenceADN "beton/${Strain}_All_Chromosomes.tfa"
    set TFA [SequenceFormatTFA [string range $SeqTotale 1 end] ">${Strain}_All_Chromosomes with around 99 n between chromosomes"]
    Sauve $TFA dans $FichierSequenceADN

    set MiniChr [ContenuDuFichier "/genomics/link/${Strain}Chr/beton/miniconfig"]
    set OS [StringSuivant "NotreOS " dans $MiniChr]
    set OC [StringSuivant "NotreOC " dans $MiniChr]
    set OX [StringSuivant "NotreOX " dans $MiniChr]

    set LeMini {}
    lappend LeMini "#MiniConfig"
    # A "/" was missing between the link directory and the relative file name
    lappend LeMini "FichierSequenceADN $Link/$FichierSequenceADN"
    lappend LeMini "VersionDeGscopeAuBigBang [VersionDeGscope]"
    lappend LeMini "PrefixeDesORFs $Prefixe"
    lappend LeMini "LongueurMiniDesORFs 0"
    lappend LeMini "NotreOS $OS"
    lappend LeMini "NotreOC $OC"
    lappend LeMini "NotreOX $OX"
    lappend LeMini "GenomeOuCollection Genome"
    set Mini [join $LeMini "\n"]
    Sauve $Mini dans "beton/miniconfig"
    SauveLesLignes $LesBornesDesPABs dans "fiches/bornesdespabs"
    SauveLesLignes $LesBornesDesTRNAs dans "fiches/bornesdestrnas"
    SauveLesLignes $LesBornesDesARNs dans "fiches/bornesdesarns"
    Espionne "CreeADNetTDNetRAC"
    Espionne [CreeADNetTDNetRAC [string range $SeqTotale 1 end] "" "."]
    OnRevientDuBoulot
    return "dernier cree : $Nom"
}

# NiceCDSFromCDSLines
#   Turns the genbank lines of one CDS feature into "/key=value" lines joined
#   by newlines: the CDS line becomes /location=..., continuation lines are
#   merged into their qualifier (without separator for location and
#   translation, with a blank otherwise) and square brackets become parentheses.
proc NiceCDSFromCDSLines CDSLines {
    set LesInfos {}
    set LesLignesDeLInfoCourante {}
    # Sentinel that flushes the last qualifier
    lappend CDSLines "/LASTLINE="
    foreach Ligne $CDSLines {
        regsub {^ CDS } $Ligne "/location=" Ligne
        regsub {^ } $Ligne "" Ligne
        if {[regexp {^/([^=]+)=} $Ligne]} {
            if {$LesLignesDeLInfoCourante!=""} {
                set JoinCar " "
                regexp {^/([^=]+)=} [lindex $LesLignesDeLInfoCourante 0] Match Key
                if {$Key=="location" || $Key=="translation"} { set JoinCar "" }
                set Info [join $LesLignesDeLInfoCourante $JoinCar]
                regsub -all {\[} $Info "(" Info ;#rR too dangerous to keep them
                regsub -all {\]} $Info ")" Info ;#rR too dangerous to keep them
                lappend LesInfos $Info
                set LesLignesDeLInfoCourante {}
            }
            if {$Ligne=="/LASTLINE="} { break }
        }
        lappend LesLignesDeLInfoCourante $Ligne
    }
    return [join $LesInfos "\n"]
}

# NiceRNAFromRNALines
#   Same as NiceCDSFromCDSLines for an RNA feature: the feature line becomes
#   /<type>RNA_location=... (type among m, r, t, nc); only the location is
#   joined without separator.
proc NiceRNAFromRNALines RNALines {
    set LesInfos {}
    set LesLignesDeLInfoCourante {}
    lappend RNALines "/LASTLINE="
    foreach Ligne $RNALines {
        # Type is set by the feature line and reused for the following lines
        regexp {^ ((m|r|t|nc)RNA) +} $Ligne Match Type
        regsub {^ ((m|r|t|nc)RNA) +} $Ligne "/${Type}_location=" Ligne
        regsub {^ +} $Ligne "" Ligne
        if {[regexp {^/([^=]+)=} $Ligne]} {
            if {$LesLignesDeLInfoCourante!=""} {
                set JoinCar " "
                regexp {^/([^=]+)=} [lindex $LesLignesDeLInfoCourante 0] Match Key
                if {[regexp "RNA_location" $Key]} { set JoinCar "" }
                set Info [join $LesLignesDeLInfoCourante $JoinCar]
                regsub -all {\[} $Info "(" Info ;#rR too dangerous to keep them
                regsub -all {\]} $Info ")" Info ;#rR too dangerous to keep them
                lappend LesInfos $Info
                set LesLignesDeLInfoCourante {}
            }
            if {$Ligne=="/LASTLINE="} { break }
        }
        lappend LesLignesDeLInfoCourante $Ligne
    }
    return [join $LesInfos "\n"]
}

# CreateYeastChrGscopeProject
#   Builds the ${Strain}Chr project under /gstock/YeastGenomes. The genbank
#   text of each chromosome is read from genbankfiles/ or, if absent, fetched
#   with GenbankNucleotide and stored there. For each chromosome it writes
#   cds/, rna/, nuctfa/ and infos/ files, then fiches/bornesdespabs (via
#   CreeBornesDesPABsTroisGradins) and beton/miniconfig.
#   Returns the text of the miniconfig.
#   NOTE(review): blank runs inside the regexp patterns below appear as single
#   spaces in this copy of the file; GenBank feature keys are indented by
#   several columns - confirm the intended widths against the real file.
proc CreateYeastChrGscopeProject Strain {
    set StrainChr "${Strain}Chr"
    set RG "/gstock/YeastGenomes/$StrainChr"
    if { ! [file exists $RG]} { file mkdir $RG }
    set Link "/genomics/link/$StrainChr"
    if { ! [file exists $Link]} { exec ln -sf $RG /genomics/link/$StrainChr }

    set Prefixe $StrainChr
    NousAllonsAuBoulot $RG
    file mkdir beton
    file mkdir fiches
    file mkdir infos
    file mkdir genbankfiles
    file mkdir nuctfa
    file mkdir nucembl
    file mkdir cds
    file mkdir rna

    set N 0
    foreach Chr [YeastGenome ListOf Chr] {
        incr N
        set Nom $Strain$Chr
        set FicGb "genbankfiles/$Nom"
        set Id [YeastGenome $Strain $Chr]
        if {[file exists $FicGb]} {
            set Gb [ContenuDuFichier $FicGb]
        } else {
            set Gb [GenbankNucleotide $Id gb "GetText"]
            Sauve $Gb dans $FicGb
        }
        Espionne "$Nom $FicGb"
        set FicTfa nuctfa/$Nom
        set FicInfos infos/$Nom

        set Locus "LocusUnknown"
        set Definition "DefinitionUnknown"
        set OS "OSUnknown"
        set OX "OXUnknown"
        set OnAttendOrigin 1
        set Sequence ""
        set OnEstDansCDS 0
        set OnEstDansRNA 0
        set LesCDSs {}
        set LesRNAs {}
        set CEstFini 0
        foreach Ligne [split $Gb "\n"] {
            if {[regexp "^//" $Ligne]} { break }
            regexp {^LOCUS +([^ ]+)( |$)} $Ligne Match Locus
            regexp {^DEFINITION +(.+)$} $Ligne Match Definition
            regexp {/organism=\"([^\"]+)\"} $Ligne Match OS
            regexp {taxon\:([^\"]+)\"} $Ligne Match OX
            # Start of a CDS feature
            if { ! $OnEstDansCDS && [regexp {^ CDS } $Ligne]} {
                set OnEstDansCDS 1
                set LeCDS {}
                lappend LeCDS $Ligne
                continue
            }
            # Start of an rRNA / tRNA / ncRNA feature
            if { ! $OnEstDansRNA && [regexp {^ (r|t|nc)RNA } $Ligne]} {
                set OnEstDansRNA 1
                set LeRNA {}
                lappend LeRNA $Ligne
                continue
            }
            if {$OnEstDansCDS} {
                if {$OnAttendOrigin && [regexp "^ORIGIN" $Ligne]} { set OnAttendOrigin 0 ; set CEstFini 1 }
                # The feature ends at ORIGIN or at the next feature key
                if {$CEstFini || [regexp -nocase {^ [a-z]} $Ligne]} {
                    set CDS [NiceCDSFromCDSLines $LeCDS]
                    #Espionne $CDS
                    lappend LesCDSs $CDS
                    set OnEstDansCDS 0
                    continue
                }
                lappend LeCDS $Ligne
                continue
            }
            if {$OnEstDansRNA} {
                if {$OnAttendOrigin && [regexp "^ORIGIN" $Ligne]} { set OnAttendOrigin 0 ; set CEstFini 1 }
                if {$CEstFini || [regexp -nocase {^ [a-z]} $Ligne]} {
                    set RNA [NiceRNAFromRNALines $LeRNA]
                    #Espionne $RNA
                    lappend LesRNAs $RNA
                    set OnEstDansRNA 0
                    continue
                }
                lappend LeRNA $Ligne
                continue
            }
            if {$OnAttendOrigin && [regexp "^ORIGIN" $Ligne]} { set OnAttendOrigin 0 ; continue }
            if {$OnAttendOrigin} { continue }
            # After ORIGIN: keep only the letters of the sequence lines
            set Seq $Ligne
            regsub -all -nocase {[^a-z]} $Seq "" Seq
            append Sequence $Seq
        }
        #FaireLire [string length $Sequence]
        Sauve $LesCDSs dans cds/$Nom
        Sauve $LesRNAs dans rna/$Nom
        set Entete ">$Nom $Locus $OS DE=$Definition OX=$OX"
        set Tfa [SequenceFormatTFA $Sequence $Entete "nucbrut"]
        Sauve $Tfa dans $FicTfa
        set LesInfos {}
        set OC [TaxClass $OX Name]
        regsub {cellular organism; } $OC "" OC
        lappend LesInfos "Nom: $Nom"
        lappend LesInfos "Locus: $Locus"
        lappend LesInfos "Definition: $Definition"
        lappend LesInfos "OS: $OS"
        lappend LesInfos "OX: $OX"
        lappend LesInfos "OC: $OC"
        SauveLesLignes $LesInfos dans $FicInfos
        #EspionneL $LesInfos
    }
    #EspionneL $LesCDSs
    set FicBornes "fiches/bornesdespabs"
    CreeBornesDesPABsTroisGradins 1 $N $Prefixe 901 "" "%2.2d" ./

    # OS, OC and OX are those of the last chromosome processed
    set LeMini {}
    lappend LeMini "#MiniConfig"
    lappend LeMini "FichierSequenceADN"
    lappend LeMini "VersionDeGscopeAuBigBang [VersionDeGscope]"
    lappend LeMini "PrefixeDesORFs $Prefixe"
    lappend LeMini "LongueurMiniDesORFs 300"
    lappend LeMini "NotreOS $OS"
    lappend LeMini "NotreOC $OC"
    lappend LeMini "NotreOX $OX"
    lappend LeMini "GenomeOuCollection Collection"
    set Mini [join $LeMini "\n"]
    Sauve $Mini dans "beton/miniconfig"
    OnRevientDuBoulot
    return $Mini
}

# CreateYeastGenomesFile
#   Rebuilds the YeastGenomes csv: title line of YJMstrains.csv, then the
#   S288C line, then the remaining lines of YJMstrains.csv.
#   Returns the name of the csv file.
proc CreateYeastGenomesFile {} {
    #rR Julie had given me the YJM strains
    #rR I put S288C at the beginning
    #rR ... others can be added.
    set FichierCsv [YeastGenome Filename]
    set FichierCsvYJMstrains "[file dirname $FichierCsv]/YJMstrains.csv"
    set LesLignesYJMstrains [LesLignesDuFichier $FichierCsvYJMstrains]
    set LesLignes {}
    lappend LesLignes [lindex $LesLignesYJMstrains 0] ; #rR for the title
    lappend LesLignes [CsvLineForS288C]
    LConcat LesLignes [lrange $LesLignesYJMstrains 1 end]
    SauveLesLignes $LesLignes dans $FichierCsv
    return $FichierCsv
}

# YeastGenome
#   Memoised access to the YeastGenomes csv (loaded with LoadTxl).
#   "YeastGenome <Strain> Chr|CDS" returns the project name without loading.
#   "YeastGenome Filename" returns the csv file name.
#   After loading: (ListOf,Strain) (ListOf,Chr) (ListOf,Id) (ListOf,Scds)
#   (ListOf,Schr), (<Strain>,<Chr>) -> Id with Chr17 = the Mito column, and
#   (<Id>,Strain) (<Id>,Chr) (<Id>,StrainChr). Unknown keys return "".
proc YeastGenome {{Qui ""} {Quoi ""}} {
    global YeastGenome
    #rR To update the file see CreateYeastGenomesFile
    set FichierCsv "[GscopeDatabaseDir YeastGenomes]/DuNCBI/YeastGenomes.csv"
    set YeastGenome(Filename,) $FichierCsv
    if {[string equal -nocase $Quoi Chr]} { return "${Qui}Chr" }
    if {[string equal -nocase $Quoi CDS]} { return "${Qui}CDS" }
    if {[info exists YeastGenome($Qui,$Quoi)]} { return $YeastGenome($Qui,$Quoi) }
    if {[info exists YeastGenome("EstCharge")]} { return "" }
    set YeastGenome("EstCharge") 1

    LoadTxl $FichierCsv YeastGenome 0 ";"
    #parray YeastGenome
    set YeastGenome(ListOf,Strain) $YeastGenome(ListOf,Index)
    set Prems [lindex $YeastGenome(ListOf,Strain) 0]
    foreach Strain $YeastGenome(ListOf,Strain) {
        set YeastGenome($Strain,Chr17) $YeastGenome($Strain,Mito)
    }
    set YeastGenome(ListOf,Id) {}
    set YeastGenome(ListOf,Chr) {}
    # The chromosome names are taken from the columns of the first strain
    set L [lsort [array names YeastGenome "$Prems,Chr*"]]
    foreach SC $L {
        regsub "$Prems," $SC "" Chr
        lappend YeastGenome(ListOf,Chr) $Chr
    }
    foreach Strain $YeastGenome(ListOf,Strain) {
        lappend YeastGenome(ListOf,Scds) "${Strain}CDS"
        lappend YeastGenome(ListOf,Schr) "${Strain}Chr"
        foreach Chr $YeastGenome(ListOf,Chr) {
            set Id $YeastGenome($Strain,$Chr)
            lappend YeastGenome(ListOf,Id) $Id
            set YeastGenome($Id,Strain) $Strain
            set YeastGenome($Id,Chr) $Chr
            set YeastGenome($Id,StrainChr) "$Strain-$Chr"
        }
    }
    return [YeastGenome $Qui $Quoi]
}
# GetGenbankForS288Cne_sert_plus
#   (No longer used.) Fetches, into the genbankfiles directory of S288CChr,
#   the genbank record of each S288C chromosome (BK/KP accession) that is not
#   there yet, as S288CChr01 ... S288CChr17. Returns the list of file names.
proc GetGenbankForS288Cne_sert_plus {} {
    NousAllonsAuBoulot "/genomics/link/YeastGenomes/S288CChr/genbankfiles"
    # Pairs  <kind> <roman number>:<NC accession>/<BK accession>
    set Chromosomes {
        chromosome I:NC_001133.9/BK006935.2
        chromosome II:NC_001134.8/BK006936.2
        chromosome III:NC_001135.5/BK006937.2
        chromosome IV:NC_001136.10/BK006938.2
        chromosome V:NC_001137.3/BK006939.2
        chromosome VI:NC_001138.5/BK006940.2
        chromosome VII:NC_001139.9/BK006941.2
        chromosome VIII:NC_001140.6/BK006934.2
        chromosome IX:NC_001141.2/BK006942.2
        chromosome X:NC_001142.9/BK006943.2
        chromosome XI:NC_001143.9/BK006944.2
        chromosome XII:NC_001144.5/BK006945.2
        chromosome XIII:NC_001145.3/BK006946.2
        chromosome XIV:NC_001146.8/BK006947.3
        chromosome XV:NC_001147.6/BK006948.2
        chromosome XVI:NC_001148.4/BK006949.2
        mitochondrion MT:NC_001224.1/KP263414.1
    }
    set LesGb [list]
    set Rang 0
    foreach {a b} $Chromosomes {
        incr Rang
        set FichierGb "S288C[format "Chr%2.2d" $Rang]"
        set Reconnu [regexp {:([^/]+)/(.+)$} $b Match NC BK]
        if { ! $Reconnu } { FaireLire "Wrong $b" }
        if {[file exists $FichierGb]} {
            # already downloaded
        } else {
            GenbankNucleotide $BK "" $FichierGb
        }
        lappend LesGb $FichierGb
    }
    OnRevientDuBoulot
    return $LesGb
}

# CsvLineForS288C
#   The S288C line of the YeastGenomes csv: strain, the 16 chromosome
#   accessions, the mitochondrial one and three trailing columns, ";"-separated.
proc CsvLineForS288C {} {
    return [join {
        S288C
        BK006935.2 BK006936.2 BK006937.2 BK006938.2
        BK006939.2 BK006940.2 BK006941.2 BK006934.2
        BK006942.2 BK006943.2 BK006944.2 BK006945.2
        BK006946.2 BK006947.3 BK006948.2 BK006949.2
        KP263414.1
        unknwon S288C unknwon
    } ";"]
}

# GenbankNucleotide
#   Fetches nucleotide record Id from NCBI efetch with rettype Quoi
#   (default "gb"). With FicOut "GetText" the text is returned; otherwise it
#   is saved in FicOut (default "<Id>.gb") and the result of Sauve is returned.
proc GenbankNucleotide {Id {Quoi ""} {FicOut ""}} {
    if {$Quoi==""} { set Quoi "gb" }
    if {$FicOut==""} { set FicOut "$Id.gb" }
    set Url "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=$Id&rettype=$Quoi&retmode=txt"
    Espionne $Url
    set Gb [ContenuDuFichier $Url]
    if {$FicOut!="GetText"} {
        return [Sauve $Gb dans $FicOut]
    }
    return $Gb
}