From 05a49947ad898249606315cb08c6456cc46184a4 Mon Sep 17 00:00:00 2001
From: Philippe Ruiz <philippe.ruiz@inrae.fr>
Date: Mon, 10 Feb 2025 13:14:54 +0100
Subject: [PATCH 1/3] gtdbtk on bins: extract archaea outputs

---
 modules/gtdbtk.nf                   |  5 +++--
 modules/sum_up_bins_informations.nf | 21 +++++++++++++++------
 subworkflows/08_binning.nf          |  5 +++--
 3 files changed, 21 insertions(+), 10 deletions(-)

diff --git a/modules/gtdbtk.nf b/modules/gtdbtk.nf
index 8ec0172..ffaeeb5 100644
--- a/modules/gtdbtk.nf
+++ b/modules/gtdbtk.nf
@@ -8,8 +8,9 @@ process GTDBTK {
     val mash_db
       
   output:
-    path "gtdbtk.bac120.summary.tsv*", emit: gtdbtk_affiliations_predictions
-    path "v_gtdbtk.txt", emit: v_gtdbtk
+    path "gtdbtk.bac120.summary.tsv*" , emit : gtdbtk_affiliations_predictions_bact
+    path "gtdbtk.ar53.summary.tsv*" , emit : gtdbtk_affiliations_predictions_arch
+    path "v_gtdbtk.txt" ,emit : v_gtdbtk
 
   script:
   """
diff --git a/modules/sum_up_bins_informations.nf b/modules/sum_up_bins_informations.nf
index 042d2da..cc4a4ff 100644
--- a/modules/sum_up_bins_informations.nf
+++ b/modules/sum_up_bins_informations.nf
@@ -6,22 +6,31 @@ process GENOMES_ABUNDANCES_PER_SAMPLE {
       path flagstats_files
       val(bins_folder)
       path genomes_informations
-      path affiliations_predictions
+      path affiliations_predictions_arch
+      path affiliations_predictions_bact
       path heatmap_header_mqc
       path table_header_mqc
 
    output:
-      path "genomes_abundances.tsv" , emit: genomes_abundances
-      tuple path("stats/genomes_abundances_mqc.tsv"), path("stats/genomes_checkm_mqc.json"), path("stats/bins_general_stats_mqc.tsv"), emit: report
+      path "genomes_abundances_arch.tsv" , emit: genomes_abundances_arch
+      path "genomes_abundances_bact.tsv" , emit: genomes_abundances_bact
+      tuple path("stats/genomes_abundances_mqc_arch.tsv"), path("stats/genomes_checkm_mqc_arch.json"), path("stats/bins_general_stats_mqc_arch.tsv"), emit: report_arch
+      tuple path("stats/genomes_abundances_mqc_bact.tsv"), path("stats/genomes_checkm_mqc_bact.json"), path("stats/bins_general_stats_mqc_bact.tsv"), emit: report_bact
 
    script:
       """
       mkdir -p stats
       bins_per_sample_summarize.py --list_of_coverage_files ${coverage_files} \
-      --list_of_flagstats_files ${flagstats_files} --affiliations_predictions ${affiliations_predictions} \
+      --list_of_flagstats_files ${flagstats_files} --affiliations_predictions ${affiliations_predictions_arch} \
       --bins_folder ${bins_folder} --genomes_informations ${genomes_informations} \
-      --output_file genomes_abundances.tsv --report_file stats/genomes_abundances_mqc.tsv \
-      --checkm_file stats/genomes_checkm_mqc.json --table_file stats/bins_general_stats_mqc.tsv
+      --output_file genomes_abundances_arch.tsv --report_file stats/genomes_abundances_mqc_arch.tsv \
+      --checkm_file stats/genomes_checkm_mqc_arch.json --table_file stats/bins_general_stats_mqc_arch.tsv
+
+      bins_per_sample_summarize.py --list_of_coverage_files ${coverage_files} \
+      --list_of_flagstats_files ${flagstats_files} --affiliations_predictions ${affiliations_predictions_bact} \
+      --bins_folder ${bins_folder} --genomes_informations ${genomes_informations} \
+      --output_file genomes_abundances_bact.tsv --report_file stats/genomes_abundances_mqc_bact.tsv \
+      --checkm_file stats/genomes_checkm_mqc_bact.json --table_file stats/bins_general_stats_mqc_bact.tsv
 
       cat ${table_header_mqc} > stats/tmp.txt && cat stats/bins_general_stats_mqc.tsv >> stats/tmp.txt \
       && mv stats/tmp.txt stats/bins_general_stats_mqc.tsv
diff --git a/subworkflows/08_binning.nf b/subworkflows/08_binning.nf
index 4573dc2..2ae4568 100644
--- a/subworkflows/08_binning.nf
+++ b/subworkflows/08_binning.nf
@@ -225,7 +225,8 @@ workflow STEP_08_BINNING {
 
   GTDBTK(ch_bins_drep, gtdbtk_db, mash_db)
   ch_gtdbtk_v = GTDBTK.out.v_gtdbtk
-  ch_gtdbtk_affi = GTDBTK.out.gtdbtk_affiliations_predictions
+  ch_gtdbtk_affi_arch = GTDBTK.out.gtdbtk_affiliations_predictions_arch
+  ch_gtdbtk_affi_bact = GTDBTK.out.gtdbtk_affiliations_predictions_bact
 
   /////////////////////////////
   ////GENOMES ABUNDANCES
@@ -254,7 +255,7 @@ workflow STEP_08_BINNING {
   ch_collect_flagstats = GET_ALIGNMENT_METRICS.out.sam_flagstat.collect()
 
   GENOMES_ABUNDANCES_PER_SAMPLE(ch_collect_coverages, ch_collect_flagstats, 
-   ch_bins_drep, ch_drep_stats , ch_gtdbtk_affi ,
+   ch_bins_drep, ch_drep_stats , ch_gtdbtk_affi_arch , ch_gtdbtk_affi_bact)
    ch_heatmap_header_multiqc, ch_table_header_multiqc)
 
   ch_bins_abundances_report = GENOMES_ABUNDANCES_PER_SAMPLE.out.report
-- 
GitLab


From 3a44fa42aa3dc34f65d65c787b1f8003352bce64 Mon Sep 17 00:00:00 2001
From: Philippe Ruiz <philippe.ruiz@inrae.fr>
Date: Tue, 11 Feb 2025 15:02:49 +0100
Subject: [PATCH 2/3] modify input and output for gtdbtk archaea

---
 modules/sum_up_bins_informations.nf | 40 ++++++++++++++++-------------
 subworkflows/08_binning.nf          |  5 ++--
 2 files changed, 25 insertions(+), 20 deletions(-)

diff --git a/modules/sum_up_bins_informations.nf b/modules/sum_up_bins_informations.nf
index cc4a4ff..48f1bd1 100644
--- a/modules/sum_up_bins_informations.nf
+++ b/modules/sum_up_bins_informations.nf
@@ -4,7 +4,7 @@ process GENOMES_ABUNDANCES_PER_SAMPLE {
    input:
       path coverage_files
       path flagstats_files
-      val(bins_folder)
+      val bins_folder
       path genomes_informations
       path affiliations_predictions_arch
       path affiliations_predictions_bact
@@ -20,23 +20,27 @@ process GENOMES_ABUNDANCES_PER_SAMPLE {
    script:
       """
       mkdir -p stats
-      bins_per_sample_summarize.py --list_of_coverage_files ${coverage_files} \
-      --list_of_flagstats_files ${flagstats_files} --affiliations_predictions ${affiliations_predictions_arch} \
-      --bins_folder ${bins_folder} --genomes_informations ${genomes_informations} \
-      --output_file genomes_abundances_arch.tsv --report_file stats/genomes_abundances_mqc_arch.tsv \
-      --checkm_file stats/genomes_checkm_mqc_arch.json --table_file stats/bins_general_stats_mqc_arch.tsv
-
-      bins_per_sample_summarize.py --list_of_coverage_files ${coverage_files} \
-      --list_of_flagstats_files ${flagstats_files} --affiliations_predictions ${affiliations_predictions_bact} \
-      --bins_folder ${bins_folder} --genomes_informations ${genomes_informations} \
-      --output_file genomes_abundances_bact.tsv --report_file stats/genomes_abundances_mqc_bact.tsv \
-      --checkm_file stats/genomes_checkm_mqc_bact.json --table_file stats/bins_general_stats_mqc_bact.tsv
-
-      cat ${table_header_mqc} > stats/tmp.txt && cat stats/bins_general_stats_mqc.tsv >> stats/tmp.txt \
-      && mv stats/tmp.txt stats/bins_general_stats_mqc.tsv
-
-      cat ${heatmap_header_mqc} > stats/tmp.txt && cat stats/genomes_abundances_mqc.tsv >> stats/tmp.txt \
-      && mv stats/tmp.txt stats/genomes_abundances_mqc.tsv
+
+      type=("bact" "arch")
+
+      for i in "\${type[@]}";
+      do
+        var_name="affiliations_predictions_\${i}"
+        
+        bins_per_sample_summarize.py --list_of_coverage_files ${coverage_files} \\
+        --list_of_flagstats_files ${flagstats_files} --affiliations_predictions ${!var_name} \\
+        --bins_folder ${bins_folder} --genomes_informations ${genomes_informations} \\
+        --output_file genomes_abundances_\${i}.tsv --report_file stats/genomes_abundances_mqc_\${i}.tsv \\
+        --checkm_file stats/genomes_checkm_mqc_\${i}.json --table_file stats/bins_general_stats_mqc_\${i}.tsv
+
+        cat ${table_header_mqc} > stats/tmp_\${i}.txt && cat stats/bins_general_stats_mqc_\${i}.tsv >> stats/tmp_\${i}.txt \\
+        && mv stats/tmp_\${i}.txt stats/bins_general_stats_mqc_\${i}.tsv
+
+        cat ${heatmap_header_mqc} > stats/tmp_\${i}.txt && cat stats/genomes_abundances_mqc_\${i}.tsv >> stats/tmp_\${i}.txt \\
+        && mv stats/tmp_\${i}.txt stats/genomes_abundances_mqc_\${i}.tsv
+
+      done
+      
       """
 }
 
diff --git a/subworkflows/08_binning.nf b/subworkflows/08_binning.nf
index 2ae4568..6bcc673 100644
--- a/subworkflows/08_binning.nf
+++ b/subworkflows/08_binning.nf
@@ -171,7 +171,8 @@ workflow STEP_08_BINNING {
   ch_bins_drep = Channel.empty()
   ch_bam_bins = Channel.empty()
   ch_reads_fna = Channel.empty()
-  ch_gtdbtk_affi = Channel.empty()
+  ch_gtdbtk_affi_arch = Channel.empty()
+  ch_gtdbtk_affi_bact = Channel.empty()
   ch_drep_stats = Channel.empty()
 
   ch_bins_assembly = ch_bins_set.multiple.join(ch_assembly)
@@ -255,7 +256,7 @@ workflow STEP_08_BINNING {
   ch_collect_flagstats = GET_ALIGNMENT_METRICS.out.sam_flagstat.collect()
 
   GENOMES_ABUNDANCES_PER_SAMPLE(ch_collect_coverages, ch_collect_flagstats, 
-   ch_bins_drep, ch_drep_stats , ch_gtdbtk_affi_arch , ch_gtdbtk_affi_bact)
+   ch_bins_drep, ch_drep_stats , ch_gtdbtk_affi_arch , ch_gtdbtk_affi_bact,
    ch_heatmap_header_multiqc, ch_table_header_multiqc)
 
   ch_bins_abundances_report = GENOMES_ABUNDANCES_PER_SAMPLE.out.report
-- 
GitLab


From 0424d5d5d42f674d7e9b282b38fe9ada1f84f583 Mon Sep 17 00:00:00 2001
From: Philippe Ruiz <philippe.ruiz@inrae.fr>
Date: Mon, 3 Mar 2025 15:12:05 +0100
Subject: [PATCH 3/3] gtdbtk always outputs bacteria and archaea
 summary

---
 docs/source/output.md               | 3 ++-
 modules/gtdbtk.nf                   | 9 +++++++++
 modules/sum_up_bins_informations.nf | 8 ++++++--
 3 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/docs/source/output.md b/docs/source/output.md
index 9c75622..fc4ee22 100644
--- a/docs/source/output.md
+++ b/docs/source/output.md
@@ -267,7 +267,8 @@ If you want to make further analysis about intra-population genetic diversity (m
 
 | File      | Description                                           |
 | ----------------------- | --------------------------------------- |
-| `gtdbtk.bac120.summary.tsv` | Taxonomic classifications provided by GTDB-Tk. One line = one bin id (1st column, `user_genome`), <br /> its taxonomical classification based on the closest reference genome from the GTDB-Tk database (2nd column, `classification`), <br /> the accession number of the closest reference genome (3rd column, `closest_genome_reference`). <br /> Please see GTDB-Tk documentation [here](https://ecogenomics.github.io/GTDBTk/files/summary.tsv.html) for information on additional columns. |
+| `gtdbtk.bac120.summary.tsv` | Taxonomic classifications of bacteria provided by GTDB-Tk. One line = one bin id (1st column, `user_genome`), <br /> its taxonomical classification based on the closest reference genome from the GTDB-Tk database (2nd column, `classification`), <br /> the accession number of the closest reference genome (3rd column, `closest_genome_reference`). <br /> Please see GTDB-Tk documentation [here](https://ecogenomics.github.io/GTDBTk/files/summary.tsv.html) for information on additional columns. |
+| `gtdbtk.ar53.summary.tsv` | Taxonomic classifications of archaea provided by GTDB-Tk. One line = one bin id (1st column, `user_genome`), <br /> its taxonomical classification based on the closest reference genome from the GTDB-Tk database (2nd column, `classification`), <br /> the accession number of the closest reference genome (3rd column, `closest_genome_reference`). <br /> Please see GTDB-Tk documentation [here](https://ecogenomics.github.io/GTDBTk/files/summary.tsv.html) for information on additional columns. |
 
 #### 4. 08_4_mapping_on_final_bins
 
diff --git a/modules/gtdbtk.nf b/modules/gtdbtk.nf
index ffaeeb5..f820a00 100644
--- a/modules/gtdbtk.nf
+++ b/modules/gtdbtk.nf
@@ -19,5 +19,14 @@ process GTDBTK {
 
   gtdbtk classify_wf --genome_dir $bins_drep -x fa --out_dir ./ --mash_db $mash_db --pplacer_cpus ${task.cpus} --cpus ${task.cpus}
   echo \$(gtdbtk -h 2>&1) &> v_gtdbtk.txt
+
+  if [ ! -f ./gtdbtk.bac120.summary.tsv ]; then
+    touch ./gtdbtk.bac120.summary.tsv
+  fi
+
+  if [ ! -f ./gtdbtk.ar53.summary.tsv ]; then
+    touch ./gtdbtk.ar53.summary.tsv
+  fi
+
   """
 }
\ No newline at end of file
diff --git a/modules/sum_up_bins_informations.nf b/modules/sum_up_bins_informations.nf
index 48f1bd1..e649b82 100644
--- a/modules/sum_up_bins_informations.nf
+++ b/modules/sum_up_bins_informations.nf
@@ -25,10 +25,14 @@ process GENOMES_ABUNDANCES_PER_SAMPLE {
 
       for i in "\${type[@]}";
       do
-        var_name="affiliations_predictions_\${i}"
+        if [ "\$i" == "bact" ]; then
+          var_name=${affiliations_predictions_bact}
+        else
+          var_name=${affiliations_predictions_arch}
+        fi
         
         bins_per_sample_summarize.py --list_of_coverage_files ${coverage_files} \\
-        --list_of_flagstats_files ${flagstats_files} --affiliations_predictions ${!var_name} \\
+        --list_of_flagstats_files ${flagstats_files} --affiliations_predictions \$var_name \\
         --bins_folder ${bins_folder} --genomes_informations ${genomes_informations} \\
         --output_file genomes_abundances_\${i}.tsv --report_file stats/genomes_abundances_mqc_\${i}.tsv \\
         --checkm_file stats/genomes_checkm_mqc_\${i}.json --table_file stats/bins_general_stats_mqc_\${i}.tsv
-- 
GitLab