From 05a49947ad898249606315cb08c6456cc46184a4 Mon Sep 17 00:00:00 2001 From: Philippe Ruiz <philippe.ruiz@inrae.fr> Date: Mon, 10 Feb 2025 13:14:54 +0100 Subject: [PATCH 1/3] gtdbtk on bins extract archee outputs --- modules/gtdbtk.nf | 5 +++-- modules/sum_up_bins_informations.nf | 21 +++++++++++++++------ subworkflows/08_binning.nf | 5 +++-- 3 files changed, 21 insertions(+), 10 deletions(-) diff --git a/modules/gtdbtk.nf b/modules/gtdbtk.nf index 8ec0172..ffaeeb5 100644 --- a/modules/gtdbtk.nf +++ b/modules/gtdbtk.nf @@ -8,8 +8,9 @@ process GTDBTK { val mash_db output: - path "gtdbtk.bac120.summary.tsv*", emit: gtdbtk_affiliations_predictions - path "v_gtdbtk.txt", emit: v_gtdbtk + path "gtdbtk.bac120.summary.tsv*" , emit : gtdbtk_affiliations_predictions_bact + path "gtdbtk.ar53.summary.tsv*" , emit : gtdbtk_affiliations_predictions_arch + path "v_gtdbtk.txt" ,emit : v_gtdbtk script: """ diff --git a/modules/sum_up_bins_informations.nf b/modules/sum_up_bins_informations.nf index 042d2da..cc4a4ff 100644 --- a/modules/sum_up_bins_informations.nf +++ b/modules/sum_up_bins_informations.nf @@ -6,22 +6,31 @@ process GENOMES_ABUNDANCES_PER_SAMPLE { path flagstats_files val(bins_folder) path genomes_informations - path affiliations_predictions + path affiliations_predictions_arch + path affiliations_predictions_bact path heatmap_header_mqc path table_header_mqc output: - path "genomes_abundances.tsv" , emit: genomes_abundances - tuple path("stats/genomes_abundances_mqc.tsv"), path("stats/genomes_checkm_mqc.json"), path("stats/bins_general_stats_mqc.tsv"), emit: report + path "genomes_abundances_arch.tsv" , emit: genomes_abundances_arch + path "genomes_abundances_bact.tsv" , emit: genomes_abundances_bact + tuple path("stats/genomes_abundances_mqc_arch.tsv"), path("stats/genomes_checkm_mqc_arch.json"), path("stats/bins_general_stats_mqc_arch.tsv"), emit: report_arch + tuple path("stats/genomes_abundances_mqc_bact.tsv"), path("stats/genomes_checkm_mqc_bact.json"), 
path("stats/bins_general_stats_mqc_bact.tsv"), emit: report_bact script: """ mkdir -p stats bins_per_sample_summarize.py --list_of_coverage_files ${coverage_files} \ - --list_of_flagstats_files ${flagstats_files} --affiliations_predictions ${affiliations_predictions} \ + --list_of_flagstats_files ${flagstats_files} --affiliations_predictions ${affiliations_predictions_arch} \ --bins_folder ${bins_folder} --genomes_informations ${genomes_informations} \ - --output_file genomes_abundances.tsv --report_file stats/genomes_abundances_mqc.tsv \ - --checkm_file stats/genomes_checkm_mqc.json --table_file stats/bins_general_stats_mqc.tsv + --output_file genomes_abundances_arch.tsv --report_file stats/genomes_abundances_mqc_arch.tsv \ + --checkm_file stats/genomes_checkm_mqc_arch.json --table_file stats/bins_general_stats_mqc_arch.tsv + + bins_per_sample_summarize.py --list_of_coverage_files ${coverage_files} \ + --list_of_flagstats_files ${flagstats_files} --affiliations_predictions ${affiliations_predictions_bact} \ + --bins_folder ${bins_folder} --genomes_informations ${genomes_informations} \ + --output_file genomes_abundances_bact.tsv --report_file stats/genomes_abundances_mqc_bact.tsv \ + --checkm_file stats/genomes_checkm_mqc_bact.json --table_file stats/bins_general_stats_mqc_bact.tsv cat ${table_header_mqc} > stats/tmp.txt && cat stats/bins_general_stats_mqc.tsv >> stats/tmp.txt \ && mv stats/tmp.txt stats/bins_general_stats_mqc.tsv diff --git a/subworkflows/08_binning.nf b/subworkflows/08_binning.nf index 4573dc2..2ae4568 100644 --- a/subworkflows/08_binning.nf +++ b/subworkflows/08_binning.nf @@ -225,7 +225,8 @@ workflow STEP_08_BINNING { GTDBTK(ch_bins_drep, gtdbtk_db, mash_db) ch_gtdbtk_v = GTDBTK.out.v_gtdbtk - ch_gtdbtk_affi = GTDBTK.out.gtdbtk_affiliations_predictions + ch_gtdbtk_affi_arch = GTDBTK.out.gtdbtk_affiliations_predictions_arch + ch_gtdbtk_affi_bact = GTDBTK.out.gtdbtk_affiliations_predictions_bact ///////////////////////////// ////GENOMES 
ABUNDANCES @@ -254,7 +255,7 @@ workflow STEP_08_BINNING { ch_collect_flagstats = GET_ALIGNMENT_METRICS.out.sam_flagstat.collect() GENOMES_ABUNDANCES_PER_SAMPLE(ch_collect_coverages, ch_collect_flagstats, - ch_bins_drep, ch_drep_stats , ch_gtdbtk_affi , + ch_bins_drep, ch_drep_stats , ch_gtdbtk_affi_arch , ch_gtdbtk_affi_bact) ch_heatmap_header_multiqc, ch_table_header_multiqc) ch_bins_abundances_report = GENOMES_ABUNDANCES_PER_SAMPLE.out.report -- GitLab From 3a44fa42aa3dc34f65d65c787b1f8003352bce64 Mon Sep 17 00:00:00 2001 From: Philippe Ruiz <philippe.ruiz@inrae.fr> Date: Tue, 11 Feb 2025 15:02:49 +0100 Subject: [PATCH 2/3] modify input and output for gtdbtk archee --- modules/sum_up_bins_informations.nf | 40 ++++++++++++++++------------- subworkflows/08_binning.nf | 5 ++-- 2 files changed, 25 insertions(+), 20 deletions(-) diff --git a/modules/sum_up_bins_informations.nf b/modules/sum_up_bins_informations.nf index cc4a4ff..48f1bd1 100644 --- a/modules/sum_up_bins_informations.nf +++ b/modules/sum_up_bins_informations.nf @@ -4,7 +4,7 @@ process GENOMES_ABUNDANCES_PER_SAMPLE { input: path coverage_files path flagstats_files - val(bins_folder) + val bins_folder path genomes_informations path affiliations_predictions_arch path affiliations_predictions_bact @@ -20,23 +20,27 @@ process GENOMES_ABUNDANCES_PER_SAMPLE { script: """ mkdir -p stats - bins_per_sample_summarize.py --list_of_coverage_files ${coverage_files} \ - --list_of_flagstats_files ${flagstats_files} --affiliations_predictions ${affiliations_predictions_arch} \ - --bins_folder ${bins_folder} --genomes_informations ${genomes_informations} \ - --output_file genomes_abundances_arch.tsv --report_file stats/genomes_abundances_mqc_arch.tsv \ - --checkm_file stats/genomes_checkm_mqc_arch.json --table_file stats/bins_general_stats_mqc_arch.tsv - - bins_per_sample_summarize.py --list_of_coverage_files ${coverage_files} \ - --list_of_flagstats_files ${flagstats_files} --affiliations_predictions 
${affiliations_predictions_bact} \ - --bins_folder ${bins_folder} --genomes_informations ${genomes_informations} \ - --output_file genomes_abundances_bact.tsv --report_file stats/genomes_abundances_mqc_bact.tsv \ - --checkm_file stats/genomes_checkm_mqc_bact.json --table_file stats/bins_general_stats_mqc_bact.tsv - - cat ${table_header_mqc} > stats/tmp.txt && cat stats/bins_general_stats_mqc.tsv >> stats/tmp.txt \ - && mv stats/tmp.txt stats/bins_general_stats_mqc.tsv - - cat ${heatmap_header_mqc} > stats/tmp.txt && cat stats/genomes_abundances_mqc.tsv >> stats/tmp.txt \ - && mv stats/tmp.txt stats/genomes_abundances_mqc.tsv + + type=("bact" "arch") + + for i in "\${type[@]}"; + do + var_name="affiliations_predictions_\${i}" + + bins_per_sample_summarize.py --list_of_coverage_files ${coverage_files} \\ + --list_of_flagstats_files ${flagstats_files} --affiliations_predictions ${!var_name} \\ + --bins_folder ${bins_folder} --genomes_informations ${genomes_informations} \\ + --output_file genomes_abundances_\${i}.tsv --report_file stats/genomes_abundances_mqc_\${i}.tsv \\ + --checkm_file stats/genomes_checkm_mqc_\${i}.json --table_file stats/bins_general_stats_mqc_\${i}.tsv + + cat ${table_header_mqc} > stats/tmp_\${i}.txt && cat stats/bins_general_stats_mqc_\${i}.tsv >> stats/tmp_\${i}.txt \\ + && mv stats/tmp_\${i}.txt stats/bins_general_stats_mqc_\${i}.tsv + + cat ${heatmap_header_mqc} > stats/tmp_\${i}.txt && cat stats/genomes_abundances_mqc_\${i}.tsv >> stats/tmp_\${i}.txt \\ + && mv stats/tmp_\${i}.txt stats/genomes_abundances_mqc_\${i}.tsv + + done + """ } diff --git a/subworkflows/08_binning.nf b/subworkflows/08_binning.nf index 2ae4568..6bcc673 100644 --- a/subworkflows/08_binning.nf +++ b/subworkflows/08_binning.nf @@ -171,7 +171,8 @@ workflow STEP_08_BINNING { ch_bins_drep = Channel.empty() ch_bam_bins = Channel.empty() ch_reads_fna = Channel.empty() - ch_gtdbtk_affi = Channel.empty() + ch_gtdbtk_affi_arch = Channel.empty() + ch_gtdbtk_affi_bact = 
Channel.empty() ch_drep_stats = Channel.empty() ch_bins_assembly = ch_bins_set.multiple.join(ch_assembly) @@ -255,7 +256,7 @@ workflow STEP_08_BINNING { ch_collect_flagstats = GET_ALIGNMENT_METRICS.out.sam_flagstat.collect() GENOMES_ABUNDANCES_PER_SAMPLE(ch_collect_coverages, ch_collect_flagstats, - ch_bins_drep, ch_drep_stats , ch_gtdbtk_affi_arch , ch_gtdbtk_affi_bact) + ch_bins_drep, ch_drep_stats , ch_gtdbtk_affi_arch , ch_gtdbtk_affi_bact, ch_heatmap_header_multiqc, ch_table_header_multiqc) ch_bins_abundances_report = GENOMES_ABUNDANCES_PER_SAMPLE.out.report -- GitLab From 0424d5d5d42f674d7e9b282b38fe9ada1f84f583 Mon Sep 17 00:00:00 2001 From: Philippe Ruiz <philippe.ruiz@inrae.fr> Date: Mon, 3 Mar 2025 15:12:05 +0100 Subject: [PATCH 3/3] gtdbtk allways give in output bacteria and archaea summary --- docs/source/output.md | 3 ++- modules/gtdbtk.nf | 9 +++++++++ modules/sum_up_bins_informations.nf | 8 ++++++-- 3 files changed, 17 insertions(+), 3 deletions(-) diff --git a/docs/source/output.md b/docs/source/output.md index 9c75622..fc4ee22 100644 --- a/docs/source/output.md +++ b/docs/source/output.md @@ -267,7 +267,8 @@ If you want to make further analysis about intra-population genetic diversity (m | File | Description | | ----------------------- | --------------------------------------- | -| `gtdbtk.bac120.summary.tsv` | Taxonomic classifications provided by GTDB-Tk. One line = one bin id (1st column, `user_genome`), <br /> its taxonomical classification based on the closest reference genome from the GTDB-Tk database (2nd column, `classification`), <br /> the accession number of the closest reference genome (3rd column, `closest_genome_reference`). <br /> Please see GTDB-Tk documentation [here](https://ecogenomics.github.io/GTDBTk/files/summary.tsv.html) for information on additional columns. | +| `gtdbtk.bac120.summary.tsv` | Taxonomic classifications of bacteria provided by GTDB-Tk. 
One line = one bin id (1st column, `user_genome`), <br /> its taxonomical classification based on the closest reference genome from the GTDB-Tk database (2nd column, `classification`), <br /> the accession number of the closest reference genome (3rd column, `closest_genome_reference`). <br /> Please see GTDB-Tk documentation [here](https://ecogenomics.github.io/GTDBTk/files/summary.tsv.html) for information on additional columns. | +| `gtdbtk.ar53.summary.tsv` | Taxonomic classifications of archaea provided by GTDB-Tk. One line = one bin id (1st column, `user_genome`), <br /> its taxonomical classification based on the closest reference genome from the GTDB-Tk database (2nd column, `classification`), <br /> the accession number of the closest reference genome (3rd column, `closest_genome_reference`). <br /> Please see GTDB-Tk documentation [here](https://ecogenomics.github.io/GTDBTk/files/summary.tsv.html) for information on additional columns. | #### 4. 08_4_mapping_on_final_bins diff --git a/modules/gtdbtk.nf b/modules/gtdbtk.nf index ffaeeb5..f820a00 100644 --- a/modules/gtdbtk.nf +++ b/modules/gtdbtk.nf @@ -19,5 +19,14 @@ process GTDBTK { gtdbtk classify_wf --genome_dir $bins_drep -x fa --out_dir ./ --mash_db $mash_db --pplacer_cpus ${task.cpus} --cpus ${task.cpus} echo \$(gtdbtk -h 2>&1) &> v_gtdbtk.txt + + if [ ! -f ./gtdbtk.bac120.summary.tsv ]; then + touch ./gtdbtk.bac120.summary.tsv + fi + + if [ ! 
-f ./gtdbtk.ar53.summary.tsv ]; then + touch ./gtdbtk.ar53.summary.tsv + fi + """ } \ No newline at end of file diff --git a/modules/sum_up_bins_informations.nf b/modules/sum_up_bins_informations.nf index 48f1bd1..e649b82 100644 --- a/modules/sum_up_bins_informations.nf +++ b/modules/sum_up_bins_informations.nf @@ -25,10 +25,14 @@ process GENOMES_ABUNDANCES_PER_SAMPLE { for i in "\${type[@]}"; do - var_name="affiliations_predictions_\${i}" + if [ "\$i" == "bact" ]; then + var_name=${affiliations_predictions_bact} + else + var_name=${affiliations_predictions_arch} + fi bins_per_sample_summarize.py --list_of_coverage_files ${coverage_files} \\ - --list_of_flagstats_files ${flagstats_files} --affiliations_predictions ${!var_name} \\ + --list_of_flagstats_files ${flagstats_files} --affiliations_predictions \$var_name \\ --bins_folder ${bins_folder} --genomes_informations ${genomes_informations} \\ --output_file genomes_abundances_\${i}.tsv --report_file stats/genomes_abundances_mqc_\${i}.tsv \\ --checkm_file stats/genomes_checkm_mqc_\${i}.json --table_file stats/bins_general_stats_mqc_\${i}.tsv -- GitLab