add possibility to filter assembly with a list of taxids.

42ddfc84 · Jean Mainguy · cbaec51f · 42ddfc84 · 42ddfc84
Commit 42ddfc84 authored 4 years ago by Jean Mainguy
--- a/region_identification_workflow/bin/format_assemblies_summary.py
+++ b/region_identification_workflow/bin/format_assemblies_summary.py
@@ -34,8 +34,8 @@ def parse_arguments():

    parser.add_argument("assembly_file", type=str,
                        help="ncbi assembly summary file with info on the assemblies. can be download at ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/assembly_summary_refseq.txt")
-    parser.add_argument("-t", "--tax_level", type=str, default='2',
-                        help="Taxonomic level under investigation. ie 2 for Bacteria")
+    parser.add_argument("-t", "--tax_levels", default=['2'], nargs='+',
+                        help="Taxonomic levels under investigation: 2 for Bacteria")
    parser.add_argument("-o", "--output", type=str, default='assemblies_with_tax.tsv',
                        help="output file : final assembly selection")
    parser.add_argument("--ncbi_taxdump", type=str, default='new_taxdump.tar.gz',
@@ -47,11 +47,12 @@ def parse_arguments():
    return args


-def get_all_taxids_of_tax_level(tax_level, taxidlineage):
+def get_all_taxids_of_tax_level(tax_levels, taxidlineage):
    taxids = set()
    with open(taxidlineage) as fl:
        for line in fl:
-            if tax_level in line.strip().split('|')[1].strip().split(' '):
+            taxids_lineage = line.strip().split('|')[1].strip().split(' ')
+            if any((True for tax_level in tax_levels if tax_level in taxids_lineage)):
                taxids.add(line.split('|')[0].strip())
    return taxids

@@ -70,7 +71,7 @@ def main():
    raw_assembly_file = args.assembly_file
    output_file = args.output
    ncbi_taxdump = args.ncbi_taxdump
-    tax_level = args.tax_level
+    tax_levels = [tax_lvl.strip() for tax_lvl in args.tax_levels]

    taxidlineage = 'taxidlineage.dmp'
    rankedlineage = 'rankedlineage.dmp'
@@ -85,9 +86,9 @@ def main():
    tar_tax_dump.extract(taxidlineage)
    tar_tax_dump.extract(rankedlineage)

-    # Get all taxid of the tax_level
-    taxids_of_tax_lvl = get_all_taxids_of_tax_level(tax_level, taxidlineage)
-    logging.info(f"{len(taxids_of_tax_lvl)} taxids belong to {tax_level}")
+    # Get all taxid of the tax_levels
+    taxids_of_tax_lvl = get_all_taxids_of_tax_level(tax_levels, taxidlineage)
+    logging.info(f"{len(taxids_of_tax_lvl)} taxids belong to {tax_levels}")

    # get_taxid_ranked_lineage
    taxid_ranked_lineage = ncbi.get_taxid_ranked_lineage(rankedlineage)

--- a/region_identification_workflow/main.nf
+++ b/region_identification_workflow/main.nf
@@ -106,7 +106,7 @@ else {
 }


-// Number of assemblies to process by chunk. 
+// Number of assemblies to process by chunk.
 params.assemblies_chunk_size = 200

 // Run the workflow on a test sample
@@ -216,6 +216,16 @@ cog_prct_channel =  Channel.value(params.cog_prct)
 region_prct_threshold_channel =  Channel.value(params.region_genome_prct_threshold)
 rrna_genes_to_fetch = Channel.value(params.rrna_genes_to_fetch)

+// MULTI TAX LEVELS
+params.multi_tax_levels = false
+if (params.multi_tax_levels) {
+    multi_tax_levels_file = Channel.fromPath(params.multi_tax_levels)
+}
+else {
+    multi_tax_levels_file = Channel.value(params.multi_tax_levels)
+}
+
+
 /**
 The python script executed by this process:
  - it takes the taxon level for which we want to find genomic regions
@@ -249,6 +259,10 @@ process parse_and_identify_universal_cog {
    val tax_level from tax_level_channel
    val cog_single_copy_prct from single_copy_treshold_channel
    val cog_prct from cog_prct_channel
+    // when multi tax levels is used
+    file ncbi_taxdump
+    file multi_tax_levels_file
+
  output:
    file "identified_cogs.tsv" into cogs_selection_info
    file "${tax_level}_members.tsv*" optional true into initial_members_file
@@ -257,14 +271,37 @@ process parse_and_identify_universal_cog {
    file "taxid_list.txt" into taxids_list
    file "selected_cogs.html" into plot_selected_OGs
    file 'cog_identification.stat' into cog_selection_stat
+    env taxid_selection

    """
-    parse_and_identify_universal_cog.py --tax_level $tax_level \
+    echo multi_tax_levels $multi_tax_levels_file ${params.multi_tax_levels}
+    if [ ${params.multi_tax_levels} == "false" ]; then
+        parse_and_identify_universal_cog.py --tax_level $tax_level \
                                                      --cog_single_copy_prct $cog_single_copy_prct \
                                                      --cog_prct $cog_prct \
                                                      --check_dir ${eggNOG_data_dir} \
-                                                      --get_COG_gene_name \
-                                                      #--sample_test_run ${params.test} # is false by default
+                                                      --get_COG_gene_name
+       taxid_selection=$tax_level
+    else
+        # Multi tax levels
+        mkdir new_taxdump
+        tar -xzf $ncbi_taxdump -C new_taxdump
+
+        from_tax_names_to_taxids.py --tax_names_file $multi_tax_levels_file \
+                                    --new_taxdump_dir tax_dump/ -v \
+                                    --main_tax_level 2 -o taxid_selection.txt
+
+
+        parse_and_identify_universal_cog.py --tax_level $tax_level \
+                                            --taxid_selection taxid_selection.txt \
+                                            --new_taxdump_dir   new_taxdump/ \
+                                            --cog_single_copy_prct $cog_single_copy_prct \
+                                            --cog_prct $cog_prct \
+                                            --check_dir ${eggNOG_data_dir} \
+                                            --get_COG_gene_name
+
+        taxid_selection=`cat taxid_selection.txt | tr '\r\n' ' '`
+    fi

    # Quick and dirty way to write basic stat..
    nb_species=`wc -l taxid_list.txt | cut -d' ' -f1`
@@ -459,7 +496,7 @@ process assembly_selection {
    publishDir "${result_dir}/assembly_selections/", pattern: "*.tsv", mode:'copy'

    input:
-      val tax_level from tax_level_channel
+      val taxid_selection from taxid_selection
      file summary_refseq
      file ncbi_taxdump

@@ -469,24 +506,31 @@ process assembly_selection {
      file "one_assembly_per_sp.tsv" into one_assembly_per_sp
      file "rep_ref_and_10_assemblies_per_sp.tsv" into rep_ref_and_10_assemblies_per_sp
      file "assembly_selection_stat.tsv" into assembly_selection_stat
-      env taxon_name into taxon_name
+      env taxon_names into taxon_name

      """
    # Format assemblies summary with only interesting columns and add taxonomy information
-    format_assemblies_summary.py $summary_refseq -t $tax_level\
+    format_assemblies_summary.py $summary_refseq -t $taxid_selection\
                                  -o all_assemblies_with_taxonomy.tsv \
                                  --ncbi_taxdump $ncbi_taxdump -v
    # extract taxon name of the tax level.
    # rankedlineage.dmp  has been extracted by format_assemblies_summary.py
-    taxon_name=`grep ^"$tax_level"\$'\t' rankedlineage.dmp | cut -d\$'\t' -f3`

+    taxon_names=''
+    for taxid in $taxid_selection; do
+        taxon_name=`grep ^"\$taxid"\$'\t' rankedlineage.dmp | cut -d\$'\t' -f3`
+        taxon_names="\$taxon_name \$taxon_names "
+    done
+
+    echo \$taxon_names
+    
    # Creation of assemblies selection
    # file created ref_and_rep_assemblies.tsv one_assembly_per_sp.tsv rep_ref_and_10_assemblies_per_sp.tsv
    assembly_selection.py all_assemblies_with_taxonomy.tsv --assembly_selections ref_and_rep_assemblies one_assembly_per_sp rep_ref_and_10_assemblies_per_sp -n 10 -v

    # Clean directory
-    # rm all_assemblies.tsv rankedlineage.dmp taxids_${tax_level}.txt taxidlineage.dmp
-    # rm rankedlineage_${tax_level}.dmp
+    # rm all_assemblies.tsv rankedlineage.dmp taxids {tax_level}.txt taxidlineage.dmp
+    # rm rankedlineage_{tax_level}.dmp
      """
  }