Skip to content
Snippets Groups Projects
Commit 42ddfc84 authored by Jean Mainguy's avatar Jean Mainguy
Browse files

add possibility to filter assembly with a list of taxids.

parent cbaec51f
No related branches found
No related tags found
No related merge requests found
......@@ -34,8 +34,8 @@ def parse_arguments():
parser.add_argument("assembly_file", type=str,
help="ncbi assembly summary file with info on the assemblies. can be download at ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/assembly_summary_refseq.txt")
parser.add_argument("-t", "--tax_level", type=str, default='2',
help="Taxonomic level under investigation. ie 2 for Bacteria")
parser.add_argument("-t", "--tax_levels", default=['2'], nargs='+',
help="Taxonomic levels under investigation: 2 for Bacteria")
parser.add_argument("-o", "--output", type=str, default='assemblies_with_tax.tsv',
help="output file : final assembly selection")
parser.add_argument("--ncbi_taxdump", type=str, default='new_taxdump.tar.gz',
......@@ -47,11 +47,12 @@ def parse_arguments():
return args
def get_all_taxids_of_tax_level(tax_level, taxidlineage):
def get_all_taxids_of_tax_level(tax_levels, taxidlineage):
taxids = set()
with open(taxidlineage) as fl:
for line in fl:
if tax_level in line.strip().split('|')[1].strip().split(' '):
taxids_lineage = line.strip().split('|')[1].strip().split(' ')
if any((True for tax_level in tax_levels if tax_level in taxids_lineage)):
taxids.add(line.split('|')[0].strip())
return taxids
......@@ -70,7 +71,7 @@ def main():
raw_assembly_file = args.assembly_file
output_file = args.output
ncbi_taxdump = args.ncbi_taxdump
tax_level = args.tax_level
tax_levels = [tax_lvl.strip() for tax_lvl in args.tax_levels]
taxidlineage = 'taxidlineage.dmp'
rankedlineage = 'rankedlineage.dmp'
......@@ -85,9 +86,9 @@ def main():
tar_tax_dump.extract(taxidlineage)
tar_tax_dump.extract(rankedlineage)
# Get all taxid of the tax_level
taxids_of_tax_lvl = get_all_taxids_of_tax_level(tax_level, taxidlineage)
logging.info(f"{len(taxids_of_tax_lvl)} taxids belong to {tax_level}")
# Get all taxid of the tax_levels
taxids_of_tax_lvl = get_all_taxids_of_tax_level(tax_levels, taxidlineage)
logging.info(f"{len(taxids_of_tax_lvl)} taxids belong to {tax_levels}")
# get_taxid_ranked_lineage
taxid_ranked_lineage = ncbi.get_taxid_ranked_lineage(rankedlineage)
......
......@@ -106,7 +106,7 @@ else {
}
// Number of assemblies to process by chunk.
// Number of assemblies to process by chunk.
params.assemblies_chunk_size = 200
// Run the workflow on a test sample
......@@ -216,6 +216,16 @@ cog_prct_channel = Channel.value(params.cog_prct)
region_prct_threshold_channel = Channel.value(params.region_genome_prct_threshold)
rrna_genes_to_fetch = Channel.value(params.rrna_genes_to_fetch)
// MULTI TAX LEVELS
params.multi_tax_levels = false
if (params.multi_tax_levels) {
multi_tax_levels_file = Channel.fromPath(params.multi_tax_levels)
}
else {
multi_tax_levels_file = Channel.value(params.multi_tax_levels)
}
/**
The python script executed by this process:
- it takes the taxon level for which we want to find genomic regions
......@@ -249,6 +259,10 @@ process parse_and_identify_universal_cog {
val tax_level from tax_level_channel
val cog_single_copy_prct from single_copy_treshold_channel
val cog_prct from cog_prct_channel
// when multi tax levels is used
file ncbi_taxdump
file multi_tax_levels_file
output:
file "identified_cogs.tsv" into cogs_selection_info
file "${tax_level}_members.tsv*" optional true into initial_members_file
......@@ -257,14 +271,37 @@ process parse_and_identify_universal_cog {
file "taxid_list.txt" into taxids_list
file "selected_cogs.html" into plot_selected_OGs
file 'cog_identification.stat' into cog_selection_stat
env taxid_selection
"""
parse_and_identify_universal_cog.py --tax_level $tax_level \
echo multi_tax_levels $multi_tax_levels_file ${params.multi_tax_levels}
if [ ${params.multi_tax_levels} == "false" ]; then
parse_and_identify_universal_cog.py --tax_level $tax_level \
--cog_single_copy_prct $cog_single_copy_prct \
--cog_prct $cog_prct \
--check_dir ${eggNOG_data_dir} \
--get_COG_gene_name \
#--sample_test_run ${params.test} # is false by default
--get_COG_gene_name
taxid_selection=$tax_level
else
# Multi tax levels
mkdir new_taxdump
tar -xzf $ncbi_taxdump -C new_taxdump
from_tax_names_to_taxids.py --tax_names_file $multi_tax_levels_file \
--new_taxdump_dir tax_dump/ -v \
--main_tax_level 2 -o taxid_selection.txt
parse_and_identify_universal_cog.py --tax_level $tax_level \
--taxid_selection taxid_selection.txt \
--new_taxdump_dir new_taxdump/ \
--cog_single_copy_prct $cog_single_copy_prct \
--cog_prct $cog_prct \
--check_dir ${eggNOG_data_dir} \
--get_COG_gene_name
taxid_selection=`cat taxid_selection.txt | tr '\r\n' ' '`
fi
# Quick and dirty way to write basic stat..
nb_species=`wc -l taxid_list.txt | cut -d' ' -f1`
......@@ -459,7 +496,7 @@ process assembly_selection {
publishDir "${result_dir}/assembly_selections/", pattern: "*.tsv", mode:'copy'
input:
val tax_level from tax_level_channel
val taxid_selection from taxid_selection
file summary_refseq
file ncbi_taxdump
......@@ -469,24 +506,31 @@ process assembly_selection {
file "one_assembly_per_sp.tsv" into one_assembly_per_sp
file "rep_ref_and_10_assemblies_per_sp.tsv" into rep_ref_and_10_assemblies_per_sp
file "assembly_selection_stat.tsv" into assembly_selection_stat
env taxon_name into taxon_name
env taxon_names into taxon_name
"""
# Format assemblies summary with only interesting columns and add taxonomy information
format_assemblies_summary.py $summary_refseq -t $tax_level\
format_assemblies_summary.py $summary_refseq -t $taxid_selection\
-o all_assemblies_with_taxonomy.tsv \
--ncbi_taxdump $ncbi_taxdump -v
# extract taxon name of the tax level.
# rankedlineage.dmp has been extracted by format_assemblies_summary.py
taxon_name=`grep ^"$tax_level"\$'\t' rankedlineage.dmp | cut -d\$'\t' -f3`
taxon_names=''
for taxid in $taxid_selection; do
taxon_name=`grep ^"\$taxid"\$'\t' rankedlineage.dmp | cut -d\$'\t' -f3`
taxon_names="\$taxon_name \$taxon_names "
done
echo \$taxon_names
# Creation of assemblies selection
# file created ref_and_rep_assemblies.tsv one_assembly_per_sp.tsv rep_ref_and_10_assemblies_per_sp.tsv
assembly_selection.py all_assemblies_with_taxonomy.tsv --assembly_selections ref_and_rep_assemblies one_assembly_per_sp rep_ref_and_10_assemblies_per_sp -n 10 -v
# Clean directory
# rm all_assemblies.tsv rankedlineage.dmp taxids_${tax_level}.txt taxidlineage.dmp
# rm rankedlineage_${tax_level}.dmp
# rm all_assemblies.tsv rankedlineage.dmp taxids {tax_level}.txt taxidlineage.dmp
# rm rankedlineage_{tax_level}.dmp
"""
}
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment