Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
A
axis3-metagenomics-amplicon
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Container Registry
Model registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Terms and privacy
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
seqOccIn
axis3-metagenomics-amplicon
Commits
42ddfc84
Commit
42ddfc84
authored
4 years ago
by
Jean Mainguy
Browse files
Options
Downloads
Patches
Plain Diff
add possibility to filter assembly with a list of taxids.
parent
cbaec51f
No related branches found
No related tags found
No related merge requests found
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
region_identification_workflow/bin/format_assemblies_summary.py
+9
-8
9 additions, 8 deletions
..._identification_workflow/bin/format_assemblies_summary.py
region_identification_workflow/main.nf
+54
-10
54 additions, 10 deletions
region_identification_workflow/main.nf
with
63 additions
and
18 deletions
region_identification_workflow/bin/format_assemblies_summary.py
+
9
−
8
View file @
42ddfc84
...
...
@@ -34,8 +34,8 @@ def parse_arguments():
parser
.
add_argument
(
"
assembly_file
"
,
type
=
str
,
help
=
"
ncbi assembly summary file with info on the assemblies. can be download at ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/assembly_summary_refseq.txt
"
)
parser
.
add_argument
(
"
-t
"
,
"
--tax_level
"
,
type
=
str
,
default
=
'
2
'
,
help
=
"
Taxonomic level under investigation
. ie
2 for Bacteria
"
)
parser
.
add_argument
(
"
-t
"
,
"
--tax_level
s
"
,
default
=
[
'
2
'
],
nargs
=
'
+
'
,
help
=
"
Taxonomic level
s
under investigation
:
2 for Bacteria
"
)
parser
.
add_argument
(
"
-o
"
,
"
--output
"
,
type
=
str
,
default
=
'
assemblies_with_tax.tsv
'
,
help
=
"
output file : final assembly selection
"
)
parser
.
add_argument
(
"
--ncbi_taxdump
"
,
type
=
str
,
default
=
'
new_taxdump.tar.gz
'
,
...
...
@@ -47,11 +47,12 @@ def parse_arguments():
return
args
def
get_all_taxids_of_tax_level
(
tax_level
,
taxidlineage
):
def
get_all_taxids_of_tax_level
(
tax_level
s
,
taxidlineage
):
taxids
=
set
()
with
open
(
taxidlineage
)
as
fl
:
for
line
in
fl
:
if
tax_level
in
line
.
strip
().
split
(
'
|
'
)[
1
].
strip
().
split
(
'
'
):
taxids_lineage
=
line
.
strip
().
split
(
'
|
'
)[
1
].
strip
().
split
(
'
'
)
if
any
((
True
for
tax_level
in
tax_levels
if
tax_level
in
taxids_lineage
)):
taxids
.
add
(
line
.
split
(
'
|
'
)[
0
].
strip
())
return
taxids
...
...
@@ -70,7 +71,7 @@ def main():
raw_assembly_file
=
args
.
assembly_file
output_file
=
args
.
output
ncbi_taxdump
=
args
.
ncbi_taxdump
tax_level
=
args
.
tax_level
tax_level
s
=
[
tax_lvl
.
strip
()
for
tax_lvl
in
args
.
tax_level
s
]
taxidlineage
=
'
taxidlineage.dmp
'
rankedlineage
=
'
rankedlineage.dmp
'
...
...
@@ -85,9 +86,9 @@ def main():
tar_tax_dump
.
extract
(
taxidlineage
)
tar_tax_dump
.
extract
(
rankedlineage
)
# Get all taxid of the tax_level
taxids_of_tax_lvl
=
get_all_taxids_of_tax_level
(
tax_level
,
taxidlineage
)
logging
.
info
(
f
"
{
len
(
taxids_of_tax_lvl
)
}
taxids belong to
{
tax_level
}
"
)
# Get all taxid of the tax_level
s
taxids_of_tax_lvl
=
get_all_taxids_of_tax_level
(
tax_level
s
,
taxidlineage
)
logging
.
info
(
f
"
{
len
(
taxids_of_tax_lvl
)
}
taxids belong to
{
tax_level
s
}
"
)
# get_taxid_ranked_lineage
taxid_ranked_lineage
=
ncbi
.
get_taxid_ranked_lineage
(
rankedlineage
)
...
...
This diff is collapsed.
Click to expand it.
region_identification_workflow/main.nf
+
54
−
10
View file @
42ddfc84
...
...
@@ -106,7 +106,7 @@ else {
}
// Number of assemblies to process by chunk.
// Number of assemblies to process by chunk.
params
.
assemblies_chunk_size
=
200
// Run the workflow on a test sample
...
...
@@ -216,6 +216,16 @@ cog_prct_channel = Channel.value(params.cog_prct)
region_prct_threshold_channel
=
Channel
.
value
(
params
.
region_genome_prct_threshold
)
rrna_genes_to_fetch
=
Channel
.
value
(
params
.
rrna_genes_to_fetch
)
// MULTI TAX LEVELS
params
.
multi_tax_levels
=
false
if
(
params
.
multi_tax_levels
)
{
multi_tax_levels_file
=
Channel
.
fromPath
(
params
.
multi_tax_levels
)
}
else
{
multi_tax_levels_file
=
Channel
.
value
(
params
.
multi_tax_levels
)
}
/**
The python script executed by this process:
- it takes the taxon level for which we want to find genomic regions
...
...
@@ -249,6 +259,10 @@ process parse_and_identify_universal_cog {
val
tax_level
from
tax_level_channel
val
cog_single_copy_prct
from
single_copy_treshold_channel
val
cog_prct
from
cog_prct_channel
// when multi tax levels is used
file
ncbi_taxdump
file
multi_tax_levels_file
output:
file
"identified_cogs.tsv"
into
cogs_selection_info
file
"${tax_level}_members.tsv*"
optional
true
into
initial_members_file
...
...
@@ -257,14 +271,37 @@ process parse_and_identify_universal_cog {
file
"taxid_list.txt"
into
taxids_list
file
"selected_cogs.html"
into
plot_selected_OGs
file
'cog_identification.stat'
into
cog_selection_stat
env
taxid_selection
"""
parse_and_identify_universal_cog.py --tax_level $tax_level \
echo multi_tax_levels $multi_tax_levels_file ${params.multi_tax_levels}
if [ ${params.multi_tax_levels} == "false" ]; then
parse_and_identify_universal_cog.py --tax_level $tax_level \
--cog_single_copy_prct $cog_single_copy_prct \
--cog_prct $cog_prct \
--check_dir ${eggNOG_data_dir} \
--get_COG_gene_name \
#--sample_test_run ${params.test} # is false by default
--get_COG_gene_name
taxid_selection=$tax_level
else
# Multi tax levels
mkdir new_taxdump
tar -xzf $ncbi_taxdump -C new_taxdump
from_tax_names_to_taxids.py --tax_names_file $multi_tax_levels_file \
--new_taxdump_dir tax_dump/ -v \
--main_tax_level 2 -o taxid_selection.txt
parse_and_identify_universal_cog.py --tax_level $tax_level \
--taxid_selection taxid_selection.txt \
--new_taxdump_dir new_taxdump/ \
--cog_single_copy_prct $cog_single_copy_prct \
--cog_prct $cog_prct \
--check_dir ${eggNOG_data_dir} \
--get_COG_gene_name
taxid_selection=`cat taxid_selection.txt | tr '\r\n' ' '`
fi
# Quick and dirty way to write basic stat..
nb_species=`wc -l taxid_list.txt | cut -d' ' -f1`
...
...
@@ -459,7 +496,7 @@ process assembly_selection {
publishDir
"${result_dir}/assembly_selections/"
,
pattern:
"*.tsv"
,
mode:
'copy'
input:
val
tax
_level
from
tax_level_channel
val
tax
id_selection
from
taxid_selection
file
summary_refseq
file
ncbi_taxdump
...
...
@@ -469,24 +506,31 @@ process assembly_selection {
file
"one_assembly_per_sp.tsv"
into
one_assembly_per_sp
file
"rep_ref_and_10_assemblies_per_sp.tsv"
into
rep_ref_and_10_assemblies_per_sp
file
"assembly_selection_stat.tsv"
into
assembly_selection_stat
env
taxon_name
into
taxon_name
env
taxon_name
s
into
taxon_name
"""
# Format assemblies summary with only interesting columns and add taxonomy information
format_assemblies_summary.py $summary_refseq -t $tax
_level
\
format_assemblies_summary.py $summary_refseq -t $tax
id_selection
\
-o all_assemblies_with_taxonomy.tsv \
--ncbi_taxdump $ncbi_taxdump -v
# extract taxon name of the tax level.
# rankedlineage.dmp has been extracted by format_assemblies_summary.py
taxon_name=`grep ^"$tax_level"\$'\t' rankedlineage.dmp | cut -d\$'\t' -f3`
taxon_names=''
for taxid in $taxid_selection; do
taxon_name=`grep ^"\$taxid"\$'\t' rankedlineage.dmp | cut -d\$'\t' -f3`
taxon_names="\$taxon_name \$taxon_names "
done
echo \$taxon_names
# Creation of assemblies selection
# file created ref_and_rep_assemblies.tsv one_assembly_per_sp.tsv rep_ref_and_10_assemblies_per_sp.tsv
assembly_selection.py all_assemblies_with_taxonomy.tsv --assembly_selections ref_and_rep_assemblies one_assembly_per_sp rep_ref_and_10_assemblies_per_sp -n 10 -v
# Clean directory
# rm all_assemblies.tsv rankedlineage.dmp taxids
_$
{tax_level}.txt taxidlineage.dmp
# rm rankedlineage_
$
{tax_level}.dmp
# rm all_assemblies.tsv rankedlineage.dmp taxids
{tax_level}.txt taxidlineage.dmp
# rm rankedlineage_{tax_level}.dmp
"""
}
...
...
This diff is collapsed.
Click to expand it.
Ghost User
@ghost
mentioned in issue
#37
·
4 years ago
mentioned in issue
#37
mentioned in issue #37
Toggle commit list
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment