From b9a2a03861028bc1733cff62937142e1857816ff Mon Sep 17 00:00:00 2001 From: Philippe Ruiz <philippe.ruiz@inrae.fr> Date: Mon, 17 Feb 2025 13:38:00 +0100 Subject: [PATCH 1/8] start integration mash db in workflow --- conf/functional_test.config | 3 ++- docs/source/usage.md | 3 ++- docs/source/use_case.md | 4 ++-- functional_tests/test_download.config | 3 ++- main.nf | 8 +++++--- modules/gtdbtk.nf | 3 ++- nextflow.config | 1 + subworkflows/00_databases.nf | 4 ++++ subworkflows/08_binning.nf | 3 ++- 9 files changed, 22 insertions(+), 10 deletions(-) diff --git a/conf/functional_test.config b/conf/functional_test.config index 31d579c..e1a1fcf 100644 --- a/conf/functional_test.config +++ b/conf/functional_test.config @@ -26,5 +26,6 @@ params { host_fasta = "/work/project/plateforme/metaG/functional_test/metagwgs-test-datasets/small/input/host/Homo_sapiens.GRCh38_chr21.fa" input = "/work/project/plateforme/metaG/functional_test/metagwgs-test-datasets/small/input/samplesheet.csv" type='SR' - gtdbtk_bank="/work/project/plateforme/metaG/databases/GTDBtk_data/release207_v2" + gtdbtk_bank="/work/bank2/GTDB/GTDB/current/flat/release220/" + mash_bank="/work/project/plateforme/metaG/functional_test/FT_banks/mash/gtdb_ref_sketch.msh" } diff --git a/docs/source/usage.md b/docs/source/usage.md index 5aae59f..52f275c 100644 --- a/docs/source/usage.md +++ b/docs/source/usage.md @@ -92,7 +92,7 @@ wget https://data.ace.uq.edu.au/public/gtdb/data/releases/latest/auxillary_files tar xvzf gtdbtk_v2_data.tar.gz ``` -After that, you have to indicate the path to gtdb-tk file database with the flag --gtdbtk_bank . +After that, you have to indicate the path to gtdb-tk file database with the flags --gtdbtk_bank and --mash_bank. ```{warning} * if you use steps `S02_ASSEMBLY` or `S03_FILTERING` or `S04_STRUCTURAL_ANNOT` or `S05_ALIGNMENT` or `S06_FUNC_ANNOT` or @@ -477,6 +477,7 @@ at the same time as the protein database used in 05_alignment step. 
| Parameter | Description | | ------------ | ----------- | | `--gtdbtk_bank` | indicates path to the GTDBTK database (see [GTDBTk installing](https://ecogenomics.github.io/GTDBTk/installing/index.html)).<br /> Default: `""`.| +| `--mash_bank` | indicates path to the Mash database (see [GTDBTk installing](https://ecogenomics.github.io/GTDBTk/installing/index.html)).<br /> Default: `""`.| | `--checkm2_bank` | indicates path to the checkm2 database (`PATH/uniref100.KO.1.dmnd`).<br /> To download it, launch `singularity exec -B YOUR_ACTUAL_PATH PATH_TO/binning.sif checkm2 database --download --path YOUR_ACTUAL_PATH/checkm2DB`.<br /> Adapt -B parameter to your infrastructure if you are not in genobioinfo. This command will return 2 non-blocking errors, don't worry about them. <br /> Default: `""`.| | `--metabat2_seed` | Set the seed for metabat2, for exact reproducibility of metabat2 (default: 0 (random seed)).<br /> Default: `0`.| | `--min_completeness [nb]` | Minimum % of bins completeness for the bins to be kept after bin_refinement step.<br /> Default: `50`.| diff --git a/docs/source/use_case.md b/docs/source/use_case.md index 72e6b80..3b0f959 100644 --- a/docs/source/use_case.md +++ b/docs/source/use_case.md @@ -89,7 +89,7 @@ sample_3,test_data/ERR3201928_1.fastq.gz,test_data/ERR3201928_2.fastq.gz 4. We also need to have the diamond bank we want to use to align protein sequence of genes. Different diamond banks are available on genobioinfo cluster, here we will use NR: `/work/bank/diamonddb/nr.dmnd`. The path to this file will be used into our script. -5. Finally, we want to download the gtdb-tk database in order to perform the taxonomic affiliations of the bins (see https://ecogenomics.github.io/GTDBTk/installing/index.html). The path to gtdb-tk database on genobioinfo is `usr/local/bioinfo/src/Miniconda/Miniconda3/envs/gtdbtk-v2.1.1_env/share/gtdbtk-2.1.1/db/` +5. 
Finally, we want to download the gtdb-tk database in order to perform the taxonomic affiliations of the bins (see https://ecogenomics.github.io/GTDBTk/installing/index.html). The path to gtdb-tk database on genobioinfo is `usr/local/bioinfo/src/Miniconda/Miniconda3/envs/gtdbtk-v2.4.0_env/share/gtdbtk-2.4.0/db/` 6. In `Script_filtering_binning.sh` write: @@ -108,7 +108,7 @@ nextflow run -profile genotoul main.nf --type SR --input "samplesheet.csv" \ --kaiju_db_dir "/bank/kaijudb/kaijudb_refseq_2020-05-25" \ --assembly metaspades \ --diamond_bank "/work/bank/diamonddb/nr.dmnd" \ ---gtdbtk_bank "/work/project/plateforme/metaG/databases/GTDBtk_data/release207_v2/" \ +--gtdbtk_bank "/work/bank2/GTDB/GTDB/current/flat/release220/" \ -with-report -with-timeline -with-trace -with-dag -resume ``` diff --git a/functional_tests/test_download.config b/functional_tests/test_download.config index a8b0cfd..3575b4d 100644 --- a/functional_tests/test_download.config +++ b/functional_tests/test_download.config @@ -19,7 +19,8 @@ params { host_fasta = "/work/project/plateforme/metaG/functional_test/metagwgs-test-datasets/small/input/host/Homo_sapiens.GRCh38_chr21.fa" host_index = "" diamond_bank = "/work/project/plateforme/metaG/functional_test/FT_banks/refseq_bacteria_2021-05-20/refseq_bacteria_100000.dmnd" - gtdbtk_bank="/work/project/plateforme/metaG/databases/GTDBtk_data/release207_v2" + gtdbtk_bank="/work/bank2/GTDB/GTDB/current/flat/release220/" + mash_bank="/work/project/plateforme/metaG/functional_test/FT_banks/mash/gtdb_ref_sketch.msh" input = "/work/project/plateforme/metaG/functional_test/metagwgs-test-datasets/small/input/samplesheet.csv" type='SR' } diff --git a/main.nf b/main.nf index 92ac20f..9c1f7ab 100644 --- a/main.nf +++ b/main.nf @@ -93,6 +93,7 @@ include { MULTIQC } from './modules/multiqc' SO8_BINNING options: --skip_binning Skip this step --gtdbtk_bank Path to the GTDBTK database + --mash_bank Path to the Mash database --checkm2_bank Path to the CheckM2 database 
--metabat2_seed Set the seed for metabat2, for exact reproducibility of metabat2 (default: 0 (random seed)) --binning_cross_alignment Mapping strategy to compute co-abundances for binning: . @@ -208,8 +209,8 @@ workflow { exit 1, "You must specify --stop_at_structural_annot or specify a diamond bank with --diamond_bank" } - if ( !(params.stop_at_clean) && !(params.stop_at_assembly) && !(params.stop_at_filtering) && !(params.stop_at_structural_annot) && !(params.skip_binning ) && !(params.gtdbtk_bank || params.checkm2_bank) ) { - exit 1, "You must specify --skip_binning or specify a GTDB-TK bank with --gtdbtk_bank and a checkm2 bank with --checkm2_bank" + if ( !(params.stop_at_clean) && !(params.stop_at_assembly) && !(params.stop_at_filtering) && !(params.stop_at_structural_annot) && !(params.skip_binning ) && !(params.gtdbtk_bank || params.mash_bank || params.checkm2_bank) ) { + exit 1, "You must specify --skip_binning or specify a GTDB-TK bank with --gtdbtk_bank, mash bank with --mash_bank and a checkm2 bank with --checkm2_bank" } if ( params.coassembly && params.binning_cross_alignment == 'group'){ @@ -300,6 +301,7 @@ workflow { ch_taxonomy = DATABASES.out.taxonomy ch_diamon_db = DATABASES.out.diamond ch_gtbdtk_db = DATABASES.out.gtdbtk + ch_mash_db = DATABASES.out.mash ch_checkm2_db = DATABASES.out.checkm2 ch_multiqc_config = Channel.empty() @@ -450,7 +452,7 @@ workflow { if ( !params.stop_at_clean && !params.stop_at_assembly && !params.stop_at_filtering && !params.stop_at_structural_annot && !params.skip_binning ) { - S08_BINNING( ch_reads, ch_assembly, ch_bam, ch_gtbdtk_db, ch_checkm2_db, ch_quast, ch_circular) + S08_BINNING( ch_reads, ch_assembly, ch_bam, ch_gtbdtk_db, ch_mash_db, ch_checkm2_db, ch_quast, ch_circular) ch_bins_abundances_report = S08_BINNING.out.bins_abundances_report ch_bins_stats_report = S08_BINNING.out.bins_stats_report diff --git a/modules/gtdbtk.nf b/modules/gtdbtk.nf index ba9b86d..9eecfce 100644 --- a/modules/gtdbtk.nf +++ 
b/modules/gtdbtk.nf @@ -5,6 +5,7 @@ process GTDBTK { input: val(drep_bins_folder) val(gtdbtk_db) + val(mash_db) output: path "gtdbtk.bac120.summary.tsv*" , emit : gtdbtk_affiliations_predictions @@ -15,7 +16,7 @@ process GTDBTK { export GTDBTK_DATA_PATH=$gtdbtk_db - gtdbtk classify_wf --genome_dir $drep_bins_folder -x fa --out_dir ./ --skip_ani_screen --pplacer_cpus ${task.cpus} --cpus ${task.cpus} + gtdbtk classify_wf --genome_dir $drep_bins_folder -x fa --out_dir ./ --mash_db $mash_db --pplacer_cpus ${task.cpus} --cpus ${task.cpus} echo \$(gtdbtk -h 2>&1) &> v_gtdbtk.txt """ } \ No newline at end of file diff --git a/nextflow.config b/nextflow.config index c70ed22..59dff8c 100644 --- a/nextflow.config +++ b/nextflow.config @@ -16,6 +16,7 @@ params { min_contigs_cpm = 1 diamond_bank = "" gtdbtk_bank = "" + mash_bank = "" checkm2_bank = "" percentage_identity = 0.95 type = "" diff --git a/subworkflows/00_databases.nf b/subworkflows/00_databases.nf index 9c0ca02..f808092 100644 --- a/subworkflows/00_databases.nf +++ b/subworkflows/00_databases.nf @@ -82,10 +82,12 @@ workflow DATABASES { } ch_gtdbtk_db = Channel.empty() + ch_mash_db = Channel.empty() ch_checkm2_db = Channel.empty() if ( !(params.stop_at_clean) && !(params.stop_at_assembly) && !(params.stop_at_filtering) && !(params.stop_at_structural_annot) && !(params.skip_binning) ) { ch_gtdbtk_db = Channel.fromPath(params.gtdbtk_bank).first() + ch_mash_db = Channel.fromPath(params.mash_bank).first() ch_checkm2_db = Channel.fromPath(params.checkm2_bank).first() } @@ -97,6 +99,7 @@ workflow DATABASES { ch_taxdump.ifEmpty([]), ch_diamond.ifEmpty([]), ch_gtdbtk_db.ifEmpty([]), + ch_mash_db.ifEmpty([]) ch_checkm2_db.ifEmpty([]) ) @@ -108,6 +111,7 @@ workflow DATABASES { taxonomy = ch_taxonomy.first() diamond = ch_diamond gtdbtk = ch_gtdbtk_db + mash = ch_mash_db checkm2 = ch_checkm2_db } diff --git a/subworkflows/08_binning.nf b/subworkflows/08_binning.nf index 1346139..4573dc2 100644 --- a/subworkflows/08_binning.nf 
+++ b/subworkflows/08_binning.nf @@ -15,6 +15,7 @@ workflow STEP_08_BINNING { assembly bam gtdbtk_db + mash_db checkm2_db quast circular @@ -222,7 +223,7 @@ workflow STEP_08_BINNING { ///// TAXONOMIC AFFILIATION BIN /////////////////////////////// - GTDBTK(ch_bins_drep, gtdbtk_db) + GTDBTK(ch_bins_drep, gtdbtk_db, mash_db) ch_gtdbtk_v = GTDBTK.out.v_gtdbtk ch_gtdbtk_affi = GTDBTK.out.gtdbtk_affiliations_predictions -- GitLab From 19612c6363976d8a725c2cd5cf3401b8719f91fa Mon Sep 17 00:00:00 2001 From: Philippe Ruiz <philippe.ruiz@inrae.fr> Date: Mon, 17 Feb 2025 15:18:13 +0100 Subject: [PATCH 2/8] gtdbtk run with mash db in a specific folder --- main.nf | 3 ++- modules/get_db_versions.nf | 6 ++++++ subworkflows/00_databases.nf | 2 +- 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/main.nf b/main.nf index 9c1f7ab..96ccb76 100644 --- a/main.nf +++ b/main.nf @@ -291,7 +291,8 @@ workflow { ch_taxonomy = Channel.empty() ch_diamon_db = Channel.empty() ch_gtbdtk_db = Channel.empty() - ch_gtbdtk_db = Channel.empty() + ch_mash_db = Channel.empty() + ch_checkm2_db = Channel.empty() DATABASES () ch_host_fasta = DATABASES.out.host_fasta diff --git a/modules/get_db_versions.nf b/modules/get_db_versions.nf index b64e41b..407d3aa 100644 --- a/modules/get_db_versions.nf +++ b/modules/get_db_versions.nf @@ -10,6 +10,7 @@ process GET_DB_VERSIONS { path taxdump path diamond path gtdbtk + path mash path checkm2 @@ -49,6 +50,11 @@ process GET_DB_VERSIONS { echo "GTDBTK ${gtdbtk}" > gtdbtk_db.txt fi + if [[ "${mash}" != "" ]] + then + echo "MASH ${mash}" > mash_db.txt + fi + if [[ "${checkm2}" != "" ]] then echo "Checkm2 ${checkm2}" > checkm2_db.txt diff --git a/subworkflows/00_databases.nf b/subworkflows/00_databases.nf index f808092..92ac645 100644 --- a/subworkflows/00_databases.nf +++ b/subworkflows/00_databases.nf @@ -99,7 +99,7 @@ workflow DATABASES { ch_taxdump.ifEmpty([]), ch_diamond.ifEmpty([]), ch_gtdbtk_db.ifEmpty([]), - ch_mash_db.ifEmpty([]) + 
ch_mash_db.ifEmpty([]), ch_checkm2_db.ifEmpty([]) ) -- GitLab From ac7a2c16e3af29dca3e6658f452d44798042170b Mon Sep 17 00:00:00 2001 From: Philippe Ruiz <philippe.ruiz@inrae.fr> Date: Mon, 17 Feb 2025 15:58:46 +0100 Subject: [PATCH 3/8] update documentation for gtdbtk + mash --- docs/source/output.md | 2 +- docs/source/usage.md | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/source/output.md b/docs/source/output.md index e9ca730..9c75622 100644 --- a/docs/source/output.md +++ b/docs/source/output.md @@ -267,7 +267,7 @@ If you want to make further analysis about intra-population genetic diversity (m | File | Description | | ----------------------- | --------------------------------------- | -| `gtdbtk.bac120.summary.tsv` | Taxonomic classifications provided by GTDB-Tk. One line = one bin id (1st column, `user_genome`), <br /> its taxonomical classification based on the closest reference genome from the GTDB-Tk database (2nd column, `classification`), <br /> the accession number of the closest reference genome (3rd column, `fastani_reference`). <br /> Please see GTDB-Tk documentation [here](https://ecogenomics.github.io/GTDBTk/files/summary.tsv.html) for information on additional columns. | +| `gtdbtk.bac120.summary.tsv` | Taxonomic classifications provided by GTDB-Tk. One line = one bin id (1st column, `user_genome`), <br /> its taxonomical classification based on the closest reference genome from the GTDB-Tk database (2nd column, `classification`), <br /> the accession number of the closest reference genome (3rd column, `closest_genome_reference`). <br /> Please see GTDB-Tk documentation [here](https://ecogenomics.github.io/GTDBTk/files/summary.tsv.html) for information on additional columns. | #### 4. 
08_4_mapping_on_final_bins diff --git a/docs/source/usage.md b/docs/source/usage.md index 52f275c..03dae44 100644 --- a/docs/source/usage.md +++ b/docs/source/usage.md @@ -87,9 +87,9 @@ In addition to the general mandatory files, if you wish to launch certain steps * Step `S08_BINNING`, To perform the taxonomic affiliations of the bins, you must download the gtdb-tk database as follows (see [GTDBTk](https://ecogenomics.github.io/GTDBTk/installing/index.html)) : ```bash -wget https://data.gtdb.ecogenomic.org/releases/latest/auxillary_files/gtdbtk_v2_data.tar.gz -wget https://data.ace.uq.edu.au/public/gtdb/data/releases/latest/auxillary_files/gtdbtk_v2_data.tar.gz (or, mirror) -tar xvzf gtdbtk_v2_data.tar.gz +wget https://data.ace.uq.edu.au/public/gtdb/data/releases/latest/auxillary_files/gtdbtk_package/full_package/gtdbtk_data.tar.gz +wget https://data.gtdb.ecogenomic.org/releases/latest/auxillary_files/gtdbtk_package/full_package/gtdbtk_data.tar.gz ( mirror for Australia) +tar xvzf gtdbtk_data.tar.gz ``` After that, you have to indicate the path to gtdb-tk file database with the flags --gtdbtk_bank and --mash_bank. -- GitLab From 6d9138b52862c3b04e575e18b71c786117662524 Mon Sep 17 00:00:00 2001 From: Philippe Ruiz <philippe.ruiz@inrae.fr> Date: Tue, 18 Feb 2025 10:49:10 +0100 Subject: [PATCH 4/8] use gtdbk240 with mash db only if it already exist and pointed in the config path --- main.nf | 4 ++-- modules/gtdbtk.nf | 18 +++++++++--------- subworkflows/00_databases.nf | 18 +++++++++++++++++- 3 files changed, 28 insertions(+), 12 deletions(-) diff --git a/main.nf b/main.nf index 96ccb76..f902901 100644 --- a/main.nf +++ b/main.nf @@ -93,7 +93,7 @@ include { MULTIQC } from './modules/multiqc' SO8_BINNING options: --skip_binning Skip this step --gtdbtk_bank Path to the GTDBTK database - --mash_bank Path to the Mash database + --mash_bank Path to the Mash database. If the database doesn't exist, it will be created in this path. 
--checkm2_bank Path to the CheckM2 database --metabat2_seed Set the seed for metabat2, for exact reproducibility of metabat2 (default: 0 (random seed)) --binning_cross_alignment Mapping strategy to compute co-abundances for binning: . @@ -294,7 +294,7 @@ workflow { ch_mash_db = Channel.empty() ch_checkm2_db = Channel.empty() - DATABASES () + DATABASES() ch_host_fasta = DATABASES.out.host_fasta ch_host_index = DATABASES.out.host_index ch_kaiju_db = DATABASES.out.kaiju_db diff --git a/modules/gtdbtk.nf b/modules/gtdbtk.nf index 9eecfce..8ec0172 100644 --- a/modules/gtdbtk.nf +++ b/modules/gtdbtk.nf @@ -1,22 +1,22 @@ process GTDBTK { - publishDir "${params.outdir}/08_binning/08_3_gtdbtk", mode: 'copy' - label 'BINNING' + publishDir "${params.outdir}/08_binning/08_3_gtdbtk", mode: 'copy' + label 'BINNING' input: - val(drep_bins_folder) - val(gtdbtk_db) - val(mash_db) + path bins_drep + val gtdbtk_db + val mash_db output: - path "gtdbtk.bac120.summary.tsv*" , emit : gtdbtk_affiliations_predictions - path "v_gtdbtk.txt" ,emit : v_gtdbtk + path "gtdbtk.bac120.summary.tsv*", emit: gtdbtk_affiliations_predictions + path "v_gtdbtk.txt", emit: v_gtdbtk script: """ - + echo "hello" export GTDBTK_DATA_PATH=$gtdbtk_db - gtdbtk classify_wf --genome_dir $drep_bins_folder -x fa --out_dir ./ --mash_db $mash_db --pplacer_cpus ${task.cpus} --cpus ${task.cpus} + gtdbtk classify_wf --genome_dir $bins_drep -x fa --out_dir ./ --mash_db $mash_db --pplacer_cpus ${task.cpus} --cpus ${task.cpus} echo \$(gtdbtk -h 2>&1) &> v_gtdbtk.txt """ } \ No newline at end of file diff --git a/subworkflows/00_databases.nf b/subworkflows/00_databases.nf index 92ac645..a39ca92 100644 --- a/subworkflows/00_databases.nf +++ b/subworkflows/00_databases.nf @@ -86,8 +86,12 @@ workflow DATABASES { ch_checkm2_db = Channel.empty() if ( !(params.stop_at_clean) && !(params.stop_at_assembly) && !(params.stop_at_filtering) && !(params.stop_at_structural_annot) && !(params.skip_binning) ) { + if ( params.mash_bank && 
file(params.mash_bank).exists() ) { + ch_mash_db = Channel.fromPath(params.mash_bank).first() + } else { + MASH_DB() + } ch_gtdbtk_db = Channel.fromPath(params.gtdbtk_bank).first() - ch_mash_db = Channel.fromPath(params.mash_bank).first() ch_checkm2_db = Channel.fromPath(params.checkm2_bank).first() } @@ -163,3 +167,15 @@ process EGGNOG_MAPPER_DB { """ } +process MASH_DB { + publishDir "${params.databases}/mash_db" + label 'BINNING' + + output: + path "mash_db", emit: mash_annot_db + + script: + """ + mkdir mash_db + """ +} \ No newline at end of file -- GitLab From 8ffac784e391d802e6f80b5b7829946039ca0dba Mon Sep 17 00:00:00 2001 From: Philippe Ruiz <philippe.ruiz@inrae.fr> Date: Wed, 19 Feb 2025 10:53:01 +0100 Subject: [PATCH 5/8] specify release of gtdb tk database in doc --- docs/source/usage.md | 4 ++++ docs/source/use_case.md | 3 ++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/docs/source/usage.md b/docs/source/usage.md index 03dae44..4f972f5 100644 --- a/docs/source/usage.md +++ b/docs/source/usage.md @@ -94,6 +94,10 @@ tar xvzf gtdbtk_data.tar.gz After that, you have to indicate the path to gtdb-tk file database with the flags --gtdbtk_bank and --mash_bank. +```{note} +metagWGS v2.4.devel uses GTDB-Tk v2.4.0 and requires the gtdb databank r220. +``` + ```{warning} * if you use steps `S02_ASSEMBLY` or `S03_FILTERING` or `S04_STRUCTURAL_ANNOT` or `S05_ALIGNMENT` or `S06_FUNC_ANNOT` or `S07_TAXO_AFFI` without skipping `S01_CLEAN_QC` or host reads filtering, you need to use the mandatory files of step `S01_CLEAN_QC`. diff --git a/docs/source/use_case.md b/docs/source/use_case.md index 3b0f959..3df94bd 100644 --- a/docs/source/use_case.md +++ b/docs/source/use_case.md @@ -89,7 +89,8 @@ sample_3,test_data/ERR3201928_1.fastq.gz,test_data/ERR3201928_2.fastq.gz 4. We also need to have the diamond bank we want to use to align protein sequence of genes.
Different diamond banks are available on genobioinfo cluster, here we will use NR: `/work/bank/diamonddb/nr.dmnd`. The path to this file will be used into our script. -5. Finally, we want to download the gtdb-tk database in order to perform the taxonomic affiliations of the bins (see https://ecogenomics.github.io/GTDBTk/installing/index.html). The path to gtdb-tk database on genobioinfo is `usr/local/bioinfo/src/Miniconda/Miniconda3/envs/gtdbtk-v2.4.0_env/share/gtdbtk-2.4.0/db/` + +5. Finally, we want to download the gtdb-tk database (r220) in order to perform the taxonomic affiliations of the bins (see https://ecogenomics.github.io/GTDBTk/installing/index.html). The path to gtdb-tk database on genobioinfo is `usr/local/bioinfo/src/Miniconda/Miniconda3/envs/gtdbtk-v2.4.0_env/share/gtdbtk-2.4.0/db/` 6. In `Script_filtering_binning.sh` write: -- GitLab From c49a0444b54959f097fb20eea9f997073d75e9ff Mon Sep 17 00:00:00 2001 From: Philippe Ruiz <philippe.ruiz@inrae.fr> Date: Wed, 19 Feb 2025 17:00:29 +0100 Subject: [PATCH 6/8] run gtdbtk with mash db --- modules/get_db_versions.nf | 8 ++++---- subworkflows/00_databases.nf | 20 ++------------------ 2 files changed, 6 insertions(+), 22 deletions(-) diff --git a/modules/get_db_versions.nf b/modules/get_db_versions.nf index 407d3aa..4c51979 100644 --- a/modules/get_db_versions.nf +++ b/modules/get_db_versions.nf @@ -50,10 +50,10 @@ process GET_DB_VERSIONS { echo "GTDBTK ${gtdbtk}" > gtdbtk_db.txt fi - if [[ "${mash}" != "" ]] - then - echo "MASH ${mash}" > mash_db.txt - fi + #if [[ "${mash}" != "" ]] + #then + # echo "MASH ${mash}" > mash_db.txt + #fi if [[ "${checkm2}" != "" ]] then diff --git a/subworkflows/00_databases.nf b/subworkflows/00_databases.nf index a39ca92..fb0cc56 100644 --- a/subworkflows/00_databases.nf +++ b/subworkflows/00_databases.nf @@ -86,14 +86,11 @@ workflow DATABASES { ch_checkm2_db = Channel.empty() if ( !(params.stop_at_clean) && !(params.stop_at_assembly) && !(params.stop_at_filtering) && 
!(params.stop_at_structural_annot) && !(params.skip_binning) ) { - if ( params.mash_bank && file(params.mash_bank).exists() ) { - ch_mash_db = Channel.fromPath(params.mash_bank).first() - } else { - MASH_DB() - } ch_gtdbtk_db = Channel.fromPath(params.gtdbtk_bank).first() + ch_mash_db = Channel.value(file(params.mash_bank)).first() ch_checkm2_db = Channel.fromPath(params.checkm2_bank).first() } + GET_DB_VERSIONS( ch_host_fasta.ifEmpty([]), @@ -166,16 +163,3 @@ process EGGNOG_MAPPER_DB { download_eggnog_data.py -f -y --data_dir db_eggnog_mapper """ } - -process MASH_DB { - publishDir "${params.databases}/mash_db" - label 'BINNING' - - output: - path "mash_db", emit: mash_annot_db - - script: - """ - mkdir mash_db - """ -} \ No newline at end of file -- GitLab From a1f485b1c15644ee9a99b9dac1f4e568aaba76b2 Mon Sep 17 00:00:00 2001 From: Philippe Ruiz <philippe.ruiz@inrae.fr> Date: Tue, 25 Feb 2025 16:47:17 +0100 Subject: [PATCH 7/8] modif explaination of mash bank param --- main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.nf b/main.nf index f902901..de8cdda 100644 --- a/main.nf +++ b/main.nf @@ -93,7 +93,7 @@ include { MULTIQC } from './modules/multiqc' SO8_BINNING options: --skip_binning Skip this step --gtdbtk_bank Path to the GTDBTK database - --mash_bank Path to the Mash database. If the database doesn't exist, it will be created in this path. + --mash_bank Path to the Mash database. If the database doesn't exist, it will be created in the working directory. --checkm2_bank Path to the CheckM2 database --metabat2_seed Set the seed for metabat2, for exact reproducibility of metabat2 (default: 0 (random seed)) --binning_cross_alignment Mapping strategy to compute co-abundances for binning: . 
-- GitLab From f7f598c9022086f41addfa3f7cbe8d16bddeb460 Mon Sep 17 00:00:00 2001 From: Philippe Ruiz <philippe.ruiz@inrae.fr> Date: Tue, 25 Feb 2025 16:49:07 +0100 Subject: [PATCH 8/8] mash db version is not referenced in db_versions.txt --- modules/get_db_versions.nf | 5 ----- 1 file changed, 5 deletions(-) diff --git a/modules/get_db_versions.nf b/modules/get_db_versions.nf index 4c51979..1f59ce5 100644 --- a/modules/get_db_versions.nf +++ b/modules/get_db_versions.nf @@ -50,11 +50,6 @@ process GET_DB_VERSIONS { echo "GTDBTK ${gtdbtk}" > gtdbtk_db.txt fi - #if [[ "${mash}" != "" ]] - #then - # echo "MASH ${mash}" > mash_db.txt - #fi - if [[ "${checkm2}" != "" ]] then echo "Checkm2 ${checkm2}" > checkm2_db.txt -- GitLab