From b9a2a03861028bc1733cff62937142e1857816ff Mon Sep 17 00:00:00 2001
From: Philippe Ruiz <philippe.ruiz@inrae.fr>
Date: Mon, 17 Feb 2025 13:38:00 +0100
Subject: [PATCH 1/8] start integration mash db in workflow

---
 conf/functional_test.config           | 3 ++-
 docs/source/usage.md                  | 3 ++-
 docs/source/use_case.md               | 4 ++--
 functional_tests/test_download.config | 3 ++-
 main.nf                               | 8 +++++---
 modules/gtdbtk.nf                     | 3 ++-
 nextflow.config                       | 1 +
 subworkflows/00_databases.nf          | 4 ++++
 subworkflows/08_binning.nf            | 3 ++-
 9 files changed, 22 insertions(+), 10 deletions(-)

diff --git a/conf/functional_test.config b/conf/functional_test.config
index 31d579c..e1a1fcf 100644
--- a/conf/functional_test.config
+++ b/conf/functional_test.config
@@ -26,5 +26,6 @@ params {
     host_fasta = "/work/project/plateforme/metaG/functional_test/metagwgs-test-datasets/small/input/host/Homo_sapiens.GRCh38_chr21.fa"
     input = "/work/project/plateforme/metaG/functional_test/metagwgs-test-datasets/small/input/samplesheet.csv"
     type='SR'
-    gtdbtk_bank="/work/project/plateforme/metaG/databases/GTDBtk_data/release207_v2"
+    gtdbtk_bank="/work/bank2/GTDB/GTDB/current/flat/release220/"
+    mash_bank="/work/project/plateforme/metaG/functional_test/FT_banks/mash/gtdb_ref_sketch.msh"
 }
diff --git a/docs/source/usage.md b/docs/source/usage.md
index 5aae59f..52f275c 100644
--- a/docs/source/usage.md
+++ b/docs/source/usage.md
@@ -92,7 +92,7 @@ wget https://data.ace.uq.edu.au/public/gtdb/data/releases/latest/auxillary_files
 tar xvzf gtdbtk_v2_data.tar.gz
 ```
 
-After that, you have to indicate the path to gtdb-tk file database with the flag --gtdbtk_bank .
+After that, you have to indicate the path to gtdb-tk file database with the flags --gtdbtk_bank and --mash_bank.
 
 ```{warning}
 * if you use steps `S02_ASSEMBLY` or `S03_FILTERING` or `S04_STRUCTURAL_ANNOT` or `S05_ALIGNMENT` or `S06_FUNC_ANNOT` or
@@ -477,6 +477,7 @@ at the same time as the protein database used in 05_alignment step.
 | Parameter | Description |   
 | ------------ | ----------- |
 | `--gtdbtk_bank` | indicates path to the GTDBTK database (see [GTDBTk installing](https://ecogenomics.github.io/GTDBTk/installing/index.html)).<br /> Default: `""`.|
+| `--mash_bank` | indicates path to the Mash sketch database (`.msh` file) that is passed to GTDB-Tk through its `--mash_db` option (see [GTDBTk installing](https://ecogenomics.github.io/GTDBTk/installing/index.html)).<br /> Default: `""`.|
 | `--checkm2_bank` | indicates path to the checkm2 database (`PATH/uniref100.KO.1.dmnd`).<br /> To download it, launch `singularity exec -B YOUR_ACTUAL_PATH PATH_TO/binning.sif checkm2 database --download --path YOUR_ACTUAL_PATH/checkm2DB`.<br /> Adapt -B parameter to your infrastructure if you are not in genobioinfo. This command will return 2 non-blocking errors, don't worry about them. <br /> Default: `""`.|
 | `--metabat2_seed` | Set the seed for metabat2, for exact reproducibility of metabat2 (default: 0 (random seed)).<br /> Default: `0`.|
 | `--min_completeness [nb]` | Minimum % of bins completeness for the bins to be kept after bin_refinement step.<br /> Default: `50`.|
diff --git a/docs/source/use_case.md b/docs/source/use_case.md
index 72e6b80..3b0f959 100644
--- a/docs/source/use_case.md
+++ b/docs/source/use_case.md
@@ -89,7 +89,7 @@ sample_3,test_data/ERR3201928_1.fastq.gz,test_data/ERR3201928_2.fastq.gz
 
 4. We also need to have the diamond bank we want to use to align protein sequence of genes. Different diamond banks are available on genobioinfo cluster, here we will use NR: `/work/bank/diamonddb/nr.dmnd`. The path to this file will be used into our script.
 
-5. Finally, we want to download the gtdb-tk database in order to perform the taxonomic affiliations of the bins (see https://ecogenomics.github.io/GTDBTk/installing/index.html). The path to gtdb-tk database on genobioinfo is `usr/local/bioinfo/src/Miniconda/Miniconda3/envs/gtdbtk-v2.1.1_env/share/gtdbtk-2.1.1/db/`
+5. Finally, we want to download the gtdb-tk database in order to perform the taxonomic affiliations of the bins (see https://ecogenomics.github.io/GTDBTk/installing/index.html). The path to gtdb-tk database on genobioinfo is `usr/local/bioinfo/src/Miniconda/Miniconda3/envs/gtdbtk-v2.4.0_env/share/gtdbtk-2.4.0/db/`
 
 6. In `Script_filtering_binning.sh` write:
 
@@ -108,7 +108,7 @@ nextflow run -profile genotoul main.nf --type SR --input "samplesheet.csv" \
 --kaiju_db_dir "/bank/kaijudb/kaijudb_refseq_2020-05-25" \
 --assembly metaspades \
 --diamond_bank "/work/bank/diamonddb/nr.dmnd" \
---gtdbtk_bank "/work/project/plateforme/metaG/databases/GTDBtk_data/release207_v2/" \
+--gtdbtk_bank "/work/bank2/GTDB/GTDB/current/flat/release220/" \
 -with-report -with-timeline -with-trace -with-dag -resume
 ```
 
diff --git a/functional_tests/test_download.config b/functional_tests/test_download.config
index a8b0cfd..3575b4d 100644
--- a/functional_tests/test_download.config
+++ b/functional_tests/test_download.config
@@ -19,7 +19,8 @@ params {
     host_fasta = "/work/project/plateforme/metaG/functional_test/metagwgs-test-datasets/small/input/host/Homo_sapiens.GRCh38_chr21.fa"
     host_index = ""
     diamond_bank = "/work/project/plateforme/metaG/functional_test/FT_banks/refseq_bacteria_2021-05-20/refseq_bacteria_100000.dmnd"
-    gtdbtk_bank="/work/project/plateforme/metaG/databases/GTDBtk_data/release207_v2"
+    gtdbtk_bank="/work/bank2/GTDB/GTDB/current/flat/release220/"
+    mash_bank="/work/project/plateforme/metaG/functional_test/FT_banks/mash/gtdb_ref_sketch.msh"
     input = "/work/project/plateforme/metaG/functional_test/metagwgs-test-datasets/small/input/samplesheet.csv"
     type='SR'
 }
diff --git a/main.nf b/main.nf
index 92ac20f..9c1f7ab 100644
--- a/main.nf
+++ b/main.nf
@@ -93,6 +93,7 @@ include { MULTIQC } from './modules/multiqc'
      SO8_BINNING options:
        --skip_binning                Skip this step
        --gtdbtk_bank                 Path to the GTDBTK database
+       --mash_bank                   Path to the Mash database
        --checkm2_bank                Path to the CheckM2 database
        --metabat2_seed               Set the seed for metabat2, for exact reproducibility of metabat2 (default: 0 (random seed))
        --binning_cross_alignment     Mapping strategy to compute co-abundances for binning: .
@@ -208,8 +209,8 @@ workflow {
       exit 1, "You must specify --stop_at_structural_annot or specify a diamond bank with --diamond_bank"
   }
 
-  if ( !(params.stop_at_clean) && !(params.stop_at_assembly) && !(params.stop_at_filtering) && !(params.stop_at_structural_annot) && !(params.skip_binning ) && !(params.gtdbtk_bank || params.checkm2_bank) ) {
-      exit 1, "You must specify --skip_binning or specify a GTDB-TK bank with --gtdbtk_bank and a checkm2 bank with --checkm2_bank"
+  if ( !(params.stop_at_clean) && !(params.stop_at_assembly) && !(params.stop_at_filtering) && !(params.stop_at_structural_annot) && !(params.skip_binning ) && !(params.gtdbtk_bank || params.mash_bank || params.checkm2_bank) ) {
+      exit 1, "You must specify --skip_binning or specify a GTDB-TK bank with --gtdbtk_bank, mash bank with --mash_bank and a checkm2 bank with --checkm2_bank"
   }
 
   if ( params.coassembly && params.binning_cross_alignment == 'group'){
@@ -300,6 +301,7 @@ workflow {
   ch_taxonomy = DATABASES.out.taxonomy
   ch_diamon_db = DATABASES.out.diamond
   ch_gtbdtk_db = DATABASES.out.gtdbtk
+  ch_mash_db = DATABASES.out.mash
   ch_checkm2_db = DATABASES.out.checkm2
 
   ch_multiqc_config = Channel.empty()
@@ -450,7 +452,7 @@ workflow {
 
   if ( !params.stop_at_clean && !params.stop_at_assembly && !params.stop_at_filtering && !params.stop_at_structural_annot && !params.skip_binning ) {
 
-    S08_BINNING( ch_reads, ch_assembly, ch_bam, ch_gtbdtk_db, ch_checkm2_db, ch_quast, ch_circular)
+    S08_BINNING( ch_reads, ch_assembly, ch_bam, ch_gtbdtk_db, ch_mash_db, ch_checkm2_db, ch_quast, ch_circular)
     ch_bins_abundances_report = S08_BINNING.out.bins_abundances_report
     ch_bins_stats_report = S08_BINNING.out.bins_stats_report
 
diff --git a/modules/gtdbtk.nf b/modules/gtdbtk.nf
index ba9b86d..9eecfce 100644
--- a/modules/gtdbtk.nf
+++ b/modules/gtdbtk.nf
@@ -5,6 +5,7 @@ process GTDBTK {
   input:
     val(drep_bins_folder)
     val(gtdbtk_db)
+    val(mash_db)
       
   output:
     path "gtdbtk.bac120.summary.tsv*" , emit : gtdbtk_affiliations_predictions
@@ -15,7 +16,7 @@ process GTDBTK {
 
   export GTDBTK_DATA_PATH=$gtdbtk_db
 
-  gtdbtk classify_wf --genome_dir $drep_bins_folder -x fa --out_dir ./ --skip_ani_screen --pplacer_cpus ${task.cpus} --cpus ${task.cpus}
+  gtdbtk classify_wf --genome_dir $drep_bins_folder -x fa --out_dir ./ --mash_db $mash_db --pplacer_cpus ${task.cpus} --cpus ${task.cpus}
   echo \$(gtdbtk -h 2>&1) &> v_gtdbtk.txt
   """
 }
\ No newline at end of file
diff --git a/nextflow.config b/nextflow.config
index c70ed22..59dff8c 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -16,6 +16,7 @@ params {
     min_contigs_cpm = 1
     diamond_bank = ""
     gtdbtk_bank = ""
+    mash_bank = ""
     checkm2_bank = ""
     percentage_identity = 0.95
     type = ""
diff --git a/subworkflows/00_databases.nf b/subworkflows/00_databases.nf
index 9c0ca02..f808092 100644
--- a/subworkflows/00_databases.nf
+++ b/subworkflows/00_databases.nf
@@ -82,10 +82,12 @@ workflow DATABASES {
         }
 
         ch_gtdbtk_db = Channel.empty()
+        ch_mash_db = Channel.empty()
         ch_checkm2_db = Channel.empty()
 
         if ( !(params.stop_at_clean) && !(params.stop_at_assembly) && !(params.stop_at_filtering) && !(params.stop_at_structural_annot) && !(params.skip_binning) ) {
             ch_gtdbtk_db = Channel.fromPath(params.gtdbtk_bank).first()
+            ch_mash_db = Channel.fromPath(params.mash_bank).first()
             ch_checkm2_db = Channel.fromPath(params.checkm2_bank).first()
         }
         
@@ -97,6 +99,7 @@ workflow DATABASES {
             ch_taxdump.ifEmpty([]),
             ch_diamond.ifEmpty([]),
             ch_gtdbtk_db.ifEmpty([]),
+            ch_mash_db.ifEmpty([])
             ch_checkm2_db.ifEmpty([])
         )
         
@@ -108,6 +111,7 @@ workflow DATABASES {
         taxonomy = ch_taxonomy.first()
         diamond = ch_diamond
         gtdbtk = ch_gtdbtk_db
+        mash = ch_mash_db
         checkm2 = ch_checkm2_db
 }
 
diff --git a/subworkflows/08_binning.nf b/subworkflows/08_binning.nf
index 1346139..4573dc2 100644
--- a/subworkflows/08_binning.nf
+++ b/subworkflows/08_binning.nf
@@ -15,6 +15,7 @@ workflow STEP_08_BINNING {
   assembly
   bam
   gtdbtk_db
+  mash_db
   checkm2_db
   quast
   circular
@@ -222,7 +223,7 @@ workflow STEP_08_BINNING {
   ///// TAXONOMIC AFFILIATION BIN
   /////////////////////////////// 
 
-  GTDBTK(ch_bins_drep, gtdbtk_db)
+  GTDBTK(ch_bins_drep, gtdbtk_db, mash_db)
   ch_gtdbtk_v = GTDBTK.out.v_gtdbtk
   ch_gtdbtk_affi = GTDBTK.out.gtdbtk_affiliations_predictions
 
-- 
GitLab


From 19612c6363976d8a725c2cd5cf3401b8719f91fa Mon Sep 17 00:00:00 2001
From: Philippe Ruiz <philippe.ruiz@inrae.fr>
Date: Mon, 17 Feb 2025 15:18:13 +0100
Subject: [PATCH 2/8] gtdbtk run with mash db in a specific folder

---
 main.nf                      | 3 ++-
 modules/get_db_versions.nf   | 6 ++++++
 subworkflows/00_databases.nf | 2 +-
 3 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/main.nf b/main.nf
index 9c1f7ab..96ccb76 100644
--- a/main.nf
+++ b/main.nf
@@ -291,7 +291,8 @@ workflow {
   ch_taxonomy = Channel.empty()
   ch_diamon_db = Channel.empty()
   ch_gtbdtk_db = Channel.empty()
-  ch_gtbdtk_db = Channel.empty()
+  ch_mash_db = Channel.empty()
+  ch_checkm2_db = Channel.empty()
 
   DATABASES ()
   ch_host_fasta = DATABASES.out.host_fasta
diff --git a/modules/get_db_versions.nf b/modules/get_db_versions.nf
index b64e41b..407d3aa 100644
--- a/modules/get_db_versions.nf
+++ b/modules/get_db_versions.nf
@@ -10,6 +10,7 @@ process GET_DB_VERSIONS {
   path taxdump
   path diamond
   path gtdbtk
+  path mash
   path checkm2
 
 
@@ -49,6 +50,11 @@ process GET_DB_VERSIONS {
     echo "GTDBTK ${gtdbtk}" > gtdbtk_db.txt
   fi
 
+  if [[ "${mash}" != "" ]]
+  then   
+    echo "MASH ${mash}" > mash_db.txt
+  fi
+
   if [[ "${checkm2}" != "" ]]
   then   
     echo "Checkm2 ${checkm2}" > checkm2_db.txt
diff --git a/subworkflows/00_databases.nf b/subworkflows/00_databases.nf
index f808092..92ac645 100644
--- a/subworkflows/00_databases.nf
+++ b/subworkflows/00_databases.nf
@@ -99,7 +99,7 @@ workflow DATABASES {
             ch_taxdump.ifEmpty([]),
             ch_diamond.ifEmpty([]),
             ch_gtdbtk_db.ifEmpty([]),
-            ch_mash_db.ifEmpty([])
+            ch_mash_db.ifEmpty([]),
             ch_checkm2_db.ifEmpty([])
         )
         
-- 
GitLab


From ac7a2c16e3af29dca3e6658f452d44798042170b Mon Sep 17 00:00:00 2001
From: Philippe Ruiz <philippe.ruiz@inrae.fr>
Date: Mon, 17 Feb 2025 15:58:46 +0100
Subject: [PATCH 3/8] update documentation for gtdbtk + mash

---
 docs/source/output.md | 2 +-
 docs/source/usage.md  | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/docs/source/output.md b/docs/source/output.md
index e9ca730..9c75622 100644
--- a/docs/source/output.md
+++ b/docs/source/output.md
@@ -267,7 +267,7 @@ If you want to make further analysis about intra-population genetic diversity (m
 
 | File      | Description                                           |
 | ----------------------- | --------------------------------------- |
-| `gtdbtk.bac120.summary.tsv` | Taxonomic classifications provided by GTDB-Tk. One line = one bin id (1st column, `user_genome`), <br /> its taxonomical classification based on the closest reference genome from the GTDB-Tk database (2nd column, `classification`), <br /> the accession number of the closest reference genome (3rd column, `fastani_reference`). <br /> Please see GTDB-Tk documentation [here](https://ecogenomics.github.io/GTDBTk/files/summary.tsv.html) for information on additional columns. |
+| `gtdbtk.bac120.summary.tsv` | Taxonomic classifications provided by GTDB-Tk. One line = one bin id (1st column, `user_genome`), <br /> its taxonomical classification based on the closest reference genome from the GTDB-Tk database (2nd column, `classification`), <br /> the accession number of the closest reference genome (3rd column, `closest_genome_reference`). <br /> Please see GTDB-Tk documentation [here](https://ecogenomics.github.io/GTDBTk/files/summary.tsv.html) for information on additional columns. |
 
 #### 4. 08_4_mapping_on_final_bins
 
diff --git a/docs/source/usage.md b/docs/source/usage.md
index 52f275c..03dae44 100644
--- a/docs/source/usage.md
+++ b/docs/source/usage.md
@@ -87,9 +87,9 @@ In addition to the general mandatory files, if you wish to launch certain steps
 * Step `S08_BINNING`, To perform the taxonomic affiliations of the bins, you must download the gtdb-tk database as follows (see [GTDBTk](https://ecogenomics.github.io/GTDBTk/installing/index.html)) :
 
 ```bash
-wget https://data.gtdb.ecogenomic.org/releases/latest/auxillary_files/gtdbtk_v2_data.tar.gz
-wget https://data.ace.uq.edu.au/public/gtdb/data/releases/latest/auxillary_files/gtdbtk_v2_data.tar.gz  (or, mirror)
-tar xvzf gtdbtk_v2_data.tar.gz
+wget https://data.ace.uq.edu.au/public/gtdb/data/releases/latest/auxillary_files/gtdbtk_package/full_package/gtdbtk_data.tar.gz
+wget https://data.gtdb.ecogenomic.org/releases/latest/auxillary_files/gtdbtk_package/full_package/gtdbtk_data.tar.gz  # alternative: mirror for Australia
+tar xvzf gtdbtk_data.tar.gz
 ```
 
 After that, you have to indicate the path to gtdb-tk file database with the flags --gtdbtk_bank and --mash_bank.
-- 
GitLab


From 6d9138b52862c3b04e575e18b71c786117662524 Mon Sep 17 00:00:00 2001
From: Philippe Ruiz <philippe.ruiz@inrae.fr>
Date: Tue, 18 Feb 2025 10:49:10 +0100
Subject: [PATCH 4/8] use gtdbk240 with mash db only if it already exist and
 pointed in the config path

---
 main.nf                      |  4 ++--
 modules/gtdbtk.nf            | 18 +++++++++---------
 subworkflows/00_databases.nf | 18 +++++++++++++++++-
 3 files changed, 28 insertions(+), 12 deletions(-)

diff --git a/main.nf b/main.nf
index 96ccb76..f902901 100644
--- a/main.nf
+++ b/main.nf
@@ -93,7 +93,7 @@ include { MULTIQC } from './modules/multiqc'
      SO8_BINNING options:
        --skip_binning                Skip this step
        --gtdbtk_bank                 Path to the GTDBTK database
-       --mash_bank                   Path to the Mash database
+       --mash_bank                   Path to the Mash database. If the database doesn't exist, it will be created in this path.
        --checkm2_bank                Path to the CheckM2 database
        --metabat2_seed               Set the seed for metabat2, for exact reproducibility of metabat2 (default: 0 (random seed))
        --binning_cross_alignment     Mapping strategy to compute co-abundances for binning: .
@@ -294,7 +294,7 @@ workflow {
   ch_mash_db = Channel.empty()
   ch_checkm2_db = Channel.empty()
 
-  DATABASES ()
+  DATABASES()
   ch_host_fasta = DATABASES.out.host_fasta
   ch_host_index = DATABASES.out.host_index
   ch_kaiju_db = DATABASES.out.kaiju_db
diff --git a/modules/gtdbtk.nf b/modules/gtdbtk.nf
index 9eecfce..8ec0172 100644
--- a/modules/gtdbtk.nf
+++ b/modules/gtdbtk.nf
@@ -1,22 +1,22 @@
 process GTDBTK {
- publishDir "${params.outdir}/08_binning/08_3_gtdbtk", mode: 'copy'
- label 'BINNING'
+  publishDir "${params.outdir}/08_binning/08_3_gtdbtk", mode: 'copy'
+  label 'BINNING'
  
   input:
-    val(drep_bins_folder)
-    val(gtdbtk_db)
-    val(mash_db)
+    path bins_drep
+    val gtdbtk_db
+    val mash_db
       
   output:
-    path "gtdbtk.bac120.summary.tsv*" , emit : gtdbtk_affiliations_predictions
-    path "v_gtdbtk.txt" ,emit : v_gtdbtk
+    path "gtdbtk.bac120.summary.tsv*", emit: gtdbtk_affiliations_predictions
+    path "v_gtdbtk.txt", emit: v_gtdbtk
 
   script:
   """
-
+  echo "hello"
   export GTDBTK_DATA_PATH=$gtdbtk_db
 
-  gtdbtk classify_wf --genome_dir $drep_bins_folder -x fa --out_dir ./ --mash_db $mash_db --pplacer_cpus ${task.cpus} --cpus ${task.cpus}
+  gtdbtk classify_wf --genome_dir $bins_drep -x fa --out_dir ./ --mash_db $mash_db --pplacer_cpus ${task.cpus} --cpus ${task.cpus}
   echo \$(gtdbtk -h 2>&1) &> v_gtdbtk.txt
   """
 }
\ No newline at end of file
diff --git a/subworkflows/00_databases.nf b/subworkflows/00_databases.nf
index 92ac645..a39ca92 100644
--- a/subworkflows/00_databases.nf
+++ b/subworkflows/00_databases.nf
@@ -86,8 +86,12 @@ workflow DATABASES {
         ch_checkm2_db = Channel.empty()
 
         if ( !(params.stop_at_clean) && !(params.stop_at_assembly) && !(params.stop_at_filtering) && !(params.stop_at_structural_annot) && !(params.skip_binning) ) {
+            if ( params.mash_bank && file(params.mash_bank).exists() ) {
+                ch_mash_db = Channel.fromPath(params.mash_bank).first()
+            } else {
+                MASH_DB() 
+            }
             ch_gtdbtk_db = Channel.fromPath(params.gtdbtk_bank).first()
-            ch_mash_db = Channel.fromPath(params.mash_bank).first()
             ch_checkm2_db = Channel.fromPath(params.checkm2_bank).first()
         }
         
@@ -163,3 +167,15 @@ process EGGNOG_MAPPER_DB {
         """
 }
 
+process MASH_DB {
+    publishDir "${params.databases}/mash_db"
+    label 'BINNING'
+
+    output:
+        path "mash_db", emit: mash_annot_db
+
+    script:
+        """
+        mkdir mash_db
+        """
+}
\ No newline at end of file
-- 
GitLab


From 8ffac784e391d802e6f80b5b7829946039ca0dba Mon Sep 17 00:00:00 2001
From: Philippe Ruiz <philippe.ruiz@inrae.fr>
Date: Wed, 19 Feb 2025 10:53:01 +0100
Subject: [PATCH 5/8] specify release of gtdb tk database in doc

---
 docs/source/usage.md    | 4 ++++
 docs/source/use_case.md | 3 ++-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/docs/source/usage.md b/docs/source/usage.md
index 03dae44..4f972f5 100644
--- a/docs/source/usage.md
+++ b/docs/source/usage.md
@@ -94,6 +94,10 @@ tar xvzf gtdbtk_data.tar.gz
 
 After that, you have to indicate the path to gtdb-tk file database with the flags --gtdbtk_bank and --mash_bank.
 
+```{note}
+metagWGS v2.4.devel uses GTDB-Tk v2.4.0 and requires the GTDB database release r220.
+```
+
 ```{warning}
 * if you use steps `S02_ASSEMBLY` or `S03_FILTERING` or `S04_STRUCTURAL_ANNOT` or `S05_ALIGNMENT` or `S06_FUNC_ANNOT` or
 `S07_TAXO_AFFI` without skipping `S01_CLEAN_QC` or host reads filtering, you need to use the mandatory files of step `S01_CLEAN_QC`.
diff --git a/docs/source/use_case.md b/docs/source/use_case.md
index 3b0f959..3df94bd 100644
--- a/docs/source/use_case.md
+++ b/docs/source/use_case.md
@@ -89,7 +89,8 @@ sample_3,test_data/ERR3201928_1.fastq.gz,test_data/ERR3201928_2.fastq.gz
 
 4. We also need to have the diamond bank we want to use to align protein sequence of genes. Different diamond banks are available on genobioinfo cluster, here we will use NR: `/work/bank/diamonddb/nr.dmnd`. The path to this file will be used into our script.
 
-5. Finally, we want to download the gtdb-tk database in order to perform the taxonomic affiliations of the bins (see https://ecogenomics.github.io/GTDBTk/installing/index.html). The path to gtdb-tk database on genobioinfo is `usr/local/bioinfo/src/Miniconda/Miniconda3/envs/gtdbtk-v2.4.0_env/share/gtdbtk-2.4.0/db/`
+
+5. Finally, we want to download the gtdb-tk database (r220) in order to perform the taxonomic affiliations of the bins (see https://ecogenomics.github.io/GTDBTk/installing/index.html). The path to gtdb-tk database on genobioinfo is `usr/local/bioinfo/src/Miniconda/Miniconda3/envs/gtdbtk-v2.4.0_env/share/gtdbtk-2.4.0/db/`
 
 6. In `Script_filtering_binning.sh` write:
 
-- 
GitLab


From c49a0444b54959f097fb20eea9f997073d75e9ff Mon Sep 17 00:00:00 2001
From: Philippe Ruiz <philippe.ruiz@inrae.fr>
Date: Wed, 19 Feb 2025 17:00:29 +0100
Subject: [PATCH 6/8] run gtdbtk with mash db

---
 modules/get_db_versions.nf   |  8 ++++----
 subworkflows/00_databases.nf | 20 ++------------------
 2 files changed, 6 insertions(+), 22 deletions(-)

diff --git a/modules/get_db_versions.nf b/modules/get_db_versions.nf
index 407d3aa..4c51979 100644
--- a/modules/get_db_versions.nf
+++ b/modules/get_db_versions.nf
@@ -50,10 +50,10 @@ process GET_DB_VERSIONS {
     echo "GTDBTK ${gtdbtk}" > gtdbtk_db.txt
   fi
 
-  if [[ "${mash}" != "" ]]
-  then   
-    echo "MASH ${mash}" > mash_db.txt
-  fi
+  #if [[ "${mash}" != "" ]]
+  #then   
+  #  echo "MASH ${mash}" > mash_db.txt
+  #fi
 
   if [[ "${checkm2}" != "" ]]
   then   
diff --git a/subworkflows/00_databases.nf b/subworkflows/00_databases.nf
index a39ca92..fb0cc56 100644
--- a/subworkflows/00_databases.nf
+++ b/subworkflows/00_databases.nf
@@ -86,14 +86,11 @@ workflow DATABASES {
         ch_checkm2_db = Channel.empty()
 
         if ( !(params.stop_at_clean) && !(params.stop_at_assembly) && !(params.stop_at_filtering) && !(params.stop_at_structural_annot) && !(params.skip_binning) ) {
-            if ( params.mash_bank && file(params.mash_bank).exists() ) {
-                ch_mash_db = Channel.fromPath(params.mash_bank).first()
-            } else {
-                MASH_DB() 
-            }
             ch_gtdbtk_db = Channel.fromPath(params.gtdbtk_bank).first()
+            ch_mash_db = Channel.value(file(params.mash_bank)).first()
             ch_checkm2_db = Channel.fromPath(params.checkm2_bank).first()
         }
+
         
         GET_DB_VERSIONS(
             ch_host_fasta.ifEmpty([]),
@@ -166,16 +163,3 @@ process EGGNOG_MAPPER_DB {
         download_eggnog_data.py -f -y --data_dir db_eggnog_mapper
         """
 }
-
-process MASH_DB {
-    publishDir "${params.databases}/mash_db"
-    label 'BINNING'
-
-    output:
-        path "mash_db", emit: mash_annot_db
-
-    script:
-        """
-        mkdir mash_db
-        """
-}
\ No newline at end of file
-- 
GitLab


From a1f485b1c15644ee9a99b9dac1f4e568aaba76b2 Mon Sep 17 00:00:00 2001
From: Philippe Ruiz <philippe.ruiz@inrae.fr>
Date: Tue, 25 Feb 2025 16:47:17 +0100
Subject: [PATCH 7/8] modify explanation of mash bank param

---
 main.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/main.nf b/main.nf
index f902901..de8cdda 100644
--- a/main.nf
+++ b/main.nf
@@ -93,7 +93,7 @@ include { MULTIQC } from './modules/multiqc'
      SO8_BINNING options:
        --skip_binning                Skip this step
        --gtdbtk_bank                 Path to the GTDBTK database
-       --mash_bank                   Path to the Mash database. If the database doesn't exist, it will be created in this path.
+       --mash_bank                   Path to the Mash database. If the database doesn't exist, it will be created in the working directory.
        --checkm2_bank                Path to the CheckM2 database
        --metabat2_seed               Set the seed for metabat2, for exact reproducibility of metabat2 (default: 0 (random seed))
        --binning_cross_alignment     Mapping strategy to compute co-abundances for binning: .
-- 
GitLab


From f7f598c9022086f41addfa3f7cbe8d16bddeb460 Mon Sep 17 00:00:00 2001
From: Philippe Ruiz <philippe.ruiz@inrae.fr>
Date: Tue, 25 Feb 2025 16:49:07 +0100
Subject: [PATCH 8/8] mash db version is not referenced in db_versions.txt

---
 modules/get_db_versions.nf | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/modules/get_db_versions.nf b/modules/get_db_versions.nf
index 4c51979..1f59ce5 100644
--- a/modules/get_db_versions.nf
+++ b/modules/get_db_versions.nf
@@ -50,11 +50,6 @@ process GET_DB_VERSIONS {
     echo "GTDBTK ${gtdbtk}" > gtdbtk_db.txt
   fi
 
-  #if [[ "${mash}" != "" ]]
-  #then   
-  #  echo "MASH ${mash}" > mash_db.txt
-  #fi
-
   if [[ "${checkm2}" != "" ]]
   then   
     echo "Checkm2 ${checkm2}" > checkm2_db.txt
-- 
GitLab