References

Nextflow Documentation

Visit the Nextflow Documentation website to learn more about this workflow manager.

Processes: `modules.nf`

Source Code:

   /*
   start.nf modules
    */

   /*
    * Process 0A: Download ARG Database
    */
   process ARG_DB {
       tag "Download ARG Database"
       publishDir params.DBdir, mode: 'copy', overwrite: false

       input:
       val(params.ARG_db_URL)

       output:
       file(params.ARG_db)

       script:
       /* temporarily does not accomadate a different ARG database due to the tarball */
       """
       wget ${params.ARG_db_URL}
       tar -xf broadstreet-v3.2.2.tar.bz2 
       mv nucleotide_fasta_protein_homolog_model.fasta ${params.ARG_db} 
       """
       }

   /*
    * Process 0B: Download 16S rRNA Database 
    */
   process rRNA16S_DB {
       tag "Download 16S rRNA Database"
       publishDir params.DBdir, mode: 'copy', overwrite: false

       input:
       val(params.rRNA16S_db_URL)

       output:
       file(params.rRNA16S_db)

       /* temporarily does not accomadate a different 16S database due to nomenclature */
       script:
       """
       wget ${params.rRNA16S_db_URL} 
       mv gg_12_10.fasta.gz ${params.rRNA16S_db} 
       """
   }

   /*
    * Process 0C: Cluster ARG Database 
    */
   process CDHIT_ARG {
       tag "Remove ARG Redundancy"
       publishDir params.DBdir, mode: 'copy', overwrite: false

       input:
       file(params.ARG_db)

       output:
       file(params.ARG_db_NR)

       script:
       """
       /mnt/research/qgg/software/cd-hit-4.8.1/cd-hit-est -i ${params.ARG_db} -o ${params.ARG_db_NR} -c 0.99 -G 0 -M 0 -T 0 -aS 0.9 -g 1 -d 0
       """

   }

   /*
    * Process 0D: Cluster 16S rRNA Database 
    */
   process CDHIT_16S {
       tag "Remove 16S rRNA Redundancy"
       publishDir params.DBdir, mode: 'copy', overwrite: false

       /* slurm resources */
       executor 'slurm'    
       memory '100 GB'
       cpus 16
       time '12h'

       input:
       file(params.rRNA16S_db)

       output:
       file(params.rRNA16S_db_NR)

       script:
       """
       /mnt/research/qgg/software/cd-hit-4.8.1/cd-hit-est -i ${params.rRNA16S_db} -o ${params.rRNA16S_db_NR} -c 0.99 -G 0 -M 0 -T 0 -aS 0.9 -g 1 -d 0
       """
   }

   /*
    * Process 0E: Index ARG Database 
    */
   process INDEX_ARG_REF {
       tag "Index ARG Reference"
       publishDir params.DBdir, mode: 'copy', overwrite: false

       /* slurm resources */
       memory '100 GB'
       cpus 16
       time '12h'

       input:
       file(params.ARG_db_NR)

       output:
       tuple file(params.ARG_db_NR), path("${params.ARG_db_NR}.{amb,ann,bwt,pac,sa}")

       script:
       """
       bwa index -a bwtsw ${params.ARG_db_NR} 
       """
   }

   /*
    * Process 0F: Index 16S rRNA Database 
    */
   process INDEX_16S_REF {
       tag "Index 16S rRNA Reference"
       publishDir params.DBdir, mode: 'copy', overwrite: false

       /* slurm resources */
       memory '100 GB'
       cpus 16
       time '12h'

       input:
       file(params.rRNA16S_db_NR)

       output:
       tuple file(params.rRNA16S_db_NR), path("${params.rRNA16S_db_NR}.{amb,ann,bwt,pac,sa}")

       script:
       """
       bwa index -a bwtsw ${params.rRNA16S_db_NR} 
       """
   }

   /*
    * Process 0G: Download Test SRA Data for main.nf
    */
   process download_testSRA {
       tag "Test SRA Download: $params.test_SRA_ID"
       publishDir params.readsDir, mode: 'copy', overwrite: false

       input:
       tuple val(SRA_ID), val(fastqURLs)

       output:
       tuple file("TEST_SAMPLE_${SRA_ID}_1.fastq.gz"), file("TEST_SAMPLE_${SRA_ID}_2.fastq.gz")

       script:
       """
       wget ftp://ftp.sra.ebi.ac.uk${fastqURLs[0]}
       mv *_1.fastq.gz TEST_SAMPLE_${SRA_ID}_1.fastq.gz

       wget ftp://ftp.sra.ebi.ac.uk${fastqURLs[1]} 
       mv *_2.fastq.gz TEST_SAMPLE_${SRA_ID}_2.fastq.gz
       """ 
   }


   /*
   main.nf modules
    */

   /*
    * Process 1A: Run FastQC
    */
   process FASTQC {
       tag "${sample_id}" 
       publishDir params.QC_files, mode: 'copy', overwrite: true

       input:
       tuple val(sample_id), file(reads)

       output:
       path "${sample_id}_fastqc.log"

       script:
       """
       fastqc --outdir $params.QC_files ${reads[0]} ${reads[1]} > ${sample_id}_fastqc.log
       """
    }

   /*
    * Process 1B: Quality Control ~ Adapter Trimming, Quality Filtering, Read Trimming
    */
    process fastq_qualityControl {
       tag "QC on $sample_id"
       publishDir params.clean_reads, mode: 'copy', overwrite: true

       input:
       tuple val(sample_id), file(reads)


       output:
       tuple val("$sample_id"), path("${sample_id}_*.clean_*.fastq.gz")

       script:
       """
       /mnt/research/qgg/software/bbmap/bbduk.sh in1=${reads[0]} in2=${reads[1]} out1=${sample_id}_1.clean_1.fastq.gz out2=${sample_id}_2.clean_2.fastq.gz outm=${sample_id}_1.outm_1.fastq.gz outm2=${sample_id}_2.outm_2.fastq.gz outs=${sample_id}.outs.fastq.gz ref=/mnt/research/qgg/software/bbmap/resources/adapters.fa qtrim=r trimq=20 minlen=50
       """
   }

   /*
    * Process 1C: Run FastQC on Clean Reads
    */
   process FASTQC_on_clean {
       tag "${sample_id}"
       publishDir params.QC_files, mode: 'copy', overwrite: true

       input:
       tuple val(sample_id), file(reads)

       output:
       path "${sample_id}_clean_fastqc.log"

       script:
       """
       fastqc --outdir $params.QC_files ${reads[0]} ${reads[1]} > ${sample_id}_clean_fastqc.log
       """
    }

   /*
    * Process 2A: Align to ARG Reference: CARD
    */
   process map_ARG {
       tag "Align ${sample_id} -> ARG Database"
       publishDir params.outdir, mode: 'copy', overwrite: true

       input:
       tuple val(sample_id), path(clean_reads)

       output:
       tuple val("$sample_id"), path("${sample_id}_ARG.bam")


       script:
       """
       bwa mem -t 8 ${params.CARD_db} ${clean_reads[0]} ${clean_reads[1]} 2> ${sample_id}_ARG_bwaMem.err | samtools view -bS - > ${sample_id}_ARG.bam 2> ${sample_id}_ARG_samtoolsView.err
       """
   }

   /*
    * Process 2B: Align to 16S rRNA Reference: GreenGenes 
    */
   process map_16S {
       tag "Align ${sample_id} -> 16S Database"
       publishDir params.outdir, mode: 'copy', overwrite: true

       input:
       tuple val(sample_id), path(clean_reads)

       output:
       tuple val("$sample_id"), path("${sample_id}_16S.bam")

       script:
       """
       bwa mem -t 8 ${params.GG_db} ${clean_reads[0]} ${clean_reads[1]} 2> ${sample_id}_16S_bwaMem.err | samtools view -bS - > ${sample_id}_16S.bam 2> ${sample_id}_16S_samtoolsView.err
       """
   }

    /*
    * Process 3A: Gene Quantification ~ ARGs
    */
   process count_ARG {
    tag "Quantify ARG Abundance: ${sample_id}"
    publishDir params.outdir, mode: 'copy', overwrite: true

       input: 
       tuple val(sample_id), path(bam)

       output: 
       tuple val("$sample_id"), path("${sample_id}_absAbun_ARG")

       shell:
       '''
       join -t $'\t' -a 1 -a 2 -e - -o '0,1.2,2.2,1.3,2.3' <(samtools view -q 1 -f 0x40 !{bam} | cut -f 1,3,5 | sort -k 1,1) <(samtools view -q 1 -F 0x40 !{bam} | cut -f 1,3,5 | sort -k 1,1) | perl -wne 'chomp $_; @line = split /\t/, $_; if ($line[1] eq $line[2]) { print $line[0], "\t", $line[1], "\n"; } elsif ( $line[1] eq "-" ) { print $line[0], "\t", $line[2], "\n"; } elsif ( $line[2] eq "-" ) { print $line[0], "\t", $line[1], "\n"; } elsif ($line[3] >= $line[4]) { print $line[0], "\t", $line[1], "\n"; } else { print $line[0], "\t", $line[2], "\n"; }' | awk '{print $1"::::"$2}' | sort -k1,1 | bedtools groupby -g 1 -c 1 -o collapse,count | sed 's/::::/\t/' | sort -k1,1 | bedtools groupby -g 1 -c 2,4,4 -o collapse,collapse,sum | perl -wne 'chomp $_; @line = split /\t/, $_; @genes = split /,/, $line[1]; @counts = split /,/, $line[2]; for (my $i = 0; $i <= $#genes; $i++) { print $genes[$i], "\t", $counts[$i]/$line[3], "\n"; }' | sort -k1,1 | bedtools groupby -g 1 -c 2 -o sum > !{sample_id}_absAbun_ARG
       '''
   }

   /*
    * Process 3B: Gene Quantification ~ 16S rRNA
    */
   process count_16S {
    tag "Quantify 16S rRNA Abundance: ${sample_id}"
    publishDir params.outdir, mode: 'copy', overwrite: true

       input: 
       tuple val(sample_id), path(bam)

       output: 
       tuple val("$sample_id"), path("${sample_id}_absAbun_16S")

       shell:
       '''
       join -t $'\t' -a 1 -a 2 -e - -o '0,1.2,2.2,1.3,2.3' <(samtools view -q 1 -f 0x40 !{bam} | cut -f 1,3,5 | sort -k 1,1) <(samtools view -q 1 -F 0x40 !{bam} | cut -f 1,3,5 | sort -k 1,1) | perl -wne 'chomp $_; @line = split /\t/, $_; if ($line[1] eq $line[2]) { print $line[0], "\t", $line[1], "\n"; } elsif ( $line[1] eq "-" ) { print $line[0], "\t", $line[2], "\n"; } elsif ( $line[2] eq "-" ) { print $line[0], "\t", $line[1], "\n"; } elsif ($line[3] >= $line[4]) { print $line[0], "\t", $line[1], "\n"; } else { print $line[0], "\t", $line[2], "\n"; }' | awk '{print $1"::::"$2}' | sort -k1,1 | bedtools groupby -g 1 -c 1 -o collapse,count | sed 's/::::/\t/' | sort -k1,1 | bedtools groupby -g 1 -c 2,4,4 -o collapse,collapse,sum | perl -wne 'chomp $_; @line = split /\t/, $_; @genes = split /,/, $line[1]; @counts = split /,/, $line[2]; for (my $i = 0; $i <= $#genes; $i++) { print $genes[$i], "\t", $counts[$i]/$line[3], "\n"; }' | sort -k1,1 | bedtools groupby -g 1 -c 2 -o sum > !{sample_id}_absAbun_16S
       '''
   }

Workflow: `start.nf`

Source Code:

   #!/usr/bin/env nextflow

   /* 
    * A Nextflow pipeline to download default databases & test data for main animalARG workflow.
    */

   /*
    * The following pipeline parameters specify the reference genomes
    * and read pairs that will be donwnloaded for future use by animalARG. 
    */
   params.SRA_IDs = "None Identified by User"
   params.test_SRA_ID ='SRS6485642'
   params.ARG_db = "ARG_referenceDB.fasta"
   params.ARG_db_URL = "https://card.mcmaster.ca/download/0/broadstreet-v3.2.2.tar.bz2"
   params.ARG_db_NR = "ARG_referenceDB_NR99.fasta"
   params.rRNA16S_db = "16SrRNA_referenceDB.fasta.gz"
   params.rRNA16S_db_URL = "https://gg-sg-web.s3-us-west-2.amazonaws.com/downloads/greengenes_database/gg_12_10/gg_12_10.fasta.gz"
   params.rRNA16S_db_NR = "16SrRNA_referenceDB_NR99.fasta.gz"
   params.DBdir = "$baseDir/data/DB/" 
   params.readsDir = "$baseDir/data/reads/"


   /* 
    * Indicate important info to user when script is run on the command line. 
    */
   log.info """\
          Download Defaults   
   ===============================
   SRA IDs      : $params.SRA_IDs
   ARG Database : $params.ARG_db_URL
   16S Database : $params.rRNA16S_db_URL
   """

   /* 
    * Enable DSL 2 syntax
    */
   nextflow.enable.dsl = 2

    /* 
    * Import modules 
    */
   include {
       ARG_DB
       rRNA16S_DB
       CDHIT_ARG
       CDHIT_16S
       INDEX_ARG_REF
       INDEX_16S_REF
       download_testSRA} from './modules.nf'

   /* 
    * main bioinformatic pipeline 
    */
   workflow {

       /* Download data for tutorial and tests, this will only be used later on by main.nf  */
       /* First we retrieve SRA ID infromation, including the URL needed for donwloading
       this is accomplished via the built in nextflow function: fromSRA  */ 
       sra_test_ch = Channel
                       .fromSRA(params.test_SRA_ID)
                       .view()
       /* Once we have the SRA ID, and a tuple of the two URLs, they are sent to the sra_test_ch
       and then passed to the process download_testSRA to retreive the fastq files  */ 
       download_testSRA(sra_test_ch)

       /* Download ARG Reference file via the default URL provided, this file is then 
       passed to the ARG_DB_ch for downstream processing */
       ARG_DB_ch = ARG_DB (params.ARG_db_URL)
       ARG_DB_ch.view()

       /* Download 16S rRNA Reference file via the default URL provided, this file is then 
       passed to the rRNA16S_DB_ch for downstream processing */
       rRNA16S_DB_ch = rRNA16S_DB (params.rRNA16S_db_URL)
       rRNA16S_DB_ch.view()

       /* Cluster the sequences in the downloaded ARG Reference file, pass NonRedundant file
       to the ARG_DB_NR_ch for downstream processing  */ 
       ARG_DB_NR_ch = CDHIT_ARG (ARG_DB_ch)
       ARG_DB_NR_ch.view()

       /* Cluster the sequences in the downloaded 16S rRNA Reference file, pass NonRedundant file
       to the rRNA16S_DB_NR_ch for downstream processing  */ 
       rRNA16S_DB_NR_ch = CDHIT_16S (rRNA16S_DB_ch)
       rRNA16S_DB_NR_ch.view()

       /* Index the NonRedundant ARG Reference file, store in params.DBdir 
       for downstream utilization via main.nf */  
       ARG_ref_idx_ch = INDEX_ARG_REF (ARG_DB_NR_ch)
       ARG_ref_idx_ch.view()

       /* Index the NonRedundant 16S rRNA Reference file, store in params.DBdir 
       for downstream utilization via main.nf */  
       rRNA16S_ref_idx_ch = INDEX_16S_REF (rRNA16S_DB_NR_ch)
       rRNA16S_ref_idx_ch.view() 
   }

Expected Output:

InitialLogMessage&Command Start CompletedProcesses Start SuccesfulRun Start

Workflow: `main.nf`

Source Code:

   #!/usr/bin/env nextflow

   /* 
    * 'animal-ARG' - A Nextflow pipeline for estimating antimicrobial resistance gene abundance
    */

   /*
    * The following pipeline parameters specify the reference genomes
    * and read pairs and can be provided as command line options
    */
   params.SRA_IDs = "USER_Defined_List"
   params.CARD_db = "$baseDir/data/DB/ARG_referenceDB_NR99.fasta"
   params.GG_db = "$baseDir/data/DB/16SrRNA_referenceDB.fasta.gz"
   params.reads = "$baseDir/data/reads/*_{1,2}.fastq.gz"
   params.readDir = "$baseDir/data/reads/"
   params.outdir = "$baseDir/results"
   params.QC_files = "$baseDir/QCmetrics"
   params.clean_reads = "$baseDir/data/cleanReads/"


   /* 
    * Indicate important info to user when script is run on the command line. 
    */
   log.info """\
        A N I M A L  -  A R G     
   ===============================
   SRA IDs      : $params.SRA_IDs
   Reads        : $params.reads
   ARG Database : $params.CARD_db
   16S Database : $params.GG_db
   QC Analysis  : $params.QC_files
   Results      : $params.outdir
   """

   /* 
    * Enable DSL 2 syntax
    */
   nextflow.enable.dsl = 2

    /* 
    * Import modules 
    */
   include { 
     FASTQC;
     fastq_qualityControl;
     FASTQC_on_clean;
     map_ARG; 
     map_16S;
     count_ARG;
     count_16S} from './modules.nf'

   /* 
    * main bioinformatic pipeline 
    */
   workflow {
     /* this ouputs in tuple format of: [sampleID, [file1, file2]] */
     read_pairs_ch = channel.fromFilePairs( params.reads, checkIfExists: true ) 
     /* the checkIfExists function tests to make sure there is data present for futher analysis, 
     if not the process is aborted */
     /* this ouputs in tuple format of: [sampleID, [file1, file2]] */
     read_pairs_ch.view()

     // Part 1: Quality Control
     /* first lets peform FASTQC on the raw data for later reference by user */
     fastqc_ch = FASTQC( read_pairs_ch )
     fastqc_ch.view()

     /* perform adapter trimming and read quality filtering via BBDuk and the 
     fastq_qualityControl process, qc_ch contains clean reads which is passed to next steps */
     qc_ch = fastq_qualityControl( read_pairs_ch )
     qc_ch.view()

     /* following the execution of the fastq_qualityControl process we rerun FASTQC on the clean reads */
     fastqc_clean_ch = FASTQC_on_clean( qc_ch )
     fastqc_clean_ch.view()


     // Part 2: Gene Mapping
     /* clean reads found in qc_ch are aligned to the specified ARG reference */
     bam_ARG_ch = map_ARG( qc_ch ) 
     /* output file is .bam format, and contains results of mapping */
     bam_ARG_ch.view()

     /* clean reads found in qc_ch are aligned to the specified ARG reference */
     bam_16S_ch = map_16S( qc_ch ) 
     /* output file is .bam format, and contains results of mapping */
     bam_16S_ch.view()

     // Part 3: Gene Quantification
     /* .bam file for each set of reads is passed to quantification process via bam_ARG_ch */
     quant_ARG_ch = count_ARG ( bam_ARG_ch )
     /* output file contains the absolute abundances found in sample of interest for 
      each reference gene specified in the nonredundant ARG reference file */
     quant_ARG_ch.view()

     /* .bam file for each set of reads is passed to quantification process via bam_16S_ch */
     quant_16S_ch = count_16S ( bam_16S_ch )
     /* output file contains the absolute abundances found in sample of interest for 
      each reference gene specified in the nonredundant 16S reference file */
     quant_16S_ch.view()
   }

Expected Output:

InitialLogMessage&Command Main CompletedProcesses Main SuccessfulRun Main

References

Nextflow Documentation

Processes: modules.nf

Source Code:

Workflow: start.nf

Source Code:

Expected Output:

Workflow: main.nf

Source Code:

Expected Output:

Processes: `modules.nf`

Workflow: `start.nf`

Workflow: `main.nf`