diff --git a/README.md b/README.md index 9f4461948a0830709fc4cecefeeca11c385b516c..33b4fbfd8e747dc4ea1dde4b078835c50155ac1b 100644 --- a/README.md +++ b/README.md @@ -4,78 +4,20 @@ [](https://forgemia.inra.fr/get-nextflow-ngl-bi/template-nf//-/commits/master) -# Ce repository est un template pour les workflows Get - -Ce workflow et ses différentes configurations permettent : -- d'executer un pipeline a partir d'un fichier samples.csv -- d'utiliser une image singularity ou conda ou path (cf profils) -- d'executer un multiqc -- de tracer les versions des logiciels -- d'envoyer un email à la fin du pipeline --email toto@fai.fr -- de générer automatiquement une image singularity et de la mettre a disposition dans le registry de la forge. - -## Comment utiliser ce répository ? - -Cloner le repo -``` -git clone git@forgemia.inra.fr:get-nextflow-ngl-bi/template-nf.git -``` - -Voici la liste des fichiers a récupérer avec leur utilité : -- `asset` code pour email et config de multiQC -- `conf` configurations utilisées dans `nextflow.config` - - base : conf générale - - path : si profile utilisé est --multipath ajouter un block par process ayant des dépendances - - test : chaque pipeline devra avoir un profil de test pour tester les pipelines - - genomes : devra peut-etre etre centralisé ailleurs pour avoir un seul fichier contenant les genomes utilisés par la pf. - -- `doc/output.md` : ce fichier devra etre copié et modifié avec la description des outputs du pipeline. Ce fichier est ensuite converti en html dans le repertoires de resultats du pipelines. - -- `.gitlab-ci.yml` si vous souhaitez avoir la génération automatique de l'image singularity à partir des fichiers `Singularityfile` et `environment.yml` mettez ce fichier à la racine de votre projet. L'image sera ensuite recupérable avec la commande suivante : -``` -singularity pull template-nf.sif oras://registry.forgemia.inra.fr/get-nextflow-ngl-bi/template-nf/template-nf:latest -``` - -- les fichiers `CHANGELOG.md`, `LICENCE`, `README.md` a utiliser et modifier - -- `main.nf` : le pipeline -- `nextflow.config` : la conf générale du pipeline -- pour le reproductibilité : `Singularityfile` et `environment.yml` (si besoin en plus: `Dockerfile`) - -## Et apres ? -- nomenclature: les channels doivent etre nommée comme suis: ch_FILE1_for_PROCESS_DESTINATION -- mettre en place des données de tests -- lorsque l'on code un process : - - utiliser les labels (pour la memoire, cpu, temps) définis dans base.config - - ajouter les logiciels utilisés dans get_software_versions -- documenter le quick start ci-dessous et supprimer le paragraphe 'Ce repository est un template pour les workflows Get' -- completer le `doc/output.md` et le `doc/usage.md` -- tagger un pipeline dès que les fonctionnalités attendues sont codées - - - -> La documentation suivante est a modifier et a garder. La precedente est a supprimer. - -## Introduction - -The pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool to run tasks across multiple compute infrastructures in a very portable manner. It comes with docker and singularity containers making installation trivial and results highly reproducible. - -## Quick Start - -i. Install [`nextflow`](https://nf-co.re/usage/installation) - -ii. Install one of [`singularity`](https://www.sylabs.io/guides/3.0/user-guide/) or [`conda`](https://conda.io/miniconda.html) - -iii. 
Clone the pipeline and download the singularity pipeline
-
-```bash
-git clone git@forgemia.inra.fr:get-nextflow-ngl-bi/template-nf.git
-cd template-nf
-singularity pull template-nf.sif oras://registry.forgemia.inra.fr/get-nextflow-ngl-bi/template-nf/template-nf:latest
-```
-iv. Run the pipeline
-
-```bash
-nextflow run pathto/template-nf/main.nf -profile test,singularity
-```
-
+# The wf-illumina-nf pipeline
+This pipeline performs the QC of data from Illumina sequencers.
+
+## How to use it?
+The pipeline starts after the NGS_Illumina pipeline, which ends with the demultiplexing of the raw data. Five elements are needed in the demultiplexing output directory:
+- one folder of FASTQ files per project
+- the SampleSheet.csv
+- the nextflow outputs folder
+- the params.config file
+- the fastqScreen configuration file
+
+Examples of the params.config and fastqScreen configuration files are available in the assets folder.
+
+A basic command line to launch the pipeline (from the nextflow folder) is:
+```bash
+sbatch -J nf-illumina_BHNKY7DRX2_1 -p wflowq -t 3-00 --mem 5GB --wrap="module load bioinfo/Nextflow-v21.04.1; cd /home/sbsuser/work/data/NovaSeq/230116_A00318_0372_BHNKY7DRX2_Lane1_1673933427_10x/nextflow; nextflow run /work/sbsuser/test/jules/VisualStudioSources/wf-illumina-nf/main.nf -profile prod -ansi-log false"
+```
\ No newline at end of file
diff --git a/assets/fastq_screen.conf_example b/assets/fastq_screen.conf_example
new file mode 100644
index 0000000000000000000000000000000000000000..78180aedd6af036d755ac20046b302818472966e
--- /dev/null
+++ b/assets/fastq_screen.conf_example
@@ -0,0 +1,64 @@
+# This is an example configuration file for FastQ Screen
+
+############################
+## Bowtie, Bowtie 2 or BWA #
+############################
+## If the Bowtie, Bowtie 2 or BWA binary is not in your PATH, you can set
+## this value to tell the program where to find your chosen aligner. Uncomment
+## the relevant line below and set the appropriate location. Please note,
+## this path should INCLUDE the executable filename.
+
+#BOWTIE /usr/local/bin/bowtie/bowtie
+#BOWTIE2 /usr/local/bioinfo/src/bowtie/bowtie2-2.4.4-linux-x86_64/bowtie2
+BWA /usr/local/bioinfo/src/bwa/bwa-0.7.15/bwa
+
+############################################
+## Bismark (for bisulfite sequencing only) #
+############################################
+## If the Bismark binary is not in your PATH then you can set this value to
+## tell the program where to find it. Uncomment the line below and set the
+## appropriate location. Please note, this path should INCLUDE the executable
+## filename.
+
+#BISMARK /usr/local/bin/bismark/bismark
+
+############
+## Threads #
+############
+## Genome aligners can be made to run across multiple CPU cores to speed up
+## searches. Set this value to the number of cores you want for mapping reads.
+
+THREADS 8
+
+##############
+## DATABASES #
+##############
+## This section enables you to configure multiple genomes databases (aligner index
+## files) to search against in your screen. For each genome you need to provide a
+## database name (which can't contain spaces) and the location of the aligner index
+## files.
+##
+## The path to the index files SHOULD INCLUDE THE BASENAME of the index, e.g:
+## /data/public/Genomes/Human_Bowtie/GRCh37/Homo_sapiens.GRCh37
+## Thus, the index files (Homo_sapiens.GRCh37.1.bt2, Homo_sapiens.GRCh37.2.bt2, etc.)
+## are found in a folder named 'GRCh37'.
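+##
+## For instance, a DATABASE entry matching the example basename above would be
+## written as follows (illustrative path only, not one of this platform's banks):
+## DATABASE	Human	/data/public/Genomes/Human_Bowtie/GRCh37/Homo_sapiens.GRCh37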
+##
+## If, for example, the Bowtie, Bowtie2 and BWA indices of a given genome reside in
+## the SAME FOLDER, a SINGLE path may be provided for ALL of the indices. The index
+## used will be the one compatible with the chosen aligner (as specified using the
+## --aligner flag).
+##
+## The entries shown below are only suggested examples, you can add as many DATABASE
+## sections as required, and you can comment out or remove as many of the existing
+## entries as desired. We suggest including genomes and sequences that may be sources
+## of contamination either because they were run on your sequencer previously, or may
+## have contaminated your sample during the library preparation step.
+##
+# Genome of E. coli
+DATABASE E.coli /work/bank/bwadb/Escherichia_coli_FRIK2069
+
+# Sequence of PhiX
+DATABASE PhiX /work/bank/bwadb/phi.fa
+
+# Genome of yeast
+DATABASE Yeast /work/bank/bwadb/yeast.nt
diff --git a/assets/multiqc_config.yaml b/assets/multiqc_config.yaml
index d7106f397827622e939664759042a948915ee06b..528c27ced03f492edc020287268965ec58831569 100644
--- a/assets/multiqc_config.yaml
+++ b/assets/multiqc_config.yaml
@@ -1,11 +1,79 @@
+## Report general information
+# Change with option --title in command line in process
+title: "My Title"
+#subtitle: "A subtitle to go underneath in grey"
+intro_text: "This MultiQC report summarises the Quality Control analysis results."
+
 report_comment: >
-    This report has been generated by the <a href="https://forgemia.inra.fr/get-nextflow-ngl-bi/template-nf" target="_blank">nf-core/template</a>
+    This report has been generated by the <a href="https://forgemia.inra.fr/get-nextflow-ngl-bi/wf-illumina-nf" target="_blank">wf-illumina-nf</a>
     analysis pipeline. For information about how to interpret these results, please see the
-    <a href="https://forgemia.inra.fr/get-nextflow-ngl-bi/template-nf" target="_blank">documentation</a>.
+    <a href="https://forgemia.inra.fr/get-nextflow-ngl-bi/wf-illumina-nf" target="_blank">documentation</a>.
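+# NB: the title above is usually overridden at run time, e.g. with a
+# hypothetical invocation such as: multiqc . --title "BHNKY7DRX2 lane 1 QC"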
+
+show_analysis_paths: False
+show_analysis_time: False
+
+## Number formatting
+thousandsSep_format: " "
+
+## Sample name formatting
+extra_fn_clean_trim:
+    - "_filtered"
+    - "_unmerged"
+    - "_unmerged_stats"
+    - "_flagstat"
+    - "_subset"
+    - "_screen"
+
+## Plot config
+export_plots: true
+plots_force_interactive: true
+
+## Module config
 report_section_order:
    software_versions:
        order: -1000
    summary:
        order: -1001
+
+module_order:
+    - fastqc:
+        name: "ReadsStats"
+        #info: "Analysis performed with FastQC, which is a quality control tool for high throughput sequence data, written by Simon Andrews at the Babraham Institute in Cambridge"
+        href: "http://www.bioinformatics.babraham.ac.uk/projects/fastqc/"
+        target: "FastQC"
+    - qualimap:
+        name: "AlignmentStat"
+        #info: "Analysis performed with QualiMap"
+        href: "http://qualimap.bioinfo.cipf.es/"
+        target: "QualiMap"
+    - samtools:
+    - fastp:
+        name: "Duplicates"
+        href: "https://github.com/OpenGene/fastp"
+        target: "Fastp"
+    - fastq_screen:
+        name: "ContaminationSearch"
+        #info: "This section shows the module with different files"
+        target: "FastQ-Screen"
-export_plots: true
+
+# Pattern
+sp:
+    fastqc:
+        fn: "*.zip"
+    fastq_screen:
+        fn: '*_screen.txt'
+
+
+custom_logo: "./get_logo.png"
+custom_logo_url: "https://get.genotoul.fr/"
+custom_logo_title: "GeT-GenoToul"
+
+# FastQC
+#top_modules: # Keep FastQC on top of the report
+#    - "fastqc"
+
+
+# FastQC-Screen
+fastqscreen_simpleplot: true
+
+# Qualimap
diff --git a/assets/params.config_example b/assets/params.config_example
new file mode 100644
index 0000000000000000000000000000000000000000..0bd525efeaddf04e134582ed908a180048c59615
--- /dev/null
+++ b/assets/params.config_example
@@ -0,0 +1,19 @@
+params {
+    inputdir = "/home/sbsuser/work/data/NovaSeq/230116_A00318_0372_BHNKY7DRX2_Lane1_1673933427_10x"
+    samplesheet = inputdir+'/SampleSheet.csv'
+    project = 'MAGICs'
+    data = inputdir+'/'+project
+    isMultiplex = true
+    dataNature = 'DNA'
+    //pairedEnd = true
+    splitReads = true
+    referenceGenome = ''
+    addBankForConta = ''
+    runName = 'Test_10X'
+    sequencer = 'NovaSeq'
+    run_date = '230116'
+    machineID = 'NOVA'
+    fcID = 'BHNKY7DRX2'
+    lane = '1'
+    demuxUniqueness = '1673933427'
+}
\ No newline at end of file
diff --git a/bin/DTM/circlize_v2.R b/bin/DTM/circlize_v2.R
new file mode 100644
index 0000000000000000000000000000000000000000..f4c9d69ab00ff4d294e04c3a389a027112bdd013
--- /dev/null
+++ b/bin/DTM/circlize_v2.R
@@ -0,0 +1,114 @@
+#!/usr/bin/env Rscript
+
+#install.packages("circlize", repos = "http://cran.us.r-project.org")
+#BiocManager::install("rtracklayer")
+#BiocManager::install("ComplexHeatmap")
+library(rtracklayer)
+library(circlize)
+library(ComplexHeatmap)
+
+# Args
+args <- commandArgs(trailingOnly=TRUE)
+# test if there are exactly two arguments: if not, return an error
+if (length(args) != 2) {
+  stop("Exactly two arguments must be supplied in the following order:
+  \n 1. an integer for chunk_size /!\\ too small (10) will take forever, too big (1000000) will cause clustering err: 10000 or 100000 recommended
+  \n 2. followed by all input.bedgraph files separated by commas and NO spaces
+  \n ex: circlize_v2.R 100000 filtered_Sdomesticus6.bedgraph,filtered_Sdomesticus4.bedgraph", call.=FALSE)
+} else if (length(args) == 2) {
+  chunk_size <- as.numeric(args[1])
+  list_bedgraphs <- strsplit(args[2], ",")[[1]] # split on a bare comma: the file list is given with no spaces (see usage above)
+}
+
+# Initialize empty matrix to plot. Each column will hold chunked data from one sample.
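+# For illustration (values made up, assuming two samples and chunk_size = 100000),
+# cov_matrix ends up with one column per bedgraph and one row per genomic bin:
+#       filtered_S6.bedgraph  filtered_S4.bedgraph
+# bin1                  12.3                  10.1
+# bin2                   0.8                   1.4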
+cov_matrix <- c()
+loop <- 1 # loop counter
+
+for (bedgraph in list_bedgraphs){
+  # Import bedgraph generated with -bga
+  print(paste0("Loading bedgraph ", bedgraph))
+  BedFile <- rtracklayer::import(bedgraph, format = "bed")
+  print(paste0("Loaded. Binning data by ", chunk_size, "bp intervals"))
+
+  # Extract coverage values and weigh by width
+  coverage_points <- as.numeric(BedFile@elementMetadata@listData[["name"]])*as.numeric(BedFile@ranges@width)
+
+  # Reduce data
+  pos_start=BedFile@ranges@start # extract start positions from bed object
+  chr <- 0 # chromosome counter
+  c <- 0 # chunk counter
+  # chunk_size=10000 #10k, 100k... defined in args
+  chunks <- c() # position of [chunk_size]th element in coverage_points vector
+  chr_factors <- c() # reduced vector of chromosomes to use as split factors (same size as chunks)
+
+  for (i in 1:length(pos_start)){
+    val <- pos_start[i]
+    if(val == 1){
+      c <- 0 # reset count
+      chr <- chr+1 # next chromosome
+    }
+    if (val > chunk_size * c){
+      c <- c+1 # next chunk (10k, 20k, 30k...)
+      chr_factors <- c(chr_factors, toString(BedFile@seqnames@values[chr])) # save corresponding chr
+      chunks <- c(chunks, i-1) # save coordinate
+    }
+  }
+
+  # Calculate the average of each chunk
+  values_avg <- c()
+  for (i in 1:(length(chunks)-1)){ # i starts at 1
+    start <- chunks[i]+1
+    x <- i+1
+    end <- chunks[x]
+    diff <- (pos_start[end]-pos_start[start])+1
+    if (diff==0){ # If only one line in chunk
+      diff = as.numeric(BedFile@ranges@width)[start]
+    }
+    values_avg <- c(values_avg, sum(coverage_points[start:end])/diff)
+  }
+  # Example: verify second value in bash with
+  #head -n 74952 bga_zeros_scaled_doublefiltered_Sdomesticus6_S6_L001_R1_001_subset_unmerged.bedgraph | tail -n 35055 | awk -F'\t' '{ sum += $4*($3-$2); n++ } END { if (n > 0) print sum / n; }'
+
+  # Append to matrix
+  cov_matrix <- cbind(cov_matrix, values_avg)
+  colnames(cov_matrix)[loop] <- basename(bedgraph)
+  loop <- loop+1
+}
+
+# Order of samples
+print(paste0("Samples plot order (ext->int) ", colnames(cov_matrix)))
+
+# Plot
+print("Generating graph")
+bed_min <- 0
+bed_med <- median(cov_matrix)
+#nintyninth_percentile <- floor(length(values_avg)*0.01) # Index of top 1 percent of sorted points
+#bed_max <- head(sort(cov_matrix,decreasing=TRUE),n=nintyninth_percentile)[nintyninth_percentile] # largest of 99% to avoid outliers
+bed_max <- max(cov_matrix) # scale the colour ramp on all samples, not just the last bedgraph processed
+col_fun <- colorRamp2(c(bed_min, bed_med, bed_max), c("blue","gray85", "red"))
+split <- factor(chr_factors, levels = BedFile@seqnames@values)
+
+# Reduce track width (default=0.2) if multiple samples
+circos.clear()
+circos.par(RESET = TRUE)
+if((ncol(cov_matrix)>1) & (ncol(cov_matrix)<=4)){
+  circos.par("track.height" = 0.1)
+} else if(ncol(cov_matrix)>4){
+  circos.par("track.height" = 0.05)
+}
+
+filename <- paste(basename(bedgraph), ".jpeg", sep="")
+jpeg(file=filename, units="in", width=5, height=5, res=150, pointsize = 8)
+
+# Very important that we do not cluster, so as not to change the order!
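+# (With cluster = TRUE, each circos.heatmap() call would reorder the bins of its
+# own track independently, so tracks would no longer line up across samples.)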
+for(i in 1:ncol(cov_matrix)) { # for-loop over columns (samples)
+  circos.heatmap(cov_matrix[,i], col=col_fun, split=split, cluster = FALSE, show.sector.labels = TRUE)
+}
+circos.clear()
+
+legend_title <- paste("Genome coverage (normalized RMP)\n", chunk_size, "bp resolution\n")
+lgd_heat <- Legend(title = legend_title, col_fun = col_fun,
+                   labels_gp = gpar(fontsize = 6), title_gp = gpar(fontsize = 8), grid_width = unit(0.25, "cm"))
+grid.draw(lgd_heat)
+
+dev.off()
diff --git a/bin/DTM/make_bedgraph.sh b/bin/DTM/make_bedgraph.sh
new file mode 100644
index 0000000000000000000000000000000000000000..1eab8918d443762ca0ffb71ce6a68d5b2e160ed8
--- /dev/null
+++ b/bin/DTM/make_bedgraph.sh
@@ -0,0 +1,100 @@
+#!/bin/bash
+#SBATCH --mail-user=jules.sabban@inrae.fr
+#SBATCH --mail-type=BEGIN,END,FAIL
+#SBATCH -p wflowq
+#SBATCH -t 4-00
+#SBATCH --mem-per-cpu=12G
+#SBATCH -e %x_%j.err
+#SBATCH -o %x_%j.log
+
+#### USAGE ###
+<< usageMessage
+USAGE : sbatch -J make_bedgraph_bacterium --array=1-6 make_bedgraph.sh <bam_folder> <names_of_chromosomes_file> <chrom_pattern_to_remove>
+EXAMPLE : sbatch -J make_bedgraph_pic --array=1-6 make_bedgraph.sh ../samtools ../chrom_names "JANXI\|CM"
+
+<chrom_pattern_to_remove> is mandatory, but can be an empty string
+usageMessage
+
+#### ARGUMENTS ####
+I_DIR=$1 # path to samtools outputs
+I_NAMES=$2 # path to chrom_names file
+R_PATTERN=$3 # chr pattern to remove from bedgraph file
+
+#### MODULES ####
+module load bioinfo/samtools-1.16.1
+module load bioinfo/bedtools-2.27.1
+
+
+
+replace_chr_names() {
+	# replace chr names
+	echo -e "Replace chr names"
+	SAMTOOLS_CMD="samtools view -H ${BAM_PATH} |"
+	while read LINE
+	do
+		read -r OLD NEW <<< $(echo -e $LINE)
+		SAMTOOLS_CMD+=" sed -e 's/SN:${OLD}/SN:${NEW}/' |"
+	done < $I_NAMES
+
+	SAMTOOLS_CMD+=" samtools reheader - $BAM_PATH > filtered_${S_NAME}.bam"
+	# note the - is on purpose, -c adds chr in front
+	sh -c "$SAMTOOLS_CMD"
+}
+
+
+
+#samtools index chr_${S_NAME}.bam
+#cp chr_${S_NAME}.bam filtered_${S_NAME}.bam
+
+# filter out unplaced contigs
+#samtools view chr_${S_NAME}.bam `seq 1 18` X Y -b > filtered_${S_NAME}.bam
+
+index_bam(){
+	echo -e "Indexing filtered BAM"
+	samtools index filtered_${S_NAME}.bam
+}
+
+
+# no longer need intermediary chr renamed bam/bai
+#rm chr_${S_NAME}.bam chr_${S_NAME}.bam.bai
+
+make_bedgraph(){
+	# Scale factor reads per million (of total reads or chr mapped reads)
+	scale=`bc <<< "scale=6;1000000/$(samtools view -f 0 -c filtered_${S_NAME}.bam)"`
+	#0.000808
+	echo -e "Scaling factor ${scale}. On to bedgraph generation"
+
+	# bedgraph
+	bedtools genomecov -ibam filtered_${S_NAME}.bam -bga -scale ${scale} > zeros_scaled_${S_NAME}.bedgraph
+}
+
+remove_unwanted_scaffold(){
+	# Even though bam was filtered, still have 0 values for unplaced scaffolds...remove non numeric or X/Y chromosomes
+	if [[ ! 
-z $R_PATTERN ]] + then + grep -v $R_PATTERN zeros_scaled_${S_NAME}.bedgraph > zeros_scaled_filtered_${S_NAME}.bedgraph + rm zeros_scaled_${S_NAME}.bedgraph + else + mv zeros_scaled_${S_NAME}.bedgraph zeros_scaled_filtered_${S_NAME}.bedgraph + fi +} + + + +main() { + BAM=$(find $I_DIR -type f -name '*R1*unmerged.bam' -execdir basename '{}' ';'|sed -n ${SLURM_ARRAY_TASK_ID}p) + echo -e "Traitement de ${BAM}" + BAM_PATH="${I_DIR}/${BAM}" + + S_NAME=$(basename $BAM .bam) + + replace_chr_names + + index_bam + + make_bedgraph + + remove_unwanted_scaffold +} + +main \ No newline at end of file diff --git a/bin/alignementStatTreatment.pl b/bin/alignementStatTreatment.pl new file mode 100755 index 0000000000000000000000000000000000000000..54b7f880df3b0d3e4508fb0c9978e7dd968391b0 --- /dev/null +++ b/bin/alignementStatTreatment.pl @@ -0,0 +1,202 @@ +#!/usr/bin/perl -w +binmode STDIN, ':encoding(UTF-8)'; +binmode STDOUT, ':encoding(UTF-8)'; +binmode STDERR, ':encoding(UTF-8)'; + +=head1 NAME + + alignmentStatTreatment.pl + +=head1 DESCRIPTION + + Lit les fichiers de sortie d'alignement et ajoute les informations extraites au treatment NGL-Bi + +=head1 SYNOPSIS + + alignmentStatTreatment.pl --file <path> + +=head1 OPTIONS + + --file=s : path to a stat file + +=head1 EXEMPLES + + perl alignmentStatTreatment.pl --file /path/to/my/file.stat + +=head1 AUTHOR + + Jules Sabban pour Plateforme genomique Toulouse (get-plage.bioinfo@genotoul.fr) + +=cut + +################################################################### +# +# LIBRAIRIES +# +################################################################### +use strict; +use Getopt::Long; +use Log::Log4perl; + +################################################################## +# +# INITIALISATION +# +################################################################## +Log::Log4perl -> init('/home/sbsuser/save/scripts-ngs/NGL-Bi_client_Current/IG/SystemeInteractionNGL-Bi/conf/log4perl.conf'); +my $logger = Log::Log4perl->get_logger("MyLog"); + +my $file = ""; + +GetOptions( + "file=s" => \$file, # path to statistic file +); + +if ($file eq "") { + $logger -> warn("USAGE : alignmentStatTreatment.pl --file <STAT_FILE>\n"); + $logger -> fatal("At least one argument is missing !") and die; +} + +################################################################## +# +# MAIN +# +################################################################## +MAIN: +{ + # Initialisation du hash qui contiendra les info a inserer dans NGL-Bi + my %TreatmentProperties = (); + + # Définitions des regex + my $total_regex = '(\d+) .*in total'; # total regexp + my $qcfailure_regex = '(\d+ \+ (\d+) in total)|((\d+) QC failure)'; # qcfailure regexp + my $duplicates_regex = '(\d+) .*duplicates'; # duplicates regexp + my $mapped_regex = '(\d+) .*mapped \(([^:]*).*\)'; # mapped regexp + my $paired_regex = '(\d+) .*paired in sequencing'; # paired regexp + my $read1_regex = '(\d+) .*read1'; # read1 regexp + my $read2_regex = '(\d+) .*read2'; # read2 regexp + my $matemapped_regex = '(\d+) .*with itself and mate mapped'; # matemapped regexp + my $properlypaired_regex = '(\d+) .*properly paired \(([^:]*).*\)'; # properlypaired regexp + my $singletons_regex = '(\d+) .*singletons \(([^:]*).*\)'; # singletons regexp + my $mapch1_regex = '(\d+) .*with mate mapped to a different chr'; # mapch1 regexp + my $supplementary_regex = '(\d+).*supplementary'; # supplementary regexp + + # Lecture du fichier de statistiques + open my $openFile, '<', $file; $? 
and $logger -> fatal("Impossible d'ouvrir le fichier $file") and die; + chomp( my @lines = <$openFile> ); + close $openFile; + + foreach my $line (@lines) { + #$logger -> info("Evaluation de la ligne : ". $line); + if ($line =~ qr/$total_regex/) { + $TreatmentProperties{"total"} = $1; + $logger -> info("total_regex a ete trouvee et vaut : ". $TreatmentProperties{"total"}); + } + if ($line =~ qr/$qcfailure_regex/) { + if ($2 ne '') { + $TreatmentProperties{"qcfailure"} = $2; + } else { + $TreatmentProperties{"qcfailure"} = $4; + } + + $logger -> info("qcfailure a ete trouvee et vaut : ". $TreatmentProperties{"qcfailure"}); + } + if ($line =~ qr/$duplicates_regex/) { + $TreatmentProperties{"duplicates"} = $1; + $logger -> info("duplicates a ete trouvee et vaut : ". $TreatmentProperties{"duplicates"}); + } + if ($line =~ qr/$mapped_regex/) { + if (index($line,'primary') != -1) { + $TreatmentProperties{"primary_mapped_nb"} = $1; + $TreatmentProperties{"primary_mapped_perc"} = $2; + $logger -> info("primary_mapped_nb a ete trouvee et vaut : ". $TreatmentProperties{"primary_mapped_nb"}); + $logger -> info("primary_mapped_perc a ete trouvee et vaut : ". $TreatmentProperties{"primary_mapped_perc"}); + } else { + $TreatmentProperties{"mapped_nb"} = $1; + $TreatmentProperties{"mapped_perc"} = $2; + $logger -> info("mapped_nb a ete trouvee et vaut : ". $TreatmentProperties{"mapped_nb"}); + $logger -> info("mapped_perc a ete trouvee et vaut : ". $TreatmentProperties{"mapped_perc"}); + } + } + if ($line =~ qr/$paired_regex/) { + $TreatmentProperties{"paired"} = $1; + $logger -> info("paired a ete trouvee et vaut : ". $TreatmentProperties{"paired"}); + } + if ($line =~ qr/$read1_regex/) { + $TreatmentProperties{"read1"} = $1; + $logger -> info("read1 a ete trouvee et vaut : ". $TreatmentProperties{"read1"}); + } + if ($line =~ qr/$read2_regex/) { + $TreatmentProperties{"read2"} = $1; + $logger -> info("read2 a ete trouvee et vaut : ". $TreatmentProperties{"read2"}); + } + if ($line =~ qr/$matemapped_regex/) { + $TreatmentProperties{"matemapped"} = $1; + $logger -> info("matemapped a ete trouvee et vaut : ". $TreatmentProperties{"matemapped"}); + } + if ($line =~ qr/$properlypaired_regex/) { + $TreatmentProperties{"properlypaired_nb"} = $1; + $TreatmentProperties{"properlypaired_perc"} = $2; + $logger -> info("properlypaired_nb a ete trouvee et vaut : ". $TreatmentProperties{"properlypaired_nb"}); + $logger -> info("properlypaired_perc a ete trouvee et vaut : ". $TreatmentProperties{"properlypaired_perc"}); + } + if ($line =~ qr/$singletons_regex/) { + $TreatmentProperties{"singletons_nb"} = $1; + $TreatmentProperties{"singletons_perc"} = $2; + $logger -> info("singletons_nb a ete trouvee et vaut : ". $TreatmentProperties{"singletons_nb"}); + $logger -> info("singletons_perc a ete trouvee et vaut : ". $TreatmentProperties{"singletons_perc"}); + } + if ($line =~ qr/$mapch1_regex/ && index($line,'mapQ') == -1) { + $TreatmentProperties{"mapch1"} = $1; + $logger -> info("mapch1 a ete trouvee et vaut : ". $TreatmentProperties{"mapch1"}); + } + if ($line =~ qr/$supplementary_regex/) { + $TreatmentProperties{"supplementary"} = $1; + $logger -> info("supplementary a ete trouvee et vaut : ". 
$TreatmentProperties{"supplementary"}); + } + } + + + ## Insertion du treatment + ## TODO + +} +$logger -> info("Fin normale du script."); + +################################################################## +# +# FUNCTIONS +# +################################################################## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/bin/createNGLBiReadSets.pl b/bin/createNGLBiReadSets.pl new file mode 100755 index 0000000000000000000000000000000000000000..e5cdf2e378a6637bf0c5ef4fd97470eeefe2fcca --- /dev/null +++ b/bin/createNGLBiReadSets.pl @@ -0,0 +1,127 @@ +#!/usr/bin/perl -w +binmode STDIN, ':encoding(UTF-8)'; +binmode STDOUT, ':encoding(UTF-8)'; +binmode STDERR, ':encoding(UTF-8)'; + +=head1 NAME + + createNGLBiReadSets.pl + +=head1 DESCRIPTION + + Performe readSets creation on NGL-Bi + +=head1 SYNOPSIS + + createNGLBiReadSets.pl --infoFile <path> --env_ngl_bi <ENV> + +=head1 OPTIONS + + --infoFile=s : path to the info file + --env_ngl_bi=s : environment varible of ngl-bi + +=head1 EXEMPLES + + perl createNGLBiReadSets.pl --infoFile <path> --env_ngl_bi <ENV> + +=head1 AUTHOR + + Jules Sabban pour Plateforme genomique Toulouse (get-plage.bioinfo@genotoul.fr) + +=cut + +################################################################### +# +# LIBRAIRIES +# +################################################################### +use strict; +use Getopt::Long; +use Log::Log4perl qw(:easy);; + +################################################################## +# +# INITIALISATION +# +################################################################## +Log::Log4perl -> easy_init( { level => $TRACE, + utf8 => 1, + layout => '[%d][%p>createNGLBiReadSets.pl:L%L] %m%n' } ); + +my $logger = Log::Log4perl -> get_logger(); + +my $infoFile=""; +my $env_ngl_bi = ""; + +GetOptions ('infoFile=s' => \$infoFile, + "env_ngl_bi=s" => \$env_ngl_bi, # environnement path of NGL-Bi +); + +if ($env_ngl_bi eq "" || $infoFile eq "" ) { + $logger -> logdie("USAGE : createNGLBiReadSets.pl --infoFile <File> --env_ngl_bi <ENV>\n"); +} + +my $experimentName=""; +my $runName=""; +my $laneNumber=""; +my $script_path="/save/sbsuser/scripts-ngs/NGL-Bi_client_Current/GeT/perl"; # Répertoire des scripts de l'API NGL + +################################################################## +# +# NGL-Bi ENVIRONMENT +# +################################################################## + +$ENV{APIPERL}=$env_ngl_bi; +$ENV{CONFFILE}=$env_ngl_bi."conf/prod_illumina_qc.conf"; +$logger = Log::Log4perl -> get_logger('loadConfFile'); +unless ($ENV{CONFFILE}) { + $logger -> logdie("$0 : Database configuration file not defined ! Initialize 'CONFFILE' with configuration file path in your environment"); +} +my $dbconf_file = $ENV{CONFFILE}; +unless (-f $dbconf_file) { + $logger -> logdie("$0 : Database configuration file does not exist : $dbconf_file. 
It is necessary to continue.");
+}
+open my $handle, '<', $dbconf_file;
+chomp ( my @lines = <$handle> );
+close $handle;
+foreach my $line (@lines) {
+	$line =~ s/#.*//o;
+	unless ($line) {next;}
+	if ($line =~ /(.*)=(.*)/o) {
+		my $key = $1;
+		my $value = $2;
+		$key =~ s/^\s*//o;
+		$key =~ s/\s*$//o;
+		$value =~ s/^\s*//o;
+		$value =~ s/\s*$//o;
+		$ENV{$key} = $value;
+	} else {
+		$logger -> logdie("$0 : Can't load variable from database configuration file $dbconf_file in line : '$_'");
+	}
+}
+
+unshift @INC, $env_ngl_bi."Common_tools/src/perl/lib";
+unshift @INC, $env_ngl_bi."DB_tools/src/perl/lib";
+
+require illumina;
+require json;
+$logger -> info("\tVariables d'environnement pour NGL-Bi chargées.");
+
+##################################################################
+#
+# INFO FILE READING
+#
+##################################################################
+$experimentName=`grep "ExperimentName" $infoFile | cut -d';' -f2` or $logger -> logdie("[Erreur] grep ExperimentName impossible : $!");
+$runName=`grep "NGLBiRunName" $infoFile | cut -d';' -f2` or $logger -> logdie("[Erreur] grep NGLBiRunName impossible : $!");
+$laneNumber=`grep "LaneNumber" $infoFile | cut -d';' -f2` or $logger -> logdie("[Erreur] grep LaneNumber impossible : $!");
+
+chomp($experimentName);
+chomp($runName);
+chomp($laneNumber);
+
+
+my $commandNGLBiReadSets = "perl $script_path/createNGL-BiReadSets.pl --NGLBiRunCode $runName --NGLSqExperimentCode $experimentName --laneNumberToWorkOn $laneNumber";
+$logger -> info("\tCreation des readSets dans NGL-Bi : ".$commandNGLBiReadSets);
+my $result_commandNGLBiReadSets = `$commandNGLBiReadSets 2>&1`; $? and $logger -> logdie("[Erreur] Lancement de createNGL-BiReadSets.pl\n".$result_commandNGLBiReadSets);
\ No newline at end of file
diff --git a/bin/demuxStatsFromXML.R b/bin/demuxStatsFromXML.R
new file mode 100755
index 0000000000000000000000000000000000000000..1aec58cfff9f19e5546b5b15930388fa56c8f330
--- /dev/null
+++ b/bin/demuxStatsFromXML.R
@@ -0,0 +1,214 @@
+#!/usr/bin/env Rscript
+
+# R version : 4.0.4
+## module load system/R-4.0.4_gcc-9.3.0
+
+# demuxStatsFromXML.R
+# Lecture d'un fichier XML pour extraction et mise en forme des statistiques de démultiplexage (orienté 10X pour le moment)
+# Par échantillon, ce script récupère tous les index associés, le nombre de reads trouvés, dont le nombre de barcodes lus parfaitement et le nombre de barcodes lus avec un mismatch.
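+# Pour illustration, les colonnes du fichier de sortie DemultiplexStats_<Projet>.csv
+# (cf. vec.names et percentageOfFragment plus bas) sont :
+# Project | Sample | Barcode | bcCount | bcPerfect | bcOneMismatch | percentageOfFragment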
+# Ce script récupère aussi les index très souvent retrouvés mais non associés à un échantillon
+# Le pourcentage du nombre de fragments par échantillon sur le nombre total est calculé
+
+## --------------------
+# PACKAGES
+## --------------------
+library('xml2')
+library('stringr')
+library('optparse')
+
+## --------------------
+# FUNCTIONS
+## --------------------
+concat_df = function(df1, df2, col.names) {
+	colnames(df2)<-col.names
+	df_tmp<-rbind(df1, df2)
+	return(df_tmp)
+}
+
+## --------------------
+# PARAMETERS
+## --------------------
+option_list = list(
+	# All arguments are compulsory
+	make_option(c("-x", "--xml"), type = "character", default = NULL, metavar = "character",
+		help = "Path to the DemultiplexingStats.xml file."),
+	make_option(c("-i", "--indexNumber"), type = "character", default = NULL, metavar = "character",
+		help = "Path to the .indexNumber file."),
+	make_option(c("-d", "--demuxSum"), type = "character", default = NULL, metavar = "character",
+		help = "Path to the DemuxSummary.txt file.")
+)
+
+opt_parser = OptionParser(usage="Make demultiplexStats easier to read.", option_list = option_list)
+opt = parse_args(opt_parser)
+
+if(is.null(opt$xml) | is.null(opt$indexNumber) | is.null(opt$demuxSum)) {
+	stop("At least one argument is missing.\n", call. = FALSE)
+}
+
+## --------------------
+# LOG
+## --------------------
+cat("\nLancement du script demuxStatsFromXML.R avec les options suivantes :\n")
+cat(paste0("\tFichier XML :\t\t", opt$xml, "\n"))
+cat(paste0("\tFichier IndexNumber :\t", opt$indexNumber, "\n"))
+cat(paste0("\tDemux Summary :\t\t" , opt$demuxSum, "\n"))
+launchDir<-getwd()
+cat(paste0("\nLe fichier de sortie sera écrit dans le répertoire :\t", launchDir, "\n\n"))
+
+## --------------------
+# MAIN
+## --------------------
+xml<-read_xml(opt$xml)
+
+df<-data.frame()
+vec.names<-c("Project", "Sample", "Barcode", "bcCount", "bcPerfect", "bcOneMismatch")
+
+projects<-xml_find_all(xml, "//Project")
+
+cat("Lecture du XML\n")
+for (pr in 1:length(projects)){
+	project<-xml_attr(projects[pr], "name")
+	Samples<-xml_children(projects[pr])
+	for (sample in 1:length(Samples)){
+		sample_name<-xml_attr(Samples[sample], "name")
+		xml_bc<-xml_children(Samples[sample])
+		barcode_names<-xml_attr(xml_bc, "name")
+		for (bc in 1:length(barcode_names)) {
+			if (barcode_names[bc] != "all"){
+				lane_path<-xml_path(xml_children(xml_bc[bc]))
+				BarcodeCount<-xml_text(xml_find_all(xml, paste0(lane_path,"/BarcodeCount")))
+				PerfectBarcodeCount<-xml_text(xml_find_all(xml, paste0(lane_path,"/PerfectBarcodeCount")))
+				if (length(PerfectBarcodeCount) == 0) { PerfectBarcodeCount<-0 }
+				OneMismatchBarcodeCount<-xml_text(xml_find_all(xml, paste0(lane_path,"/OneMismatchBarcodeCount")))
+
+				if (length(OneMismatchBarcodeCount) == 0) { OneMismatchBarcodeCount<- "-"}
+
+				df_to_add<-data.frame(project, sample_name, barcode_names[bc], BarcodeCount, PerfectBarcodeCount, OneMismatchBarcodeCount)
+				df<-concat_df(df, df_to_add, vec.names)
+
+			}
+		}
+	}
+}
+
+cat("Résumé des informations extraites (nombre d'échantillons par projet) :")
+table(df$Project)
+
+# Concaténation des index multiples
+# Ecrire script pour générer ce fichier à partir de la SS
+cat("\nLecture du fichier contenant le nombre d'index pour chaque échantillon.\n")
+indexNumber<-read.table(opt$indexNumber, header=TRUE, sep="\t")
+
+df2<-data.frame()
+df.defaultLine<-df[which(df$Project == "default"),]
+df2<-concat_df(df2, df.defaultLine, vec.names)
+
+cat("Rassemblement des statistiques par échantillons.\n")
+for 
(line in 1:dim(indexNumber)[1]){
+	mySample<-indexNumber[line, "Sample"]
+	mySampleNumber<-indexNumber[line, "NumberOfIndex"]
+
+	# Single Index Case
+	if (mySampleNumber == 1) {
+		df.singleLine<-df[which(df$Sample == mySample),]
+		df2<-concat_df(df2, df.singleLine, vec.names)
+	}
+	# Dual et 4 Index Cases
+	else if (mySampleNumber > 1) {
+		#sub.df<-df[which(str_detect(df$Sample, mySample)), ]
+		sub.df<-df[which(df$Sample == mySample), ]
+		#print(sub.df)
+		# Parcours du sous-data.frame
+		for (l in 1:dim(sub.df)[1]) {
+			sub.df.project<-sub.df[l, "Project"]
+			sub.df.barcode<-sub.df[l, "Barcode"]
+			sub.df.bcCount<-as.numeric(sub.df[l, "bcCount"])
+			sub.df.bcPerfect<-as.numeric(sub.df[l, "bcPerfect"])
+			sub.df.oneMismatch<-as.numeric(sub.df[l, "bcOneMismatch"]) # bcOneMismatch
+
+			#print(paste(mySample, ":: Traitement du barcode :", sub.df.barcode))
+
+			if (l == 1 ) {
+				sub.df.project.toAdd<-sub.df.project
+				sub.df.barcode.toAdd<-sub.df.barcode
+				sub.df.bcCount.toAdd<-sub.df.bcCount
+				sub.df.bcPerfect.toAdd<-sub.df.bcPerfect
+				sub.df.oneMismatch.toAdd<-sub.df.oneMismatch
+			} else {
+				sub.df.barcode.toAdd<-paste0(sub.df.barcode.toAdd, "+", sub.df.barcode)
+				sub.df.bcCount.toAdd<-sub.df.bcCount.toAdd+sub.df.bcCount
+				sub.df.bcPerfect.toAdd<-sub.df.bcPerfect.toAdd+sub.df.bcPerfect
+				sub.df.oneMismatch.toAdd<-sub.df.oneMismatch.toAdd+sub.df.oneMismatch
+			}
+		}
+
+		# Add to data.frame
+		df_to_add<-data.frame(sub.df.project, mySample, sub.df.barcode.toAdd, sub.df.bcCount.toAdd, sub.df.bcPerfect.toAdd, sub.df.oneMismatch.toAdd)
+		df2<-concat_df(df2, df_to_add, vec.names)
+	}
+}
+
+cat("Résumé des informations extraites (nombre d'échantillons par projet) :")
+table(df2$Project)
+
+## Recherche des index indéterminés
+cat("\nRecherche des index indéterminés.\n")
+bcCount.min<-min(as.numeric(df2[-which(df$Project == "default"), "bcCount"]))
+bcCount.threshold<-0.8*bcCount.min
+
+# Rechercher tous les index trouvés au moins bcCount.threshold fois
+cat("Tentative de récupération des échantillons parmi les index retrouvés les plus fréquemment.\n")
+cat("\tLecture du DemuxSummary.\n")
+linesToSkip<-as.numeric(system(paste("grep -n Most", opt$demuxSum, "| cut -d':' -f1"), intern = TRUE))
+tabDemuxSum<-read.table(opt$demuxSum, skip=linesToSkip, col.names=c("Index", "Count"))
+
+tabUndetermined<-tabDemuxSum[which(tabDemuxSum$Count >= bcCount.threshold),]
+
+cat("\tRésumé des informations extraites :\n")
+cat(paste0("\tNombre d'index indéterminés retrouvés :\t", dim(tabUndetermined)[1], "\n"))
+head(tabUndetermined)
+
+
+# Construction du dataFrame pour intégration à df2
+df2.Projects<-unique(df2$Project)
+myProject<-df2.Projects[which(df2.Projects != "default")]
+
+### Pour chaque ligne de tabUndetermined, on ajoute une ligne à df2 :
+if (dim(tabUndetermined)[1] != 0) {
+	df.tabUndetermined<-data.frame()
+	for (i in 1:dim(tabUndetermined)[1]) {
+		df.tabUndetermined.tmp<-data.frame(myProject, "Undetermined", tabUndetermined[i, "Index"], tabUndetermined[i, "Count"], "-", "-")
+		df.tabUndetermined<-concat_df(df.tabUndetermined, df.tabUndetermined.tmp, vec.names)
+	}
+
+	df2<-concat_df(df2, df.tabUndetermined, vec.names)
+	cat("\tLes index indéterminés ont été ajoutés au data.table.\n")
+} else {
+	cat("\tAucun index indéterminé trouvé.\n")
+}
+
+## Soustraction des undetermined aux allOthers
+# recuperer les Count de tabUndetermined et soustraire la somme à df2[which(df2$Project == "default"), "bcCount"]
+cat("\nQuelques calculs sur les données avant de les exporter.\n")
+cat("\tActualisation du nombre d'index 
'AllOthers'.\n") +undertermined.count<-sum(as.numeric(tabUndetermined[,"Count"])) +df2[which(df2$Project == "default"), "bcCount"]<-as.numeric(df2[which(df2$Project == "default"), "bcCount"])-undertermined.count + +# Calcul pourcentages de chaque barcode +cat("\tCalcul du pourcentage sur le nombre de fragments total.\n") +totalOfFragments<-sum(as.numeric(df2$bcCount)) + +percentOfFragment<-as.data.frame(round((as.numeric(df2[,"bcCount"])/totalOfFragments)*100, 2)) +rownames(percentOfFragment)<-rownames(df2) +colnames(percentOfFragment)<-"percentageOfFragment" + +df2<-cbind(df2, percentOfFragment) + +# Export du data.frame +cat("\nSauvegarde du data.frame.\n") +myProject<-"DEBUG" +# mettre des 0 à la place des NA dans df2 +write.table(df2, row.names = FALSE, quote = F, sep = "\t", file = paste0("DemultiplexStats_", myProject, ".csv")) +# Ecrire un fichier par valeur de myProject ! Cas ou il y a plusieurs projets sur la même lane. +cat(paste0("\tLe fichier suivant à été créé :\t", launchDir, "/DemultiplexStats_", myProject, ".csv\n")) +cat("\nFin normale du script, on sort.\n") diff --git a/bin/extractInfo.pl b/bin/extractInfo.pl new file mode 100755 index 0000000000000000000000000000000000000000..bedf21becfc4950da49f7cd3605bed7ab84b249b --- /dev/null +++ b/bin/extractInfo.pl @@ -0,0 +1,396 @@ +#!/usr/bin/perl -w + +=head1 NAME + + extractInfo.pl + +=head1 DESCRIPTION + + Récupère les informations de la SampleSheet et du RunInfo.xml pour écrire le masque récupéré par extractReads.pl + +=head1 SYNOPSIS + + extractInfo.pl -h | -s SampleSheet.csv -r RunInfo.xml + +=head1 OPTIONS + + -s : fichier SampleSheet.csv - input + -r : fichier RunInfo.xml - input + +=head1 VERSION + +=head1 DEPENDENCIES + +=head1 AUTHOR + + Plateforme genomique Toulouse (get-plage.ngs@genotoul.fr) + +=cut +############################################################################################################################# +# +# LIBRAIRIES +# +############################################################################################################################# +use strict; +use Getopt::Long; +use File::Copy "cp"; +use File::Basename; +use SOAP::Lite; +use List::MoreUtils qw(indexes); +use Log::Log4perl (); +use Log::Log4perl qw(:easy);#FATAL ERROR WARN INFO DEBUG TRACE +use Pod::Usage; +use Switch; +use utf8; +#local $/ = "\r\n"; + +############################################################################################################################# +# +# EXEMPLE DE RUNINFO.XML +# +############################################################################################################################# + +#MiSeq +# <Reads> +# <Read NumCycles="151" Number="1" IsIndexedRead="N" /> +# <Read NumCycles="6" Number="2" IsIndexedRead="Y" /> +# <Read NumCycles="151" Number="3" IsIndexedRead="N" /> +# </Reads> + +#HiSeq3000 Run Simple + Dual index +# <Reads> +# <Read Number="1" NumCycles="151" IsIndexedRead="N" /> +# <Read Number="2" NumCycles="8" IsIndexedRead="Y" /> +# <Read Number="3" NumCycles="8" IsIndexedRead="Y" /> +# <Read Number="4" NumCycles="151" IsIndexedRead="N" /> +# </Reads> + + + +############################################################################################################################# +# +# MAIN +# +############################################################################################################################# +MAIN: +{ + # Initialisation du log + Log::Log4perl -> easy_init( { level => $TRACE, + utf8 => 1, + layout => '[%d][%p> extractInfo.pl:L%L %M] %m%n' } 
); + my $logger = Log::Log4perl -> get_logger(); + $logger -> info("Entrée dans le programme"); + + # Parametre du programme + my $help = 0 ; + my $RunInfo; + my $SampleSheet; + + # Recuperation des options + GetOptions ( 'help|h' => \$help, + 'r=s' => \$RunInfo, #string + 's:s' => \$SampleSheet); #string + if($help){ + pod2usage( + -verbose => 99 + ); + } + + ################## + # Programme + ################## + + my $SSformat; + my $checkIEM; + my $check10x; + my $config_file = "Run.conf"; # fichier d'output qui va etre pris comme input pour GenerateCasavaDir.pl pour les analyses standard. + #my $config10X_file = "Run_10X.conf"; # fichier d'output qui va etre pris comme input pour GenerateCasavaDir.pl pour les analyses 10X. + + if (-s $SampleSheet) { + $SSformat = check_my_SSFormat($SampleSheet); + } + $check10x = ($SSformat eq '10X') ? 1 : 0; + $checkIEM = ($SSformat eq 'IEM') ? 1 : 0; + + if( $checkIEM && $check10x){ + $logger -> logdie("[Error] Le programme ne fonctionne pas quand on lui donne Illumina ET 10x."); + } + if( !$checkIEM && !$check10x){ + $logger -> logdie("[Error] Le programme ne fonctionne pas sans samplesheet."); + } + $logger -> info("\tcheckIEM : ".$checkIEM." | check10x : ".$check10x); + + # # # # # # # # # # # # # # # # # # # # # # # + # Parsing du fichier RunInfo.xml + # # # # # # # # # # # # # # # # # # # # # # # + $logger -> info("Analyse du fichier RunInfo.xml"); + + # Récupération de la taille des reads et d'index par le nombre de cycles + my $runInfo_lengthR1 = 0; + my $runInfo_lengthR2 = ""; + my $runInfo_lengthI1 = 0; + my $runInfo_lengthI2 = ""; + + # Informations recuperees par capture de regex + my $versionRunInfo; + my $number = ""; + my $numCycle = ""; + my $isIndexed = ""; + + # Configuration du run + my $runInfo_config = "single"; #dual|single|noindex + + open(F,"$RunInfo") or $logger -> logdie("[Erreur] Impossible d'ouvrir le fichier RunInfo.xml"); + while(my $ligne =<F>){ + chomp($ligne); + # Recuperation de la version de RunInfo + #<RunInfo xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" Version="2"> -> MiSeq + #<RunInfo Version="5"> -> Nova + if( $ligne =~ /\<RunInfo / ){ + ($versionRunInfo) = $ligne =~ m/<RunInfo.* Version="(\d)">/; + $logger -> info("\tVersion du RunInfo : ".$versionRunInfo); + next; + } + + next if( $ligne !~ /\s*<Read /); # Analyse uniquement sur les lignes de read + if( $versionRunInfo eq "2"){ + ($numCycle, $number, $isIndexed) = $ligne =~ m/<Read NumCycles="(\d+)" Number="(\d)" IsIndexedRead="(Y|N)" \/\>/; + } elsif( $versionRunInfo eq "5"){ + ($number, $numCycle, $isIndexed) = $ligne =~ m/<Read Number="(\d)" NumCycles="(\d+)" IsIndexedRead="(Y|N)"\/\>/; + } else { + $logger -> logdie("[Erreur] Le numero de version de RunInfo.xml ne correspond à rien de connu" ); + } + $logger -> info("\t\tRésultat des captures : NumCycle ".$numCycle." | number ".$number." 
| IsIndexed ".$isIndexed); + + # Interpretation pour connaitre les longueurs des cycles + if ($isIndexed eq "N" && $number eq 1){ # Read 1 + $runInfo_lengthR1 = $numCycle; + } + if ($isIndexed eq "N" && $number ne 1){ #Read 2 + $runInfo_lengthR2 = $numCycle; + } + if ($isIndexed eq "Y" && $runInfo_lengthI1 eq 0){ #Index 1 + $runInfo_lengthI1 = $numCycle; + } + elsif ($isIndexed eq "Y" && $runInfo_lengthI1 ne 0){ #Index 2 + $runInfo_lengthI2 = $numCycle; + $runInfo_config = "dual"; + } + } + close F; + + $logger -> logdie("Impossible de capter les infos de numCycle, number, isIndexed" ) if (($numCycle eq "") || ($number eq "") || ($isIndexed eq "")); + $runInfo_config = "noindex" if($runInfo_lengthI1 eq 0); + $logger -> info("\tConfig : ".$runInfo_config. + " | R1 = ". $runInfo_lengthR1 ." | R2 = ". $runInfo_lengthR2. + " | I1 = ". $runInfo_lengthI1 ." | I2 = ". $runInfo_lengthI2); + + # # # # # # # # # # # # # # # # # # # # # # # + # Traitement de la samplesheet + # # # # # # # # # # # # # # # # # # # # # # # + + # Parametrage # # # # # # # # # # # # # # # # # # + + my $lane_10x = ""; + my $cmdOptions_10x = ""; + + my $mask; + my $index1; my $index2; # Variables temporaires stockant l'info des colonnes index et index2 pour une lane donnée + my $lane; # Variable temporaire stockant le numéro de la lane étudiée + my %info_lane; #Tableau regroupant l'information de configuration des index par lane + + # Construction du dico %line_interpreter qui rassemble les différents formats de SS IEM possibles + my %line_interpreter; # MNL = mono lane | MTL = multi lane | SI = Single index | DI = Dual index + $line_interpreter{"MonoLane-SingleIndex"} = "Sample_ID,Sample_Name,Sample_Plate,Sample_Well,I7_Index_ID,index,Sample_Project,Description"; + $line_interpreter{"MonoLane-DualIndex"} = "Sample_ID,Sample_Name,Sample_Plate,Sample_Well,I7_Index_ID,index,I5_Index_ID,index2,Sample_Project,Description"; + $line_interpreter{"MultiLane-SingleIndex"} = "Lane,Sample_ID,Sample_Name,Sample_Plate,Sample_Well,I7_Index_ID,index,Sample_Project,Description"; + $line_interpreter{"MultiLane-DualIndex"} = "Lane,Sample_ID,Sample_Name,Sample_Plate,Sample_Well,I7_Index_ID,index,I5_Index_ID,index2,Sample_Project,Description"; + my $samplesheet_config = ""; # La config de la SS + my %indexHeaderSS_dict = (); # Dico qui associe clé-colonne : valeur-index + + # Construction du dico %length_index qui associe la longueur d'un index 10X à son préfixe + my %length_index; + $length_index{"SI-GA"}{"idx1"}=8; $length_index{"SI-GA"}{"idx2"}=0; + $length_index{"SI-NA"}{"idx1"}=8; $length_index{"SI-NA"}{"idx2"}=0; + $length_index{"SI-TT"}{"idx1"}=10; $length_index{"SI-TT"}{"idx2"}=10; + + # Parcours de la Samplesheet # # # # # # # # # # # # # # # # # # + $logger -> info("Analyse du fichier ".$SampleSheet); + my $headerline_present = 0; + open(S,"$SampleSheet") or $logger -> logdie("[Erreur] Impossible d'ouvrir la SampleSheet $SampleSheet"); + LINE: while(my $ligne = <S>){ + chomp($ligne); + next LINE if not ($ligne =~ /.*,.*,.*/); # Sauter les lignes du début qui ont 0 ou 1 virgule + if($ligne =~ /.*Sample_ID,.*/){ + $headerline_present = 1; + # Détermination du mode de la Samplesheet + # (Tout est dans ce bloc pour être exécuté une seule fois dans la boucle) + foreach my $SS_config (keys %line_interpreter){ + $samplesheet_config = $SS_config if ($line_interpreter{$SS_config} eq $ligne); + } + $logger -> logdie("[Erreur] Aucune config ne correspond à la SS :(") if( $samplesheet_config eq "" ); + $logger -> info("\tSS en config 
$samplesheet_config"); + + # Construction d'un tableau permettant de construire le dico qui associe le numéro de la colonne au nom de la colonne + my @headerSS_tab = split(/,/, $line_interpreter{$samplesheet_config}); + foreach my $column_name (@headerSS_tab){ + $indexHeaderSS_dict{$column_name} = indexes { $_ eq $column_name } @headerSS_tab; + } + next LINE; + } + $logger -> logdie("[Erreur] La samplesheet $SampleSheet ne contient pas de header") if( !$headerline_present); + next LINE if($info_lane{$lane}); # On considère que tous les échantillons d'une même lane sont indexés pareils + + my @list = split(/,/,$ligne); + $index1 = $list[$indexHeaderSS_dict{'index'}]; # enregistre la séquence de l'index1 ou SI-GA... + $index2 = ($samplesheet_config =~ /DualIndex/) ? $list[$indexHeaderSS_dict{'index2'}] : "" ; + $lane = ($samplesheet_config =~ /MultiLane/) ? $list[$indexHeaderSS_dict{'Lane'}] : '1' ; + + # Contrairement à illumina qui ont la séquence notée, les index 10X ont le nom de l'index (sauf les customs!!) + if($check10x){ + $logger -> info("Gestion du 10X"); + $lane_10x .= $lane.","; + my $prefixe_index = substr($index1, 0, 5); + if($list[$indexHeaderSS_dict{'I7_Index_ID'}] !~ "Custom_"){ + $index1 = ("X"x$length_index{$prefixe_index}{idx1}); # dico contenant les longueurs des index 10x pour filouter + $index2 = ("X"x$length_index{$prefixe_index}{idx2}) if($samplesheet_config =~ /DualIndex/); + } + } + # Bilan pour la lane étudiée + $logger -> info("\tSur la lane ".$lane." -> Index1 : ".$index1. " | Index2 : ".$index2); + + # Remplissage du dico info_lane : infolane{#1}=8,8 par exemple + $info_lane{$lane} = length($index1); + $info_lane{$lane} .= ",".length($index2) if($runInfo_config eq "dual") ; + $logger -> info("\tLane ".$lane. " : ".$info_lane{$lane}); + } + close S; + + # Ecriture des options 10X + if($check10x){ + chop $lane_10x; + $cmdOptions_10x = "--lanes=".$lane_10x; + $cmdOptions_10x .= " --filter-single-index " if(($runInfo_config eq "dual") and ($samplesheet_config =~ /SingleIndex/)); + $cmdOptions_10x .= " --filter-dual-index " if(($runInfo_config eq "dual") and ($samplesheet_config =~ /DualIndex/)); + } + + # Rechercher si bool_change_config ? + #my $bool_change_config; + #Ecriture du masque # # # # # # # # # # # # # # # # + $logger -> info("Ecriture du masque"); + my $masque_read1 = "Y".($runInfo_lengthR1-1)."n"; + my $masque_read2 = ($runInfo_lengthR2 eq "") ? " ": ",Y".($runInfo_lengthR2-1)."n"; + $logger -> info("masqueR1 : ".$masque_read1." | masqueR2 :".$masque_read2); + +# if( $samplesheet_config =~ /MonoLane/){ +# $logger -> info("\tEn mono-lane"); +# $mask = " --use-bases-mask ".$masque_read1; +# $logger -> info("masque : ".$mask); +# $mask .= ",I$runInfo_lengthI1" if($runInfo_config eq "single"); +# $logger -> info("masque : ".$mask); +# $mask .= ",I$runInfo_lengthI1,I$runInfo_lengthI2" if($runInfo_config eq "dual"); +# $logger -> info("masque : ".$mask); +# $mask .= "$masque_read2"; +# $logger -> info("masque : ".$mask); +# $logger -> info("masqueR1 : ".$masque_read1." 
| masqueR2 :".$masque_read2); +# +# }else{ # Multilane +# $logger -> info("\tEn multi-lane"); +# my $nb_n_idx1; # Nombre de n à la fin de l'index 1 +# my $nb_n_idx2; # Nombre de n à la fin de l'index 2 +# my @idx = keys(%info_lane); +# +# foreach my $k (keys(%info_lane)) { +# $mask .= " --use-bases-mask ".$k.":".$masque_read1; +# +# if($runInfo_config eq "single"){ +# $mask .= ",n*" if($info_lane{$k} eq "0"); #si la lane est NoIndex, n'a pas d'index 1 +# $mask .= ",I".$info_lane{$k}.("n" x ($runInfo_lengthI1-$info_lane{$k})) if($info_lane{$k} ne "0"); #si la lane a 1 index +# +# }elsif($runInfo_config eq "dual"){ +# my @list = split(/,/,$info_lane{$k}); +# $nb_n_idx1 = $runInfo_lengthI1-$list[0]; +# #si la lane est NoIndex ; n'a pas d'index 1 et 2 +# if($list[0] eq "0"){ +# $mask .= ",n*,n*"; +# #si la lane est single index ; l'index 2 est vide +# }elsif($list[1] eq "0"){ +# $mask .= ",I".$list[0].("n"x$nb_n_idx1).",n*"; +# #si la lane a 2 index +# }else{ +# $nb_n_idx2 = $runInfo_lengthI2-$list[1]; +# $mask .= ",I".$list[0].("n"x$nb_n_idx1).",I".$list[1].("n"x$nb_n_idx2); +# } +# } +# $mask .= "$masque_read2"; +# } +# } + my $nb_n_idx1; # Nombre de n à la fin de l'index 1 + my $nb_n_idx2; # Nombre de n à la fin de l'index 2 + my @idx = keys(%info_lane); + + foreach my $k (keys(%info_lane)) { + $mask .= " --use-bases-mask ".$k.":".$masque_read1; + + if($runInfo_config eq "single"){ + $mask .= ",n*" if($info_lane{$k} eq "0"); #si la lane est NoIndex, n'a pas d'index 1 + $mask .= ",I".$info_lane{$k}.("n" x ($runInfo_lengthI1-$info_lane{$k})) if($info_lane{$k} ne "0"); #si la lane a 1 index + + }elsif($runInfo_config eq "dual"){ + my @list = split(/,/,$info_lane{$k}); + $nb_n_idx1 = $runInfo_lengthI1-$list[0]; + #si la lane est NoIndex ; n'a pas d'index 1 et 2 + if($list[0] eq "0"){ + $mask .= ",n*,n*"; + #si la lane est single index ; l'index 2 est vide + }elsif($list[1] eq "0"){ + $mask .= ",I".$list[0].("n"x$nb_n_idx1).",n*"; + #si la lane a 2 index + }else{ + $nb_n_idx2 = $runInfo_lengthI2-$list[1]; + $mask .= ",I".$list[0].("n"x$nb_n_idx1).",I".$list[1].("n"x$nb_n_idx2); + } + } + $mask .= "$masque_read2"; + } + $logger -> info("\t\tConfig de la Samplesheet : ".$samplesheet_config. " | Masque : " . $mask); + + #Ecriture du fichier Run.conf pour la samplesheet IEM # # # # # + open(O, ">$config_file") or $logger -> logdie("Error in opening config file $config_file"); + print O "SAMPLESHEET=$SampleSheet\n"; + print O "RUNCONFIG=$runInfo_config\n"; + print O "MASQUE=$mask\n"; + print O "OPTIONS=$cmdOptions_10x\n" if($check10x); + print O "DEMUX=$SSformat\n"; + close O; +} + +=head2 function check_my_SSFormat + + Title : check_my_SSFormat + Usage : $boolean = check_my_SSFormat( $samplesheet, mode); + Prerequisite : None + Function : Send an email and check if the sending went well + Returns : Boolean + Args : $mContent, $mSubject, $mCC, $mRecipients : strings + Globals : none + +=cut + +sub check_my_SSFormat { + my ($samplesheet_to_test) = @_; + my $logger = Log::Log4perl -> get_logger('check_my_SSFormat'); + + my $chemistrySS = `grep Chemistry $samplesheet_to_test`; $? 
and $logger -> logdie("Récupération de 'Chemistry' en echec" ); + my ($chemistry) = $chemistrySS =~ m/^Chemistry,(\w+)$/; + + if ($chemistry eq '10X'){ + $logger -> info("$samplesheet_to_test au format 10X"); + return '10X'; + }elsif($chemistry eq 'Default' or $chemistry eq 'amplicon' ){ + $logger -> info("$samplesheet_to_test au format 'IEM'"); + return 'IEM'; + }else{ + $logger -> logdie("[Erreur] On aurait du rentrer dans le cas IEM ou 10X" ); + } +} diff --git a/bin/extractInfoForDemuxStats.pl b/bin/extractInfoForDemuxStats.pl new file mode 100755 index 0000000000000000000000000000000000000000..71218fc3d7e35bd8c5f9729df4c4f8c25ada85f5 --- /dev/null +++ b/bin/extractInfoForDemuxStats.pl @@ -0,0 +1,124 @@ +#!/usr/bin/perl -w +binmode STDIN, ':encoding(UTF-8)'; +binmode STDOUT, ':encoding(UTF-8)'; +binmode STDERR, ':encoding(UTF-8)'; + +=head1 NAME + + extractInfoForDemuxStats.pl + +=head1 DESCRIPTION + + Extract from the samplesheet of lane : (1) sample names and (2) how many index are associated. Ecriture dans un fichier .indexNumber + +=head1 SYNOPSIS + + extractInfoForDemuxStats.pl --sampleSheet + +=head1 OPTIONS + + -sampleSheet|s : the samplesheet file + +=head1 EXEMPLES + + perl extractInfoForDemuxStats.pl --sampleSheet 20210722_NOVASEQ6000_IEM_H3GHCDRXY_Lane1.csv + +=head1 AUTHOR + + Jules Sabban pour Plateforme genomique Toulouse (get-plage.bioinfo@genotoul.fr) + +=cut + +################################################################### +# +# LIBRAIRIES +# +################################################################### +use strict; +use Getopt::Long; +use utf8; + +################################################################### +# +# INITIALISATION +# +#################################################################### +my $sampleSheet=""; + +GetOptions ('sampleSheet=s' => \$sampleSheet, +); + +if ($sampleSheet eq "") { + print STDERR ("Please, give a file !"); + print STDERR ("USAGE : extractInfoForDemuxStats.pl --sampleSheet <File>\n"); + exit 0; +} + +#Lane,Sample_ID,Sample_Name,Sample_Plate,Sample_Well,I7_Index_ID,index,Sample_Project,Description +#Lane,Sample_ID,Sample_Name,Sample_Plate,Sample_Well,I7_Index_ID,index,I5_Index_ID,index2,Sample_Project,Description + +# recuperer le nombre de fois où "*Index_ID" est écrit et leur position +# récupere la position du sample_ID +#Pour chaque ligne recupérer le ou les index_ID +#Si index_ID =~ XX-XX-XX alors #index = 4 +#Sinon #index = 1 +#Faire la somme des #index par ligne +#Ecrire le nom de l'échantillon et le nombre d'index associé +#Ne pas oublier l'entete du fichier de sortie + + +### Lecture de la samplesheet : +open (my $handle, '<', $sampleSheet) or exit 1; +chomp(my @lines = <$handle>); +close $handle; + +my $projectName=""; +my $sample_ID_position; +my @index_ID_position=(); +my %sample_info=(); + + +foreach my $line (@lines) { + my @cur_line = split(',', $line); + + # Recherche du nom du projet + if ($line =~ /^Infos/) { + $projectName = $cur_line[1]; + } + + # Recherche des positions des Sample_ID et des Index_ID + elsif ($line =~ /^Lane/) { + while ( my ( $indice, $valeur ) = each @cur_line ) { + if ($valeur eq "Sample_ID") { $sample_ID_position=$indice;} + if ($valeur =~ /Index_ID$/) { push(@index_ID_position, $indice);} + } + } + + # Association Sample_ID avec sont nombre d'index + elsif ($line =~ m/^(\d),/) { + my $sample_ID = $cur_line[$sample_ID_position]; + my $index_number=0; + my @cur_index_ID = (); + foreach my $pos (@index_ID_position) { + if ($cur_line[$pos] =~ /\w{2}-\w{2}-\w{2}/) { $index_number 
+            if ($cur_line[$pos] =~ /\w{2}-\w{2}-\w{2}/) { $index_number = 4; } else { $index_number += 1; }
+        }
+        $sample_info{$sample_ID} = $index_number;
+    }
+}
+
+# ecriture du fichier de sortie :
+my $content ="";
+$content.="Sample\tNumberOfIndex\n";
+foreach my $k (keys(%sample_info)) {
+    $content.="$k\t$sample_info{$k}\n";
+}
+
+my $file2write = "$projectName.indexNumber";
+
+open(my $fh, '>', $file2write) or exit 1;
+print $fh $content;
+close $fh;
+
+
+
diff --git a/bin/extractInfoForReadSets.pl b/bin/extractInfoForReadSets.pl
new file mode 100755
index 0000000000000000000000000000000000000000..b9a9dc1d917d0726ff3cb3f16cfa336915890864
--- /dev/null
+++ b/bin/extractInfoForReadSets.pl
@@ -0,0 +1,105 @@
+#!/usr/bin/perl -w
+binmode STDIN, ':encoding(UTF-8)';
+binmode STDOUT, ':encoding(UTF-8)';
+binmode STDERR, ':encoding(UTF-8)';
+
+=head1 NAME
+
+    extractInfoForReadSets.pl
+
+=head1 DESCRIPTION
+
+    Extracts (from the samplesheet and RunNGL-Bi.created) and emits the relevant information for readset creation
+
+=head1 SYNOPSIS
+
+    extractInfoForReadSets.pl --sampleSheet --runNGLBi
+
+=head1 OPTIONS
+
+    -sampleSheet|s : the samplesheet file
+    -runNGLBi : the RunNGL-Bi.created file
+
+=head1 EXAMPLES
+
+    perl extractInfoForReadSets.pl --sampleSheet 20210607_NOVASEQ6000_BULKDEMUX_HFMH7DRXY.csv --runNGLBi RunNGL-Bi.created
+
+=head1 AUTHOR
+
+    Jules Sabban pour Plateforme genomique Toulouse (get-plage.bioinfo@genotoul.fr)
+
+=cut
+
+###################################################################
+#
+# LIBRAIRIES
+#
+###################################################################
+use strict;
+use Getopt::Long;
+use utf8;
+
+###################################################################
+#
+# INITIALISATION
+#
+###################################################################
+my $sampleSheet="";
+my $runNGLBiFile="";
+
+GetOptions ('samplesheet=s' => \$sampleSheet,
+    'runNGLBi=s'=> \$runNGLBiFile,
+);
+
+if ($sampleSheet eq "" || $runNGLBiFile eq "") {
+    print STDERR ("At least one argument is missing !");
+    print STDERR ("USAGE : extractInfoForReadSets.pl --sampleSheet <File> --runNGLBi <File>\n");
+    exit 0;
+}
+
+my $laneNumber;
+my $experimentName;
+my $runName;
+my $content = "";
+my $file2write="readSetCreation.info";
+
+###################################################################
+#
+# MAIN
+#
+###################################################################
+## Extract information from files
+### SampleSheet
+#### ExperimentName
+my $experimentName_ligne = `grep "Experiment Name" $sampleSheet | head -1`;
+($experimentName) = $experimentName_ligne =~ m/Experiment Name,(.+)$/;
+
+#### LaneNumber
+
+if ($sampleSheet =~ "_MISEQ_") {
+    $laneNumber = "1";
+} else {
+    open (my $handle, '<', $sampleSheet) or exit 1;
+    chomp(my @lines = <$handle>);
+    close $handle;
+
+    foreach my $line (@lines) {
+        if ($line =~ m/^(\d),/) {
+            ($laneNumber) = $line =~ m/^(\d),/;
+            last;
+        }
+    }
+}
+### RunNGL-Bi.created
+$runName = `cat $runNGLBiFile`;
+chomp($runName);
+
+## Write output file
+$content.="ExperimentName;$experimentName\n";
+$content.="NGLBiRunName;$runName\n";
+$content.="LaneNumber;$laneNumber\n";
+
+open(my $fh, '>', $file2write) or exit 1;
+print $fh $content;
+close $fh;
+
diff --git a/bin/extractReads.pl b/bin/extractReads.pl
new file mode 100755
index 0000000000000000000000000000000000000000..a3f5b2b8019c20fad82b95a16b2757be76268e49
--- /dev/null
+++ b/bin/extractReads.pl
@@ -0,0 +1,506 @@
+#!/usr/bin/perl -w
+binmode STDIN, ':encoding(UTF-8)';
+binmode STDOUT, ':encoding(UTF-8)';
+binmode STDERR, ':encoding(UTF-8)';
+
+=head1 NAME
+
+    extractReads.pl
+
+=head1 DESCRIPTION
+
+    Initialisation of the wf-illumina-nf pipeline
+    Splitting of the samplesheet
+    Creation of the run in NGL-Bi
+    Configuration and launch of the quality analyses via wf-illumina-nf/main.nf
+
+=head1 SYNOPSIS
+
+    extractReads.pl -h | [-sequencer|s type_sequencer] 2>> /work/sbsuser/Logs/cronMACHINE.txt
+
+=head1 OPTIONS
+
+    -sequencer|s : sequencer type (MiSeq or NovaSeq) -> mandatory
+    -test|t : enable test mode -> optional
+    -mailTest|m : e-mail address the log messages are sent to -> mandatory in test mode
+    -samplesheetDemux|i : i as in IEM, to specify the samplesheet to use -> optional
+    -jFlow|j : to specify the jflow sheet to use -> optional
+
+=head1 EXAMPLES
+
+    perl extractReads.pl -s MiSeq
+    perl extractReads.pl -s MiSeq -t -m hermione.granger@poudlard.uk
+
+
+=head1 DEPENDENCIES
+
+    - Web service used to retrieve e-mail addresses from an id
+
+=head1 AUTHOR
+    Jules Sabban pour Plateforme genomique Toulouse (get-plage.bioinfo@genotoul.fr)
+
+=cut
+
+###################################################################
+#
+# LIBRAIRIES
+#
+###################################################################
+use strict;
+use Getopt::Long;
+use Pod::Usage;    # fournit pod2usage() utilisé plus bas
+use utf8;
+use Log::Log4perl ();
+use Log::Log4perl qw(:easy);    #FATAL ERROR WARN INFO DEBUG TRACE
+#use File::Util;
+use File::chdir;
+use File::Copy "cp";
+use File::Copy "move";
+use Cwd 'abs_path';
+
+
+###################################################################
+#
+# MAIN
+#
+###################################################################
+MAIN:
+{
+    ###############################################################
+    # INITIALISATION
+    ###############################################################
+
+    # Initialisation du log
+    Log::Log4perl -> easy_init( {   level => $TRACE,
+                                    utf8 => 1,
+                                    layout => '[%d][%p> extractReads.pl:L%L %M] %m%n' } );
+    my $logger = Log::Log4perl -> get_logger();
+
+    # Récupération des options
+    my $help = 0 ;
+    my $sequencer = "";
+    my $demuxType_int;
+    my $demuxType;
+    my $file_samplesheet = "";
+    my $file_jflow = "";
+    my $arg_timestamp = "";    # on supprime
+    my $arg_jobid = "";    # on supprime
+    my $mailTEST = "";
+    my $checkTest = "";
+
+    GetOptions ('help|h' => \$help,
+        'sequencer|s=s' => \$sequencer,
+        'samplesheetDemux|i:s'=> \$file_samplesheet,    # i for IEM...
+        'jFlow|j:s'=> \$file_jflow,
+        'timestamp:i'=>\$arg_timestamp,
+        'demuxJobid:s'=>\$arg_jobid,
+        'mailTesteur|m:s' => \$mailTEST,
+        'isTest|t' => \$checkTest,
+    );
+
+    if($help){
+        pod2usage(-verbose => 1 );
+    }
+
+    print STDERR "\n";
+    print STDERR "#   #   #   #   #   #   #   #   #   #\n";
+    print STDERR "# # extractReads.pl is happening # #\n";
+    print STDERR "#   #   #   #   #   #   #   #   #   #\n";
+    print STDERR "\n";
+
+    $logger -> info("Vérification des arguments");
+
+    # Verification du séquenceur
+    $sequencer ne ""? $logger -> info("\tSequenceur = " . $sequencer) : $logger -> logdie("\tPas de séquenceur précisé...");
+    unless ($sequencer eq "MiSeq" or $sequencer eq "NovaSeq"){
+        $logger -> logdie("Erreur dans le nom du sequenceur : ".$sequencer." n'existe pas");
+    }
+
+    # vérification de la SS
+    $file_samplesheet ne "" ? $logger -> info("\tSamplesheet fournie = " . $file_samplesheet ." !") : $logger -> info("\tPas de samplesheet fournie!");
+
+    # Gestion du test et/ou des mails
+    $mailTEST ne ""? $logger -> info("\tmailTEST = " . $mailTEST) : $logger -> info("\tPas de mailTEST!");
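+    # NB: in test mode (-t), raw data are read from the data_test/ directories and the NGL-Bi test API is used (see below)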
+    $checkTest ne ""? $logger -> info("\tcheckTEST = " . $checkTest) : $logger -> info("\tPas en mode test!");
+    $checkTest = $checkTest ne ""? 1 : 0;
+    # Si on est en test, on veut une adresse mail!
+    $logger -> logdie("MODE TEST ACTIVE, MERCI DE DONNER UN MAIL AVEC L'OPTION -m MONMAIL\@MONSERVEUR") if( ($checkTest) && ($mailTEST eq "") );
+    my $raw_data="";
+    my $path_to_scripts="";
+    if ($checkTest) {
+        $raw_data = $sequencer eq "MiSeq"? "/home/sbsuser/work/Nextflow/wf-illumina-nf/data_test/MiSeq" : "/home/sbsuser/work/Nextflow/wf-illumina-nf/data_test/NovaSeq";
+        $path_to_scripts=abs_path($0);
+    } else {
+        $raw_data="/$sequencer";
+        $path_to_scripts=abs_path($0);
+    }
+    $logger -> info("\tLes données brutes sont ici : $raw_data");
+
+    # Configuration API NGL-Bi
+    my $ngl_api_base_prod = "/save/sbsuser/scripts-ngs/NGL-Bi_client_Current/IG/SystemeInteractionNGL-Bi/";
+    my $ngl_api_base_test = "/save/devcrgs/src/NGL_REST_Client/ngl-bi_client/IG/SystemeInteractionNGL-Bi/";
+    my $ngl_api_base = $checkTest? $ngl_api_base_test : $ngl_api_base_prod;
+    my $ngl_bi_scripts="/save/sbsuser/scripts-ngs/NGL-Bi_client_Current/GeT/perl";
+    $ENV{'APIPERL'}=$ngl_api_base;
+    $ENV{'CONFFILE'}=$ngl_api_base."conf/prod_illumina_qc.conf";
+    loadConfFile();
+    unshift @INC, $ngl_api_base."Common_tools/src/perl/lib/";
+    unshift @INC, $ngl_api_base."DB_tools/src/perl/lib/";
+    require illumina;
+    require json;
+    $logger -> info("Variables d'environnement pour NGL-Bi chargées depuis : ".$ngl_api_base);
+    # Initialisation des variables
+    my $runExistsInNGL = 0;
+    my $NGLBiRunCreatedFile = 'RunNGL-Bi.created';
+    my $NGLBiReadsetCreatedFile = 'ReadsetsNGL-Bi.created';
+    my $NGLBiRunName = "";
+    my $NGLSQExperimentCode;
+
+    # Paramétrage général
+    my $prefixLogFolder = "PipelineLogs_Lane";
+
+
+    ###############################################################
+    # RECHERCHE SAMPLESHEET
+    ###############################################################
+    ## Recherche SS
+    ### parcours des sous répertoires de /$sequencer
+    my $regexpPSS = '^[0-9]{8}_.*_BULKDEMUX_.*csv$';
+    #my @run_directories = $f -> list_dir('/'.$sequencer => {dirs_only = 1, no_fsdots = 1}=; # ls
+    my @run_directories = `ls $raw_data`; $? and $logger -> logdie("[Erreur] Impossible de récupérer la liste des dossiers de $raw_data");
+    foreach my $dir (@run_directories){
+        chomp($dir);
+        #my @RunInfo = ();
+        my @RunInfo = split("_", $dir); # [$#dir]
+        # Extraction des infos contenues dans le nom du répertoire
+        my $runDate = $RunInfo[0];
+        my ($annee, $mois, $jour) = ($runDate =~ m/([0-9]{2})([0-9]{2})([0-9]{2})/);
+        my $sequencerID = $RunInfo[1];
+        my $barcodeFlowcell;    # Sert à l'unicité des noms des .fastq.gz
+        if ($RunInfo[3] =~ m/000000000-/){
+            my @FCBarcode = split('-', $RunInfo[3]);
+            $barcodeFlowcell = $FCBarcode[$#FCBarcode];
+        } else {
+            $barcodeFlowcell = $RunInfo[3];
+        }
+
+        # Recherche de la SS
+        $logger -> info("Recherche de SampleSheet dans $raw_data/$dir");
+        chdir "$raw_data/$dir" or $logger -> logdie("[Erreur] Impossible de se déplacer dans $raw_data/$dir");
+        #$CWD = "$raw_data/$dir" or $logger -> logdie("[Erreur] Impossible de se déplacer dans $raw_data/$dir");
+        my $preSampleSheet = "PreSampleSheet.csv";
+        my $lastPSS = `ls -t | egrep $regexpPSS | head -1`; $?
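+        # (Backtick commands set $?; a non-zero exit status makes the `and` fire the logdie that follows)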
and $logger -> logdie("[Erreur] Recup de la derniere BulkSS"); + chomp($lastPSS); + if( $lastPSS ne ""){ + $logger -> info("Check de PSS ".$lastPSS); + my $checkPSS = check_my_samplesheet($lastPSS, $preSampleSheet); + + ############################################################### + # CREATION RUN NGL-Bi + ############################################################### + $NGLSQExperimentCode = getNGLSeqExperimentCode($preSampleSheet); + $runExistsInNGL = 1 if($NGLSQExperimentCode ne " -"); + if ($runExistsInNGL){ + if (! -e $NGLBiRunCreatedFile){ + # INTEGRATION DU RUN A NGL-BI # # # # # # # # # # # + $logger -> info("Pas de fichier $NGLBiRunCreatedFile dans $raw_data/$dir -> Le run NGL-Bi semble ne pas exister "); + my $commandNGLBiRun = "perl $ngl_bi_scripts/createNGL-BiRun.pl --sequencer $sequencer --NGLSqExperimentCode $NGLSQExperimentCode"; + $logger -> info("\tCreation du run avec : ".$commandNGLBiRun); + my $result_commandNGLBiRun = `$commandNGLBiRun 2>&1`; + $? and $logger -> logdie("[Erreur]Lancement de createNGL-BiRun.pl\n".$result_commandNGLBiRun); + $logger -> info("\n".$result_commandNGLBiRun); + }else{ + $logger -> info("Le run existe déjà dans NGL-Bi"); + } + }else{ + $logger -> info("\tRun en autonomie : n'existe pas dans NGL-SQ"); + `touch $NGLBiRunCreatedFile`; $? and $logger -> logdie("[Erreur] Impossible de créer le fichier"); + } + } else { + $logger -> logdie("Aucune SampleSheet trouvée dans $raw_data/$dir"); + } + + # Recherche du fichier de fin de run + my $file2checkForEndOfRun = $sequencerID eq "M07093" ? "RTAComplete.txt" : "CopyComplete.txt"; + if (! -e $file2checkForEndOfRun){ + $logger -> info("Pas de fichier de fin de run -> sortie du script!"); + exit; + } else { + # Détection du nombre de lane + $logger -> info("Détection du nombre de headers") ; + my $nbHeader = `grep "Header" $preSampleSheet | wc -l` ; $? and $logger -> logdie("Comptage de [Header] en echec"); + chomp($nbHeader); + $logger -> info("\t$preSampleSheet -> Nb de [header] = ".$nbHeader ); + + # Création des répertoires de logs par lane + $logger -> info("Détection des répertoires de log"); + foreach my $count (1..$nbHeader){ + my $logFolder = $prefixLogFolder.$count; + if (! -d "$raw_data/$dir/$logFolder"){ # Si le rep n'existe pas, alors on le crée + $logger -> info("\tCréation du répertoire".$logFolder." + chmod 770" ); + mkdir "$raw_data/$dir/$logFolder" or $logger -> logdie("Impossible de créer le répertoire ".$logFolder ); + chmod 0770, "$raw_data/$dir/$logFolder" or $logger -> logdie($!); + } else { + $logger -> info("\tLe répertoire ".$logFolder." 
existe déjà");
+            }
+        }
+
+        ###############################################################
+        # DECOUPAGE SAMPLESHEET
+        ###############################################################
+        $logger -> info("Découpe de ".$preSampleSheet) ;
+        my $laneExtraite = '';
+        my $counterIEMFiles = 0;    #counter to store the number of IEM files found in the bulk file
+        my $IEMFileContent = '';
+        my $IEMFilePrefixe = $lastPSS;
+        $IEMFilePrefixe =~ s/BULKDEMUX/IEM/g;    # Replace Bulk by IEM
+        $IEMFilePrefixe =~ s/.csv//g;    # Supprime le .csv de la fin pour faciliter l'ajout du compteur de lanes
+        $IEMFilePrefixe .= '_Lane';
+
+        open my $handle, '<', $preSampleSheet;
+        chomp(my @lines = <$handle>);
+        close $handle;
+
+        foreach my $line (@lines) {
+            if ($line eq '[Header]'){
+                if($counterIEMFiles > 0){    # a 1st line was already found and $IEMFileContent contains a single IEM file content
+                    # ecriture du fichier
+                    my $subSampleSheet = "$raw_data/$dir/${prefixLogFolder}${laneExtraite}/${IEMFilePrefixe}${laneExtraite}.csv";
+                    print2file($IEMFileContent, $subSampleSheet);
+                }
+                $IEMFileContent = '';
+                $counterIEMFiles++;
+            }
+            $IEMFileContent .= $line."\n";
+            if ($line =~ m/^(\d),/) { $laneExtraite = $1; }    # ne pas écraser la lane avec undef si la ligne ne matche pas
+            $laneExtraite = '1' if ($sequencer eq 'MiSeq' );
+        }
+        # ecriture du dernier fichier
+        my $subSampleSheet = "$raw_data/$dir/${prefixLogFolder}${laneExtraite}/${IEMFilePrefixe}${laneExtraite}.csv";
+        print2file($IEMFileContent, $subSampleSheet);
+
+        # Désactivation de la SampleSheet
+        $logger -> info("Désactivation de la SampleSheet.");
+        move($lastPSS, $lastPSS.".old") or $logger -> logdie("Le renommage de ".$lastPSS." en .old est en erreur ".$!);
+
+        ###############################################################
+        # INTEROP DANS NEXTCLOUD
+        ###############################################################
+        if (!$checkTest){
+            # Récupération de l'année pour le répertoire de destination
+            my $year = "20".$annee;
+
+            # Ecriture de la commande de synchronisation
+            my $aws_source = "$raw_data/$dir/";
+            my $aws_target = "s3://partage/externes/Illumina-SAV/$sequencer/$year/$dir";    #X:\partage\externes\Illumina-SAV\NovaSeq [$#dir]
+            my $aws_prefixcmd = "aws s3 --endpoint-url https://s3r-tls.stockage.inra.fr";
+
+            # Ecriture du script de lancement de synchronisation
+            my $aws_script_file = "scriptAWS_$sequencerID.sbatch";
+            my $aws_script = "#!/bin/sh \n";
+            $aws_script .= "#SBATCH -p wflowq\n#SBATCH -t 20\n#SBATCH --mem-per-cpu=200M\n";
+            $aws_script .= "#SBATCH -J $aws_script_file\n#SBATCH -e %x.e%j\n#SBATCH -o %x.o%j\n\n";
+            $aws_script .= "module load system/Python-3.6.7_shared\n";
+            $aws_script .= "$aws_prefixcmd sync $aws_source $aws_target ";
+            $aws_script .= "--exclude \"*\" --include \"[Rr]un[A-Za-z]*.xml\" --include \"InterOp/[A-Za-z]*.bin\" ";
+            $aws_script .= "--exclude \"InterOp/C[0-9]*.1*\"\n";
+            print2file($aws_script, "$aws_source/$aws_script_file");
+
+
+            # Lancement du script
+            my $sleepLastingForAWS = 300;
+            my $aws_launchcmd = "sbatch $aws_script_file";
+            my $aws_joboutput = `$aws_launchcmd`; $? and $logger -> logdie("Commande $aws_launchcmd impossible : ".$!);
+            my ($aws_jobID) = $aws_joboutput =~ m/Submitted batch job (\d+)/;
+            chomp($aws_jobID);
+            $logger -> info("\tDossier " . $aws_source." -> JobID : ".$aws_jobID."\nCommande exécutée : " . $aws_launchcmd );
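+            # The generated sbatch script is, schematically (values assembled above):
+            #   #!/bin/sh
+            #   #SBATCH -p wflowq / -t 20 / --mem-per-cpu=200M / -J scriptAWS_<sequencerID>.sbatch
+            #   module load system/Python-3.6.7_shared
+            #   aws s3 --endpoint-url https://s3r-tls.stockage.inra.fr sync <run_dir>/ \
+            #       s3://partage/externes/Illumina-SAV/<sequencer>/<year>/<run_dir> \
+            #       --exclude "*" --include "[Rr]un[A-Za-z]*.xml" --include "InterOp/[A-Za-z]*.bin" --exclude "InterOp/C[0-9]*.1*"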
+
+            # Attente de la fin du job
+            my $boolOver = is_my_jobID_over($aws_jobID);
+            while (!$boolOver){
+                $boolOver = is_my_jobID_over($aws_jobID);
+                if (!$boolOver){
+                    $logger -> info("\tEn attente de la fin de $aws_jobID, à dans ".($sleepLastingForAWS/60)." minutes!");
+                    sleep($sleepLastingForAWS);    # toutes les 5 minutes (*60 = 300)
+                }
+            }
+
+            # Vérification qu'on est bon, sinon envoi d'un mail pour prévenir
+            if (-e $aws_script_file.".e".$aws_jobID){
+                $logger -> info("\tLe fichier d'erreur pour AWS existe bien!");
+                if (! -z $aws_script_file.".e".$aws_jobID){
+                    my $testObjectPrefixe = $checkTest? "[TEST]" : "";
+                    $logger -> error("\tLe fichier d'erreur pour AWS n'est pas vide, il a dû se passer quelque chose de louche, à investiguer!" );
+                    my $mailRecipients = $checkTest? $mailTEST :'get-plage.bioinfo@genotoul.fr';
+                    my $mailContent = "Une erreur est survenue lors de la copie des fichiers SAV vers CEPH avec la commande contenue dans\n${aws_source}${aws_script_file}.\n\n";
+                    $mailContent .= "Le fichier d'erreur contient \n".`cat $aws_script_file.e$aws_jobID`;
+                    send_and_check_my_email($mailContent, "${testObjectPrefixe}Erreur sauvegarde SAV sur CEPH", $mailRecipients, $mailRecipients);
+                }else{
+                    $logger -> info("\tLe fichier d'erreur pour AWS est vide, j'aime quand un plan se déroule sans accroc!");
+                }
+            }
+        } else { $logger -> info("Nous sommes en mode test : pas besoin de sauvegarder InterOp"); }
+
+        ###############################################################
+        # CREATION READSETS NGL-Bi
+        ###############################################################
+=head1 A_SUPPRIMER
+        if ($runExistsInNGL){
+            # parcours des dossiers PipelineLogs_Lane*
+
+            # recherche du $NGLBiReadsetCreatedFile
+            ## Si trouvé : on ne fait rien, les readsets existent deja
+
+
+
+
+            if (! -e $NGLBiReadsetCreatedFile){
+                # CREATION DES READSETS DANS NGL-BI # # # # # # # # # # #
+                $logger -> info("Pas de fichier $NGLBiReadsetCreatedFile dans $raw_data/$dir -> Les readsets ne semblent pas exister dans NGL-Bi");
+            }
+        }
+=cut
+
+        ###############################################################
+        # LANCEMENT DE NEXTFLOW
+        ###############################################################
+        # création du dossier dans /work, se déplacer dedans et lancer nextflow
+
+    } # Fichier de fin de run trouvé
+    } # fin parcours des répertoires
+}
+
+###################################################################
+#
+# FONCTIONS
+#
+###################################################################
+
+sub print2file {
+    my ($content, $file2write) = @_;
+    my $logger = Log::Log4perl -> get_logger('print2file');
+    $logger -> info("\tEcriture du fichier $file2write");
+    open(my $fh, '>', $file2write) or exit 1;
+    print $fh $content;
+    close $fh;
+}
+
+sub check_my_samplesheet{
+    my ($file2check, $file2write) = @_;
+    my $logger = Log::Log4perl -> get_logger('check_my_samplesheet');
+
+    my $isfile2checkwindows;
+    my $isfile2checklinux;
+
+    $logger -> info("Etude de $file2check");
+    if (-s $file2check){    # $file2check exists and has a non zero size
+        $logger -> info("Vérification des fins de ligne");
+        $isfile2checkwindows = is_my_file_Windows($file2check);
+        $logger -> info("Sortie de is_my_file_Windows : " . $isfile2checkwindows);
+        if ($isfile2checkwindows){
+            $logger -> warn($file2check." a des fins de ligne Windows : on le convertit!");
+            convert_file_2_linux($file2check);
+            my $isfile2checkwindows2 = is_my_file_Windows($file2check);
+            if ($isfile2checkwindows2){
+                $logger -> logdie("La conversion dos2linux n'a pas fonctionné!");
+            } else {
+                $logger -> info("La conversion dos2linux a fonctionné!");
+            }
+        }else {
+            $logger -> info("Donc fins de ligne de " . $file2check . " : Linux");
+        }
+
+        $logger -> info("Etude de $file2write");
+        if(-s $file2write){    # $file2write a une taille différente de 0 byte
+            if( $file2write eq $file2check ){    #Fichier correct
+                $logger -> info($file2write." est déjà l'équivalent de ".$file2check.", on garde!");
+            }else{    #Renommer le nouveau fichier CSV $file2write et l'ancien OLD_$file2write
+                chomp($file2check);
+                $logger -> info("Copie de ".$file2write." en OLD_$file2write");
+                cp($file2write,"OLD_$file2write") or $logger -> logdie("Impossible de copier le fichier ".$file2write);
+                $logger -> info("Copie de ".$file2check." en ".$file2write);
+                cp($file2check,$file2write) or $logger -> logdie("Impossible de copier le fichier ".$file2check);
+            }
+        }else{    #Si $file2write est vide, on en fait une copie avec le nom correct
+            chomp($file2check);
+            $logger -> info("Copie de ".$file2check." en ".$file2write);
+            cp($file2check,$file2write) or $logger -> logdie("Impossible de copier le fichier ".$file2check);
+        }
+        return 1;
+    }else{
+        $logger -> info("Il n'y a pas de SampleSheet ".$file2check);
+        return 0;
+    }
+}
+
+# Récupere le code d'expérience NGL-SQ dans une samplesheet
+sub getNGLSeqExperimentCode{
+    my ($samplesheet) = @_;
+    my $logger = Log::Log4perl -> get_logger('getNGLSeqExperimentCode');
+    my $NGLSQExperimentCode = "";
+    my $experimentName_ligne = `grep "Experiment Name" $samplesheet | head -1` ; $? and $logger -> logdie("Récupération de 'Experiment Name' dans '".$samplesheet."' en echec" );
+    ($NGLSQExperimentCode) = $experimentName_ligne =~ m/Experiment Name,(.+)$/;
+    $logger -> info("NGLSQExperimentCode : ".$NGLSQExperimentCode);
+    $logger -> info("L'expérience ne sera pas rentrée dans NGL-Bi car pas de correspondance dans NGL-SQ") if($NGLSQExperimentCode eq '-');
+    $logger -> logdie("Echec de la récup du code d'expérience") if($NGLSQExperimentCode eq "");
+    return $NGLSQExperimentCode;
+}
+
+# Charge les variables d'environnement du fichier de configuration NGL
+sub loadConfFile{
+    my $logger = Log::Log4perl -> get_logger('loadConfFile');
+    unless ($ENV{CONFFILE}) {
+        $logger -> logdie("$0: Database configuration file not defined ! Initialize 'CONFFILE' with configuration file path in your environment");
+    };
+    my $dbconf_file = $ENV{CONFFILE};
+    unless (-f $dbconf_file) {
+        $logger -> logdie("$0: Database configuration file does not exist: $dbconf_file. It is required to continue.");
+    };
+    open my $handle, '<', $dbconf_file;
+    chomp( my @lines = <$handle> );
+    close $handle;
+    foreach my $line (@lines) {
+        $line =~ s/#.*//o;
+        unless ($line) { next; }
+        if ($line =~ /(.*)=(.*)/o) {
+            my $key = $1;
+            my $value = $2;
+            $key =~ s/^\s*//o;
+            $key =~ s/\s*$//o;
+            $value =~ s/^\s*//o;
+            $value =~ s/\s*$//o;
+            $ENV{$key} = $value;
+        }else {
+            $logger -> logdie("$0: Can't load variable from database configuration file $dbconf_file in line: '$line'");
+        }
+    }
+}
+
+=head2 function is_my_file_Windows
+
+    Title    : is_my_file_Windows
+    Usage    : $boolean = is_my_file_Windows($file);
+    Prerequisite : None
+    Function : Retourne 0 si les fins de ligne du fichier sont linux, 1 si Windows
+    Returns : Nombre
+    Args : $file, string
+    Globals : none
+
+=cut
+
+sub is_my_file_Windows {
+    my ($file) = @_ ;
+    my $logger = Log::Log4perl -> get_logger('is_my_file_Windows');
+    $logger -> info("Fichier en entrée : " . $file);
+    my $fileOutput;
+    my $ismyfileWindows = 0;
+
+    $fileOutput = `file $file`; $? and $logger -> logdie("[Erreur]Lancement de file");
+    chomp($fileOutput);
+    $logger -> info("Message de sortie : " . $fileOutput);
+    if ($fileOutput =~ /with CRLF.* line terminators/){
+        $logger -> info("Le fichier est Windows");
+        $ismyfileWindows = 1;
+    }
+    return $ismyfileWindows;
+}
+
diff --git a/conf/base.config b/conf/base.config
index 64b1c66f29bd14fd21d5981e55eb8c7374675aea..78238b18a04674914aa1596902df1d58cbd98590 100644
--- a/conf/base.config
+++ b/conf/base.config
@@ -1,57 +1,157 @@
-/*
- * -------------------------------------------------
- * nf-core/template Nextflow base config file
- * -------------------------------------------------
- * A 'blank slate' config file, appropriate for general
- * use on most high performace compute environments.
- * Assumes that all software is installed and available
- * on the PATH. Runs in `local` mode - all jobs will be
- * run on the logged in environment.
- */
+// ========================================
+// PARAMS
+//=========================================
+System.out.println "Chargement des paramètres de base"
+// Fixed params
+params {
+    // EMPTY INITIALISATION OF INPUT PARAMS
+    referenceGenome = ''
+    inputdir = ""
+    outdir = "./"    // base output directory for all analysis
+}
-process {
+
+import java.text.SimpleDateFormat
+SimpleDateFormat uniqueness_format = new SimpleDateFormat("yyyyMMddHHmmss")
-
-    // TODO nf-core: Check the defaults for all processes
-    cpus = { check_max( 1 * task.attempt, 'cpus' ) }
-    memory = { check_max( 7.GB * task.attempt, 'memory' ) }
-    time = { check_max( 4.h * task.attempt, 'time' ) }
-
-    errorStrategy = { task.exitStatus in [143,137,104,134,139] ? 'retry' : 'finish' }
-    maxRetries = 1
-    maxErrors = '-1'
-
-    // Process-specific resource requirements
-    // NOTE - Only one of the labels below are used in the fastqc process in the main script.
-    // If possible, it would be nice to keep the same label naming convention when
-    // adding in your processes.
-    // TODO nf-core: Customise requirements for specific processes.
-    // See https://www.nextflow.io/docs/latest/config.html#config-process-selectors
-    withLabel:process_low {
-        cpus = { check_max( 2 * task.attempt, 'cpus' ) }
-        memory = { check_max( 14.GB * task.attempt, 'memory' ) }
-        time = { check_max( 6.h * task.attempt, 'time' ) }
-    }
-    withLabel:process_medium {
-        cpus = { check_max( 6 * task.attempt, 'cpus' ) }
-        memory = { check_max( 42.GB * task.attempt, 'memory' ) }
-        time = { check_max( 8.h * task.attempt, 'time' ) }
-    }
-    withLabel:process_high {
-        cpus = { check_max( 12 * task.attempt, 'cpus' ) }
-        memory = { check_max( 84.GB * task.attempt, 'memory' ) }
-        time = { check_max( 10.h * task.attempt, 'time' ) }
-    }
-    withLabel:process_long {
-        time = { check_max( 20.h * task.attempt, 'time' ) }
-    }
-    withName:get_software_versions {
-        cache = false
-    }
-}
+System.out.println "Lecture du fichier de configuration du run : $launchDir/../params.config"
+includeConfig "$launchDir/../params.config"
+// Dynamic params
 params {
-    // Defaults only, expecting to be overwritten
-    max_memory = 12.GB
-    max_cpus = 8
-    max_time = 4.h
+    nf_uniqueness = uniqueness_format.format(new Date())
+    outdir = params.inputdir + "/nextflow/" + nf_uniqueness
+
+    System.out.println ""
+    System.out.println "runName : "+runName
+    System.out.println "data : "+dataNature
+    System.out.println "sequencer : "+sequencer
+    System.out.println "machineID : "+machineID
+    System.out.println "run_date : "+run_date
+    System.out.println "fcID : "+fcID
+    System.out.println "lane : "+lane
+    System.out.println "demuxUniqueness : "+demuxUniqueness
+    System.out.println "outdir : "+outdir
+    System.out.println ""
+}
+
+// ========================================
+// PROCESS
+//=========================================
+process {
+    executor = 'slurm'
+    queue = 'wflowq'
+    time = '1h'
+    cpus = 1
+    memory = 2.GB
+
+    errorStrategy = { task.exitStatus in [143,137,104,134,139] ? 'retry' : 'finish' }
+    maxRetries = 2
+    maxErrors = '-1'
+
+    // ----- WithName
+    withName: BWA_ALIGNMENT {
+        module = ['bioinfo/bwa-0.7.17']
+    }
+
+    withName: DUPLICATED_READS {
+        publishDir = [path: "${params.outdir}/Duplicats", mode: 'copy', pattern: "*.log"]
+        module = ['bioinfo/fastp-0.23.2']
+        time = { 5.h * task.attempt }
+        memory = { 3.GB * task.attempt }
+        cpus = { 3 * task.attempt }
+    }
+
+    withName: FASTQC {
+        // Une seule liste publishDir avec deux règles : une seconde affectation écraserait la première
+        publishDir = [
+            [
+                path: "${params.outdir}/ReadsStats",
+                mode: 'symlink',
+                pattern: '*.zip',
+                saveAs: { filename -> "${name}_fastqc.zip" }
+            ],
+            [
+                path: "${params.outdir}/ReadsStats",
+                mode: 'copy',
+                pattern: '*.html',
+                saveAs: { filename -> "${name}.html" }
+            ]
+        ]
+
+        errorStrategy = { task.exitStatus in [143,137,104,134,139] ? 'retry' : 'finish' }
+        maxRetries = 3
+        module = ['bioinfo/FastQC_v0.11.7']
+        time = { 1.h * task.attempt }
+    }
+
+    // ----- WithLabel
+    withLabel: littleJob {
+        executor = 'local'
+    }
+
+    withLabel: samtools {
+        module = ['bioinfo/samtools-1.14']
+    }
+
+    withLabel: cigar {
+        module = ['system/Python-3.7.4:bioinfo/samtools-1.14']
+    }
+
+    withLabel: qualimap {
+        module = ['system/R-3.4.3:bioinfo/qualimap-31-08-20']
+        beforeScript = 'unset DISPLAY'
+    }
+}
+
+// ========================================
+// SHARED MODULES
+//=========================================
+params.shared_modules = '/home/sbsuser/work/Nextflow/shared_modules/ExportSources_Jules'
+
+process {
+    withName: GZIP {
+        ext.args = '-f'
+        publishDir = [
+            path: { "${params.outdir}/archives" },
+            mode: 'symlink',
+            saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
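+            // A saveAs closure returning null tells Nextflow not to publish the matching file,
+            // so versions.yml stays in the task work directory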
+            pattern: "*.gz"
+        ]
+    }
+
+    withName: GUNZIP {
+        ext.args = [
+            '-f'
+        ].join(' ')
+
+        time = { 2.h * task.attempt }
+    }
+
+    withName: SEQTK_SAMPLE {
+        ext.args = '-s100'
+        ext.args2 = 100000
+
+        module = 'bioinfo/seqtk-1.3'
+
+        publishDir = [
+            path: { "${params.outdir}/subset" },
+            mode: 'symlink',
+            saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
+            pattern: "*.fast{a,q}"
+        ]
+    }
+
+    withName: MULTIQC {
+        ext.args = [
+            "--config ${baseDir}/assets/multiqc_config.yaml",
+            params.project ? "--title '${params.project}'" : ''
+        ].join(' ')
+
+        module = '/tools/share/Modules/bioinfo/MultiQC-v1.11'
+
+        publishDir = [
+            path: { "${params.outdir}/MultiQC" },
+            mode: 'copy',
+            saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
+            pattern: "*.html"
+        ]
+    }
+}
\ No newline at end of file
diff --git a/conf/path.config b/conf/path.config
deleted file mode 100644
index 4e9c550e87584053edd60052fde3de875370c0ad..0000000000000000000000000000000000000000
--- a/conf/path.config
+++ /dev/null
@@ -1,7 +0,0 @@
-//not tested.
-withName:fastqc {
-    process.beforeScript = "export PATH=/path/to/fastqc:$PATH"
-}
-withName:multiqc {
-    process.beforeScript = "export PATH=/path/to/multiqc:$PATH"
-}
\ No newline at end of file
diff --git a/conf/prod.config b/conf/prod.config
new file mode 100644
index 0000000000000000000000000000000000000000..b36b1a7eb73be4305bff6ca4b6567b32651dbd0f
--- /dev/null
+++ b/conf/prod.config
@@ -0,0 +1,35 @@
+System.out.println "Chargement des paramètres de la config PROD"
+// ========================================
+// PROCESSES
+//=========================================
+process {
+    withLabel: ngl_bi {
+        executor = 'local'
+        beforeScript = "export NGL_BI_CLIENT='/save/sbsuser/scripts-ngs/NGL-Bi_client_Current'"
+        //errorStrategy = { 'ignore' }
+    }
+
+    withLabel: samtools {
+        cpus = { 6 * task.attempt }
+        memory = { 8.GB * task.attempt }
+        time = { 3.h * task.attempt }
+    }
+
+    withLabel: qualimap {
+        cpus = { 8 * task.attempt }
+        memory = { 2.GB * task.attempt }
+        time = { 3.h * task.attempt }
+    }
+
+
+    withName: BWA_ALIGNMENT {
+        cpus = { 6 * task.attempt }
+        memory = { 8.GB * task.attempt }
+        time = { 3.d * task.attempt }
+    }
+}
+
+// ========================================
+// CONFIG FILES
+//=========================================
+includeConfig "$baseDir/conf/report.config"
\ No newline at end of file
diff --git a/conf/report.config b/conf/report.config
new file mode 100644
index 0000000000000000000000000000000000000000..385b8ecdb3929d3811f16e47f95e11ef49ad3c39
--- /dev/null
+++ b/conf/report.config
@@ -0,0 +1,33 @@
+// ========================================
+// REPORTS
+//=========================================
+timeline {
+    enabled = true
+    file = "${params.outdir}/pipeline_info/execution_timeline.html"
+}
+
+trace {
+    enabled = true
+    file = "${params.outdir}/pipeline_info/execution_trace.txt"
+    fields = 'task_id,native_id,name,status,exit,realtime,%cpu,%mem,duration,script,rss' // verifier ajout des champs
+}
+
+report {
+    enabled = true
+    file = "${params.outdir}/pipeline_info/execution_report.html"
+}
+
+dag {
+    enabled = true
+    file = "${params.outdir}/pipeline_info/pipeline_dag.svg"
+}
+
+manifest {
+    name = 'get-nextflow-ngl-bi/wf-illumina-nf'
+    author = 'Jules Sabban'
+    homePage = 'https://forgemia.inra.fr/get-nextflow-ngl-bi/wf-illumina-nf'
+    description = 'Workflow for Illumina data quality control'
+    mainScript = 'main.nf'
+    nextflowVersion = '>=0.32.0'
+    version = '1.0.0'
+}
\ No newline at
end of file diff --git a/conf/test.config b/conf/test.config index ce7674cecad64d7d9aaa04d6ba1fe48cceb59954..fa614b4a0b71ccf28727b2b929b611a28be185f0 100644 --- a/conf/test.config +++ b/conf/test.config @@ -1,22 +1,34 @@ -/* - * ------------------------------------------------- - * Nextflow config file for running tests - * ------------------------------------------------- - * Defines bundled input files and everything required - * to run a fast and simple test. Use as follows: - * nextflow run nf-core/template -profile test - */ - -params { - config_profile_name = 'Test profile' - config_profile_description = 'Minimal test dataset to check pipeline function' - // Limit resources so that this can run on Travis - max_cpus = 2 - max_memory = 6.GB - max_time = 48.h - - // Input data - // TODO nf-core: Specify the paths to your test data on nf-core/test-datasets - // TODO nf-core: Give any required params for the test so that command line flags are not needed - inputdir = './data' +// ======================================== +// PROCESSES +//========================================= +process { + withLabel: ngl_bi { + executor = 'local' + beforeScript = "export NGL_BI_CLIENT='/work/sbsuser/test/jules/VisualStudioSources/ngl-bi_client'" // test + //errorStrategy = { 'ignore' } + } + + withLabel: samtools { + cpus = { 1 * task.attempt } + memory = { 2.GB * task.attempt } + time = { 10.m * task.attempt } + } + + withLabel: qualimap { + cpus = { 1 * task.attempt } + memory = { 2.GB * task.attempt } + time = { 10.m * task.attempt } + } + + withName: BWA_ALIGNMENT { + cpus = { 3 * task.attempt } + memory = { 2.GB * task.attempt } + time = { 1.h * task.attempt } + } } + + +// ======================================== +// CONFIG FILES +//========================================= +includeConfig "$baseDir/conf/report.config" \ No newline at end of file diff --git a/data/MT_rep1_1_Ch6.fastq.gz b/data/MT_rep1_1_Ch6.fastq.gz deleted file mode 100644 index e2975f131f94a08f60b1a4d94a51d5a2cf425edd..0000000000000000000000000000000000000000 Binary files a/data/MT_rep1_1_Ch6.fastq.gz and /dev/null differ diff --git a/data/MT_rep1_2_Ch6.fastq.gz b/data/MT_rep1_2_Ch6.fastq.gz deleted file mode 100644 index bb7bbdac117a0965f4a41b71f8baa2bbac2efa26..0000000000000000000000000000000000000000 Binary files a/data/MT_rep1_2_Ch6.fastq.gz and /dev/null differ diff --git a/data/samples.csv b/data/samples.csv deleted file mode 100644 index bc50be23eabfb1e5cded3f309e7960cd9064008e..0000000000000000000000000000000000000000 --- a/data/samples.csv +++ /dev/null @@ -1 +0,0 @@ -1,MT,./data/MT_rep1_1_Ch6.fastq.gz,./data/MT_rep1_2_Ch6.fastq.gz \ No newline at end of file diff --git a/main.nf b/main.nf index befd72c81255f0c5d3b4fac07509f755208662e4..9de84762554479b5622030acb442bff4525de209 100644 --- a/main.nf +++ b/main.nf @@ -1,5 +1,6 @@ #!/usr/bin/env nextflow +nextflow.enable.dsl = 2 /* Copyright INRAE 2021 @@ -16,364 +17,27 @@ The fact that you are presently reading this means that you have had knowledge of the license and that you accept its terms. This script is based on : - the nf-core guidelines . 
See https://nf-co.re/ for more information - - the institut cury template https://github.com/bioinfo-pf-curie/geniac-template/ + - the Curie institute template https://github.com/bioinfo-pf-curie/geniac-template/ */ - /* ======================================================================================== - GeT/template + NAMED WORKFLOW FOR PIPELINE ======================================================================================== - GeT/template Analysis Pipeline. - #### Homepage / Documentation - https://github.com/get-nf/template ----------------------------------------------------------------------------------------- */ +include { ILLUMINA_QC } from "$baseDir/workflow/illumina_qc.nf" -def helpMessage() { - log.info""" - - Usage: - - The typical command for running the pipeline is as follows: - - nextflow run get-nf/template --inputdir '/path/to/data' --samplesheet 'samples.csv' -profile docker - - Mandatory arguments: - --inputdir Path to input directory - -profile Configuration profile to use. Can use multiple (comma separated) - Available: conda, docker, singularity, path, genotoul, test and more. - - Options: - --samplesheet Default inputdir/samples.csv eg: SAMPLE_ID,SAMPLE_NAME,path/to/R1/fastq/file,path/to/R2/fastq/file (for paired-end only) - --contaminant Name of iGenomes // To be discussed ???? - --outdir The output directory where the results will be saved - --email Set this parameter to your e-mail address to get a summary e-mail with details of the run sent to you when the workflow exits - --email_on_fail Same as --email, except only send mail if the workflow is not successful - --maxMultiqcEmailFileSize Theshold size for MultiQC report to be attached in notification email. If file generated by pipeline exceeds the threshold, it will not be attached (Default: 25MB) - - -name [str] Name for the pipeline run. If not specified, Nextflow will automatically generate a random mnemonic. - - - ======================================================= - Available profiles - -profile test Run the test dataset - -profile conda Build a new conda environment before running the pipeline. Use `--condaCacheDir` to define the conda cache path - -profile path Use the installation path defined for all tools. Use `--globalPath` to define the installation path - -profile docker Use the Docker images for each process - -profile singularity Use the singularity images for each process - -profile genologin Run the workflow on the cluster, instead of locally - - """.stripIndent() -} - -// Show help message -if (params.help) { - helpMessage() - exit 0 -} - - -// NOTE - THIS IS NOT USED IN THIS PIPELINE, EXAMPLE ONLY - -/* - * Create a channel for input read files - */ -// If you want to use the channel below in a process, define the following: -// input: -// file dir from inputDirCh -// - - -ch_inputdir = params.inputdir ? 
Channel.fromPath(params.inputdir, checkIfExists: true) : Channel.empty() - -// Create a channel for input read files -if(params.samplesheet){ - if(params.single_end){ - Channel - .from(file("${params.samplesheet}")) - .splitCsv(header: false) - .map{ row -> [ row[0], [file(row[2])]] } - .into { ch_read_files_for_fastqc; ch_read_files_for_qc1; ch_read_files_for_assembly} - }else{ - Channel - .from(file("${params.samplesheet}")) - .splitCsv(header: false) - .map{ row -> [ row[0], [file(row[2]), file(row[3])]] } - .into { ch_read_files_for_fastqc; ch_read_files_for_qc1; ch_read_files_for_assembly} - } - params.reads=false -} else { - exit 1, "Expect a samplesheet and an input dir !" -} -/* - * SET UP CONFIGURATION VARIABLES - */ -// Has the run name been specified by the user? -// this has the bonus effect of catching both -name and --name -custom_runName = params.name -if (!(workflow.runName ==~ /[a-z]+_[a-z]+/)) { - custom_runName = workflow.runName -} -// Stage config files -ch_multiqc_config = file(params.multiqc_config, checkIfExists: true) -ch_output_docs = file("$projectDir/docs/output.md", checkIfExists: true) - - -def summary = [:] -if (workflow.revision) summary['Pipeline Release'] = workflow.revision -summary['Run Name'] = custom_runName ?: workflow.runName -// TODO nf-core: Report custom parameters here -summary['Input dir'] = params.inputdir -summary['Sample sheet'] = params.samplesheet -summary['Data Type'] = params.single_end ? 'Single-End' : 'Paired-End' -summary['Max Resources'] = "$params.max_memory memory, $params.max_cpus cpus, $params.max_time time per job" -if (workflow.containerEngine) summary['Container'] = "$workflow.containerEngine - $workflow.container" -summary['Output dir'] = params.outdir -summary['Launch dir'] = workflow.launchDir -summary['Working dir'] = workflow.workDir -summary['Script dir'] = workflow.projectDir -summary['User'] = workflow.userName -if (workflow.profile == 'awsbatch') { - summary['AWS Region'] = params.awsregion - summary['AWS Queue'] = params.awsqueue -} -summary['Config Profile'] = workflow.profile -if (params.email || params.email_on_fail) { - summary['E-mail Address'] = params.email - summary['E-mail on failure'] = params.email_on_fail -} -log.info "-\033[2m--------------------------------------------------\033[0m-" -log.info "-\033[2m----------------"+ workflow.manifest.name +" --\033[0m-" -log.info "-\033[2m--------------------------------------------------\033[0m-" -log.info summary.collect { k,v -> "${k.padRight(18)}: $v" }.join("\n") -log.info "-\033[2m--------------------------------------------------\033[0m-" - -/* - * Parse software version numbers - */ -process get_software_versions { - publishDir "${params.outdir}/pipeline_info", mode: 'copy', - saveAs: { filename -> - if (filename.indexOf(".csv") > 0) filename - else null - } - - output: - file 'software_versions_mqc.yaml' into software_versions_yaml - file "software_versions.csv" - - script: - // TODO nf-core: Get all tools to print their version number here - """ - echo $workflow.manifest.version > v_pipeline.txt - echo $workflow.nextflow.version > v_nextflow.txt - fastqc --version > v_fastqc.txt - multiqc --version > v_multiqc.txt - scrape_software_versions.py &> software_versions_mqc.yaml - """ -} -/* - * STEP 1 - FastQC - */ -process fastqc { - tag "$name" - label 'process_medium' - publishDir "${params.outdir}/fastqc", mode: 'copy', - saveAs: { filename -> filename.indexOf(".zip") > 0 ? 
"zips/$filename" : "$filename" } - - input: - set val(name), file(reads) from ch_read_files_for_fastqc - - output: - file "*_fastqc.{zip,html}" into ch_fastqc_results_for_multiqc - - script: - """ - fastqc --quiet --threads $task.cpus $reads - """ -} - -/* - * STEP 2 - Fake QC - */ -process qc1 { - input: - set replicate_id, file(reads) from ch_read_files_for_qc1 - - output: - file("${replicate_id}.qc1") into ch_fastqc_raw_for_assembly - - script: - """ - echo "mkdir ${replicate_id} ; fastqc --nogroup --quiet -o ${replicate_id} --threads ${task.cpus} ${reads[0]} ${reads[1]}" > ${replicate_id}.qc1 - """ -} - -/* - * STEP 3 - Fake assembly - */ -process assembly { - input: - file (qc) from ch_fastqc_raw_for_assembly - set replicate_id, file(reads) from ch_read_files_for_assembly - - output: - file("${replicate_id}.assembly") into ch_assembly_for_multiqc - - script: - """ - echo "ASSEMBLY ${replicate_id} ; " > ${replicate_id}.assembly - """ -} - -process workflow_summary { - - output: - file 'workflow_summary_mqc.yaml' into ch_workflow_summary_yaml - - exec: - def yaml_file = task.workDir.resolve('workflow_summary_mqc.yaml') - yaml_file.text = """ - id: 'summary' - description: " - this information is collected when the pipeline is started." - section_name: 'Workflow Summary' - section_href: "${workflow.manifest.homePage}" - plot_type: 'html' - data: | - <dl class=\"dl-horizontal\"> - ${summary.collect { k,v -> " <dt>$k</dt><dd><samp>${v ?: '<span style=\"color:#999999;\">N/A</a>'}</samp></dd>" }.join("\n")} - </dl> - """.stripIndent() -} - -/* - * STEP - MultiQC - */ -process multiqc { - - publishDir "${params.outdir}/MultiQC", mode: 'copy' - - when: - !params.skip_multiQC - - input: - file (multiqc_config) from ch_multiqc_config - file ('fastqc/*') from ch_fastqc_results_for_multiqc.collect().ifEmpty([]) - // TODO get-nf: Add in log files from your new processes for MultiQC to find! - file ('software_versions/*') from software_versions_yaml.collect() - file ('workflowSummary/*') from ch_workflow_summary_yaml.collect() - - output: - file "*report.html" into ch_multiqc_report - file "*_data" - file "multiqc_plots" - - script: - rtitle = custom_runName ? "--title \"$custom_runName\"" : '' - rfilename = custom_runName ? "--filename " + custom_runName.replaceAll('\\W','_').replaceAll('_+','_') + "_multiqc_report" : '' - """ - multiqc -f $rtitle $rfilename --config $multiqc_config . 
- """ +workflow QC_ANALYSIS { + ILLUMINA_QC() } /* - * STEP - Output Description HTML - */ -process output_documentation { - publishDir "${params.outdir}/pipeline_info", mode: 'copy' - - input: - file output_docs from ch_output_docs - - output: - file "results_description.html" +======================================================================================== + RUN ALL WORKFLOWS +======================================================================================== +*/ - script: - """ - pandoc $output_docs -t html -o results_description.html - """ +workflow { + QC_ANALYSIS() } - -/* - * Completion e-mail notification - */ -workflow.onComplete { - - // Set up the e-mail variables - def name_wf = workflow.manifest.name - def subject = "[$name_wf] Successful: $workflow.runName" - if (!workflow.success) { - subject = "[$name_wf] FAILED: $workflow.runName" - } - def email_fields = [:] - email_fields['version'] = workflow.manifest.version - email_fields['runName'] = custom_runName ?: workflow.runName - email_fields['success'] = workflow.success - email_fields['dateComplete'] = workflow.complete - email_fields['duration'] = workflow.duration - email_fields['exitStatus'] = workflow.exitStatus - email_fields['errorMessage'] = (workflow.errorMessage ?: 'None') - email_fields['errorReport'] = (workflow.errorReport ?: 'None') - email_fields['commandLine'] = workflow.commandLine - email_fields['projectDir'] = workflow.projectDir - email_fields['summary'] = summary - println(workflow) - - email_fields['summary']['Date Started'] = 11 // workflow.start - email_fields['summary']['Date Completed'] = 11 // workflow.complete - email_fields['summary']['Pipeline script file path'] = 'aaa' //workflow.scriptFile - email_fields['summary']['Pipeline script hash ID'] = 'aaa' //workflow.scriptId - if (workflow.repository) email_fields['summary']['Pipeline repository Git URL'] = workflow.repository - if (workflow.commitId) email_fields['summary']['Pipeline repository Git Commit'] = workflow.commitId - if (workflow.revision) email_fields['summary']['Pipeline Git branch/tag'] = workflow.revision - if (workflow.container) email_fields['summary']['Docker image'] = workflow.container - email_fields['summary']['Nextflow Version'] = workflow.nextflow.version - email_fields['summary']['Nextflow Build'] = workflow.nextflow.build - email_fields['summary']['Nextflow Compile Timestamp'] = workflow.nextflow.timestamp - - // Check if we are only sending emails on failure - email_address = params.email - if (!params.email && params.email_on_fail && !workflow.success) { - email_address = params.email_on_fail - } - - // Render the TXT template - def engine = new groovy.text.GStringTemplateEngine() - def tf = new File("$baseDir/assets/email_template.txt") - def txt_template = engine.createTemplate(tf).make(email_fields) - def email_txt = txt_template.toString() - - // Send the HTML e-mail - if (email_address) { - // Catch failures and try with plaintext - [ 'mail', '-s', subject, email_address ].execute() << email_txt - log.info "[$name_wf] Sent summary e-mail to $email_address (mail)" - log.info "$email_txt" - } - - // Write summary e-mail HTML to a file - def output_d = new File( "${params.outdir}/pipeline_info/" ) - if (!output_d.exists()) { - output_d.mkdirs() - } - def output_tf = new File( output_d, "pipeline_report.txt" ) - output_tf.withWriter { w -> w << email_txt } - c_green = params.monochrome_logs ? '' : "\033[0;32m"; - c_purple = params.monochrome_logs ? '' : "\033[0;35m"; - c_red = params.monochrome_logs ? 
'' : "\033[0;31m"; - c_reset = params.monochrome_logs ? '' : "\033[0m"; - - if (workflow.stats.ignoredCount > 0 && workflow.success) { - log.info "-${c_purple}Warning, pipeline completed, but with errored process(es) ${c_reset}" - log.info "-${c_red}Number of ignored errored process(es) : ${workflow.stats.ignoredCount} ${c_reset}" - log.info "-${c_green}Number of successfully ran process(es) : ${workflow.stats.succeedCount} ${c_reset}" - } - if (workflow.success) { - log.info "-${c_purple}[${name_wf}]${c_green} Pipeline completed successfully${c_reset}" - } else { - log.info "-${c_purple}[${name_wf}]${c_red} Pipeline completed with errors${c_reset}" - } - -} \ No newline at end of file diff --git a/modules/local/module_NGL-Bi.nf b/modules/local/module_NGL-Bi.nf new file mode 100644 index 0000000000000000000000000000000000000000..96f29d5f40edc0861fe33e3b93ed25aeb724cb11 --- /dev/null +++ b/modules/local/module_NGL-Bi.nf @@ -0,0 +1,54 @@ +params.outdir='' + + +process prepareReadSetCreation { + publishDir path: "${params.outdir}/NGLBi" , mode: 'copy' + + input: + path sampleSheet + path runNGLBiCreated + + output: + file 'readSetCreation.info' + + script: + """ + extractInfoForReadSets.pl --sampleSheet $sampleSheet --runNGLBi $runNGLBiCreated + """ +} + +process readsetNGLBiCreation { + publishDir path: "${params.outdir}/NGLBi" , mode: 'copy', pattern: '*.created' + + executor = 'local' + beforeScript = "export ENV_NGL='/save/sbsuser/scripts-ngs/NGL-Bi_client_Current/IG/SystemeInteractionNGL-Bi/'" + errorStrategy = { 'ignore' } + + input : + path infoFile + + output : + path 'ReadsetsNGL-Bi.created', emit: readSetFile + path 'ReadsetsNGL-BiCreation.log', emit: readSetLog + + script : + """ + createNGLBiReadSets.pl --infoFile $infoFile --env_ngl_bi \$ENV_NGL 2> ReadsetsNGL-BiCreation.log 1> ReadsetsNGL-Bi.created + + """ +} + +process checkErrorFromNGLBi { + publishDir path: "${params.outdir}/NGLBi" , mode: 'copy' + + input: + path logFile + + output: + path 'ReadsetsNGL-BiCreation.log' + + script: + """ + checkErrorNGLScripts.pl --file $logFile + """ +} \ No newline at end of file diff --git a/modules/local/module_core.nf b/modules/local/module_core.nf new file mode 100644 index 0000000000000000000000000000000000000000..ca2a7bf1d19974ccd98d95489168ca3933dfa4dd --- /dev/null +++ b/modules/local/module_core.nf @@ -0,0 +1,277 @@ +/* + * Module pour les analyses de base du pipeline +*/ + +process extractInfoForDemuxStats { + publishDir path: "${params.outdir}/Demux/Stats" , mode: 'copy' + + input: + path SampleSheet + + output: + path "*.indexNumber" + + script: + """ + extractInfoForDemuxStats.pl --sampleSheet $SampleSheet + + """ +} + +process demultiplexStats { + publishDir path: "${params.outdir}/Demux" , mode: 'copy' + + //module 'system/R-4.0.4_gcc-9.3.0' // Ne fonctionne pas ! 
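+    // Workaround: as the module directive is not applied here, the script block below runs
+    // `module load system/R-4.0.4_gcc-9.3.0` itself before calling demuxStatsFromXML.R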
+ + input: + path DemuxStatXML + path IndexNumberFile + path DemuxSummary + + output: + path 'demultiplexStats.log', emit: log + path "DemultiplexStats_*", emit: demultiplexStatsCSV + + script: + """ + module load system/R-4.0.4_gcc-9.3.0 + demuxStatsFromXML.R --xml $DemuxStatXML --indexNumber $IndexNumberFile --demuxSum $DemuxSummary > demultiplexStats.log + """ +} + +process FASTQC { + + + tag " $name" + + input: + tuple val(name), path(read) + + output: + tuple val(name), path("*_fastqc.{zip,html}") , emit: report + // path log files + + script: + """ + fastqc -t $task.cpus --nogroup --noextract --outdir ./ ${read} + """ +} + + +process illuminaFilter { + publishDir path: "${params.outdir}/IlluminaFilter" , mode: 'copy', pattern: '*.gz'/*, saveAs: { filename -> "${name}.fastq.gz" }*/ + + module 'bioinfo/fastq_illumina_filter-0.1' + executor 'slurm' + queue 'wflowq' + cpus { 1 * task.attempt } + time { 1.h * task.attempt } + memory '1.GB' + + tag " $name" + + input: + tuple val(name), path(read) + + output: + tuple val("$name"), path("*.fastq.gz"), emit: reads + path("*.output"), emit: log + + script: + """ + zcat $read | fastq_illumina_filter --keep N -v 2> ${name}.output | gzip -c -f > ${name}_filtered.fastq.gz + """ + +} + +process BWA_ALIGNMENT { + publishDir path: "${params.outdir}/ContaminationSearch/tmp" , mode: 'copy' + + tag " $sample" + + input: + tuple val(sample), path(reads) + each genomeRef + + output: + //tuple val(sample), path("*.log"), emit: log + tuple val("${sample}_${genomeName}"), path("${sample}_${genomeName}.sam"), emit: sam + + script: + genomeName=file(genomeRef).simpleName + """ + bwa mem ${genomeRef} ${reads} 1> ${sample}_${genomeName}.sam 2> ${sample}.log + """ +} + +process FASTQSCREEN { + publishDir path: "${params.outdir}/ContaminationSearch/FastQ-Screen", mode: 'copy' + + module 'bioinfo/FastQ-Screen-0.15.2' + time { 1.h * task.attempt } + + tag " $sample" + + input: + tuple val(sample), path(reads) + + output: + tuple val(sample), path("*.txt"), emit: report + + script: + """ + fastq_screen $reads --conf $launchDir/../fastq_screen.conf + """ +} + +process DUPLICATED_READS { + + tag "$sample" + + input: + tuple val(sample), path(fastq) + + output: + tuple val(sample), path("*.json"), emit: json + tuple val(sample), path("*.log") + + shell: + R1_name=file(fastq[0]).simpleName + R2_name=file(fastq[1]).simpleName + ''' + fastp \ + -i !{fastq[0]} \ + -o !{R1_name}_dedupl.fastq \ + -I !{fastq[1]} \ + -O !{R2_name}_dedupl.fastq \ + --disable_adapter_trimming \ + --disable_quality_filtering \ + --disable_length_filtering \ + --json !{R1_name}_fastp.json \ + 2> !{R1_name}.log + ''' +} + + +/* -------------------------------------------------------------------- + * OLD PROCESS + * -------------------------------------------------------------------- +*/ +process decoupageSS { + // Not used anymore + publishDir path: "${params.outdir}/SampleSheets" , mode: 'copy' + + input: + path multiSS + + output: + path '*' + + shell: + """ + extractReads.pl $multiSS NovaSeq + + """ +} + +process maskMaker { + publishDir path: "${params.outdir}/Demux" , mode: 'copy' + + input: + path SampleSheet + path RunInfoXML + + output: + path 'Run.conf' + + script: + """ + extractInfo.pl -s $SampleSheet -r $RunInfoXML + + """ +} + +process bcl2fastq { + publishDir path: "${params.outdir}/Demux/Reads" , mode: 'copy' + + echo=true + + input: + path SampleSheet + path Runconf + val mismatchNumber + path rawdata_location + + //output: + //path "*" + + shell: + """ + mask=\$(grep 'MASQUE' 
!{Runconf} | cut -d'=' -f2)
+    echo "bcl2fastq -p 10 -r 4 -w 4 \${mask} --barcode-mismatches !{mismatchNumber} --output-dir ./ -R !{rawdata_location} --sample-sheet !{SampleSheet} -l DEBUG"
+
+    """
+}
+
+process search_conta_bwa {
+    // aln command uses ~3.2GB memory and the sampe command uses ~5.4GB
+    publishDir path: "${params.outdir}/ContaminationSearch/tmp" , mode: 'copy'
+    module 'bioinfo/bwa-0.7.17'
+    time { 20.m * task.attempt }
+    memory { 5.GB * task.attempt }
+
+    input:
+        tuple val(name), path(read)
+        each genomeRef
+
+    output:
+        tuple val("${name}_${genomeName}"), path("${name}_${genomeName}.sam"), emit: sam
+
+    script:
+        genomeName=file(genomeRef).simpleName
+        """
+        bwa aln $genomeRef $read 2>> ${name}_${genomeName}.err | bwa samse $genomeRef - $read > ${name}_${genomeName}.sam 2>> ${name}_${genomeName}.err
+        """
+}
+
+process search_conta_samtools {
+    publishDir path: "${params.outdir}/ContaminationSearch" , mode: 'copy'
+
+    module 'bioinfo/samtools-1.9'
+    time { 10.m * task.attempt }
+
+    tag " $name"
+
+    input:
+        tuple val(name), path("*")
+
+    output:
+        //tuple val("$name"), path("*")
+        path("*.txt")
+
+    script:
+        """
+        samtools view -SF 260 ${name}.sam 2>> ${name}.err | cut -f1 - 2>> ${name}.err | sort - > ${name}.txt 2>> ${name}.err
+        """
+}
+
+process search_conta_summary {
+    publishDir path: "${params.outdir}/ContaminationSearch" , mode: 'copy'
+
+    time { 10.m * task.attempt }
+    memory '1.GB'
+
+    input:
+        //tuple val(name), path("*")
+        path("*")
+
+    output:
+        path("*.yaml")
+
+    script:
+        """
+        contaCounter.pl ./
+        """
+}
\ No newline at end of file
diff --git a/modules/local/module_dna.nf b/modules/local/module_dna.nf
new file mode 100644
index 0000000000000000000000000000000000000000..ea95679f3b5e8d715c75afe50cee3dd193315ae8
--- /dev/null
+++ b/modules/local/module_dna.nf
@@ -0,0 +1,176 @@
+/*
+ * Module pour l'alignement des reads ADN sur génome de référence et des statistiques associées
+*/
+
+process BWA_ALIGNMENT {
+    publishDir path: "${params.outdir}/alignment/bwa" , mode: 'copy'
+
+    tag " $sample"
+
+    input:
+        tuple val(sample), path(reads)
+
+    output:
+        tuple val(sample), path("*.log"), emit: log
+        tuple val(sample), path("*.sam"), emit: sam
+
+    script:
+        """
+        bwa mem ${params.referenceGenome} ${reads} 1> ${sample}.sam 2> ${sample}.log
+        """
+}
+
+process SAMTOOLS_VIEW {
+    publishDir path: "${params.outdir}/alignment/samtools" , mode: 'copy'
+
+    tag "$sample"
+
+    label 'samtools'
+
+    input:
+        tuple val(sample), path(sam)
+
+    output:
+        tuple val(sample), path("*.bam"), emit: bam
+
+    script:
+        """
+        samtools view -bS ${sam} > ${sample}.bam
+        """
+}
+
+process SAMTOOLS_SORT {
+    publishDir path: "${params.outdir}/alignment/samtools" , mode: 'copy'
+
+    tag "$sample"
+
+    label 'samtools'
+
+    input:
+        tuple val(sample), path(bam)
+
+    output:
+        tuple val(sample), path("*.log"), emit: log
+        tuple val(sample), path("*.bam"), emit: bam
+        //path("*.bam"), emit: bam
+
+    script: // Pourquoi unmerged ??? 
+
+process SAMTOOLS_VIEW {
+    publishDir path: "${params.outdir}/alignment/samtools", mode: 'copy'
+
+    tag "$sample"
+
+    label 'samtools'
+
+    input:
+        tuple val(sample), path(sam)
+
+    output:
+        tuple val(sample), path("*.bam"), emit: bam
+
+    script:
+    """
+    samtools view -bS ${sam} > ${sample}.bam
+    """
+}
+
+process SAMTOOLS_SORT {
+    publishDir path: "${params.outdir}/alignment/samtools", mode: 'copy'
+
+    tag "$sample"
+
+    label 'samtools'
+
+    input:
+        tuple val(sample), path(bam)
+
+    output:
+        tuple val(sample), path("*.log"), emit: log
+        tuple val(sample), path("*.bam"), emit: bam
+        //path("*.bam"), emit: bam
+
+    script:
+    // Why the "_unmerged" suffix? Kept from ng6, see:
+    // https://forgemia.inra.fr/genotoul-bioinfo/ng6/-/blob/master/workflows/components/bwa.py#L97
+    """
+    samtools sort ${bam} -o ${sample}_unmerged.bam 2>> ${sample}.log
+    """
+}
+
+process SAMTOOLS_FLAGSTATS {
+    publishDir path: "${params.outdir}/alignmentStats/samtools", mode: 'copy'
+
+    tag "$sample"
+
+    label 'samtools'
+
+    input:
+        tuple val(sample), path(bam)
+
+    output:
+        tuple val(sample), path("*.log"), emit: log
+        tuple val(sample), path("*.txt"), emit: txt
+
+    script:
+    """
+    samtools flagstat ${bam} > ${sample}_flagstat.txt 2>> ${sample}.log
+    """
+}
+
+process QUALIMAP {
+    publishDir path: "${params.outdir}/alignmentStats/qualimap", mode: 'copy'
+
+    tag "$sample"
+
+    label 'qualimap'
+
+    errorStrategy 'ignore'
+
+    input:
+        tuple val(sample), path(bam)
+
+    output:
+        tuple val(sample), path("*.log"), emit: log
+        tuple val(sample), path("*/*"), emit: all    // ${sample}_stats/*
+        tuple val(sample), path("${sample}"), emit: report
+
+    script:
+    """
+    qualimap bamqc -bam ${bam} -outdir ${sample} 1> ${sample}.log
+    """
+}
+
+
+/*
+process alignmentQualityStats {
+    publishDir path: "${params.outdir}/alignmentStats/cigar", mode: 'copy'
+
+    label 'cigar'
+
+    input:
+        tuple val(sample), path(bam)
+
+    output:
+        tuple val(sample), path("*.log"), emit: log
+        tuple val(sample), path("*.csv"), emit: csv
+        tuple val(sample), path("*.png"), emit: graph
+
+    script:
+    cigarOptions = params.splitReads ? "--readsplit" : ""
+
+    if (params.pairedEnd) {
+        """
+        samtools view -F0x0100 ${bam} | cigarlineGraph.py -i - -t ${sample}_R1.csv ${sample}_R2.csv -o ${sample}_R1.png ${sample}_R2.png ${cigarOptions} 2> ${sample}.log
+        """
+    } else {
+        """
+        samtools view -F0x0100 ${bam} | cigarlineGraph.py -i - -t ${sample}_R1.csv ${cigarOptions} 2> ${sample}.log
+        """
+    }
+}
+
+process alignmentSummary {
+    publishDir path: "${params.outdir}/alignmentStats/summary", mode: 'copy'
+
+    label 'samtools'
+
+    input:
+        tuple val(sample), path(bam)
+
+    output:
+        tuple val(sample), path("*.stat"), emit: stat
+
+    script:
+    """
+    samtools view -F0x0100 -bh ${bam} | samtools flagstat - > ${sample}.stat
+    """
+}
+
+process readAlignementSummary { // addTreatment
+    publishDir path: "${params.outdir}/alignmentStats/summary", mode: 'copy'
+
+    input:
+        tuple val(sample), path(statFile)
+
+    output:
+        tuple val(sample), path("*.log"), emit: log
+
+    script:
+    """
+    alignementStatTreatment.pl --file ${statFile} 1> ${sample}.log
+    """
+}
+
+//alignmentQualityStats(samtoolsSort.out.bam)
+//alignmentSummary(samtoolsSort.out.bam)
+//readAlignementSummary(alignmentSummary.out.stat)
+
+*/
\ No newline at end of file
diff --git a/modules/local/module_test.nf b/modules/local/module_test.nf
new file mode 100644
index 0000000000000000000000000000000000000000..a15894d9108882a6f370d94a6d8676149cf2cebd
--- /dev/null
+++ b/modules/local/module_test.nf
@@ -0,0 +1,17 @@
+process bar {
+    publishDir path: "/home/sbsuser/work/Nextflow/wf-illumina-nf/results", mode: 'copy'
+
+    input:
+        path x
+        path y
+
+    output:
+        path 'bar.txt', emit: fichier_de_sortie
+        // path 'foo.txt', emit: other_file
+
+    script:
+    """
+    (cat $x; head $y) > bar.txt
+    """
+}
+
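+// A minimal sketch of how `bar` could be exercised on its own (hypothetical
+// input paths; assumes DSL2 is enabled), e.g. with:
+//   nextflow run modules/local/module_test.nf -entry test_bar
+workflow test_bar {
+    bar(Channel.fromPath('foo1.txt'), Channel.fromPath('foo2.txt'))
+    bar.out.fichier_de_sortie.view()
+}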

diff --git a/nextflow.config b/nextflow.config
index 87e3584b4666dcd82ef1a565310727c0fdd45fae..26777bdf73c9dfca319004b47560781a7d3c6c7e 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -1,63 +1,48 @@
-/*
- * -------------------------------------------------
- * nf-core/template Nextflow config file
- * -------------------------------------------------
- * Default config options for all environments.
- */
-
-// Global default params, used in configs
-params {
-
-  // Workflow flags
-  // TODO nf-core: Specify your pipeline's command line flags
-  inputdir = "./data"
-  samplesheet = "${params.inputdir}/samples.csv"
-  single_end = false
-  outdir = './results'
-  skip_multiQC = false
-
-  // Boilerplate options
-  name = false
-  multiqc_config = "$baseDir/assets/multiqc_config.yaml"
-  tracedir = "${params.outdir}/pipeline_info"
-  email = false
-  email_on_fail = false
-  monochrome_logs = false
-  help = false
-  config_profile_description = false
-  config_profile_contact = false
-  config_profile_url = false
-
-  // if use -profile path specify path where all binaries are stored
-  globalPath = ""
-}
-
-params {
-  // Defaults only, expecting to be overwritten
-  max_memory = 20.GB
-  max_cpus = 4
-  max_time = 40.h
-}
+// ========================================
+// PARAMS
+// ========================================
+// Global params
+params {
+    // TOOL PARAMETERS
+    // TODO
+
+    // OTHERS
+    email = "jules.sabban@inrae.fr"
+    email_on_fail = "jules.sabban@inrae.fr"
+    email_bioinfo = "get-plage.bioinfo@genotoul.fr"
+    email_labo = "get-plage.labo@genotoul.fr"
+
+    monochrome_logs = true
+    help = false
+
+    // TODO: check whether these three are still needed
+    config_profile_description = false
+    config_profile_contact = false
+    config_profile_url = false
+}
+
+// ========================================
+// PROFILES
+// ========================================
+// Load base.config by default for all pipelines
+includeConfig "$baseDir/conf/base.config"
+System.out.println "Base configuration loaded"

// Container slug. Stable releases should specify release tag!
// Developmental code should specify :dev
process.container = "$baseDir/template-nf.sif"

-// Load base.config by default for all pipelines
-includeConfig 'conf/base.config'
-
profiles {
-  conda { process.conda = "$baseDir/environment.yml" }
-  debug { process.beforeScript = 'echo $HOSTNAME' }
-  docker { docker.enabled = true }
-  singularity { singularity.enabled = true }
-  test { includeConfig 'conf/test.config' }
-  path { process.beforeScript = "export PATH=${params.globalPath}:$PATH" }
-  multipath { includeConfig 'conf/path.config' }
-  genotoul { includeConfig 'conf/genotoul.config' }
+    conda { process.conda = "$baseDir/environment.yml" }
+    debug { process.beforeScript = 'echo $HOSTNAME' }
+    docker { docker.enabled = true }
+    singularity { singularity.enabled = true }
+    test { includeConfig "$baseDir/conf/test.config" }
+    prod { includeConfig "$baseDir/conf/prod.config" }
}

+System.out.println "All profiles have been processed"
+
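+// Profiles can also be combined on the command line when needed, e.g.:
+//   nextflow run main.nf -profile test,conda
+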
// Avoid this error:
// WARNING: Your kernel does not support swap limit capabilities or the cgroup is not mounted. Memory limited without swap.
// Testing this in nf-core after discussion here https://github.com/nf-core/tools/pull/351, once this is established and works well, nextflow might implement this behavior as new default.
@@ -65,67 +50,4 @@
docker.runOptions = '-u \$(id -u):\$(id -g)'

// Capture exit codes from upstream processes when piping
process.shell = ['/bin/bash', '-euo', 'pipefail']
-
-timeline {
-  enabled = true
-  file = "${params.tracedir}/execution_timeline.html"
-}
-
-trace {
-  enabled = true
-  file = "${params.tracedir}/execution_trace.txt"
-  fields = 'task_id,name,status,exit,realtime,%cpu,rss'
-}
-
-report {
-  enabled = true
-  file = "${params.tracedir}/execution_report.html"
-}
-
-dag {
-  enabled = true
-  file = "${params.tracedir}/pipeline_dag.svg"
-}
-
-manifest {
-  name = 'get-nextflow-ngl-bi/template-nf'
-  author = 'Céline Noirot'
-  homePage = 'https://forgemia.inra.fr/get-nextflow-ngl-bi/template-nf'
-  description = 'get workflow template'
-  mainScript = 'main.nf'
-  nextflowVersion = '>=0.32.0'
-  version = '1.0dev'
-}
-
-// Function to ensure that resource requirements don't go beyond
-// a maximum limit
-def check_max(obj, type) {
-  if (type == 'memory') {
-    try {
-      if (obj.compareTo(params.max_memory as nextflow.util.MemoryUnit) == 1)
-        return params.max_memory as nextflow.util.MemoryUnit
-      else
-        return obj
-    } catch (all) {
-      println "   ### ERROR ###   Max memory '${params.max_memory}' is not valid! Using default value: $obj"
-      return obj
-    }
-  } else if (type == 'time') {
-    try {
-      if (obj.compareTo(params.max_time as nextflow.util.Duration) == 1)
-        return params.max_time as nextflow.util.Duration
-      else
-        return obj
-    } catch (all) {
-      println "   ### ERROR ###   Max time '${params.max_time}' is not valid! Using default value: $obj"
-      return obj
-    }
-  } else if (type == 'cpus') {
-    try {
-      return Math.min( obj, params.max_cpus as int )
-    } catch (all) {
-      println "   ### ERROR ###   Max cpus '${params.max_cpus}' is not valid! Using default value: $obj"
-      return obj
-    }
-  }
-}
+System.out.println "End of nextflow.config"
\ No newline at end of file
diff --git a/modules/.gitkeep b/sub-workflows/local/10X_qc.nf
similarity index 100%
rename from modules/.gitkeep
rename to sub-workflows/local/10X_qc.nf
diff --git a/sub-workflows/local/core_pipeline.nf b/sub-workflows/local/core_pipeline.nf
new file mode 100644
index 0000000000000000000000000000000000000000..9ac154556b8f31c92e624cdf4933a4c3410c59f1
--- /dev/null
+++ b/sub-workflows/local/core_pipeline.nf
@@ -0,0 +1,103 @@
+
+// -------------------------------------------------
+// CORE PIPELINE
+// -------------------------------------------------
+/*
+ * NGL-Bi readset creation -> later
+ * Demultiplexing statistics
+ * Read QC
+ * Contamination search
+ * Duplicate search
+*/
+
+// -------------------------------------------------
+// MODULES
+// -------------------------------------------------
+include {
+    extractInfoForDemuxStats;
+    demultiplexStats;
+    FASTQC;
+    illuminaFilter;
+    FASTQSCREEN;
+    DUPLICATED_READS;
+} from "$baseDir/modules/local/module_core.nf"
+
+include {
+    prepareReadSetCreation;
+    readsetNGLBiCreation as readsetCreation;
+} from "$baseDir/modules/local/module_NGL-Bi.nf"
+
+include { GUNZIP } from "${params.shared_modules}/gzip.nf"
+include { SEQTK_SAMPLE } from "${params.shared_modules}/seqtk.nf"
+//-------------------------------------------------
+
+inNGL = true
+forceNewReadset = false
+isResume = workflow.resume
+
+//-------------------------------------------------
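+
+// Note: params.shared_modules is not defined in this file; it is expected to
+// point at the shared GeT modules checkout and to be set in the configuration.
+// A minimal sketch of what a profile config could contain (hypothetical path):
+//   params.shared_modules = '/path/to/shared_modules'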
+
+workflow NGLBi_readsets {
+    /*
+     * NGL-Bi readset creation -> yes!
+     * NextCloud backup -> no
+     */
+    take:
+        sampleSheet
+        runNGLBiCreated
+
+    main:
+        //if inNGL && (!isResume || forceNewReadset) {
+            prepareReadSetCreation(sampleSheet, runNGLBiCreated)
+            readsetCreation(prepareReadSetCreation.out)
+            // NB: checkError() is not defined in this file and is assumed to be
+            // provided elsewhere
+            checkError(readsetCreation.out.readSetLog)
+        //}
+}
+
+
+workflow CORE {
+    take:
+        ch_sampleSheet
+        //ch_runNGLBiCreated
+        ch_DemuxStatXML
+        ch_DemuxSummary
+        ch_read
+
+    main:
+        //NGLBi_readsets(ch_sampleSheet, ch_runNGLBiCreated)    // Done in NGS_Illumina; consider moving it here later
+
+        // ----------- DemultiplexStat
+        extractInfoForDemuxStats(ch_sampleSheet)
+        demultiplexStats(ch_DemuxStatXML, extractInfoForDemuxStats.out, ch_DemuxSummary)
+
+        // ----------- Illumina Filter    // or SubsetSeqFiles: in which case should one be used rather than the other?
+        if (params.sequencer == 'NovaSeq' && params.isMultiplex) {
+            System.out.println "The data do not need to go through IlluminaFilter"
+            ch_read_good = ch_read
+        } else {    // MiSeq, or NovaSeq without index
+            illuminaFilter(ch_read)
+            ch_read_good = illuminaFilter.out.reads
+        }
+
+        // ----------- FASTQC
+        FASTQC(ch_read_good)
+
+        // ----------- ContaminationSearch
+        FASTQSCREEN(ch_read_good)
+
+        // ----------- Duplicate search
+        GUNZIP(ch_read_good)
+        SEQTK_SAMPLE(GUNZIP.out)
+        // fastp needs the reads paired again: key each file by everything before
+        // its _R1_/_R2_ tag (e.g. 'Sample1_S1_L001_R1_001' -> 'Sample1_S1_L001'),
+        // then groupTuple() reunites R1 and R2 of each sample.
+        DUPLICATED_READS(
+            SEQTK_SAMPLE.out
+                .collect{ it[1] }
+                .flatten()
+                .map { it -> [ (it.simpleName =~ /(.*)_R[1-2]_.*/)[0][1], it ] }
+                .groupTuple()
+        )
+
+    emit:
+        fastqc_report = FASTQC.out.report
+        fastqscreen_report = FASTQSCREEN.out.report
+        fastp_report = DUPLICATED_READS.out.json
+}
diff --git a/sub-workflows/local/diversity_qc.nf b/sub-workflows/local/diversity_qc.nf
new file mode 100644
index 0000000000000000000000000000000000000000..8bc288d38e7bf70d782415c7d31d6643816075bd
--- /dev/null
+++ b/sub-workflows/local/diversity_qc.nf
@@ -0,0 +1,22 @@
+
+/*
+    paired-end merging (FLASH)
+    if 16S analysis AND a reference bank is provided, then:
+        assignation on a subset of sequences
+*/
+
+// -------------------------------------------------
+// MODULES
+// -------------------------------------------------
+// TODO: fill in once module_diversity.nf provides processes (an empty include
+// clause is not valid, so it is commented out for now)
+//include { } from "$baseDir/modules/local/module_diversity.nf"
+
+
+// -------------------------------------------------
+// WORKFLOW
+// -------------------------------------------------
+workflow DIVERSITY_QC {
+    take:
+        fastq
+    main:
+
+}
\ No newline at end of file
diff --git a/sub-workflows/local/dna_qc.nf b/sub-workflows/local/dna_qc.nf
new file mode 100644
index 0000000000000000000000000000000000000000..794f7aa9e1c760842ba57577538e7c50bdea478a
--- /dev/null
+++ b/sub-workflows/local/dna_qc.nf
@@ -0,0 +1,48 @@
+// -------------------------------------------------
+// DNA QC
+// -------------------------------------------------
+/*
+ * QC of DNA data:
+ *     - alignment against a reference genome
+ *     - alignment report with Qualimap
+*/
+
+// -------------------------------------------------
+// MODULES
+// -------------------------------------------------
+include {   BWA_ALIGNMENT;
+            SAMTOOLS_VIEW;
+            SAMTOOLS_SORT;
+            SAMTOOLS_FLAGSTATS;
+            QUALIMAP;
+} from "$baseDir/modules/local/module_dna.nf"
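+
+// Data flow implemented below:
+//   fastq -> BWA_ALIGNMENT -> SAMTOOLS_VIEW -> SAMTOOLS_SORT -> QUALIMAP
+//                                          \-> SAMTOOLS_FLAGSTATS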
+
+// -------------------------------------------------
+// WORKFLOW
+// -------------------------------------------------
+workflow DNA_QC {
+    take:
+        fastq
+
+    main:
+        if ( "$params.referenceGenome" != '' ) {
+            BWA_ALIGNMENT(fastq)
+            SAMTOOLS_VIEW(BWA_ALIGNMENT.out.sam)
+            SAMTOOLS_SORT(SAMTOOLS_VIEW.out.bam)
+            SAMTOOLS_FLAGSTATS(SAMTOOLS_VIEW.out.bam)
+            QUALIMAP(SAMTOOLS_SORT.out.bam)
+
+            qualimap_report_emitted = QUALIMAP.out.report
+            flagstats_output_emitted = SAMTOOLS_FLAGSTATS.out.txt
+        } else {
+            // Qualimap and samtools were not executed: emit empty channels
+            qualimap_report_emitted = Channel.empty()
+            flagstats_output_emitted = Channel.empty()
+        }
+
+    emit:
+        qualimap_report = qualimap_report_emitted
+        flagstats_output = flagstats_output_emitted
+}
\ No newline at end of file
diff --git a/sub-workflows/local/rna_qc.nf b/sub-workflows/local/rna_qc.nf
new file mode 100644
index 0000000000000000000000000000000000000000..fe778d2a564d1344a4640e33e0ad547407811d10
--- /dev/null
+++ b/sub-workflows/local/rna_qc.nf
@@ -0,0 +1,6 @@
+/*
+    alignementSTAR
+    alignementStat
+    insertSizeDistribution
+
+*/
\ No newline at end of file
diff --git a/workflow/illumina_qc.nf b/workflow/illumina_qc.nf
new file mode 100644
index 0000000000000000000000000000000000000000..778ec1e469895851b8ddcb7bad12d344c868d141
--- /dev/null
+++ b/workflow/illumina_qc.nf
@@ -0,0 +1,119 @@
+#!/usr/bin/env nextflow
+
+nextflow.enable.dsl = 2
+
+def helpMessage() {
+    log.info"""
+
+    Usage:
+
+    The typical command for running the pipeline is as follows:
+
+        nextflow run wf-illumina-nf/main.nf -profile prod -ansi-log false
+
+    Mandatory arguments:
+        -profile                    Configuration profile to use. Can use multiple (comma separated)
+                                    Available: prod, test, conda, docker, singularity, debug.
+
+    Options:
+        --samplesheet               Default inputdir/samples.csv eg: SAMPLE_ID,SAMPLE_NAME,path/to/R1/fastq/file,path/to/R2/fastq/file (for paired-end only)
+        --contaminant               Name of iGenomes reference // to be discussed
+        --outdir                    The output directory where the results will be saved
+        --email                     Set this parameter to your e-mail address to get a summary e-mail with details of the run sent to you when the workflow exits
+        --email_on_fail             Same as --email, except only send mail if the workflow is not successful
+        --maxMultiqcEmailFileSize   Threshold size for MultiQC report to be attached in notification email. If file generated by pipeline exceeds the threshold, it will not be attached (Default: 25MB)
+        -name [str]                 Name for the pipeline run. If not specified, Nextflow will automatically generate a random mnemonic.
+
+
+    =======================================================
+    Available profiles
+    -profile test               Run the test dataset (conf/test.config)
+    -profile prod               Use the production settings (conf/prod.config)
+    -profile conda              Build a new conda environment before running the pipeline. Use `--condaCacheDir` to define the conda cache path
+    -profile docker             Use the Docker images for each process
+    -profile singularity        Use the singularity images for each process
+    -profile debug              Echo the hostname of the executing node before each process
+
+    """.stripIndent()
+}
+
+// Show help message
+if (params.help) {
+    helpMessage()
+    exit 0
+}
+
+// -------------------------------------------------
+// CHANNELS
+// -------------------------------------------------
+ch_ss = Channel.fromPath(params.samplesheet)
+ch_DemuxSummary = Channel.fromPath(params.inputdir+"/Stats/DemuxSummaryF1L*.txt")
+ch_DemuxStatXML = Channel.fromPath(params.inputdir+'/Stats/DemultiplexingStats.xml')
+
+// fastq one by one: one channel element per fastq file, as [ simpleName, file ]
+ch_read = Channel
+    .fromPath(params.data+'/*_R{1,2}_*.fastq.gz')
+    .map{ it -> [it.simpleName, it] }
+
+// fastq paired
+//ch_read_merged=Channel.fromFilePairs(params.data+'/*_R{1,2}_*.fastq.gz')
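+
+// For example, a file named 'Sample1_S1_L001_R1_001.fastq.gz' (hypothetical)
+// yields the tuple [ 'Sample1_S1_L001_R1_001', <path to the fastq> ] in ch_read.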
+
+mismatchNumber = params.sequencer == 'MiSeq' ? 0 : 1
+//banksForConta = params.addBankForConta ? params.genomesRefForConta << params.addBankForConta : params.genomesRefForConta
+
+createDir = file(params.outdir).mkdir()
+
+// -------------------------------------------------
+// INCLUDES
+// -------------------------------------------------
+include { CORE } from "$baseDir/sub-workflows/local/core_pipeline.nf"
+include { DNA_QC } from "$baseDir/sub-workflows/local/dna_qc.nf"
+//include { MULTIQC } from "$baseDir/modules/local/module_reports.nf"
+include { MULTIQC } from "${params.shared_modules}/multiqc.nf"
+include { workflow_summary as WORKFLOW_SUMMARY } from "${params.shared_modules}/workflow_summary.nf"
+
+// -------------------------------------------------
+// WORKFLOW
+// -------------------------------------------------
+workflow ILLUMINA_QC {
+    WORKFLOW_SUMMARY()
+
+    CORE(ch_ss, ch_DemuxStatXML, ch_DemuxSummary, ch_read)    /*ch_ngl, ch_runInfo, mismatchNumber, params.raw_data*/
+
+    // QC of non-DNA data is not supported yet; bind the DNA_QC reports to local
+    // channels, empty when DNA_QC was not executed, so MultiQC still runs.
+    if (params.dataNature == 'DNA') {
+        DNA_QC(ch_read)
+        ch_qualimap_report = DNA_QC.out.qualimap_report
+        ch_flagstats_output = DNA_QC.out.flagstats_output
+    } else {
+        System.out.println "QC of non-DNA data is not supported yet."
+        ch_qualimap_report = Channel.empty()
+        ch_flagstats_output = Channel.empty()
+    }
+
+    // MultiQC
+    MULTIQC(WORKFLOW_SUMMARY.out.ifEmpty([])
+        .mix(
+            CORE.out.fastqc_report.collect{it[1]}.ifEmpty([]),
+            CORE.out.fastqscreen_report.collect{it[1]}.ifEmpty([]),
+            CORE.out.fastp_report.collect{it[1]}.ifEmpty([]),
+            ch_qualimap_report.collect{it[1]}.ifEmpty([]),
+            ch_flagstats_output.collect{it[1]}.ifEmpty([])
+        ).collect()
+    )
+    /*
+    if overlap, then:
+        diversity_qc sub-workflow
+
+    else:
+        if DNA, then:
+            dna_qc sub-workflow
+        if RNA, then:
+            rna_qc sub-workflow
+        if Methyl, then:
+            methyl_qc sub-workflow
+    */
+
+}
\ No newline at end of file