#
# Copyright (c) 2017 10X Genomics, Inc. All rights reserved.
#

#
# Copyright (c) 2017 10X Genomics, Inc. All rights reserved.
#

#
# Copyright (c) 2015 10X Genomics, Inc. All rights reserved.
#
stage ASSEMBLER_PREFLIGHT(
    # Input-only Python stage (stages/preflight/denovo); it declares no outputs.
    # _ASSEMBLER runs it with the `preflight` call modifier, forwarding its own
    # pipeline inputs and fixing check_executables to true.
    in  string input_mode,          # bound to _ASSEMBLER's fastq_mode
    in  map[]  sample_def,
    in  string barcode_whitelist,
    in  map    downsample,
    in  float  loading_mass,
    in  int    genome_size,
    in  bool   check_executables,
    src py     "stages/preflight/denovo",
)


#
# Copyright (c) 2017 10X Genomics, Inc. All rights reserved.
#
#
# Copyright (c) 2015 10X Genomics, Inc. All rights reserved.
#
filetype fastb;     # FASTQ_TO_FASTBQUALP.out_reads -> ASSEMBLER_DF.reads
filetype qualp;     # FASTQ_TO_FASTBQUALP.out_quals -> ASSEMBLER_DF.quals
filetype bci;       # FASTQ_TO_FASTBQUALP.out_bci   -> ASSEMBLER_DF.bci
filetype fastq.gz;  # input reads of FASTQ_TO_FASTBQUALP
filetype json;
filetype script;    # NOTE(review): not referenced anywhere in the visible source
filetype txt;       # ASSEMBLER_PR.report
filetype csv;       # ASSEMBLER_PR.summary_cs
filetype bv;        # ASSEMBLER_DF.mspedges

# FASTQ_TO_FASTBQUALP: Python stage (stages/denovo/assembly_prep).
# Accepts a list of fastq.gz files and declares one fastb, one qualp and one
# bci output. The split declares a single chunk-level parameter, in_reads_file.
# In _ASSEMBLER_PREP its input is the reads output of _FASTQ_PREP_NEW.
stage FASTQ_TO_FASTBQUALP(
    in  fastq.gz[] in_reads,
    out fastb      out_reads,
    out qualp      out_quals,
    out bci        out_bci,
    src py         "stages/denovo/assembly_prep",
) split using (
    in  fastq.gz in_reads_file,
)

# DETECT_PLATFORMS: Python stage (stages/denovo/detect_platforms).
# Takes the sample definitions plus the FASTQ mode and declares one string
# output, sequencers. _ASSEMBLER_PREP returns that value and _ASSEMBLER
# forwards it to ASSEMBLER_DF.
stage DETECT_PLATFORMS(
    in  map[]  sample_def,
    in  string fastq_mode,
    out string sequencers,      # string-typed; format is defined by the stage code
    src py     "stages/denovo/detect_platforms",
)

# ASSEMBLER_DF: Python stage (stages/denovo/df); first stage of the assembler
# chain called by _ASSEMBLER. Its unnamed path output becomes parent_dir of
# ASSEMBLER_TR. Declared as a split stage with no chunk-level parameters.
stage ASSEMBLER_DF(
    # --- run / sample identification ---
    in  string pipeline_id,
    in  string sample_id,
    in  string sample_desc,
    in  string sequencers,          # DETECT_PLATFORMS.sequencers via _ASSEMBLER_PREP
    # --- read data produced by FASTQ_TO_FASTBQUALP ---
    in  fastb  reads,
    in  qualp  quals,
    in  bci    bci,
    # --- options forwarded unchanged from _ASSEMBLER's inputs ---
    in  map    downsample,
    in  map    addin,
    in  int    maxcores,
    in  string known_sample_id,
    in  bool   nodebugmem,
    in  bv     mspedges,            # _ASM_SN.asm_graph
    in  float  loading_mass,
    in  int    genome_size,
    in  string quit_stage,
    in  int    largemem_gb,
    out path,                       # unnamed output; referenced as ASSEMBLER_DF
    src py     "stages/denovo/df",
) split using ()

# ASSEMBLER_TR: Python stage (stages/denovo/tr) in the assembler chain.
# _ASSEMBLER binds parent_dir to the unnamed path output of ASSEMBLER_DF and
# passes this stage's unnamed path output on to ASSEMBLER_MC.
stage ASSEMBLER_TR(
    in  string sample_id,
    in  string sample_desc,
    in  path   parent_dir,          # output directory of the preceding chain stage
    in  string known_sample_id,
    in  bool   nodebugmem,
    in  map    addin,
    in  int    maxcores,
    in  string quit_stage,
    in  int    largemem_gb,
    out path,                       # unnamed output; referenced as ASSEMBLER_TR
    src py     "stages/denovo/tr",
) split using ()

# ASSEMBLER_MC: Python stage (stages/denovo/mc) in the assembler chain.
# _ASSEMBLER binds parent_dir to the unnamed path output of ASSEMBLER_TR and
# passes this stage's unnamed path output on to ASSEMBLER_CP.
stage ASSEMBLER_MC(
    in  string sample_id,
    in  string sample_desc,
    in  path   parent_dir,          # output directory of the preceding chain stage
    in  string known_sample_id,
    in  bool   nodebugmem,
    in  map    addin,
    in  int    maxcores,
    in  string quit_stage,
    in  int    largemem_gb,
    out path,                       # unnamed output; referenced as ASSEMBLER_MC
    src py     "stages/denovo/mc",
) split using ()

# ASSEMBLER_CP: Python stage (stages/denovo/cp) in the assembler chain.
# _ASSEMBLER binds parent_dir to the unnamed path output of ASSEMBLER_MC and
# passes this stage's unnamed path output on to ASSEMBLER_ML.
# Note: addin/maxcores are declared before nodebugmem here, unlike most of the
# sibling stages; calls bind by name, so the difference is cosmetic.
stage ASSEMBLER_CP(
    in  string sample_id,
    in  string sample_desc,
    in  path   parent_dir,          # output directory of the preceding chain stage
    in  string known_sample_id,
    in  map    addin,
    in  int    maxcores,
    in  bool   nodebugmem,
    in  string quit_stage,
    in  int    largemem_gb,
    out path,                       # unnamed output; referenced as ASSEMBLER_CP
    src py     "stages/denovo/cp",
) split using ()

# ASSEMBLER_ML: Python stage (stages/denovo/ml) in the assembler chain.
# _ASSEMBLER binds parent_dir to the unnamed path output of ASSEMBLER_CP and
# passes this stage's unnamed path output on to ASSEMBLER_CL.
stage ASSEMBLER_ML(
    in  string sample_id,
    in  string sample_desc,
    in  path   parent_dir,          # output directory of the preceding chain stage
    in  string known_sample_id,
    in  bool   nodebugmem,
    in  map    addin,
    in  int    maxcores,
    in  string quit_stage,
    in  int    largemem_gb,
    out path,                       # unnamed output; referenced as ASSEMBLER_ML
    src py     "stages/denovo/ml",
) split using ()

# ASSEMBLER_CL: Python stage (stages/denovo/cl) in the assembler chain.
# _ASSEMBLER binds parent_dir to the unnamed path output of ASSEMBLER_ML and
# passes this stage's unnamed path output on to ASSEMBLER_DM.
stage ASSEMBLER_CL(
    in  string sample_id,
    in  string sample_desc,
    in  path   parent_dir,          # output directory of the preceding chain stage
    in  string known_sample_id,
    in  bool   nodebugmem,
    in  map    addin,
    in  int    maxcores,
    in  string quit_stage,
    in  int    largemem_gb,
    out path,                       # unnamed output; referenced as ASSEMBLER_CL
    src py     "stages/denovo/cl",
) split using ()

# ASSEMBLER_DM: Python stage (stages/denovo/dm) in the assembler chain.
# _ASSEMBLER binds parent_dir to the unnamed path output of ASSEMBLER_CL and
# passes this stage's unnamed path output on to ASSEMBLER_ACP.
stage ASSEMBLER_DM(
    in  string sample_id,
    in  string sample_desc,
    in  path   parent_dir,          # output directory of the preceding chain stage
    in  string known_sample_id,
    in  bool   nodebugmem,
    in  map    addin,
    in  int    maxcores,
    in  string quit_stage,
    in  int    largemem_gb,
    out path,                       # unnamed output; referenced as ASSEMBLER_DM
    src py     "stages/denovo/dm",
) split using ()

# ASSEMBLER_ACP: Python stage (stages/denovo/acp) in the assembler chain.
# _ASSEMBLER binds parent_dir to the unnamed path output of ASSEMBLER_DM and
# passes this stage's unnamed path output on to ASSEMBLER_MP.
stage ASSEMBLER_ACP(
    in  string sample_id,
    in  string sample_desc,
    in  path   parent_dir,          # output directory of the preceding chain stage
    in  string known_sample_id,
    in  bool   nodebugmem,
    in  map    addin,
    in  int    maxcores,
    in  string quit_stage,
    in  int    largemem_gb,
    out path,                       # unnamed output; referenced as ASSEMBLER_ACP
    src py     "stages/denovo/acp",
) split using ()

# ASSEMBLER_MP: Python stage (stages/denovo/mp) in the assembler chain.
# _ASSEMBLER binds parent_dir to the unnamed path output of ASSEMBLER_ACP and
# passes this stage's unnamed path output on to ASSEMBLER_M2.
stage ASSEMBLER_MP(
    in  string sample_id,
    in  string sample_desc,
    in  path   parent_dir,          # output directory of the preceding chain stage
    in  string known_sample_id,
    in  bool   nodebugmem,
    in  map    addin,
    in  int    maxcores,
    in  string quit_stage,
    in  int    largemem_gb,
    out path,                       # unnamed output; referenced as ASSEMBLER_MP
    src py     "stages/denovo/mp",
) split using ()

# ASSEMBLER_M2: Python stage (stages/denovo/m2) in the assembler chain.
# _ASSEMBLER binds parent_dir to the unnamed path output of ASSEMBLER_MP and
# passes this stage's unnamed path output on to ASSEMBLER_PR.
stage ASSEMBLER_M2(
    in  string sample_id,
    in  string sample_desc,
    in  path   parent_dir,          # output directory of the preceding chain stage
    in  string known_sample_id,
    in  bool   nodebugmem,
    in  map    addin,
    in  int    maxcores,
    in  string quit_stage,
    in  int    largemem_gb,
    out path,                       # unnamed output; referenced as ASSEMBLER_M2
    src py     "stages/denovo/m2",
) split using ()

# ASSEMBLER_PR: Python stage (stages/denovo/pr); last stage of the assembler
# chain. _ASSEMBLER binds parent_dir to the unnamed path output of
# ASSEMBLER_M2 and returns all three outputs of this stage: the unnamed path
# as `assembly`, summary_cs as `summary`, and report as `report`.
# Note: addin/maxcores are declared before nodebugmem here, unlike most of the
# sibling stages; calls bind by name, so the difference is cosmetic.
stage ASSEMBLER_PR(
    in  string sample_id,
    in  string sample_desc,
    in  path   parent_dir,          # output directory of the preceding chain stage
    in  string known_sample_id,
    in  map    addin,
    in  int    maxcores,
    in  bool   nodebugmem,
    in  string quit_stage,
    in  int    largemem_gb,
    out path,                       # unnamed output; referenced as ASSEMBLER_PR
    out csv    summary_cs,
    out txt    report,
    src py     "stages/denovo/pr",
) split using ()

#
# Copyright (c) 2017 10X Genomics, Inc. All rights reserved.
#
#
# Copyright (c) 2017 10X Genomics, Inc. All rights reserved.
#

# File types used by the BUCKET_FASTQS / SORT_FASTQS stage declarations.
filetype txt;       # type of barcode_whitelist_path in both stages
filetype fastq;     # NOTE(review): not referenced by either stage
filetype fastq.gz;  # SORT_FASTQS.reads
filetype rsd;       # BUCKET_FASTQS bucket/count outputs; SORT_FASTQS.read_buckets


# BUCKET_FASTQS: stage executed via "tada martian bucket-bcs".
# In _FASTQ_PREP_NEW it receives the chunks, whitelist path and requested read
# pair count from SETUP_CHUNKS; read_buckets and final_subsample_rate are then
# consumed by SORT_FASTQS. The split declares one chunk-level int, `which`.
stage BUCKET_FASTQS(
    in  txt   barcode_whitelist_path,
    in  int   trim_length,
    in  map[] chunks,
    in  float max_expected_barcode_errors,   # _FASTQ_PREP_NEW passes 1.0
    in  float bc_confidence_threshold,       # _FASTQ_PREP_NEW passes 0.975
    in  int   requested_read_pairs,
    out int   total_reads,
    out float final_subsample_rate,
    out rsd[] no_bc_read_buckets,
    out rsd[] read_buckets,
    out rsd   bc_counts,
    src exec  "tada martian bucket-bcs",
) split using (
    in  int   which,
)

# SORT_FASTQS: stage executed via "tada martian sort-bcs".
# In _FASTQ_PREP_NEW it takes BUCKET_FASTQS.read_buckets and
# BUCKET_FASTQS.final_subsample_rate and declares the fastq.gz list that the
# pipeline returns as `reads`. Chunk-level parameters: chunk_id, total_chunks.
stage SORT_FASTQS(
    in  txt        barcode_whitelist_path,
    in  rsd[]      read_buckets,
    in  float      subsample_rate,
    out fastq.gz[] reads,
    src exec       "tada martian sort-bcs",
) split using (
    in  int        chunk_id,
    in  int        total_chunks,
)

#
# Copyright (c) 2015 10X Genomics, Inc. All rights reserved.
#
filetype fastq;     # NOTE(review): not referenced by SETUP_CHUNKS
filetype bam;       # NOTE(review): not referenced anywhere in the visible source
filetype bam.bai;   # NOTE(review): not referenced anywhere in the visible source
filetype bed;       # NOTE(review): not referenced anywhere in the visible source
filetype json;      # SETUP_CHUNKS.downsample_info
filetype fastq.gz;  # NOTE(review): not referenced by SETUP_CHUNKS
filetype txt;       # SETUP_CHUNKS.barcode_whitelist_path

# SETUP_CHUNKS: Python stage (stages/reads/setup_chunks).
# First call in _FASTQ_PREP_NEW (invoked there as `local volatile`). Its
# chunks, barcode_whitelist_path and requested_read_pairs outputs feed
# BUCKET_FASTQS; read_groups and downsample_info are returned by the pipeline.
stage SETUP_CHUNKS(
    in  string   sample_id,
    in  map[]    sample_def   "list of dictionary specifying input data",
    in  string   input_mode   "configuration of the input fastqs",
    in  string   barcode_whitelist,
    in  map      downsample,
    out map[]    chunks       "map has barcode, barcode_reverse_complement, sample_index, read1, read2, gem_group, and read group fields",
    out string[] read_groups  "list of strings representing read groups",
    out json     downsample_info,
    out txt      barcode_whitelist_path,
    out int      requested_read_pairs,
    src py       "stages/reads/setup_chunks",
)


# _FASTQ_PREP_NEW: SETUP_CHUNKS -> BUCKET_FASTQS -> SORT_FASTQS.
# Returns the fastq.gz files from SORT_FASTQS together with the read groups,
# downsampling info and barcode whitelist path reported by SETUP_CHUNKS.
pipeline _FASTQ_PREP_NEW(
    in  string     fastq_mode  "configuration of the input fastqs",
    in  map[]      sample_def,
    in  int        trim_length,
    in  string     sample_id,
    in  map        downsample,
    in  string     barcode_whitelist,
    out fastq.gz[] reads,
    out string[]   read_groups,
    out json       lot_info,
    out json       downsample_info,
    out txt        barcode_whitelist_path,
)
{
    # Bindings below are listed in each stage's declaration order.
    call local volatile SETUP_CHUNKS(
        sample_id         = self.sample_id,
        sample_def        = self.sample_def,
        input_mode        = self.fastq_mode,
        barcode_whitelist = self.barcode_whitelist,
        downsample        = self.downsample,
    )

    # The two barcode thresholds are fixed constants at this level.
    call volatile BUCKET_FASTQS(
        barcode_whitelist_path      = SETUP_CHUNKS.barcode_whitelist_path,
        trim_length                 = self.trim_length,
        chunks                      = SETUP_CHUNKS.chunks,
        max_expected_barcode_errors = 1.0,
        bc_confidence_threshold     = 0.975,
        requested_read_pairs        = SETUP_CHUNKS.requested_read_pairs,
    )

    call volatile SORT_FASTQS(
        barcode_whitelist_path = SETUP_CHUNKS.barcode_whitelist_path,
        read_buckets           = BUCKET_FASTQS.read_buckets,
        subsample_rate         = BUCKET_FASTQS.final_subsample_rate,
    )

    return (
        reads                  = SORT_FASTQS.reads,
        read_groups            = SETUP_CHUNKS.read_groups,
        lot_info               = null,  # FIXME: nothing in this pipeline produces lot info yet
        downsample_info        = SETUP_CHUNKS.downsample_info,
        barcode_whitelist_path = SETUP_CHUNKS.barcode_whitelist_path,
    )
}

filetype fastb;  # binary sequence file
filetype qualp;  # packed quality scores
filetype bci;    # barcode index

# _ASSEMBLER_PREP: prepares assembler inputs from the raw sample definition.
#   _FASTQ_PREP_NEW     -> fastq.gz reads + barcode whitelist path
#   DETECT_PLATFORMS    -> sequencers string
#   FASTQ_TO_FASTBQUALP -> fastb / qualp / bci built from the fastq.gz reads
# The fastq.gz reads are also returned unchanged as `fqreads`.
pipeline _ASSEMBLER_PREP(
    in  string     sample_id,
    in  string     fastq_mode         "configuration of the input fastqs",
    in  map[]      sample_def,
    in  string     barcode_whitelist  "name of barcode whitelist file",
    in  int        trim_length,
    in  map        downsample,
    out fastb      reads,
    out qualp      quals,
    out bci        bci,
    out fastq.gz[] fqreads,
    out txt        barcode_whitelist_path,
    out string     sequencers,
)
{
    call _FASTQ_PREP_NEW(
        fastq_mode        = self.fastq_mode,
        sample_def        = self.sample_def,
        trim_length       = self.trim_length,
        sample_id         = self.sample_id,
        downsample        = self.downsample,
        barcode_whitelist = self.barcode_whitelist,
    )

    call volatile DETECT_PLATFORMS(
        sample_def = self.sample_def,
        fastq_mode = self.fastq_mode,
    )

    call volatile FASTQ_TO_FASTBQUALP(
        in_reads = _FASTQ_PREP_NEW.reads,
    )

    return (
        reads                  = FASTQ_TO_FASTBQUALP.out_reads,
        quals                  = FASTQ_TO_FASTBQUALP.out_quals,
        bci                    = FASTQ_TO_FASTBQUALP.out_bci,
        fqreads                = _FASTQ_PREP_NEW.reads,
        barcode_whitelist_path = _FASTQ_PREP_NEW.barcode_whitelist_path,
        sequencers             = DETECT_PLATFORMS.sequencers,
    )
}

#
# Copyright (c) 2017 10X Genomics, Inc. All rights reserved.
#

#
# Copyright (c) 2017 10X Genomics, Inc. All rights reserved.
#

# File types declared for the MSP / SHARD_ASM / MAIN_ASM_SN stages.
filetype fofn;       # NOTE(review): not referenced anywhere in the visible source
filetype txt;        # MSP.barcode_whitelist
filetype bv;         # MAIN_ASM_SN.asm_graph

filetype fastq;      # NOTE(review): not referenced by these stages
filetype fastq.gz;   # MSP.fastqs and MSP's per-chunk `chunk` parameter

filetype perm;       # MSP's per-chunk `permutation` parameter
filetype msp;        # MSP.chunks -> SHARD_ASM.chunks
filetype sedge_asm;  # SHARD_ASM.sedge_asm -> MAIN_ASM_SN.sedge_asm
filetype sedge_bcs;  # SHARD_ASM.sedge_bcs -> MAIN_ASM_SN.sedge_bcs

filetype graph;      # NOTE(review): not referenced anywhere in the visible source
filetype node_bcs;   # NOTE(review): not referenced anywhere in the visible source

# MSP: stage executed via "tada martian msp".
# Takes the fastq.gz list and barcode whitelist and declares a list of msp
# files, which _ASM_SN hands to SHARD_ASM. Chunk-level parameters are a perm
# file (permutation) and a single fastq.gz (chunk).
stage MSP(
    in  int        trim_min_qual,       # _ASM_SN passes the constant 7
    in  fastq.gz[] fastqs,
    in  txt        barcode_whitelist,
    out msp[]      chunks,
    src exec       "tada martian msp",
) split using (
    in  perm       permutation,
    in  fastq.gz   chunk,
)

# SHARD_ASM: stage executed via "tada martian shard-asm".
# Consumes the msp files from MSP and declares two parallel lists, sedge_asm
# and sedge_bcs, both consumed by MAIN_ASM_SN. Chunk-level parameters:
# chunk_id and total_chunks.
stage SHARD_ASM(
    in  int         min_kmer_obs,
    in  msp[]       chunks,
    out sedge_asm[] sedge_asm,
    out sedge_bcs[] sedge_bcs,
    src exec        "tada martian shard-asm",
) split using (
    in  int         chunk_id,
    in  int         total_chunks,
)

# MAIN_ASM_SN: stage executed via "tada martian main-asm-sn".
# Takes both SHARD_ASM output lists and declares a single bv file, asm_graph,
# which _ASM_SN returns. Split stage with no chunk-level parameters.
stage MAIN_ASM_SN(
    in  sedge_asm[] sedge_asm,
    in  sedge_bcs[] sedge_bcs,
    out bv          asm_graph,
    src exec        "tada martian main-asm-sn",
) split using ()


# _ASM_SN: MSP -> SHARD_ASM -> MAIN_ASM_SN.
# Produces the bv graph that _ASSEMBLER passes to ASSEMBLER_DF as mspedges.
pipeline _ASM_SN(
    in  int        min_kmer_obs,
    in  fastq.gz[] fastqs,
    in  txt        barcode_whitelist,
    out bv         asm_graph,
)
{
    # trim_min_qual is a fixed constant here, not a pipeline input.
    call volatile MSP(
        trim_min_qual     = 7,
        fastqs            = self.fastqs,
        barcode_whitelist = self.barcode_whitelist,
    )

    call volatile SHARD_ASM(
        min_kmer_obs = self.min_kmer_obs,
        chunks       = MSP.chunks,
    )

    call volatile MAIN_ASM_SN(
        sedge_asm = SHARD_ASM.sedge_asm,
        sedge_bcs = SHARD_ASM.sedge_bcs,
    )

    return (
        asm_graph = MAIN_ASM_SN.asm_graph,
    )
}


# _ASSEMBLER: wires together the denovo assembly stages.
#
# Flow:
#   ASSEMBLER_PREFLIGHT (preflight)
#   _ASSEMBLER_PREP -> _ASM_SN
#   ASSEMBLER_DF -> TR -> MC -> CP -> ML -> CL -> DM -> ACP -> MP -> M2 -> PR
#
# From ASSEMBLER_TR onward, each chain stage receives the unnamed path output
# of the stage before it as parent_dir, along with the same set of
# pass-through options taken from this pipeline's inputs.
pipeline _ASSEMBLER(
    in  string pipeline_id,
    in  string sample_id,
    in  string fastq_mode         "configuration of the input fastqs",
    in  string sample_desc,
    in  map[]  sample_def,
    in  string barcode_whitelist  "name of barcode whitelist file",
    in  int    trim_length,
    in  string known_sample_id,
    in  map    downsample,
    in  bool   nodebugmem,
    in  map    addin,
    in  int    maxcores,
    in  float  loading_mass,
    in  int    genome_size,
    in  string quit_stage,
    in  int    largemem_gb,
    out path   assembly           "Raw assembly files",
    out csv    summary            "Run summary",
    out txt    report             "Run report",
)
{
    # ---- input validation ----
    call preflight ASSEMBLER_PREFLIGHT(
        input_mode        = self.fastq_mode,
        sample_def        = self.sample_def,
        barcode_whitelist = self.barcode_whitelist,
        downsample        = self.downsample,
        loading_mass      = self.loading_mass,
        genome_size       = self.genome_size,
        check_executables = true,
    )

    # ---- read preparation ----
    call _ASSEMBLER_PREP(
        sample_id         = self.sample_id,
        fastq_mode        = self.fastq_mode,
        sample_def        = self.sample_def,
        barcode_whitelist = self.barcode_whitelist,
        trim_length       = self.trim_length,
        downsample        = self.downsample,
    )

    # ---- graph construction from the prepared fastq.gz reads ----
    # min_kmer_obs is a fixed constant here, not a pipeline input.
    call _ASM_SN(
        min_kmer_obs      = 3,
        fastqs            = _ASSEMBLER_PREP.fqreads,
        barcode_whitelist = _ASSEMBLER_PREP.barcode_whitelist_path,
    )

    # ---- assembler chain ----
    call volatile ASSEMBLER_DF(
        pipeline_id     = self.pipeline_id,
        sample_id       = self.sample_id,
        sample_desc     = self.sample_desc,
        sequencers      = _ASSEMBLER_PREP.sequencers,
        reads           = _ASSEMBLER_PREP.reads,
        quals           = _ASSEMBLER_PREP.quals,
        bci             = _ASSEMBLER_PREP.bci,
        downsample      = self.downsample,
        addin           = self.addin,
        maxcores        = self.maxcores,
        known_sample_id = self.known_sample_id,
        nodebugmem      = self.nodebugmem,
        mspedges        = _ASM_SN.asm_graph,
        loading_mass    = self.loading_mass,
        genome_size     = self.genome_size,
        quit_stage      = self.quit_stage,
        largemem_gb     = self.largemem_gb,
    )

    call volatile ASSEMBLER_TR(
        sample_id       = self.sample_id,
        sample_desc     = self.sample_desc,
        parent_dir      = ASSEMBLER_DF,
        known_sample_id = self.known_sample_id,
        nodebugmem      = self.nodebugmem,
        addin           = self.addin,
        maxcores        = self.maxcores,
        quit_stage      = self.quit_stage,
        largemem_gb     = self.largemem_gb,
    )

    call volatile ASSEMBLER_MC(
        sample_id       = self.sample_id,
        sample_desc     = self.sample_desc,
        parent_dir      = ASSEMBLER_TR,
        known_sample_id = self.known_sample_id,
        nodebugmem      = self.nodebugmem,
        addin           = self.addin,
        maxcores        = self.maxcores,
        quit_stage      = self.quit_stage,
        largemem_gb     = self.largemem_gb,
    )

    call volatile ASSEMBLER_CP(
        sample_id       = self.sample_id,
        sample_desc     = self.sample_desc,
        parent_dir      = ASSEMBLER_MC,
        known_sample_id = self.known_sample_id,
        nodebugmem      = self.nodebugmem,
        addin           = self.addin,
        maxcores        = self.maxcores,
        quit_stage      = self.quit_stage,
        largemem_gb     = self.largemem_gb,
    )

    call volatile ASSEMBLER_ML(
        sample_id       = self.sample_id,
        sample_desc     = self.sample_desc,
        parent_dir      = ASSEMBLER_CP,
        known_sample_id = self.known_sample_id,
        nodebugmem      = self.nodebugmem,
        addin           = self.addin,
        maxcores        = self.maxcores,
        quit_stage      = self.quit_stage,
        largemem_gb     = self.largemem_gb,
    )

    call volatile ASSEMBLER_CL(
        sample_id       = self.sample_id,
        sample_desc     = self.sample_desc,
        parent_dir      = ASSEMBLER_ML,
        known_sample_id = self.known_sample_id,
        nodebugmem      = self.nodebugmem,
        addin           = self.addin,
        maxcores        = self.maxcores,
        quit_stage      = self.quit_stage,
        largemem_gb     = self.largemem_gb,
    )

    call volatile ASSEMBLER_DM(
        sample_id       = self.sample_id,
        sample_desc     = self.sample_desc,
        parent_dir      = ASSEMBLER_CL,
        known_sample_id = self.known_sample_id,
        nodebugmem      = self.nodebugmem,
        addin           = self.addin,
        maxcores        = self.maxcores,
        quit_stage      = self.quit_stage,
        largemem_gb     = self.largemem_gb,
    )

    call volatile ASSEMBLER_ACP(
        sample_id       = self.sample_id,
        sample_desc     = self.sample_desc,
        parent_dir      = ASSEMBLER_DM,
        known_sample_id = self.known_sample_id,
        nodebugmem      = self.nodebugmem,
        addin           = self.addin,
        maxcores        = self.maxcores,
        quit_stage      = self.quit_stage,
        largemem_gb     = self.largemem_gb,
    )

    call volatile ASSEMBLER_MP(
        sample_id       = self.sample_id,
        sample_desc     = self.sample_desc,
        parent_dir      = ASSEMBLER_ACP,
        known_sample_id = self.known_sample_id,
        nodebugmem      = self.nodebugmem,
        addin           = self.addin,
        maxcores        = self.maxcores,
        quit_stage      = self.quit_stage,
        largemem_gb     = self.largemem_gb,
    )

    call volatile ASSEMBLER_M2(
        sample_id       = self.sample_id,
        sample_desc     = self.sample_desc,
        parent_dir      = ASSEMBLER_MP,
        known_sample_id = self.known_sample_id,
        nodebugmem      = self.nodebugmem,
        addin           = self.addin,
        maxcores        = self.maxcores,
        quit_stage      = self.quit_stage,
        largemem_gb     = self.largemem_gb,
    )

    # Final chain stage. It is the only chain call without `volatile`;
    # all three of its outputs are returned by this pipeline.
    call ASSEMBLER_PR(
        sample_id       = self.sample_id,
        sample_desc     = self.sample_desc,
        parent_dir      = ASSEMBLER_M2,
        known_sample_id = self.known_sample_id,
        nodebugmem      = self.nodebugmem,
        addin           = self.addin,
        maxcores        = self.maxcores,
        quit_stage      = self.quit_stage,
        largemem_gb     = self.largemem_gb,
    )

    return (
        assembly = ASSEMBLER_PR,
        summary  = ASSEMBLER_PR.summary_cs,
        report   = ASSEMBLER_PR.report,
    )
}


# ASSEMBLER_CS: outer wrapper around _ASSEMBLER.
# Exposes eight of _ASSEMBLER's sixteen inputs and pins the remaining eight
# to constants or null. All three _ASSEMBLER outputs are returned unchanged.
pipeline ASSEMBLER_CS(
    in  string sample_id,
    in  string fastq_mode    "configuration of the input fastqs",
    in  map[]  sample_def,
    in  string sample_desc,
    in  map    downsample,
    in  bool   nodebugmem,
    in  float  loading_mass,
    in  int    genome_size,
    out csv    summary       "Run summary",
    out txt    report        "Run report",
    out path   assembly      "Raw assembly files",
)
{
    call _ASSEMBLER(
        # forwarded from this pipeline's inputs
        sample_id         = self.sample_id,
        fastq_mode        = self.fastq_mode,
        sample_desc       = self.sample_desc,
        sample_def        = self.sample_def,
        downsample        = self.downsample,
        nodebugmem        = self.nodebugmem,
        loading_mass      = self.loading_mass,
        genome_size       = self.genome_size,
        # fixed at this level
        barcode_whitelist = "4M-with-alts-february-2016",
        trim_length       = 7,
        maxcores          = 64,
        # left unset at this level
        pipeline_id       = null,
        known_sample_id   = null,
        addin             = null,
        quit_stage        = null,
        largemem_gb       = null,
    )

    return (
        summary  = _ASSEMBLER.summary,
        report   = _ASSEMBLER.report,
        assembly = _ASSEMBLER.assembly,
    )
}


# Top-level invocation of ASSEMBLER_CS for sample "Geoduck".
# Bindings are listed in the pipeline's declared input order; loading_mass
# and genome_size are left null.
call ASSEMBLER_CS(
    sample_id    = "Geoduck",
    fastq_mode   = "ILMN_BCL2FASTQ",
    sample_def   = [
        {
            "gem_group": null,
            "lanes": null,
            "read_path": "/gscratch/scrubbed/sr320/Chrom",
            "sample_indices": ["any"],
            "sample_names": [
                "Geoduck-1",
                "Geoduck-2",
                "Geoduck-5",
                "Geoduck-6"
            ],
            "library": null,
            "bc_in_read": 1,
            "bc_length": 16
        }
    ],
    sample_desc  = "",
    downsample   = {"target_reads": 1200000000},
    nodebugmem   = false,
    loading_mass = null,
    genome_size  = null,
)