---
title: "1_data_processing"
output: html_document
date: "2023-04-03"
---

```{r setup, include=FALSE}
# Set the default chunk option echo = TRUE for every chunk in this document,
# so chunk source code is shown alongside its output when knitting.
knitr::opts_chunk$set(echo = TRUE)
```

## Download fastq.gz files from GENEWIZ
#```{bash}
## I ran this from the terminal
#sftp mngeorge_uw@sftp.genewiz.com # (password removed: keep credentials out of this file and rotate the one that was committed here)

#mkdir data
#mkdir data/raw
#lcd data/raw

#cd 30-835022638/00_fastq/
#mget *

#```

```{bash}
# Unzip the .fastq.gz files in data/raw/ in place.
# Abort if the directory is missing so gunzip never runs against whatever
# happens to be in the current working directory.
cd data/raw/ || exit 1
# "--" stops option parsing in case a file name starts with "-".
gunzip -- *.fastq.gz

```

```{bash}
# Zip the .fastq files in data/raw/ in place.
# Abort if the directory is missing so gzip never compresses files in the
# wrong directory.
cd data/raw/ || exit 1
# "--" stops option parsing in case a file name starts with "-".
gzip -- *.fastq

```

```{bash}
# Transfer raw data to owl.
#
# rsync does not speak HTTP(S): a destination such as "https://host/path" is
# parsed as remote host "https" with path "//host/path" and the transfer fails.
# Remote destinations must use the [user@]host:path form (carried over ssh).
# NOTE(review): no remote user is given, so ssh falls back to the local user
# name or ~/.ssh/config - prefix "user@" if the account on owl differs.
cd data/raw/ || exit 1

# -a archive mode, -v verbose, -z compress during transfer.
rsync -avz --progress ./* owl.fish.washington.edu:/volume1/web/nightingales/C_gigas/
```

# Run fastqc on untrimmed files
```{bash}
# Create the report directory. -p creates fastqc/ as well and does not
# raise an error when the directories already exist (e.g. on a re-knit).
mkdir -p fastqc/untrimmed/

# Run FastQC on every raw FASTQ file and write the reports to
# fastqc/untrimmed/.
/home/shared/FastQC/fastqc \
  data/raw/*.fastq \
  --outdir fastqc/untrimmed/ \
  --quiet

```

# Run multiqc on untrimmed files
```{bash}
# Load conda's shell integration into this non-interactive shell so that
# `conda activate` works, then activate the default environment
# (presumably the one providing multiqc - confirm).
eval "$(/opt/anaconda/anaconda3/bin/conda shell.bash hook)"
conda activate

# Abort if the FastQC output directory is missing; otherwise multiqc would
# scan the current working directory instead.
cd fastqc/untrimmed/ || exit 1

# Aggregate the FastQC reports found in this directory.
multiqc .
```


```{bash}
# trim adapter sequences
#
# Runs cutadapt on every FASTQ file in data/raw/ and writes the result,
# with "_trim" inserted before the extension, to the trimmed directory.

# Absolute locations used by the loop below.
cutadapt_bin="/home/shared/8TB_HDD_02/mattgeorgephd/.local/bin/cutadapt"
trimmed_dir="/home/shared/8TB_HDD_02/mattgeorgephd/USDA-NRSP-8-gigas-rDNA/data/trimmed"

# Create the directory cutadapt actually writes to; -p makes re-runs safe.
# NOTE(review): the original created the relative path data/trimmed/ instead;
# this assumes the document lives in the USDA-NRSP-8-gigas-rDNA project root.
mkdir -p "$trimmed_dir"
cd data/raw/ || exit 1

# Without nullglob an unmatched "*.fastq" would be handed to cutadapt as a
# literal file name.
shopt -s nullglob

for F in *.fastq; do
  # Add the suffix "_trim" before the extension to create the output name,
  # e.g. sample_R1_001.fastq -> sample_R1_001_trim.fastq
  results_file="${F%.fastq}_trim.fastq"

  # Run cutadapt on each file, hard trim first 10 bp (-u 10).
  "$cutadapt_bin" "$F" -u 10 -o "$trimmed_dir/$results_file"
done

```

# concatenate fastq files by lane
```{bash}

# Merge the trimmed reads of each sequencing run into one FASTQ file:
# all R1 files of the run first, followed by all of its R2 files.

# Set the input and output directories
input_dir="/home/shared/8TB_HDD_02/mattgeorgephd/USDA-NRSP-8-gigas-rDNA/data/trimmed"
output_dir="/home/shared/8TB_HDD_02/mattgeorgephd/USDA-NRSP-8-gigas-rDNA/data/trim-merge/"

# Create the directory the merged files are written to; -p makes re-runs safe.
mkdir -p "$output_dir"

# Unmatched globs expand to nothing, so missing files can be detected below.
shopt -s nullglob

# Runs that have already been merged. One run may own several R1 files, and
# merging per R1 file would overwrite the run's output on every iteration.
declare -A merged_runs=()

# Loop through all of the R1 fastq files in the input directory
for r1_file in "$input_dir"/*_R1_*.fastq; do
  # Extract the sequencing run (first two "_"-separated fields) from the
  # R1 file name
  run=$(basename "$r1_file" | cut -d'_' -f1,2)

  if [[ -n "${merged_runs[$run]:-}" ]]; then
    continue
  fi
  merged_runs[$run]=1

  # Collect every R1 and R2 file of this run. The wildcard has to stay
  # outside the quotes: a quoted "*" is taken literally and never expands.
  r1_files=( "$input_dir/${run}_"*R1_*.fastq )
  r2_files=( "$input_dir/${run}_"*R2_*.fastq )

  # Fall back to the file that started this iteration if the run-based
  # pattern does not match it.
  if (( ${#r1_files[@]} == 0 )); then
    r1_files=( "$r1_file" )
  fi

  if (( ${#r2_files[@]} == 0 )); then
    printf 'warning: no R2 file found for run %s; skipping\n' "$run" >&2
    continue
  fi

  # Concatenate the R1 and R2 files and save the output to a new file in
  # the output directory
  cat "${r1_files[@]}" "${r2_files[@]}" > "${output_dir%/}/${run}_trim-merge.fastq"
done

```


# concatenate fastq files by lane
```{bash}

# Merge the R1 and R2 files that share a sample prefix (the first two
# "_"-separated fields of the file name) into <prefix>_trim.fastq.
cd data/trimmed/ || exit 1

# Unmatched globs expand to nothing instead of a literal pattern.
shopt -s nullglob

# List the FASTQ files, reduce each name to its prefix and drop repeated
# prefixes. Glob output is sorted, so equal prefixes are adjacent and
# `uniq` is sufficient.
printf '%s\n' *.fastq | sed 's/^\([^_]*_[^_]*\).*/\1/' | uniq |
while IFS= read -r prefix; do
  # printf emits one empty line when no FASTQ file exists.
  [[ -n "$prefix" ]] || continue

  r1_files=( "$prefix"*R1*.fastq )
  r2_files=( "$prefix"*R2*.fastq )
  (( ${#r1_files[@]} + ${#r2_files[@]} > 0 )) || continue

  # One cat with one redirect: R1 reads first, then R2 reads. Writing R2
  # with a second ">" truncated the file and discarded the R1 reads.
  cat "${r1_files[@]}" "${r2_files[@]}" > "${prefix}_trim.fastq"
done

# I moved files to trim-merge-fastq
```


# Run fastqc on trimmed & merged files
```{bash}
# Create the report directory. -p creates fastqc/ only when it is missing,
# so this no longer errors when fastqc/ already exists from an earlier run.
mkdir -p fastqc/trim-merge/

# Run FastQC on every trimmed and merged FASTQ file and write the reports
# to fastqc/trim-merge/.
/home/shared/FastQC/fastqc \
  data/trim-merge/*.fastq \
  --outdir fastqc/trim-merge/ \
  --quiet

```

# Run multiqc on trimmed & merged files
```{bash}
# Load conda's shell integration into this non-interactive shell so that
# `conda activate` works, then activate the default environment
# (presumably the one providing multiqc - confirm).
eval "$(/opt/anaconda/anaconda3/bin/conda shell.bash hook)"
conda activate

# Abort if the FastQC output directory is missing; otherwise multiqc would
# scan the current working directory instead.
cd fastqc/trim-merge/ || exit 1

# Aggregate the FastQC reports found in this directory.
multiqc .
```

```{r}


```