---
title: "1_data_processing"
output: html_document
date: "2023-04-03"
---
```{r setup, include=FALSE}
# Set the default chunk option echo = TRUE for every chunk in this document
knitr::opts_chunk$set(echo = TRUE)
```
## Download fastq.gz files from GENEWIZ
#```{bash}
## I ran this from the terminal
#sftp mngeorge_uw@sftp.genewiz.com # (password redacted: keep credentials out of version control and rotate the one previously committed here)
#mkdir data
#mkdir data/raw
#lcd data/raw
#cd 30-835022638/00_fastq/
#mget *
#```
```{bash}
# Decompress every .fastq.gz file in data/raw/.
# Abort if the directory is missing so gunzip never runs against the
# wrong working directory.
cd data/raw/ || exit 1
gunzip *.fastq.gz
```
```{bash}
# Compress every .fastq file in data/raw/.
# Abort if the directory is missing so gzip never runs against the
# wrong working directory.
cd data/raw/ || exit 1
gzip *.fastq
```
```{bash}
# Transfer everything in data/raw/ to owl.
# rsync cannot write to an https:// URL: a remote destination has to be
# written as [user@]host:path (remote shell) or rsync://host/module (daemon).
# NOTE(review): assumes SSH access to owl under the local username; prefix
# the host with "user@" if the remote account differs.
cd data/raw/ || exit 1
rsync -avz --progress ./* owl.fish.washington.edu:/volume1/web/nightingales/C_gigas/
```
# Run fastqc on untrimmed files
```{bash}
# Create the report directory and its parent only when missing, so this
# chunk can be re-run without mkdir errors.
mkdir -p fastqc/untrimmed/
# Run FastQC on every raw .fastq file; reports go to fastqc/untrimmed/
/home/shared/FastQC/fastqc \
data/raw/*.fastq \
--outdir fastqc/untrimmed/ \
--quiet
```
# Run multiqc on untrimmed files
```{bash}
# Load conda's shell integration so "conda activate" works in this
# non-interactive shell
eval "$(/opt/anaconda/anaconda3/bin/conda shell.bash hook)"
# No environment name is given, so conda's default environment is activated
conda activate
# Abort if the FastQC output directory is missing; otherwise multiqc would
# scan whatever directory the chunk happened to start in.
cd fastqc/untrimmed/ || exit 1
# Summarise the reports found in the current directory
multiqc .
```
```{bash}
# Hard-trim the first 10 bases of every read in data/raw/*.fastq with
# cutadapt and write the results to data/trimmed/.
# Note: no adapter sequence is passed to cutadapt, so only the fixed-length
# trim (-u 10) is applied.
trimmed_dir=/home/shared/8TB_HDD_02/mattgeorgephd/USDA-NRSP-8-gigas-rDNA/data/trimmed
# -p: do not fail when the directory already exists (safe to re-run)
mkdir -p data/trimmed/
# Abort rather than trimming files from an unexpected directory
cd data/raw/ || exit 1
for F in *.fastq
do
# when nothing matches, the loop sees the literal pattern: skip it
[ -e "$F" ] || continue
# insert the suffix _trim before the extension:
# sample.fastq -> sample_trim.fastq
results_file="${F%.fastq}_trim.fastq"
# run cutadapt on each file, hard trim first 10 bp
# (file names are quoted so names with spaces or glob characters survive)
/home/shared/8TB_HDD_02/mattgeorgephd/.local/bin/cutadapt "$F" -u 10 -o \
"$trimmed_dir/$results_file"
done
```
# concatenate fastq files by lane
```{bash}
# Concatenate the trimmed R1 and R2 files of each sequencing run into a
# single <run>_trim-merge.fastq file.
# Set the input and output directories
input_dir="/home/shared/8TB_HDD_02/mattgeorgephd/USDA-NRSP-8-gigas-rDNA/data/trimmed"
output_dir="/home/shared/8TB_HDD_02/mattgeorgephd/USDA-NRSP-8-gigas-rDNA/data/trim-merge/"
# Create the directory the merged files are written to; -p makes re-runs safe
mkdir -p "$output_dir"
# Loop through all of the R1 fastq files in the input directory
for r1_file in "$input_dir"/*_R1_*.fastq
do
# when nothing matches, the loop sees the literal pattern: skip it
[ -e "$r1_file" ] || continue
# Extract the sequencing run (first two "_"-separated fields) from the R1 file name
run=$(basename "$r1_file" | cut -d'_' -f1,2)
# Find the corresponding R2 file(s). The wildcard must stay OUTSIDE the
# quotes: a quoted "*" is never expanded, so cat would be handed a literal
# path containing "*" and fail.
r2_files=("$input_dir"/"${run}"_R2_*.fastq)
if [ ! -e "${r2_files[0]}" ]
then
# do not silently write a "merged" file that only contains R1 reads
echo "no R2 file found for ${run}; skipping" >&2
continue
fi
# Concatenate the R1 and R2 files and save the output to a new file in the output directory
cat "$r1_file" "${r2_files[@]}" > "$output_dir"/"${run}_trim-merge.fastq"
done
```
# concatenate fastq files by lane (in place, inside data/trimmed/)
```{bash}
# For every sample prefix (the first two "_"-separated fields of the file
# name), concatenate its R1 files and then its R2 files into
# <prefix>_trim.fastq inside data/trimmed/.
cd data/trimmed/ || exit 1
# List the .fastq files, reduce each name to its prefix, and drop adjacent
# duplicates (glob output is sorted, so equal prefixes are adjacent).
printf '%s\n' *.fastq | sed 's/^\([^_]*_[^_]*\).*/\1/' | uniq |
while read -r prefix; do
# start the merged file with the R1 reads (truncates any earlier output)
cat "$prefix"*R1*.fastq >"${prefix}_trim.fastq"
# append (>>) the R2 reads; a second ">" would overwrite the R1 reads
# that were just written
cat "$prefix"*R2*.fastq >>"${prefix}_trim.fastq" # include if more than one run
done
# I moved files to trim-merge-fastq
```
# Run fastqc on trimmed & merged files
```{bash}
# fastqc/ already exists once the untrimmed QC chunk has run, so create
# the report directory with -p to avoid a "File exists" error.
mkdir -p fastqc/trim-merge/
# Run FastQC on every trimmed and merged .fastq file; reports go to
# fastqc/trim-merge/
/home/shared/FastQC/fastqc \
data/trim-merge/*.fastq \
--outdir fastqc/trim-merge/ \
--quiet
```
# Run multiqc on trimmed & merged files
```{bash}
# Load conda's shell integration so "conda activate" works in this
# non-interactive shell
eval "$(/opt/anaconda/anaconda3/bin/conda shell.bash hook)"
# No environment name is given, so conda's default environment is activated
conda activate
# Abort if the FastQC output directory is missing; otherwise multiqc would
# scan whatever directory the chunk happened to start in.
cd fastqc/trim-merge/ || exit 1
# Summarise the reports found in the current directory
multiqc .
```
```{r}
```