---
title: "1_data_processing"
output: html_document
date: "2023-04-03"
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```

## Download fastq.gz files from GENEWIZ

#```{bash}
## I ran this from the terminal
#sftp mngeorge_uw@sftp.genewiz.com
# (password: [REDACTED - never commit credentials; rotate this password])
#mkdir data
#mkdir data/raw
#lcd data/raw
#cd 30-835022638/00_fastq/
#mget *
#```

```{bash}
# unzip .fastq.gz files
# Abort if the directory is missing so gunzip never runs somewhere unexpected.
cd data/raw/ || exit 1
gunzip *.fastq.gz
```

```{bash}
# zip .fastq files
cd data/raw/ || exit 1
gzip *.fastq
```

```{bash}
# transfer raw data to owl
# Abort if the directory is missing; otherwise ./* would upload the wrong files.
cd data/raw/ || exit 1
# rsync does not speak https://; a remote destination is written [USER@]HOST:PATH.
# Prefix the host with "user@" if the account name on owl differs from the local one.
rsync -avz --progress ./* owl.fish.washington.edu:/volume1/web/nightingales/C_gigas/
```

# Run fastqc on untrimmed files

```{bash}
# -p creates parent directories and does not fail when they already exist
mkdir -p fastqc/untrimmed/

/home/shared/FastQC/fastqc \
  data/raw/*.fastq \
  --outdir fastqc/untrimmed/ \
  --quiet
```

# Run multiqc on untrimmed files

```{bash}
# Load conda's shell functions into this non-interactive shell, then activate
# the default environment so that multiqc is on the PATH.
eval "$(/opt/anaconda/anaconda3/bin/conda shell.bash hook)"
conda activate

cd fastqc/untrimmed/ || exit 1
multiqc .
```

```{bash}
# trim adapter sequences
mkdir -p data/trimmed/
cd data/raw/ || exit 1

for F in *.fastq
do
  # strip the directory structure from each file name, then insert the suffix
  # _trim before the extension to create the output name
  # (e.g. sample.fastq -> sample_trim.fastq)
  results_file="$(basename -a "$F" | sed 's/\.[^.]*$/_trim&/')"

  # run cutadapt on each file, hard trim first 10 bp
  /home/shared/8TB_HDD_02/mattgeorgephd/.local/bin/cutadapt "$F" -u 10 -o \
    /home/shared/8TB_HDD_02/mattgeorgephd/USDA-NRSP-8-gigas-rDNA/data/trimmed/"$results_file"
done
```

# concatenate fastq files by lane

```{bash}
mkdir -p data/trim-merge/

# Set the input and output directories
input_dir="/home/shared/8TB_HDD_02/mattgeorgephd/USDA-NRSP-8-gigas-rDNA/data/trimmed"
output_dir="/home/shared/8TB_HDD_02/mattgeorgephd/USDA-NRSP-8-gigas-rDNA/data/trim-merge/"

# Loop through all of the R1 fastq files in the input directory
for r1_file in "$input_dir"/*_R1_*.fastq
do
  # Extract the sequencing run (first two underscore-separated fields)
  # from the R1 file name
  run=$(basename "$r1_file" | cut -d'_' -f1,2)

  # Find the corresponding R2 file(s). The wildcard must sit outside the
  # quotes: a quoted * is taken literally and would never match a file.
  r2_files=( "$input_dir"/"${run}"_R2_*.fastq )

  # An unmatched glob is left as the literal pattern; skip the run rather
  # than writing an output that silently lacks its R2 reads.
  if [ ! -e "${r2_files[0]}" ]; then
    echo "No R2 file found for ${run}; skipping" >&2
    continue
  fi

  # Concatenate the R1 and R2 files and save the output to a new file in
  # the output directory
  cat "$r1_file" "${r2_files[@]}" > "$output_dir"/"${run}_trim-merge.fastq"
done
```

# concatenate fastq files by lane

```{bash}
cd data/trimmed/ || exit 1

# Reduce every file name to its first two underscore-separated fields and
# visit each distinct prefix once (glob output is sorted, so uniq suffices).
printf '%s\n' *.fastq | sed 's/^\([^_]*_[^_]*\).*/\1/' | uniq |
while read -r prefix; do
  cat "$prefix"*R1*.fastq > "${prefix}_trim.fastq"
  # Append with >>; a second > would truncate the R1 reads written just above.
  cat "$prefix"*R2*.fastq >> "${prefix}_trim.fastq" # include if more than one run
done

# I moved files to trim-merge-fastq
```

# Run fastqc on trimmed & merged files

```{bash}
# -p: fastqc/ already exists from the untrimmed run
mkdir -p fastqc/trim-merge/

/home/shared/FastQC/fastqc \
  data/trim-merge/*.fastq \
  --outdir fastqc/trim-merge/ \
  --quiet
```

# Run multiqc on trimmed & merged files

```{bash}
eval "$(/opt/anaconda/anaconda3/bin/conda shell.bash hook)"
conda activate

cd fastqc/trim-merge/ || exit 1
multiqc .
```

```{r}

```