# Show the command search path of this session (quoted so the value is
# printed verbatim, without word splitting or glob expansion).
echo "$PATH"
/home/shared/8TB_HDD_02/mattgeorgephd/.local/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/snap/bin:/usr/lib/rstudio-server/bin/postback:/usr/lib/rstudio-server/bin/postback:/usr/lib/rstudio-server/bin/postback:/usr/lib/rstudio-server/bin/postback:/usr/local/go/bin

Get sockeye salmon assembly: https://www.ncbi.nlm.nih.gov/assembly/GCF_006149115.2


# Download the annotation (.gff) and sequence (.fna) files of assembly
# GCF_006149115.2 into sequences/.
#
# wget's -P option saves into sequences/ directly, so the working directory is
# never changed and the sequences/... path given to `head` below resolves from
# the same place the downloads were started.
genome_url=https://gannet.fish.washington.edu/panopea/berdahl-sockeye-salmon/genome/

mkdir -p sequences/

for genome_file in \
  GCF_006149115.2_Oner_1.1_genomic.gff \
  GCF_006149115.2_Oner_1.1_genomic.fna; do
  # -r --no-directories --no-parent : walk the listing, save files flat
  # -A                              : accept only the file named here
  # --no-check-certificate          : TLS verification is skipped (unchanged)
  wget -r \
    --no-directories --no-parent \
    -P sequences/ \
    -A "$genome_file" \
    --no-check-certificate \
    "$genome_url"
done

# Quick look at the downloaded FASTA
head sequences/GCF_006149115.2_Oner_1.1_genomic.fna

Calculate chromosome lengths

#######################################
# Print one "<sequence-id><TAB><length>" line per record of a FASTA file.
# The id is the first whitespace-delimited token of the header line, whatever
# its width, so ids are never truncated and the length column is always the
# second field.
# Arguments: $1 - path to the FASTA file
# Outputs:   the two-column table on stdout
#######################################
fasta_sequence_lengths() {
  awk '
    /^>/ {
      # a new header closes the previous record
      if (have_record) print seq_id "\t" seq_len
      sub(/^>[ \t]*/, "")
      seq_id = $1
      seq_len = 0
      have_record = 1
      next
    }
    { seq_len += length($0) }
    END { if (have_record) print seq_id "\t" seq_len }
  ' "$1"
}

fasta_sequence_lengths sequences/GCF_006149115.2_Oner_1.1_genomic.fna \
  > sequences/GCF_006149115.2_Oner_1.1_genomic-sequence-lengths.txt

head sequences/GCF_006149115.2_Oner_1.1_genomic-sequence-lengths.txt

Generate mRNA feature track from genomic_sequence

head sequences/GCF_006149115.2_Oner_1.1_genomic.gff

#######################################
# Keep only the mRNA features of a GFF file.
# GFF columns are tab-separated, so the source (column 2) and feature type
# (column 3) are compared exactly instead of grepping for "source mRNA" text;
# this cannot be broken by tab/space differences and never matches text in the
# attributes column.
# Arguments: GFF file(s) to read (stdin when none is given)
# Outputs:   matching GFF lines on stdout
#######################################
filter_mrna_features() {
  awk -F'\t' '
    $3 == "mRNA" &&
      ($2 == "Gnomon" || $2 == "RefSeq" || $2 == "cmsearch" || $2 == "tRNAscan-SE")
  ' "$@"
}

# Sort the mRNA track in the sequence order of the lengths table
filter_mrna_features sequences/GCF_006149115.2_Oner_1.1_genomic.gff \
  | /home/shared/bedtools2/bin/sortBed \
    -faidx sequences/GCF_006149115.2_Oner_1.1_genomic-sequence-lengths.txt \
  > sequences/GCF_006149115.2_Oner_1.1_mRNA.gff

head sequences/GCF_006149115.2_Oner_1.1_mRNA.gff

Download tag-seq data

# Fetch every .fastq.gz of the 20220714 tag-seq run into raw-data/.
# mkdir -p keeps a rerun from failing when the directory already exists, and
# -P raw-data/ removes the need for an unchecked `cd`.
mkdir -p raw-data/

wget -r \
  --no-directories --no-parent \
  -P raw-data/ \
  -A .fastq.gz \
  --no-check-certificate \
  https://gannet.fish.washington.edu/panopea/berdahl-sockeye-salmon/20220714-tagseq/

unzip .fastq.gz files

# Decompress the reads in place. The glob is anchored to raw-data/ so nothing
# outside that directory can be touched if it is missing.
gunzip -- raw-data/*.fastq.gz

Run fastqc on untrimmed files

# FastQC reports for the raw (untrimmed) reads.
project_dir=/home/shared/8TB_HDD_02/mattgeorgephd/berdahl-sockeye-salmon

# -p creates fastqc/ and fastqc/untrimmed/ in one go and is a no-op on reruns
mkdir -p "$project_dir/fastqc/untrimmed/"

/home/shared/FastQC/fastqc \
  "$project_dir"/raw-data/*.fastq \
  --outdir "$project_dir/fastqc/untrimmed/" \
  --quiet

Run multiqc

# Make conda's shell functions available, then activate the default environment
eval "$(/opt/anaconda/anaconda3/bin/conda shell.bash hook)"
conda activate

# Summarise the FastQC reports; multiqc only runs when the cd succeeded, so it
# can never scan a different directory by accident.
cd fastqc/untrimmed/ && multiqc .

trim adapter sequences

# Trim every raw FASTQ with cutadapt; each trimmed file keeps the name of its
# input and is written to trim-fastq/.
project_dir=/home/shared/8TB_HDD_02/mattgeorgephd/berdahl-sockeye-salmon
cutadapt_bin=/home/shared/8TB_HDD_02/mattgeorgephd/.local/bin/cutadapt

mkdir -p "$project_dir/trim-fastq/"

for F in "$project_dir"/raw-data/*.fastq; do
  # skip the literal pattern when no file matches
  [[ -e "$F" ]] || continue

  # output name = input name without its directory
  results_file=${F##*/}

  # -a A{8} / -a G{8} / -a AGATCGG : adapter sequences to remove
  # -u 15 : hard trim first 15 bp
  # -m 20 : minimum length cutoff
  "$cutadapt_bin" "$F" -a 'A{8}' -a 'G{8}' -a AGATCGG -u 15 -m 20 \
    -o "$project_dir/trim-fastq/$results_file"
done

concatenate fastq files across lanes (one merged file per sample)

#######################################
# Concatenate the per-lane R1 files of each sample into one file per sample.
# The sample prefix is the first two "_"-separated fields of the file name
# (e.g. 06C_S90_L001_R1_001.fastq -> 06C_S90); lanes are appended in glob
# (sorted) order. Each output is truncated the first time its sample is seen,
# so rerunning never doubles the reads.
# Arguments: $1 - directory holding the per-lane *.fastq files
#            $2 - directory receiving <prefix>_R1.fastq (created if missing)
# Returns:   0 on success, 1 if a directory or file cannot be written
#######################################
merge_lanes() {
  local src_dir=$1 dest_dir=$2
  local lane_file file_name rest sample_prefix merged_file
  local -A started=()

  mkdir -p -- "$dest_dir" || return 1

  for lane_file in "$src_dir"/*R1*.fastq; do
    [[ -e "$lane_file" ]] || continue          # no match: nothing to merge
    file_name=${lane_file##*/}
    [[ "$file_name" == *_*_* ]] || continue    # needs a two-field prefix

    rest=${file_name#*_}
    sample_prefix=${file_name%%_*}_${rest%%_*}
    # R1 must appear after the sample prefix, not inside it
    [[ "${file_name#"$sample_prefix"}" == *R1*.fastq ]] || continue

    merged_file=$dest_dir/${sample_prefix}_R1.fastq
    # never feed a merged file back into itself
    [[ "$lane_file" -ef "$merged_file" ]] && continue

    if [[ -z "${started[$sample_prefix]:-}" ]]; then
      : > "$merged_file" || return 1
      started[$sample_prefix]=1
    fi
    cat -- "$lane_file" >> "$merged_file" || return 1
    # R2 files are not merged (single-end run); add an R2 pass if needed
  done
  return 0
}

merge_lanes trim-fastq merged-fastq

# Merged files are written straight into merged-fastq/, so no manual move is needed.
   13633444 /home/shared/8TB_HDD_02/mattgeorgephd/berdahl-sockeye-salmon/trim-fastq/06C_S90_L002_R1_001.fastq
   12176996 /home/shared/8TB_HDD_02/mattgeorgephd/berdahl-sockeye-salmon/trim-fastq/07C_S91_L001_R1_001.fastq
   12257640 /home/shared/8TB_HDD_02/mattgeorgephd/berdahl-sockeye-salmon/trim-fastq/07C_S91_L002_R1_001.fastq
   13565896 /home/shared/8TB_HDD_02/mattgeorgephd/berdahl-sockeye-salmon/trim-fastq/08C_S92_L001_R1_001.fastq
   13802904 /home/shared/8TB_HDD_02/mattgeorgephd/berdahl-sockeye-salmon/trim-fastq/08C_S92_L002_R1_001.fastq
   12848700 /home/shared/8TB_HDD_02/mattgeorgephd/berdahl-sockeye-salmon/trim-fastq/09C_S93_L001_R1_001.fastq
   12867400 /home/shared/8TB_HDD_02/mattgeorgephd/berdahl-sockeye-salmon/trim-fastq/09C_S93_L002_R1_001.fastq
   16338984 /home/shared/8TB_HDD_02/mattgeorgephd/berdahl-sockeye-salmon/trim-fastq/10C_S94_L001_R1_001.fastq
   16255028 /home/shared/8TB_HDD_02/mattgeorgephd/berdahl-sockeye-salmon/trim-fastq/10C_S94_L002_R1_001.fastq
   13960488 /home/shared/8TB_HDD_02/mattgeorgephd/berdahl-sockeye-salmon/trim-fastq/11C_S95_L001_R1_001.fastq
   14004052 /home/shared/8TB_HDD_02/mattgeorgephd/berdahl-sockeye-salmon/trim-fastq/11C_S95_L002_R1_001.fastq
   13511560 /home/shared/8TB_HDD_02/mattgeorgephd/berdahl-sockeye-salmon/trim-fastq/12C_S96_L001_R1_001.fastq
   13632664 /home/shared/8TB_HDD_02/mattgeorgephd/berdahl-sockeye-salmon/trim-fastq/12C_S96_L002_R1_001.fastq
   16882648 /home/shared/8TB_HDD_02/mattgeorgephd/berdahl-sockeye-salmon/trim-fastq/13C_S97_L001_R1_001.fastq
   17132804 /home/shared/8TB_HDD_02/mattgeorgephd/berdahl-sockeye-salmon/trim-fastq/13C_S97_L002_R1_001.fastq
   12819864 /home/shared/8TB_HDD_02/mattgeorgephd/berdahl-sockeye-salmon/trim-fastq/14C_S98_L001_R1_001.fastq
   12936660 /home/shared/8TB_HDD_02/mattgeorgephd/berdahl-sockeye-salmon/trim-fastq/14C_S98_L002_R1_001.fastq
   11411964 /home/shared/8TB_HDD_02/mattgeorgephd/berdahl-sockeye-salmon/trim-fastq/15C_S99_L001_R1_001.fastq
   11686036 /home/shared/8TB_HDD_02/mattgeorgephd/berdahl-sockeye-salmon/trim-fastq/15C_S99_L002_R1_001.fastq
   13981132 /home/shared/8TB_HDD_02/mattgeorgephd/berdahl-sockeye-salmon/trim-fastq/16C_S100_L001_R1_001.fastq
   14065312 /home/shared/8TB_HDD_02/mattgeorgephd/berdahl-sockeye-salmon/trim-fastq/16C_S100_L002_R1_001.fastq
    7421516 /home/shared/8TB_HDD_02/mattgeorgephd/berdahl-sockeye-salmon/trim-fastq/17C_S101_L001_R1_001.fastq
    7426192 /home/shared/8TB_HDD_02/mattgeorgephd/berdahl-sockeye-salmon/trim-fastq/17C_S101_L002_R1_001.fastq
   12192192 /home/shared/8TB_HDD_02/mattgeorgephd/berdahl-sockeye-salmon/trim-fastq/18C_S102_L001_R1_001.fastq
   12214596 /home/shared/8TB_HDD_02/mattgeorgephd/berdahl-sockeye-salmon/trim-fastq/18C_S102_L002_R1_001.fastq
   15393748 /home/shared/8TB_HDD_02/mattgeorgephd/berdahl-sockeye-salmon/trim-fastq/19C_S103_L001_R1_001.fastq
   15301248 /home/shared/8TB_HDD_02/mattgeorgephd/berdahl-sockeye-salmon/trim-fastq/19C_S103_L002_R1_001.fastq
   15108256 /home/shared/8TB_HDD_02/mattgeorgephd/berdahl-sockeye-salmon/trim-fastq/20C_S104_L001_R1_001.fastq
   15209208 /home/shared/8TB_HDD_02/mattgeorgephd/berdahl-sockeye-salmon/trim-fastq/20C_S104_L002_R1_001.fastq
   17598696 /home/shared/8TB_HDD_02/mattgeorgephd/berdahl-sockeye-salmon/trim-fastq/21C_S105_L001_R1_001.fastq
   17535320 /home/shared/8TB_HDD_02/mattgeorgephd/berdahl-sockeye-salmon/trim-fastq/21C_S105_L002_R1_001.fastq
   13962128 /home/shared/8TB_HDD_02/mattgeorgephd/berdahl-sockeye-salmon/trim-fastq/22C_S106_L001_R1_001.fastq
   14009740 /home/shared/8TB_HDD_02/mattgeorgephd/berdahl-sockeye-salmon/trim-fastq/22C_S106_L002_R1_001.fastq
   12933164 /home/shared/8TB_HDD_02/mattgeorgephd/berdahl-sockeye-salmon/trim-fastq/23C_S107_L001_R1_001.fastq
   12891852 /home/shared/8TB_HDD_02/mattgeorgephd/berdahl-sockeye-salmon/trim-fastq/23C_S107_L002_R1_001.fastq
   16135348 /home/shared/8TB_HDD_02/mattgeorgephd/berdahl-sockeye-salmon/trim-fastq/24C_S108_L001_R1_001.fastq
   16095096 /home/shared/8TB_HDD_02/mattgeorgephd/berdahl-sockeye-salmon/trim-fastq/24C_S108_L002_R1_001.fastq
   12703556 /home/shared/8TB_HDD_02/mattgeorgephd/berdahl-sockeye-salmon/trim-fastq/25C_S109_L001_R1_001.fastq
   12788544 /home/shared/8TB_HDD_02/mattgeorgephd/berdahl-sockeye-salmon/trim-fastq/25C_S109_L002_R1_001.fastq
   13546676 /home/shared/8TB_HDD_02/mattgeorgephd/berdahl-sockeye-salmon/trim-fastq/26C_S110_L001_R1_001.fastq
   13454732 /home/shared/8TB_HDD_02/mattgeorgephd/berdahl-sockeye-salmon/trim-fastq/26C_S110_L002_R1_001.fastq
   12437512 /home/shared/8TB_HDD_02/mattgeorgephd/berdahl-sockeye-salmon/trim-fastq/27C_S111_L001_R1_001.fastq
   12485552 /home/shared/8TB_HDD_02/mattgeorgephd/berdahl-sockeye-salmon/trim-fastq/27C_S111_L002_R1_001.fastq
   15652616 /home/shared/8TB_HDD_02/mattgeorgephd/berdahl-sockeye-salmon/trim-fastq/28C_S112_L001_R1_001.fastq
   15813848 /home/shared/8TB_HDD_02/mattgeorgephd/berdahl-sockeye-salmon/trim-fastq/28C_S112_L002_R1_001.fastq
   12814088 /home/shared/8TB_HDD_02/mattgeorgephd/berdahl-sockeye-salmon/trim-fastq/29C_S113_L001_R1_001.fastq
   12823820 /home/shared/8TB_HDD_02/mattgeorgephd/berdahl-sockeye-salmon/trim-fastq/29C_S113_L002_R1_001.fastq
   13891748 /home/shared/8TB_HDD_02/mattgeorgephd/berdahl-sockeye-salmon/trim-fastq/30C_S114_L001_R1_001.fastq
   13811900 /home/shared/8TB_HDD_02/mattgeorgephd/berdahl-sockeye-salmon/trim-fastq/30C_S114_L002_R1_001.fastq
  804787096 total
   20772844 /home/shared/8TB_HDD_02/mattgeorgephd/berdahl-sockeye-salmon/merged-fastq/01C_S85_R1.fastq
   25282884 /home/shared/8TB_HDD_02/mattgeorgephd/berdahl-sockeye-salmon/merged-fastq/02C_S86_R1.fastq
   23364480 /home/shared/8TB_HDD_02/mattgeorgephd/berdahl-sockeye-salmon/merged-fastq/03C_S87_R1.fastq
   27827996 /home/shared/8TB_HDD_02/mattgeorgephd/berdahl-sockeye-salmon/merged-fastq/04C_S88_R1.fastq
   20571996 /home/shared/8TB_HDD_02/mattgeorgephd/berdahl-sockeye-salmon/merged-fastq/05C_S89_R1.fastq
   27175272 /home/shared/8TB_HDD_02/mattgeorgephd/berdahl-sockeye-salmon/merged-fastq/06C_S90_R1.fastq
   24434636 /home/shared/8TB_HDD_02/mattgeorgephd/berdahl-sockeye-salmon/merged-fastq/07C_S91_R1.fastq
   27368800 /home/shared/8TB_HDD_02/mattgeorgephd/berdahl-sockeye-salmon/merged-fastq/08C_S92_R1.fastq
   25716100 /home/shared/8TB_HDD_02/mattgeorgephd/berdahl-sockeye-salmon/merged-fastq/09C_S93_R1.fastq
   32594012 /home/shared/8TB_HDD_02/mattgeorgephd/berdahl-sockeye-salmon/merged-fastq/10C_S94_R1.fastq
   27964540 /home/shared/8TB_HDD_02/mattgeorgephd/berdahl-sockeye-salmon/merged-fastq/11C_S95_R1.fastq
   27144224 /home/shared/8TB_HDD_02/mattgeorgephd/berdahl-sockeye-salmon/merged-fastq/12C_S96_R1.fastq
   34015452 /home/shared/8TB_HDD_02/mattgeorgephd/berdahl-sockeye-salmon/merged-fastq/13C_S97_R1.fastq
   25756524 /home/shared/8TB_HDD_02/mattgeorgephd/berdahl-sockeye-salmon/merged-fastq/14C_S98_R1.fastq
   23098000 /home/shared/8TB_HDD_02/mattgeorgephd/berdahl-sockeye-salmon/merged-fastq/15C_S99_R1.fastq
   28046444 /home/shared/8TB_HDD_02/mattgeorgephd/berdahl-sockeye-salmon/merged-fastq/16C_S100_R1.fastq
   14847708 /home/shared/8TB_HDD_02/mattgeorgephd/berdahl-sockeye-salmon/merged-fastq/17C_S101_R1.fastq
   24406788 /home/shared/8TB_HDD_02/mattgeorgephd/berdahl-sockeye-salmon/merged-fastq/18C_S102_R1.fastq
   30694996 /home/shared/8TB_HDD_02/mattgeorgephd/berdahl-sockeye-salmon/merged-fastq/19C_S103_R1.fastq
   30317464 /home/shared/8TB_HDD_02/mattgeorgephd/berdahl-sockeye-salmon/merged-fastq/20C_S104_R1.fastq
   35134016 /home/shared/8TB_HDD_02/mattgeorgephd/berdahl-sockeye-salmon/merged-fastq/21C_S105_R1.fastq
   27971868 /home/shared/8TB_HDD_02/mattgeorgephd/berdahl-sockeye-salmon/merged-fastq/22C_S106_R1.fastq
   25825016 /home/shared/8TB_HDD_02/mattgeorgephd/berdahl-sockeye-salmon/merged-fastq/23C_S107_R1.fastq
   32230444 /home/shared/8TB_HDD_02/mattgeorgephd/berdahl-sockeye-salmon/merged-fastq/24C_S108_R1.fastq
   25492100 /home/shared/8TB_HDD_02/mattgeorgephd/berdahl-sockeye-salmon/merged-fastq/25C_S109_R1.fastq
   27001408 /home/shared/8TB_HDD_02/mattgeorgephd/berdahl-sockeye-salmon/merged-fastq/26C_S110_R1.fastq
   24923064 /home/shared/8TB_HDD_02/mattgeorgephd/berdahl-sockeye-salmon/merged-fastq/27C_S111_R1.fastq
   31466464 /home/shared/8TB_HDD_02/mattgeorgephd/berdahl-sockeye-salmon/merged-fastq/28C_S112_R1.fastq
   25637908 /home/shared/8TB_HDD_02/mattgeorgephd/berdahl-sockeye-salmon/merged-fastq/29C_S113_R1.fastq
   27703648 /home/shared/8TB_HDD_02/mattgeorgephd/berdahl-sockeye-salmon/merged-fastq/30C_S114_R1.fastq
  804787096 total
# Reads remaining after trimming and filtering (%)
1472899016/1582147952*100

Run fastqc on trimmed files

# FastQC reports for the trimmed, lane-merged reads.
project_dir=/home/shared/8TB_HDD_02/mattgeorgephd/berdahl-sockeye-salmon

# fastqc/ already exists from the untrimmed run; -p makes that a no-op instead
# of an error and still creates fastqc/trimmed/
mkdir -p "$project_dir/fastqc/trimmed/"

/home/shared/FastQC/fastqc \
  "$project_dir"/merged-fastq/*.fastq \
  --outdir "$project_dir/fastqc/trimmed/" \
  --quiet

Run multiqc on trimmed files

# Make conda's shell functions available, then activate the default environment
eval "$(/opt/anaconda/anaconda3/bin/conda shell.bash hook)"
conda activate

# Summarise the trimmed-read FastQC reports; multiqc is skipped if the report
# directory cannot be entered.
cd fastqc/trimmed/ && multiqc .

create bowtie2 index for the sockeye salmon (O. nerka) reference sequences (took 8 min on Raven)

# Build the bowtie2 index from rna.fna; the index files use the basename
# GENOME.fa inside the sequences directory.
sequences_dir=/home/shared/8TB_HDD_02/mattgeorgephd/berdahl-sockeye-salmon/sequences
bowtie2_build=/home/shared/bowtie2-2.4.4-linux-x86_64/bowtie2-build

"$bowtie2_build" "$sequences_dir/rna.fna" "$sequences_dir/GENOME.fa"

Run bowtie2 on trimmed reads, preset option = --sensitive-local


# Align every trimmed FASTQ with bowtie2 (local mode); one
# <input name>.sam per input goes to bowtie_sam/.
project_dir=/home/shared/8TB_HDD_02/mattgeorgephd/berdahl-sockeye-salmon
bowtie2_bin=/home/shared/bowtie2-2.4.4-linux-x86_64/bowtie2

mkdir -p "$project_dir/bowtie_sam/"

for file in "$project_dir"/trim-fastq/*.fastq; do
  # skip the literal pattern when no file matches
  [[ -e "$file" ]] || continue
  results_file="${file##*/}.sam"

  # run Bowtie2 on each file
  "$bowtie2_bin" \
    --local \
    -x "$project_dir/sequences/GENOME.fa" \
    --sensitive-local \
    --threads 48 \
    --no-unal \
    -k 5 \
    -U "$file" \
    -S "$project_dir/bowtie_sam/$results_file"
# stdout and stderr of every run are appended to one log file
done >> "$project_dir/bowtie_sam/bowtieout.txt" 2>&1

# check % alignment from Bowtie
grep "overall alignment rate" "$project_dir/bowtie_sam/bowtieout.txt"

# average alignment rate = 65.91 +/- 4.87 sd

Convert .sam files to .bam files, create bam indices

# Convert each bowtie2 .sam to a coordinate-sorted .bam and index it.
# The original loop produced the sorted BAMs but never built the indices the
# heading asks for; `samtools index` adds them next to each BAM.
project_dir=/home/shared/8TB_HDD_02/mattgeorgephd/berdahl-sockeye-salmon
samtools_bin=/home/shared/samtools-1.12/samtools

mkdir -p "$project_dir/bowtie_bam/"

for file in "$project_dir"/bowtie_sam/*.sam; do
  # skip the literal pattern when no file matches
  [[ -e "$file" ]] || continue
  results_file="${file##*/}_sorted.bam"

  # view -b : SAM -> BAM on stdout;  sort ... - : read that stream from stdin
  # index only runs when the sorted BAM was written successfully
  "$samtools_bin" view -b "$file" \
    | "$samtools_bin" sort -o "$project_dir/bowtie_bam/$results_file" - \
    && "$samtools_bin" index "$project_dir/bowtie_bam/$results_file"
done

create hisat2 index for the sockeye salmon (O. nerka) genome (took 31 min on Raven)

  Sorting block time: 00:02:35
Returning block of 253935055 for bucket 4
Getting block 5 of 8
  Reserving size (347087862) for bucket 5
  Calculating Z arrays for bucket 5
  Entering block accumulator loop for bucket 5:
  bucket 5: 10%
  bucket 5: 20%
  bucket 5: 30%
  bucket 5: 40%
  bucket 5: 50%
  bucket 5: 60%
  bucket 5: 70%
  bucket 5: 80%
  bucket 5: 90%
  bucket 5: 100%
  Sorting block of length 262480610 for bucket 5
  (Using difference cover)
  Sorting block time: 00:02:41
Returning block of 262480611 for bucket 5
Getting block 6 of 8
  Reserving size (347087862) for bucket 6
  Calculating Z arrays for bucket 6
  Entering block accumulator loop for bucket 6:
  bucket 6: 10%
  bucket 6: 20%
  bucket 6: 30%
  bucket 6: 40%
  bucket 6: 50%
  bucket 6: 60%
  bucket 6: 70%
  bucket 6: 80%
  bucket 6: 90%
  bucket 6: 100%
  Sorting block of length 261295901 for bucket 6
  (Using difference cover)
  Sorting block time: 00:02:33
Returning block of 261295902 for bucket 6
Getting block 7 of 8
  Reserving size (347087862) for bucket 7
  Calculating Z arrays for bucket 7
  Entering block accumulator loop for bucket 7:
  bucket 7: 10%
  bucket 7: 20%
  bucket 7: 30%
  bucket 7: 40%
  bucket 7: 50%
  bucket 7: 60%
  bucket 7: 70%
  bucket 7: 80%
  bucket 7: 90%
  bucket 7: 100%
  Sorting block of length 300704932 for bucket 7
  (Using difference cover)
  Sorting block time: 00:02:58
Returning block of 300704933 for bucket 7
Getting block 8 of 8
  Reserving size (347087862) for bucket 8
  Calculating Z arrays for bucket 8
  Entering block accumulator loop for bucket 8:
  bucket 8: 10%
  bucket 8: 20%
  bucket 8: 30%
  bucket 8: 40%
  bucket 8: 50%
  bucket 8: 60%
  bucket 8: 70%
  bucket 8: 80%
  bucket 8: 90%
  bucket 8: 100%
  Sorting block of length 139674187 for bucket 8
  (Using difference cover)
  Sorting block time: 00:01:20
Returning block of 139674188 for bucket 8
Exited GFM loop
fchr[A]: 0
fchr[C]: 524334746
fchr[G]: 925667240
fchr[T]: 1327076578
fchr[$]: 1851135262
Exiting GFM::buildToDisk()
Returning from initFromVector
Wrote 627029974 bytes to primary GFM file: /home/shared/8TB_HDD_02/mattgeorgephd/berdahl-sockeye-salmon/sequences/hisat2_genome_index.fa.1.ht2
Wrote 462783820 bytes to secondary GFM file: /home/shared/8TB_HDD_02/mattgeorgephd/berdahl-sockeye-salmon/sequences/hisat2_genome_index.fa.2.ht2
Re-opening _in1 and _in2 as input streams
Returning from GFM constructor
Returning from initFromVector
Wrote 1070141415 bytes to primary GFM file: /home/shared/8TB_HDD_02/mattgeorgephd/berdahl-sockeye-salmon/sequences/hisat2_genome_index.fa.5.ht2
Wrote 470234864 bytes to secondary GFM file: /home/shared/8TB_HDD_02/mattgeorgephd/berdahl-sockeye-salmon/sequences/hisat2_genome_index.fa.6.ht2
Re-opening _in5 and _in5 as input streams
Returning from HGFM constructor
Headers:
    len: 1851135262
    gbwtLen: 1851135263
    nodes: 1851135263
    sz: 462783816
    gbwtSz: 462783816
    lineRate: 6
    offRate: 4
    offMask: 0xfffffff0
    ftabChars: 10
    eftabLen: 0
    eftabSz: 0
    ftabLen: 1048577
    ftabSz: 4194308
    offsLen: 115695954
    offsSz: 462783816
    lineSz: 64
    sideSz: 64
    sideGbwtSz: 48
    sideGbwtLen: 192
    numSides: 9641330
    numLines: 9641330
    gbwtTotLen: 617045120
    gbwtTotSz: 617045120
    reverse: 0
    linearFM: Yes
Total time for call to driver() for forward index: 00:31:06

Run hisat2 on trimmed reads


# Align each merged FASTQ with hisat2 and write a sorted BAM per sample.
#   hisat2_sam/<sample>.sam : raw alignments
#   hisat2_bam/<sample>.bam : sorted alignments (StringTie needs sorted input;
#                             --dta makes the alignments assembler-friendly)
# <sample> is the file name up to its first dot.
project_dir=/home/shared/8TB_HDD_02/mattgeorgephd/berdahl-sockeye-salmon
hisat2_bin=/home/shared/hisat2-2.2.1/hisat2
samtools_bin=/home/shared/samtools-1.12/samtools

# -p: rerunning must not fail because the directories already exist
mkdir -p "$project_dir/hisat2_sam/" "$project_dir/hisat2_bam/"

for i in "$project_dir"/merged-fastq/*.fastq; do
  # skip the literal pattern when no file matches
  [[ -e "$i" ]] || continue
  sample_name=${i##*/}
  sample_name=${sample_name%%.*}

  "$hisat2_bin" \
    -p 16 \
    --dta \
    -x "$project_dir/sequences/hisat2_genome_index.fa" \
    -U "$i" \
    -S "$project_dir/hisat2_sam/${sample_name}.sam"

  "$samtools_bin" sort -@ 8 \
    -o "$project_dir/hisat2_bam/${sample_name}.bam" \
    "$project_dir/hisat2_sam/${sample_name}.sam"
  echo "${i##*/} bam-ified!"
  # the .sam is kept; uncomment to delete it once the BAM exists
  # rm "$project_dir/hisat2_sam/${sample_name}.sam"
# The log is appended to sequences/hisat2out.txt, the file the alignment-rate
# check reads (it used to land in merged-fastq/, where that check never looked)
done >> "$project_dir/sequences/hisat2out.txt" 2>&1
mkdir: cannot create directory ‘hisat2_bam/’: File exists
# Pull the per-sample alignment summary lines out of the hisat2 log
hisat2_log=sequences/hisat2out.txt

grep -e "overall alignment rate" "$hisat2_log"

# average alignment rate = 50.14 +/- 5.81 sd (w/o hard trim of first 15 bp)
# average alignment rate = 56.34 +/- 5.27 sd (w/ hard trim of first 15 bp)
90.97% overall alignment rate
89.06% overall alignment rate
90.28% overall alignment rate
89.45% overall alignment rate
81.33% overall alignment rate
91.22% overall alignment rate
90.19% overall alignment rate
89.83% overall alignment rate
85.35% overall alignment rate
89.24% overall alignment rate
90.51% overall alignment rate
87.71% overall alignment rate
87.30% overall alignment rate
88.49% overall alignment rate
89.23% overall alignment rate
89.60% overall alignment rate
85.05% overall alignment rate
90.24% overall alignment rate
88.40% overall alignment rate
89.49% overall alignment rate
89.41% overall alignment rate
83.36% overall alignment rate
89.58% overall alignment rate
89.25% overall alignment rate
88.93% overall alignment rate
87.87% overall alignment rate
89.55% overall alignment rate
89.55% overall alignment rate
89.82% overall alignment rate
89.42% overall alignment rate

Convert .sam files to .bam files, create bam indices

# Convert each hisat2 .sam to a coordinate-sorted .bam and index it.
# hisat2_bam/ already exists after the alignment step, so mkdir uses -p; the
# BAM indices named in the heading are now actually built with `samtools index`.
project_dir=/home/shared/8TB_HDD_02/mattgeorgephd/berdahl-sockeye-salmon
samtools_bin=/home/shared/samtools-1.12/samtools

mkdir -p "$project_dir/hisat2_bam/"

for file in "$project_dir"/hisat2_sam/*.sam; do
  # skip the literal pattern when no file matches
  [[ -e "$file" ]] || continue
  results_file="${file##*/}_sorted.bam"

  # view -b : SAM -> BAM on stdout;  sort ... - : read that stream from stdin
  # index only runs when the sorted BAM was written successfully
  "$samtools_bin" view -b "$file" \
    | "$samtools_bin" sort -o "$project_dir/hisat2_bam/$results_file" - \
    && "$samtools_bin" index "$project_dir/hisat2_bam/$results_file"
done

Assemble hisat2 alignments w/ stringtie2 using mRNA genome feature track

StringTie assembly for seq file /home/shared/8TB_HDD_02/mattgeorgephd/berdahl-sockeye-salmon/hisat2_bam/06C_S90_R1.bam Mon Aug 8 17:16:17 PDT 2022
StringTie assembly for seq file /home/shared/8TB_HDD_02/mattgeorgephd/berdahl-sockeye-salmon/hisat2_bam/07C_S91_R1.bam Mon Aug 8 17:16:29 PDT 2022
StringTie assembly for seq file /home/shared/8TB_HDD_02/mattgeorgephd/berdahl-sockeye-salmon/hisat2_bam/08C_S92_R1.bam Mon Aug 8 17:16:43 PDT 2022
StringTie assembly for seq file /home/shared/8TB_HDD_02/mattgeorgephd/berdahl-sockeye-salmon/hisat2_bam/09C_S93_R1.bam Mon Aug 8 17:16:57 PDT 2022
StringTie assembly for seq file /home/shared/8TB_HDD_02/mattgeorgephd/berdahl-sockeye-salmon/hisat2_bam/10C_S94_R1.bam Mon Aug 8 17:17:13 PDT 2022
StringTie assembly for seq file /home/shared/8TB_HDD_02/mattgeorgephd/berdahl-sockeye-salmon/hisat2_bam/11C_S95_R1.bam Mon Aug 8 17:17:26 PDT 2022
StringTie assembly for seq file /home/shared/8TB_HDD_02/mattgeorgephd/berdahl-sockeye-salmon/hisat2_bam/12C_S96_R1.bam Mon Aug 8 17:17:40 PDT 2022
StringTie assembly for seq file /home/shared/8TB_HDD_02/mattgeorgephd/berdahl-sockeye-salmon/hisat2_bam/13C_S97_R1.bam Mon Aug 8 17:17:57 PDT 2022
StringTie assembly for seq file /home/shared/8TB_HDD_02/mattgeorgephd/berdahl-sockeye-salmon/hisat2_bam/14C_S98_R1.bam Mon Aug 8 17:18:12 PDT 2022
StringTie assembly for seq file /home/shared/8TB_HDD_02/mattgeorgephd/berdahl-sockeye-salmon/hisat2_bam/15C_S99_R1.bam Mon Aug 8 17:18:25 PDT 2022
StringTie assembly for seq file /home/shared/8TB_HDD_02/mattgeorgephd/berdahl-sockeye-salmon/hisat2_bam/16C_S100_R1.bam Mon Aug 8 17:18:40 PDT 2022
StringTie assembly for seq file /home/shared/8TB_HDD_02/mattgeorgephd/berdahl-sockeye-salmon/hisat2_bam/17C_S101_R1.bam Mon Aug 8 17:18:49 PDT 2022
StringTie assembly for seq file /home/shared/8TB_HDD_02/mattgeorgephd/berdahl-sockeye-salmon/hisat2_bam/18C_S102_R1.bam Mon Aug 8 17:19:02 PDT 2022
StringTie assembly for seq file /home/shared/8TB_HDD_02/mattgeorgephd/berdahl-sockeye-salmon/hisat2_bam/19C_S103_R1.bam Mon Aug 8 17:19:17 PDT 2022
StringTie assembly for seq file /home/shared/8TB_HDD_02/mattgeorgephd/berdahl-sockeye-salmon/hisat2_bam/20C_S104_R1.bam Mon Aug 8 17:19:33 PDT 2022
StringTie assembly for seq file /home/shared/8TB_HDD_02/mattgeorgephd/berdahl-sockeye-salmon/hisat2_bam/21C_S105_R1.bam Mon Aug 8 17:19:50 PDT 2022
StringTie assembly for seq file /home/shared/8TB_HDD_02/mattgeorgephd/berdahl-sockeye-salmon/hisat2_bam/22C_S106_R1.bam Mon Aug 8 17:20:04 PDT 2022
StringTie assembly for seq file /home/shared/8TB_HDD_02/mattgeorgephd/berdahl-sockeye-salmon/hisat2_bam/23C_S107_R1.bam Mon Aug 8 17:20:19 PDT 2022
StringTie assembly for seq file /home/shared/8TB_HDD_02/mattgeorgephd/berdahl-sockeye-salmon/hisat2_bam/24C_S108_R1.bam Mon Aug 8 17:20:35 PDT 2022
StringTie assembly for seq file /home/shared/8TB_HDD_02/mattgeorgephd/berdahl-sockeye-salmon/hisat2_bam/25C_S109_R1.bam Mon Aug 8 17:20:49 PDT 2022
StringTie assembly for seq file /home/shared/8TB_HDD_02/mattgeorgephd/berdahl-sockeye-salmon/hisat2_bam/26C_S110_R1.bam Mon Aug 8 17:21:03 PDT 2022
StringTie assembly for seq file /home/shared/8TB_HDD_02/mattgeorgephd/berdahl-sockeye-salmon/hisat2_bam/27C_S111_R1.bam Mon Aug 8 17:21:17 PDT 2022
StringTie assembly for seq file /home/shared/8TB_HDD_02/mattgeorgephd/berdahl-sockeye-salmon/hisat2_bam/28C_S112_R1.bam Mon Aug 8 17:21:33 PDT 2022
StringTie assembly for seq file /home/shared/8TB_HDD_02/mattgeorgephd/berdahl-sockeye-salmon/hisat2_bam/29C_S113_R1.bam Mon Aug 8 17:21:47 PDT 2022
StringTie assembly for seq file /home/shared/8TB_HDD_02/mattgeorgephd/berdahl-sockeye-salmon/hisat2_bam/30C_S114_R1.bam Mon Aug 8 17:22:02 PDT 2022
StringTie assembly COMPLETE, starting assembly analysis Mon Aug 8 17:22:02 PDT 2022

#######################################
# Write the two GTF manifests for the current directory:
#   gtf_list.txt : one absolute path per line (input of `stringtie --merge`)
#   listGTF.txt  : "<file name> <absolute path>" per line (input of prepDE.py)
# The merged GTF itself is left out, so rerunning after a merge does not list
# it as if it were a sample.
# Arguments: $1 - file name of the merged GTF to exclude
# Returns:   0 on success, 1 if a manifest cannot be written
#######################################
write_gtf_lists() {
  local merged_gtf=$1
  local gtf_file

  : > gtf_list.txt || return 1
  : > listGTF.txt || return 1
  for gtf_file in *.gtf; do
    [[ -e "$gtf_file" ]] || continue               # no .gtf files at all
    [[ "$gtf_file" == "$merged_gtf" ]] && continue
    printf '%s\n' "$PWD/$gtf_file" >> gtf_list.txt
    printf '%s %s\n' "$gtf_file" "$PWD/$gtf_file" >> listGTF.txt
  done
  return 0
}

#######################################
# Merge the per-sample StringTie GTFs and compile the gene count matrix.
# Runs inside hisat2_bam/ and stops early if that directory cannot be entered,
# so no manifest or matrix is ever written somewhere else.
# Outputs: onerka_merged.gtf, onerka_gene_count_matrix.csv, progress messages
# Returns: 1 if the directory or the manifests are unavailable
#######################################
merge_and_count() {
  local project_dir=/home/shared/8TB_HDD_02/mattgeorgephd/berdahl-sockeye-salmon
  local stringtie_dir=/home/shared/stringtie-2.2.1.Linux_x86_64

  cd "$project_dir/hisat2_bam" || return 1

  # make gtf list file (needed for stringtie merge function) and
  # listGTF file (needed for count matrix), two columns w/ sample ID
  write_gtf_lists onerka_merged.gtf || return 1

  # merge GTFs into a single file
  "$stringtie_dir/stringtie" \
    --merge \
    -p 48 \
    -G "$project_dir/sequences/GCF_006149115.2_Oner_1.1_mRNA.gff" \
    -o onerka_merged.gtf gtf_list.txt

  echo "Stringtie merge complete" "$(date)"

  # Compute accuracy of gff
  # gffcompare -r ../../../refs/Panopea-generosa-v1.0.a4.mRNA_SJG.gff3 -G -o merged Pgenerosa_merged.gtf #Compute the accuracy and pre$
  # echo "GFFcompare complete, Starting gene count matrix assembly..." $(date)

  # Compile gene count matrix from GTFs
  "$stringtie_dir/prepDE.py" \
    -g onerka_gene_count_matrix.csv \
    -i listGTF.txt

  echo "Gene count matrix compiled." "$(date)"
}

merge_and_count
Stringtie merge complete Mon Aug 8 17:25:55 PDT 2022
Gene count matrix compiled. Mon Aug 8 17:26:09 PDT 2022
