Project: O.Lurida Wildtype vs. Hatchery

Olympia oyster - The goal of this project is to see if there are genetic differences between olympia oysters from a hatchery group and a wild type group. -This could be useful for understanding if hatchery bred oyster can be used to restore wild type populations -Data was originally collected by PSRF ##Workflow -I received the data as BAM files and uploaded into Rstudio wild type samples

wget --recursive --no-parent --no-directories \
--no-check-certificate \
--accept=CSMB17W*.bam \
https://gannet.fish.washington.edu/acropora/OlyRAD_6plates/CSMB17W.v8/

-and for hatchery samples

wget --recursive --no-parent --no-directories \
--no-check-certificate \
--accept=CSMB18H*.bam \
https://gannet.fish.washington.edu/acropora/OlyRAD_6plates-v2/ 

##Merged two groups to make comparison -Call variants for hatchery

for i in {01..05}; do
  /home/shared/bcftools-1.14/bcftools mpileup -Ou -f \
  /home/shared/8TB_HDD_02/thielkla/Karina-chinook/Olurida_v081.fa \
  /home/shared/8TB_HDD_02/thielkla/Karina-chinook/CSBM18H/CSMB18H.${i}.bam \
  | /home/shared/bcftools-1.14/bcftools call -mv -Ov \
  -o ~/Karina-chinook/output/H${i}.vcf
done
 

-Call variants for wild type

for i in {01..05}; do
  /home/shared/bcftools-1.14/bcftools mpileup -Ou -f \
  /home/shared/8TB_HDD_02/thielkla/Karina-chinook/Olurida_v081.fa \
  /home/shared/8TB_HDD_02/thielkla/Karina-chinook/CSMB17W/CSMB17W.${i}.bam \
  | /home/shared/bcftools-1.14/bcftools call -mv -Ov \
  -o ~/Karina-chinook/output/W${i}.vcf
done

-Merged for comparison

for f in ~/Karina-chinook/output/*.vcf; do
  /home/shared/htslib-1.14/bgzip "$f"         # compress to .vcf.gz
  /home/shared/htslib-1.14/tabix -p vcf "$f.gz"  # index for random access
done
ls ~/Karina-chinook/output/*.vcf.gz > ~/Karina-chinook/output/vcflist.txt
/home/shared/bcftools-1.14/bcftools merge \
-l ~/Karina-chinook/output/vcflist.txt -Oz -o ~/Karina-chinook/output/merged.vcf.gz

/home/shared/bcftools-1.14/bcftools index ~/Karina-chinook/output/merged.vcf.gz
/home/shared/bcftools-1.14/bcftools merge \
-l ~/Karina-chinook/output/vcflist.txt -Oz -o ~/Karina-chinook/output/merged.vcf.gz

/home/shared/bcftools-1.14/bcftools index ~/Karina-chinook/output/merged.vcf.gz

##Compare FST -Create text file that list samples

/home/shared/vcftools-0.1.16/bin/vcftools --gzvcf ~/Karina-chinook/output/merged.vcf.gz \
  --weir-fst-pop hatchery.txt \
  --weir-fst-pop wild.txt \
  --out ~/Karina-chinook/output/hatchery_vs_wild

##Compare FST -To measure the degree of genetic differentiation between populations

head -20 ~/Karina-chinook/output/hatchery_vs_wild.weir.fst 

##Table -Create a data frame called hatchery_vs_wild.weir.fst -Make table from fst data

# Install if not already installed
#install.packages("DT")

# Load the package
library(DT)


# Read your CSV file
hatchery_vs_wild.weir.fst <- read.csv("https://gannet.fish.washington.edu/seashell/snaps/Top_1__FST_Loci.csv")


# Display interactive table
datatable(hatchery_vs_wild.weir.fst, options = list(pageLength = 10, scrollX = TRUE), rownames = FALSE)

##Make Bedgraph from top SNPS -Top genetic variations between groups

# Load necessary library
library(readr)

# Read CSV (assuming your file is called "input.csv")
df <- read_csv("https://gannet.fish.washington.edu/seashell/snaps/Top_1__FST_Loci.csv", col_types = cols())

# Drop the first column
df <- df[, -1]

# Rename columns for clarity
colnames(df) <- c("chrom", "pos", "score")

# Convert to bedGraph format: chrom, start (0-based), end (1-based), score
df$start <- df$pos - 1
df$end <- df$start + 1

# Reorder columns
bedgraph <- df[, c("chrom", "start", "end", "score")]

# Write to tab-separated file with no header
write.table(bedgraph, file = "/home/shared/8TB_HDD_02/thielkla/Karina-chinook/Karina-chinook-v2/project/TopSNP.bedgraph", sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE)

##CLOSEST SNPs and genes - Identify genes that are physically closest to a set of SNPs using a gene annotation file and a list of top SNPs

curl -O https://owl.fish.washington.edu/halfshell/genomic-databank/Olurida_v081-20190709.gene.gff

-Sorts the GFF file by chromosome (column 1) and by start position (column 4).

sort -k1,1 -k4,4n Olurida_v081-20190709.gene.gff > Olurida_v081-20190709.gene.sorted.gff
sort -k1,1 -k2,2n /home/shared/8TB_HDD_02/thielkla/Karina-chinook/Karina-chinook-v2/project/TopSNP.bedgraph > /home/shared/8TB_HDD_02/thielkla/Karina-chinook/Karina-chinook-v2/project/TopSNP.sorted.bedgraph

##Find closest gene to each SNP -Each line in the output corresponds to a SNP, followed by the closest gene’s information

/home/shared/bedtools2/bin/bedtools closest \
-a /home/shared/8TB_HDD_02/thielkla/Karina-chinook/Karina-chinook-v2/project/TopSNP.sorted.bedgraph \
-b Olurida_v081-20190709.gene.sorted.gff \
> /home/shared/8TB_HDD_02/thielkla/Karina-chinook/Karina-chinook-v2/project/09-snp-gene-closet.out

head /home/shared/8TB_HDD_02/thielkla/Karina-chinook/Karina-chinook-v2/project/09-snp-gene-closet.out
## bash: /home/shared/8TB_HDD_02/thielkla/Karina-chinook/Karina-chinook-v2/project/09-snp-gene-closet.out: Permission denied
## Contig100908 451 452 1   .   .   .   -1  -1  .   .   .   .
## Contig103716 1665    1666    0.666667    .   .   .   -1  -1  .   .   .   .
## Contig104631 524 525 0.714286    .   .   .   -1  -1  .   .   .   .
## Contig105717 3961    3962    0.666667    .   .   .   -1  -1  .   .   .   .
## Contig105736 902 903 1   Contig105736    maker   gene    704 1576    .   +   .   ID=OLUR_00030760;Name=OLUR_00030760;Alias=maker-Contig105736-snap-gene-0.1;Note=Similar to PI4KA: Phosphatidylinositol 4-kinase alpha (Bos taurus OX%3D9913);
## Contig106498 8274    8275    1   Contig106498    maker   gene    5400    7804    .   +   .   ID=OLUR_00012823;Name=OLUR_00012823;Alias=maker-Contig106498-snap-gene-0.3;Note=Protein of unknown function;Dbxref=MobiDBLite:mobidb-lite;
## Contig1083   37671   37672   0.666667    Contig1083  maker   gene    5035    37419   .   +   .   ID=OLUR_00003165;Name=OLUR_00003165;Alias=snap_masked-Contig1083-processed-gene-0.0;Note=Similar to USP7: Ubiquitin carboxyl-terminal hydrolase 7 (Gallus gallus OX%3D9031);Dbxref=Gene3D:G3DSA:2.60.210.10,Gene3D:G3DSA:3.90.70.10,InterPro:IPR001394,InterPro:IPR008974,InterPro:IPR018200,InterPro:IPR028889,InterPro:IPR038765,Pfam:PF00443,ProSitePatterns:PS00972,ProSiteProfiles:PS50235,SUPERFAMILY:SSF49599,SUPERFAMILY:SSF54001;Ontology_term=GO:0005515,GO:0006511,GO:0016579,GO:0036459;
## Contig1083   37725   37726   0.666667    Contig1083  maker   gene    5035    37419   .   +   .   ID=OLUR_00003165;Name=OLUR_00003165;Alias=snap_masked-Contig1083-processed-gene-0.0;Note=Similar to USP7: Ubiquitin carboxyl-terminal hydrolase 7 (Gallus gallus OX%3D9031);Dbxref=Gene3D:G3DSA:2.60.210.10,Gene3D:G3DSA:3.90.70.10,InterPro:IPR001394,InterPro:IPR008974,InterPro:IPR018200,InterPro:IPR028889,InterPro:IPR038765,Pfam:PF00443,ProSitePatterns:PS00972,ProSiteProfiles:PS50235,SUPERFAMILY:SSF49599,SUPERFAMILY:SSF54001;Ontology_term=GO:0005515,GO:0006511,GO:0016579,GO:0036459;
## Contig113550 4322    4323    1   .   .   .   -1  -1  .   .   .   .
## Contig117009 829 830 1   .   .   .   -1  -1  .   .   .   .

##Table of closet genes and function

grep -oP 'Note=Similar to \K[^(;)]+' /home/shared/8TB_HDD_02/thielkla/Karina-chinook/Karina-chinook-v2/project/09-snp-gene-closet.out | sort | uniq
## AATF: Protein AATF 
## acj6: Inhibitory POU protein 
## ADGRL3: Adhesion G protein-coupled receptor L3 
## bli-3: Dual oxidase 1 
## Cask: Peripheral plasma membrane protein CASK 
## CEP192: Centrosomal protein of 192 kDa 
## CFAP65: Cilia- and flagella-associated protein 65 
## CRIP1: Cysteine-rich protein 1 
## CYP10: Cytochrome P450 10 
## DDX19A: ATP-dependent RNA helicase DDX19A 
## Decr1: 2%2C4-dienoyl-CoA reductase%2C mitochondrial 
## drpr: Protein draper 
## FAM49B: Protein FAM49B 
## Fbn2: Fibrillin-2 
## G-protein coupled receptor GRL101 
## Grcc10: Protein C10 
## Helz2: Helicase with zinc finger domain 2 
## Hexb: Beta-hexosaminidase subunit beta 
## ifi30: Gamma-interferon-inducible lysosomal thiol reductase 
## LIPG: Endothelial lipase 
## Liver carboxylesterase 4 
## mnb: Serine/threonine-protein kinase minibrain 
## MPPED1: Metallophosphoesterase domain-containing protein 1 
## mth2: G-protein coupled receptor Mth2 
## MTHFS: 5-formyltetrahydrofolate cyclo-ligase 
## Nkd1: Protein naked cuticle homolog 1 
## NLGN4Y: Neuroligin-4%2C Y-linked 
## pgap2: Post-GPI attachment to proteins factor 2 
## Pgghg: Protein-glucosylgalactosylhydroxylysine glucosidase 
## PI4KA: Phosphatidylinositol 4-kinase alpha 
## PNKP: Bifunctional polynucleotide phosphatase/kinase 
## R3HDM2: R3H domain-containing protein 2 
## RIC1: RAB6A-GEF complex partner protein 1 
## Rmnd5a: E3 ubiquitin-protein ligase RMND5A 
## rost: Protein rolling stone 
## Slc16a14: Monocarboxylate transporter 14 
## slc39a14: Zinc transporter ZIP14 
## SULT3A1: Amine sulfotransferase 
## T25G3.4: Probable glycerol-3-phosphate dehydrogenase%2C mitochondrial 
## Trpc4: Short transient receptor potential channel 4 
## ttn-1: Titin homolog 
## UBR4: E3 ubiquitin-protein ligase UBR4 
## USP7: Ubiquitin carboxyl-terminal hydrolase 7 
## Zdhhc22: Palmitoyltransferase ZDHHC22 
## ZMIZ1: Zinc finger MIZ domain-containing protein 1 
## ZNF91: Zinc finger protein 91