create a workflow for conducting genotype-environment association (GEA) analyses using RAD-seq data
in the future, I will use this workflow for identifying candidate loci associated with components of thermal variation
## Aligning
# a for loop that aligns each .fq file according to name of sample and outputs a .sam file
for i in ${ind_ID};
do
bwa mem -t 20 -T 30 $REF $INPUT/${i}.fq > $OUTPUT1/${i}.sam 2>> $REPORT
done
conda deactivate #deactivate conda env
## Sorting
conda activate samtools_env
# a for loop that sorts each .sam file (using samtools) and creates a .bam file
for i in ${ind_ID};
do
samtools view -b -q 30 $OUTPUT1/${i}.sam > $OUTPUT1/${i}.bam
samtools sort $OUTPUT1/${i}.bam > $OUTPUT2/${i}.bam
samtools index $OUTPUT2/${i}.bam
sample | collection site | region |
---|---|---|
B001 | Belfair | HoodCanal |
B002 | Belfair | HoodCanal |
… | … | … |
W049 | Willapa Bay | Coast |
W050 | Willapa Bay | Coast |
# function for simulating temperatures
simulate_temps <- function() {
base_temps <- c(
6.5, 7.1, 8.3, 10.2, 12.8, 15.5, # Monthly averages for nearshore areas
17.5, 18.3, 16.1, 12.5, 9.1, 7.2 # Based on Salish Sea observations
)
tibble(date = seq.Date(as.Date("2025-01-01"), by = "day", length.out = 365)) %>%
mutate(
month = month(date),
temp = base_temps[month] + rnorm(365, 0, 1.0) # Simplified noise
)
}
# calculate seasonal temp metrics
seasonal_data <- sites %>%
mutate(
temp_data = map(1:n(), ~simulate_temps()),
metrics = map(temp_data, ~{
.x %>%
mutate(season = case_when(
month(date) %in% 3:5 ~ "Spring",
month(date) %in% 6:8 ~ "Summer",
month(date) %in% 9:11 ~ "Fall",
TRUE ~ "Winter"
)) %>%
group_by(season) %>%
summarise(
avg_temp = mean(temp),
max_temp = max(temp),
min_temp = min(temp),
temp_range = max_temp - min_temp,
warm_anomaly_days = sum(temp > quantile(temp, 0.9)),
cold_anomaly_days = sum(temp < quantile(temp, 0.1)),
cumulative_gdd = sum(pmax(temp - 5, 0))
)
})
) %>%
unnest(metrics) %>%
select(-temp_data)
Filtering
Calculate population allele frequencies
Process and summarize environmental data at the population level (awaiting access, but I have simulated data if need be)
Build GEA analyses! Likely 2 separate analyses for cross-validation
Build workflow for identifing candidate loci