In [40]:
SUFFIX = 'HCSS_Afilt32m70_01'

Correlate genetic and epigenetic distances

For all sites

In [24]:
pwd
/scratch/t.cri.ksilliman/CommonG_cp2/2019_Mapping/ANGSD_run/HCSS
In [3]:
head ../../paper-oly-mbdbs-gen/analyses/methylation-filtered/dist.manhat.csv
"SeqNum.row","SeqNum.col","dist.manh","SampNum.row","SampNum.col"
1,1,0,"hc1_2","hc1_2"
1,15,273985.441785306,"ss3_15","hc1_2"
1,3,290189.490642819,"hc2_15","hc1_2"
1,17,285476.927245925,"ss3_20","hc1_2"
1,4,281520.711361473,"hc2_17","hc1_2"
1,2,298303.029739484,"hc1_4","hc1_2"
1,7,298537.38668563,"hc3_7","hc1_2"
1,16,284208.486947856,"ss3_16","hc1_2"
1,14,281305.331808996,"ss3_14","hc1_2"
In [4]:
#read in epigenetic Manhattan distance, 10 coverage 
library(spaa)
ep10 <-read.csv("../../paper-oly-mbdbs-gen/analyses/methylation-filtered/dist.manhat.csv",header = T)
ep10 <- ep10[,c("SampNum.row","SampNum.col","dist.manh")]
ep10 <- as.matrix(list2dist(ep10))
mbdorder = c("hc1_2","hc1_4","hc2_15","hc2_17","hc3_1","hc3_5","hc3_7","hc3_10","hc3_11",
        "ss2_9","ss2_14","ss2_18","ss3_3","ss3_14","ss3_15","ss3_16","ss3_20","ss5_18")
ep10 <- ep10[mbdorder, mbdorder]
ep10
Dep10 <- as.dist(ep10)
hc1_2hc1_4hc2_15hc2_17hc3_1hc3_5hc3_7hc3_10hc3_11ss2_9ss2_14ss2_18ss3_3ss3_14ss3_15ss3_16ss3_20ss5_18
hc1_2 0.0298303.0290189.5281520.7323694.5288274.1298537.4276634.8311367.0297793.0280529.9334528.1284554.6281305.3273985.4284208.5285476.9297048.1
hc1_4298303.0 0.0288401.4272689.0314054.6308425.8290930.3292157.4313295.9306377.3291403.7334752.1291934.5287946.6285750.2292011.5290794.1299482.5
hc2_15290189.5288401.4 0.0267996.3312934.8306314.1276217.9283512.3309629.3296792.4273417.4339043.0280972.0277302.7270960.4281246.4283754.6297864.8
hc2_17281520.7272689.0267996.3 0.0314787.3299622.6275369.7267637.3306306.9287060.1263776.1339119.1274036.3268751.4256317.1272144.4273361.0290238.7
hc3_1323694.5314054.6312934.8314787.3 0.0334641.8299112.8325937.0324463.8321179.5320997.4335280.4315988.8313767.2321554.2317127.8317475.6315139.7
hc3_5288274.1308425.8306314.1299622.6334641.8 0.0315610.7288781.8325942.9316722.4306762.5346143.9305665.1303050.4296746.5308046.6304834.4313945.2
hc3_7298537.4290930.3276217.9275369.7299112.8315610.7 0.0292111.6315363.0297745.1282259.6341069.9284236.7279628.7277165.4285466.0287941.8299663.9
hc3_10276634.8292157.4283512.3267637.3325937.0288781.8292111.6 0.0317230.3297372.5267563.6345360.9279921.4274370.7257816.0277769.7274299.4294286.7
hc3_11311367.0313295.9309629.3306306.9324463.8325942.9315363.0317230.3 0.0306749.7303226.0330648.3295962.1294657.6302097.9297720.5300676.0304098.3
ss2_9297793.0306377.3296792.4287060.1321179.5316722.4297745.1297372.5306749.7 0.0253556.8313685.4255231.0253726.0268911.5256477.3273957.8278320.1
ss2_14280529.9291403.7273417.4263776.1320997.4306762.5282259.6267563.6303226.0253556.8 0.0326677.5241072.4240035.5237976.3237509.6258380.9271803.4
ss2_18334528.1334752.1339043.0339119.1335280.4346143.9341069.9345360.9330648.3313685.4326677.5 0.0300672.3306406.7328384.0317824.0324077.2300108.0
ss3_3284554.6291934.5280972.0274036.3315988.8305665.1284236.7279921.4295962.1255231.0241072.4300672.3 0.0238988.6251167.6238332.2262145.0253721.5
ss3_14281305.3287946.6277302.7268751.4313767.2303050.4279628.7274370.7294657.6253726.0240035.5306406.7238988.6 0.0244038.7235816.2258085.7257354.3
ss3_15273985.4285750.2270960.4256317.1321554.2296746.5277165.4257816.0302097.9268911.5237976.3328384.0251167.6244038.7 0.0246521.3253941.4276798.6
ss3_16284208.5292011.5281246.4272144.4317127.8308046.6285466.0277769.7297720.5256477.3237509.6317824.0238332.2235816.2246521.3 0.0258804.6267488.5
ss3_20285476.9290794.1283754.6273361.0317475.6304834.4287941.8274299.4300676.0273957.8258380.9324077.2262145.0258085.7253941.4258804.6 0.0280486.1
ss5_18297048.1299482.5297864.8290238.7315139.7313945.2299663.9294286.7304098.3278320.1271803.4300108.0253721.5257354.3276798.6267488.5280486.1 0.0
In [5]:
%expand
# read in genetic distances
gen <- read.table("Results/{SUFFIX}_mbd.dist", row.names = 1, header=T)
gen
Dgen <- as.dist(gen)
hc1_2hc1_4hc2_15hc2_17hc3_1hc3_5hc3_7hc3_10hc3_11ss2_9ss2_14ss2_18ss3_3ss3_14ss3_15ss3_16ss3_20ss5_18
hc1_20.00000000.21554360.20549880.22457610.21404750.12510350.21378330.19417310.21965810.21917480.22013630.22516790.22935400.23030180.22739600.22675360.22417230.2393679
hc1_40.21554360.00000000.18442400.16932710.18113310.20711290.16084090.20900090.21575630.21863050.21883300.21463090.22919050.22474120.22573570.21908710.21572040.2274161
hc2_150.20549880.18442400.00000000.20419500.18546900.20216040.18239130.15926890.21681350.20411320.20854760.19100940.20428630.20451060.20613600.20404590.20133460.2098602
hc2_170.22457610.16932710.20419500.00000000.18840480.21636340.17244020.21792600.22650390.22682100.22682090.22830670.23797350.23220900.23852350.22693640.22943610.2372695
hc3_10.21404750.18113310.18546900.18840480.00000000.20843800.11413500.20708800.20929090.21959930.21608660.21554640.22548280.21967780.22585900.22057490.22009200.2156870
hc3_50.12510350.20711290.20216040.21636340.20843800.00000000.21041850.18699890.21430600.22022220.21482870.22144250.22165370.21945870.22235240.21927650.22212980.2263948
hc3_70.21378330.16084090.18239130.17244020.11413500.21041850.00000000.20987600.21541450.22119450.22335520.22366310.22654070.22483710.22692510.22243530.22605810.2286600
hc3_100.19417310.20900090.15926890.21792600.20708800.18699890.20987600.00000000.21581900.20725410.21238150.20480360.20573820.20787990.19167200.21394690.20810970.2118349
hc3_110.21965810.21575630.21681350.22650390.20929090.21430600.21541450.21581900.00000000.22047290.22067850.22013600.23620940.22525140.23072650.22826040.22009070.2270442
ss2_90.21917480.21863050.20411320.22682100.21959930.22022220.22119450.20725410.22047290.00000000.14820220.16431400.15656100.16426930.20910800.16640550.20667930.1738464
ss2_140.22013630.21883300.20854760.22682090.21608660.21482870.22335520.21238150.22067850.14820220.00000000.17410980.14840940.17312420.20531170.15552100.20286540.1736163
ss2_180.22516790.21463090.19100940.22830670.21554640.22144250.22366310.20480360.22013600.16431400.17410980.00000000.13114760.16138940.21216930.18563830.19882390.1271048
ss3_30.22935400.22919050.20428630.23797350.22548280.22165370.22654070.20573820.23620940.15656100.14840940.13114760.00000000.16921170.20938900.14625540.20560960.1244475
ss3_140.23030180.22474120.20451060.23220900.21967780.21945870.22483710.20787990.22525140.16426930.17312420.16138940.16921170.00000000.21015180.16451750.20695740.1486115
ss3_150.22739600.22573570.20613600.23852350.22585900.22235240.22692510.19167200.23072650.20910800.20531170.21216930.20938900.21015180.00000000.21276400.20918650.2141706
ss3_160.22675360.21908710.20404590.22693640.22057490.21927650.22243530.21394690.22826040.16640550.15552100.18563830.14625540.16451750.21276400.00000000.20471890.1671745
ss3_200.22417230.21572040.20133460.22943610.22009200.22212980.22605810.20810970.22009070.20667930.20286540.19882390.20560960.20695740.20918650.20471890.00000000.2039022
ss5_180.23936790.22741610.20986020.23726950.21568700.22639480.22866000.21183490.22704420.17384640.17361630.12710480.12444750.14861150.21417060.16717450.20390220.0000000
In [6]:
#library(MASS)
#dens <- kde2d(Dgeo,DgenN, n=300)
#myPal <- colorRampPalette(c("white","blue","gold", "orange", "red"))
plot(Dgen, Dep10, pch=20,cex=1,xlab = "Genetic distance", ylab = "MBD Manhattan distance")
summary(lm(Dep10~Dgen),)
R2 = round(summary(lm(Dep10~Dgen))$r.squared, 4)
#image(dens, col=transp(myPal(300),.7), add=TRUE)
abline(lm(Dep10~Dgen))
text(0.13,320000,label=paste("R^2=",R2))
text(0.13,310000,label="r=0.31")
Call:
lm(formula = Dep10 ~ Dgen)

Residuals:
   Min     1Q Median     3Q    Max 
-53094 -17259   -937  18870  54443 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)   229505      15591  14.720  < 2e-16 ***
Dgen          299862      75420   3.976 0.000108 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 24320 on 151 degrees of freedom
Multiple R-squared:  0.09477,	Adjusted R-squared:  0.08877 
F-statistic: 15.81 on 1 and 151 DF,  p-value: 0.0001084
In [7]:
cor.test( ~ Dep10 + Dgen,
         method = "pearson",
         conf.level = 0.95)
cor.test( ~ Dep10 + Dgen, 
         method = "spearman",
         continuity = FALSE,
         conf.level = 0.95)
	Pearson's product-moment correlation

data:  Dep10 and Dgen
t = 3.9759, df = 151, p-value = 0.0001084
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 0.1568240 0.4447926
sample estimates:
      cor 
0.3078416 
	Spearman's rank correlation rho

data:  Dep10 and Dgen
S = 444614, p-value = 0.001505
alternative hypothesis: true rho is not equal to 0
sample estimates:
      rho 
0.2551332 

For DMLs

In [8]:
#read in epigenetic Manhattan distance, 10 coverage 
library(spaa)
ep10 <-read.csv("../../paper-oly-mbdbs-gen/analyses/methylation-filtered/dist.manhat.DMLs.csv",header = T)
ep10 <- ep10[,c("SampNum.row","SampNum.col","dist.manh")]
ep10 <- as.matrix(list2dist(ep10))
mbdorder = c("hc1_2","hc1_4","hc2_15","hc2_17","hc3_1","hc3_5","hc3_7","hc3_10","hc3_11",
        "ss2_9","ss2_14","ss2_18","ss3_3","ss3_14","ss3_15","ss3_16","ss3_20","ss5_18")
ep10 <- ep10[mbdorder, mbdorder]
ep10
Dep10 <- as.dist(ep10)
hc1_2hc1_4hc2_15hc2_17hc3_1hc3_5hc3_7hc3_10hc3_11ss2_9ss2_14ss2_18ss3_3ss3_14ss3_15ss3_16ss3_20ss5_18
hc1_2 0.00010545.53210351.204 9385.57310610.214 6363.30010611.443 8551.481 9068.33714788.30014887.00213635.25614606.95214484.49411693.99714072.04411702.18 13190.286
hc1_410545.532 0.000 9064.440 7984.184 8727.395 8905.577 8699.20710736.49910026.70117346.81016626.60915169.86716480.53516347.99014253.68015345.56813291.64 15195.551
hc2_1510351.204 9064.440 0.000 8032.958 9008.169 9760.334 7750.86710319.29211194.32016995.51916120.01015450.88216418.20616417.51213740.32715822.14313357.00 15649.929
hc2_17 9385.573 7984.184 8032.958 0.000 9827.633 8017.600 8785.571 9408.60310454.90216400.96216108.63615381.09115743.18015586.17313138.39714668.39012158.79 14422.239
hc3_110610.214 8727.395 9008.169 9827.633 0.000 8808.350 6567.10810436.64210554.47516222.80916014.16413934.95115248.14714915.30612793.94714521.67412503.21 13721.460
hc3_5 6363.300 8905.577 9760.334 8017.600 8808.350 0.000 9189.478 7834.229 9119.64415202.56415975.18313292.57215959.68315121.11911756.75615334.17412440.56 14534.806
hc3_710611.443 8699.207 7750.867 8785.571 6567.108 9189.478 0.00010443.44310804.16017348.60416705.44214868.12316044.56216375.90313350.96516117.65413837.97 15293.315
hc3_10 8551.48110736.49910319.292 9408.60310436.642 7834.22910443.443 0.00010148.67915486.50114988.73013497.21015209.25714489.56512354.98514441.49811705.99 14281.504
hc3_11 9068.33710026.70111194.32010454.90210554.475 9119.64410804.16010148.679 0.00015102.61414709.27913012.51714188.19514113.22611779.37313574.42312187.15 13623.710
ss2_914788.30017346.81016995.51916400.96216222.80915202.56417348.60415486.50115102.614 0.000 9317.558 8747.628 8413.878 9119.60010149.293 8682.16011387.52 8899.327
ss2_1414887.00216626.60916120.01016108.63616014.16415975.18316705.44214988.73014709.279 9317.558 0.00010653.366 9278.006 9505.80210936.765 8530.52411322.26 10281.724
ss2_1813635.25615169.86715450.88215381.09113934.95113292.57214868.12313497.21013012.517 8747.62810653.366 0.000 7351.714 8692.70610026.94410471.81411078.75 7483.073
ss3_314606.95216480.53516418.20615743.18015248.14715959.68316044.56215209.25714188.195 8413.878 9278.006 7351.714 0.000 8503.102 9899.829 8283.19111215.38 6997.712
ss3_1414484.49416347.99016417.51215586.17314915.30615121.11916375.90314489.56514113.226 9119.600 9505.802 8692.706 8503.102 0.00010568.719 9230.87611030.21 8055.097
ss3_1511693.99714253.68013740.32713138.39712793.94711756.75613350.96512354.98511779.37310149.29310936.76510026.944 9899.82910568.719 0.00010193.02410433.14 9796.022
ss3_1614072.04415345.56815822.14314668.39014521.67415334.17416117.65414441.49813574.423 8682.160 8530.52410471.814 8283.191 9230.87610193.024 0.00010436.72 8725.894
ss3_2011702.18313291.64513357.00212158.79012503.21112440.56213837.96911705.98712187.14811387.52011322.26211078.74911215.38511030.20810433.13910436.720 0.00 10959.480
ss5_1813190.28615195.55115649.92914422.23913721.46014534.80615293.31514281.50413623.710 8899.32710281.724 7483.073 6997.712 8055.097 9796.022 8725.89410959.48 0.000
In [9]:
#library(MASS)
#dens <- kde2d(Dgeo,DgenN, n=300)
#myPal <- colorRampPalette(c("white","blue","gold", "orange", "red"))
plot(Dgen, Dep10, pch=20,cex=1,xlab = "Genetic distance", ylab = "MBD Manhattan distance, DMLs")
summary(lm(Dep10~Dgen),)
R2 = round(summary(lm(Dep10~Dgen))$r.squared, 4)
#image(dens, col=transp(myPal(300),.7), add=TRUE)
abline(lm(Dep10~Dgen))
text(0.13,16000,label=paste("R^2=",R2))
text(0.13,15000,label="r=0.68")
Call:
lm(formula = Dep10 ~ Dgen)

Residuals:
   Min     1Q Median     3Q    Max 
 -5032  -1730    -81   1508   4874 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)    -3343       1374  -2.433   0.0161 *  
Dgen           75762       6646  11.400   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 2143 on 151 degrees of freedom
Multiple R-squared:  0.4625,	Adjusted R-squared:  0.459 
F-statistic:   130 on 1 and 151 DF,  p-value: < 2.2e-16
In [11]:
cor.test( ~ Dep10 + Dgen,
         method = "pearson",
         conf.level = 0.95)
cor.test( ~ Dep10 + Dgen, 
         method = "spearman",
         continuity = FALSE,
         conf.level = 0.95)
	Pearson's product-moment correlation

data:  Dep10 and Dgen
t = 11.529, df = 151, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 0.5895371 0.7603386
sample estimates:
      cor 
0.6842097 
	Spearman's rank correlation rho

data:  Dep10 and Dgen
S = 209532, p-value < 2.2e-16
alternative hypothesis: true rho is not equal to 0
sample estimates:
      rho 
0.6489687 

Correlate Pst and Fst

For overlapping genes

All genes with methylation data

In [1]:
head ../../paper-oly-mbdbs-gen/analyses/DMGs/Pst_gene_2kbslop.tab
wc -l ../../paper-oly-mbdbs-gen/analyses/DMGs/Pst_gene_2kbslop.tab
"Quant_Varia"	"contig_gene"	"start_gene_2kb"	"end_gene_2kb"	"Pst_Values"
"Contig0_10497_95068"	"Contig0"	"10497"	"95068"	0.019332886629221
"Contig100_48322_58076"	"Contig100"	"48322"	"58076"	0.067008101478262
"Contig100188_1_4123"	"Contig100188"	"1"	"4123"	0.734962005060689
"Contig100199_5411_27437"	"Contig100199"	"5411"	"27437"	0.0682501114385254
"Contig100396_1_4908"	"Contig100396"	"1"	"4908"	0.185205868587485
"Contig100499_508_4044"	"Contig100499"	"508"	"4044"	0.150423262664518
"Contig1006_1_13779"	"Contig1006"	"1"	"13779"	0.0880097439836617
"Contig100691_1_8538"	"Contig100691"	"1"	"8538"	0.0188795032121047
"Contig10074_2260_6621"	"Contig10074"	"2260"	"6621"	0.251312849756899
3755 ../../paper-oly-mbdbs-gen/analyses/DMGs/Pst_gene_2kbslop.tab
In [2]:
# read in Pst data
pst <- read.csv("../../paper-oly-mbdbs-gen/analyses/DMGs/Pst_gene_2kbslop.tab", sep="\t",header=T,stringsAsFactors = F)
colnames(pst) <- c("id","contig","start","end","pst")
head(pst)
idcontigstartendpst
Contig0_10497_95068 Contig0 10497 95068 0.01933289
Contig100_48322_58076 Contig100 48322 58076 0.06700810
Contig100188_1_4123 Contig100188 1 4123 0.73496201
Contig100199_5411_27437Contig100199 5411 27437 0.06825011
Contig100396_1_4908 Contig100396 1 4908 0.18520587
Contig100499_508_4044 Contig100499 508 4044 0.15042326
In [3]:
wc -l SFS/HCSS_sfsm70_PerGeneFst.csv
1387 SFS/HCSS_sfsm70_PerGeneFst.csv
In [7]:
list.of.packages <- c("reshape2","dplyr", "tidyr", "readr", "stringr", "plotly","tidyverse","lfmm") #add new libraries here 
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
if(length(new.packages)) install.packages(new.packages)

# Load all libraries 
lapply(list.of.packages, FUN = function(X) {
  do.call("require", list(X)) 
})
Loading required package: reshape2
Error: package or namespace load failed for 'reshape2' in dyn.load(file, DLLpath = DLLpath, ...):
 unable to load shared object '/home/t.cri.ksilliman/miniconda3/lib/R/library/stringi/libs/stringi.so':
  libicui18n.so.58: cannot open shared object file: No such file or directory
Loading required package: dplyr

Attaching package: 'dplyr'

The following objects are masked from 'package:stats':

    filter, lag

The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union

Loading required package: tidyr
Loading required package: readr
Loading required package: stringr
Error: package or namespace load failed for 'stringr' in dyn.load(file, DLLpath = DLLpath, ...):
 unable to load shared object '/home/t.cri.ksilliman/miniconda3/lib/R/library/stringi/libs/stringi.so':
  libicui18n.so.58: cannot open shared object file: No such file or directory
Loading required package: plotly
Loading required package: ggplot2

Attaching package: 'plotly'

The following object is masked from 'package:ggplot2':

    last_plot

The following object is masked from 'package:stats':

    filter

The following object is masked from 'package:graphics':

    layout

Loading required package: tidyverse
Error: package or namespace load failed for 'tidyverse' in dyn.load(file, DLLpath = DLLpath, ...):
 unable to load shared object '/home/t.cri.ksilliman/miniconda3/lib/R/library/stringi/libs/stringi.so':
  libicui18n.so.58: cannot open shared object file: No such file or directory
Loading required package: lfmm
Error: package or namespace load failed for 'lfmm' in dyn.load(file, DLLpath = DLLpath, ...):
 unable to load shared object '/home/t.cri.ksilliman/miniconda3/lib/R/library/Matrix/libs/Matrix.so':
  libRlapack.so: cannot open shared object file: No such file or directory
  1. FALSE
  2. TRUE
  3. TRUE
  4. TRUE
  5. FALSE
  6. TRUE
  7. FALSE
  8. FALSE
In [15]:
genes <- 
  read_delim(file = "SFS/HCSS_sfsm70_PerGeneFst.csv", delim = "\t", col_names = TRUE)
head(genes)
Parsed with column specification:
cols(
  id = col_character(),
  contig = col_character(),
  start = col_double(),
  end = col_double(),
  notes_gene = col_character(),
  fst01 = col_double()
)
idcontigstartendnotes_genefst01
Contig103346_4131_18776 Contig103346 4131 18776 ID=OLUR_00011851;Name=OLUR_00011851;Alias=maker-Contig103346-snap-gene-0.4;Note=Similar to HSPA12B: Heat shock 70 kDa protein 12B (Homo sapiens OX%3D9606);Dbxref=CDD:cd10229,Gene3D:G3DSA:3.30.420.40,Gene3D:G3DSA:3.90.640.10,InterPro:IPR013126,PRINTS:PR00301,Pfam:PF00012,SUPERFAMILY:SSF53067; 0.069637883
Contig60108_1_13731 Contig60108 1 13731 ID=OLUR_00014779;Name=OLUR_00014779;Alias=maker-Contig60108-snap-gene-0.2;Note=Similar to MLH3: DNA mismatch repair protein Mlh3 (Homo sapiens OX%3D9606);Dbxref=Gene3D:G3DSA:3.30.230.10,Gene3D:G3DSA:3.30.565.10,InterPro:IPR013507,InterPro:IPR014721,InterPro:IPR020568,InterPro:IPR036890,MobiDBLite:mobidb-lite,Pfam:PF01119,SMART:SM01340,SUPERFAMILY:SSF54211,SUPERFAMILY:SSF55874;Ontology_term=GO:0005524,GO:0006298,GO:0030983;0.121951220
Contig27784_1_12761 Contig27784 1 12761 ID=OLUR_00013246;Name=OLUR_00013246;Alias=maker-Contig27784-snap-gene-0.2;Note=Similar to Fbxl6: F-box/LRR-repeat protein 6 (Mus musculus OX%3D10090);Dbxref=Gene3D:G3DSA:3.80.10.10,InterPro:IPR032675,SUPERFAMILY:SSF52047; 0.041095890
Contig38648_3515_20578 Contig38648 3515 20578 ID=OLUR_00010999;Name=OLUR_00010999;Alias=maker-Contig38648-snap-gene-0.2;Note=Similar to Rnf8: E3 ubiquitin-protein ligase RNF8 (Mus musculus OX%3D10090);Dbxref=CDD:cd16535,Coils:Coil,Gene3D:G3DSA:3.30.40.10,InterPro:IPR001841,InterPro:IPR013083,InterPro:IPR017907,MobiDBLite:mobidb-lite,Pfam:PF13923,ProSitePatterns:PS00518,ProSiteProfiles:PS50089,SMART:SM00184,SUPERFAMILY:SSF57850; 0.047858942
Contig282_38634_44741 Contig282 38634 44741 ID=OLUR_00002451;Name=OLUR_00002451;Alias=snap_masked-Contig282-processed-gene-0.4;Note=Similar to Cry2: Cryptochrome-2 (Rattus norvegicus OX%3D10116);Dbxref=Gene3D:G3DSA:3.40.50.620,InterPro:IPR006050,InterPro:IPR014729,InterPro:IPR036155,Pfam:PF00875,ProSiteProfiles:PS51645,SUPERFAMILY:SSF52425; 0.001798561
Contig282_26273_42182 Contig282 26273 42182 ID=OLUR_00002450;Name=OLUR_00002450;Alias=maker-Contig282-snap-gene-0.10;Note=Similar to API5: Apoptosis inhibitor 5 (Pongo abelii OX%3D9601);Dbxref=Coils:Coil,InterPro:IPR008383,InterPro:IPR016024,MobiDBLite:mobidb-lite,Pfam:PF05918,SUPERFAMILY:SSF48371; 0.001644737
In [16]:
length(intersect(pst$id,genes$id))
237
In [18]:
both = merge(pst,genes, by ="id")
head(both)
idcontig.xstart.xend.xpstcontig.ystart.yend.ynotes_genefst01
Contig0_10497_95068 Contig0 10497 95068 0.01933289 Contig0 10497 95068 ID=OLUR_00000039;Name=OLUR_00000039;Alias=maker-Contig0-snap-gene-0.8;Note=Similar to WDR87: WD repeat-containing protein 87 (Homo sapiens OX%3D9606);Dbxref=Coils:Coil,Gene3D:G3DSA:1.25.10.10,Gene3D:G3DSA:2.130.10.10,InterPro:IPR001680,InterPro:IPR011989,InterPro:IPR015943,InterPro:IPR016024,InterPro:IPR017986,InterPro:IPR036322,MobiDBLite:mobidb-lite,Pfam:PF00400,ProSiteProfiles:PS50082,ProSiteProfiles:PS50294,SMART:SM00320,SUPERFAMILY:SSF48371,SUPERFAMILY:SSF50978;Ontology_term=GO:0005515; 0.000000000
Contig10074_2260_6621 Contig10074 2260 6621 0.25131285 Contig10074 2260 6621 ID=OLUR_00025622;Name=OLUR_00025622;Alias=maker-Contig10074-snap-gene-0.2;Note=Similar to V-RMIL: Serine/threonine-protein kinase-transforming protein Rmil (Avian rous-associated virus type 1 OX%3D11950);Dbxref=Gene3D:G3DSA:1.10.510.10; 0.192019950
Contig107541_1926_7324 Contig107541 1926 7324 0.66627497 Contig107541 1926 7324 ID=OLUR_00024691;Name=OLUR_00024691;Alias=maker-Contig107541-snap-gene-0.2;Note=Protein of unknown function;Dbxref=Gene3D:G3DSA:3.30.710.10,InterPro:IPR000210,InterPro:IPR011333,MobiDBLite:mobidb-lite,Pfam:PF00651,ProSiteProfiles:PS50097,SUPERFAMILY:SSF54695;Ontology_term=GO:0005515; 0.045112782
Contig109827_1765_6591 Contig109827 1765 6591 0.66755413 Contig109827 1765 6591 ID=OLUR_00025663;Name=OLUR_00025663;Alias=maker-Contig109827-snap-gene-0.2;Note=Similar to Unc80: Protein unc-80 homolog (Mus musculus OX%3D10090); 0.150627615
Contig112833_1_6669 Contig112833 1 6669 0.68067953 Contig112833 1 6669 ID=OLUR_00025551;Name=OLUR_00025551;Alias=maker-Contig112833-snap-gene-0.2;Note=Similar to Gls2: Glutaminase liver isoform%2C mitochondrial (Mus musculus OX%3D10090);Dbxref=CDD:cd00204,Gene3D:G3DSA:1.25.40.20,Gene3D:G3DSA:3.40.710.10,InterPro:IPR002110,InterPro:IPR012338,InterPro:IPR015868,InterPro:IPR020683,InterPro:IPR036770,Pfam:PF04960,Pfam:PF12796,ProSiteProfiles:PS50088,ProSiteProfiles:PS50297,SMART:SM00248,SUPERFAMILY:SSF48403,SUPERFAMILY:SSF56601;Ontology_term=GO:0004359,GO:0005515,GO:0006541;0.005154639
Contig129_9110_46084 Contig129 9110 46084 0.06717008 Contig129 9110 46084 ID=OLUR_00001506;Name=OLUR_00001506;Alias=maker-Contig129-snap-gene-0.10;Note=Similar to Ints3: Integrator complex subunit 3 (Mus musculus OX%3D10090);Dbxref=InterPro:IPR019333,MobiDBLite:mobidb-lite,Pfam:PF10189; 0.100917431

3755 genes with Pst value, 1387 genes with Fst value, 237 overlapping genes

In [23]:
#library(MASS)
#dens <- kde2d(Dgeo,DgenN, n=300)
#myPal <- colorRampPalette(c("white","blue","gold", "orange", "red"))
plot(both$fst01, both$pst, pch=20,cex=1,xlab = "Fst", ylab = "Pst")
title("Pst vs Fst, 2kbslop all genes w/meth data, 237 genes")
summary(lm(both$pst~both$fst01),)
R2 = round(summary(lm(both$pst~both$fst01))$r.squared, 4)
#image(dens, col=transp(myPal(300),.7), add=TRUE)
abline(lm(both$pst~both$fst01))
text(0.4,0.8,label=paste("R^2=",R2))
text(0.4,0.75,label="r= 0.018")
Call:
lm(formula = both$pst ~ both$fst01)

Residuals:
     Min       1Q   Median       3Q      Max 
-0.27839 -0.19593 -0.06751  0.14782  0.66175 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  0.26639    0.01780  14.965   <2e-16 ***
both$fst01   0.04315    0.14978   0.288    0.774    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.2282 on 235 degrees of freedom
Multiple R-squared:  0.0003531,	Adjusted R-squared:  -0.003901 
F-statistic: 0.08301 on 1 and 235 DF,  p-value: 0.7735
In [21]:
cor.test( ~ both$pst + both$fst01,
         method = "pearson",
         conf.level = 0.95)
cor.test( ~ both$pst + both$fst01, 
         method = "spearman",
         continuity = FALSE,
         conf.level = 0.95)
	Pearson's product-moment correlation

data:  both$pst and both$fst01
t = 0.28812, df = 235, p-value = 0.7735
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 -0.1088997  0.1458724
sample estimates:
       cor 
0.01879138 
Warning message in cor.test.default(x = c(0.019332886629221, 0.251312849756899, :
"Cannot compute exact p-value with ties"
	Spearman's rank correlation rho

data:  both$pst and both$fst01
S = 2264910, p-value = 0.7494
alternative hypothesis: true rho is not equal to 0
sample estimates:
        rho 
-0.02085675 

Get gene annotations for these overlaps

Hih Pst, low Fst

In [33]:
both[which(both$fst01 <0.01 & both$pst >0.5),c(1,5,10,9)]
idpstfst01notes_gene
5Contig112833_1_6669 0.6806795 0.005154639 ID=OLUR_00025551;Name=OLUR_00025551;Alias=maker-Contig112833-snap-gene-0.2;Note=Similar to Gls2: Glutaminase liver isoform%2C mitochondrial (Mus musculus OX%3D10090);Dbxref=CDD:cd00204,Gene3D:G3DSA:1.25.40.20,Gene3D:G3DSA:3.40.710.10,InterPro:IPR002110,InterPro:IPR012338,InterPro:IPR015868,InterPro:IPR020683,InterPro:IPR036770,Pfam:PF04960,Pfam:PF12796,ProSiteProfiles:PS50088,ProSiteProfiles:PS50297,SMART:SM00248,SUPERFAMILY:SSF48403,SUPERFAMILY:SSF56601;Ontology_term=GO:0004359,GO:0005515,GO:0006541;
35Contig18485_27692_52779 0.7325620 0.000000000 ID=OLUR_00001082;Name=OLUR_00001082;Alias=maker-Contig18485-snap-gene-0.8;Note=Similar to RREB1: Ras-responsive element-binding protein 1 (Gallus gallus OX%3D9031);Dbxref=Gene3D:G3DSA:3.30.160.60,InterPro:IPR013087,InterPro:IPR036236,MobiDBLite:mobidb-lite,Pfam:PF00096,Pfam:PF13894,Pfam:PF13909,Pfam:PF13912,ProSitePatterns:PS00028,ProSiteProfiles:PS50157,SMART:SM00355,SUPERFAMILY:SSF57667;Ontology_term=GO:0003676;
52Contig206988_1_7220 0.5694211 0.000000000 ID=OLUR_00024466;Name=OLUR_00024466;Alias=snap_masked-Contig206988-processed-gene-0.0;Note=Similar to Wdr54: WD repeat-containing protein 54 (Mus musculus OX%3D10090);Dbxref=InterPro:IPR036322,SUPERFAMILY:SSF50978;Ontology_term=GO:0005515;
66Contig2254_1_11569 0.5354929 -0.004008016 ID=OLUR_00019191;Name=OLUR_00019191;Alias=maker-Contig2254-snap-gene-0.1;Note=Similar to VVA0006: Riboflavin biosynthesis protein VVA0006 (Vibrio vulnificus (strain YJ016) OX%3D196600);Dbxref=CDD:cd15457,Coils:Coil,Gene3D:G3DSA:1.10.357.40,InterPro:IPR012816,InterPro:IPR037238,MobiDBLite:mobidb-lite,Pfam:PF08719,SUPERFAMILY:SSF143990,TIGRFAM:TIGR02464;
86Contig24745_17400_35588 0.6093125 0.000000000 ID=OLUR_00002446;Name=OLUR_00002446;Alias=maker-Contig24745-snap-gene-0.7;Note=Protein of unknown function;Dbxref=Coils:Coil;
95Contig26170_5945_21684 0.6814634 -0.004629630 ID=OLUR_00010285;Name=OLUR_00010285;Alias=maker-Contig26170-snap-gene-0.3;Note=Similar to Stk31: Serine/threonine-protein kinase 31 (Mus musculus OX%3D10090);Dbxref=CDD:cd00180,Coils:Coil,Gene3D:G3DSA:1.10.510.10,InterPro:IPR011009,MobiDBLite:mobidb-lite,SUPERFAMILY:SSF56112;
130Contig32566_4290_23883 0.8429597 0.000000000 ID=OLUR_00002794;Name=OLUR_00002794;Alias=snap_masked-Contig32566-processed-gene-0.0;Note=Protein of unknown function;Dbxref=MobiDBLite:mobidb-lite;
131Contig32566_9055_42575 0.7889923 -0.004415011 ID=OLUR_00002795;Name=OLUR_00002795;Alias=maker-Contig32566-snap-gene-0.7;Note=Similar to HECTD1: E3 ubiquitin-protein ligase HECTD1 (Homo sapiens OX%3D9606);Dbxref=CDD:cd00204,Coils:Coil,Gene3D:G3DSA:1.25.40.20,Gene3D:G3DSA:2.30.30.920,Gene3D:G3DSA:2.60.120.260,InterPro:IPR002110,InterPro:IPR008979,InterPro:IPR010606,InterPro:IPR012919,InterPro:IPR020683,InterPro:IPR036770,InterPro:IPR037252,MobiDBLite:mobidb-lite,Pfam:PF06701,Pfam:PF07738,Pfam:PF13857,ProSiteProfiles:PS50088,ProSiteProfiles:PS50297,ProSiteProfiles:PS51416,SMART:SM00248,SUPERFAMILY:SSF159034,SUPERFAMILY:SSF48403,SUPERFAMILY:SSF49785;Ontology_term=GO:0004842,GO:0005515,GO:0016567,GO:0046872;
134Contig32910_2664_19377 0.7010187 0.003144654 ID=OLUR_00011811;Name=OLUR_00011811;Alias=maker-Contig32910-snap-gene-0.4;Note=Similar to Spata6: Spermatogenesis-associated protein 6 (Mus musculus OX%3D10090);Dbxref=Coils:Coil,InterPro:IPR032732,Pfam:PF14909;
155Contig37588_11392_33472 0.7551651 -0.003696858 ID=OLUR_00003309;Name=OLUR_00003309;Alias=maker-Contig37588-snap-gene-0.5;Note=Protein of unknown function;Dbxref=Coils:Coil,MobiDBLite:mobidb-lite;
160Contig38_57109_82031 0.7901919 -0.006535948 ID=OLUR_00000267;Name=OLUR_00000267;Alias=maker-Contig38-snap-gene-0.14;Note=Similar to KIF16B: Kinesin-like protein KIF16B (Homo sapiens OX%3D9606);Dbxref=CDD:cd00060,Coils:Coil,Gene3D:G3DSA:2.60.200.20,Gene3D:G3DSA:3.30.1520.10,Gene3D:G3DSA:3.40.850.10,InterPro:IPR000253,InterPro:IPR001683,InterPro:IPR001752,InterPro:IPR008984,InterPro:IPR019821,InterPro:IPR027417,InterPro:IPR036871,InterPro:IPR036961,MobiDBLite:mobidb-lite,PRINTS:PR00380,Pfam:PF00225,Pfam:PF00498,Pfam:PF00787,ProSitePatterns:PS00411,ProSiteProfiles:PS50067,ProSiteProfiles:PS50195,SMART:SM00129,SMART:SM00312,SUPERFAMILY:SSF49879,SUPERFAMILY:SSF52540,SUPERFAMILY:SSF64268;Ontology_term=GO:0003777,GO:0005515,GO:0005524,GO:0007018,GO:0008017,GO:0035091;
169Contig408_19615_27287 0.5061239 0.000000000 ID=OLUR_00006735;Name=OLUR_00006735;Alias=maker-Contig408-snap-gene-0.4;Note=Protein of unknown function;
175Contig42610_7960_46080 0.5826824 0.000000000 ID=OLUR_00000735;Name=OLUR_00000735;Alias=maker-Contig42610-snap-gene-0.9;Note=Similar to Togaram1: TOG array regulator of axonemal microtubules protein 1 (Mus musculus OX%3D10090);Dbxref=Coils:Coil,Gene3D:G3DSA:1.25.10.10,InterPro:IPR011989,InterPro:IPR016024,InterPro:IPR034085,MobiDBLite:mobidb-lite,SMART:SM01349,SUPERFAMILY:SSF48371;
210Contig58479_956_12381 0.5113152 0.000000000 ID=OLUR_00016278;Name=OLUR_00016278;Alias=maker-Contig58479-snap-gene-0.3;Note=Protein of unknown function;Dbxref=MobiDBLite:mobidb-lite;

Low Pst, high Fst

In [28]:
both[which(both$fst01 >0.3 & both$pst <0.1),c(1,5,10,9)]
idpstfst01notes_gene
49Contig20413_1_17467 0.020724598 0.4720812 ID=OLUR_00010706;Name=OLUR_00010706;Alias=maker-Contig20413-snap-gene-0.3;Note=Similar to sll1388: Universal stress protein Sll1388 (Synechocystis sp. (strain PCC 6803 / Kazusa) OX%3D1111708);Dbxref=CDD:cd00293,Coils:Coil,Gene3D:G3DSA:3.40.50.620,InterPro:IPR006015,InterPro:IPR006016,InterPro:IPR014729,PRINTS:PR01438,Pfam:PF00582,SUPERFAMILY:SSF52402;
143Contig34298_13092_51114 0.004027313 0.3714286 ID=OLUR_00001738;Name=OLUR_00001738;Alias=maker-Contig34298-snap-gene-0.4;Note=Similar to BIRC6: Baculoviral IAP repeat-containing protein 6 (Homo sapiens OX%3D9606);Dbxref=CDD:cd00022,Gene3D:G3DSA:1.10.1170.10,InterPro:IPR001370,InterPro:IPR019775,InterPro:IPR036322,MobiDBLite:mobidb-lite,Pfam:PF00653,ProSitePatterns:PS00678,ProSiteProfiles:PS50143,SMART:SM00238,SUPERFAMILY:SSF50978,SUPERFAMILY:SSF57924;Ontology_term=GO:0005515;
197Contig50657_1_11990 0.033930990 0.4868217 ID=OLUR_00017738;Name=OLUR_00017738;Alias=maker-Contig50657-snap-gene-0.3;Note=Similar to KDM3B: Lysine-specific demethylase 3B (Homo sapiens OX%3D9606);Dbxref=Gene3D:G3DSA:2.60.120.650,InterPro:IPR003347,MobiDBLite:mobidb-lite,Pfam:PF02373,ProSiteProfiles:PS51184,SMART:SM00558,SUPERFAMILY:SSF51197;

High Fst, high Pst

In [29]:
both[which(both$fst01 >0.3 & both$pst >0.3),c(1,5,10,9)]
idpstfst01notes_gene
19Contig16916_3259_40767 0.5218304 0.3822314 ID=OLUR_00001636;Name=OLUR_00001636;Alias=maker-Contig16916-snap-gene-0.8;Note=Similar to Med13: Mediator of RNA polymerase II transcription subunit 13 (Mus musculus OX%3D10090);Dbxref=InterPro:IPR009401,InterPro:IPR021643,MobiDBLite:mobidb-lite,Pfam:PF06333,Pfam:PF11597;Ontology_term=GO:0003712,GO:0006357,GO:0016592;
85Contig24693_2452_15381 0.3410517 0.5007496 ID=OLUR_00007682;Name=OLUR_00007682;Alias=maker-Contig24693-snap-gene-0.4;Note=Protein of unknown function;Dbxref=Gene3D:G3DSA:1.10.533.10,InterPro:IPR000488,InterPro:IPR011029,MobiDBLite:mobidb-lite,ProSiteProfiles:PS50017,SUPERFAMILY:SSF47986;Ontology_term=GO:0005515,GO:0007165;
108Contig28902_528_13774 0.3112797 0.4283427 ID=OLUR_00000830;Name=OLUR_00000830;Alias=maker-Contig28902-snap-gene-0.12;Note=Similar to SPON1: Spondin-1 (Homo sapiens OX%3D9606);Dbxref=Gene3D:G3DSA:2.20.100.10,Gene3D:G3DSA:2.60.40.2130,InterPro:IPR000884,InterPro:IPR009465,InterPro:IPR036383,InterPro:IPR038678,MobiDBLite:mobidb-lite,Pfam:PF00090,Pfam:PF06468,ProSiteProfiles:PS50092,ProSiteProfiles:PS51020,SMART:SM00209,SUPERFAMILY:SSF82895;
227Contig77382_1559_12918 0.4155037 0.6633523 ID=OLUR_00016056;Name=OLUR_00016056;Alias=snap_masked-Contig77382-processed-gene-0.0;Note=Similar to Socs5: Suppressor of cytokine signaling 5 (Mus musculus OX%3D10090);Dbxref=Gene3D:G3DSA:3.30.505.10,InterPro:IPR000980,InterPro:IPR036860,MobiDBLite:mobidb-lite,Pfam:PF00017,SUPERFAMILY:SSF55550;

Get 1st and last 10% of genes as bed file

In [37]:
sfsSuf = "SFS/HCSS_sfsm70"
In [2]:
module load gcc/6.2.0
module load xz/5.2.2 
module load bzip2/1.0.6
module load bedtools/2.29.0
In [4]:
# extracting gene regions out of genome annotations file (gff3)
cat ../../paper-oly-mbdbs-gen/genome-features/Olurida_v081-20190709.gene.gff | cut -f 1,4,5,9 > gene_regions.tab
In [22]:
%expand
# reading gene regions
genes=read.table("gene_regions.tab",sep="\t")
names(genes)=c("contig","start","end","notes")
In [23]:
genes$pct15 <- as.integer((genes$end - genes$start)*0.15)
genes$start15 <- genes$start +genes$pct15
genes$end15 <- genes$end - genes$pct15
head(genes)
contigstartendnotespct15start15end15
Contig61093 7493 7946 ID=OLUR_00020575;Name=OLUR_00020575;Alias=maker-Contig61093-snap-gene-0.2;Note=Protein of unknown function; 67 7560 7879
Contig1111 24968 28696 ID=OLUR_00006628;Name=OLUR_00006628;Alias=maker-Contig1111-snap-gene-0.1;Note=Similar to Spag6: Sperm-associated antigen 6 (Mus musculus OX%3D10090);Dbxref=Gene3D:G3DSA:1.25.10.10,InterPro:IPR000225,InterPro:IPR000357,InterPro:IPR011989,InterPro:IPR016024,Pfam:PF02985,SMART:SM00185,SUPERFAMILY:SSF48371;Ontology_term=GO:0005515; 559 25527 28137
Contig214118 201 926 ID=OLUR_00032161;Name=OLUR_00032161;Alias=maker-Contig214118-snap-gene-0.0;Note=Protein of unknown function;Dbxref=Gene3D:G3DSA:3.10.450.10; 108 309 818
Contig58217 9736 11541 ID=OLUR_00019127;Name=OLUR_00019127;Alias=snap_masked-Contig58217-processed-gene-0.2;Note=Protein of unknown function;Dbxref=MobiDBLite:mobidb-lite; 270 10006 11271
Contig2046 2295 18394 ID=OLUR_00011450;Name=OLUR_00011450;Alias=maker-Contig2046-snap-gene-0.5;Note=Protein of unknown function; 2414 4709 15980
Contig9540 4303 10179 ID=OLUR_00018391;Name=OLUR_00018391;Alias=maker-Contig9540-snap-gene-0.1;Note=Protein of unknown function;Dbxref=Gene3D:G3DSA:2.170.300.10; 881 5184 9298
In [24]:
pct15genes <- genes[,c("contig","start","start15","end15", "end")]
head(pct15genes)
contigstartstart15end15end
Contig61093 7493 7560 7879 7946
Contig1111 24968 25527 28137 28696
Contig214118 201 309 818 926
Contig58217 9736 10006 11271 11541
Contig2046 2295 4709 15980 18394
Contig9540 4303 5184 9298 10179
In [25]:
write.table(pct15genes,"pct15genes.tab",sep="\t",quote = F,row.names = F)
In [26]:
%expand
fst01=read.table("SFS/HCSS_sfsm80_fst2pop.fst")
names(fst01)=c("contig","pos","a","b")

fst01$contig=as.character(fst01$contig)
In [27]:
# removing zero-only (invariant) bases
fst01[,3:4]=round(fst01[,3:4],3)
ch01=apply(fst01[,3:4],1,sum)
chh=(ch01>0)
table(chh)

fst01=fst01[chh,]
head(fst01)
chh
 FALSE   TRUE 
328905   5252 
contigposab
64Contig0 74432 0.000 0.009
82Contig0109103 0.033 0.457
98Contig0109119 0.003 0.052
102Contig0109123 0.002 0.067
114Contig1 40362 0.000 0.009
169Contig1 42880 0.003 0.078
In [34]:
# computing weighted Fst per gene segment (1)
i=1;gfst01=c();ns01=0;snp=c()
pb=txtProgressBar(0,nrow(pct15genes))
for (i in 1:nrow(pct15genes)) {
	setTxtProgressBar(pb,i)
	sub=subset(fst01,contig==pct15genes$contig[i] & ((pos>=pct15genes$start[i] & pos<=pct15genes$start15[i]) | (pos>=pct15genes$end15[i] & pos<=pct15genes$end[i])))
    if (is.null(sub[1,1]) | sum(sub$b)==0) { 
		gfst01=append(gfst01,NA)
	} else {
        gfst01=append(gfst01,sum(sub$a)/sum(sub$b))
		ns01=ns01+nrow(sub)
        snp=append(snp,paste(sub$contig,sub$pos,sep="_"))
	}
}
================================================================================

185 SNPs found in the 1st or last 15% of gene

In [35]:
length(snp)
185
In [33]:
# density plots of per-gene Fst
plot(density(na.omit(gfst01)))
In [38]:
%expand
# adding results to genes table, saving
pct15genes$fst01=gfst01
id <- paste(pct15genes$contig,pct15genes$start,pct15genes$end,sep="_")
pct15genes <- cbind(id,pct15genes)
head(pct15genes[!is.na(pct15genes$fst01),])
save(pct15genes,file="{sfsSuf}_PerGeneFst.RData")
idcontigstartstart15end15endfst01
12Contig103346_6131_16776Contig103346 6131 7727 15180 16776 0.114973262
35Contig27784_397_10761 Contig27784 397 1951 9207 10761 0.041095890
121Contig282_28273_40182 Contig282 28273 30059 38396 40182 0.001795332
227Contig28993_906_8396 Contig28993 906 2029 7273 8396 0.081395349
272Contig92_19276_42960 Contig92 19276 22828 39408 42960 0.125000000
370Contig42157_2034_10762 Contig42157 2034 3343 9453 10762 0.028571429
In [40]:
%expand
# saving table of per-gene Fst for GO_MWU analysis
f01=pct15genes[!is.na(pct15genes$fst01),]
nrow(f01)
write.table(f01,file="{sfsSuf}_15pctGeneFst.csv",quote=F,row.names=F,sep="\t")
124
In [ ]:

5 methylated sites per gene (old)

In [22]:
grep "Contig20413" Results/fstpst_2kbslop.genes
Contig20413	maker	gene	486	15467	.	-	.	ID=OLUR_00010706;Name=OLUR_00010706;Alias=maker-Contig20413-snap-gene-0.3;Note=Similar to sll1388: Universal stress protein Sll1388 (Synechocystis sp. (strain PCC 6803 / Kazusa) OX%3D1111708);Dbxref=CDD:cd00293,Coils:Coil,Gene3D:G3DSA:3.40.50.620,InterPro:IPR006015,InterPro:IPR006016,InterPro:IPR014729,PRINTS:PR01438,Pfam:PF00582,SUPERFAMILY:SSF52402;	Contig20413	485	15467	0.472081218274112_0.0207245979022085
In [35]:
head ../../paper-oly-mbdbs-gen/analyses/DMGs/Pst_gene_2kbslop.tab
wc -l ../../paper-oly-mbdbs-gen/analyses/DMGs/Pst_gene_2kbslop.tab
"Quant_Varia"	"contig_gene"	"start_gene_2kb"	"end_gene_2kb"	"Pst_Values"
"Contig0_10497_95068"	"Contig0"	"10497"	"95068"	0.019332886629221
"Contig100188_1_4123"	"Contig100188"	"1"	"4123"	0.734962005060689
"Contig100396_1_4908"	"Contig100396"	"1"	"4908"	0.185205868587485
"Contig100994_1_8602"	"Contig100994"	"1"	"8602"	0.701143568206082
"Contig1013_14071_19514"	"Contig1013"	"14071"	"19514"	0.0172219569940594
"Contig1013_16204_21663"	"Contig1013"	"16204"	"21663"	0.0395933279259626
"Contig101333_3371_8675"	"Contig101333"	"3371"	"8675"	0.712270890827424
"Contig10144_1_8748"	"Contig10144"	"1"	"8748"	0.229975030080655
"Contig101661_1_5354"	"Contig101661"	"1"	"5354"	0.345600927439141
1394 ../../paper-oly-mbdbs-gen/analyses/DMGs/Pst_gene_2kbslop.tab
In [36]:
# read in Pst data
pst <- read.csv("../../paper-oly-mbdbs-gen/analyses/DMGs/Pst_gene_2kbslop.tab", sep="\t",header=T,stringsAsFactors = F)
colnames(pst) <- c("id","contig","start","end","pst")
head(pst)
idcontigstartendpst
Contig0_10497_95068 Contig0 10497 95068 0.01933289
Contig100188_1_4123 Contig100188 1 4123 0.73496201
Contig100396_1_4908 Contig100396 1 4908 0.18520587
Contig100994_1_8602 Contig100994 1 8602 0.70114357
Contig1013_14071_19514Contig1013 14071 19514 0.01722196
Contig1013_16204_21663Contig1013 16204 21663 0.03959333
In [37]:
wc -l SFS/HCSS_sfsm70_PerGeneFst.csv
1387 SFS/HCSS_sfsm70_PerGeneFst.csv
In [38]:
# read in Fst data
fst <- read.csv("SFS/HCSS_sfsm70_PerGeneFst.csv",stringsAsFactors = F)
head(fst)
idcontigstartendfst01
Contig103346_4131_18776Contig103346 4131 18776 0.069637883
Contig60108_1_13731 Contig60108 1 13731 0.121951220
Contig27784_1_12761 Contig27784 1 12761 0.041095890
Contig38648_3515_22246 Contig38648 3515 22246 0.047858942
Contig282_38634_45331 Contig282 38634 45331 0.001798561
Contig282_26273_42182 Contig282 26273 42182 0.001644737
In [39]:
length(intersect(pst$id,fst$id))
64

1394 genes with Pst values, 1387 genes with Fst values (including those with Fst=0), only 64 overlapping genes

In [40]:
both = merge(pst,fst, by ="id")
both <- both[,c("id","pst","fst01")]
head(both)
idpstfst01
Contig0_10497_95068 0.01933289 0.00000000
Contig1433_1_13800 0.20530881 0.02343096
Contig15905_1_15190 0.05149559 0.00000000
Contig15905_1_35236 0.05149559 0.00000000
Contig16063_563_350130.12577787 0.06563707
Contig16203_1_18094 0.11331736 0.04923077
In [47]:
#library(MASS)
#dens <- kde2d(Dgeo,DgenN, n=300)
#myPal <- colorRampPalette(c("white","blue","gold", "orange", "red"))
plot(both$fst, both$pst, pch=20,cex=1,xlab = "Fst", ylab = "Pst")
title("Pst vs Fst, 2kbslop_5loci, 64 genes")
summary(lm(both$pst~both$fst),)
R2 = round(summary(lm(both$pst~both$fst))$r.squared, 4)
#image(dens, col=transp(myPal(300),.7), add=TRUE)
abline(lm(both$pst~both$fst))
text(0.4,0.8,label=paste("R^2=",R2))
text(0.4,0.75,label="r= 0.08")
Call:
lm(formula = both$pst ~ both$fst)

Residuals:
     Min       1Q   Median       3Q      Max 
-0.29403 -0.17988 -0.07144  0.12928  0.60292 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  0.24004    0.03382   7.098 1.45e-09 ***
both$fst     0.18060    0.28676   0.630    0.531    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.2252 on 62 degrees of freedom
Multiple R-squared:  0.006357,	Adjusted R-squared:  -0.009669 
F-statistic: 0.3967 on 1 and 62 DF,  p-value: 0.5311
In [42]:
cor.test( ~ both$pst + both$fst,
         method = "pearson",
         conf.level = 0.95)
cor.test( ~ both$pst + both$fst, 
         method = "spearman",
         continuity = FALSE,
         conf.level = 0.95)
	Pearson's product-moment correlation

data:  both$pst and both$fst
t = 0.62981, df = 62, p-value = 0.5311
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 -0.1693982  0.3192828
sample estimates:
       cor 
0.07973106 
Warning message in cor.test.default(x = c(0.019332886629221, 0.205308806060111, :
"Cannot compute exact p-value with ties"
	Spearman's rank correlation rho

data:  both$pst and both$fst
S = 42830, p-value = 0.8787
alternative hypothesis: true rho is not equal to 0
sample estimates:
       rho 
0.01946097 

3 methylated sites per gene

In [25]:
head ../../paper-oly-mbdbs-gen/analyses/DMGs/Pst_gene_2kbslop_3loci.tab
wc -l ../../paper-oly-mbdbs-gen/analyses/DMGs/Pst_gene_2kbslop_3loci.tab
"Quant_Varia"	"contig_gene"	"start_gene_2kb"	"end_gene_2kb"	"Pst_Values"
"Contig0_10497_95068"	"Contig0"	"10497"	"95068"	0.019332886629221
"Contig100188_1_4123"	"Contig100188"	"1"	"4123"	0.734962005060689
"Contig100396_1_4908"	"Contig100396"	"1"	"4908"	0.185205868587485
"Contig100743_7752_12849"	"Contig100743"	"7752"	"12849"	0.513177756897547
"Contig100994_1_8602"	"Contig100994"	"1"	"8602"	0.701143568206082
"Contig10117_48_10916"	"Contig10117"	"48"	"10916"	0.0512787574778887
"Contig1013_14071_19514"	"Contig1013"	"14071"	"19514"	0.0172219569940594
"Contig1013_16204_21663"	"Contig1013"	"16204"	"21663"	0.0395933279259626
"Contig101333_3371_8675"	"Contig101333"	"3371"	"8675"	0.712270890827424
2178 ../../paper-oly-mbdbs-gen/analyses/DMGs/Pst_gene_2kbslop_3loci.tab
In [48]:
# read in Pst data
pst <- read.csv("../../paper-oly-mbdbs-gen/analyses/DMGs/Pst_gene_2kbslop_3loci.tab", sep="\t",header=T,stringsAsFactors = F)
colnames(pst) <- c("id","contig","start","end","pst")
head(pst)
idcontigstartendpst
Contig0_10497_95068 Contig0 10497 95068 0.01933289
Contig100188_1_4123 Contig100188 1 4123 0.73496201
Contig100396_1_4908 Contig100396 1 4908 0.18520587
Contig100743_7752_12849Contig100743 7752 12849 0.51317776
Contig100994_1_8602 Contig100994 1 8602 0.70114357
Contig10117_48_10916 Contig10117 48 10916 0.05127876
In [49]:
wc -l SFS/HCSS_sfsm70_PerGeneFst.csv
1387 SFS/HCSS_sfsm70_PerGeneFst.csv
In [50]:
# read in Fst data
fst <- read.csv("SFS/HCSS_sfsm70_PerGeneFst.csv",stringsAsFactors = F)
head(fst)
idcontigstartendfst01
Contig103346_4131_18776Contig103346 4131 18776 0.069637883
Contig60108_1_13731 Contig60108 1 13731 0.121951220
Contig27784_1_12761 Contig27784 1 12761 0.041095890
Contig38648_3515_22246 Contig38648 3515 22246 0.047858942
Contig282_38634_45331 Contig282 38634 45331 0.001798561
Contig282_26273_42182 Contig282 26273 42182 0.001644737
In [51]:
length(intersect(pst$id,fst$id))
96

2178 genes with Pst values, 1387 genes with Fst values (including those with Fst=0), only 96 overlapping genes

In [52]:
both = merge(pst,fst, by ="id")
both <- both[,c("id","pst","fst01")]
head(both)
idpstfst01
Contig0_10497_95068 0.01933289 0.00000000
Contig1433_1_13800 0.20530881 0.02343096
Contig149134_128_4495 0.19887867 0.00000000
Contig15595_31612_644020.15324152 0.01369863
Contig15905_1_15190 0.05149559 0.00000000
Contig15905_1_35236 0.05149559 0.00000000
In [53]:
#library(MASS)
#dens <- kde2d(Dgeo,DgenN, n=300)
#myPal <- colorRampPalette(c("white","blue","gold", "orange", "red"))
plot(both$fst, both$pst, pch=20,cex=1,xlab = "Fst", ylab = "Pst")
title("Pst vs Fst, 2kbslop_3loci, 96 genes")
summary(lm(both$pst~both$fst),)
R2 = round(summary(lm(both$pst~both$fst))$r.squared, 4)
#image(dens, col=transp(myPal(300),.7), add=TRUE)
abline(lm(both$pst~both$fst))
text(0.4,0.8,label=paste("R^2=",R2))
text(0.4,0.75,label="r= 0.076")
Call:
lm(formula = both$pst ~ both$fst)

Residuals:
     Min       1Q   Median       3Q      Max 
-0.27744 -0.17026 -0.05201  0.11454  0.60272 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  0.24024    0.02442   9.838 4.05e-16 ***
both$fst     0.14612    0.19709   0.741     0.46    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.205 on 94 degrees of freedom
Multiple R-squared:  0.005813,	Adjusted R-squared:  -0.004763 
F-statistic: 0.5496 on 1 and 94 DF,  p-value: 0.4603
In [32]:
cor.test( ~ both$pst + both$fst,
         method = "pearson",
         conf.level = 0.95)
cor.test( ~ both$pst + both$fst, 
         method = "spearman",
         continuity = FALSE,
         conf.level = 0.95)
	Pearson's product-moment correlation

data:  both$pst and both$fst
t = 0.74136, df = 94, p-value = 0.4603
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 -0.1261713  0.2725629
sample estimates:
       cor 
0.07624322 
Warning message in cor.test.default(x = c(0.019332886629221, 0.205308806060111, :
"Cannot compute exact p-value with ties"
	Spearman's rank correlation rho

data:  both$pst and both$fst
S = 154628, p-value = 0.6371
alternative hypothesis: true rho is not equal to 0
sample estimates:
        rho 
-0.04875152 

Pst vs Fst for random sites in genome

1000 bp bins

In [41]:
sfsSuf = "SFS/HCSS_sfsm70"
In [42]:
module load gcc/6.2.0
module load xz/5.2.2 
module load bzip2/1.0.6
module load bedtools/2.29.0
In [2]:
bedtools makewindows -h
Tool: bedtools makewindows
Version: v2.29.0
Summary: Makes adjacent or sliding windows across a genome or BED file.

Usage: bedtools makewindows [OPTIONS] [-g <genome> OR -b <bed>]
 [ -w <window_size> OR -n <number of windows> ]

Input Options: 
	-g <genome>
		Genome file size (see notes below).
		Windows will be created for each chromosome in the file.

	-b <bed>
		BED file (with chrom,start,end fields).
		Windows will be created for each interval in the file.

Windows Output Options: 
	-w <window_size>
		Divide each input interval (either a chromosome or a BED interval)
		to fixed-sized windows (i.e. same number of nucleotide in each window).
		Can be combined with -s <step_size>

	-s <step_size>
		Step size: i.e., how many base pairs to step before
		creating a new window. Used to create "sliding" windows.
		- Defaults to window size (non-sliding windows).

	-n <number_of_windows>
		Divide each input interval (either a chromosome or a BED interval)
		to fixed number of windows (i.e. same number of windows, with
		varying window sizes).

	-reverse
		 Reverse numbering of windows in the output, i.e. report 
		 windows in decreasing order

ID Naming Options: 
	-i src|winnum|srcwinnum
		The default output is 3 columns: chrom, start, end .
		With this option, a name column will be added.
		 "-i src" - use the source interval's name.
		 "-i winnum" - use the window number as the ID (e.g. 1,2,3,4...).
		 "-i srcwinnum" - use the source interval's name with the window number.
		See below for usage examples.

Notes: 
	(1) The genome file should tab delimited and structured as follows:
	 <chromName><TAB><chromSize>

	For example, Human (hg19):
	chr1	249250621
	chr2	243199373
	...
	chr18_gl000207_random	4262

Tips: 
	One can use the UCSC Genome Browser's MySQL database to extract
	chromosome sizes. For example, H. sapiens:

	mysql --user=genome --host=genome-mysql.cse.ucsc.edu -A -e \
	"select chrom, size from hg19.chromInfo" > hg19.genome

Examples: 
 # Divide the human genome into windows of 1MB:
 $ bedtools makewindows -g hg19.txt -w 1000000
 chr1 0 1000000
 chr1 1000000 2000000
 chr1 2000000 3000000
 chr1 3000000 4000000
 chr1 4000000 5000000
 ...

 # Divide the human genome into sliding (=overlapping) windows of 1MB, with 500KB overlap:
 $ bedtools makewindows -g hg19.txt -w 1000000 -s 500000
 chr1 0 1000000
 chr1 500000 1500000
 chr1 1000000 2000000
 chr1 1500000 2500000
 chr1 2000000 3000000
 ...

 # Divide each chromosome in human genome to 1000 windows of equal size:
 $ bedtools makewindows -g hg19.txt -n 1000
 chr1 0 249251
 chr1 249251 498502
 chr1 498502 747753
 chr1 747753 997004
 chr1 997004 1246255
 ...

 # Divide each interval in the given BED file into 10 equal-sized windows:
 $ cat input.bed
 chr5 60000 70000
 chr5 73000 90000
 chr5 100000 101000
 $ bedtools makewindows -b input.bed -n 10
 chr5 60000 61000
 chr5 61000 62000
 chr5 62000 63000
 chr5 63000 64000
 chr5 64000 65000
 ...

 # Add a name column, based on the window number: 
 $ cat input.bed
 chr5  60000  70000 AAA
 chr5  73000  90000 BBB
 chr5 100000 101000 CCC
 $ bedtools makewindows -b input.bed -n 3 -i winnum
 chr5        60000   63334   1
 chr5        63334   66668   2
 chr5        66668   70000   3
 chr5        73000   78667   1
 chr5        78667   84334   2
 chr5        84334   90000   3
 chr5        100000  100334  1
 chr5        100334  100668  2
 chr5        100668  101000  3
 ...

 # Reverse window numbers: 
 $ cat input.bed
 chr5  60000  70000 AAA
 chr5  73000  90000 BBB
 chr5 100000 101000 CCC
 $ bedtools makewindows -b input.bed -n 3 -i winnum -reverse
 chr5        60000   63334   3
 chr5        63334   66668   2
 chr5        66668   70000   1
 chr5        73000   78667   3
 chr5        78667   84334   2
 chr5        84334   90000   1
 chr5        100000  100334  3
 chr5        100334  100668  2
 chr5        100668  101000  1
 ...

 # Add a name column, based on the source ID + window number: 
 $ cat input.bed
 chr5  60000  70000 AAA
 chr5  73000  90000 BBB
 chr5 100000 101000 CCC
 $ bedtools makewindows -b input.bed -n 3 -i srcwinnum
 chr5        60000   63334   AAA_1
 chr5        63334   66668   AAA_2
 chr5        66668   70000   AAA_3
 chr5        73000   78667   BBB_1
 chr5        78667   84334   BBB_2
 chr5        84334   90000   BBB_3
 chr5        100000  100334  CCC_1
 chr5        100334  100668  CCC_2
 chr5        100668  101000  CCC_3
 ...



In [3]:
# use previously created genome fai file as the bedfile input
awk 'FS=OFS="\t"{print $1, 0, $2}' ../../Olurida_v081.fa.fai \
| bedtools makewindows -b - -w 1000 > Olurida_v081_1kbIntervals.bed
In [6]:
head Olurida_v081_1kbIntervals.bed
Contig0	0	1000
Contig0	1000	2000
Contig0	2000	3000
Contig0	3000	4000
Contig0	4000	5000
Contig0	5000	6000
Contig0	6000	7000
Contig0	7000	8000
Contig0	8000	9000
Contig0	9000	10000
In [73]:
# reading regions
regions=read.table("Olurida_v081_1kbIntervals.bed")
names(regions)=c("contig","start","end")
In [74]:
# read in fst data
fst01=read.table("SFS/HCSS_sfsm70_fst2pop.fst")
names(fst01)=c("contig","pos","a","b")

fst01$contig=as.character(fst01$contig)
In [75]:
# removing zero-only (invariant) bases
fst01[,3:4]=round(fst01[,3:4],3)
ch01=apply(fst01[,3:4],1,sum)
chh=(ch01>0)
table(chh)

fst01=fst01[chh,]
head(fst01)
chh
 FALSE   TRUE 
363405   5882 
contigposab
64Contig0 74432 0.000 0.009
82Contig0109103 0.034 0.457
98Contig0109119 0.003 0.052
102Contig0109123 0.002 0.067
114Contig1 40362 0.000 0.009
169Contig1 42880 0.003 0.078
In [ ]:
fst
In [11]:
# computing weighted Fst per region
i=1;gfst01=c();ns01=0
pb=txtProgressBar(0,nrow(regions))
for (i in 1:nrow(regions)) {
	setTxtProgressBar(pb,i)
	sub=subset(fst01,contig==regions$contig[i] & pos>=regions$start[i] & pos<=regions$end[i])
	if (is.null(sub[1,1]) | sum(sub$b)==0) { 
		gfst01=append(gfst01,NA)
	} else {
		gfst01=append(gfst01,sum(sub$a)/sum(sub$b))
		ns01=ns01+nrow(sub)
	}
}
================================================================================
In [12]:
# density plots of per-region Fst
plot(density(na.omit(gfst01)))
In [18]:
%expand
#adding results to regions table, saving
regions$fst01=gfst01
regionsNA <- regions[!is.na(regions$fst01),]
head(regionsNA)
write.csv(regionsNA,file="{sfsSuf}_Per1kbFst.csv",quote=F,row.names=F)
#save bed file of intervals only with fst data
contigstartendfst01
75Contig0 74000 75000 0.00000000
110Contig0 109000 110000 0.06770833
158Contig1 40000 41000 0.00000000
160Contig1 42000 43000 0.03846154
216Contig2 10000 11000 0.02127660
342Contig2 136000 137000 0.00000000
In [45]:
nrow(regionsNA)
3883
In [22]:
%expand
filtbed <- regionsNA[,c("contig","start","end")]
write.table(filtbed, "{sfsSuf}_1kb.bed",quote=F,row.names=F,col.names = F)
In [23]:
%expand
head {sfsSuf}_1kb.bed
Contig0 74000 75000
Contig0 109000 110000
Contig1 40000 41000
Contig1 42000 43000
Contig2 10000 11000
Contig2 136000 137000
Contig3 14000 15000
Contig4 51000 52000
Contig8 75000 76000
Contig9 17000 18000

Compare with overlappin Pst bins

In [52]:
%expand
regionsNA <- read.csv("{sfsSuf}_Per1kbFst.csv")
id <- paste(regionsNA$contig,regionsNA$start,regionsNA$end, sep = "_")
regionsNA <- cbind(id,regionsNA)
head(regionsNA)
idcontigstartendfst01
Contig0_74000_75000 Contig0 74000 75000 0.00000000
Contig0_109000_110000Contig0 109000 110000 0.06770833
Contig1_40000_41000 Contig1 40000 41000 0.00000000
Contig1_42000_43000 Contig1 42000 43000 0.03846154
Contig2_10000_11000 Contig2 10000 11000 0.02127660
Contig2_136000_137000Contig2 136000 137000 0.00000000
In [50]:
# read in Pst data
pst1kb <- read.csv("../../paper-oly-mbdbs-gen/analyses/methylation-filtered/Pst_bins_1kb.tab", sep="\t",header=T,stringsAsFactors = F)
colnames(pst1kb) <- c("id","contig","start","end","pst","pst.lowCI","pst.highCI")
head(pst1kb)
nrow(pst1kb)
idcontigstartendpstpst.lowCIpst.highCI
Contig10074_5000_6000 Contig10074 5000 6000 0.25131285 0.0008842925 0.8672152
Contig100771_1000_1852Contig100771 1000 1852 0.34311448 0.0015001933 0.8588288
Contig126977_2000_3000Contig126977 2000 3000 0.62651934 0.0210034318 0.8771075
Contig128060_1000_1680Contig128060 1000 1680 0.01810252 0.0005849085 0.8363022
Contig129509_0_1000 Contig129509 0 1000 0.24973351 0.0015760069 0.8427798
Contig132115_1000_1816Contig132115 1000 1816 0.21746108 0.0009032690 0.8652226
71

71 overlapping bins

In [57]:
both = merge(pst1kb,regionsNA, by ="id")
#both <- both[,c("id","pst","fst01")]
head(both)
idcontig.xstart.xend.xpstpst.lowCIpst.highCIcontig.ystart.yend.yfst01
Contig10074_5000_6000 Contig10074 5000 6000 0.25131285 0.0008842925 0.8672152 Contig10074 5000 6000 0.19201995
Contig100771_1000_1852Contig100771 1000 1852 0.34311448 0.0015001933 0.8588288 Contig100771 1000 1852 0.14919355
Contig126977_2000_3000Contig126977 2000 3000 0.62651934 0.0210034318 0.8771075 Contig126977 2000 3000 0.01584507
Contig128060_1000_1680Contig128060 1000 1680 0.01810252 0.0005849085 0.8363022 Contig128060 1000 1680 0.04655870
Contig129509_0_1000 Contig129509 0 1000 0.24973351 0.0015760069 0.8427798 Contig129509 0 1000 0.04545455
Contig132115_1000_1816Contig132115 1000 1816 0.21746108 0.0009032690 0.8652226 Contig132115 1000 1816 0.01709402
In [59]:
#library(MASS)
#dens <- kde2d(Dgeo,DgenN, n=300)
#myPal <- colorRampPalette(c("white","blue","gold", "orange", "red"))
plot(both$fst01, both$pst, pch=20,cex=1,xlab = "Fst", ylab = "Pst")
title("Pst vs Fst, 1kb bins, 71 bins")
summary(lm(both$pst~both$fst01),)
R2 = round(summary(lm(both$pst~both$fst01))$r.squared, 4)
#image(dens, col=transp(myPal(300),.7), add=TRUE)
abline(lm(both$pst~both$fst01))
text(0.4,0.8,label=paste("R^2=",R2))
text(0.4,0.75,label="r= -0.035")
Call:
lm(formula = both$pst ~ both$fst01)

Residuals:
     Min       1Q   Median       3Q      Max 
-0.28231 -0.19942 -0.02921  0.14795  0.52908 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  0.28394    0.03259   8.712 9.98e-13 ***
both$fst01  -0.10975    0.37560  -0.292    0.771    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.2266 on 69 degrees of freedom
Multiple R-squared:  0.001236,	Adjusted R-squared:  -0.01324 
F-statistic: 0.08537 on 1 and 69 DF,  p-value: 0.771
In [58]:
cor.test( ~ both$pst + both$fst01,
         method = "pearson",
         conf.level = 0.95)
cor.test( ~ both$pst + both$fst01, 
         method = "spearman",
         continuity = FALSE,
         conf.level = 0.95)
	Pearson's product-moment correlation

data:  both$pst and both$fst01
t = -0.29219, df = 69, p-value = 0.771
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 -0.2662733  0.1997889
sample estimates:
        cor 
-0.03515334 
Warning message in cor.test.default(x = c(0.251312849756899, 0.343114484406881, :
"Cannot compute exact p-value with ties"
	Spearman's rank correlation rho

data:  both$pst and both$fst01
S = 58217, p-value = 0.8435
alternative hypothesis: true rho is not equal to 0
sample estimates:
       rho 
0.02385219 

Get gene annotations for these overlaps

In [8]:
module load gcc/6.2.0
module load bedtools/2.29.0
In [17]:
chr <- as.character(both$contig.x)
start <- both$startc +2000
end <- both$end.x-2000
fst <- paste(both$fst01, both$pst, sep="_")
bed <- cbind(chr,start,end,fst)
In [18]:
write.table(bed,"Results/fstpst_2kbslop.tab",row.names = F, col.names = F,quote = F, sep = "\t")
In [19]:
intersectBed \
  -wb \
  -a "../../paper-oly-mbdbs-gen/genome-features/Olurida_v081-20190709.gene.gff" \
  -b "Results/fstpst_2kbslop.tab" \
  > "Results/fstpst_2kbslop.genes"
In [31]:
both[which(both$fst01 >0.4),]
idcontig.xstart.xend.xpstcontig.ystart.yend.yfst01startc
35Contig20413_1_17467 Contig20413 1 17467 0.02072460 Contig20413 1 17467 0.4720812 -1515
61Contig24693_2452_15381Contig24693 2452 15381 0.34105167 Contig24693 2452 15381 0.5007496 2452
75Contig28902_528_13774 Contig28902 528 13774 0.31127971 Contig28902 528 13774 0.4283427 528
133Contig50657_1_11990 Contig50657 1 11990 0.03393099 Contig50657 1 11990 0.4868217 -1888
149Contig77382_1559_12918Contig77382 1559 12918 0.41550369 Contig77382 1559 12918 0.6633523 1559

Low Pst, high Fst

In [22]:
grep "Contig20413" Results/fstpst_2kbslop.genes
Contig20413	maker	gene	486	15467	.	-	.	ID=OLUR_00010706;Name=OLUR_00010706;Alias=maker-Contig20413-snap-gene-0.3;Note=Similar to sll1388: Universal stress protein Sll1388 (Synechocystis sp. (strain PCC 6803 / Kazusa) OX%3D1111708);Dbxref=CDD:cd00293,Coils:Coil,Gene3D:G3DSA:3.40.50.620,InterPro:IPR006015,InterPro:IPR006016,InterPro:IPR014729,PRINTS:PR01438,Pfam:PF00582,SUPERFAMILY:SSF52402;	Contig20413	485	15467	0.472081218274112_0.0207245979022085
In [26]:
grep "Contig50657" Results/fstpst_2kbslop.genes
Contig50657	maker	gene	113	9990	.	+	.	ID=OLUR_00017738;Name=OLUR_00017738;Alias=maker-Contig50657-snap-gene-0.3;Note=Similar to KDM3B: Lysine-specific demethylase 3B (Homo sapiens OX%3D9606);Dbxref=Gene3D:G3DSA:2.60.120.650,InterPro:IPR003347,MobiDBLite:mobidb-lite,Pfam:PF02373,ProSiteProfiles:PS51184,SMART:SM00558,SUPERFAMILY:SSF51197;	Contig50657	112	9990	0.486821705426357_0.0339309902311326
In [97]:
wc -l SFS/HCSS_sfsm70_slidingwindow
2773 SFS/HCSS_sfsm70_slidingwindow

10,000 bp bins

In [24]:
# use previously created genome fai file as the bedfile input
awk 'FS=OFS="\t"{print $1, 0, $2}' ../../Olurida_v081.fa.fai \
| bedtools makewindows -b - -w 10000 > Olurida_v081_10kbIntervals.bed
In [25]:
head Olurida_v081_10kbIntervals.bed
Contig0	0	10000
Contig0	10000	20000
Contig0	20000	30000
Contig0	30000	40000
Contig0	40000	50000
Contig0	50000	60000
Contig0	60000	70000
Contig0	70000	80000
Contig0	80000	90000
Contig0	90000	100000
In [35]:
# reading regions
regions=read.table("Olurida_v081_10kbIntervals.bed")
names(regions)=c("contig","start","end")
In [36]:
# read in fst data
fst01=read.table("SFS/HCSS_sfsm70_fst2pop.fst")
names(fst01)=c("contig","pos","a","b")

fst01$contig=as.character(fst01$contig)
In [37]:
# removing zero-only (invariant) bases
fst01[,3:4]=round(fst01[,3:4],3)
ch01=apply(fst01[,3:4],1,sum)
chh=(ch01>0)
table(chh)

fst01=fst01[chh,]
head(fst01)
chh
 FALSE   TRUE 
363405   5882 
contigposab
64Contig0 74432 0.000 0.009
82Contig0109103 0.034 0.457
98Contig0109119 0.003 0.052
102Contig0109123 0.002 0.067
114Contig1 40362 0.000 0.009
169Contig1 42880 0.003 0.078
In [38]:
# computing weighted Fst per region
i=1;gfst01=c();ns01=0
pb=txtProgressBar(0,nrow(regions))
for (i in 1:nrow(regions)) {
	setTxtProgressBar(pb,i)
	sub=subset(fst01,contig==regions$contig[i] & pos>=regions$start[i] & pos<=regions$end[i])
	if (is.null(sub[1,1]) | sum(sub$b)==0) { 
		gfst01=append(gfst01,NA)
	} else {
		gfst01=append(gfst01,sum(sub$a)/sum(sub$b))
		ns01=ns01+nrow(sub)
	}
}
================================================================================
In [39]:
# density plots of per-region Fst
plot(density(na.omit(gfst01)))
In [40]:
%expand
#adding results to regions table, saving
regions$fst01=gfst01
regionsNA <- regions[!is.na(regions$fst01),]
head(regionsNA)
write.csv(regionsNA,file="{sfsSuf}_Per10kbFst.csv",quote=F,row.names=F)
#save bed file of intervals only with fst data
contigstartendfst01
8Contig0 70000 80000 0.00000000
11Contig0 100000 110000 0.06770833
17Contig1 40000 50000 0.03448276
23Contig2 10000 20000 0.02127660
35Contig2 130000 139250 0.00000000
37Contig3 10000 20000 0.25287356
In [41]:
nrow(regionsNA)
3782
In [42]:
%expand
filtbed <- regionsNA[,c("contig","start","end")]
write.table(filtbed, "{sfsSuf}_10kb.bed",quote=F,row.names=F,col.names = F)

Compare with overlappin Pst bins

In [60]:
%expand
regionsNA <- read.csv("{sfsSuf}_Per10kbFst.csv")
id <- paste(regionsNA$contig,regionsNA$start,regionsNA$end, sep = "_")
regionsNA <- cbind(id,regionsNA)
head(regionsNA)
idcontigstartendfst01
Contig0_70000_80000 Contig0 70000 80000 0.00000000
Contig0_100000_110000Contig0 100000 110000 0.06770833
Contig1_40000_50000 Contig1 40000 50000 0.03448276
Contig2_10000_20000 Contig2 10000 20000 0.02127660
Contig2_130000_139250Contig2 130000 139250 0.00000000
Contig3_10000_20000 Contig3 10000 20000 0.25287356
In [61]:
# read in Pst data
pst10kb <- read.csv("../../paper-oly-mbdbs-gen/analyses/methylation-filtered/Pst_bins_10kb.tab", sep="\t",header=T,stringsAsFactors = F)
colnames(pst10kb) <- c("id","contig","start","end","pst","pst.lowCI","pst.highCI")
head(pst10kb)
nrow(pst10kb)
idcontigstartendpstpst.lowCIpst.highCI
Contig0_70000_80000 Contig0 70000 80000 0.6136306 0.011347886 0.8948949
Contig10074_0_6621 Contig10074 0 6621 0.2513128 0.001205032 0.8501561
Contig100771_0_1852 Contig100771 0 1852 0.3431145 0.001595068 0.8617699
Contig10259_0_9151 Contig10259 0 9151 0.2439721 0.000979882 0.8090206
Contig103223_10000_14834Contig103223 10000 14834 0.3743660 0.001878200 0.8601134
Contig104217_0_5491 Contig104217 0 5491 0.4712576 0.011812632 0.8385497
271

271 overlapping bins

In [62]:
both = merge(pst10kb,regionsNA, by ="id")
#both <- both[,c("id","pst","fst01")]
head(both)
idcontig.xstart.xend.xpstpst.lowCIpst.highCIcontig.ystart.yend.yfst01
Contig0_70000_80000 Contig0 70000 80000 0.6136306 0.011347886 0.8948949 Contig0 70000 80000 0.0000000
Contig10074_0_6621 Contig10074 0 6621 0.2513128 0.001205032 0.8501561 Contig10074 0 6621 0.1920200
Contig100771_0_1852 Contig100771 0 1852 0.3431145 0.001595068 0.8617699 Contig100771 0 1852 0.1491935
Contig10259_0_9151 Contig10259 0 9151 0.2439721 0.000979882 0.8090206 Contig10259 0 9151 0.1026694
Contig103223_10000_14834Contig103223 10000 14834 0.3743660 0.001878200 0.8601134 Contig103223 10000 14834 0.0000000
Contig104217_0_5491 Contig104217 0 5491 0.4712576 0.011812632 0.8385497 Contig104217 0 5491 0.0000000
In [66]:
#library(MASS)
#dens <- kde2d(Dgeo,DgenN, n=300)
#myPal <- colorRampPalette(c("white","blue","gold", "orange", "red"))
plot(both$fst01, both$pst, pch=20,cex=1,xlab = "Fst", ylab = "Pst")
title("Pst vs Fst, 10kb bins, 271 bins")
summary(lm(both$pst~both$fst01),)
R2 = round(summary(lm(both$pst~both$fst01))$r.squared, 4)
#image(dens, col=transp(myPal(300),.7), add=TRUE)
abline(lm(both$pst~both$fst01))
text(0.4,0.8,label=paste("R^2=",R2))
text(0.4,0.75,label="r= -0.004")
Call:
lm(formula = both$pst ~ both$fst01)

Residuals:
     Min       1Q   Median       3Q      Max 
-0.27271 -0.19651 -0.02384  0.14066  0.72989 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept)  0.272758   0.016211  16.826   <2e-16 ***
both$fst01  -0.009332   0.143242  -0.065    0.948    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.22 on 269 degrees of freedom
Multiple R-squared:  1.578e-05,	Adjusted R-squared:  -0.003702 
F-statistic: 0.004245 on 1 and 269 DF,  p-value: 0.9481
In [64]:
cor.test( ~ both$pst + both$fst01,
         method = "pearson",
         conf.level = 0.95)
cor.test( ~ both$pst + both$fst01, 
         method = "spearman",
         continuity = FALSE,
         conf.level = 0.95)
	Pearson's product-moment correlation

data:  both$pst and both$fst01
t = -0.065151, df = 269, p-value = 0.9481
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 -0.1230691  0.1152373
sample estimates:
         cor 
-0.003972307 
Warning message in cor.test.default(x = c(0.613630567871362, 0.251312849756899, :
"Cannot compute exact p-value with ties"
	Spearman's rank correlation rho

data:  both$pst and both$fst01
S = 3376696, p-value = 0.7682
alternative hypothesis: true rho is not equal to 0
sample estimates:
       rho 
-0.0179847 

Correlating PC scores for genetic and epigenetic

All methylation data

In [38]:
# Read in PC scores from epigenetics
ep <- read.table("../../paper-oly-mbdbs-gen/analyses/methylation-filtered/PC-scores-filtered-methylation.tab",
                 header=T, sep="\t",row.names = "sample")

Add sample names to PC score matrix

In [11]:
key = read.csv("../../paper-oly-mbdbs-gen/data/sample-key.csv",colClasses = c("character","character"))
In [40]:
samples = rownames(ep)
mapdf <- data.frame(old=key$X...MBD.FILENAME,new=key$SAMPLE)
rownames(ep) <- mapdf$new[match(samples,mapdf$old)]
In [41]:
ep
PC1PC2PC3PC4PC5PC6PC7PC8PC9PC10PC11PC12PC13PC14PC15PC16PC17PC18
hc1_2 -6.692850 4.371347 17.3591858 -9.77823057 0.03392218 -29.86492405 35.5278752 30.6193584 -4.9343922 1.6809432 -0.33319001 3.635139 -7.4010422 -1.7594517 3.1929463 0.78578319 -4.4636713 -2.304060e-14
hc1_4 -1.070833 15.671612 -0.3928227 -1.70710015 41.23235880 35.77341278 2.2218207 22.6387392 9.6957465 -0.1030809 -0.86444586 7.793098 2.1127411 -1.9011238 1.0629236 2.77676877 0.2719266 -4.127601e-14
hc2_15-12.299954 10.306976 -8.1557636 -2.70502528 18.69976530 -31.90777310 -18.5266121 -11.0064755 32.4791342 -12.9709720 -12.42688061 6.839118 -5.7880514 -0.1922841 2.8387897 0.27435155 2.0688240 -2.724470e-14
hc2_17-20.447126 9.462505 -2.1141525 4.20049318 11.22313158 -0.77622031 0.7696665 -5.0742263 -2.9998595 -1.6355382 15.84724957 -44.577822 -11.9680384 6.4587427 -9.2348469 -8.24820025 -1.5929793 1.984524e-14
hc3_1 39.034523 48.607826 -14.8380393 24.54435834 -20.88827041 3.21539341 13.3872950 -6.7302612 5.4916718 -3.3313800 0.04068923 1.333991 0.3918315 1.5024459 -0.5308704 2.08230580 -0.6792729 -1.282481e-14
hc3_5 -4.158491 15.894780 56.6155610 -2.33521782 -19.01246614 7.46016914 -25.9280178 3.7768436 0.2410256 -3.1401753 1.99055907 2.008911 -3.5426766 -0.3045020 0.8488112 -0.11514156 1.4576012 1.117509e-14
hc3_7 -3.864274 15.873938 -20.7614296 2.15581022 7.59235360 -16.98380550 -28.4280712 11.2734120 -33.2551537 19.2619936 4.18859370 7.355386 7.3000877 -5.3908788 -0.3526813 -0.96549984 1.6868523 -3.011653e-14
hc3_10-26.775638 5.285006 16.3128676 3.29961637 11.01994934 -0.50438638 19.8221442 -30.5733695 -9.8219832 4.1390918 -10.16929815 3.537457 30.0290917 -6.4407773 -8.5153142 0.40346723 4.6974825 -2.846074e-14
hc3_11 27.358771 4.769939 -8.5306352 -59.71357725 -3.74778573 4.63932148 1.8642351 -13.3416913 -4.0485879 -6.2794531 8.25416810 -1.321221 1.6583165 -1.7369624 3.7856504 -0.97944952 0.8468419 -1.080733e-14
ss2_9 6.465812 -14.429699 -9.6526527 -1.47541437 -13.23682631 4.77846200 -6.5708353 18.3475636 -5.9277610 -17.3833320 -36.74039835 -16.780138 16.3704203 11.1439572 -1.2996767 -0.69204759 -0.3559637 4.914472e-15
ss2_14-18.922299 -13.051948 -11.7284274 9.37222473 -14.48802057 6.02618124 -0.3432883 3.7470212 3.5191786 -14.7538726 12.47806816 -5.926382 5.2032870 -34.8767482 21.5234972 8.01383124 0.9745638 -3.814744e-14
ss2_18 57.203399 -31.515497 16.0043428 19.96257323 23.19173716 -9.36772786 -1.5263979 -8.2383146 -9.8901491 -9.6078438 7.31227535 -1.022245 -1.2873794 -2.3582088 1.3820464 -0.01808849 -0.3010683 -3.171205e-14
ss3_3 -1.586950 -13.016356 -4.4951886 2.37429362 -8.74625932 4.22362495 -2.2857498 -0.2209097 12.4528948 11.1203493 5.21846830 8.206049 7.9088060 -1.0529871 -4.8831301 -25.60237746 -34.1337760 -1.407739e-14
ss3_14 -6.869877 -13.758143 -5.7509940 -0.03503271 -7.97145122 0.09865517 -2.9408401 0.7549669 6.9849426 4.4075935 13.18907492 3.068868 3.8584631 12.8126462 -17.4943374 38.04138920 -11.8529642 -3.098823e-14
ss3_15-23.724292 -6.297303 -6.2630517 7.77628176 -2.50136889 6.03173553 4.4649768 -8.9514123 -11.4648707 -11.7542464 9.96107454 14.292394 -5.7570909 31.8160870 26.2366619 -4.61349919 2.7657943 -1.370778e-14
ss3_16 -6.222188 -15.256037 -9.9880900 2.74993534 -12.89661904 3.47663653 2.1522917 8.3846073 5.2531997 -8.6952416 12.08045896 14.386765 -3.5246216 -1.1906847 -28.0187811 -15.53426597 26.2991427 1.160877e-14
ss3_20 -8.409625 -7.667093 -4.3401787 0.84692428 -2.47756877 11.84000340 4.2955124 -15.2525170 -13.0036782 7.6647033 -25.18073705 6.730737 -37.6991146 -11.2709100 -5.8189887 4.19794007 -5.2303088 9.542605e-16
ss5_18 10.981891 -15.251853 0.7194687 0.46708707 -7.02658154 1.84124156 2.0439948 -0.1533349 19.2286419 41.3804613 -4.84572988 -9.560104 2.1349702 4.7416398 15.2773000 0.19273281 17.5409751 -2.729078e-14
In [45]:
%expand
gen <- read.table("Results/{SUFFIX}pca_covMat.tsv",)
mbdorder = c("hc1_2","hc1_4","hc2_15","hc2_17","hc3_1","hc3_10","hc3_11","hc3_5","hc3_7",
        "ss2_14","ss2_18","ss2_9","ss3_14","ss3_15","ss3_16","ss3_20","ss3_3","ss5_18")
rownames(gen) <- mbdorder
gen
#make sure ep and gen in same order
ep <- ep[mbdorder,]
PC1PC2PC3PC4PC5PC6PC7PC8PC9PC10PC11PC12PC13PC14PC15PC16PC17PC18Pop
hc1_2-0.169157009 0.32979333 -0.0925416296 0.50520867 -0.294133348 0.26493543 -0.06140564 -0.006872229 0.04954539 -0.03079583 0.16689447 -0.019328195 -0.35883554 0.29791521 -0.122308974 -0.111055481 -0.375556625 0.144280831 HC
hc1_4-0.271757005 -0.34897316 -0.2228580558 0.07179135 0.140243789 0.05948359 0.23457074 -0.431635637 0.58806689 -0.31398495 -0.05176609 0.072893855 -0.08775922 -0.04412887 0.022912601 -0.146603418 0.022401047 -0.020879124 HC
hc2_15-0.071626776 -0.08966038 -0.0397213904-0.09019719 0.008315516 -0.08914973 -0.13072719 -0.577920092 -0.10440079 0.71562241 -0.01947412 0.022977518 -0.18519793 0.22058976 0.071916895 -0.041643594 0.064886645 0.004739740 HC
hc2_17-0.298667880 -0.46065814 0.1709509477 0.50626010 0.477551173 -0.14189869 -0.12229645 0.248062189 -0.22116443 0.09107086 0.10440924 -0.118132992 -0.02734497 0.03632917 0.021668054 -0.059933906 -0.003101467 0.013541656 HC
hc3_1-0.256597468 -0.26360918 -0.1601116901-0.25708967 -0.429531489 -0.01684808 -0.01589535 0.228308475 -0.20952811 -0.03248925 0.06665936 -0.095151431 -0.09836574 0.02562810 0.010099832 -0.650256453 0.209656174 -0.009993740 HC
hc3_10-0.065071796 0.29433533 -0.1021351748 0.05611128 0.049239062 -0.72913326 0.52168553 -0.007792869 -0.08157511 -0.05036670 0.21913731 0.012004485 -0.11929153 0.11770265 0.002913704 -0.032029642 0.047016400 -0.024292181 HC
hc3_11-0.125488142 0.01373902 0.8988541987-0.11639476 -0.143160319 -0.01167964 0.05355827 -0.120221710 0.11441735 -0.16274347 0.03271447 -0.005991747 -0.19713438 0.15824428 0.058861658 -0.028844206 0.117987901 -0.042665776 HC
hc3_5-0.143120769 0.25833924 -0.0773359071 0.39861545 -0.166206946 0.13312088 -0.01994485 0.001502115 0.08777540 0.06530467 -0.01659665 -0.105096992 0.07948560 -0.09322778 0.197338624 0.138111410 0.761771924 -0.159821780 HC
hc3_7-0.318831783 -0.31672402 -0.1685526135-0.20871865 -0.302968493 -0.06214955 -0.05833547 0.105436774 -0.07224308 -0.10760250 0.15323289 -0.041115290 -0.18920247 0.18938030 -0.009664227 0.707551090 0.006904790 -0.038549729 HC
ss2_14 0.243532694 -0.05325667 -0.0112504003 0.02660362 0.010415915 0.05821443 -0.07413337 -0.389989744 -0.30450396 -0.25018469 0.43696327 -0.402573092 -0.25713079 -0.44736762 -0.003418302 0.017336869 0.024482783 0.021291751 SS
ss2_18 0.286774500 -0.18692917 0.0607321050 0.07293295 -0.096864786 -0.06099770 -0.01245880 0.204805726 0.33236915 0.26832962 0.32740659 0.276267253 -0.19667989 -0.18134206 -0.581828420 0.009179303 0.205992957 0.002802899 SS
ss2_9 0.217164350 -0.13668071 -0.0359391218 0.12504185 -0.034491284 0.03055297 -0.09330715 -0.158611623 -0.28458837 -0.26532185 0.23254672 0.712236517 0.07543774 0.19810597 0.262751244 -0.030782937 0.152386118 0.174226060 SS
ss3_14 0.284005355 -0.13343295 -0.0453763001 0.14950017 -0.064156555 -0.06947412 0.06337588 0.096367552 -0.11549845 -0.04674180 -0.50853613 0.140984955 -0.58760720 -0.12835534 0.161687478 0.015612647 -0.044728107 -0.413190543 SS
ss3_15 0.008320234 0.25839268 -0.1403021067-0.25629503 0.376373342 -0.17472840 -0.59718904 0.111137946 0.22635953 -0.19824638 0.09764560 -0.041632615 -0.32747893 0.23209975 0.054120731 -0.092662237 0.176367549 -0.032818887 SS
ss3_16 0.252509868 -0.11857098 -0.0431228268 0.08965063 0.019221920 -0.02240088 0.04005040 -0.180833165 -0.20228145 -0.25125342 -0.34995704 -0.255014779 0.06612093 0.46156279 -0.516495088 0.009494130 0.266183507 0.174456979 SS
ss3_20 0.037809434 0.04616455 -0.0589178291-0.24481162 0.392205366 0.53899828 0.49733542 0.176921496 -0.11761826 0.10437955 0.19390864 -0.015883140 -0.25973585 0.22674915 0.052554708 0.009970900 0.158389076 -0.025277071 SS
ss3_3 0.372932794 -0.18422417 -0.0197498927 0.09594917 -0.116801908 -0.01000246 -0.01186712 0.018885450 0.18829491 0.01801898 0.31317672 -0.239693297 0.26625812 0.40494020 0.193512101 -0.069335517 -0.127424254 -0.564333793 SS
ss5_18 0.348617184 -0.18623575 -0.0002393696 0.05522893 -0.120708591 -0.10013944 0.07737120 0.187529089 0.27635145 0.11537599 -0.04890382 -0.260466163 -0.13365163 0.06453839 0.438908477 0.035969106 0.047603910 0.629948894 SS
In [57]:
plot(ep$PC2,gen$PC1, pch=20,cex=1,xlab = "PC2 methylation", ylab = "PC1 SNPs")
title("Filtered methylated sites (PC2) vs filtered SNPs (PC1)")
summary(lm(gen$PC1~ep$PC2),)
R2 = round(summary(lm(gen$PC1~ep$PC2))$r.squared, 4)
#image(dens, col=transp(myPal(300),.7), add=TRUE)
abline(lm(gen$PC1~ep$PC2))
text(30,0.3,label=paste("R^2=",R2))
text(ep$PC2,gen$PC1, labels=rownames(ep), cex=0.9, font=2)
Call:
lm(formula = gen$PC1 ~ ep$PC2)

Residuals:
      Min        1Q    Median        3Q       Max 
-0.212094 -0.088553 -0.005015  0.076451  0.264273 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept)  0.018408   0.032040   0.575    0.574    
ep$PC2      -0.011094   0.001805  -6.147  1.4e-05 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.1359 on 16 degrees of freedom
Multiple R-squared:  0.7025,	Adjusted R-squared:  0.6839 
F-statistic: 37.79 on 1 and 16 DF,  p-value: 1.404e-05
In [58]:
cor.test( ~ ep$PC2 + gen$PC1,
         method = "pearson",
         conf.level = 0.95)
cor.test( ~ ep$PC2 + gen$PC1, 
         method = "spearman",
         continuity = FALSE,
         conf.level = 0.95)
	Pearson's product-moment correlation

data:  ep$PC2 and gen$PC1
t = -6.1473, df = 16, p-value = 1.404e-05
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 -0.9379921 -0.6100194
sample estimates:
       cor 
-0.8381768 
	Spearman's rank correlation rho

data:  ep$PC2 and gen$PC1
S = 1802, p-value < 2.2e-16
alternative hypothesis: true rho is not equal to 0
sample estimates:
       rho 
-0.8596491 

DMLs

In [63]:
# Read in PC scores from epigenetics
ep <- read.table("../../paper-oly-mbdbs-gen/analyses/DMLs/PC-scores-DMLs.tab",
                 header=T, sep="\t",row.names = "sample")

Add sample names to PC score matrix

In [64]:
key = read.csv("../../paper-oly-mbdbs-gen/data/sample-key.csv",colClasses = c("character","character"))
In [65]:
samples = rownames(ep)
mapdf <- data.frame(old=key$X...MBD.FILENAME,new=key$SAMPLE)
rownames(ep) <- mapdf$new[match(samples,mapdf$old)]
In [66]:
ep
PC1PC2PC3PC4PC5PC6PC7PC8PC9PC10PC11PC12PC13PC14PC15PC16PC17PC18
hc1_2 2.58520529 2.52082478 -1.2085058 0.42322452 0.54325748 0.1039631 -0.12694825 0.76376345 -0.96942144 0.26802083 0.28865161 -1.51780782 0.445506029 -0.12625313 0.1374234782-0.405429804 -0.004794839 3.330669e-16
hc1_4 3.37080780 -2.03165236 0.8702686 -1.29204441 1.91824968 -0.4996976 0.47911154 0.31219593 0.35561796 0.80971097 0.02913808 0.74960408 0.446178287 0.07594788 -0.0002381671-0.673094503 -0.135790637 -2.498002e-16
hc2_15 3.86829394 -1.33681409 0.8650964 0.78709933 -1.26146005 0.1721864 0.56342577 -0.88795865 -0.37425513 -0.80642763 0.80413379 0.04505579 0.379095401 -0.90594140 0.3619052557 0.223859298 -0.273138785 1.026956e-15
hc2_17 3.38228858 -0.79641793 -1.4876656 -1.25664040 1.33311484 0.6280170 0.11835032 0.50985459 0.59488917 -1.38658333 -0.59549147 -0.19059968 0.066777983 -0.30390921 -0.9218117044 0.217071952 0.093162128 7.632783e-16
hc3_1 1.81393178 -0.67259628 2.9870456 -0.40833186 -0.96025334 -0.8174120 -1.02180183 0.91534553 -0.92598761 0.98324685 -0.64844127 -0.10305661 -0.135350626 -0.55412374 -0.2333078766 0.277394323 0.218305193 -4.163336e-17
hc3_5 3.15464787 1.72341264 -1.1378371 -0.05172669 1.01491175 -0.8237016 0.87441578 0.89248770 -0.20670025 0.26927195 0.92423115 0.80427163 0.124159134 0.40792642 0.3863396250 0.562323232 0.195555875 -4.059253e-16
hc3_7 3.03723117 -2.21797016 0.5758374 0.82823062 -2.29779396 -0.1390859 0.01737066 0.38127916 0.42014818 -0.85970261 0.16518251 -0.31089880 -0.278629464 0.98636768 0.2374751890-0.291878869 0.210038774 -4.128642e-16
hc3_10 3.12657476 2.05163630 -1.4785213 2.64148283 -0.56936840 0.1551002 -0.50077137 -0.37443424 0.85983088 0.36632514 -0.67663699 0.71482050 -0.818042865 -0.47403996 0.0124800260-0.267051183 0.003900828 6.938894e-17
hc3_11 1.90514384 -0.10053224 -0.2313423 -0.02410457 1.00028093 -1.0818482 -1.08862894 -2.84220098 -0.68173469 0.30161141 -0.22254763 -0.25552827 -0.009977519 0.67586623 -0.2296772481 0.210401103 -0.042430308 5.412337e-16
ss2_9-3.29604933 -0.36696617 -1.1240478 -1.64678552 -0.03140524 -1.1444626 -2.12193257 0.06073731 1.59177805 -0.14269409 0.03296847 -0.21700577 0.298625486 -0.34382530 0.7520590765 0.067221294 0.038251681 -1.665335e-16
ss2_14-4.62761978 0.32016538 0.1023013 1.33200011 -0.43756652 0.8884132 -1.70058349 0.26035659 -0.41445441 -0.16073162 1.10376867 0.65106642 0.598388311 0.08450745 -0.7660601303-0.167222788 0.034365840 -1.290634e-15
ss2_18-3.15964800 -2.94944995 -2.4536016 1.41571219 -0.45780374 -0.4007593 0.56658560 1.03593882 -0.16075042 0.95973890 -0.35935370 -0.32331375 -0.066483322 0.22004619 -0.1606234117 0.315810332 -0.260083443 -9.992007e-16
ss3_3-4.51647335 -1.63423052 -1.3518178 -0.43540364 0.43983796 1.2242943 1.19036209 -1.07786890 -1.10902628 0.03895321 -0.09456160 0.13664437 -0.205271840 -0.48921408 0.3702596694-0.207845136 0.352042417 -1.415534e-15
ss3_14-3.49585065 1.44235134 1.8117922 1.39008328 -0.13611863 -0.6455329 1.87460770 -0.59488368 1.39795831 0.09007089 -0.53640239 -0.27075052 0.968970471 0.05005773 -0.1640301250 0.072471797 0.089704849 -1.000935e-15
ss3_15-0.50826754 -0.06622783 1.9832653 0.06545791 1.59826625 3.0244050 -0.16521858 0.01091822 0.98652309 0.48815728 0.51502137 -0.51785758 -0.760809159 0.19425483 0.1575905249 0.221552299 -0.070317115 -7.077672e-16
ss3_16-2.89036430 1.04776489 1.2228544 0.50719492 1.16406626 0.1998446 -0.39106282 0.80257770 -1.21474302 -1.23994657 -1.09970478 0.45690810 -0.004092271 0.36003195 0.5480006844 0.007993032 -0.191693813 8.673617e-17
ss3_20 0.03871472 2.01719310 -0.6907287 -3.12339634 -2.82958681 1.5596246 0.49255668 -0.24716886 -0.12242973 0.44366221 -0.36019013 0.33441036 0.146179426 0.29177550 -0.1081014256 0.009544983 -0.148261334 -4.996004e-16
ss5_18-3.78856679 1.04950912 0.7456069 -1.15205229 -0.03062846 -2.4033483 0.94016172 0.07906030 -0.02724268 -0.42268378 0.73023433 -0.18596246 -1.195223461 -0.14947506 -0.3796834404-0.173121361 -0.108817312 -4.163336e-17
In [67]:
%expand
#make sure ep and gen in same order
ep <- ep[mbdorder,]
In [69]:
plot(ep$PC1,gen$PC1, pch=20,cex=1,xlab = "PC1 methylation", ylab = "PC1 SNPs")
title("DMLs (PC1) vs filtered SNPs (PC1)")
summary(lm(gen$PC1~ep$PC1),)
R2 = round(summary(lm(gen$PC1~ep$PC1))$r.squared, 4)
#image(dens, col=transp(myPal(300),.7), add=TRUE)
abline(lm(gen$PC1~ep$PC1))
text(3,0.3,label=paste("R^2=",R2))
text(ep$PC1,gen$PC1, labels=rownames(ep), cex=0.9, font=2)
Call:
lm(formula = gen$PC1 ~ ep$PC1)

Residuals:
      Min        1Q    Median        3Q       Max 
-0.148275 -0.052397  0.007205  0.045458  0.180223 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept)  0.018408   0.020769   0.886    0.389    
ep$PC1      -0.069865   0.006601 -10.583 1.24e-08 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.08812 on 16 degrees of freedom
Multiple R-squared:  0.875,	Adjusted R-squared:  0.8672 
F-statistic:   112 on 1 and 16 DF,  p-value: 1.241e-08
In [70]:
cor.test( ~ ep$PC1 + gen$PC1,
         method = "pearson",
         conf.level = 0.95)
cor.test( ~ ep$PC1 + gen$PC1, 
         method = "spearman",
         continuity = FALSE,
         conf.level = 0.95)
	Pearson's product-moment correlation

data:  ep$PC1 and gen$PC1
t = -10.583, df = 16, p-value = 1.241e-08
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 -0.9760352 -0.8318183
sample estimates:
       cor 
-0.9354179 
	Spearman's rank correlation rho

data:  ep$PC1 and gen$PC1
S = 1768, p-value = 2.01e-05
alternative hypothesis: true rho is not equal to 0
sample estimates:
       rho 
-0.8245614 
In [ ]: