31-transcript-viz

library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──

## ✔ ggplot2 3.3.6     ✔ purrr   0.3.4
## ✔ tibble  3.1.7     ✔ dplyr   1.0.9
## ✔ tidyr   1.2.0     ✔ stringr 1.4.0
## ✔ readr   2.1.2     ✔ forcats 0.5.1

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

Here I want to explore the best way to visualize the transcript presence data…. eventually this will be integrated with DNA methylation data. Will start out looking out males only.

cd ../data

curl -O https://raw.githubusercontent.com/epigeneticstoocean/2018_L18-adult-methylation/main/analyses/transcript_counts_per_gene_per_sample_controls_males.csv

curl -O https://raw.githubusercontent.com/epigeneticstoocean/2018_L18-adult-methylation/main/analyses/transcript_counts_per_gene_per_sample_exposed_males.csv

##   % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
##                                  Dload  Upload   Total   Spent    Left  Speed
## 
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100 1376k  100 1376k    0     0  24.4M      0 --:--:-- --:--:-- --:--:-- 24.4M
##   % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
##                                  Dload  Upload   Total   Spent    Left  Speed
## 
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100 1774k  100 1774k    0     0  26.2M      0 --:--:-- --:--:-- --:--:-- 26.2M

m_con <- read.csv("../data/transcript_counts_per_gene_per_sample_controls_males.csv")

head(m_con)

##      gene_name transcript_counts.S13M transcript_counts.S64M
## 1         ATP6                      1                      1
## 2         COX1                      1                      1
## 3         COX2                      1                      1
## 4         COX3                      1                      1
## 5         CYTB                      1                      1
## 6 LOC111099029                      1                      1
##   transcript_counts.S6M transcript_counts.S7M
## 1                     1                     1
## 2                     1                     1
## 3                     1                     1
## 4                     1                     1
## 5                     1                     1
## 6                     1                     1
##   control_males_sum_transcript_counts control_males_median_transcript_counts
## 1                                   4                                      1
## 2                                   4                                      1
## 3                                   4                                      1
## 4                                   4                                      1
## 5                                   4                                      1
## 6                                   4                                      1
##   control_males_mean_transcript_counts control_males_max_transcript_counts
## 1                                    1                                   1
## 2                                    1                                   1
## 3                                    1                                   1
## 4                                    1                                   1
## 5                                    1                                   1
## 6                                    1                                   1
##   control_males_min_transcript_counts control_males_std_dev_transcript_counts
## 1                                   1                                       0
## 2                                   1                                       0
## 3                                   1                                       0
## 4                                   1                                       0
## 5                                   1                                       0
## 6                                   1                                       0

m_exp <- read.csv("../data/transcript_counts_per_gene_per_sample_exposed_males.csv")

head(m_exp)

##      gene_name transcript_counts.S12M transcript_counts.S23M
## 1         ATP6                      1                      1
## 2         COX1                      1                      1
## 3         COX2                      1                      1
## 4         COX3                      1                      1
## 5         CYTB                      1                      1
## 6 LOC111099029                      1                      1
##   transcript_counts.S31M transcript_counts.S48M transcript_counts.S59M
## 1                      1                      1                      1
## 2                      1                      1                      1
## 3                      1                      1                      1
## 4                      1                      1                      1
## 5                      1                      1                      1
## 6                      1                      1                      1
##   transcript_counts.S9M exposed_males_sum_transcript_counts
## 1                     1                                   6
## 2                     1                                   6
## 3                     1                                   6
## 4                     1                                   6
## 5                     1                                   6
## 6                     1                                   6
##   exposed_males_median_transcript_counts exposed_males_mean_transcript_counts
## 1                                      1                                    1
## 2                                      1                                    1
## 3                                      1                                    1
## 4                                      1                                    1
## 5                                      1                                    1
## 6                                      1                                    1
##   exposed_males_max_transcript_counts exposed_males_min_transcript_counts
## 1                                   1                                   1
## 2                                   1                                   1
## 3                                   1                                   1
## 4                                   1                                   1
## 5                                   1                                   1
## 6                                   1                                   1
##   exposed_males_std_dev_transcript_counts
## 1                                       0
## 2                                       0
## 3                                       0
## 4                                       0
## 5                                       0
## 6                                       0

males <- inner_join(m_con, m_exp, by = "gene_name")

ggplot(m_con, aes(x = control_males_max_transcript_counts)) +
  geom_histogram(bins = 80) +
  xlim(0,50) +
  scale_y_log10()

## Warning: Transformation introduced infinite values in continuous y-axis

## Warning: Removed 53 rows containing missing values (geom_bar).

ggplot(m_exp, aes(x = exposed_males_max_transcript_counts)) +
  geom_histogram(bins = 80) +
  xlim(0,50) +
  scale_y_log10()

## Warning: Transformation introduced infinite values in continuous y-axis

## Warning: Removed 52 rows containing missing values (geom_bar).

#max <- males %>% 
  #pivot_longer(c(`control_males_max_transcript_counts`, `exposed_males_max_transcript_counts`), names_to = "max", values_to = "counts")

ggplot() +
  geom_histogram(data = m_con, aes(x = control_males_max_transcript_counts), fill = "blue", alpha = 0.2) +
  geom_histogram(data = m_exp, aes(x = exposed_males_max_transcript_counts), fill = "green", alpha = 0.2) +
  scale_y_log10()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Transformation introduced infinite values in continuous y-axis
## Transformation introduced infinite values in continuous y-axis

## Warning: Removed 3 rows containing missing values (geom_bar).
## Removed 3 rows containing missing values (geom_bar).

m2 <- males %>% mutate(diffmean = abs(control_males_mean_transcript_counts - exposed_males_mean_transcript_counts)) %>% 
  mutate(diffmed = abs(control_males_median_transcript_counts - exposed_males_median_transcript_counts)) %>%
  mutate(covar_exp = exposed_males_std_dev_transcript_counts / exposed_males_mean_transcript_counts) %>%
  mutate(covar_con = control_males_std_dev_transcript_counts / control_males_mean_transcript_counts)

library("RColorBrewer")

myPalette <- colorRampPalette(rev(brewer.pal(11, "Spectral")))
sc <- scale_colour_gradientn(colours = myPalette(100), limits=c(1, 4))

ggplot(m2, aes(x = control_males_mean_transcript_counts, y = exposed_males_mean_transcript_counts, color = diffmean)) +
  geom_point(alpha = 0.6, size = 3) +
  geom_smooth(method = "lm") +
  geom_abline(size=1.0,colour="red") +
  sc

## `geom_smooth()` using formula 'y ~ x'

 # ylim(0,20) +
  #xlim(0,20)

myPalette <- colorRampPalette(rev(brewer.pal(11, "Spectral")))
sc <- scale_colour_gradientn(colours = myPalette(100), limits=c(0, 3))

ggplot(m2, aes(x = control_males_median_transcript_counts, y = exposed_males_median_transcript_counts, color = diffmean)) +
  #geom_point(alpha = 0.1, size = 1) +
  geom_smooth(method = "lm") +
  geom_abline(size=1.0,colour="red") +
  geom_jitter(alpha = 0.3, size =0.5) +
  sc

## `geom_smooth()` using formula 'y ~ x'

myPalette <- colorRampPalette(rev(brewer.pal(11, "Spectral")))
sc <- scale_colour_gradientn(colours = myPalette(100), limits=c(0, 4))

ggplot(m2, aes(x = covar_con, y = covar_exp, color = diffmean)) +
  geom_point(alpha = 0.2, size = 2) +
  geom_smooth(method = "lm") +
  geom_abline(size=1.0,colour="red") +
  sc

## `geom_smooth()` using formula 'y ~ x'

## Warning: Removed 725 rows containing non-finite values (stat_smooth).

## Warning: Removed 725 rows containing missing values (geom_point).