# do once
# install.packages("R.utils")
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(R.utils)
## Loading required package: R.oo
## Warning: package 'R.oo' was built under R version 4.3.2
## Loading required package: R.methodsS3
## R.methodsS3 v1.8.2 (2022-06-13 22:00:14 UTC) successfully loaded. See ?R.methodsS3 for help.
## R.oo v1.26.0 (2024-01-24 05:12:50 UTC) successfully loaded. See ?R.oo for help.
##
## Attaching package: 'R.oo'
##
## The following object is masked from 'package:R.methodsS3':
##
## throw
##
## The following objects are masked from 'package:methods':
##
## getClasses, getMethods
##
## The following objects are masked from 'package:base':
##
## attach, detach, load, save
##
## R.utils v2.12.3 (2023-11-18 01:00:02 UTC) successfully loaded. See ?R.utils for help.
##
## Attaching package: 'R.utils'
##
## The following object is masked from 'package:tidyr':
##
## extract
##
## The following object is masked from 'package:utils':
##
## timestamp
##
## The following objects are masked from 'package:base':
##
## cat, commandArgs, getOption, isOpen, nullfile, parse, warnings
library(dplyr)
The DEG list, stats, and counts, come from the deseq2 output with an padj > 0.05. Genes were pre filtered before deseq2 those that met the parameter of showing at least 8 individuals (over half) from each treatment (control n = 15, treatment n = 15) had 23 (median read count, after exluding all 0 values) reads. This was to limit outlier individuals in the data and get a padj that was more reflective of the actual differences is expression across treatments.
While counts aren’t needed for this, I chose to add it to the DEG matrix I’m creating so I could have a data frame that had all relevant data for the DEG. This is also to help minimize the number of dataframes/files to keep track of, as all of the DEG info is store in one. If I want a more select dataframe later on, I can create it. This reasoning is perhaps best compared to when folks create a “laundry list CV” that includes everything, then they create smaller CVs for each individual job. Could skip this step if you wanted to and just proceed with stats, or even just a list of the DEG names.
# for all counts: ../output/0807-logcounts_allDEG_8ind23r_ToC.csv
# for DEG stats: ../output/0807-DEGstats_ToC_8ind23r.tab
#read in count results
counts <- read.csv("../output/0807-logcounts_allDEG_8ind23r_ToC.csv", row.names = 1)
head(counts)
## M.C.193 M.C.216 M.C.218 M.C.226 M.C.306 M.C.329 M.C.334
## LOC132758254 9.018234 8.991358 7.577010 7.789053 7.618000 7.903490 7.785037
## LOC132745822 7.546969 8.093778 8.444167 8.048840 9.286832 9.820501 7.614251
## LOC132746399 5.276290 6.314388 7.936010 7.727373 6.929499 8.016098 6.245973
## LOC132746792 5.789978 6.525297 5.486501 5.769676 6.385635 7.139049 6.165775
## LOC132720785 5.407278 3.874463 3.474487 4.185148 3.179462 4.353156 4.736047
## LOC132725056 7.031642 7.395884 6.558978 7.200879 6.139855 5.858405 5.528054
## M.C.337 M.C.339 M.C.358 M.C.360 M.C.363 M.C.373 M.C.482
## LOC132758254 8.362197 8.102469 7.730976 7.431279 8.489877 6.870841 7.056003
## LOC132745822 8.489700 6.665863 7.782664 7.177897 8.498897 6.770201 8.030656
## LOC132746399 6.409642 5.553355 6.873344 7.109438 5.692989 7.077499 5.222981
## LOC132746792 6.409642 5.398225 7.458745 5.432316 6.991711 6.799679 5.763178
## LOC132720785 5.329371 3.402581 3.944095 4.941024 3.968640 4.973514 4.657204
## LOC132725056 6.074927 6.692819 6.392831 6.916854 5.924841 6.473688 6.463304
## M.C.488 M.T.18 M.T.20 M.T.22 M.T.235 M.T.245 M.T.29
## LOC132758254 7.866917 7.002917 6.278039 6.744291 7.172415 6.699427 6.653879
## LOC132745822 8.887287 6.953388 5.858643 6.806725 7.443312 6.929311 6.682167
## LOC132746399 6.724682 4.798929 4.682751 5.706205 5.549657 5.462240 6.317607
## LOC132746792 6.372353 4.325212 4.101964 5.247309 5.513920 5.108990 5.487575
## LOC132720785 5.076312 5.817759 6.399378 6.957439 4.356273 4.640188 5.281139
## LOC132725056 5.981913 5.119464 4.101964 5.095524 6.653936 5.296400 3.455454
## M.T.31 M.T.32 M.T.399 M.T.43 M.T.44 M.T.500 M.T.7
## LOC132758254 7.674504 6.960515 6.236251 6.410440 6.419648 6.607779 7.169018
## LOC132745822 6.836827 6.504231 6.675271 7.866451 6.984149 7.760471 6.843521
## LOC132746399 5.470048 5.218768 5.602092 6.129826 5.613907 6.159865 5.885884
## LOC132746792 4.901269 5.370087 5.188391 5.218074 5.850376 6.092670 4.609669
## LOC132720785 6.237069 4.898547 6.252709 7.016638 7.447696 5.158157 4.870806
## LOC132725056 4.789809 5.049696 4.324042 4.892587 5.200124 5.506812 4.561152
## M.T.83 M.T.8
## LOC132758254 6.848184 7.134442
## LOC132745822 7.864893 5.499278
## LOC132746399 4.885269 4.420540
## LOC132746792 5.927677 4.633276
## LOC132720785 6.186756 4.565791
## LOC132725056 6.689869 4.982870
# add in stats
stats <- read.csv("../output/0807-DEGstats_ToC_8ind23r.tab", sep="")
head(stats)
## baseMean log2FoldChange lfcSE stat pvalue
## LOC132742068 68.04920 -1.4845094 0.3347656 -4.434474 9.229718e-06
## LOC132738510 114.22423 -0.9442138 0.2535605 -3.723821 1.962304e-04
## LOC132727777 27.05036 1.8022735 0.4081725 4.415470 1.007906e-05
## LOC132728236 169.99708 0.5275304 0.1333260 3.956695 7.599388e-05
## LOC132740341 163.84395 -1.0096084 0.2543228 -3.969790 7.193589e-05
## LOC132741452 3020.53904 -0.6475836 0.1418720 -4.564563 5.005354e-06
## padj
## LOC132742068 0.011302182
## LOC132738510 0.048396396
## LOC132727777 0.011302182
## LOC132728236 0.031379420
## LOC132740341 0.031379420
## LOC132741452 0.007098218
The next step to creating our ’DEG masterfile is prep to join them (could be skipped by using the join_by( a == b) in the actual joining step, but I already wrote this chunk so..
# make column with LOC names
LOCnames <- rownames(counts)
counts$LOC <- LOCnames
head(counts)
## M.C.193 M.C.216 M.C.218 M.C.226 M.C.306 M.C.329 M.C.334
## LOC132758254 9.018234 8.991358 7.577010 7.789053 7.618000 7.903490 7.785037
## LOC132745822 7.546969 8.093778 8.444167 8.048840 9.286832 9.820501 7.614251
## LOC132746399 5.276290 6.314388 7.936010 7.727373 6.929499 8.016098 6.245973
## LOC132746792 5.789978 6.525297 5.486501 5.769676 6.385635 7.139049 6.165775
## LOC132720785 5.407278 3.874463 3.474487 4.185148 3.179462 4.353156 4.736047
## LOC132725056 7.031642 7.395884 6.558978 7.200879 6.139855 5.858405 5.528054
## M.C.337 M.C.339 M.C.358 M.C.360 M.C.363 M.C.373 M.C.482
## LOC132758254 8.362197 8.102469 7.730976 7.431279 8.489877 6.870841 7.056003
## LOC132745822 8.489700 6.665863 7.782664 7.177897 8.498897 6.770201 8.030656
## LOC132746399 6.409642 5.553355 6.873344 7.109438 5.692989 7.077499 5.222981
## LOC132746792 6.409642 5.398225 7.458745 5.432316 6.991711 6.799679 5.763178
## LOC132720785 5.329371 3.402581 3.944095 4.941024 3.968640 4.973514 4.657204
## LOC132725056 6.074927 6.692819 6.392831 6.916854 5.924841 6.473688 6.463304
## M.C.488 M.T.18 M.T.20 M.T.22 M.T.235 M.T.245 M.T.29
## LOC132758254 7.866917 7.002917 6.278039 6.744291 7.172415 6.699427 6.653879
## LOC132745822 8.887287 6.953388 5.858643 6.806725 7.443312 6.929311 6.682167
## LOC132746399 6.724682 4.798929 4.682751 5.706205 5.549657 5.462240 6.317607
## LOC132746792 6.372353 4.325212 4.101964 5.247309 5.513920 5.108990 5.487575
## LOC132720785 5.076312 5.817759 6.399378 6.957439 4.356273 4.640188 5.281139
## LOC132725056 5.981913 5.119464 4.101964 5.095524 6.653936 5.296400 3.455454
## M.T.31 M.T.32 M.T.399 M.T.43 M.T.44 M.T.500 M.T.7
## LOC132758254 7.674504 6.960515 6.236251 6.410440 6.419648 6.607779 7.169018
## LOC132745822 6.836827 6.504231 6.675271 7.866451 6.984149 7.760471 6.843521
## LOC132746399 5.470048 5.218768 5.602092 6.129826 5.613907 6.159865 5.885884
## LOC132746792 4.901269 5.370087 5.188391 5.218074 5.850376 6.092670 4.609669
## LOC132720785 6.237069 4.898547 6.252709 7.016638 7.447696 5.158157 4.870806
## LOC132725056 4.789809 5.049696 4.324042 4.892587 5.200124 5.506812 4.561152
## M.T.83 M.T.8 LOC
## LOC132758254 6.848184 7.134442 LOC132758254
## LOC132745822 7.864893 5.499278 LOC132745822
## LOC132746399 4.885269 4.420540 LOC132746399
## LOC132746792 5.927677 4.633276 LOC132746792
## LOC132720785 6.186756 4.565791 LOC132720785
## LOC132725056 6.689869 4.982870 LOC132725056
LOCstat <- rownames(stats)
stats$LOC <- LOCstat
head(stats)
## baseMean log2FoldChange lfcSE stat pvalue
## LOC132742068 68.04920 -1.4845094 0.3347656 -4.434474 9.229718e-06
## LOC132738510 114.22423 -0.9442138 0.2535605 -3.723821 1.962304e-04
## LOC132727777 27.05036 1.8022735 0.4081725 4.415470 1.007906e-05
## LOC132728236 169.99708 0.5275304 0.1333260 3.956695 7.599388e-05
## LOC132740341 163.84395 -1.0096084 0.2543228 -3.969790 7.193589e-05
## LOC132741452 3020.53904 -0.6475836 0.1418720 -4.564563 5.005354e-06
## padj LOC
## LOC132742068 0.011302182 LOC132742068
## LOC132738510 0.048396396 LOC132738510
## LOC132727777 0.011302182 LOC132727777
## LOC132728236 0.031379420 LOC132728236
## LOC132740341 0.031379420 LOC132740341
## LOC132741452 0.007098218 LOC132741452
Finally join them
# joining stats and counts
DEG <- left_join(x = counts, y = stats, by = "LOC")
head(DEG)
## M.C.193 M.C.216 M.C.218 M.C.226 M.C.306 M.C.329 M.C.334 M.C.337
## 1 9.018234 8.991358 7.577010 7.789053 7.618000 7.903490 7.785037 8.362197
## 2 7.546969 8.093778 8.444167 8.048840 9.286832 9.820501 7.614251 8.489700
## 3 5.276290 6.314388 7.936010 7.727373 6.929499 8.016098 6.245973 6.409642
## 4 5.789978 6.525297 5.486501 5.769676 6.385635 7.139049 6.165775 6.409642
## 5 5.407278 3.874463 3.474487 4.185148 3.179462 4.353156 4.736047 5.329371
## 6 7.031642 7.395884 6.558978 7.200879 6.139855 5.858405 5.528054 6.074927
## M.C.339 M.C.358 M.C.360 M.C.363 M.C.373 M.C.482 M.C.488 M.T.18
## 1 8.102469 7.730976 7.431279 8.489877 6.870841 7.056003 7.866917 7.002917
## 2 6.665863 7.782664 7.177897 8.498897 6.770201 8.030656 8.887287 6.953388
## 3 5.553355 6.873344 7.109438 5.692989 7.077499 5.222981 6.724682 4.798929
## 4 5.398225 7.458745 5.432316 6.991711 6.799679 5.763178 6.372353 4.325212
## 5 3.402581 3.944095 4.941024 3.968640 4.973514 4.657204 5.076312 5.817759
## 6 6.692819 6.392831 6.916854 5.924841 6.473688 6.463304 5.981913 5.119464
## M.T.20 M.T.22 M.T.235 M.T.245 M.T.29 M.T.31 M.T.32 M.T.399
## 1 6.278039 6.744291 7.172415 6.699427 6.653879 7.674504 6.960515 6.236251
## 2 5.858643 6.806725 7.443312 6.929311 6.682167 6.836827 6.504231 6.675271
## 3 4.682751 5.706205 5.549657 5.462240 6.317607 5.470048 5.218768 5.602092
## 4 4.101964 5.247309 5.513920 5.108990 5.487575 4.901269 5.370087 5.188391
## 5 6.399378 6.957439 4.356273 4.640188 5.281139 6.237069 4.898547 6.252709
## 6 4.101964 5.095524 6.653936 5.296400 3.455454 4.789809 5.049696 4.324042
## M.T.43 M.T.44 M.T.500 M.T.7 M.T.83 M.T.8 LOC baseMean
## 1 6.410440 6.419648 6.607779 7.169018 6.848184 7.134442 LOC132758254 187.73392
## 2 7.866451 6.984149 7.760471 6.843521 7.864893 5.499278 LOC132745822 225.66028
## 3 6.129826 5.613907 6.159865 5.885884 4.885269 4.420540 LOC132746399 80.87262
## 4 5.218074 5.850376 6.092670 4.609669 5.927677 4.633276 LOC132746792 60.55140
## 5 7.016638 7.447696 5.158157 4.870806 6.186756 4.565791 LOC132720785 43.73189
## 6 4.892587 5.200124 5.506812 4.561152 6.689869 4.982870 LOC132725056 65.01466
## log2FoldChange lfcSE stat pvalue padj
## 1 -1.188610 0.2003729 -5.931989 2.992873e-09 3.395415e-05
## 2 -1.299604 0.2742359 -4.738998 2.147771e-06 3.929306e-03
## 3 -1.324071 0.2742349 -4.828235 1.377486e-06 3.929306e-03
## 4 -1.141412 0.2388462 -4.778857 1.762949e-06 3.929306e-03
## 5 1.570396 0.3214278 4.885689 1.030675e-06 3.929306e-03
## 6 -1.269575 0.2692982 -4.714384 2.424428e-06 3.929306e-03
I want to get associated gene info for the DEGs. To do this, I read in annotated genome file. Its named blast here because originally it was a blast file, but I didn’t want to go and edit all the following chunks. Since R.Phil has a published annotation, we are using that to get info about our DEGs.
It is important to note that I originally used a blast file but switched because not all of the genes were getting a hit. These genes, however, when searched for on the annotated genome for rphil, did have matches. So I switched to the published annotation file.
The goal is to eventually get some biological process information about or DEGs.
# read in annotationg
blast <- read.delim("../data/ncbi_dataset.tsv")
head(blast)
## Accession Begin End Chromosome Orientation Name
## 1 NC_031332.1 1762 3330 MT plus cytochrome c oxidase subunit II
## 2 NC_031332.1 3397 3461 MT plus tRNA-Pro
## 3 NC_031332.1 3438 4685 MT plus cytochrome b
## 4 NC_031332.1 4686 6093 MT plus 16S ribosomal RNA
## 5 NC_031332.1 6094 7452 MT plus NADH dehydrogenase subunit 4
## 6 NC_031332.1 7453 7514 MT plus tRNA-His
## Symbol Gene.ID Gene.Type Transcripts.accession Transcript.name
## 1 COX2 29141288 protein-coding
## 2 29141300 tRNA
## 3 CYTB 29141301 protein-coding
## 4 29141289 rRNA
## 5 ND4 29141302 protein-coding
## 6 29141290 tRNA
## Protein.accession Protein.name Protein.length Locus.tag
## 1 YP_009305271.1 cytochrome c oxidase subunit II 522 BJM09_gp02
## 2 NA BJM09_gt01
## 3 YP_009305272.1 cytochrome b 415 BJM09_gp13
## 4 NA BJM09_gr02
## 5 YP_009305273.1 NADH dehydrogenase subunit 4 452 BJM09_gp12
## 6 NA BJM09_gt02
Joining annotation file to list of DEGs. Again, this is to keep building our “master list” of DEG info. A smaller dataframe/file could be created with select data if needed – this step is further down in this .rmd , currently commented out until GO terms can be added to the master list.
# join by gene name
DEG_Blast <- left_join(x = DEG, y = blast, join_by("LOC" == "Symbol"))
head(DEG_Blast)
## M.C.193 M.C.216 M.C.218 M.C.226 M.C.306 M.C.329 M.C.334 M.C.337
## 1 9.018234 8.991358 7.577010 7.789053 7.618000 7.903490 7.785037 8.362197
## 2 9.018234 8.991358 7.577010 7.789053 7.618000 7.903490 7.785037 8.362197
## 3 7.546969 8.093778 8.444167 8.048840 9.286832 9.820501 7.614251 8.489700
## 4 7.546969 8.093778 8.444167 8.048840 9.286832 9.820501 7.614251 8.489700
## 5 5.276290 6.314388 7.936010 7.727373 6.929499 8.016098 6.245973 6.409642
## 6 5.789978 6.525297 5.486501 5.769676 6.385635 7.139049 6.165775 6.409642
## M.C.339 M.C.358 M.C.360 M.C.363 M.C.373 M.C.482 M.C.488 M.T.18
## 1 8.102469 7.730976 7.431279 8.489877 6.870841 7.056003 7.866917 7.002917
## 2 8.102469 7.730976 7.431279 8.489877 6.870841 7.056003 7.866917 7.002917
## 3 6.665863 7.782664 7.177897 8.498897 6.770201 8.030656 8.887287 6.953388
## 4 6.665863 7.782664 7.177897 8.498897 6.770201 8.030656 8.887287 6.953388
## 5 5.553355 6.873344 7.109438 5.692989 7.077499 5.222981 6.724682 4.798929
## 6 5.398225 7.458745 5.432316 6.991711 6.799679 5.763178 6.372353 4.325212
## M.T.20 M.T.22 M.T.235 M.T.245 M.T.29 M.T.31 M.T.32 M.T.399
## 1 6.278039 6.744291 7.172415 6.699427 6.653879 7.674504 6.960515 6.236251
## 2 6.278039 6.744291 7.172415 6.699427 6.653879 7.674504 6.960515 6.236251
## 3 5.858643 6.806725 7.443312 6.929311 6.682167 6.836827 6.504231 6.675271
## 4 5.858643 6.806725 7.443312 6.929311 6.682167 6.836827 6.504231 6.675271
## 5 4.682751 5.706205 5.549657 5.462240 6.317607 5.470048 5.218768 5.602092
## 6 4.101964 5.247309 5.513920 5.108990 5.487575 4.901269 5.370087 5.188391
## M.T.43 M.T.44 M.T.500 M.T.7 M.T.83 M.T.8 LOC baseMean
## 1 6.410440 6.419648 6.607779 7.169018 6.848184 7.134442 LOC132758254 187.73392
## 2 6.410440 6.419648 6.607779 7.169018 6.848184 7.134442 LOC132758254 187.73392
## 3 7.866451 6.984149 7.760471 6.843521 7.864893 5.499278 LOC132745822 225.66028
## 4 7.866451 6.984149 7.760471 6.843521 7.864893 5.499278 LOC132745822 225.66028
## 5 6.129826 5.613907 6.159865 5.885884 4.885269 4.420540 LOC132746399 80.87262
## 6 5.218074 5.850376 6.092670 4.609669 5.927677 4.633276 LOC132746792 60.55140
## log2FoldChange lfcSE stat pvalue padj Accession
## 1 -1.188610 0.2003729 -5.931989 2.992873e-09 3.395415e-05 NW_026859905.1
## 2 -1.188610 0.2003729 -5.931989 2.992873e-09 3.395415e-05 NW_026859905.1
## 3 -1.299604 0.2742359 -4.738998 2.147771e-06 3.929306e-03 NW_026856216.1
## 4 -1.299604 0.2742359 -4.738998 2.147771e-06 3.929306e-03 NW_026856216.1
## 5 -1.324071 0.2742349 -4.828235 1.377486e-06 3.929306e-03 NW_026856403.1
## 6 -1.141412 0.2388462 -4.778857 1.762949e-06 3.929306e-03 NW_026856504.1
## Begin End Chromosome Orientation
## 1 204763 227007 plus
## 2 204763 227007 plus
## 3 29641 46037 minus
## 4 29641 46037 minus
## 5 242141 268014 plus
## 6 70333 123083 plus
## Name Gene.ID
## 1 golgin subfamily A member 3-like 132758254
## 2 golgin subfamily A member 3-like 132758254
## 3 titin homolog 132745822
## 4 titin homolog 132745822
## 5 FYVE and coiled-coil domain-containing protein 1-like 132746399
## 6 phospholipase D1-like 132746792
## Gene.Type Transcripts.accession
## 1 protein-coding XM_060749812.1
## 2 protein-coding XM_060749813.1
## 3 protein-coding XM_060734808.1
## 4 protein-coding XM_060734809.1
## 5 protein-coding XM_060735521.1
## 6 protein-coding XM_060736043.1
## Transcript.name Protein.accession
## 1 golgin subfamily A member 3-like, transcript variant X1 XP_060605795.1
## 2 golgin subfamily A member 3-like, transcript variant X2 XP_060605796.1
## 3 titin homolog, transcript variant X1 XP_060590791.1
## 4 titin homolog, transcript variant X2 XP_060590792.1
## 5 FYVE and coiled-coil domain-containing protein 1-like XP_060591504.1
## 6 phospholipase D1-like XP_060592026.1
## Protein.name Protein.length
## 1 golgin subfamily A member 3-like isoform X1 1674
## 2 golgin subfamily A member 3-like isoform X2 1563
## 3 titin homolog isoform X1 1350
## 4 titin homolog isoform X2 1331
## 5 FYVE and coiled-coil domain-containing protein 1-like 2583
## 6 phospholipase D1-like 330
## Locus.tag
## 1
## 2
## 3
## 4
## 5
## 6
# there's multiple entries for each LOC because of different isoforms, but we want just the gene level as our reads weren't thorough enough to give us confident information about isoforms, so we are going to remove duplicates
DEG_Blast <- distinct(DEG_Blast, LOC, .keep_all = TRUE)
head(DEG_Blast)
## M.C.193 M.C.216 M.C.218 M.C.226 M.C.306 M.C.329 M.C.334 M.C.337
## 1 9.018234 8.991358 7.577010 7.789053 7.618000 7.903490 7.785037 8.362197
## 2 7.546969 8.093778 8.444167 8.048840 9.286832 9.820501 7.614251 8.489700
## 3 5.276290 6.314388 7.936010 7.727373 6.929499 8.016098 6.245973 6.409642
## 4 5.789978 6.525297 5.486501 5.769676 6.385635 7.139049 6.165775 6.409642
## 5 5.407278 3.874463 3.474487 4.185148 3.179462 4.353156 4.736047 5.329371
## 6 7.031642 7.395884 6.558978 7.200879 6.139855 5.858405 5.528054 6.074927
## M.C.339 M.C.358 M.C.360 M.C.363 M.C.373 M.C.482 M.C.488 M.T.18
## 1 8.102469 7.730976 7.431279 8.489877 6.870841 7.056003 7.866917 7.002917
## 2 6.665863 7.782664 7.177897 8.498897 6.770201 8.030656 8.887287 6.953388
## 3 5.553355 6.873344 7.109438 5.692989 7.077499 5.222981 6.724682 4.798929
## 4 5.398225 7.458745 5.432316 6.991711 6.799679 5.763178 6.372353 4.325212
## 5 3.402581 3.944095 4.941024 3.968640 4.973514 4.657204 5.076312 5.817759
## 6 6.692819 6.392831 6.916854 5.924841 6.473688 6.463304 5.981913 5.119464
## M.T.20 M.T.22 M.T.235 M.T.245 M.T.29 M.T.31 M.T.32 M.T.399
## 1 6.278039 6.744291 7.172415 6.699427 6.653879 7.674504 6.960515 6.236251
## 2 5.858643 6.806725 7.443312 6.929311 6.682167 6.836827 6.504231 6.675271
## 3 4.682751 5.706205 5.549657 5.462240 6.317607 5.470048 5.218768 5.602092
## 4 4.101964 5.247309 5.513920 5.108990 5.487575 4.901269 5.370087 5.188391
## 5 6.399378 6.957439 4.356273 4.640188 5.281139 6.237069 4.898547 6.252709
## 6 4.101964 5.095524 6.653936 5.296400 3.455454 4.789809 5.049696 4.324042
## M.T.43 M.T.44 M.T.500 M.T.7 M.T.83 M.T.8 LOC baseMean
## 1 6.410440 6.419648 6.607779 7.169018 6.848184 7.134442 LOC132758254 187.73392
## 2 7.866451 6.984149 7.760471 6.843521 7.864893 5.499278 LOC132745822 225.66028
## 3 6.129826 5.613907 6.159865 5.885884 4.885269 4.420540 LOC132746399 80.87262
## 4 5.218074 5.850376 6.092670 4.609669 5.927677 4.633276 LOC132746792 60.55140
## 5 7.016638 7.447696 5.158157 4.870806 6.186756 4.565791 LOC132720785 43.73189
## 6 4.892587 5.200124 5.506812 4.561152 6.689869 4.982870 LOC132725056 65.01466
## log2FoldChange lfcSE stat pvalue padj Accession
## 1 -1.188610 0.2003729 -5.931989 2.992873e-09 3.395415e-05 NW_026859905.1
## 2 -1.299604 0.2742359 -4.738998 2.147771e-06 3.929306e-03 NW_026856216.1
## 3 -1.324071 0.2742349 -4.828235 1.377486e-06 3.929306e-03 NW_026856403.1
## 4 -1.141412 0.2388462 -4.778857 1.762949e-06 3.929306e-03 NW_026856504.1
## 5 1.570396 0.3214278 4.885689 1.030675e-06 3.929306e-03 NW_026862958.1
## 6 -1.269575 0.2692982 -4.714384 2.424428e-06 3.929306e-03 NW_026864281.1
## Begin End Chromosome Orientation
## 1 204763 227007 plus
## 2 29641 46037 minus
## 3 242141 268014 plus
## 4 70333 123083 plus
## 5 82977 120119 plus
## 6 197176 248330 minus
## Name Gene.ID
## 1 golgin subfamily A member 3-like 132758254
## 2 titin homolog 132745822
## 3 FYVE and coiled-coil domain-containing protein 1-like 132746399
## 4 phospholipase D1-like 132746792
## 5 malignant fibrous histiocytoma-amplified sequence 1 homolog 132720785
## 6 serine-rich adhesin for platelets-like 132725056
## Gene.Type Transcripts.accession
## 1 protein-coding XM_060749812.1
## 2 protein-coding XM_060734808.1
## 3 protein-coding XM_060735521.1
## 4 protein-coding XM_060736043.1
## 5 protein-coding XM_060704992.1
## 6 protein-coding XM_060710048.1
## Transcript.name
## 1 golgin subfamily A member 3-like, transcript variant X1
## 2 titin homolog, transcript variant X1
## 3 FYVE and coiled-coil domain-containing protein 1-like
## 4 phospholipase D1-like
## 5 malignant fibrous histiocytoma-amplified sequence 1 homolog, transcript variant X1
## 6 serine-rich adhesin for platelets-like, transcript variant X1
## Protein.accession
## 1 XP_060605795.1
## 2 XP_060590791.1
## 3 XP_060591504.1
## 4 XP_060592026.1
## 5 XP_060560975.1
## 6 XP_060566031.1
## Protein.name
## 1 golgin subfamily A member 3-like isoform X1
## 2 titin homolog isoform X1
## 3 FYVE and coiled-coil domain-containing protein 1-like
## 4 phospholipase D1-like
## 5 malignant fibrous histiocytoma-amplified sequence 1 homolog isoform X1
## 6 serine-rich adhesin for platelets-like isoform X1
## Protein.length Locus.tag
## 1 1674
## 2 1350
## 3 2583
## 4 330
## 5 1231
## 6 2552
# write_delim(DEG_Blast, "../output/0826-DEG_blast_cds_full.tab")
After joining the annotation to the DEGs, I still don’t have something I can take to UniProt to get GO terms with, which is necessary to get to my end goal of having biological proccess information.
I tried to trouble shoot this with the following chunk to see if I could pull the SPID from the full blast output and add it to my annot DEG list. Still 12 SPID short after join. not sure why the discrepency is occuring.
# read in blast full results
blastfull <- read.csv("../output/0821-rphil_blast_cds.tab", sep="")
# select for just the protein id and the saccver / spid info
blastselect <- data.frame(protein = blastfull$protein_id, id = blastfull$saccver)
# join to the deg blast list by matching the protein ids
blastselect_deg <- left_join(DEG_Blast, blastselect, join_by("Protein.accession" == "protein") )
# removing any duplicate LOC entries that were created during the join
blast_id_deg <- distinct(blastselect_deg, LOC, .keep_all = TRUE)
Ignored because they are the steps once SPID information is obtained.
Creating more select dataframe from master list. This one needs to be updated – ignore for now
# make a short file of just the DEG names and other desired data
# DEG_annot <- data.frame( LOC = DEG_Blast$LOC, geneName = DEG_Blast$SPID, baseMean = DEG_Blast$baseMean, log2FoldChange = DEG_Blast$log2FoldChange, pvalue = DEG_Blast$pvalue, padj = DEG_Blast$padj, protein = DEG_Blast$protein, cds = DEG_Blast$cds )
#
# head(DEG_annot)
retrieving uniprot IDs
# retrieve gene names for uniprot id lookup
# this is to get just the numbers from the geneName column for uniprot
# so sp|S8FGV1|LAC... becomes just S8FGV1
# uniprot_id <-DEG_annot$geneName[!is.na(DEG_annot$geneName)]
# head(uniprot_id)
#
# # write to table for uniprot import (or could just copy and paste but i imported a text file to uniprot just for my own santity to make sure everything was included)
# write.table(uniprot_id, "../output/0821-uniprot_id_cds_genelevel_8ind23r_ToC.txt", sep = "\t", row.names = FALSE, col.names = FALSE, quote = FALSE)
#
# # and now for all the uniprot ids at the gene level that blast had
# write.table(blast$SPID, "../output/0821-uniprot_id_cds_genelevel_8ind23r_ToC.txt", sep = "\t", row.names = FALSE, col.names = FALSE, quote = FALSE)
GO annotations
# retrieved our unirpot id GO file from the web interface https://www.uniprot.org/
# importing and unzipping file here
#
# gunzip("../data/idmapping_2024_05_14.tsv.gz")
#
# GO_id <- read.csv("../data/idmapping_2024_05_14.tsv", sep = '\t', header = TRUE, row.names=NULL)
# head(GO_id)
# join GO id info to our DEG_annot dataframe from earlier by the uniprot ids
# clam_GO_annotations <- merge(DEG_annot, GO_id, by.x = "uniprot_id", by.y = "Entry")
# head(clam_GO_annotations)
# siic it looks all good so let's write to file
# write.csv(clam_GO_annotations, "../output/clam_GO_annotations_cds_0514.csv")