---
title: "Extracting Transcript Sequences"
output: html_document
date: "2025-03-26"
---

# Extract Transcript Sequences and make a new FASTA file

Make a new FASTA file that contains only the sequences whose IDs are listed in
`transcript_ids.txt`.

```{r, engine='bash'}
# Use seqtk to extract sequences that correspond to the gene IDs in transcript_ids.txt
/home/shared/seqtk-1.4/seqtk subseq ../data/Botryllus_tozio_genes.fasta ../data/transcript_ids.txt > ../data/noname.fasta

# Print the extracted sequences so they appear in the knitted output
cat ../data/noname.fasta
```

Or you can copy and paste the sequences into NCBI blastx.

# Manually extracting each sequence

We will now pull the transcript sequences of the genes that are unannotated.
Each chunk below prints one record to the console; copy the output and paste it
into blastx to find associated proteins. You can then update the file
`noname.csv` by hand.

How the awk one-liners work:

- `/^>gNNNN([^0-9]|$)/` matches the header line of the wanted record. The
  `([^0-9]|$)` part requires the ID to be followed by a non-digit or by the end
  of the line, so that, for example, `g1294` does not also match `g12940`.
- On a match, `flag` is set and the header is printed; any other header line
  (`/^>/`) clears `flag`.
- While `flag` is set, sequence lines are printed. The variants that use
  `printf "%s"` join the wrapped sequence lines into a single line, and the
  `END {print ""}` adds the final newline.

```{r, engine='bash'}
# g1294, sequence printed with its original line wrapping
awk '/^>g1294([^0-9]|$)/{flag=1; print; next} /^>/{flag=0} flag' ../data/Botryllus_tozio_genes.fasta
```

```{r, engine='bash'}
# g1294, sequence joined onto a single line
awk '/^>g1294([^0-9]|$)/{flag=1; print; next} /^>/{flag=0} flag{printf "%s", $0} END {print ""}' ../data/Botryllus_tozio_genes.fasta
```

```{r, engine='bash'}
# g6611, sequence printed with its original line wrapping
awk '/^>g6611([^0-9]|$)/{flag=1; print; next} /^>/{flag=0} flag' ../data/Botryllus_tozio_genes.fasta
```

```{r, engine='bash'}
awk '/^>g1196([^0-9]|$)/ {flag=1; print; next} /^>/{flag=0} flag{printf "%s", $0} END {print ""}' ../data/Botryllus_tozio_genes.fasta
```

```{r, engine='bash'}
awk '/^>g12050([^0-9]|$)/ {flag=1; print; next} /^>/{flag=0} flag{printf "%s", $0} END {print ""}' ../data/Botryllus_tozio_genes.fasta
```

```{r, engine='bash'}
awk '/^>g12111([^0-9]|$)/ {flag=1; print; next} /^>/{flag=0} flag{printf "%s", $0} END {print ""}' ../data/Botryllus_tozio_genes.fasta
```

```{r, engine='bash'}
awk '/^>g12775([^0-9]|$)/ {flag=1; print; next} /^>/{flag=0} flag{printf "%s", $0} END {print ""}' ../data/Botryllus_tozio_genes.fasta
```

```{r, engine='bash'}
awk '/^>g1294([^0-9]|$)/ {flag=1; print; next} /^>/{flag=0} flag{printf "%s", $0} END {print ""}' ../data/Botryllus_tozio_genes.fasta
```

```{r, engine='bash'}
awk '/^>g12982([^0-9]|$)/ {flag=1; print; next} /^>/{flag=0} flag{printf "%s", $0} END {print ""}' ../data/Botryllus_tozio_genes.fasta
```

```{r, engine='bash'}
awk '/^>g13014([^0-9]|$)/ {flag=1; print; next} /^>/{flag=0} flag{printf "%s", $0} END {print ""}' ../data/Botryllus_tozio_genes.fasta
```

```{r, engine='bash'}
awk '/^>g14195([^0-9]|$)/ {flag=1; print; next} /^>/{flag=0} flag{printf "%s", $0} END {print ""}' ../data/Botryllus_tozio_genes.fasta
```

```{r, engine='bash'}
awk '/^>g14703([^0-9]|$)/ {flag=1; print; next} /^>/{flag=0} flag{printf "%s", $0} END {print ""}' ../data/Botryllus_tozio_genes.fasta
```

```{r, engine='bash'}
awk '/^>g14965([^0-9]|$)/ {flag=1; print; next} /^>/{flag=0} flag{printf "%s", $0} END {print ""}' ../data/Botryllus_tozio_genes.fasta
```

```{r, engine='bash'}
awk '/^>g15728([^0-9]|$)/ {flag=1; print; next} /^>/{flag=0} flag{printf "%s", $0} END {print ""}' ../data/Botryllus_tozio_genes.fasta
```

```{r, engine='bash'}
awk '/^>g15986([^0-9]|$)/ {flag=1; print; next} /^>/{flag=0} flag{printf "%s", $0} END {print ""}' ../data/Botryllus_tozio_genes.fasta
```

```{r, engine='bash'}
awk '/^>g16191([^0-9]|$)/ {flag=1; print; next} /^>/{flag=0} flag{printf "%s", $0} END {print ""}' ../data/Botryllus_tozio_genes.fasta
```

```{r, engine='bash'}
awk '/^>g16471([^0-9]|$)/ {flag=1; print; next} /^>/{flag=0} flag{printf "%s", $0} END {print ""}' ../data/Botryllus_tozio_genes.fasta
```

```{r, engine='bash'}
awk '/^>g1703([^0-9]|$)/ {flag=1; print; next} /^>/{flag=0} flag{printf "%s", $0} END {print ""}' ../data/Botryllus_tozio_genes.fasta
```

```{r, engine='bash'}
awk '/^>g1744([^0-9]|$)/ {flag=1; print; next} /^>/{flag=0} flag{printf "%s", $0} END {print ""}' ../data/Botryllus_tozio_genes.fasta
```

```{r, engine='bash'}
awk '/^>g2472([^0-9]|$)/ {flag=1; print; next} /^>/{flag=0} flag{printf "%s", $0} END {print ""}' ../data/Botryllus_tozio_genes.fasta
```

```{r, engine='bash'}
awk '/^>g2473([^0-9]|$)/ {flag=1; print; next} /^>/{flag=0} flag{printf "%s", $0} END {print ""}' ../data/Botryllus_tozio_genes.fasta
```

```{r, engine='bash'}
awk '/^>g4208([^0-9]|$)/ {flag=1; print; next} /^>/{flag=0} flag{printf "%s", $0} END {print ""}' ../data/Botryllus_tozio_genes.fasta
```

```{r, engine='bash'}
awk '/^>g5661([^0-9]|$)/ {flag=1; print; next} /^>/{flag=0} flag{printf "%s", $0} END {print ""}' ../data/Botryllus_tozio_genes.fasta
```

```{r, engine='bash'}
awk '/^>g6611([^0-9]|$)/ {flag=1; print; next} /^>/{flag=0} flag{printf "%s", $0} END {print ""}' ../data/Botryllus_tozio_genes.fasta
```

```{r, engine='bash'}
awk '/^>g7259([^0-9]|$)/ {flag=1; print; next} /^>/{flag=0} flag{printf "%s", $0} END {print ""}' ../data/Botryllus_tozio_genes.fasta
```

```{r, engine='bash'}
awk '/^>g7613([^0-9]|$)/ {flag=1; print; next} /^>/{flag=0} flag{printf "%s", $0} END {print ""}' ../data/Botryllus_tozio_genes.fasta
```

```{r, engine='bash'}
awk '/^>g8170([^0-9]|$)/ {flag=1; print; next} /^>/{flag=0} flag{printf "%s", $0} END {print ""}' ../data/Botryllus_tozio_genes.fasta
```

```{r, engine='bash'}
awk '/^>g9341([^0-9]|$)/ {flag=1; print; next} /^>/{flag=0} flag{printf "%s", $0} END {print ""}' ../data/Botryllus_tozio_genes.fasta
```

```{r, engine='bash'}
awk '/^>g9597([^0-9]|$)/ {flag=1; print; next} /^>/{flag=0} flag{printf "%s", $0} END {print ""}' ../data/Botryllus_tozio_genes.fasta
```