{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Generating Genome Feature Tracks" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "In this notebook, I'll use the [NCBI assembly](https://www.ncbi.nlm.nih.gov/assembly/GCF_002022765.2/) and [NCBI Annotation Release 100](https://www.ncbi.nlm.nih.gov/genome/annotation_euk/Crassostrea_virginica/100/) to generate genome feature tracks for the *C. virginica* genome." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 0. Set working directory and variables" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/Users/yaaminivenkataraman/Documents/ceabigr/code\r\n" ] } ], "source": [ "!pwd" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "#!mkdir ../genome-features/" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/Users/yaaminivenkataraman/Documents/ceabigr/genome-features\n" ] } ], "source": [ "cd ../genome-features/" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/opt/homebrew/bin/bedtools\r\n" ] } ], "source": [ "!which bedtools" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "bedtoolsDirectory = \"/opt/homebrew/bin\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 1. Download NCBI assembly" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "I downloaded the GFF from [this link](https://www.ncbi.nlm.nih.gov/genome/?term=txid6565[orgn]). 
Can also curl from FTP links [here](https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/002/022/765/GCF_002022765.2_C_virginica-3.0/)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "##gff-version 3\r\n", "#!gff-spec-version 1.21\r\n", "#!processor NCBI annotwriter\r\n", "#!genome-build C_virginica-3.0\r\n", "#!genome-build-accession NCBI_Assembly:GCF_002022765.2\r\n", "#!annotation-source NCBI Crassostrea virginica Annotation Release 100\r\n", "##sequence-region NC_035780.1 1 65668440\r\n", "##species https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=6565\r\n", "NC_035780.1\tRefSeq\tregion\t1\t65668440\t.\t+\t.\tID=NC_035780.1:1..65668440;Dbxref=taxon:6565;Name=1;chromosome=1;collection-date=22-Mar-2015;country=USA;gbkey=Src;genome=chromosome;isolate=RU13XGHG1-28;isolation-source=Rutgers Haskin Shellfish Research Laboratory inbred lines (NJ);mol_type=genomic DNA;tissue-type=whole sample\r\n", "NC_035780.1\tGnomon\tgene\t13578\t14594\t.\t+\t.\tID=gene-LOC111116054;Dbxref=GeneID:111116054;Name=LOC111116054;gbkey=Gene;gene=LOC111116054;gene_biotype=lncRNA\r\n", "NC_035780.1\tGnomon\tlnc_RNA\t13578\t14594\t.\t+\t.\tID=rna-XR_002636969.1;Parent=gene-LOC111116054;Dbxref=GeneID:111116054,Genbank:XR_002636969.1;Name=XR_002636969.1;gbkey=ncRNA;gene=LOC111116054;model_evidence=Supporting evidence includes similarity to: 100%25 coverage of the annotated genomic feature by RNAseq alignments%2C including 1 sample with support for all annotated introns;product=uncharacterized LOC111116054;transcript_id=XR_002636969.1\r\n", "NC_035780.1\tGnomon\texon\t13578\t13603\t.\t+\t.\tID=exon-XR_002636969.1-1;Parent=rna-XR_002636969.1;Dbxref=GeneID:111116054,Genbank:XR_002636969.1;gbkey=ncRNA;gene=LOC111116054;product=uncharacterized LOC111116054;transcript_id=XR_002636969.1\r\n", 
"NC_035780.1\tGnomon\texon\t14237\t14290\t.\t+\t.\tID=exon-XR_002636969.1-2;Parent=rna-XR_002636969.1;Dbxref=GeneID:111116054,Genbank:XR_002636969.1;gbkey=ncRNA;gene=LOC111116054;product=uncharacterized LOC111116054;transcript_id=XR_002636969.1\r\n", "NC_035780.1\tGnomon\texon\t14557\t14594\t.\t+\t.\tID=exon-XR_002636969.1-3;Parent=rna-XR_002636969.1;Dbxref=GeneID:111116054,Genbank:XR_002636969.1;gbkey=ncRNA;gene=LOC111116054;product=uncharacterized LOC111116054;transcript_id=XR_002636969.1\r\n", "NC_035780.1\tGnomon\tgene\t28961\t33324\t.\t+\t.\tID=gene-LOC111126949;Dbxref=GeneID:111126949;Name=LOC111126949;gbkey=Gene;gene=LOC111126949;gene_biotype=protein_coding\r\n", "NC_035780.1\tGnomon\tmRNA\t28961\t33324\t.\t+\t.\tID=rna-XM_022471938.1;Parent=gene-LOC111126949;Dbxref=GeneID:111126949,Genbank:XM_022471938.1;Name=XM_022471938.1;gbkey=mRNA;gene=LOC111126949;model_evidence=Supporting evidence includes similarity to: 3 Proteins%2C and 100%25 coverage of the annotated genomic feature by RNAseq alignments%2C including 21 samples with support for all annotated introns;product=UNC5C-like protein;transcript_id=XM_022471938.1\r\n", "NC_035780.1\tGnomon\texon\t28961\t29073\t.\t+\t.\tID=exon-XM_022471938.1-1;Parent=rna-XM_022471938.1;Dbxref=GeneID:111126949,Genbank:XM_022471938.1;gbkey=mRNA;gene=LOC111126949;product=UNC5C-like protein;transcript_id=XM_022471938.1\r\n", "NC_035780.1\tGnomon\texon\t30524\t31557\t.\t+\t.\tID=exon-XM_022471938.1-2;Parent=rna-XM_022471938.1;Dbxref=GeneID:111126949,Genbank:XM_022471938.1;gbkey=mRNA;gene=LOC111126949;product=UNC5C-like protein;transcript_id=XM_022471938.1\r\n", "NC_035780.1\tGnomon\texon\t31736\t31887\t.\t+\t.\tID=exon-XM_022471938.1-3;Parent=rna-XM_022471938.1;Dbxref=GeneID:111126949,Genbank:XM_022471938.1;gbkey=mRNA;gene=LOC111126949;product=UNC5C-like protein;transcript_id=XM_022471938.1\r\n", 
"NC_035780.1\tGnomon\texon\t31977\t32565\t.\t+\t.\tID=exon-XM_022471938.1-4;Parent=rna-XM_022471938.1;Dbxref=GeneID:111126949,Genbank:XM_022471938.1;gbkey=mRNA;gene=LOC111126949;product=UNC5C-like protein;transcript_id=XM_022471938.1\r\n", "NC_035780.1\tGnomon\texon\t32959\t33324\t.\t+\t.\tID=exon-XM_022471938.1-5;Parent=rna-XM_022471938.1;Dbxref=GeneID:111126949,Genbank:XM_022471938.1;gbkey=mRNA;gene=LOC111126949;product=UNC5C-like protein;transcript_id=XM_022471938.1\r\n", "NC_035780.1\tGnomon\tCDS\t30535\t31557\t.\t+\t0\tID=cds-XP_022327646.1;Parent=rna-XM_022471938.1;Dbxref=GeneID:111126949,Genbank:XP_022327646.1;Name=XP_022327646.1;gbkey=CDS;gene=LOC111126949;product=UNC5C-like protein;protein_id=XP_022327646.1\r\n", "NC_035780.1\tGnomon\tCDS\t31736\t31887\t.\t+\t0\tID=cds-XP_022327646.1;Parent=rna-XM_022471938.1;Dbxref=GeneID:111126949,Genbank:XP_022327646.1;Name=XP_022327646.1;gbkey=CDS;gene=LOC111126949;product=UNC5C-like protein;protein_id=XP_022327646.1\r\n", "NC_035780.1\tGnomon\tCDS\t31977\t32565\t.\t+\t1\tID=cds-XP_022327646.1;Parent=rna-XM_022471938.1;Dbxref=GeneID:111126949,Genbank:XP_022327646.1;Name=XP_022327646.1;gbkey=CDS;gene=LOC111126949;product=UNC5C-like protein;protein_id=XP_022327646.1\r\n", "NC_035780.1\tGnomon\tCDS\t32959\t33204\t.\t+\t0\tID=cds-XP_022327646.1;Parent=rna-XM_022471938.1;Dbxref=GeneID:111126949,Genbank:XP_022327646.1;Name=XP_022327646.1;gbkey=CDS;gene=LOC111126949;product=UNC5C-like protein;protein_id=XP_022327646.1\r\n", "NC_035780.1\tGnomon\tgene\t43111\t66897\t.\t-\t.\tID=gene-LOC111110729;Dbxref=GeneID:111110729;Name=LOC111110729;gbkey=Gene;gene=LOC111110729;gene_biotype=protein_coding\r\n", "NC_035780.1\tGnomon\tmRNA\t43111\t66897\t.\t-\t.\tID=rna-XM_022447324.1;Parent=gene-LOC111110729;Dbxref=GeneID:111110729,Genbank:XM_022447324.1;Name=XM_022447324.1;gbkey=mRNA;gene=LOC111110729;model_evidence=Supporting evidence includes similarity to: 1 Protein%2C and 100%25 coverage of the annotated genomic feature by RNAseq 
alignments;product=FMRFamide receptor-like%2C transcript variant X1;transcript_id=XM_022447324.1\r\n", "NC_035780.1\tGnomon\texon\t66869\t66897\t.\t-\t.\tID=exon-XM_022447324.1-1;Parent=rna-XM_022447324.1;Dbxref=GeneID:111110729,Genbank:XM_022447324.1;gbkey=mRNA;gene=LOC111110729;product=FMRFamide receptor-like%2C transcript variant X1;transcript_id=XM_022447324.1\r\n", "NC_035780.1\tGnomon\texon\t64123\t64334\t.\t-\t.\tID=exon-XM_022447324.1-2;Parent=rna-XM_022447324.1;Dbxref=GeneID:111110729,Genbank:XM_022447324.1;gbkey=mRNA;gene=LOC111110729;product=FMRFamide receptor-like%2C transcript variant X1;transcript_id=XM_022447324.1\r\n", "NC_035780.1\tGnomon\texon\t43111\t44358\t.\t-\t.\tID=exon-XM_022447324.1-3;Parent=rna-XM_022447324.1;Dbxref=GeneID:111110729,Genbank:XM_022447324.1;gbkey=mRNA;gene=LOC111110729;product=FMRFamide receptor-like%2C transcript variant X1;transcript_id=XM_022447324.1\r\n", "NC_035780.1\tGnomon\tCDS\t64123\t64219\t.\t-\t0\tID=cds-XP_022303032.1;Parent=rna-XM_022447324.1;Dbxref=GeneID:111110729,Genbank:XP_022303032.1;Name=XP_022303032.1;gbkey=CDS;gene=LOC111110729;product=FMRFamide receptor-like isoform X1;protein_id=XP_022303032.1\r\n", "NC_035780.1\tGnomon\tCDS\t43262\t44358\t.\t-\t2\tID=cds-XP_022303032.1;Parent=rna-XM_022447324.1;Dbxref=GeneID:111110729,Genbank:XP_022303032.1;Name=XP_022303032.1;gbkey=CDS;gene=LOC111110729;product=FMRFamide receptor-like isoform X1;protein_id=XP_022303032.1\r\n", "NC_035780.1\tGnomon\tmRNA\t43111\t46506\t.\t-\t.\tID=rna-XM_022447333.1;Parent=gene-LOC111110729;Dbxref=GeneID:111110729,Genbank:XM_022447333.1;Name=XM_022447333.1;gbkey=mRNA;gene=LOC111110729;model_evidence=Supporting evidence includes similarity to: 1 Protein%2C and 100%25 coverage of the annotated genomic feature by RNAseq alignments%2C including 14 samples with support for all annotated introns;product=FMRFamide receptor-like%2C transcript variant X2;transcript_id=XM_022447333.1\r\n", 
"NC_035780.1\tGnomon\texon\t45913\t46506\t.\t-\t.\tID=exon-XM_022447333.1-1;Parent=rna-XM_022447333.1;Dbxref=GeneID:111110729,Genbank:XM_022447333.1;gbkey=mRNA;gene=LOC111110729;product=FMRFamide receptor-like%2C transcript variant X2;transcript_id=XM_022447333.1\r\n", "NC_035780.1\tGnomon\texon\t43111\t44358\t.\t-\t.\tID=exon-XM_022447333.1-2;Parent=rna-XM_022447333.1;Dbxref=GeneID:111110729,Genbank:XM_022447333.1;gbkey=mRNA;gene=LOC111110729;product=FMRFamide receptor-like%2C transcript variant X2;transcript_id=XM_022447333.1\r\n", "NC_035780.1\tGnomon\tCDS\t45913\t45997\t.\t-\t0\tID=cds-XP_022303041.1;Parent=rna-XM_022447333.1;Dbxref=GeneID:111110729,Genbank:XP_022303041.1;Name=XP_022303041.1;gbkey=CDS;gene=LOC111110729;product=FMRFamide receptor-like isoform X2;protein_id=XP_022303041.1\r\n", "NC_035780.1\tGnomon\tCDS\t43262\t44358\t.\t-\t2\tID=cds-XP_022303041.1;Parent=rna-XM_022447333.1;Dbxref=GeneID:111110729,Genbank:XP_022303041.1;Name=XP_022303041.1;gbkey=CDS;gene=LOC111110729;product=FMRFamide receptor-like isoform X2;protein_id=XP_022303041.1\r\n", "NC_035780.1\tGnomon\tgene\t85606\t95254\t.\t-\t.\tID=gene-LOC111112434;Dbxref=GeneID:111112434;Name=LOC111112434;gbkey=Gene;gene=LOC111112434;gene_biotype=protein_coding\r\n", "NC_035780.1\tGnomon\tmRNA\t85606\t95254\t.\t-\t.\tID=rna-XM_022449924.1;Parent=gene-LOC111112434;Dbxref=GeneID:111112434,Genbank:XM_022449924.1;Name=XM_022449924.1;gbkey=mRNA;gene=LOC111112434;model_evidence=Supporting evidence includes similarity to: 7 Proteins%2C and 100%25 coverage of the annotated genomic feature by RNAseq alignments%2C including 13 samples with support for all annotated introns;product=homeobox protein Hox-B7-like;transcript_id=XM_022449924.1\r\n", "NC_035780.1\tGnomon\texon\t94571\t95254\t.\t-\t.\tID=exon-XM_022449924.1-1;Parent=rna-XM_022449924.1;Dbxref=GeneID:111112434,Genbank:XM_022449924.1;gbkey=mRNA;gene=LOC111112434;product=homeobox protein Hox-B7-like;transcript_id=XM_022449924.1\r\n", 
"NC_035780.1\tGnomon\texon\t88423\t88589\t.\t-\t.\tID=exon-XM_022449924.1-2;Parent=rna-XM_022449924.1;Dbxref=GeneID:111112434,Genbank:XM_022449924.1;gbkey=mRNA;gene=LOC111112434;product=homeobox protein Hox-B7-like;transcript_id=XM_022449924.1\r\n", "NC_035780.1\tGnomon\texon\t85606\t85777\t.\t-\t.\tID=exon-XM_022449924.1-3;Parent=rna-XM_022449924.1;Dbxref=GeneID:111112434,Genbank:XM_022449924.1;gbkey=mRNA;gene=LOC111112434;product=homeobox protein Hox-B7-like;transcript_id=XM_022449924.1\r\n", "NC_035780.1\tGnomon\tCDS\t94571\t95042\t.\t-\t0\tID=cds-XP_022305632.1;Parent=rna-XM_022449924.1;Dbxref=GeneID:111112434,Genbank:XP_022305632.1;Name=XP_022305632.1;gbkey=CDS;gene=LOC111112434;product=homeobox protein Hox-B7-like;protein_id=XP_022305632.1\r\n", "NC_035780.1\tGnomon\tCDS\t88423\t88589\t.\t-\t2\tID=cds-XP_022305632.1;Parent=rna-XM_022449924.1;Dbxref=GeneID:111112434,Genbank:XP_022305632.1;Name=XP_022305632.1;gbkey=CDS;gene=LOC111112434;product=homeobox protein Hox-B7-like;protein_id=XP_022305632.1\r\n", "NC_035780.1\tGnomon\tCDS\t85616\t85777\t.\t-\t0\tID=cds-XP_022305632.1;Parent=rna-XM_022449924.1;Dbxref=GeneID:111112434,Genbank:XP_022305632.1;Name=XP_022305632.1;gbkey=CDS;gene=LOC111112434;product=homeobox protein Hox-B7-like;protein_id=XP_022305632.1\r\n", "NC_035780.1\tGnomon\tgene\t99840\t106460\t.\t+\t.\tID=gene-LOC111120752;Dbxref=GeneID:111120752;Name=LOC111120752;gbkey=Gene;gene=LOC111120752;gene_biotype=protein_coding\r\n", "NC_035780.1\tGnomon\tmRNA\t99840\t106460\t.\t+\t.\tID=rna-XM_022461698.1;Parent=gene-LOC111120752;Dbxref=GeneID:111120752,Genbank:XM_022461698.1;Name=XM_022461698.1;gbkey=mRNA;gene=LOC111120752;model_evidence=Supporting evidence includes similarity to: 10 Proteins%2C and 100%25 coverage of the annotated genomic feature by RNAseq alignments%2C including 27 samples with support for all annotated introns;product=ribulose-phosphate 3-epimerase-like;transcript_id=XM_022461698.1\r\n", 
"NC_035780.1\tGnomon\texon\t99840\t100122\t.\t+\t.\tID=exon-XM_022461698.1-1;Parent=rna-XM_022461698.1;Dbxref=GeneID:111120752,Genbank:XM_022461698.1;gbkey=mRNA;gene=LOC111120752;product=ribulose-phosphate 3-epimerase-like;transcript_id=XM_022461698.1\r\n", "NC_035780.1\tGnomon\texon\t100554\t100661\t.\t+\t.\tID=exon-XM_022461698.1-2;Parent=rna-XM_022461698.1;Dbxref=GeneID:111120752,Genbank:XM_022461698.1;gbkey=mRNA;gene=LOC111120752;product=ribulose-phosphate 3-epimerase-like;transcript_id=XM_022461698.1\r\n", "NC_035780.1\tGnomon\texon\t104929\t105063\t.\t+\t.\tID=exon-XM_022461698.1-3;Parent=rna-XM_022461698.1;Dbxref=GeneID:111120752,Genbank:XM_022461698.1;gbkey=mRNA;gene=LOC111120752;product=ribulose-phosphate 3-epimerase-like;transcript_id=XM_022461698.1\r\n", "NC_035780.1\tGnomon\texon\t105528\t105614\t.\t+\t.\tID=exon-XM_022461698.1-4;Parent=rna-XM_022461698.1;Dbxref=GeneID:111120752,Genbank:XM_022461698.1;gbkey=mRNA;gene=LOC111120752;product=ribulose-phosphate 3-epimerase-like;transcript_id=XM_022461698.1\r\n", "NC_035780.1\tGnomon\texon\t106004\t106460\t.\t+\t.\tID=exon-XM_022461698.1-5;Parent=rna-XM_022461698.1;Dbxref=GeneID:111120752,Genbank:XM_022461698.1;gbkey=mRNA;gene=LOC111120752;product=ribulose-phosphate 3-epimerase-like;transcript_id=XM_022461698.1\r\n", "NC_035780.1\tGnomon\tCDS\t99877\t100122\t.\t+\t0\tID=cds-XP_022317406.1;Parent=rna-XM_022461698.1;Dbxref=GeneID:111120752,Genbank:XP_022317406.1;Name=XP_022317406.1;gbkey=CDS;gene=LOC111120752;product=ribulose-phosphate 3-epimerase-like;protein_id=XP_022317406.1\r\n", "NC_035780.1\tGnomon\tCDS\t100554\t100661\t.\t+\t0\tID=cds-XP_022317406.1;Parent=rna-XM_022461698.1;Dbxref=GeneID:111120752,Genbank:XP_022317406.1;Name=XP_022317406.1;gbkey=CDS;gene=LOC111120752;product=ribulose-phosphate 3-epimerase-like;protein_id=XP_022317406.1\r\n", 
"NC_035780.1\tGnomon\tCDS\t104929\t105063\t.\t+\t0\tID=cds-XP_022317406.1;Parent=rna-XM_022461698.1;Dbxref=GeneID:111120752,Genbank:XP_022317406.1;Name=XP_022317406.1;gbkey=CDS;gene=LOC111120752;product=ribulose-phosphate 3-epimerase-like;protein_id=XP_022317406.1\r\n", "NC_035780.1\tGnomon\tCDS\t105528\t105614\t.\t+\t0\tID=cds-XP_022317406.1;Parent=rna-XM_022461698.1;Dbxref=GeneID:111120752,Genbank:XP_022317406.1;Name=XP_022317406.1;gbkey=CDS;gene=LOC111120752;product=ribulose-phosphate 3-epimerase-like;protein_id=XP_022317406.1\r\n", "NC_035780.1\tGnomon\tCDS\t106004\t106120\t.\t+\t0\tID=cds-XP_022317406.1;Parent=rna-XM_022461698.1;Dbxref=GeneID:111120752,Genbank:XP_022317406.1;Name=XP_022317406.1;gbkey=CDS;gene=LOC111120752;product=ribulose-phosphate 3-epimerase-like;protein_id=XP_022317406.1\r\n", "NC_035780.1\tGnomon\tgene\t108305\t110077\t.\t-\t.\tID=gene-LOC111128944;Dbxref=GeneID:111128944;Name=LOC111128944;gbkey=Gene;gene=LOC111128944;gene_biotype=protein_coding;partial=true;start_range=.,108305\r\n", "NC_035780.1\tGnomon\tmRNA\t108305\t110077\t.\t-\t.\tID=rna-XM_022474921.1;Parent=gene-LOC111128944;Dbxref=GeneID:111128944,Genbank:XM_022474921.1;Name=XM_022474921.1;gbkey=mRNA;gene=LOC111128944;model_evidence=Supporting evidence includes similarity to: 2 Proteins%2C and 93%25 coverage of the annotated genomic feature by RNAseq alignments;partial=true;product=mucin-19-like;start_range=.,108305;transcript_id=XM_022474921.1\r\n", "NC_035780.1\tGnomon\texon\t108305\t110077\t.\t-\t.\tID=exon-XM_022474921.1-1;Parent=rna-XM_022474921.1;Dbxref=GeneID:111128944,Genbank:XM_022474921.1;gbkey=mRNA;gene=LOC111128944;partial=true;product=mucin-19-like;start_range=.,108305;transcript_id=XM_022474921.1\r\n", 
"NC_035780.1\tGnomon\tCDS\t108305\t110077\t.\t-\t0\tID=cds-XP_022330629.1;Parent=rna-XM_022474921.1;Dbxref=GeneID:111128944,Genbank:XP_022330629.1;Name=XP_022330629.1;gbkey=CDS;gene=LOC111128944;partial=true;product=mucin-19-like;protein_id=XP_022330629.1;start_range=.,108305\r\n", "NC_035780.1\tGnomon\tgene\t151859\t157536\t.\t+\t.\tID=gene-LOC111128953;Dbxref=GeneID:111128953;Name=LOC111128953;gbkey=Gene;gene=LOC111128953;gene_biotype=protein_coding\r\n", "NC_035780.1\tGnomon\tmRNA\t151859\t157536\t.\t+\t.\tID=rna-XM_022474931.1;Parent=gene-LOC111128953;Dbxref=GeneID:111128953,Genbank:XM_022474931.1;Name=XM_022474931.1;gbkey=mRNA;gene=LOC111128953;model_evidence=Supporting evidence includes similarity to: 1 Protein;product=GATA zinc finger domain-containing protein 14-like;transcript_id=XM_022474931.1\r\n", "NC_035780.1\tGnomon\texon\t151859\t153368\t.\t+\t.\tID=exon-XM_022474931.1-1;Parent=rna-XM_022474931.1;Dbxref=GeneID:111128953,Genbank:XM_022474931.1;gbkey=mRNA;gene=LOC111128953;product=GATA zinc finger domain-containing protein 14-like;transcript_id=XM_022474931.1\r\n", "NC_035780.1\tGnomon\texon\t156764\t157536\t.\t+\t.\tID=exon-XM_022474931.1-2;Parent=rna-XM_022474931.1;Dbxref=GeneID:111128953,Genbank:XM_022474931.1;gbkey=mRNA;gene=LOC111128953;product=GATA zinc finger domain-containing protein 14-like;transcript_id=XM_022474931.1\r\n", "NC_035780.1\tGnomon\tCDS\t151859\t153368\t.\t+\t0\tID=cds-XP_022330639.1;Parent=rna-XM_022474931.1;Dbxref=GeneID:111128953,Genbank:XP_022330639.1;Name=XP_022330639.1;gbkey=CDS;gene=LOC111128953;product=GATA zinc finger domain-containing protein 14-like;protein_id=XP_022330639.1\r\n", "NC_035780.1\tGnomon\tCDS\t156764\t157536\t.\t+\t2\tID=cds-XP_022330639.1;Parent=rna-XM_022474931.1;Dbxref=GeneID:111128953,Genbank:XP_022330639.1;Name=XP_022330639.1;gbkey=CDS;gene=LOC111128953;product=GATA zinc finger domain-containing protein 14-like;protein_id=XP_022330639.1\r\n", 
"NC_035780.1\tGnomon\tgene\t163809\t183798\t.\t-\t.\tID=gene-LOC111105691;Dbxref=GeneID:111105691;Name=LOC111105691;gbkey=Gene;gene=LOC111105691;gene_biotype=protein_coding\r\n", "NC_035780.1\tGnomon\tmRNA\t163809\t183798\t.\t-\t.\tID=rna-XM_022440054.1;Parent=gene-LOC111105691;Dbxref=GeneID:111105691,Genbank:XM_022440054.1;Name=XM_022440054.1;gbkey=mRNA;gene=LOC111105691;model_evidence=Supporting evidence includes similarity to: 100%25 coverage of the annotated genomic feature by RNAseq alignments%2C including 9 samples with support for all annotated introns;product=uncharacterized LOC111105691;transcript_id=XM_022440054.1\r\n", "NC_035780.1\tGnomon\texon\t183732\t183798\t.\t-\t.\tID=exon-XM_022440054.1-1;Parent=rna-XM_022440054.1;Dbxref=GeneID:111105691,Genbank:XM_022440054.1;gbkey=mRNA;gene=LOC111105691;product=uncharacterized LOC111105691;transcript_id=XM_022440054.1\r\n", "NC_035780.1\tGnomon\texon\t163809\t164341\t.\t-\t.\tID=exon-XM_022440054.1-2;Parent=rna-XM_022440054.1;Dbxref=GeneID:111105691,Genbank:XM_022440054.1;gbkey=mRNA;gene=LOC111105691;product=uncharacterized LOC111105691;transcript_id=XM_022440054.1\r\n", "NC_035780.1\tGnomon\tCDS\t163835\t164266\t.\t-\t0\tID=cds-XP_022295762.1;Parent=rna-XM_022440054.1;Dbxref=GeneID:111105691,Genbank:XP_022295762.1;Name=XP_022295762.1;gbkey=CDS;gene=LOC111105691;product=uncharacterized protein LOC111105691;protein_id=XP_022295762.1\r\n", "NC_035780.1\tGnomon\tgene\t164820\t166793\t.\t+\t.\tID=gene-LOC111105685;Dbxref=GeneID:111105685;Name=LOC111105685;gbkey=Gene;gene=LOC111105685;gene_biotype=protein_coding\r\n", "NC_035780.1\tGnomon\tmRNA\t164820\t166793\t.\t+\t.\tID=rna-XM_022440042.1;Parent=gene-LOC111105685;Dbxref=GeneID:111105685,Genbank:XM_022440042.1;Name=XM_022440042.1;gbkey=mRNA;gene=LOC111105685;model_evidence=Supporting evidence includes similarity to: 100%25 coverage of the annotated genomic feature by RNAseq alignments%2C including 4 samples with support for all annotated introns;product=protein 
ANTAGONIST OF LIKE HETEROCHROMATIN PROTEIN 1-like;transcript_id=XM_022440042.1\r\n", "NC_035780.1\tGnomon\texon\t164820\t164941\t.\t+\t.\tID=exon-XM_022440042.1-1;Parent=rna-XM_022440042.1;Dbxref=GeneID:111105685,Genbank:XM_022440042.1;gbkey=mRNA;gene=LOC111105685;product=protein ANTAGONIST OF LIKE HETEROCHROMATIN PROTEIN 1-like;transcript_id=XM_022440042.1\r\n", "NC_035780.1\tGnomon\texon\t165620\t166793\t.\t+\t.\tID=exon-XM_022440042.1-2;Parent=rna-XM_022440042.1;Dbxref=GeneID:111105685,Genbank:XM_022440042.1;gbkey=mRNA;gene=LOC111105685;product=protein ANTAGONIST OF LIKE HETEROCHROMATIN PROTEIN 1-like;transcript_id=XM_022440042.1\r\n", "NC_035780.1\tGnomon\tCDS\t165746\t166681\t.\t+\t0\tID=cds-XP_022295750.1;Parent=rna-XM_022440042.1;Dbxref=GeneID:111105685,Genbank:XP_022295750.1;Name=XP_022295750.1;gbkey=CDS;gene=LOC111105685;product=protein ANTAGONIST OF LIKE HETEROCHROMATIN PROTEIN 1-like;protein_id=XP_022295750.1\r\n", "NC_035780.1\tGnomon\tgene\t169468\t170178\t.\t-\t.\tID=gene-LOC111105702;Dbxref=GeneID:111105702;Name=LOC111105702;gbkey=Gene;gene=LOC111105702;gene_biotype=lncRNA\r\n", "NC_035780.1\tGnomon\tlnc_RNA\t169468\t170178\t.\t-\t.\tID=rna-XR_002635081.1;Parent=gene-LOC111105702;Dbxref=GeneID:111105702,Genbank:XR_002635081.1;Name=XR_002635081.1;gbkey=ncRNA;gene=LOC111105702;model_evidence=Supporting evidence includes similarity to: 100%25 coverage of the annotated genomic feature by RNAseq alignments%2C including 3 samples with support for all annotated introns;product=uncharacterized LOC111105702;transcript_id=XR_002635081.1\r\n", "NC_035780.1\tGnomon\texon\t170129\t170178\t.\t-\t.\tID=exon-XR_002635081.1-1;Parent=rna-XR_002635081.1;Dbxref=GeneID:111105702,Genbank:XR_002635081.1;gbkey=ncRNA;gene=LOC111105702;product=uncharacterized LOC111105702;transcript_id=XR_002635081.1\r\n", 
"NC_035780.1\tGnomon\texon\t169907\t169960\t.\t-\t.\tID=exon-XR_002635081.1-2;Parent=rna-XR_002635081.1;Dbxref=GeneID:111105702,Genbank:XR_002635081.1;gbkey=ncRNA;gene=LOC111105702;product=uncharacterized LOC111105702;transcript_id=XR_002635081.1\r\n", "NC_035780.1\tGnomon\texon\t169622\t169675\t.\t-\t.\tID=exon-XR_002635081.1-3;Parent=rna-XR_002635081.1;Dbxref=GeneID:111105702,Genbank:XR_002635081.1;gbkey=ncRNA;gene=LOC111105702;product=uncharacterized LOC111105702;transcript_id=XR_002635081.1\r\n", "NC_035780.1\tGnomon\texon\t169468\t169508\t.\t-\t.\tID=exon-XR_002635081.1-4;Parent=rna-XR_002635081.1;Dbxref=GeneID:111105702,Genbank:XR_002635081.1;gbkey=ncRNA;gene=LOC111105702;product=uncharacterized LOC111105702;transcript_id=XR_002635081.1\r\n", "NC_035780.1\tGnomon\tgene\t190449\t193594\t.\t-\t.\tID=gene-LOC111133554;Dbxref=GeneID:111133554;Name=LOC111133554;gbkey=Gene;gene=LOC111133554;gene_biotype=protein_coding\r\n", "NC_035780.1\tGnomon\tmRNA\t190449\t193594\t.\t-\t.\tID=rna-XM_022482070.1;Parent=gene-LOC111133554;Dbxref=GeneID:111133554,Genbank:XM_022482070.1;Name=XM_022482070.1;gbkey=mRNA;gene=LOC111133554;model_evidence=Supporting evidence includes similarity to: 100%25 coverage of the annotated genomic feature by RNAseq alignments%2C including 3 samples with support for all annotated introns;product=putative uncharacterized protein DDB_G0277407;transcript_id=XM_022482070.1\r\n", "NC_035780.1\tGnomon\texon\t193257\t193594\t.\t-\t.\tID=exon-XM_022482070.1-1;Parent=rna-XM_022482070.1;Dbxref=GeneID:111133554,Genbank:XM_022482070.1;gbkey=mRNA;gene=LOC111133554;product=putative uncharacterized protein DDB_G0277407;transcript_id=XM_022482070.1\r\n", "NC_035780.1\tGnomon\texon\t190449\t190999\t.\t-\t.\tID=exon-XM_022482070.1-2;Parent=rna-XM_022482070.1;Dbxref=GeneID:111133554,Genbank:XM_022482070.1;gbkey=mRNA;gene=LOC111133554;product=putative uncharacterized protein DDB_G0277407;transcript_id=XM_022482070.1\r\n", 
"NC_035780.1\tGnomon\tCDS\t190493\t190924\t.\t-\t0\tID=cds-XP_022337778.1;Parent=rna-XM_022482070.1;Dbxref=GeneID:111133554,Genbank:XP_022337778.1;Name=XP_022337778.1;gbkey=CDS;gene=LOC111133554;product=putative uncharacterized protein DDB_G0277407;protein_id=XP_022337778.1\r\n", "NC_035780.1\tGnomon\tgene\t204243\t207743\t.\t-\t.\tID=gene-LOC111125466;Dbxref=GeneID:111125466;Name=LOC111125466;gbkey=Gene;gene=LOC111125466;gene_biotype=protein_coding\r\n", "NC_035780.1\tGnomon\tmRNA\t204243\t207743\t.\t-\t.\tID=rna-XM_022469388.1;Parent=gene-LOC111125466;Dbxref=GeneID:111125466,Genbank:XM_022469388.1;Name=XM_022469388.1;gbkey=mRNA;gene=LOC111125466;model_evidence=Supporting evidence includes similarity to: 100%25 coverage of the annotated genomic feature by RNAseq alignments%2C including 11 samples with support for all annotated introns;product=homeobox protein 2-like;transcript_id=XM_022469388.1\r\n", "NC_035780.1\tGnomon\texon\t207388\t207743\t.\t-\t.\tID=exon-XM_022469388.1-1;Parent=rna-XM_022469388.1;Dbxref=GeneID:111125466,Genbank:XM_022469388.1;gbkey=mRNA;gene=LOC111125466;product=homeobox protein 2-like;transcript_id=XM_022469388.1\r\n", "NC_035780.1\tGnomon\texon\t204243\t204795\t.\t-\t.\tID=exon-XM_022469388.1-2;Parent=rna-XM_022469388.1;Dbxref=GeneID:111125466,Genbank:XM_022469388.1;gbkey=mRNA;gene=LOC111125466;product=homeobox protein 2-like;transcript_id=XM_022469388.1\r\n", "NC_035780.1\tGnomon\tCDS\t204289\t204720\t.\t-\t0\tID=cds-XP_022325096.1;Parent=rna-XM_022469388.1;Dbxref=GeneID:111125466,Genbank:XP_022325096.1;Name=XP_022325096.1;gbkey=CDS;gene=LOC111125466;product=homeobox protein 2-like;protein_id=XP_022325096.1\r\n", "NC_035780.1\tGnomon\tgene\t214891\t215322\t.\t-\t.\tID=gene-LOC111128964;Dbxref=GeneID:111128964;Name=LOC111128964;gbkey=Gene;gene=LOC111128964;gene_biotype=protein_coding\r\n", 
"NC_035780.1\tGnomon\tmRNA\t214891\t215322\t.\t-\t.\tID=rna-XM_022474945.1;Parent=gene-LOC111128964;Dbxref=GeneID:111128964,Genbank:XM_022474945.1;Name=XM_022474945.1;gbkey=mRNA;gene=LOC111128964;model_evidence=Supporting evidence includes similarity to: 1 Protein;product=putative uncharacterized protein DDB_G0277407;transcript_id=XM_022474945.1\r\n", "NC_035780.1\tGnomon\texon\t214891\t215322\t.\t-\t.\tID=exon-XM_022474945.1-1;Parent=rna-XM_022474945.1;Dbxref=GeneID:111128964,Genbank:XM_022474945.1;gbkey=mRNA;gene=LOC111128964;product=putative uncharacterized protein DDB_G0277407;transcript_id=XM_022474945.1\r\n", "NC_035780.1\tGnomon\tCDS\t214891\t215322\t.\t-\t0\tID=cds-XP_022330653.1;Parent=rna-XM_022474945.1;Dbxref=GeneID:111128964,Genbank:XP_022330653.1;Name=XP_022330653.1;gbkey=CDS;gene=LOC111128964;product=putative uncharacterized protein DDB_G0277407;protein_id=XP_022330653.1\r\n", "NC_035780.1\tGnomon\tgene\t219451\t225076\t.\t-\t.\tID=gene-LOC111113860;Dbxref=GeneID:111113860;Name=LOC111113860;gbkey=Gene;gene=LOC111113860;gene_biotype=protein_coding\r\n", "NC_035780.1\tGnomon\tmRNA\t219451\t225076\t.\t-\t.\tID=rna-XM_022452155.1;Parent=gene-LOC111113860;Dbxref=GeneID:111113860,Genbank:XM_022452155.1;Name=XM_022452155.1;gbkey=mRNA;gene=LOC111113860;model_evidence=Supporting evidence includes similarity to: 10 Proteins%2C and 100%25 coverage of the annotated genomic feature by RNAseq alignments%2C including 24 samples with support for all annotated introns;product=sulfotransferase family cytosolic 1B member 1-like;transcript_id=XM_022452155.1\r\n", "NC_035780.1\tGnomon\texon\t224748\t225076\t.\t-\t.\tID=exon-XM_022452155.1-1;Parent=rna-XM_022452155.1;Dbxref=GeneID:111113860,Genbank:XM_022452155.1;gbkey=mRNA;gene=LOC111113860;product=sulfotransferase family cytosolic 1B member 1-like;transcript_id=XM_022452155.1\r\n" ] } ], "source": [ "!head -n100 GCF_002022765.2_C_virginica-3.0_genomic.gff" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 
2. Prepare for feature track creation" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Before I pull out feature tracks, I need to know which databases were used for annotation, which features I can expect and how many of them there are, and identify chromosome lengths." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 2a. Annotation information" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 1 ##sequence-region NC_035784.1 1 98698416\r\n", " 1 ##sequence-region NC_035785.1 1 51258098\r\n", " 1 ##sequence-region NC_035786.1 1 57830854\r\n", " 1 ##sequence-region NC_035787.1 1 75944018\r\n", " 1 ##sequence-region NC_035788.1 1 104168038\r\n", " 1 ##sequence-region NC_035789.1 1 32650045\r\n", " 11 ##species https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=6565\r\n", "1482188 Gnomon\r\n", "29345 RefSeq\r\n", "1739 tRNAscan-SE\r\n" ] } ], "source": [ "#Database identifiers for extracting features\n", "!cut -f2 GCF_002022765.2_C_virginica-3.0_genomic.gff | sort | uniq -c | tail" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 1 #!annotation-source NCBI Crassostrea virginica Annotation Release 100\n", " 1 #!genome-build C_virginica-3.0\n", " 1 #!genome-build-accession NCBI_Assembly:GCF_002022765.2\n", " 1 #!gff-spec-version 1.21\n", " 1 #!processor NCBI annotwriter\n", " 1 ###\n", " 1 ##gff-version 3\n", " 1 ##sequence-region NC_007175.2 1 17244\n", " 1 ##sequence-region NC_035780.1 1 65668440\n", " 1 ##sequence-region NC_035781.1 1 61752955\n", " 1 ##sequence-region NC_035782.1 1 77061148\n", " 1 ##sequence-region NC_035783.1 1 59691872\n", " 1 ##sequence-region NC_035784.1 1 98698416\n", " 1 ##sequence-region NC_035785.1 1 51258098\n", " 1 ##sequence-region NC_035786.1 1 57830854\n", " 1 ##sequence-region NC_035787.1 1 75944018\n", " 1 
##sequence-region NC_035788.1 1 104168038\n", " 1 ##sequence-region NC_035789.1 1 32650045\n", " 11 ##species https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=6565\n", "645368 CDS\n", "29258 cDNA_match\n", "731916 exon\n", "38838 gene\n", "4750 lnc_RNA\n", "60201 mRNA\n", " 667 pseudogene\n", " 2 rRNA\n", " 11 region\n", " 587 tRNA\n", "1674 transcript\n" ] } ], "source": [ "#Count the number of unique features in the GFF\n", "!cut -f3 GCF_002022765.2_C_virginica-3.0_genomic.gff | sort | uniq -c" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 2b. Chromosome lengths" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ ">NC_035780.1 Crassostrea virginica isolate RU13XGHG1-28 chromosome 1, C_virginica-3.0, whole genome shotgun sequence\r\n", "tgacacatatataaagttgaagTCCATACGTAAGAAACTCTGTGAGATATTAACCGAAAACCTTTTGAATCTTTacgaaa\r\n", "aatatacatgttgcGCCAACTGGCGTAAATCAAAACCGGAAGCAGTAAGCATGTCGTGTTTAGTGTCTATCAAATGGACC\r\n", "GGGGGAGTTCTAGTACATATCCAAAGATAAGGGCAATACATAAAATACTCGCAAAGTTATTGACCGtcaaagttgatgta\r\n", "cttttagaaaaaaataatggaaaatgtggcTTTAGTGGAACGGCatcaatgtaaatttaaaatagcaggGTTTGCGTTTG\r\n", "AATTAAAACATCGTGTTTGGTGTCGTTAGAAAGGTCTATTCCAGTTCTATAACATATCTAAAGGTCAGGTCAATCCgtta\r\n", "atttataaaagagaTATGGGCGATCGCGGCATGAGTACTACATGACACAGAGTTACTTGCTCTTTGCTACTTCAGCGTTT\r\n", "CCGGAAGCGTAGTTTTTTTCGTGCGTTTATTCCTTGCAGATAGCCAAGCAATTCCACAAGAATTGAACTCATTTGGCATT\r\n", "AaacttcttgaaaaaataacaaattctttcttttctcatatCAGGTGGATgtcgagtactttgattctaccgggcataga\r\n", "gtagcaacagtactttcagtaccgtgattttgctaccaatcataggactgaaagtactgtgtatagaccaatcacagtac\r\n" ] } ], "source": [ "#Obtained the full FASTA from this link: https://www.ncbi.nlm.nih.gov/genome/?term=txid6565[orgn]\n", "#Check information\n", "!head GCF_002022765.2_C_virginica-3.0_genomic.fna" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "#Extract chr and sequence length information\n", "!awk 
'/^>/ {if (name != \"\") print name \"\\t\" len; name = substr($1, 2); len = 0; next} {len += length($0)} END {if (name != \"\") print name \"\\t\" len}' \\\n", "GCF_002022765.2_C_virginica-3.0_genomic.fna \\\n", "> C_virginica-3.0-sequence-lengths.txt" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "NC_035780.1\t65668440\r\n", "NC_035781.1\t61752955\r\n", "NC_035782.1\t77061148\r\n", "NC_035783.1\t59691872\r\n", "NC_035784.1\t98698416\r\n", "NC_035785.1\t51258098\r\n", "NC_035786.1\t57830854\r\n", "NC_035787.1\t75944018\r\n", "NC_035788.1\t104168038\r\n", "NC_035789.1\t32650045\r\n", "NC_007175.2\t17244\r\n" ] } ], "source": [ "#11 chr total including mitochondrial information\n", "!head -n11 C_virginica-3.0-sequence-lengths.txt" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [], "source": [ "#New file with chr names only\n", "!cut -f1 C_virginica-3.0-sequence-lengths.txt \\\n", "> C_virginica-3.0-chr.txt" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 2. Generate genome feature tracks" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "I will extract CDS, exon, gene, lncRNA and mRNA tracks. I can then use those existing tracks to produce intron and intergenic tracks, as well as 1 kb upstream and downstream flanking regions with `bedtools`. I will also use the RepeatMasker output from NCBI for my transposable element track." ] }, { "cell_type": "code", "execution_count": 31, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "bedtools v2.30.0\r\n" ] } ], "source": [ "!{bedtoolsDirectory}/bedtools --version" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 2a. Gene" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "#Isolate gene entries from multiple annotation databases. 
Tab must be included between database and feature\n", "#Sort output for downstream use\n", "#Include chromosome name information\n", "!grep -e \"Gnomon\tgene\" -e \"RefSeq\tgene\" -e \"tRNAscan-SE\tgene\" \\\n", "GCF_002022765.2_C_virginica-3.0_genomic.gff \\\n", "| {bedtoolsDirectory}/sortBed \\\n", "-faidx C_virginica-3.0-chr.txt \\\n", "> C_virginica-3.0-gene.gff" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "NC_035780.1\tGnomon\tgene\t13578\t14594\t.\t+\t.\tID=gene-LOC111116054;Dbxref=GeneID:111116054;Name=LOC111116054;gbkey=Gene;gene=LOC111116054;gene_biotype=lncRNA\n", "NC_035780.1\tGnomon\tgene\t28961\t33324\t.\t+\t.\tID=gene-LOC111126949;Dbxref=GeneID:111126949;Name=LOC111126949;gbkey=Gene;gene=LOC111126949;gene_biotype=protein_coding\n", "NC_035780.1\tGnomon\tgene\t43111\t66897\t.\t-\t.\tID=gene-LOC111110729;Dbxref=GeneID:111110729;Name=LOC111110729;gbkey=Gene;gene=LOC111110729;gene_biotype=protein_coding\n", "NC_035780.1\tGnomon\tgene\t85606\t95254\t.\t-\t.\tID=gene-LOC111112434;Dbxref=GeneID:111112434;Name=LOC111112434;gbkey=Gene;gene=LOC111112434;gene_biotype=protein_coding\n", "NC_035780.1\tGnomon\tgene\t99840\t106460\t.\t+\t.\tID=gene-LOC111120752;Dbxref=GeneID:111120752;Name=LOC111120752;gbkey=Gene;gene=LOC111120752;gene_biotype=protein_coding\n", "NC_035780.1\tGnomon\tgene\t108305\t110077\t.\t-\t.\tID=gene-LOC111128944;Dbxref=GeneID:111128944;Name=LOC111128944;gbkey=Gene;gene=LOC111128944;gene_biotype=protein_coding;partial=true;start_range=.,108305\n", "NC_035780.1\tGnomon\tgene\t151859\t157536\t.\t+\t.\tID=gene-LOC111128953;Dbxref=GeneID:111128953;Name=LOC111128953;gbkey=Gene;gene=LOC111128953;gene_biotype=protein_coding\n", "NC_035780.1\tGnomon\tgene\t163809\t183798\t.\t-\t.\tID=gene-LOC111105691;Dbxref=GeneID:111105691;Name=LOC111105691;gbkey=Gene;gene=LOC111105691;gene_biotype=protein_coding\n", 
"NC_035780.1\tGnomon\tgene\t164820\t166793\t.\t+\t.\tID=gene-LOC111105685;Dbxref=GeneID:111105685;Name=LOC111105685;gbkey=Gene;gene=LOC111105685;gene_biotype=protein_coding\n", "NC_035780.1\tGnomon\tgene\t169468\t170178\t.\t-\t.\tID=gene-LOC111105702;Dbxref=GeneID:111105702;Name=LOC111105702;gbkey=Gene;gene=LOC111105702;gene_biotype=lncRNA\n", " 38838 C_virginica-3.0-gene.gff\n" ] } ], "source": [ "!head C_virginica-3.0-gene.gff\n", "!wc -l C_virginica-3.0-gene.gff" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 2b. CDS" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "!grep -e \"Gnomon\tCDS\" -e \"RefSeq\tCDS\" -e \"tRNAscan-SE\tCDS\" \\\n", "GCF_002022765.2_C_virginica-3.0_genomic.gff \\\n", "| {bedtoolsDirectory}/sortBed \\\n", "-faidx C_virginica-3.0-chr.txt \\\n", "> C_virginica-3.0-CDS.gff" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "NC_035780.1\tGnomon\tCDS\t30535\t31557\t.\t+\t0\tID=cds-XP_022327646.1;Parent=rna-XM_022471938.1;Dbxref=GeneID:111126949,Genbank:XP_022327646.1;Name=XP_022327646.1;gbkey=CDS;gene=LOC111126949;product=UNC5C-like protein;protein_id=XP_022327646.1\n", "NC_035780.1\tGnomon\tCDS\t31736\t31887\t.\t+\t0\tID=cds-XP_022327646.1;Parent=rna-XM_022471938.1;Dbxref=GeneID:111126949,Genbank:XP_022327646.1;Name=XP_022327646.1;gbkey=CDS;gene=LOC111126949;product=UNC5C-like protein;protein_id=XP_022327646.1\n", "NC_035780.1\tGnomon\tCDS\t31977\t32565\t.\t+\t1\tID=cds-XP_022327646.1;Parent=rna-XM_022471938.1;Dbxref=GeneID:111126949,Genbank:XP_022327646.1;Name=XP_022327646.1;gbkey=CDS;gene=LOC111126949;product=UNC5C-like protein;protein_id=XP_022327646.1\n", "NC_035780.1\tGnomon\tCDS\t32959\t33204\t.\t+\t0\tID=cds-XP_022327646.1;Parent=rna-XM_022471938.1;Dbxref=GeneID:111126949,Genbank:XP_022327646.1;Name=XP_022327646.1;gbkey=CDS;gene=LOC111126949;product=UNC5C-like 
protein;protein_id=XP_022327646.1\n", "NC_035780.1\tGnomon\tCDS\t43262\t44358\t.\t-\t2\tID=cds-XP_022303032.1;Parent=rna-XM_022447324.1;Dbxref=GeneID:111110729,Genbank:XP_022303032.1;Name=XP_022303032.1;gbkey=CDS;gene=LOC111110729;product=FMRFamide receptor-like isoform X1;protein_id=XP_022303032.1\n", "NC_035780.1\tGnomon\tCDS\t43262\t44358\t.\t-\t2\tID=cds-XP_022303041.1;Parent=rna-XM_022447333.1;Dbxref=GeneID:111110729,Genbank:XP_022303041.1;Name=XP_022303041.1;gbkey=CDS;gene=LOC111110729;product=FMRFamide receptor-like isoform X2;protein_id=XP_022303041.1\n", "NC_035780.1\tGnomon\tCDS\t45913\t45997\t.\t-\t0\tID=cds-XP_022303041.1;Parent=rna-XM_022447333.1;Dbxref=GeneID:111110729,Genbank:XP_022303041.1;Name=XP_022303041.1;gbkey=CDS;gene=LOC111110729;product=FMRFamide receptor-like isoform X2;protein_id=XP_022303041.1\n", "NC_035780.1\tGnomon\tCDS\t64123\t64219\t.\t-\t0\tID=cds-XP_022303032.1;Parent=rna-XM_022447324.1;Dbxref=GeneID:111110729,Genbank:XP_022303032.1;Name=XP_022303032.1;gbkey=CDS;gene=LOC111110729;product=FMRFamide receptor-like isoform X1;protein_id=XP_022303032.1\n", "NC_035780.1\tGnomon\tCDS\t85616\t85777\t.\t-\t0\tID=cds-XP_022305632.1;Parent=rna-XM_022449924.1;Dbxref=GeneID:111112434,Genbank:XP_022305632.1;Name=XP_022305632.1;gbkey=CDS;gene=LOC111112434;product=homeobox protein Hox-B7-like;protein_id=XP_022305632.1\n", "NC_035780.1\tGnomon\tCDS\t88423\t88589\t.\t-\t2\tID=cds-XP_022305632.1;Parent=rna-XM_022449924.1;Dbxref=GeneID:111112434,Genbank:XP_022305632.1;Name=XP_022305632.1;gbkey=CDS;gene=LOC111112434;product=homeobox protein Hox-B7-like;protein_id=XP_022305632.1\n", " 645368 C_virginica-3.0-CDS.gff\n" ] } ], "source": [ "!head C_virginica-3.0-CDS.gff\n", "!wc -l C_virginica-3.0-CDS.gff" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 2c. 
Exon" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "!grep -e \"Gnomon\texon\" -e \"RefSeq\texon\" -e \"tRNAscan-SE\texon\" \\\n", "GCF_002022765.2_C_virginica-3.0_genomic.gff \\\n", "| {bedtoolsDirectory}/sortBed \\\n", "-faidx C_virginica-3.0-chr.txt \\\n", "> C_virginica-3.0-exon.gff" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "NC_035780.1\tGnomon\texon\t13578\t13603\t.\t+\t.\tID=exon-XR_002636969.1-1;Parent=rna-XR_002636969.1;Dbxref=GeneID:111116054,Genbank:XR_002636969.1;gbkey=ncRNA;gene=LOC111116054;product=uncharacterized LOC111116054;transcript_id=XR_002636969.1\n", "NC_035780.1\tGnomon\texon\t14237\t14290\t.\t+\t.\tID=exon-XR_002636969.1-2;Parent=rna-XR_002636969.1;Dbxref=GeneID:111116054,Genbank:XR_002636969.1;gbkey=ncRNA;gene=LOC111116054;product=uncharacterized LOC111116054;transcript_id=XR_002636969.1\n", "NC_035780.1\tGnomon\texon\t14557\t14594\t.\t+\t.\tID=exon-XR_002636969.1-3;Parent=rna-XR_002636969.1;Dbxref=GeneID:111116054,Genbank:XR_002636969.1;gbkey=ncRNA;gene=LOC111116054;product=uncharacterized LOC111116054;transcript_id=XR_002636969.1\n", "NC_035780.1\tGnomon\texon\t28961\t29073\t.\t+\t.\tID=exon-XM_022471938.1-1;Parent=rna-XM_022471938.1;Dbxref=GeneID:111126949,Genbank:XM_022471938.1;gbkey=mRNA;gene=LOC111126949;product=UNC5C-like protein;transcript_id=XM_022471938.1\n", "NC_035780.1\tGnomon\texon\t30524\t31557\t.\t+\t.\tID=exon-XM_022471938.1-2;Parent=rna-XM_022471938.1;Dbxref=GeneID:111126949,Genbank:XM_022471938.1;gbkey=mRNA;gene=LOC111126949;product=UNC5C-like protein;transcript_id=XM_022471938.1\n", "NC_035780.1\tGnomon\texon\t31736\t31887\t.\t+\t.\tID=exon-XM_022471938.1-3;Parent=rna-XM_022471938.1;Dbxref=GeneID:111126949,Genbank:XM_022471938.1;gbkey=mRNA;gene=LOC111126949;product=UNC5C-like protein;transcript_id=XM_022471938.1\n", 
"NC_035780.1\tGnomon\texon\t31977\t32565\t.\t+\t.\tID=exon-XM_022471938.1-4;Parent=rna-XM_022471938.1;Dbxref=GeneID:111126949,Genbank:XM_022471938.1;gbkey=mRNA;gene=LOC111126949;product=UNC5C-like protein;transcript_id=XM_022471938.1\n", "NC_035780.1\tGnomon\texon\t32959\t33324\t.\t+\t.\tID=exon-XM_022471938.1-5;Parent=rna-XM_022471938.1;Dbxref=GeneID:111126949,Genbank:XM_022471938.1;gbkey=mRNA;gene=LOC111126949;product=UNC5C-like protein;transcript_id=XM_022471938.1\n", "NC_035780.1\tGnomon\texon\t43111\t44358\t.\t-\t.\tID=exon-XM_022447324.1-3;Parent=rna-XM_022447324.1;Dbxref=GeneID:111110729,Genbank:XM_022447324.1;gbkey=mRNA;gene=LOC111110729;product=FMRFamide receptor-like%2C transcript variant X1;transcript_id=XM_022447324.1\n", "NC_035780.1\tGnomon\texon\t43111\t44358\t.\t-\t.\tID=exon-XM_022447333.1-2;Parent=rna-XM_022447333.1;Dbxref=GeneID:111110729,Genbank:XM_022447333.1;gbkey=mRNA;gene=LOC111110729;product=FMRFamide receptor-like%2C transcript variant X2;transcript_id=XM_022447333.1\n", " 731916 C_virginica-3.0-exon.gff\n" ] } ], "source": [ "!head C_virginica-3.0-exon.gff\n", "!wc -l C_virginica-3.0-exon.gff" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 2d. 
lncRNA" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "!grep -e \"Gnomon\tlnc_RNA\" -e \"RefSeq\tlnc_RNA\" -e \"tRNAscan-SE\tlnc_RNA\" \\\n", "GCF_002022765.2_C_virginica-3.0_genomic.gff \\\n", "| {bedtoolsDirectory}/sortBed \\\n", "-faidx C_virginica-3.0-chr.txt \\\n", "> C_virginica-3.0-lncRNA.gff" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "NC_035780.1\tGnomon\tlnc_RNA\t13578\t14594\t.\t+\t.\tID=rna-XR_002636969.1;Parent=gene-LOC111116054;Dbxref=GeneID:111116054,Genbank:XR_002636969.1;Name=XR_002636969.1;gbkey=ncRNA;gene=LOC111116054;model_evidence=Supporting evidence includes similarity to: 100%25 coverage of the annotated genomic feature by RNAseq alignments%2C including 1 sample with support for all annotated introns;product=uncharacterized LOC111116054;transcript_id=XR_002636969.1\n", "NC_035780.1\tGnomon\tlnc_RNA\t169468\t170178\t.\t-\t.\tID=rna-XR_002635081.1;Parent=gene-LOC111105702;Dbxref=GeneID:111105702,Genbank:XR_002635081.1;Name=XR_002635081.1;gbkey=ncRNA;gene=LOC111105702;model_evidence=Supporting evidence includes similarity to: 100%25 coverage of the annotated genomic feature by RNAseq alignments%2C including 3 samples with support for all annotated introns;product=uncharacterized LOC111105702;transcript_id=XR_002635081.1\n", "NC_035780.1\tGnomon\tlnc_RNA\t900326\t903430\t.\t+\t.\tID=rna-XR_002636046.1;Parent=gene-LOC111111519;Dbxref=GeneID:111111519,Genbank:XR_002636046.1;Name=XR_002636046.1;gbkey=ncRNA;gene=LOC111111519;model_evidence=Supporting evidence includes similarity to: 100%25 coverage of the annotated genomic feature by RNAseq alignments%2C including 20 samples with support for all annotated introns;product=uncharacterized LOC111111519;transcript_id=XR_002636046.1\n", 
"NC_035780.1\tGnomon\tlnc_RNA\t1280831\t1282416\t.\t-\t.\tID=rna-XR_002638148.1;Parent=gene-LOC111124195;Dbxref=GeneID:111124195,Genbank:XR_002638148.1;Name=XR_002638148.1;gbkey=ncRNA;gene=LOC111124195;model_evidence=Supporting evidence includes similarity to: 100%25 coverage of the annotated genomic feature by RNAseq alignments%2C including 1 sample with support for all annotated introns;product=uncharacterized LOC111124195;transcript_id=XR_002638148.1\n", "NC_035780.1\tGnomon\tlnc_RNA\t1432944\t1458091\t.\t+\t.\tID=rna-XR_002639675.1;Parent=gene-LOC111135942;Dbxref=GeneID:111135942,Genbank:XR_002639675.1;Name=XR_002639675.1;gbkey=ncRNA;gene=LOC111135942;model_evidence=Supporting evidence includes similarity to: 100%25 coverage of the annotated genomic feature by RNAseq alignments%2C including 4 samples with support for all annotated introns;product=uncharacterized LOC111135942;transcript_id=XR_002639675.1\n", "NC_035780.1\tGnomon\tlnc_RNA\t1503802\t1513830\t.\t-\t.\tID=rna-XR_002636574.1;Parent=gene-LOC111114441;Dbxref=GeneID:111114441,Genbank:XR_002636574.1;Name=XR_002636574.1;gbkey=ncRNA;gene=LOC111114441;model_evidence=Supporting evidence includes similarity to: 100%25 coverage of the annotated genomic feature by RNAseq alignments%2C including 2 samples with support for all annotated introns;product=uncharacterized LOC111114441;transcript_id=XR_002636574.1\n", "NC_035780.1\tGnomon\tlnc_RNA\t1856841\t1863697\t.\t-\t.\tID=rna-XR_002636863.1;Parent=gene-LOC111115591;Dbxref=GeneID:111115591,Genbank:XR_002636863.1;Name=XR_002636863.1;gbkey=ncRNA;gene=LOC111115591;model_evidence=Supporting evidence includes similarity to: 100%25 coverage of the annotated genomic feature by RNAseq alignments%2C including 1 sample with support for all annotated introns;product=uncharacterized LOC111115591%2C transcript variant X1;transcript_id=XR_002636863.1\n", 
"NC_035780.1\tGnomon\tlnc_RNA\t1856841\t1863683\t.\t-\t.\tID=rna-XR_002636864.1;Parent=gene-LOC111115591;Dbxref=GeneID:111115591,Genbank:XR_002636864.1;Name=XR_002636864.1;gbkey=ncRNA;gene=LOC111115591;model_evidence=Supporting evidence includes similarity to: 100%25 coverage of the annotated genomic feature by RNAseq alignments;product=uncharacterized LOC111115591%2C transcript variant X2;transcript_id=XR_002636864.1\n", "NC_035780.1\tGnomon\tlnc_RNA\t2161223\t2166803\t.\t+\t.\tID=rna-XR_002635698.1;Parent=gene-LOC111109763;Dbxref=GeneID:111109763,Genbank:XR_002635698.1;Name=XR_002635698.1;gbkey=ncRNA;gene=LOC111109763;model_evidence=Supporting evidence includes similarity to: 100%25 coverage of the annotated genomic feature by RNAseq alignments%2C including 23 samples with support for all annotated introns;product=uncharacterized LOC111109763;transcript_id=XR_002635698.1\n", "NC_035780.1\tGnomon\tlnc_RNA\t2928484\t2930094\t.\t-\t.\tID=rna-XR_002637875.1;Parent=gene-LOC111122009;Dbxref=GeneID:111122009,Genbank:XR_002637875.1;Name=XR_002637875.1;gbkey=ncRNA;gene=LOC111122009;model_evidence=Supporting evidence includes similarity to: 100%25 coverage of the annotated genomic feature by RNAseq alignments%2C including 5 samples with support for all annotated introns;product=uncharacterized LOC111122009;transcript_id=XR_002637875.1\n", " 4750 C_virginica-3.0-lncRNA.gff\n" ] } ], "source": [ "!head C_virginica-3.0-lncRNA.gff\n", "!wc -l C_virginica-3.0-lncRNA.gff" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 2e. 
mRNA" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "!grep -e \"Gnomon\tmRNA\" -e \"RefSeq\tmRNA\" -e \"tRNAscan-SE\tmRNA\" \\\n", "GCF_002022765.2_C_virginica-3.0_genomic.gff \\\n", "| {bedtoolsDirectory}/sortBed \\\n", "-faidx C_virginica-3.0-chr.txt \\\n", "> C_virginica-3.0-mRNA.gff" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "NC_035780.1\tGnomon\tmRNA\t28961\t33324\t.\t+\t.\tID=rna-XM_022471938.1;Parent=gene-LOC111126949;Dbxref=GeneID:111126949,Genbank:XM_022471938.1;Name=XM_022471938.1;gbkey=mRNA;gene=LOC111126949;model_evidence=Supporting evidence includes similarity to: 3 Proteins%2C and 100%25 coverage of the annotated genomic feature by RNAseq alignments%2C including 21 samples with support for all annotated introns;product=UNC5C-like protein;transcript_id=XM_022471938.1\n", "NC_035780.1\tGnomon\tmRNA\t43111\t66897\t.\t-\t.\tID=rna-XM_022447324.1;Parent=gene-LOC111110729;Dbxref=GeneID:111110729,Genbank:XM_022447324.1;Name=XM_022447324.1;gbkey=mRNA;gene=LOC111110729;model_evidence=Supporting evidence includes similarity to: 1 Protein%2C and 100%25 coverage of the annotated genomic feature by RNAseq alignments;product=FMRFamide receptor-like%2C transcript variant X1;transcript_id=XM_022447324.1\n", "NC_035780.1\tGnomon\tmRNA\t43111\t46506\t.\t-\t.\tID=rna-XM_022447333.1;Parent=gene-LOC111110729;Dbxref=GeneID:111110729,Genbank:XM_022447333.1;Name=XM_022447333.1;gbkey=mRNA;gene=LOC111110729;model_evidence=Supporting evidence includes similarity to: 1 Protein%2C and 100%25 coverage of the annotated genomic feature by RNAseq alignments%2C including 14 samples with support for all annotated introns;product=FMRFamide receptor-like%2C transcript variant X2;transcript_id=XM_022447333.1\n", 
"NC_035780.1\tGnomon\tmRNA\t85606\t95254\t.\t-\t.\tID=rna-XM_022449924.1;Parent=gene-LOC111112434;Dbxref=GeneID:111112434,Genbank:XM_022449924.1;Name=XM_022449924.1;gbkey=mRNA;gene=LOC111112434;model_evidence=Supporting evidence includes similarity to: 7 Proteins%2C and 100%25 coverage of the annotated genomic feature by RNAseq alignments%2C including 13 samples with support for all annotated introns;product=homeobox protein Hox-B7-like;transcript_id=XM_022449924.1\n", "NC_035780.1\tGnomon\tmRNA\t99840\t106460\t.\t+\t.\tID=rna-XM_022461698.1;Parent=gene-LOC111120752;Dbxref=GeneID:111120752,Genbank:XM_022461698.1;Name=XM_022461698.1;gbkey=mRNA;gene=LOC111120752;model_evidence=Supporting evidence includes similarity to: 10 Proteins%2C and 100%25 coverage of the annotated genomic feature by RNAseq alignments%2C including 27 samples with support for all annotated introns;product=ribulose-phosphate 3-epimerase-like;transcript_id=XM_022461698.1\n", "NC_035780.1\tGnomon\tmRNA\t108305\t110077\t.\t-\t.\tID=rna-XM_022474921.1;Parent=gene-LOC111128944;Dbxref=GeneID:111128944,Genbank:XM_022474921.1;Name=XM_022474921.1;gbkey=mRNA;gene=LOC111128944;model_evidence=Supporting evidence includes similarity to: 2 Proteins%2C and 93%25 coverage of the annotated genomic feature by RNAseq alignments;partial=true;product=mucin-19-like;start_range=.,108305;transcript_id=XM_022474921.1\n", "NC_035780.1\tGnomon\tmRNA\t151859\t157536\t.\t+\t.\tID=rna-XM_022474931.1;Parent=gene-LOC111128953;Dbxref=GeneID:111128953,Genbank:XM_022474931.1;Name=XM_022474931.1;gbkey=mRNA;gene=LOC111128953;model_evidence=Supporting evidence includes similarity to: 1 Protein;product=GATA zinc finger domain-containing protein 14-like;transcript_id=XM_022474931.1\n", "NC_035780.1\tGnomon\tmRNA\t163809\t183798\t.\t-\t.\tID=rna-XM_022440054.1;Parent=gene-LOC111105691;Dbxref=GeneID:111105691,Genbank:XM_022440054.1;Name=XM_022440054.1;gbkey=mRNA;gene=LOC111105691;model_evidence=Supporting evidence includes similarity to: 
100%25 coverage of the annotated genomic feature by RNAseq alignments%2C including 9 samples with support for all annotated introns;product=uncharacterized LOC111105691;transcript_id=XM_022440054.1\n", "NC_035780.1\tGnomon\tmRNA\t164820\t166793\t.\t+\t.\tID=rna-XM_022440042.1;Parent=gene-LOC111105685;Dbxref=GeneID:111105685,Genbank:XM_022440042.1;Name=XM_022440042.1;gbkey=mRNA;gene=LOC111105685;model_evidence=Supporting evidence includes similarity to: 100%25 coverage of the annotated genomic feature by RNAseq alignments%2C including 4 samples with support for all annotated introns;product=protein ANTAGONIST OF LIKE HETEROCHROMATIN PROTEIN 1-like;transcript_id=XM_022440042.1\n", "NC_035780.1\tGnomon\tmRNA\t190449\t193594\t.\t-\t.\tID=rna-XM_022482070.1;Parent=gene-LOC111133554;Dbxref=GeneID:111133554,Genbank:XM_022482070.1;Name=XM_022482070.1;gbkey=mRNA;gene=LOC111133554;model_evidence=Supporting evidence includes similarity to: 100%25 coverage of the annotated genomic feature by RNAseq alignments%2C including 3 samples with support for all annotated introns;product=putative uncharacterized protein DDB_G0277407;transcript_id=XM_022482070.1\n", " 60201 C_virginica-3.0-mRNA.gff\n" ] } ], "source": [ "!head C_virginica-3.0-mRNA.gff\n", "!wc -l C_virginica-3.0-mRNA.gff" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 2f. 
Non-coding sequences" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "#Find the complement of the exon track (non-coding sequences)\n", "#Output is a BED file that can be viewed in IGV\n", "!{bedtoolsDirectory}/complementBed \\\n", "-i C_virginica-3.0-exon.gff \\\n", "-g C_virginica-3.0-sequence-lengths.txt \\\n", "> C_virginica-3.0-nonCDS.bed" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "NC_035780.1\t0\t13577\n", "NC_035780.1\t13603\t14236\n", "NC_035780.1\t14290\t14556\n", "NC_035780.1\t14594\t28960\n", "NC_035780.1\t29073\t30523\n", "NC_035780.1\t31557\t31735\n", "NC_035780.1\t31887\t31976\n", "NC_035780.1\t32565\t32958\n", "NC_035780.1\t33324\t43110\n", "NC_035780.1\t44358\t45912\n", " 337305 C_virginica-3.0-nonCDS.bed\n" ] } ], "source": [ "!head C_virginica-3.0-nonCDS.bed\n", "!wc -l C_virginica-3.0-nonCDS.bed" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 2g. 
Intron" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "#Find the intersection between the non-coding sequences and genes (introns)\n", "!{bedtoolsDirectory}/intersectBed \\\n", "-a C_virginica-3.0-nonCDS.bed \\\n", "-b C_virginica-3.0-gene.gff -sorted \\\n", "> C_virginica-3.0-intron.bed" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "NC_035780.1\t13603\t14236\n", "NC_035780.1\t14290\t14556\n", "NC_035780.1\t29073\t30523\n", "NC_035780.1\t31557\t31735\n", "NC_035780.1\t31887\t31976\n", "NC_035780.1\t32565\t32958\n", "NC_035780.1\t44358\t45912\n", "NC_035780.1\t46506\t64122\n", "NC_035780.1\t64334\t66868\n", "NC_035780.1\t85777\t88422\n", " 311341 C_virginica-3.0-intron.bed\n" ] } ], "source": [ "!head C_virginica-3.0-intron.bed\n", "!wc -l C_virginica-3.0-intron.bed" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 2h. Untranslated regions of exons" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "scrolled": true }, "outputs": [], "source": [ "#Obtain UTRs by subtracting CDS from exons\n", "!{bedtoolsDirectory}/subtractBed \\\n", "-a C_virginica-3.0-exon.gff \\\n", "-b C_virginica-3.0-CDS.gff \\\n", "-sorted \\\n", "-g C_virginica-3.0-sequence-lengths.txt \\\n", "> C_virginica-3.0-exonUTR.gff" ] }, { "cell_type": "code", "execution_count": 21, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "NC_035780.1\tGnomon\texon\t13578\t13603\t.\t+\t.\tID=exon-XR_002636969.1-1;Parent=rna-XR_002636969.1;Dbxref=GeneID:111116054,Genbank:XR_002636969.1;gbkey=ncRNA;gene=LOC111116054;product=uncharacterized LOC111116054;transcript_id=XR_002636969.1\n", "NC_035780.1\tGnomon\texon\t14237\t14290\t.\t+\t.\tID=exon-XR_002636969.1-2;Parent=rna-XR_002636969.1;Dbxref=GeneID:111116054,Genbank:XR_002636969.1;gbkey=ncRNA;gene=LOC111116054;product=uncharacterized 
LOC111116054;transcript_id=XR_002636969.1\n", "NC_035780.1\tGnomon\texon\t14557\t14594\t.\t+\t.\tID=exon-XR_002636969.1-3;Parent=rna-XR_002636969.1;Dbxref=GeneID:111116054,Genbank:XR_002636969.1;gbkey=ncRNA;gene=LOC111116054;product=uncharacterized LOC111116054;transcript_id=XR_002636969.1\n", "NC_035780.1\tGnomon\texon\t28961\t29073\t.\t+\t.\tID=exon-XM_022471938.1-1;Parent=rna-XM_022471938.1;Dbxref=GeneID:111126949,Genbank:XM_022471938.1;gbkey=mRNA;gene=LOC111126949;product=UNC5C-like protein;transcript_id=XM_022471938.1\n", "NC_035780.1\tGnomon\texon\t30524\t30534\t.\t+\t.\tID=exon-XM_022471938.1-2;Parent=rna-XM_022471938.1;Dbxref=GeneID:111126949,Genbank:XM_022471938.1;gbkey=mRNA;gene=LOC111126949;product=UNC5C-like protein;transcript_id=XM_022471938.1\n", "NC_035780.1\tGnomon\texon\t33205\t33324\t.\t+\t.\tID=exon-XM_022471938.1-5;Parent=rna-XM_022471938.1;Dbxref=GeneID:111126949,Genbank:XM_022471938.1;gbkey=mRNA;gene=LOC111126949;product=UNC5C-like protein;transcript_id=XM_022471938.1\n", "NC_035780.1\tGnomon\texon\t43111\t43261\t.\t-\t.\tID=exon-XM_022447324.1-3;Parent=rna-XM_022447324.1;Dbxref=GeneID:111110729,Genbank:XM_022447324.1;gbkey=mRNA;gene=LOC111110729;product=FMRFamide receptor-like%2C transcript variant X1;transcript_id=XM_022447324.1\n", "NC_035780.1\tGnomon\texon\t43111\t43261\t.\t-\t.\tID=exon-XM_022447333.1-2;Parent=rna-XM_022447333.1;Dbxref=GeneID:111110729,Genbank:XM_022447333.1;gbkey=mRNA;gene=LOC111110729;product=FMRFamide receptor-like%2C transcript variant X2;transcript_id=XM_022447333.1\n", "NC_035780.1\tGnomon\texon\t45998\t46506\t.\t-\t.\tID=exon-XM_022447333.1-1;Parent=rna-XM_022447333.1;Dbxref=GeneID:111110729,Genbank:XM_022447333.1;gbkey=mRNA;gene=LOC111110729;product=FMRFamide receptor-like%2C transcript variant X2;transcript_id=XM_022447333.1\n", 
"NC_035780.1\tGnomon\texon\t64220\t64334\t.\t-\t.\tID=exon-XM_022447324.1-2;Parent=rna-XM_022447324.1;Dbxref=GeneID:111110729,Genbank:XM_022447324.1;gbkey=mRNA;gene=LOC111110729;product=FMRFamide receptor-like%2C transcript variant X1;transcript_id=XM_022447324.1\n", " 183389 C_virginica-3.0-exonUTR.gff\n" ] } ], "source": [ "!head C_virginica-3.0-exonUTR.gff\n", "!wc -l C_virginica-3.0-exonUTR.gff" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 2i. Flanking regions (1 kb)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### All flanks" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [], "source": [ "#Create 1 kb flanking regions\n", "#Subtract existing genes from artificial flanks\n", "!{bedtoolsDirectory}/flankBed \\\n", "-i C_virginica-3.0-gene.gff \\\n", "-g C_virginica-3.0-sequence-lengths.txt \\\n", "-b 1000 \\\n", "| {bedtoolsDirectory}/subtractBed \\\n", "-a - \\\n", "-b C_virginica-3.0-gene.gff \\\n", "> C_virginica-3.0-flanks.gff" ] }, { "cell_type": "code", "execution_count": 23, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "NC_035780.1\tGnomon\tgene\t12578\t13577\t.\t+\t.\tID=gene-LOC111116054;Dbxref=GeneID:111116054;Name=LOC111116054;gbkey=Gene;gene=LOC111116054;gene_biotype=lncRNA\n", "NC_035780.1\tGnomon\tgene\t14595\t15594\t.\t+\t.\tID=gene-LOC111116054;Dbxref=GeneID:111116054;Name=LOC111116054;gbkey=Gene;gene=LOC111116054;gene_biotype=lncRNA\n", "NC_035780.1\tGnomon\tgene\t27961\t28960\t.\t+\t.\tID=gene-LOC111126949;Dbxref=GeneID:111126949;Name=LOC111126949;gbkey=Gene;gene=LOC111126949;gene_biotype=protein_coding\n", "NC_035780.1\tGnomon\tgene\t33325\t34324\t.\t+\t.\tID=gene-LOC111126949;Dbxref=GeneID:111126949;Name=LOC111126949;gbkey=Gene;gene=LOC111126949;gene_biotype=protein_coding\n", 
"NC_035780.1\tGnomon\tgene\t42111\t43110\t.\t-\t.\tID=gene-LOC111110729;Dbxref=GeneID:111110729;Name=LOC111110729;gbkey=Gene;gene=LOC111110729;gene_biotype=protein_coding\n", "NC_035780.1\tGnomon\tgene\t66898\t67897\t.\t-\t.\tID=gene-LOC111110729;Dbxref=GeneID:111110729;Name=LOC111110729;gbkey=Gene;gene=LOC111110729;gene_biotype=protein_coding\n", "NC_035780.1\tGnomon\tgene\t84606\t85605\t.\t-\t.\tID=gene-LOC111112434;Dbxref=GeneID:111112434;Name=LOC111112434;gbkey=Gene;gene=LOC111112434;gene_biotype=protein_coding\n", "NC_035780.1\tGnomon\tgene\t95255\t96254\t.\t-\t.\tID=gene-LOC111112434;Dbxref=GeneID:111112434;Name=LOC111112434;gbkey=Gene;gene=LOC111112434;gene_biotype=protein_coding\n", "NC_035780.1\tGnomon\tgene\t98840\t99839\t.\t+\t.\tID=gene-LOC111120752;Dbxref=GeneID:111120752;Name=LOC111120752;gbkey=Gene;gene=LOC111120752;gene_biotype=protein_coding\n", "NC_035780.1\tGnomon\tgene\t106461\t107460\t.\t+\t.\tID=gene-LOC111120752;Dbxref=GeneID:111120752;Name=LOC111120752;gbkey=Gene;gene=LOC111120752;gene_biotype=protein_coding\n", " 70041 C_virginica-3.0-flanks.gff\n" ] } ], "source": [ "!head C_virginica-3.0-flanks.gff\n", "!wc -l C_virginica-3.0-flanks.gff" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Upstream flanks" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [], "source": [ "#Create 1 kb upstream flanking regions (-l) based on strand (-s)\n", "#Subtract existing genes from artificial flanks\n", "!{bedtoolsDirectory}/flankBed \\\n", "-i C_virginica-3.0-gene.gff \\\n", "-g C_virginica-3.0-sequence-lengths.txt \\\n", "-l 1000 \\\n", "-r 0 \\\n", "-s \\\n", "| {bedtoolsDirectory}/subtractBed \\\n", "-a - \\\n", "-b C_virginica-3.0-gene.gff \\\n", "> C_virginica-3.0-upstream.gff" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ 
"NC_035780.1\tGnomon\tgene\t12578\t13577\t.\t+\t.\tID=gene-LOC111116054;Dbxref=GeneID:111116054;Name=LOC111116054;gbkey=Gene;gene=LOC111116054;gene_biotype=lncRNA\n", "NC_035780.1\tGnomon\tgene\t27961\t28960\t.\t+\t.\tID=gene-LOC111126949;Dbxref=GeneID:111126949;Name=LOC111126949;gbkey=Gene;gene=LOC111126949;gene_biotype=protein_coding\n", "NC_035780.1\tGnomon\tgene\t66898\t67897\t.\t-\t.\tID=gene-LOC111110729;Dbxref=GeneID:111110729;Name=LOC111110729;gbkey=Gene;gene=LOC111110729;gene_biotype=protein_coding\n", "NC_035780.1\tGnomon\tgene\t95255\t96254\t.\t-\t.\tID=gene-LOC111112434;Dbxref=GeneID:111112434;Name=LOC111112434;gbkey=Gene;gene=LOC111112434;gene_biotype=protein_coding\n", "NC_035780.1\tGnomon\tgene\t98840\t99839\t.\t+\t.\tID=gene-LOC111120752;Dbxref=GeneID:111120752;Name=LOC111120752;gbkey=Gene;gene=LOC111120752;gene_biotype=protein_coding\n", "NC_035780.1\tGnomon\tgene\t110078\t111077\t.\t-\t.\tID=gene-LOC111128944;Dbxref=GeneID:111128944;Name=LOC111128944;gbkey=Gene;gene=LOC111128944;gene_biotype=protein_coding;partial=true;start_range=.,108305\n", "NC_035780.1\tGnomon\tgene\t150859\t151858\t.\t+\t.\tID=gene-LOC111128953;Dbxref=GeneID:111128953;Name=LOC111128953;gbkey=Gene;gene=LOC111128953;gene_biotype=protein_coding\n", "NC_035780.1\tGnomon\tgene\t183799\t184798\t.\t-\t.\tID=gene-LOC111105691;Dbxref=GeneID:111105691;Name=LOC111105691;gbkey=Gene;gene=LOC111105691;gene_biotype=protein_coding\n", "NC_035780.1\tGnomon\tgene\t193595\t194594\t.\t-\t.\tID=gene-LOC111133554;Dbxref=GeneID:111133554;Name=LOC111133554;gbkey=Gene;gene=LOC111133554;gene_biotype=protein_coding\n", "NC_035780.1\tGnomon\tgene\t207744\t208743\t.\t-\t.\tID=gene-LOC111125466;Dbxref=GeneID:111125466;Name=LOC111125466;gbkey=Gene;gene=LOC111125466;gene_biotype=protein_coding\n", " 34817 C_virginica-3.0-upstream.gff\n" ] } ], "source": [ "!head C_virginica-3.0-upstream.gff\n", "!wc -l C_virginica-3.0-upstream.gff" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Downstream 
flanks" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [], "source": [ "#Create 1 kb downstream flanking regions (-r) based on strand (-s)\n", "#Subtract existing genes from artificial flanks\n", "!{bedtoolsDirectory}/flankBed \\\n", "-i C_virginica-3.0-gene.gff \\\n", "-g C_virginica-3.0-sequence-lengths.txt \\\n", "-l 0 \\\n", "-r 1000 \\\n", "-s \\\n", "| {bedtoolsDirectory}/subtractBed \\\n", "-a - \\\n", "-b C_virginica-3.0-gene.gff \\\n", "> C_virginica-3.0-downstream.gff" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "NC_035780.1\tGnomon\tgene\t14595\t15594\t.\t+\t.\tID=gene-LOC111116054;Dbxref=GeneID:111116054;Name=LOC111116054;gbkey=Gene;gene=LOC111116054;gene_biotype=lncRNA\n", "NC_035780.1\tGnomon\tgene\t33325\t34324\t.\t+\t.\tID=gene-LOC111126949;Dbxref=GeneID:111126949;Name=LOC111126949;gbkey=Gene;gene=LOC111126949;gene_biotype=protein_coding\n", "NC_035780.1\tGnomon\tgene\t42111\t43110\t.\t-\t.\tID=gene-LOC111110729;Dbxref=GeneID:111110729;Name=LOC111110729;gbkey=Gene;gene=LOC111110729;gene_biotype=protein_coding\n", "NC_035780.1\tGnomon\tgene\t84606\t85605\t.\t-\t.\tID=gene-LOC111112434;Dbxref=GeneID:111112434;Name=LOC111112434;gbkey=Gene;gene=LOC111112434;gene_biotype=protein_coding\n", "NC_035780.1\tGnomon\tgene\t106461\t107460\t.\t+\t.\tID=gene-LOC111120752;Dbxref=GeneID:111120752;Name=LOC111120752;gbkey=Gene;gene=LOC111120752;gene_biotype=protein_coding\n", "NC_035780.1\tGnomon\tgene\t107305\t108304\t.\t-\t.\tID=gene-LOC111128944;Dbxref=GeneID:111128944;Name=LOC111128944;gbkey=Gene;gene=LOC111128944;gene_biotype=protein_coding;partial=true;start_range=.,108305\n", "NC_035780.1\tGnomon\tgene\t157537\t158536\t.\t+\t.\tID=gene-LOC111128953;Dbxref=GeneID:111128953;Name=LOC111128953;gbkey=Gene;gene=LOC111128953;gene_biotype=protein_coding\n", 
"NC_035780.1\tGnomon\tgene\t162809\t163808\t.\t-\t.\tID=gene-LOC111105691;Dbxref=GeneID:111105691;Name=LOC111105691;gbkey=Gene;gene=LOC111105691;gene_biotype=protein_coding\n", "NC_035780.1\tGnomon\tgene\t189449\t190448\t.\t-\t.\tID=gene-LOC111133554;Dbxref=GeneID:111133554;Name=LOC111133554;gbkey=Gene;gene=LOC111133554;gene_biotype=protein_coding\n", "NC_035780.1\tGnomon\tgene\t203243\t204242\t.\t-\t.\tID=gene-LOC111125466;Dbxref=GeneID:111125466;Name=LOC111125466;gbkey=Gene;gene=LOC111125466;gene_biotype=protein_coding\n", " 35224 C_virginica-3.0-downstream.gff\n" ] } ], "source": [ "!head C_virginica-3.0-downstream.gff\n", "!wc -l C_virginica-3.0-downstream.gff" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 2j. Intergenic regions" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [], "source": [ "#Find the complement of genes, then subtract flanks to obtain intergenic regions\n", "!{bedtoolsDirectory}/complementBed \\\n", "-i C_virginica-3.0-gene.gff -sorted \\\n", "-g C_virginica-3.0-sequence-lengths.txt \\\n", "| {bedtoolsDirectory}/subtractBed \\\n", "-a - \\\n", "-b C_virginica-3.0-flanks.gff \\\n", "> C_virginica-3.0-intergenic.bed" ] }, { "cell_type": "code", "execution_count": 29, "metadata": { "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "NC_035780.1\t0\t12577\n", "NC_035780.1\t15594\t27960\n", "NC_035780.1\t34324\t42110\n", "NC_035780.1\t67897\t84605\n", "NC_035780.1\t96254\t98839\n", "NC_035780.1\t111077\t150858\n", "NC_035780.1\t158536\t162808\n", "NC_035780.1\t184798\t189448\n", "NC_035780.1\t194594\t203242\n", "NC_035780.1\t208743\t213890\n", " 23949 C_virginica-3.0-intergenic.bed\n" ] } ], "source": [ "!head C_virginica-3.0-intergenic.bed\n", "!wc -l C_virginica-3.0-intergenic.bed" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 2k. 
Transposable elements" ] }, { "cell_type": "code", "execution_count": 30, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " % Total % Received % Xferd Average Speed Time Time Time Current\n", " Dload Upload Total Spent Left Speed\n", "100 8847k 100 8847k 0 0 3621k 0 0:00:02 0:00:02 --:--:-- 3623k\n" ] } ], "source": [ "!curl https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/002/022/765/GCF_002022765.2_C_virginica-3.0/GCF_002022765.2_C_virginica-3.0_rm.out.gz \\\n", "> C_virginica-3.0-rm.te.gz" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [], "source": [ "!gunzip -k C_virginica-3.0-rm.te.gz" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " SW perc perc perc query position in query matching repeat position in repeat\r\n", "score div. del. ins. sequence begin end (left) repeat class/family begin end (left) ID\r\n", "\r\n", " 71 0.000 0.000 0.000 NC_035780.1 1473 1535 (65666905) + (TAACCC)n Simple_repeat 1 63 (0) 1\r\n", " 13 14.100 0.000 6.100 NC_035780.1 8261 8295 (65660145) + (CTCCT)n Simple_repeat 1 33 (0) 2\r\n", " 23 18.900 0.000 0.000 NC_035780.1 10552 10600 (65657840) + (TGAA)n Simple_repeat 1 49 (0) 3\r\n", " 37 0.000 0.000 0.000 NC_035780.1 11265 11298 (65657142) + (AAG)n Simple_repeat 1 34 (0) 4\r\n", " 72 0.000 0.000 0.000 NC_035780.1 12211 12271 (65656169) + (AG)n Simple_repeat 1 61 (0) 5\r\n", " 14 21.400 7.400 0.000 NC_035780.1 15431 15484 (65652956) + (TGTATG)n Simple_repeat 1 58 (0) 6\r\n", " 34 0.000 3.000 0.000 NC_035780.1 15520 15552 (65652888) + (GA)n Simple_repeat 1 34 (0) 7\r\n" ] } ], "source": [ "!head C_virginica-3.0-rm.te" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [], "source": [ "#Convert RepeatMasker output to a BEDfile\n", "#Skip the 3 header lines (tail -n +4 starts output at line 4)\n", "#Print columns 5-7 as a tab-delimited output\n", "#NOTE(review): RepeatMasker begin/end look 1-based inclusive (1473-1535 is reported as 63 bp), but BED starts are 0-based; begin-1 may be needed -- confirm\n", "!tail -n +4 C_virginica-3.0-rm.te 
\\\n", "| awk 'BEGIN{OFS= \"\\t\"} {print $5, $6, $7}' \\\n", "> C_virginica-3.0-rm.te.bed" ] }, { "cell_type": "code", "execution_count": 34, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "NC_035780.1\t1473\t1535\n", "NC_035780.1\t8261\t8295\n", "NC_035780.1\t10552\t10600\n", "NC_035780.1\t11265\t11298\n", "NC_035780.1\t12211\t12271\n", "NC_035780.1\t15431\t15484\n", "NC_035780.1\t15520\t15552\n", "NC_035780.1\t15585\t15619\n", "NC_035780.1\t16397\t16434\n", "NC_035780.1\t16631\t16653\n", " 344267 C_virginica-3.0-rm.te.bed\n" ] } ], "source": [ "!head C_virginica-3.0-rm.te.bed\n", "!wc -l C_virginica-3.0-rm.te.bed" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 3. CG motifs" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 3a. Count CGs" ] }, { "cell_type": "code", "execution_count": 73, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 14277725\r\n" ] } ], "source": [ "#Obtain a rough count of CGs in the genome\n", "!fgrep -o -i CG GCF_002022765.2_C_virginica-3.0_genomic.fna \\\n", "| wc -l" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Generated CG motif track with `fuzznuc` on [Galaxy](https://usegalaxy.org)\n", "\n", "![Screen Shot 2022-05-11 at 6 58 51 PM](https://user-images.githubusercontent.com/22335838/167960980-73121d80-0aff-45e3-a156-febef79bc2d3.png)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "##gff-version 2.0\r\n", "##date 2022-05-11\r\n", "##Type DNA NC_035780.1\r\n", "NC_035780.1\tfuzznuc\tmisc_feature\t29\t30\t2.000\t+\t.\tSequence \"NC_035780.1.1\" ; note \"*pat pattern1\"\r\n", "NC_035780.1\tfuzznuc\tmisc_feature\t55\t56\t2.000\t+\t.\tSequence \"NC_035780.1.2\" ; note \"*pat pattern1\"\r\n", "NC_035780.1\tfuzznuc\tmisc_feature\t76\t77\t2.000\t+\t.\tSequence \"NC_035780.1.3\" ; note \"*pat pattern1\"\r\n", 
"NC_035780.1\tfuzznuc\tmisc_feature\t94\t95\t2.000\t+\t.\tSequence \"NC_035780.1.4\" ; note \"*pat pattern1\"\r\n", "NC_035780.1\tfuzznuc\tmisc_feature\t104\t105\t2.000\t+\t.\tSequence \"NC_035780.1.5\" ; note \"*pat pattern1\"\r\n", "NC_035780.1\tfuzznuc\tmisc_feature\t117\t118\t2.000\t+\t.\tSequence \"NC_035780.1.6\" ; note \"*pat pattern1\"\r\n", "NC_035780.1\tfuzznuc\tmisc_feature\t135\t136\t2.000\t+\t.\tSequence \"NC_035780.1.7\" ; note \"*pat pattern1\"\r\n" ] } ], "source": [ "#Check Galaxy output\n", "!head C_virginica-3.0-fuzznuc-CGmotif.gff\n", "!wc -l C_virginica-3.0-fuzznuc-CGmotif.gff" ] }, { "cell_type": "markdown", "metadata": { "collapsed": true }, "source": [ "### 3b. Count CG overlaps with all genome feature tracks" ] }, { "cell_type": "code", "execution_count": 75, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 7778105\r\n" ] } ], "source": [ "#Genes\n", "!{bedtoolsDirectory}/intersectBed \\\n", "-u \\\n", "-a C_virginica-3.0-fuzznuc-CGmotif.gff \\\n", "-b C_virginica-3.0-gene.gff \\\n", "| wc -l" ] }, { "cell_type": "code", "execution_count": 76, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 1728303\r\n" ] } ], "source": [ "#CDS\n", "!{bedtoolsDirectory}/intersectBed \\\n", "-u \\\n", "-a C_virginica-3.0-fuzznuc-CGmotif.gff \\\n", "-b C_virginica-3.0-CDS.gff \\\n", "| wc -l" ] }, { "cell_type": "code", "execution_count": 77, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 2334303\r\n" ] } ], "source": [ "#Exon\n", "!{bedtoolsDirectory}/intersectBed \\\n", "-u \\\n", "-a C_virginica-3.0-fuzznuc-CGmotif.gff \\\n", "-b C_virginica-3.0-exon.gff \\\n", "| wc -l" ] }, { "cell_type": "code", "execution_count": 78, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 281715\r\n" ] } ], "source": [ "#lncRNA\n", "!{bedtoolsDirectory}/intersectBed \\\n", "-u \\\n", "-a C_virginica-3.0-fuzznuc-CGmotif.gff 
\\\n", "-b C_virginica-3.0-lncRNA.gff \\\n", "| wc -l" ] }, { "cell_type": "code", "execution_count": 79, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 7507167\r\n" ] } ], "source": [ "#mRNA\n", "!{bedtoolsDirectory}/intersectBed \\\n", "-u \\\n", "-a C_virginica-3.0-fuzznuc-CGmotif.gff \\\n", "-b C_virginica-3.0-mRNA.gff \\\n", "| wc -l" ] }, { "cell_type": "code", "execution_count": 80, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 12138514\r\n" ] } ], "source": [ "#nonCDS\n", "!{bedtoolsDirectory}/intersectBed \\\n", "-u \\\n", "-a C_virginica-3.0-fuzznuc-CGmotif.gff \\\n", "-b C_virginica-3.0-nonCDS.bed \\\n", "| wc -l" ] }, { "cell_type": "code", "execution_count": 81, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 5497597\r\n" ] } ], "source": [ "#Introns\n", "!{bedtoolsDirectory}/intersectBed \\\n", "-u \\\n", "-a C_virginica-3.0-fuzznuc-CGmotif.gff \\\n", "-b C_virginica-3.0-intron.bed \\\n", "| wc -l" ] }, { "cell_type": "code", "execution_count": 82, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 606308\r\n" ] } ], "source": [ "#Exon UTR\n", "!{bedtoolsDirectory}/intersectBed \\\n", "-u \\\n", "-a C_virginica-3.0-fuzznuc-CGmotif.gff \\\n", "-b C_virginica-3.0-exonUTR.gff \\\n", "| wc -l" ] }, { "cell_type": "code", "execution_count": 83, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 694265\r\n" ] } ], "source": [ "#Upstream flanks\n", "!{bedtoolsDirectory}/intersectBed \\\n", "-u \\\n", "-a C_virginica-3.0-fuzznuc-CGmotif.gff \\\n", "-b C_virginica-3.0-upstream.gff \\\n", "| wc -l" ] }, { "cell_type": "code", "execution_count": 84, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 616684\r\n" ] } ], "source": [ "#Downstream flanks\n", "!{bedtoolsDirectory}/intersectBed \\\n", "-u \\\n", "-a C_virginica-3.0-fuzznuc-CGmotif.gff 
\\\n", "-b C_virginica-3.0-downstream.gff \\\n", "| wc -l" ] }, { "cell_type": "code", "execution_count": 85, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 5417334\r\n" ] } ], "source": [ "#Intergenic regions\n", "!{bedtoolsDirectory}/intersectBed \\\n", "-u \\\n", "-a C_virginica-3.0-fuzznuc-CGmotif.gff \\\n", "-b C_virginica-3.0-intergenic.bed \\\n", "| wc -l" ] }, { "cell_type": "code", "execution_count": 86, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 611471\r\n" ] } ], "source": [ "#Transposable elements\n", "!{bedtoolsDirectory}/intersectBed \\\n", "-u \\\n", "-a C_virginica-3.0-fuzznuc-CGmotif.gff \\\n", "-b C_virginica-3.0-rm.te.bed \\\n", "| wc -l" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "anaconda-cloud": {}, "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.4" } }, "nbformat": 4, "nbformat_minor": 1 }