---
title: "22-Concatenated-genome"
output: html_document
---

Effort to get a version of the genome that BS-Snper will like, such that we can determine CT SNPs.

########### -------------- CREATE CONCATENATED RED KING CRAB GENOME WITH ~50 SUPER-CONTIGS -------------------############################
# This step is to speed up this job. Otherwise it takes weeks/months/years!

# Check to see if there are instances of 1000 N in the genome already - this returns the number of contigs that contain exactly 1000 consecutive N's
echo "This is the number of contigs that contain exactly 1000 consecutive N's"
grep N Paralithodes.camtschaticus.genome.fasta | awk -F '[^N]+' '{for (i=1; i<=NF; i++) if ($i != "") print length($i)}' | grep 1000$ | wc -l

```{bash}
# Count runs of consecutive N's whose length is exactly 1000.
# awk splits every N-containing line on non-N characters, so each non-empty
# field is one run of N's and its length is printed on its own line.
# `grep -x 1000` keeps only lengths equal to 1000 (a bare `1000$` would also
# match 11000, 21000, ...). Runs are measured per line of the FASTA file.
# `wc -l` stays at the end so a count of zero does not fail the chunk.
grep N ../data/bgdata/Olurida_v081.fa \
  | awk -F '[^N]+' '{ for (i = 1; i <= NF; i++) if ($i != "") print length($i) }' \
  | grep -x 1000 \
  | wc -l
```

```{bash}
# Number of header lines (sequences) in the original assembly.
grep -F ">" ../data/bgdata/Olurida_v081.fa | wc -l
```

# Concatenate contigs together, where i=the number of sequences to combine -1 (i.e. here i=17196, so 17197 sequences will be concatenated)
echo "Starting genome concatenation"
awk -v i=10628 -f /home/lspencer/2022-redking-OA/scripts/merge_contigs.awk.txt \
Paralithodes.camtschaticus.genome.fasta | \
sed 's/> Seq_1/>Seq_1/' > Paralithodes.camtschaticus.genome_concat.fa # remove the weird white space before Seq_1

```{bash}
# Merge contigs into super-contigs with merge_contigs.awk.txt (expected in the
# working directory). Per the note above, i is the number of sequences to
# combine minus one.
echo "Starting genome concatenation"
awk -v i=10628 -f merge_contigs.awk.txt \
  ../data/bgdata/Olurida_v081.fa > ../data/bgdata/Olurida_v081-mergecat.fa
```

```{bash}
# Remove the stray space between ">" and the first ID ("> Contig0").
sed 's/> Contig0/>Contig0/' ../data/bgdata/Olurida_v081-mergecat.fa \
  > ../data/bgdata/Olurida_v081-mergecat01.fa
```

```{bash}
head -1 ../data/bgdata/Olurida_v081-mergecat01.fa
```

## Edit fasta headers from concatenated IDs to the range (e.g. "Seq_1 Seq_2 Seq_3" to "Seq_1:Seq_3")

# create variables
n_contigs_old=$(grep ">" Paralithodes.camtschaticus.genome.fasta | wc -l) #number of contigs in original fasta
d="${n_contigs_old//[^[:digit:]]/}" #Return ID of last/highest number contig
digits=$(echo ${#d}) #number of digits in the last contig
n_contigs_new=$(grep ">" Paralithodes.camtschaticus.genome_concat.fa | wc -l) #number of contigs in concatenated genome
n=$(seq 1 1 $n_contigs_new) #vector of 1:# of contigs in concat. genome

```{bash}
# Summary numbers for the original and the concatenated genome.
orig_fa=../data/bgdata/Olurida_v081.fa
merged_fa=../data/bgdata/Olurida_v081-mergecat01.fa

n_contigs_old=$(grep -c ">" "$orig_fa")     # number of headers in the original fasta
d="${n_contigs_old//[^[:digit:]]/}"         # the same count with any non-digit characters stripped
digits=${#d}                                # number of digits in that count
n_contigs_new=$(grep -c ">" "$merged_fa")   # number of headers in the concatenated genome
n=$(seq 1 1 "$n_contigs_new")               # 1..n_contigs_new, one value per line

echo "$n_contigs_old"
echo "$d"
echo "$digits"
echo "$n_contigs_new"
echo $n   # deliberately unquoted: prints the whole sequence on one line
```

# loop over each new contig header, paste first and last ID
# together and replace concat header with new ID
for i in $n
do
old=$(grep ">" Paralithodes.camtschaticus.genome_concat.fa | sed "${i}q;d")
start=$(echo $old | grep -o -E '[0-9]{1,6}' | head -n 1) #If needed, change the 6 to match the value in $digits
end=$(echo $old | grep -o -E '[0-9]{1,6}' | tail -n 1) #If needed, change the 6 to match the value in $digits
new=$(echo ">Seq_"$start":Seq_"$end)
sed -i.bak "s|$old|$new|" Paralithodes.camtschaticus.genome_concat.fa # edit fasta in place.

```{bash}
# Rewrite every concatenated header as ">Contig<first>:Contig<last>", where
# <first> and <last> are the first and last numbers found in that header.
#
# This is done in a single awk pass. Running
#   sed "s|$old|$new|" mergecat01.fa > mergecat09.fa
# once per header re-creates the output from the unmodified input on every
# iteration, so only the last header would end up renamed. It also re-reads
# the whole genome once per header and uses the entire header as a sed regex.
merged_fa=../data/bgdata/Olurida_v081-mergecat01.fa
ranged_fa=../data/bgdata/Olurida_v081-mergecat09.fa

echo "Number of contigs in concatenated genome:"
grep -c ">" "$merged_fa"
echo "going to next step"

awk '
  />/ {
    # Split the header on runs of non-digits; non-empty pieces are the IDs.
    n_parts = split($0, ids, /[^0-9]+/)
    first = ""
    last = ""
    for (k = 1; k <= n_parts; k++) {
      if (ids[k] != "") {
        if (first == "") first = ids[k]
        last = ids[k]
      }
    }
    print ">Contig" first ":Contig" last
    next
  }
  { print }   # sequence lines pass through untouched
' "$merged_fa" > "$ranged_fa"

echo "New headers (first 10):"
grep ">" "$ranged_fa" | head
```

```{bash}
# Replace every header line with the placeholder ">Contig00"; the next chunks
# turn the placeholders into sequential names.
sed '/>/s/.*/>Contig00/' ../data/bgdata/Olurida_v081-mergecat01.fa > ../data/bgdata/Olurida_v081-mergecat02.fa
```

```{bash}
# Preview: number the headers sequentially (>Contig1, >Contig2, ...) and show
# only the resulting header lines.
awk '/>/ { print ">Contig" ++i }' ../data/bgdata/Olurida_v081-mergecat02.fa
```

```{bash}
# Same renumbering as the preview, written out with the sequence lines kept.
awk '/>/ { print ">Contig" ++i; next } { print }' ../data/bgdata/Olurida_v081-mergecat02.fa \
  > ../data/bgdata/Olurida_v081-mergecat99.fa
```

^^^^^^^^^^^^^ this is it!

```{bash}
#awk 'sub("\\>",cnt+1, $0){cnt++}1' ../data/bgdata/Olurida_v081-mergecat01.fa > ../data/bgdata/Olurida_v081-mergecat03.fa
```

```{bash}
# Debugging aid: walk through the header-range rename for a single header
# (the i-th one) and apply it to the concatenated fasta, keeping a .bak copy.
orig_fa=../data/bgdata/Olurida_v081.fa
merged_fa=../data/bgdata/Olurida_v081-mergecat01.fa

n_contigs_old=$(grep -c ">" "$orig_fa")     # number of headers in the original fasta
d="${n_contigs_old//[^[:digit:]]/}"         # the same count with any non-digit characters stripped
digits=${#d}                                # number of digits in that count
n_contigs_new=$(grep -c ">" "$merged_fa")   # number of headers in the concatenated genome
n=$(seq 1 1 "$n_contigs_new")               # 1..n_contigs_new, one value per line

echo "$n_contigs_old"
echo "$d"
echo "$digits"
echo "$n_contigs_new"
echo $n   # deliberately unquoted: prints the whole sequence on one line

echo "going to next step"

i=10
old=$(grep ">" "$merged_fa" | sed "${i}q;d")   # the i-th header line
echo "What is old?"
printf '%s\n' "${old:0:200}"                   # preview only; a merged header can be very long

# [0-9]+ takes each ID whole, so no digit limit has to be tuned by hand.
start=$(printf '%s\n' "$old" | grep -o -E '[0-9]+' | head -n 1)
echo "What is start?"
echo "$start"
end=$(printf '%s\n' "$old" | grep -o -E '[0-9]+' | tail -n 1)
echo "What is end?"
echo "$end"
new=">Contig${start}:Contig${end}"
echo "What is new?"
echo "$new"

# Replace the i-th header by position instead of using the whole old header
# as a sed pattern. The untouched file is kept next to it as *.bak.
cp "$merged_fa" "$merged_fa.bak"
awk -v target="$i" -v new="$new" '
  />/ && ++h == target { print new; next }
  { print }
' "$merged_fa.bak" > "$merged_fa"
```

sed "s|$old|$new|"