---
title: "22-Concatenated-genome"
output: html_document
---
Goal: produce a version of the genome that BS-Snper can handle, so that we can identify C/T SNPs.
########### -------------- CREATE CONCATENATED RED KING CRAB GENOME WITH ~50 SUPER-CONTIGS -------------------############################
# This step is to speed up this job. Otherwise it takes weeks/months/years!
# Check to see if there are instances of 1000 N in the genome already - this returns the number of contigs that contain exactly 1000 consecutive N's
echo "This is the number of contigs that contain exactly 1000 consecutive N's"
grep N Paralithodes.camtschaticus.genome.fasta | awk -F '[^N]+' '{for (i=1; i<=NF; i++) if ($i != "") print length($i)}' | grep 1000$ | wc -l
```{bash}
#######################################
# Count the runs of consecutive N's whose length is exactly the requested
# value in a FASTA file.
#
# The earlier pipeline filtered run lengths with `grep 1000$`, which also
# matches any length that merely ends in 1000 (11000, 21000, ...). Comparing
# the length numerically inside awk counts exact matches only.
#
# Arguments: $1 - FASTA file to scan
#            $2 - run length to look for (optional, default 1000)
# Outputs:   number of matching N runs on stdout. This is one count per run,
#            not per contig, and runs are measured within a single line.
# Returns:   exit status of awk
#######################################
count_exact_n_runs() {
  local fasta=$1
  local run_len=${2:-1000}
  # Splitting on "anything that is not N" leaves each N run as its own field.
  # Header lines are skipped so that only sequence is inspected.
  awk -F '[^N]+' -v want="$run_len" '
    /^>/ { next }
    { for (f = 1; f <= NF; f++) if (length($f) == want) hits++ }
    END { print hits + 0 }
  ' "$fasta"
}

count_exact_n_runs ../data/bgdata/Olurida_v081.fa
```
```{bash}
# Number of sequences in the original assembly: one FASTA header (a line
# starting with ">") per sequence. `grep -c` counts the matching lines itself,
# which replaces the obsolescent `fgrep` alias and the extra `wc -l` process.
grep -c '^>' ../data/bgdata/Olurida_v081.fa
```
# Concatenate contigs together, where i = the number of sequences to combine minus 1 (e.g. i=17196 concatenates 17197 sequences; the commands below use i=10628, i.e. 10629 sequences are concatenated)
echo "Starting genome concatenation"
awk -v i=10628 -f /home/lspencer/2022-redking-OA/scripts/merge_contigs.awk.txt \
Paralithodes.camtschaticus.genome.fasta | \
sed 's/> Seq_1/>Seq_1/' > Paralithodes.camtschaticus.genome_concat.fa # remove the weird white space before Seq_1
```{bash}
# Merge the contigs of the original assembly with the external awk program
# merge_contigs.awk.txt (resolved relative to the working directory) and save
# the result as a new FASTA.
# NOTE(review): per the notes above this chunk, i is the number of sequences
# to combine minus one; confirm against merge_contigs.awk.txt.
genome_in=../data/bgdata/Olurida_v081.fa
merged_out=../data/bgdata/Olurida_v081-mergecat.fa

printf '%s\n' "Starting genome concatenation"
awk -v i=10628 -f merge_contigs.awk.txt "$genome_in" > "$merged_out"
```
```{bash}
# Remove the space between ">" and "Contig0" wherever "> Contig0" occurs
# (first occurrence per line) and write the cleaned FASTA to a new file.
merged_in=../data/bgdata/Olurida_v081-mergecat.fa
cleaned_out=../data/bgdata/Olurida_v081-mergecat01.fa

sed -e 's/> Contig0/>Contig0/' "$merged_in" > "$cleaned_out"
```
```{bash}
# Print the first line of the cleaned FASTA for a quick visual check.
head -n 1 ../data/bgdata/Olurida_v081-mergecat01.fa
```
## Edit fasta headers from concatenated IDs to the range (e.g. "Seq_1 Seq_2 Seq_3" to "Seq_1:Seq_3")
# create variables
n_contigs_old=$(grep ">" Paralithodes.camtschaticus.genome.fasta | wc -l) #number of contigs in original fasta
d="${n_contigs_old//[^[:digit:]]/}" #Return ID of last/highest number contig
digits=$(echo ${#d}) #number of digits in the last contig
n_contigs_new=$(grep ">" Paralithodes.camtschaticus.genome_concat.fa | wc -l) #number of contigs in concatenated genome
n=$(seq 1 1 $n_contigs_new) #vector of 1:# of contigs in concat. genome
```{bash}
# Collect header counts for the original and the concatenated FASTA and print
# them for inspection.
orig_fa=../data/bgdata/Olurida_v081.fa
concat_fa=../data/bgdata/Olurida_v081-mergecat01.fa

# Header lines (lines containing ">") in the original FASTA.
n_contigs_old=$(grep ">" "$orig_fa" | wc -l)
# That count with every non-digit character stripped (it is a count, not a contig ID).
d=${n_contigs_old//[^[:digit:]]/}
# How many digits the count has.
digits=${#d}
# Header lines in the concatenated FASTA.
n_contigs_new=$(grep ">" "$concat_fa" | wc -l)
# The integers 1..n_contigs_new.
n=$(seq 1 1 $n_contigs_new)

# Left unquoted on purpose: word splitting trims any padding and prints the
# whole sequence held in $n on a single line.
for value in "$n_contigs_old" "$d" "$digits" "$n_contigs_new" "$n"; do
  echo $value
done
```
# loop over each new contig header, paste first and last ID # together and replace concat header with new ID
for i in $n
do
old=$(grep ">" Paralithodes.camtschaticus.genome_concat.fa | sed "${i}q;d")
start=$(echo $old | grep -o -E '[0-9]{1,6}' | head -n 1) #If needed, change the 6 to match the value in $digits
end=$(echo $old | grep -o -E '[0-9]{1,6}' | tail -n 1) #If needed, change the 6 to match the value in $digits
new=$(echo ">Seq_"$start":Seq_"$end)
sed -i.bak "s|$old|$new|" Paralithodes.camtschaticus.genome_concat.fa # edit fasta in place.
```{bash}
# Rename every header of the concatenated FASTA to the range of contigs it
# covers, e.g. ">Contig0 Contig1 Contig2" becomes ">Contig0:Contig2".
#
# Why this replaces the per-header loop: each loop iteration ran sed on the
# unmodified mergecat01.fa and redirected to the same mergecat09.fa, so every
# pass overwrote the previous one and only the last header ended up renamed.
# The loop also re-read the whole FASTA once per header. One awk pass renames
# all headers, and taking whole digit runs means no digit-count limit has to
# be tuned by hand.

#######################################
# Rewrite concatenated FASTA headers as ">Contig<first>:Contig<last>".
# Arguments: $1 - input FASTA with concatenated headers
#            $2 - output FASTA (overwritten)
# Outputs:   renamed FASTA written to $2; non-header lines and headers that
#            contain no digits are copied unchanged
# Returns:   non-zero if awk fails or the output cannot be opened
#######################################
rename_concat_headers() {
  local in_fa=$1
  local out_fa=$2
  awk '
    /^>/ {
      # Splitting on non-digits leaves each contig number as its own element.
      n = split($0, num, /[^0-9]+/)
      first = ""
      last = ""
      for (k = 1; k <= n; k++) if (length(num[k]) > 0) { first = num[k]; break }
      for (k = n; k >= 1; k--) if (length(num[k]) > 0) { last = num[k]; break }
      if (length(first) > 0) {
        print ">Contig" first ":Contig" last
        next
      }
    }
    { print }
  ' "$in_fa" > "$out_fa"
}

orig_fa=../data/bgdata/Olurida_v081.fa
concat_fa=../data/bgdata/Olurida_v081-mergecat01.fa
renamed_fa=../data/bgdata/Olurida_v081-mergecat09.fa

echo "Headers in original FASTA:     $(grep -c '^>' "$orig_fa")"
echo "Headers in concatenated FASTA: $(grep -c '^>' "$concat_fa")"
echo "going to next step"

# Only show the preview when the rename succeeded, so a failure is not hidden
# behind the exit status of the preview pipeline.
rename_concat_headers "$concat_fa" "$renamed_fa" && {
  echo "First renamed headers:"
  grep '^>' "$renamed_fa" | head -n 5
}
```
```{bash}
# Replace every line that contains ">" (the FASTA headers) with the
# placeholder header ">Contig00"; all other lines pass through unchanged.
sed -e 's/^.*>.*$/>Contig00/' \
  ../data/bgdata/Olurida_v081-mergecat01.fa \
  > ../data/bgdata/Olurida_v081-mergecat02.fa
```
```{bash}
# Preview only: number the placeholder headers and list the resulting header
# lines without writing a file. For each field that contains "00", the first
# "00" on the line is replaced by a running counter (1, 2, 3, ...).
awk '
  {
    for (col = 1; col <= NF; col++) {
      if ($col ~ /00/) {
        sub(/00/, ++serial)
      }
    }
    print
  }
' ../data/bgdata/Olurida_v081-mergecat02.fa | grep -F ">"
```
```{bash}
# Number the placeholder headers and save the result: for each field that
# contains "00", the first "00" on the line is replaced by a running counter
# (1, 2, 3, ...). Every line, changed or not, is written to the output FASTA.
placeholder_fa=../data/bgdata/Olurida_v081-mergecat02.fa
numbered_fa=../data/bgdata/Olurida_v081-mergecat99.fa

awk '
  {
    for (col = 1; col <= NF; col++) {
      if ($col ~ /00/) {
        sub(/00/, ++serial)
      }
    }
    print
  }
' "$placeholder_fa" > "$numbered_fa"
```
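^^^^^^^^^^^^^
This is it: the chunk above writes the FASTA with renumbered headers (`../data/bgdata/Olurida_v081-mergecat99.fa`).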
```{bash}
# Disabled command kept for reference; nothing in this chunk is executed.
# It appears to be an alternative attempt at numbering lines of mergecat01.fa.
# NOTE(review): in gawk the pattern "\\>" is a word-boundary operator rather
# than a literal ">" - verify the intended match before re-enabling.
#awk 'sub("\\>",cnt+1, $0){cnt++}1' ../data/bgdata/Olurida_v081-mergecat01.fa > ../data/bgdata/Olurida_v081-mergecat03.fa
```
```{bash}
# Debug run of the header-renaming logic on a single header (the 10th) of the
# concatenated FASTA.
#
# Changes from the earlier version of this chunk:
#   - the in-place edit pointed at ../bgdata/... although the header was read
#     from ../data/bgdata/...; both now use the same file
#   - "What is old?" was printed without the value it announces
#   - the header line is addressed by line number instead of being used as a
#     sed regular expression, so its content cannot affect the match
#   - whole digit runs are taken, so no digit-count limit has to be tuned
#
# NOTE: this edits mergecat01.fa in place (a copy is kept as
# mergecat01.fa.bak); other chunks in this notebook read the same file.

#######################################
# Rename the i-th header of a FASTA file to ">Contig<first>:Contig<last>",
# where first/last are the first and last numbers found in that header.
# Arguments: $1 - FASTA file (edited in place, backup written to $1.bak)
#            $2 - 1-based index of the header to rename
# Outputs:   the old header, start, end and new header on stdout
# Returns:   1 if the header does not exist or holds no number, otherwise
#            the exit status of sed
#######################################
rename_nth_header() {
  local fasta=$1
  local i=$2
  local hit line_no old start end new

  # grep -n prefixes each header with "<line number>:"; keep only the i-th hit.
  hit=$(grep -n ">" "$fasta" | sed "${i}q;d")
  if [[ -z "$hit" ]]; then
    echo "No header number $i in $fasta" >&2
    return 1
  fi
  line_no=${hit%%:*}
  old=${hit#*:}
  echo "What is old?"
  printf '%s\n' "$old"

  start=$(grep -o -E '[0-9]+' <<<"$old" | head -n 1)
  end=$(grep -o -E '[0-9]+' <<<"$old" | tail -n 1)
  if [[ -z "$start" ]]; then
    echo "Header $i of $fasta contains no contig number" >&2
    return 1
  fi
  echo "What is start?"
  echo "$start"
  echo "What is end?"
  echo "$end"

  new=">Contig${start}:Contig${end}"
  echo "What is new?"
  echo "$new"

  # Replace exactly that line; $new holds only ">", "Contig", digits and ":",
  # none of which is special in a sed replacement delimited by "|".
  sed -i.bak "${line_no}s|.*|${new}|" "$fasta"
}

orig_fa=../data/bgdata/Olurida_v081.fa
concat_fa=../data/bgdata/Olurida_v081-mergecat01.fa

echo "Headers in original FASTA:     $(grep -c '^>' "$orig_fa")"
echo "Headers in concatenated FASTA: $(grep -c '^>' "$concat_fa")"
echo "going to next step"
rename_nth_header "$concat_fa" 10
```
sed "s|$old|$new|"