--- title: "02-lncRNA-position" format: html editor: visual --- #!perl use strict; use warnings; # Input format check die "USAGE: lncRNA_file gtf_file OUT_file" unless @ARGV == 3; open IN1,"$ARGV[0]" || die "$!"; open IN2,"$ARGV[1]" || die "$!"; open OUT,"> $ARGV[2]" || die "$!"; my %gene_data; my %gene_dir; # Read the GTF file and store exon information while(){ chomp; my @data = split /\t/,$_; if ($data[2] ne 'exon'){ next; } # Extract gene_name or use gene_id if gene_name is missing my $gene_name; if ($data[8] =~ /gene_name\s\"(.*?)\"/) { $gene_name = $1; } elsif ($data[8] =~ /gene_id\s\"(.*?)\"/) { $gene_name = $1; } else { next; # Skip if no valid identifier is found } my @arr = [$data[3], $data[4]]; push(@{$gene_data{$gene_name}}, @arr); $gene_dir{$gene_name} = $data[6]; } # Read the lncRNA file and classify lncRNAs my @results; while(){ chomp; my $result; my $smname; my @data = split /\t/,$_; # Extract transcript name (lncRNA) my @rna = split /;/, $data[8]; $rna[1] =~ /\"(.*)\"/; my $linc = $1; # Extract distance to the closest gene $rna[2] =~ /\"(\d+)\"/; my $distance = $1; # Extract closest gene name or ID $_ =~ /closest_gene\s\"(.*?)\"/; $smname = $1; # Classification based on distance and strand orientation if ($distance > 1000){ # c1 - Intergenic $result = "$linc\tIntergenic"; } elsif($distance > 0){ # c1 and c2 if ($gene_dir{$smname} eq $data[6]) { $result = "$linc\tIntergenic"; } else { $result = "$linc\tBidirectional"; } } elsif($distance == 0){ # c3, c4, c5 if ($gene_dir{$smname} ne $data[6]){ # c3 - Antisense $result = "$linc\tAntisense"; } else { # c4 and c5 - Sense (Exonic or Intronic) my $m = 0; foreach my $gename (keys %gene_data){ if ($gename eq $smname){ my @num = @{$gene_data{$gename}}; for (my $i = 0; $i <= $#num; $i++){ if(($num[$i][0] < $data[4]) && ($num[$i][1] > $data[3])){ $m++; } } } } if ($m > 0){ $result = "$linc\tExonic Sense"; } else { $result = "$linc\tIntronic Sense"; } } } push(@results, $result); } # Remove duplicates and write results to output file my %hash; @results = grep { !$hash{$_}++ } @results; foreach my $result (@results){ print OUT "$result\n"; } close IN1; close IN2; close OUT;