脚本功能如说明所示:可以根据参考 gtf 文件 对 stringtie –merge 的结果文件进行过滤筛选新的 lncRNA。
#!/usr/bin/perl -w
use strict;
## this program can fetch that none reference gtf file from stringtie merged result
## and filter that overlaped with know transcript in same strand
## and filter that only have one exon
my $gtf = $ARGV[0];
my $stringtie_merged = $ARGV[1];
die "Usage:perl $0 <hg38.gtf> <stringtie merged.gtf> \n" unless @ARGV == 2;
my %reference;
open REF,$gtf or die "Can't open the reference gtf file $!";
while (<REF>){
chomp;
next if (/^$/);
next if (/^#/);
my @a = split /\t/,$_;
my @b;
if ($a[2] eq "transcript"){
(my $id) = $_ =~ /\s+transcript_id\s+"(\S+)";/;
push @b,$a[3];
push @b,$a[4];
$reference{$a[6]}{$a[0]}{$id} = \@b;
}
}
close REF;
## deal with stringtie mered gtf file
my %transcript;
my %exon;
open IN,$stringtie_merged or die "Can't open the stringtie mweged gtf file $!";
while (<IN>){
chomp;
next if (/^$/);
next if (/^#/);
my @c = split /[\t;]/,$_;
(my $id) = $_ =~ /\s+transcript_id\s+"(\S+)";/;
if ($c[2] eq "transcript" && @c < 12) {
my $strand = $c[6];
my $start = $c[3];
my $end = $c[4];
my $chr = $c[0];
my $z = $reference{$strand}{$chr};
my $flag;
foreach my $k (sort keys %$z) {
if ($z->{$k}->[0] < $start && $z->{$k}->[1] > $start) {
$flag++;
}
if ($z->{$k}->[0] > $start && $z->{$k}->[1] < $end) {
$flag++;
}
if ($z->{$k}->[0] < $start && $z->{$k}->[1] > $end) {
$flag++;
}
if ($z->{$k}->[0] < $end && $z->{$k}->[1] > $end) {
$flag++;
}
}
if (not $flag) {
$transcript{$id} = $_;
# print "$_\n";
}
}
if ($c[2] eq "exon" ) {
push @{$exon{$id}},$_;
}
}
close IN;
foreach my $k (sort keys %transcript) {
if (@{$exon{$k}} > 1){
print join ("\n",$transcript{$k},@{ $exon{$k} } )."\n";
}
}
__END__