perl 分割 fasta 文件

2023年1月21日 362次阅读来源: 正踪大米饭儿
本来想自己写的，结果看到网上有，功能也比较全面，挺好的，我自己就偷懒了。
脚本来源：http://kirill-kryukov.com/study/tools/fasta-splitter/files/fasta-splitter-0.2.6.zip
使用也比较简单，看说明吧！
#!/usr/bin/env perl
#
#  FASTA Splitter  -  a script for partitioning a FASTA file into pieces
#
#  Version 0.2.6 (August 1, 2017)
#
#  Copyright (c) 2012-2017 Kirill Kryukov
#
#  This software is provided 'as-is', without any express or implied
#  warranty. In no event will the authors be held liable for any damages
#  arising from the use of this software.
#
#  Permission is granted to anyone to use this software for any purpose,
#  including commercial applications, and to alter it and redistribute it
#  freely, subject to the following restrictions:
#
#  1. The origin of this software must not be misrepresented; you must not
#     claim that you wrote the original software. If you use this software
#     in a product, an acknowledgment in the product documentation would be
#     appreciated but is not required.
#  2. Altered source versions must be plainly marked as such, and must not be
#     misrepresented as being the original software.
#  3. This notice may not be removed or altered from any source distribution.
#

use strict;
use File::Basename qw(basename);
use File::Path qw(make_path);
use Getopt::Long qw(:config pass_through);

$| = 1;

my ($script_name,$script_version,$script_date,$script_years) = ('fasta-splitter','0.2.6','2017-08-01','2012-2017');
my $start_time = time;

my @files = ();
my ($opt_n_parts,$opt_part_size,$opt_part_num_prefix,$opt_measure,$opt_line_len,$opt_eol,$out_dir,$nopad,$ver,$help);
GetOptions('n-parts=i'         => \$opt_n_parts,
           'part-size=i'       => \$opt_part_size,
           'part-num-prefix=s' => \$opt_part_num_prefix,
           'measure=s'         => \$opt_measure,
           'line-length=i'     => \$opt_line_len,
           'eol=s'             => \$opt_eol,
           'out-dir=s'         => \$out_dir,
           'nopad'             => \$nopad,
           'version'           => \$ver,
           'help'              => \$help);
for (my $i=0; $i<scalar(@ARGV); $i++)
{
    if (substr($ARGV[$i],0,1) eq '-' and $i < scalar(@ARGV)-1)
    {
        if ($ARGV[$i] eq '-n-parts-total'     ) { $opt_n_parts   = int($ARGV[++$i]); $opt_measure = 'all'; }
        if ($ARGV[$i] eq '-n-parts-sequence'  ) { $opt_n_parts   = int($ARGV[++$i]); $opt_measure = 'seq'; }
        if ($ARGV[$i] eq '-part-total-size'   ) { $opt_part_size = int($ARGV[++$i]); $opt_measure = 'all'; }
        if ($ARGV[$i] eq '-part-sequence-size') { $opt_part_size = int($ARGV[++$i]); $opt_measure = 'seq'; }
        if ($ARGV[$i] eq '-line-length') { $opt_line_len = int($ARGV[++$i]); }
        if ($ARGV[$i] eq '-eol'        ) { $opt_eol      = int($ARGV[++$i]); }
    }
    else { push @files, $ARGV[$i]; }
}

my $ver_str = "$script_name, version $script_version, $script_date\nCopyright (c) $script_years Kirill Kryukov\n";
my $help_str = qq{Usage: ${script_name} [options] <file>...
Options:
    --n-parts <N>        - Divide into <N> parts
    --part-size <N>      - Divide into parts of size <N>
    --measure (all|seq|count) - Specify whether all data, sequence length, or
                           number of sequences is used for determining part
                           sizes ('all' by default).
    --line-length        - Set output sequence line length, 0 for single line
                           (default: 60).
    --eol (dos|mac|unix) - Choose end-of-line character ('unix' by default).
    --part-num-prefix T  - Put T before part number in file names (def.: .part-)
    --out-dir            - Specify output directory.
    --nopad              - Don't pad part numbers with 0.
    --version            - Show version.
    --help               - Show help.
};

print (($ver ? $ver_str : ''), ($help ? $help_str : ''));
if (!defined($opt_n_parts) and !defined($opt_part_size) and !defined($opt_measure) and !defined($opt_line_len) and !defined($opt_eol))
{
    if (!$help and !$ver) { print $ver_str, $help_str; } exit;
}
if (!defined($opt_n_parts) and !defined($opt_part_size)) { die "Splitting method is not specified\nUse -h for help\n"; }
if (!@files) { die "File for splitting is not specified\n"; }

if (defined($opt_n_parts) and $opt_n_parts <= 0) { die "Non-positive number of parts\n"; }
if (defined($opt_part_size) and $opt_part_size <= 0) { die "Non-positive part size\n"; }
if (defined($opt_measure) and $opt_measure ne 'all' and $opt_measure ne 'seq' and $opt_measure ne 'count') { die "Unknown value of --measure option\n"; }
if (defined($opt_eol) and $opt_eol ne 'dos' and $opt_eol ne 'mac' and $opt_eol ne 'unix') { die "Unknown value of --eol option\n"; }
if (defined($out_dir))
{
    $out_dir =~ s/[\/\\]+$//;
    if (!-e $out_dir) { make_path($out_dir); }
    if (!-e $out_dir || !-d $out_dir) { die "Can't create output directory \"$out_dir\"\n"; }
    $out_dir .= '/';
}

my $n_parts = defined($opt_n_parts) ? $opt_n_parts : 0;
my $part_size = defined($opt_part_size) ? $opt_part_size : 0;
my $line_len = (defined($opt_line_len) and $opt_line_len >= 0) ? $opt_line_len : 60;
my $eol = defined($opt_eol) ? (($opt_eol eq 'dos') ? "\x0D\x0A" : ($opt_eol eq 'mac') ? "\x0D" : "\x0A") : "\x0A";
my $eol_len = length($eol);
my $measure = defined($opt_measure) ? (($opt_measure eq 'count') ? 0 : ($opt_measure eq 'seq') ? 1 : 2) : 2;
my $part_num_prefix = defined($opt_part_num_prefix) ? $opt_part_num_prefix : '.part-';
my @part_start = ();
my ($base,$ext,$num_len,$total_size);
my ($OUT,$name,$data,$written_total,$written_this_part,$part_end,$part);

foreach my $infile (@files) { split_file($infile); }

my $elapsed_time = time - $start_time;
print "All done, $elapsed_time second", (($elapsed_time==1)?'':'s'), " elapsed\n";

sub split_file
{
    my ($infile) = @_;
    if (!-e $infile or !-f $infile) { print "Can't find file \"$infile\"\n"; return; }
    print $infile;

    ($base,$ext) = (basename($infile),'');
    if ($base =~ /^(.+?)(\.[^\.]+)$/) { ($base,$ext) = ($1,$2); }

    @part_start = ();
    my ($n_seq,$total_seq_len,$n_parts_found) = (0,0,0);

    if ($part_size)
    {
        ($n_seq,$total_seq_len,$total_size,$n_parts_found) = get_file_size_and_part_boundaries($infile);
        if (!$n_parts) { print ": $n_seq sequences, $total_seq_len bp"; }
        print ' => ', ($n_parts ? 'extracting' : 'dividing into'), ' ', $n_parts_found, ' part', ($n_parts_found > 1 ? 's' : ''),
              " of <= $part_size ", ($measure ? (($measure > 1) ? 'bytes' : 'bp') : 'sequences'), "\n";
        open(my $IN,'<',$infile) or die "Error: Can't open file \"$infile\"\n";
        binmode $IN;
        $num_len = length($n_parts_found);
        $OUT = undef;
        my ($out_file,$part,$si,$buffer) = (undef,0,-1,'');
        while (<$IN>)
        {
            $_ =~ s/[\x0D\x0A]+$//;
            if (substr($_,0,1) eq '>')
            {
                if ($OUT)
                {
                    if ($line_len == 0) { if ($si >= 0) { print $OUT $eol; } }
                    elsif ($buffer ne '') { print $OUT $buffer, $eol; $buffer = ''; }
                }
                $si++;
                if ($si >= $part_start[$part+1])
                {
                    if ($OUT) { close $OUT; }
                    $part++;
                    if ($part > $n_parts_found) { last; }
                    $out_file = $out_dir . $base . $part_num_prefix . ($nopad ? $part : sprintf('%0*d',$num_len,$part)) . $ext;
                    open($OUT,'>',$out_file) or die "Can't create output file \"$out_file\"\n";
                    binmode $OUT;
                }
                print $OUT $_, $eol;
                next;
            }
            if ($line_len)
            {
                $buffer .= $_;
                while (length($buffer) >= $line_len) { print $OUT substr($buffer,0,$line_len,''), $eol; }
            }
            else { print $OUT $_; }
        }
        close $IN;
        if ($OUT)
        {
            if (!$line_len) { if ($si >= 0) { print $OUT $eol; } }
            elsif ($buffer ne '') { print $OUT $buffer, $eol; $buffer = ''; }
            close $OUT;
        }
    }
    else
    {
        ($n_seq,$total_seq_len,$total_size) = get_file_size($infile);
        print ": $n_seq sequences, $total_seq_len bp => dividing into $n_parts part", ($n_parts > 1 ? 's' : ''), " ";
        open(my $IN,'<',$infile) or die "Error: Can't open file \"$infile\"\n";
        binmode $IN;
        $num_len = length($n_parts);
        ($OUT,$name,$data,$written_total,$written_this_part,$part_end,$part) = (undef,undef,'',0,0,int($total_size / $n_parts),1);
        while(<$IN>)
        {
            $_ =~ s/[\x0D\x0A]+$//;
            if (substr($_,0,1) eq '>')
            {
                if (defined $name) { dump_seq(); }
                $name = $_; $data = ''; next;
            }
            $data .= $_;
        }
        if (defined $name) { dump_seq(); }
        close $IN;
        if ($OUT) { close $OUT; }
        print " OK\n";
    }
}

sub dump_seq
{
    my $slen = length($data);
    my $seq_size = seq_size(length($name),$slen);
    my $new_written_total = $written_total + $seq_size;
    if ( !$OUT or
         ($written_this_part and ($new_written_total > $part_end) and ($new_written_total - $part_end > $part_end - $written_total)) )
    {
        if ($OUT) { close $OUT; $part++; $part_end = int($total_size / $n_parts * $part) + 1; }

        my $part_file = $out_dir . $base . $part_num_prefix . ($nopad ? $part : sprintf('%0*d',$num_len,$part)) . $ext;

        open($OUT,'>',$part_file) or die "Error: Can't create file \"$part_file\"\n";
        binmode $OUT;
        $written_this_part = 0;
        print ".";
    }
    print $OUT $name, $eol;
    if ($line_len) { for (my $s=0; $s<$slen; $s+=$line_len) { print $OUT substr($data,$s,$line_len), $eol; } }
    else { print $OUT $data, $eol; }
    $written_this_part += $seq_size;
    $written_total += $seq_size;
}

sub get_file_size_and_part_boundaries
{
    my ($file) = @_;
    open(my $IN,'<',$file) or die "Error: Can't open file \"$file\"\n";
    binmode $IN;
    my ($nseq,$total_seq_length,$total_size,$n_parts_found,$this_part_size,$nlen,$slen,$stop) = (0,0,0,1,0,0,0,0);
    $part_start[1] = 0;
    while (<$IN>)
    {
        $_ =~ s/[\x0D\x0A]+$//;
        my $len = length($_);
        if (substr($_,0,1) eq '>')
        {
            if ($nlen)
            {
                my $seq_size = seq_size($nlen,$slen);
                if ($part_size and $this_part_size and ($this_part_size + $seq_size > $part_size))
                {
                    if ($n_parts and $n_parts_found == $n_parts) { $stop = 1; last; }
                    else { $this_part_size = $seq_size; $n_parts_found++; $part_start[$n_parts_found] = $nseq; }
                }
                else { $this_part_size += $seq_size; }
                $nseq++; $total_seq_length += $slen; $total_size += $seq_size;
            }
            ($nlen,$slen) = ($len,0); next;
        }
        if ($nlen) { $slen += $len; }
    }
    if ($nlen and !$stop)
    {
        my $seq_size = seq_size($nlen,$slen);
        if ($part_size and $this_part_size and ($this_part_size + $seq_size > $part_size))
        {
            if ($n_parts and $n_parts_found == $n_parts) { $stop = 1; }
            else { $this_part_size = $seq_size; $n_parts_found++; $part_start[$n_parts_found] = $nseq; }
        }
        if (!$stop) { $nseq++; $total_seq_length += $slen; $total_size += $seq_size; }
    }
    close $IN;
    $part_start[$n_parts_found+1] = $nseq;
    return ($nseq,$total_seq_length,$total_size,$n_parts_found);
}

sub get_file_size
{
    my ($file) = @_;
    open(my $IN,'<',$file) or die "Error: Can't open file \"$file\"\n";
    binmode $IN;
    my ($nseq,$total_seq_length,$total_size,$nlen,$slen) = (0,0,0,0,0);
    while (<$IN>)
    {
        $_ =~ s/[\x0D\x0A]+$//;
        my $len = length($_);
        if (substr($_,0,1) eq '>')
        {
            if ($nlen) { $nseq++; $total_seq_length += $slen; $total_size += seq_size($nlen,$slen); }
            ($nlen,$slen) = ($len,0); next;
        }
        if ($nlen) { $slen += $len; }
    }
    if ($nlen) { $nseq++; $total_seq_length += $slen; $total_size += seq_size($nlen,$slen); }
    close $IN;
    return ($nseq,$total_seq_length,$total_size);
}

sub seq_size
{
    my ($nlen,$slen) = @_;
    return ($measure == 0) ? 1 :
           ($measure == 1) ? $slen :
           $slen + $nlen + $eol_len*(1 + ($line_len ? int(($slen+$line_len-1)/$line_len) : 1));
}
    原文作者：正踪大米饭儿
    原文地址: https://www.jianshu.com/p/9b84731b30d2
    本文转自网络文章，转载此文章仅为分享知识，如有侵权，请联系博主进行删除。