Perl - Uncompressing zip files on Windows is too slow

I've created an uncompress function, put together from a few code snippets plus a few alterations of my own, that automatically handles the file type.
My current use case is extracting a ~550 MB zip file containing a lot of files (the Qt 5.5 source code) from an SMB share on Windows.
On Linux the same content is a tgz file on an NFS share, and the function extracts it in 67 seconds (using a different decompression path than for zip files).
On Windows it takes more than 15 minutes.
I'm thinking about shelling out to 7z with a system() call as an alternative.
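Roughly what I have in mind, just as a sketch (it assumes 7z.exe is available on the PATH; x extracts with full paths, -o sets the output directory):

  # Sketch only: extract $fileToExtract into $targetPath via the 7z CLI.
  sub uncompress_7z {
      my ($fileToExtract, $targetPath) = @_;
      my @cmd = ('7z', 'x', '-y', "-o$targetPath", $fileToExtract);
      system(@cmd) == 0
          or die "7z failed on $fileToExtract (exit status " . ($? >> 8) . ")\n";
      return;
  }

Passing the arguments as a list avoids building the command line by hand.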
Do you have any suggestions for the fastest way to extract a zip file on Windows?
Please be honest if my uncompress function is bad; I'm no Perl expert... :)
Here's my code:
# uncompress full archive file $archFile to $destPath
sub uncompress
{
    my $fileToExtract = shift;
    my $targetPath    = shift;
    my $silent        = shift;

    my $status;
    my $buff;

    unless (-f $fileToExtract)
    {
        &error("$fileToExtract is not a file!");
    }
    unless (-d $targetPath)
    {
        &makeDir($targetPath, 1);
    }

    # just look for .tar since all .tar archives with all compressions can be extracted.
    if ($fileToExtract =~ m/\.tar/)
    {
        my $pwd = getcwd();
        changeDirectory($targetPath, 1);
        my $tar = Archive::Tar->new();
        $tar->read($fileToExtract);
        $tar->extract();
        changeDirectory($pwd, 1);
        return;
    }
    elsif ($fileToExtract =~ m/\.zip$/)
    {
        my $u = IO::Uncompress::Unzip->new($fileToExtract)
            or die "Cannot open $fileToExtract: $UnzipError";
        for ($status = 1; $status > 0; $status = $u->nextStream())
        {
            my $header = $u->getHeaderInfo();
            my (undef, $path, $name) = splitpath($header->{Name});
            my $destdir = "$targetPath$path";
            unless (-d $destdir)
            {
                &makeDir($destdir, 1);
            }
            if ($name =~ m!/$!) {
                last if $status < 0;
                next;
            }
            my $destfile = "$destdir/$name";
            if ($destfile =~ m/\/\/$/) # skip if no filename is given
            {
                next;
            }
            $destfile =~ s|\/\/|\/|g; # remove unnecessary double slashes
            my $fh = openFileHandle($destfile, '>', 1);
            binmode($fh);
            while (($status = $u->read($buff)) > 0) {
                $fh->write($buff);
            }
            $fh->close();
            unless (defined $silent)
            {
                &syslog("Uncompress $destfile -> $targetPath");
            }
            # set timestamps of file to the ones in the zip
            my $stored_time = $header->{'Time'};
            utime($stored_time, $stored_time, $destfile);
        }
        if ($status < 0)
        {
            die "Error processing $fileToExtract: $!\n";
        }
    }
    else
    {
        my $ae = Archive::Extract->new( archive => $fileToExtract );
        $ae->extract( to => $targetPath )
            or &error("Failed to extract $fileToExtract with error " . $ae->error);
        unless (defined $silent)
        {
            foreach my $file (@{$ae->files})
            {
                # only print if not a directory
                if ($file !~ m|/$|)
                {
                    &syslog("Uncompress $fileToExtract -> $targetPath");
                }
            }
        }
    }
    return;
}

You could simply do it as shown below using Archive::Extract; it provides a generic archive-extraction mechanism, so you don't have to handle tar and zip with separate modules.
use Archive::Extract;
my $ae = Archive::Extract->new( archive => $fileToExtract );
my $ok = $ae->extract( to => $targetPath );
If you specifically want to check whether a file is a tar or a zip, you can use:
$ae->is_tar
$ae->is_zip
Note that Archive::Extract is a core module, so you won't have to install it separately.
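Putting that together, a minimal sketch of a replacement uncompress routine built only on Archive::Extract could look like this (the error and syslog helpers are the ones from the question, kept here as assumptions):

use Archive::Extract;

# Sketch: extract any supported archive type into $targetPath.
sub uncompress
{
    my ($fileToExtract, $targetPath, $silent) = @_;

    my $ae = Archive::Extract->new( archive => $fileToExtract );
    $ae->extract( to => $targetPath )
        or &error("Failed to extract $fileToExtract: " . $ae->error);

    &syslog("Uncompress $fileToExtract -> $targetPath") unless defined $silent;
    return;
}

Whether this is faster on Windows than the IO::Uncompress::Unzip loop is something you'd have to measure; if speed is the concern, Archive::Extract can also be told to prefer external binaries by setting $Archive::Extract::PREFER_BIN to a true value.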

Related

Perl6: What is the best way to deal with very big files?

Last week I decided to give Perl6 a try and started to reimplement one of my programs.
I have to say, Perl6 makes object-oriented programming very easy, an aspect that was quite painful for me in Perl5.
My program has to read and store big files, such as whole genomes (up to 3 GB and more, see Example 1 below) or tabular data.
The first version of the code was written the Perl5 way, iterating line by line ("genome.fa".IO.lines). It was very slow, with an unusable execution time.
my class fasta {
    has Str $.file is required;
    has %!seq;

    submethod TWEAK() {
        my $id;
        my $s;
        for $!file.IO.lines -> $line {
            if $line ~~ /^\>/ {
                say $id;
                if $id.defined {
                    %!seq{$id} = sequence.new(id => $id, seq => $s);
                }
                my $l = $line;
                $l ~~ s:g/^\>//;
                $id = $l;
                $s = "";
            }
            else {
                $s ~= $line;
            }
        }
        %!seq{$id} = sequence.new(id => $id, seq => $s);
    }
}

sub MAIN()
{
    my $f = fasta.new(file => "genome.fa");
}
So after a little bit of RTFM, I switched to slurping the file, splitting on \n, and parsing the lines with a for loop. This way I managed to load the data in 2 minutes. Much better, but not enough. By cheating, I mean by removing as many \n as possible (Example 2), I got the execution time down to 30 seconds. Quite good, but not totally satisfying, since that fasta format is not the most commonly used.
my class fasta {
    has Str $.file is required;
    has %!seq;

    submethod TWEAK() {
        my $id;
        my $s;
        say "Slurping ...";
        my $f = $!file.IO.slurp;
        say "Splitting file ...";
        my @lines = $f.split(/\n/);
        say "Parsing lines ...";
        for @lines -> $line {
            if $line !~~ /^\>/ {
                $s ~= $line;
            }
            else {
                say $id;
                if $id.defined {
                    %!seq{$id} = seq.new(id => $id, seq => $s);
                }
                $id = $line;
                $id ~~ s:g/^\>//;
                $s = "";
            }
        }
        %!seq{$id} = seq.new(id => $id, seq => $s);
    }
}

sub MAIN()
{
    my $f = fasta.new(file => "genome.fa");
}
So I RTFM'd again and discovered the magic of grammars. The new version runs in 45 seconds whatever the fasta format used. Not the fastest way, but more elegant and stable.
my grammar fastaGrammar {
    token TOP { <fasta>+ }

    token fasta  { <.ws><header><seq> }
    token header { <sup><id>\n }
    token sup    { '>' }
    token id     { <[\d\w]>+ }
    token seq    { [<[ACGTNacgtn]>+\n]+ }
}

my class fastaActions {
    method TOP ($/) {
        my @seqArray;
        for $<fasta> -> $f {
            @seqArray.push: seq.new(id => $f.<header><id>.made, seq => $f<seq>.made);
        }
        make @seqArray;
    }
    method fasta ($/) { make ~$/; }
    method id    ($/) { make ~$/; }
    method seq   ($/) { make $/.subst("\n", "", :g); }
}

my class fasta {
    has Str $.file is required;
    has %!seq;

    submethod TWEAK() {
        say "=> Slurping ...";
        my $f = $!file.IO.slurp;
        say "=> Grammaring ...";
        my @seqArray = fastaGrammar.parse($f, actions => fastaActions).made;
        say "=> Storing data ...";
        for @seqArray -> $s {
            %!seq{$s.id} = $s;
        }
    }
}

sub MAIN()
{
    my $f = fasta.new(file => "genome.fa");
}
I think I found a good solution to handle this kind of big file, but the performance is still below that of Perl5.
As a newbie in Perl6, I would be interested to know if there are better ways to deal with big data, or if there are limitations due to the Perl6 implementation, so I would ask two questions:
Are there other Perl6 mechanisms, not yet known to me or not yet documented, for storing huge data from a file (like my genomes)?
Did I reach the maximum performance for the current version of Perl6?
Thanks for reading!
Fasta Example 1:
>2L
CGACAATGCACGACAGAGGAAGCAGAACAGATATTTAGATTGCCTCTCATTTTCTCTCCCATATTATAGGGAGAAATATG
ATCGCGTATGCGAGAGTAGTGCCAACATATTGTGCTCTTTGATTTTTTGGCAACCCAAAATGGTGGCGGATGAACGAGAT
...
>3R
CGACAATGCACGACAGAGGAAGCAGAACAGATATTTAGATTGCCTCTCATTTTCTCTCCCATATTATAGGGAGAAATATG
ATCGCGTATGCGAGAGTAGTGCCAACATATTGTGCTCTTTGATTTTTTGGCAACCCAAAATGGTGGCGGATGAACGAGAT
...
Fasta Example 2:
>2L
GACAATGCACGACAGAGGAAGCAGAACAGATATTTAGATTGCCTCTCAT...
>3R
TAGGGAGAAATATGATCGCGTATGCGAGAGTAGTGCCAACATATTGTGCT...
EDIT
I applied the advice of @Christoph and @timotimo and tested with this code:
my class fasta {
    has Str $.file is required;
    has %!seq;

    submethod TWEAK() {
        say "=> Slurping / Parsing / Storing ...";
        %!seq = slurp($!file, :enc<latin1>).split('>').skip(1).map: {
            .head => seq.new(id => .head, seq => .skip(1).join) given .split("\n").cache;
        }
    }
}

sub MAIN()
{
    my $f = fasta.new(file => "genome.fa");
}
The program finished in 2.7 s, which is great!
I also tried this code on the wheat genome (10 GB). It finished in 35.2 s.
Perl6 is not so slow after all!
Big thanks for the help!
One simple improvement is to use a fixed-width encoding such as latin1 to speed up character decoding, though I'm not sure how much this will help.
As far as Rakudo's regex/grammar engine is concerned, I've found it to be pretty slow, so it might indeed be necessary to take a more low-level approach.
I did not do any benchmarking, but what I'd try first is something like this:
my %seqs = slurp('genome.fa', :enc<latin1>).split('>')[1..*].map: {
    .[0] => .[1..*].join given .split("\n");
}
As the Perl6 standard library is implemented in Perl6 itself, it is sometimes possible to improve performance by just avoiding it, writing code in an imperative style such as this:
my %seqs;
my $data = slurp('genome.fa', :enc<latin1>);
my $pos = 0;
loop {
    $pos = $data.index('>', $pos) // last;
    my $ks = $pos + 1;
    my $ke = $data.index("\n", $ks);
    my $ss = $ke + 1;
    my $se = $data.index('>', $ss) // $data.chars;
    my @lines;
    $pos = $ss;
    while $pos < $se {
        my $end = $data.index("\n", $pos);
        @lines.push($data.substr($pos..^$end));
        $pos = $end + 1
    }
    %seqs{$data.substr($ks..^$ke)} = @lines.join;
}
However, if the parts of the standard library used have seen some performance work, this might actually make things worse. In that case, the next step would be adding low-level type annotations such as str and int and replacing calls to routines such as .index with NQP builtins such as nqp::index.
If that's still too slow, you're out of luck and will need to switch languages, eg calling into Perl5 by using Inline::Perl5 or C using NativeCall.
Note that @timotimo has done some performance measurements and wrote an article about it.
If my short version is the baseline, the imperative version improves performance by 2.4x.
He actually managed to squeeze a 3x improvement out of the short version by rewriting it to
my %seqs = slurp('genome.fa', :enc<latin-1>).split('>').skip(1).map: {
    .head => .skip(1).join given .split("\n").cache;
}
Finally, rewriting the imperative version using NQP builtins sped things up by a factor of 17x. Given potential portability issues, writing such code is generally discouraged, but it may be necessary for now if you really need that level of performance:
use nqp;

my Mu $seqs := nqp::hash();
my str $data = slurp('genome.fa', :enc<latin1>);
my int $pos = 0;
my str @lines;

loop {
    $pos = nqp::index($data, '>', $pos);
    last if $pos < 0;
    my int $ks = $pos + 1;
    my int $ke = nqp::index($data, "\n", $ks);
    my int $ss = $ke + 1;
    my int $se = nqp::index($data, '>', $ss);
    if $se < 0 {
        $se = nqp::chars($data);
    }
    $pos = $ss;
    my int $end;
    while $pos < $se {
        $end = nqp::index($data, "\n", $pos);
        nqp::push_s(@lines, nqp::substr($data, $pos, $end - $pos));
        $pos = $end + 1
    }
    nqp::bindkey($seqs, nqp::substr($data, $ks, $ke - $ks), nqp::join("", @lines));
    nqp::setelems(@lines, 0);
}

Check if paths from file are physically located in hard drive

Hello, I have a file named files.txt; in that file there are paths to files, for example:
/home/ojom/123.jpg
/home/ojom/oaksdokwijeqijwqe.jpg
There are a million of those paths in this file. I need to check whether the files listed there physically exist on my hard drive, and write the paths of the ones that don't exist to another file. How do I do that? What can I use?
You could parse that file using PHP and then go through the results and check them with file_exists.
The example below works if every file path is on a new line.
<?php
$files = array();
$handle = fopen("files.txt", "r");
if ($handle) {
    while (($line = fgets($handle)) !== false) {
        $line = rtrim($line); // strip the trailing newline before checking
        if (!file_exists($line)) {
            continue; // file does not exist, skip
        } else {
            $files[] = $line;
        }
    }
} else {
    die('Error opening the file');
}
fclose($handle);
echo "These files exist:";
echo "<pre>" . print_r($files, true) . "</pre>"; // prints them as an array
You could also use the array for further processing.
Here is the Python solution:
import os.path

files = 'c:\\test\\files.txt'
output = 'c:\\test\\filesNotExist.txt'

with open(files) as src, open(output, 'w') as out:
    for path in src:
        path = path.rstrip('\n')       # strip the trailing newline
        if not os.path.isfile(path):
            out.write(path + '\n')     # collect paths that do not exist
This script scans your text file and writes the list of non-existent files to the output text file.
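Since the question is tagged Perl, here is a minimal Perl sketch of the same idea (it assumes one path per line in files.txt; the output name missing.txt is just an example):

use strict;
use warnings;

# Copy every path that does not exist on disk to a second file.
open my $in,  '<', 'files.txt'   or die "Cannot open files.txt: $!";
open my $out, '>', 'missing.txt' or die "Cannot open missing.txt: $!";

while (my $path = <$in>) {
    chomp $path;                          # strip the trailing newline
    print {$out} "$path\n" unless -e $path;
}

close $in;
close $out;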

How to handle an error thrown by a module in Perl

I am using the DBD::Oracle module in Perl to insert XML content into an Oracle 11g instance. While inserting some of the documents in my sample set, the script fails because the module returns "Unsupported named object type for bind parameter". I would like to handle this error and let the loop continue with the next iteration.
Following is my code:
use strict;
use warnings;
use DBI;
use DBD::Oracle qw(:ora_session_modes);
use DBD::Oracle qw(:ora_types);

die("USAGE: $0 <input_directory>") unless ($#ARGV == 0);

my $directory = $ARGV[0];
my $dbh = DBI->connect('dbi:Oracle:dbname', "username", "pass");
my $SQL;

opendir(IMD, $directory) || die("Cannot open directory");
my @listOfFiles = readdir(IMD);
closedir(IMD);

my $xmltype_string;
my $xml;
my $i = 1;

foreach my $file (@listOfFiles)
{
    unless ($file eq '.' or $file eq '..')
    {
        print "inserting File no. $i \t $file .... \n";
        {
            local $/ = undef;
            open(FILE, "<", "$directory/$file");
            $xml = <FILE>;
            close(FILE);
        }
        $SQL = "insert into sampleTable values ( :ind, :xml)";
        my $sth = $dbh->prepare($SQL);
        $sth->bind_param(":xml", $xml, { ora_type => ORA_XMLTYPE });
        $sth->bind_param(":ind", $i);
        $sth->execute();
        $i++;
    }
}
I am getting the error in bind_param.
Error handling is usually done via the Try::Tiny module:
use Try::Tiny;

try {
    something_that_could_die();
}
catch {
    handle_error($_);
}
finally {
    do_something_either_way();
}; # ← trailing semicolon not optional.
Both catch and finally are optional.
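Applied to the loop in the question, a sketch could look like the following. It reuses the question's variables ($dbh, $xml, $i, @listOfFiles) and assumes the handle was created with RaiseError => 1, so that DBI errors actually die and can be caught:

use Try::Tiny;

# Assumes: my $dbh = DBI->connect('dbi:Oracle:dbname', "username", "pass",
#                                 { RaiseError => 1, AutoCommit => 1 });
foreach my $file (@listOfFiles)
{
    next if $file eq '.' or $file eq '..';

    # ... read $xml from "$directory/$file" exactly as in the original code ...

    try {
        my $sth = $dbh->prepare("insert into sampleTable values ( :ind, :xml)");
        $sth->bind_param(":xml", $xml, { ora_type => ORA_XMLTYPE });
        $sth->bind_param(":ind", $i);
        $sth->execute();
    }
    catch {
        warn "Skipping $file: $_";   # log the failure, then continue with the next file
    };

    $i++;
}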

How to read the content of a doc file using PHP or CodeIgniter

private function read_doc($filename) {
    $fileHandle = fopen($filename, "r");
    var_dump($filename);
    $line = fread($fileHandle, filesize($filename));
    $lines = explode(chr(0x0D), $line);
    $outtext = "";
    foreach ($lines as $thisline) {
        $pos = strpos($thisline, chr(0x00));
        if (($pos !== FALSE) || (strlen($thisline) == 0)) {
        } else {
            $outtext .= $thisline . " ";
        }
    }
    $outtext = preg_replace("/[^a-zA-Z0-9\s\,\.\-\n\r\t#\/\_\(\)]/", "", $outtext);
    return $outtext;
}
I am trying to read the content of a .doc file using the above code, but when I run it, it gives me:
filesize(): stat failed for http://localhost/jobportal/public/uploads/document/1.doc and
fread(): Length parameter must be greater than 0
As you can see in the PHP manual for filesize (http://php.net/manual/en/function.filesize.php):
As of PHP 5.0.0, this function can also be used with some URL wrappers. Refer to Supported Protocols and Wrappers to determine which wrappers support stat() family of functionality.
and if you go to this link http://www.php.net/manual/en/wrappers.http.php , you will see that "Supports stat()" is "No". So you can't use http URLs with the filesize function. If this is a local file, just use an absolute or relative path like '/path/to/my/file'; if it is a remote file, then (I'm not sure, but) I think it should be downloaded first with curl, using curl_getinfo to read the Content-Length HTTP property.
The filesize(): stat failed for http://localhost/jobportal/public/uploads/document/1.doc means you are trying to get the filesize of a URL, which isn't applicable.
You need to read from the stream in chunks until the end and check how much data has actually been received.
if ($ext == 'doc') {
    if (file_exists($filename)) {
        if (($fh = fopen($filename, 'r')) !== false) {
            $headers = fread($fh, 0xA00);

            $n1 = ( ord($headers[0x21C]) - 1 );
            $n2 = ( ( ord($headers[0x21D]) - 8 ) * 256 );
            $n3 = ( ( ord($headers[0x21E]) * 256 ) * 256 );
            $n4 = ( ( ( ord($headers[0x21F]) * 256 ) * 256 ) * 256 );
            $textLength = ($n1 + $n2 + $n3 + $n4);

            $content_of_file1 = fread($fh, $textLength);
            $content_of_file  = strtolower($content_of_file1);
        }
    }
}

Perl select returning undef on sysread when using Windows, IPC::Open3, and IO::Socket->socketpair()

I found this example (posted by @ikegami) of a way to use IPC::Open3 on Windows using sockets. The problem is that when I run it, I get the error "An existing connection was forcibly closed by the remote host" on the sysread. The command runs and the select works correctly, but the sysread gets undef instead of the expected 0 for end of file. This behavior is not the same for all commands: if I change the command to echo Hello World! it does not cause the error. Any idea what is going on here?
Here is the code from the example:
use strict;
use warnings;

use IO::Select qw( );
use IPC::Open3 qw( open3 );
use Socket qw( AF_UNIX SOCK_STREAM PF_UNSPEC );

print( "REMOVE ME: getting started\n" );

sub _pipe {
    socketpair($_[0], $_[1], AF_UNIX, SOCK_STREAM, PF_UNSPEC)
        or return undef;
    shutdown($_[0], 1);  # No more writing for reader
    shutdown($_[1], 0);  # No more reading for writer
    return 1;
}

sub _open3 {
    local (*TO_CHLD_R,     *TO_CHLD_W);
    local (*FR_CHLD_R,     *FR_CHLD_W);
    local (*FR_CHLD_ERR_R, *FR_CHLD_ERR_W);

    if ($^O =~ /Win32/) {
        _pipe(*TO_CHLD_R,     *TO_CHLD_W    ) or die $^E;
        _pipe(*FR_CHLD_R,     *FR_CHLD_W    ) or die $^E;
        _pipe(*FR_CHLD_ERR_R, *FR_CHLD_ERR_W) or die $^E;
    } else {
        pipe(*TO_CHLD_R,     *TO_CHLD_W    ) or die $!;
        pipe(*FR_CHLD_R,     *FR_CHLD_W    ) or die $!;
        pipe(*FR_CHLD_ERR_R, *FR_CHLD_ERR_W) or die $!;
    }

    my $pid = open3('>&TO_CHLD_R', '<&FR_CHLD_W', '<&FR_CHLD_ERR_W', @_);

    return ( $pid, *TO_CHLD_W, *FR_CHLD_R, *FR_CHLD_ERR_R );
}

# when i change the command to 'echo Hello World' it works...
my ($pid, $to_chld, $fr_chld, $fr_chld_err) =
    _open3('cmd /c "dir /s/b"');

my %objs;

my $in_sel  = IO::Select->new();
my $out_sel = IO::Select->new();

for my $fh ($fr_chld, $fr_chld_err) {
    my $obj = {
        buf => '',
    };
    $objs{ fileno($fh) } = $obj;
    $in_sel->add($fh);
}

close($to_chld);

while ($in_sel->count() + $out_sel->count()) {
    my ($ins, $outs) = IO::Select::select($in_sel, $out_sel, undef);

    for my $fh (@$ins) {
        my $obj = $objs{ fileno($fh) };
        our $buf; local *buf = \( $obj->{buf} );
        my $bytes_read = sysread($fh, $buf, 64*1024, length($buf));
        if (!$bytes_read) {
            warn("Error reading from child: $!\n")
                if !defined($bytes_read);
            $in_sel->remove($fh);
        }
    }

    for my $fh (@$outs) {
    }
}

waitpid($pid, 0);

print("STDOUT:\n$objs{ fileno( $fr_chld ) }{buf}");
print("\n");
print("STDERR:\n$objs{ fileno( $fr_chld_err ) }{buf}");
I think it's because something like shutdown was used instead of something like close. Sounds safe to ignore.
grep $!{$_}, keys %! shows ECONNRESET, so just change
warn("Error reading from child: $!\n")
if !defined($bytes_read);
to
warn("Error reading from child: $!\n")
if !defined($bytes_read) && !$!{ECONNRESET};
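For clarity, this is the read branch from the example with that change applied; nothing else differs:

for my $fh (@$ins) {
    my $obj = $objs{ fileno($fh) };
    our $buf; local *buf = \( $obj->{buf} );
    my $bytes_read = sysread($fh, $buf, 64*1024, length($buf));
    if (!$bytes_read) {
        # Treat ECONNRESET like a normal EOF; socketpair-based "pipes" on
        # Windows can report it when the child closes its end.
        warn("Error reading from child: $!\n")
            if !defined($bytes_read) && !$!{ECONNRESET};
        $in_sel->remove($fh);
    }
}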
