#!/usr/bin/perl
#
# Interactive CSV / text post-processing tool.
#
# Usage: <script> [INPUT_FILE [OUTPUT_FILE]]
#
# Any file name not given on the command line is asked for on STDIN.  The
# user then picks one of five programs from a menu:
#
#   1  csvextract      copy selected columns of each CSV row to the output
#   2  csvbin          split rows into equal-width bins on a numeric column
#   3  csvstrip        drop rows whose validation column matches a regex
#   4  csvclean        blank out fields matching a regex
#   5  compoundselect  randomly sample regex captures from periodic lines
#
# Rows are split on a comma followed by any number of tabs; quoted fields
# containing commas are NOT supported.

use strict;
use warnings;

use Scalar::Util qw(looks_like_number openhandle);

# Paths of the files being processed.
our $inputfile;
our $outputfile;

# Handles on $inputfile / $outputfile shared by the processing subs.
my $input_fh;
my $output_fh;

# Menu number => processing sub.
my %program_for = (
    1 => \&csvextract,
    2 => \&csvbin,
    3 => \&csvstrip,
    4 => \&csvclean,
    5 => \&compoundselect,
);

# Take the input file name from the command line when given, otherwise ask.
if (@ARGV >= 1) {
    $inputfile = $ARGV[0];
    print "Input filename received from command line. Opening '$inputfile'...\n";
}
else {
    $inputfile = _prompt("Please enter the path to your input file: ");
}
open $input_fh, '<', $inputfile
    or die "An error occured whilst opening your input file '$inputfile': $!";

# Same for the output file name.
if (@ARGV >= 2) {
    $outputfile = $ARGV[1];
    print "Output filename received from command line. Opening '$outputfile'...\n";
}
else {
    $outputfile = _prompt("Please enter the path to your output file: ");
}
open $output_fh, '>', $outputfile
    or die "An error occured whilst creating your output file '$outputfile': $!";

print "INPUT: $inputfile\n";
print "OUTPUT: $outputfile\n";

# Show the menu until a valid program number has been entered.  The menu is
# shown exactly once per pass, and any of the five programs ends the loop
# (each program consumes and closes the files, so it can only run once).
my $choice;
do {
    $choice = chooseprogram();
} until ($choice =~ /\A\s*[1-5]\s*\z/);

# Print $message (without adding a newline), read one line from STDIN and
# return it without its line terminator.  Dies on end-of-input so that a
# closed STDIN can never cause an endless re-prompt loop.
sub _prompt {
    my ($message) = @_;

    local $| = 1;    # show the prompt even when STDOUT is not a terminal
    print $message;

    my $answer = <STDIN>;
    die "Unexpected end of input on STDIN\n" unless defined $answer;
    chomp $answer;
    return $answer;
}

# Ask (repeatedly, if necessary) for a whole number and return it.  When
# $min is given, values below it are rejected as well.
sub _prompt_integer {
    my ($message, $min) = @_;

    my $hint = defined $min
        ? "Please enter a whole number of at least $min.\n"
        : "Please enter a whole number.\n";

    while (1) {
        my $answer = _prompt($message);
        if ($answer =~ /\A\s*([+-]?\d+)\s*\z/) {
            my $value = $1 + 0;
            return $value if !defined $min || $value >= $min;
        }
        print $hint;
    }
}

# Ask (repeatedly, if necessary) for any number and return it.
sub _prompt_number {
    my ($message) = @_;

    while (1) {
        my $answer = _prompt($message);
        $answer =~ s/\A\s+|\s+\z//g;
        return $answer + 0 if looks_like_number($answer);
        print "Please enter a number.\n";
    }
}

# Compile a user-supplied regex, dying with a readable message when it is
# not valid.
sub _compile_regex {
    my ($source) = @_;

    my $pattern = eval { qr/$source/ };
    die "Invalid regular expression '$source': $@" unless defined $pattern;
    return $pattern;
}

# Close the shared input and output handles if they are still open.  The
# output close is checked because buffered write errors only surface there.
sub _close_files {
    if (openhandle($input_fh)) {
        close $input_fh
            or warn "Could not close input file '$inputfile': $!\n";
    }
    if (openhandle($output_fh)) {
        close $output_fh
            or die "Could not close output file '$outputfile': $!\n";
    }
    return;
}

# Show the program menu, read the user's choice and run the matching
# program.  Returns the choice exactly as typed (minus the newline); the
# caller decides whether it was valid.  A failure inside the chosen program
# is reported on STDERR instead of being silently discarded.
sub chooseprogram {
    print "Please enter 1 to parse a csv file and extract a certain field from each line...\n";
    print "Enter 2 to parse a csv file and bin the data according to a specific column...\n";
    print "Enter 3 to parse a csv file and remove a series of invalid data rows...\n";
    print "Enter 4 to parse a csv file and replace invalid characters with blanks...\n";
    my $programchoice = _prompt(
        "Or enter 5 to select a string (using regex) from a repeating sequence of lines in a text file: "
    );

    if ($programchoice =~ /\A\s*([1-5])\s*\z/) {
        my $program = $program_for{$1};
        my $ok = eval { $program->(); 1 };
        warn "The selected program failed: $@" unless $ok;
    }

    return $programchoice;
}

# Program 1: for every row from the chosen first line onwards, write the
# selected columns to the output file, each followed by a single space.
# Double quotes are removed from the row before it is split.  A column that
# does not exist in a row is written as an empty string.
sub csvextract {
    print "CSVextract\n";

    # Collect column numbers until the user answers anything but 'y'.
    my @requiredcolumn;
    my $finished = 'y';
    while ($finished =~ /^[y]$/) {
        push @requiredcolumn, _prompt_integer(
            "Please enter the number of the column that you wish to select (first column is zero): "
        );
        $finished = _prompt("Would you like to select another column? Please enter 'y' if yes: ");
    }

    my $firstline = _prompt_integer(
        "Please enter the line number of the first line to be parsed (line number of the first line in the file is zero): "
    );

    my $lineindex = 0;
    while (defined(my $line = <$input_fh>)) {
        if ($lineindex >= $firstline) {
            $line =~ s/["]+//g;    # remove quotation marks (e.g. around SMILES strings)
            $line =~ s/\n+//g;     # remove new line characters
            my @fields = split /,\t*/, $line;
            for my $column (@requiredcolumn) {
                my $field = defined $fields[$column] ? $fields[$column] : '';
                print {$output_fh} "$field ";
            }
            print {$output_fh} "\n";
        }
        $lineindex++;
    }

    _close_files();
    return;
}

# Program 5: collect the regex capture ($1) from every "compound line" --
# the chosen first line and then every (increment + 1)-th line after it --
# and write a random sample of them, without replacement, as
# "<compound> <sample index>" lines.  Compound lines that do not match the
# regex (or capture nothing) are ignored.  If fewer compounds are available
# than requested, all of them are written.
sub compoundselect {
    print "StringSelect\n";

    my $selectcmpds = _prompt_integer("Please enter the number of compounds to select: ", 0);
    my $firstline   = _prompt_integer(
        "Please enter the line number of the first compound line (first line is zero): ", 0
    );
    my $increment = _prompt_integer("Please enter the number of lines between each compound line: ", 0);
    # e.g. <(.+)>
    my $regex = _compile_regex(
        _prompt("Please enter the regex for selection (compound will be taken from \$1): ")
    );

    # Distance between consecutive compound lines, counting the compound
    # line itself.
    my $period = $increment + 1;

    my @compounds;
    my $lineindex = 0;
    while (defined(my $line = <$input_fh>)) {
        my $distance = $lineindex - $firstline;
        $lineindex++;
        next if $distance < 0 || $distance % $period != 0;
        push @compounds, $1 if $line =~ $regex && defined $1;
    }

    my $available = @compounds;
    if ($selectcmpds > $available) {
        warn "Only $available compound(s) available; selecting all of them\n";
        $selectcmpds = $available;
    }

    for my $i (0 .. $selectcmpds - 1) {
        # Removing the chosen compound guarantees it cannot be picked twice.
        my $rnd = int(rand(scalar @compounds));
        my ($compound) = splice(@compounds, $rnd, 1);
        print {$output_fh} "$compound $i\n";
    }

    _close_files();
    return;
}

# Program 3: copy every row from the chosen first line onwards whose
# validation column does NOT match the regex; matching rows are dropped.
# A row without that column is validated as if the field were empty.
sub csvstrip {
    print "CSVstrip\n";

    my $firstline = _prompt_integer(
        "Please enter the line number of the first compound line (first line is zero): "
    );
    my $valcolumn = _prompt_integer(
        "Please enter the column to use for validation (first column is zero): "
    );
    # e.g. ^"[\.]"$
    my $regex = _compile_regex(_prompt("Please enter the regex for validation: "));

    my $lineindex = 0;
    while (defined(my $line = <$input_fh>)) {
        if ($lineindex >= $firstline) {
            $line =~ s/\n+//g;    # remove new line characters
            my @fields = split /,\t*/, $line;
            my $field = defined $fields[$valcolumn] ? $fields[$valcolumn] : '';
            if ($field !~ $regex) {
                print "field: $field...printing \$lineindex: $lineindex\n";
                print {$output_fh} "$line\n";
            }
        }
        $lineindex++;
    }

    _close_files();
    return;
}

# Program 4: rewrite every row from the chosen first line onwards with the
# text matched by the regex removed from each field.  If the regex is left
# empty, fields that are exactly "." or "?" (including the double quotes)
# are blanked, which is what this program did before it honoured the
# prompt.  The number of columns in each row is preserved.
sub csvclean {
    print "CSVclean\n";

    my $firstline = _prompt_integer(
        "Please enter the line number of the first compound line (first line is zero): "
    );
    # e.g. ^"[\.]"$
    my $source = _prompt("Please enter the regex matching the characters to be removed: ");
    my @patterns = length $source
        ? (_compile_regex($source))
        : (qr/^"[\.]"$/, qr/^"[?]"$/);

    my $lineindex = 0;
    while (defined(my $line = <$input_fh>)) {
        if ($lineindex >= $firstline) {
            $line =~ s/\n+//g;    # remove new line characters
            # A limit of -1 keeps trailing empty columns.
            my @fields = split /,\t*/, $line, -1;
            for my $field (@fields) {
                for my $pattern (@patterns) {
                    $field =~ s/$pattern//g;
                }
            }
            print {$output_fh} join(',', @fields), "\n";
        }
        $lineindex++;
    }

    _close_files();
    return;
}

# Program 2: distribute the rows from the chosen first line onwards over
# ($bins + 1) files named bin<i>_output.txt.  Bin i (for i < $bins) holds
# rows whose value v in the bin column satisfies
# i * binsize <= v < (i + 1) * binsize, where binsize = binmax / bins; the
# last bin holds every row with v >= binmax.  Rows whose bin value is
# missing or not numeric (such as the "." placeholder) are skipped.  A
# "<range>,<count>" summary line per bin is written to the output file.
sub csvbin {
    print "CSVBin\n";

    my $firstline = _prompt_integer(
        "Please enter the line number of the first compound line (first line is zero): "
    );
    # At least one bin is required, otherwise the bin width is undefined.
    my $bins      = _prompt_integer("Please enter the number of bins required: ", 1);
    my $binmax    = _prompt_number("Please enter the maximum bin value: ");
    my $bincolumn = _prompt_integer(
        "Please enter the column to use for binning (first column is zero): "
    );

    # The input is re-read once per bin and the summary file is re-created
    # at the end, so the shared handles are not needed here.
    _close_files();

    my $binsize = $binmax / $bins;
    my @bin;

    for my $i (0 .. $bins) {
        $bin[$i] = 0;

        my $binfile = "bin${i}_output.txt";
        open my $bin_fh, '>', $binfile
            or die "An error occured whilst creating your output file '$binfile': $!";
        open my $in_fh, '<', $inputfile
            or die "An error occured whilst opening your input file '$inputfile': $!";

        my $lower = $i * $binsize;
        my $upper = ($i + 1) * $binsize;

        my $lineindex = 0;
        while (defined(my $line = <$in_fh>)) {
            next if $lineindex++ < $firstline;

            my @fields = split /,\t*/, $line;
            my $value = $fields[$bincolumn];
            next unless defined $value;
            $value =~ s/\A\s+|\s+\z//g;
            next unless looks_like_number($value);

            # The last bin is open-ended; every other bin is half-open.
            my $in_bin = $i == $bins
                ? $value >= $lower
                : ($value >= $lower && $value < $upper);
            next unless $in_bin;

            $bin[$i]++;
            $line =~ s/\n+//g;    # remove new line characters
            print {$bin_fh} "$line\n";
        }

        close $in_fh
            or warn "Could not close input file '$inputfile': $!\n";
        close $bin_fh
            or die "Could not close output file '$binfile': $!\n";
    }

    open my $summary_fh, '>', $outputfile
        or die "An error occured whilst creating your output file '$outputfile': $!";
    for my $i (0 .. $bins) {
        my $binminval = $i * $binsize;
        my $binmaxval = ($i + 1) * $binsize;
        if ($i == $bins) {
            print {$summary_fh} "${binminval}+,$bin[$i]\n";
        }
        else {
            print {$summary_fh} "${binminval}-${binmaxval},$bin[$i]\n";
        }
    }
    close $summary_fh
        or die "Could not close output file '$outputfile': $!\n";

    return;
}