#!/usr/bin/perl

# Extract sheets from xlsx files as tab seperated text files.

# Currently some utf-8 characters are translated as well as a few html entities,

# Based on a dump example that came with Spreadsheet::ParseExcel.

use utf8;
use Encode;
use locale ':not_characters';
use open ':locale';
use POSIX qw(locale_h);
setlocale(LC_ALL, "en_US.utf8");

use Spreadsheet::XLSX;
foreach $file (@ARGV) {
  print STDERR "Reading $file.\n";
  $book = new Spreadsheet::XLSX($file);
  if (!$book) {
    print STDERR "Unable to parse $file.\n";
    next;
  }
  
  foreach $sheet (@{$book->{Worksheet}}) {
    $name = $sheet->{Name};
    $name = lc($name);
    $name =~ s/[^A-Za-z0-9]//g;
    $bname = $name;
    $count = 0;
    while (-e "new/$name.tsv") {
      $count++;
      $name = "UNIQUE$count-$bname";
    }
    open(FILE, ">new/$name.tsv") || die "Unable to open $name.\n";
    print STDERR "Writing $bname to new/$name.tsv.\n";

    print FILE "\n" x $sheet->{MinRow};
    for($row = $sheet->{MinRow} ; 
      defined $sheet->{MaxRow} && $row <= $sheet->{MaxRow} ; $row++) {
      $new = 1;
      print FILE "\t" x $sheet->{MinCol};
      for($col = $sheet->{MinCol} ;
        defined $sheet->{MaxCol} && $col <= $sheet->{MaxCol} ; $col++) {
        $cell = $sheet->{Cells}[$row][$col];
        if ($new) {
          $new = 0;
        }
        else {
          print FILE "\t";
        }
        if ($cell) {
          $val = decode('utf8',$cell->value);
          $val =~ s/’/'/g;
          $val =~ s/‘/'/g;
          $val =~ s/&lt;/</g;
          $val =~ s/&gt;/>/g;
          $val =~ s/&quot;/"/g;
          $val =~ s/&apos;/'/g;
          $val =~ s/&amp;/\&/g;
          print FILE $val;
        }
      }
      print FILE "\n";
    }
    close(FILE);
  }
}
