#!/usr/bin/perl

# Extract sheets from xls files as tab seperated text files.

# Currently some utf-8 characters are translated as well as a few html entities.

# Based on a dump example that came with Spreadsheet::ParseExcel.

use utf8;
use locale ':not_characters';
use open ':locale';
use POSIX qw(locale_h);
setlocale(LC_ALL, "en_US.utf8");

use Spreadsheet::ParseExcel;
foreach $file (@ARGV) {
  print STDERR "Reading $file.\n";
  $ex = new Spreadsheet::ParseExcel;
  $book = $ex->Parse($file);
  if (!$book) {
    print STDERR "Unable to parse $file.\n";
    next;
  }
  
  foreach $sheet (@{$book->{Worksheet}}) {
    $name = $sheet->{Name};
    $name = lc($name);
    $name =~ s/[^abcdefghijklmnopqrstuvwxyz[:digit:]]//g;
    $bname = $name;
    $count = 0;
    while (-e "new/$name.tsv") {
      $count++;
      $name = "UNIQUE$count-$bname";
    }
    open(FILE, ">new/$name.tsv") || die "Unable to open $name.\n";
    print STDERR "Writing $bname to new/$name.tsv.\n";

    print FILE "\n" x $sheet->{MinRow};
    for($row = $sheet->{MinRow} ; 
      defined $sheet->{MaxRow} && $row <= $sheet->{MaxRow} ; $row++) {
      $new = 1;
      print FILE "\t" x $sheet->{MinCol};
      for($col = $sheet->{MinCol} ;
        defined $sheet->{MaxCol} && $col <= $sheet->{MaxCol} ; $col++) {
        $cell = $sheet->{Cells}[$row][$col];
        if ($new) {
          $new = 0;
        }
        else {
          print FILE "\t";
        }
        if ($cell) {
          $val = $cell->value;
          $val =~ s/’|‘/'/g;
          $val =~ s/&lt;/</g;
          $val =~ s/&gt;/>/g;
          $val =~ s/&amp;/\&/g;
          utf8::decode($val);
          print FILE $val;
        }
      }
      print FILE "\n";
    }
    close(FILE);
  }
}
