#!/usr/local/bin/perl
# glossing.pl by Matt Rosin (see bottom of this file)
# Loads a glossary and replaces Japanese words in the document with English words from list.
# glossary format: Japanesetext TAB Englishtext\n
# Japanese document is EUC text.

use strict;
use warnings;

use Smart::Comments;
use Jcode;
use Getopt::Long;
use Pod::Usage;

our $VERSION = '0.2';

# Command-line state.  $quotes and $singlequotes are file-scoped because
# loadtextconfig() reads them when parsing the glossary.
my ($glosf,$docf,$help,$man,$quotes,$singlequotes) = ("","",undef,undef,0,0);
my $outf = "replaced";

# GetOptions() removes every option it recognises from @ARGV, so the
# "were we given anything at all?" test has to be taken before it runs.
my $had_args = scalar @ARGV;

GetOptions(
    'help|?'   => \$help,
    'man'      => \$man,
    "glosf=s"  => \$glosf,   # string required
    "docf=s"   => \$docf,    # string required
    "outf:s"   => \$outf,    # string, optional
    "quotes|quotationmarks"             => \$quotes,        # optional flag
    "singlequotes|singlequotationmarks" => \$singlequotes   # optional flag
) or pod2usage(2);

pod2usage(1) if $help;
if ($man) {
    print $0 . " " . $VERSION . "\n";
    pod2usage(-exitstatus => 0, -verbose => 3);
    exit 0;
}
pod2usage(1) unless $had_args;

# Both the glossary and the document are mandatory (pod2usage exits itself).
pod2usage(-exitstatus => 0, -verbose => 2)
    unless ((length($glosf)>0) && (length($docf)>0));

die "You may not select both -s)inglequotes and -q)uotes at the same time.\n\n"
    if ($quotes && $singlequotes);

# "outf:s" lets -o appear without a value, which would leave an empty base name.
$outf = "replaced" unless (defined($outf) && length($outf));

#DEBUG print "Got: GLOSF $glosf DOCF $docf OUTF $outf QUOTES $quotes SINGLEQUOTES $singlequotes\n";

# Load the glossary: %words maps Japanese => English, @keylist keeps file order.
my %words;
my @keylist = loadtextconfig(\%words,$glosf);
# foreach (@keylist) { print $_ . " => " . $words{$_} . "\n"; }

# Slurp the whole document and convert it to EUC.
open(my $in, '<', $docf) or die "Could not open document file $docf: $!\n";
my $rawdoc = do { local $/; <$in> };
close($in);
$rawdoc = '' unless defined $rawdoc;   # empty file

my $doc  = toeuc($rawdoc);
my $jdoc = Jcode->new($doc);

# replace all words in glossary with progress bar
# NOTE(review): each key is handed to Jcode's s() as a pattern, so regex
# metacharacters in a glossary term are presumably interpreted as such - confirm.
foreach my $k (@keylist) { ### Processing [===| ] % done
    $jdoc->s($k,$words{$k},'g'); # s/pattern/replace/opt
}
#use Data::Dumper; print Dumper($jdoc); exit 0;
#print $jdoc->euc . "\n"; exit 0;

# write euc and sjis versions
print "Writing sjis version... ";
my $sjis_name = $outf . ".sjis.txt";
open(my $sjis_fh, '>', $sjis_name) or die "Could not open $sjis_name for writing: $!\n";
print {$sjis_fh} $jdoc->sjis;
close($sjis_fh) or die "Could not finish writing $sjis_name: $!\n";
print "done.\n";

print "Writing euc version... ";
my $euc_name = $outf . ".euc.txt";
open(my $euc_fh, '>', $euc_name) or die "Could not open $euc_name for writing: $!\n";
print {$euc_fh} $jdoc->euc;
close($euc_fh) or die "Could not finish writing $euc_name: $!\n";
print "done.\n";

exit 0;

## SUBROUTINES ##

sub loadtextconfig {
    # Read a text-based config file into a single-level hash, die on disk error
    # Usage: my @keylist = loadtextconfig(\%myhash, $myfilename);
    # Each non-comment, non-blank line is "key TAB value".  The file content is
    # converted to EUC with Jcode before parsing.  Returns the keys in file
    # order; the hash referenced by the first argument is filled in place.
    # Honours the file-scoped $singlequotes / $quotes flags by stripping one
    # surrounding quotation mark from each side of both fields.
    # Note: SJIS CAN BREAK/bakemoji e.g. a comma separated list of prefectures
    # Copyright 2001-2006 (c) Matt Rosin
    # Note: modified from 2003 version on 2006-1127 to add Jcode conversion
    my ($href,$f) = @_;
    my @idx = (); # array containing the keys in order

    open(my $fh, '<', $f) || die "Content-type: text/html\n\n ERROR: Could not open $f $!";
    my $slurp = do { local $/; <$fh> };
    close($fh);
    $slurp = "" unless defined $slurp;   # empty file

    my $slurpobj = Jcode->new($slurp);
    my $eucbuf   = $slurpobj->euc;

    foreach my $line (split(/\n/,$eucbuf)) {
        next if $line =~ /^#/;          # ignore commented lines
        $line =~ s/\r|\n//g;            # remove ending carriage return and/or newline
        $line =~ s/^\s+//;              # remove leading whitespace
        $line =~ s/\s+$//g;             # remove trailing whitespace
        next unless length($line);      # skip blank lines

        my ($term,$gloss) = split(/\t/,$line,2);   # $gloss holds rest of line
        $gloss = "" unless defined $gloss;         # line had no TAB: empty replacement
        $gloss =~ s/^\s+//;                        # remove leading whitespace

        if ($singlequotes) {
            # remove single quotation marks around terms
            $term  =~ s/^\'//;  $term  =~ s/\'$//;
            $gloss =~ s/^\'//;  $gloss =~ s/\'$//;
        }
        elsif ($quotes) {
            # remove double quotation marks around terms
            $term  =~ s/^\"//;  $term  =~ s/\"$//;
            $gloss =~ s/^\"//;  $gloss =~ s/\"$//;
        }

        # An empty term would reach the caller's substitution as an empty pattern.
        next unless length($term);

        $href->{$term} = $gloss;
        push(@idx,$term); # ADDED 2006-1127
    }
    # print "loadtextconfig $f: \n" . Dumper($href) . " \n";
    return @idx;
}

sub toeuc {
    # Return the given text converted to EUC by Jcode.
    my $tmp = shift;
    # &jcode'convert(*tmp,'euc'); # convert to anything to jis (less efficient)
    my $tj = Jcode->new($tmp);
    return $tj->euc;
}

__END__

=head1 NAME

Glossing

=head1 VERSION

0.2

=head1 SYNOPSIS

glossing -g glossary -d document -o output_base [-q|-s]

The output base filename defaults to "replaced" if not provided.
Note this requires perl 5.8.1 or better for the replace function to work.

 Options:
   -h,-help,-?                 prints brief help
   -man                        prints long manpage
   -q,-quot,-quotes            remove double quotation marks
   -s,-single,-singlequotes    remove single quotation marks

=head1 DESCRIPTION

Glossing was made to assist in study and translation of Japanese for
foreign language speakers. Given a dictionary file and a text document,
glossing will use the dictionary file as a glossary and replace all
occurrences of all words in the glossary that it finds in your text
document.

=head1 FILE FORMATS

The dictionary file and document file are both converted to EUC first
internally, and output is saved in both SJIS and EUC formats in separate
files. You specify these files on the command line, along with an output
filename, to which is added ".sjis.txt" or ".euc.txt".

The dictionary file must be a text file organized in two columns,
separated by tabs. For example you could save a spreadsheet from
OpenOffice Calc by selecting EUC encoding and TAB delimited records. It
wants to add quotation marks around each field though, so to remove these
add the -q or -s switches.

=head1 AUTHOR

Copyright (c) 2006-2007 by Matt Rosin