package CleanTeX;
# This package prepares a LaTeX file for translation to LyX (via reLyX.pl)
# - Translates some local commands (e.g., {\em blah} to {\emph{blah}})
# - Prepares math mode stuff for LyX. LyX reads LaTeX math mode directly,
#      so reLyX.pl can basically copy all math mode exactly, but LyX is a
#      bit stricter than LaTeX.

use strict;

######
# Global variables
my $TeXStackLevel = 0;  #Do we need this? Can't we use $# or $TeXStack[-1]?
my @TeXStack = (".");
my $printstr = ""; # current string to print
my $last_printstr = ""; # last string we printed


#########################   PARSER INVOCATION   ################################
sub call_parser {
# This subroutine opens the TeX parser and processes the file.
# Arg0 is the name of the input TeX file
# Arg1 is the name of the output "clean" file

    # This is the list of tokens for the parser
    # Parts of the token list were swiped from TeX.pm
    # NOTE that we don't have to worry below about fancy token types
    #     like TT::BegArgsToken or TT::LookAhead, because we only
    #     define simple tokens in the tokens hash
    my %MyTokens = ( '{' => $Text::TeX::Tokens{'{'},
		     '}' => $Text::TeX::Tokens{'}'},
		     '%' => {'class' => 'Text::TeX::Token'},
		     "\$" => $Text::TeX::Tokens{"\$"},
		     '$$' => $Text::TeX::Tokens{'$$'},
		     '\em' => $Text::TeX::Tokens{'\em'},
		     '\rm' => $Text::TeX::Tokens{'\em'},
		     '\bf' => $Text::TeX::Tokens{'\em'},
		     '\tt' => $Text::TeX::Tokens{'\em'},
		     '\sf' => $Text::TeX::Tokens{'\em'},
		     '\sc' => $Text::TeX::Tokens{'\em'},
		     '\sl' => $Text::TeX::Tokens{'\em'},
		     '\it' => $Text::TeX::Tokens{'\em'},
		     '\rmfamily' => $Text::TeX::Tokens{'\em'},
		     '\ttfamily' => $Text::TeX::Tokens{'\em'},
		     '\sffamily' => $Text::TeX::Tokens{'\em'},
		     '\mdseries' => $Text::TeX::Tokens{'\em'},
		     '\bfseries' => $Text::TeX::Tokens{'\em'},
		     '\upshape' => $Text::TeX::Tokens{'\em'},
		     '\itshape' => $Text::TeX::Tokens{'\em'},
		     '\slshape' => $Text::TeX::Tokens{'\em'},
		     '\scshape' => $Text::TeX::Tokens{'\em'}
		    ); 

    my ($InFileName, $OutFileName) = (shift,shift);

    warn "Cleaning TeX file... \n";
    open (OUTFILE, ">$OutFileName");

    my $file = new Text::TeX::OpenFile 
	   $InFileName,
	   'defaultact' => \&clean_tex,
	   'tokens' => \%MyTokens;

    # Now actually process the file
    $file->process;
    close OUTFILE;
    warn "Done cleaning TeX file\n";
} # end sub call_parser


#######################   MAIN TRANSLATING SUBROUTINE   ########################
# Routine called by the TeX-parser to perform token-processing.
sub clean_tex {
    my($eaten,$txt) = (shift,shift);
    my ($outstr, $type);

    # a faux "switch" statement.  sets $_ for later use in pattern
    # matching.
    $type = ref($eaten);
    $type =~ s/^Text::TeX::// or die "Non-Text::TeX token";
    $printstr = ""; # default for undefined printstrs etc.
    for ($type) {
	   # Handle blank lines.
	   if (m/Paragraph/o) {
		  $printstr = "\n";
		  last;
	   }
	   
	   # Handle comment at end of paragraph
	   if (! defined $eaten->[0]) {
	       warn "Weird undefined token!" unless $eaten->[1];
	       last;
	    }

	   # do it here because $eaten->[0] is undefined for Paragraph
	   $outstr = $eaten->[0];
	   
	   # Handle LaTeX tokens
	   if (m/Token/o) {
		  &do_token($outstr);
		  last;
	   }
	   
	   # Handle opening groups, like '{' and '$'.
	   if (m/Begin::Group/o) {
		  &do_begin_group($outstr);
		  last;
	   }
	   
	   # Handle closing groups, like '}' and '$'.
	   if (m/End::Group/o) {
		  &do_end_group($outstr);
		  last;
	   }
	   
	   # Handle the end of a local font command - insert a '}'
	   if (m/EndLocal/o) {
		  &do_end_group('}');
		  last;
	   }
	   
	   if (m/Text/) {
	       # Note 1: Text::TeX eats spaces after a token beginning with '\'.
	       #    Usually this would be fine, but CleanTeX ends up printing,
	       #    e.g., '\itemblah' instead of '\item blah'. This will confuse
	       #    reLyX.pl into thinking the token is \itemblah. So if a
	       #    text follows a multiletter token, put a space before it.
	       if ($last_printstr =~ /\\[a-zA-Z]+$/ && $outstr =~ /^[a-zA-Z]/) {
	           $outstr = " " . $outstr;

	       # Note 2: LyX requires _ and ^ to have braces around their
	       #     arguments, while LaTeX accepts one-letter arguments
	       #     without braces. Add braces if necessary.
	       } elsif ($last_printstr =~ /^[_^]$/) {
	           $outstr =~ s/^(.)/{$1}/;
	       }
	       
	       $printstr = $outstr;
	       last;
	   }

	   # The default action - print the string.
	   $printstr = $outstr;
    } # end for (ref($eaten))
    
    # Actually print the string
    if (defined $printstr) { 
	print OUTFILE $printstr;
    } else {warn "Undefined printstr";}
    $last_printstr = $printstr; #save for next time

#  print "Nested Level #",$TeXingroup,"...\n";
#  print ref($eaten), ": ", $eaten->[0], "\n";
}


####################   TRANSLATOR SUBROUTINES    ###############################
# Processes begin-group tokens
sub do_begin_group {
    my $tokstr = shift;
    my %begtranstbl = (
				"\$" => '\(', # LyX math mode doesn't
				'$$' => '\[', # understand \$ or $$
				);

    if ($tokstr eq '{') {
	   push @TeXStack, $tokstr;
	   $TeXStackLevel += 1;
    }

    if ( exists $begtranstbl{$tokstr} ) {
	   $printstr = $begtranstbl{$tokstr};
    }
    else {
	   $printstr = $tokstr;
    }
}


######
# Processes end-group tokens
sub do_end_group {
    my $tokstr = shift;
    my %endtranstbl = (
				   "\$" => '\)',
				   '$$' => '\]',
				   );

    if ($tokstr eq '}') {
	   pop @TeXStack;
	   $TeXStackLevel -= 1;
    }

    if ( exists $endtranstbl{$tokstr} ) {
	   $printstr = $endtranstbl{$tokstr};
    }
    else {
	   $printstr = $tokstr;
    }
}


######
# Processes tokens.
sub do_token {
    my $tokstr = shift; # the text to be printed
    my %transtbl = (
				'\em' => '\emph{',
				'\rm' => '\textrm{',
				'\bf' => '\textbf{',
				'\tt' => '\texttt{',
				'\sf' => '\textsf{',
				'\sc' => '\textsc{',
				'\sl' => '\textsl{',
				'\it' => '\textit{',
				'\rmfamily' => '\textrm{',
				'\ttfamily' => '\texttt{',
				'\sffamily' => '\textsf{',
				'\mdseries' => '\textmd{',
				'\bfseries' => '\textbf{',
				'\upshape' => '\textup{',
				'\itshape' => '\textit{',
				'\slshape' => '\textsl{',
				'\scshape' => '\textsc{',
				);

    # The \em command requires special handling, since it can work in
    # pairs.  To handle this, we use a stack-mechanism.  If we find a
    # second '\em', we print a group-ender and pop the stack.
    if ($tokstr eq '\em') {
	   if ($TeXStack[$TeXStackLevel] eq '\em'){
		  # The last thing on the stack was an '\em' - end this
		  # group 
		  do_end_group '}';
	   }
	   else {
		  # This is a new '\em' - handle like the others.
		  push @TeXStack, $tokstr;
		  $TeXStackLevel += 1;
		  $printstr = $transtbl{$tokstr};
	   }
    }
    elsif ( exists $transtbl{$tokstr} ) {
	   # Normal translatable token
	   push @TeXStack, $tokstr;
	   $TeXStackLevel += 1;
	   $printstr = $transtbl{$tokstr};
    }
    else {
	   # Unknown token - just print it.
	   $printstr = $tokstr;
    }
}


1; # return true value to calling program
