#!/usr/bin/perl

use strict;
use warnings;

use Chemistry::OpenSMILES qw(
    is_aromatic_bond
    is_cis_trans_bond
    is_double_bond
    is_single_bond
    valence
);
use Chemistry::OpenSMILES::Aromaticity qw( kekulise );
use Chemistry::OpenSMILES::Parser;
use Chemistry::OpenSMILES::Stereo qw(
    chirality_to_pseudograph
    cis_trans_to_pseudoedges
    mark_all_double_bonds
);
use Chemistry::OpenSMILES::Writer qw( write_SMILES );
use File::Basename qw( basename );
use Getopt::Long::Descriptive;
use Graph::Nauty qw( canonical_order );
use List::Util qw( all any first shuffle );
use SmilesScripts::Aromaticity;
use SmilesScripts::MolecularTransformations qw( canonicalise_nitro_groups );

$Graph::Nauty::worksize = 25600;

my $basename = basename $0;
my( $opt, $usage ) = describe_options( <<"END" . 'OPTIONS',
USAGE
    $basename [<args>] [<files>]

DESCRIPTION
    $basename reads in files with SMILES descriptors and outputs them
    according to stable atom ordering established by Graph::Nauty.
    Moieties, if more than one, are ordered in lexicographic order.

END
    [ raw => hidden => {
        one_of => [
            [ 'infer-hydrogens' =>
                'infer hydrogen atom counts according to valency rules [default]' ],
            [ 'no-infer-hydrogens' =>
                'do not infer hydrogen atom counts' ]
        ],
        default => 'infer_hydrogens'
      }
    ],
    [],
    [ aroma => hidden => {
        one_of => [
            [ 'aromatise' => 'aromatise Kekule structures ' .
                             '(experimental)' ],
            [ 'no-aromatise' => 'do not attempt to aromatise [default]' ]
        ],
        default => 'no-aromatise'
      }
    ],
    [ 'aromatisation-method=s',
      'Mark flat rings as aromatic. Two methods are supported: ' .
      '\'electron-cycles\', which follows Richard L. Apodaca\'s electron ' .
      'cycle-based approach (default), \'Huckel\', ' .
      'which aims to follow Hückel\'s rule as implemented in ' .
      'Chemistry::Ring. \'external-list\': uses aromaticity ' .
      'defined using --aromatic-atoms or --aromatic-bonds options.',
      { default => 'electron-cycles' } ],
    [ 'aromatic-atoms=s',
      'comma-separated list of 0-based aromatic atom numbers to be ' .
      'consulted during aromaticity detection. Implies ' .
      '--aromatisation-method \'external-list\'.',
      { implies => { aromatisation_method => 'external-list' } } ],
    [ 'aromatic-bonds=s',
      'comma-separated list of bonds, expressed as dash-separated pairs ' .
      'of 0-based aromatic atom numbers. The list is consulted during ' .
      'aromaticity detection. Implies --aromatisation-method ' .
      '\'external-list\'.',
      { implies => { aromatisation_method => 'external-list' } } ],
    [],
    [ kekul => hidden => {
        one_of => [
            [ 'kekulise' => 'kekulise simple aromatic structures ' .
                            '(experimental)' ],
            [ 'no-kekulise' => 'do not attempt to kekulise [default]' ]
        ],
        default => 'no-kekulise'
      }
    ],
    [],
    [ 'no-acyclic-aromatic-bonds', 'convert acyclic aromatic bonds to single' ],
    [],
    [ haloanions => hidden => {
        one_of => [
            [ 'canonicalise-haloanions'    => 'canonicalise the representation of haloanions [default]' ],
            [ 'no-canonicalise-haloanions' => 'retain input representation of haloanions' ],
        ],
        default => 'canonicalise_haloanions'
      }
    ],
    [ nitro_groups => hidden => {
        one_of => [
            [ 'canonicalise-nitro-groups', 'canonicalise the representation of nitro groups by converting them from *-[N+]([O-])=O to *-N(=O)=O' ],
            [ 'no-canonicalise-nitro-groups', 'retain input representation of nitro groups [default]' ],
        ],
        default => 'no_canonicalise_nitro_groups'
      }
    ],
    [],
    [ 'ignore-class',
      'ignore SMILES atom class in canonicalisation (useful for testing)' ],
    [ 'preserve-order', 'preserve input order of molecular entities and atoms' ],
    [ 'random-order',
      'instead of canonical, output SMILES in random order (useful for testing)' ],
    [],
    [ 'help', 'print usage message and exit', { shortcircuit => 1 } ],
);

if( $opt->help ) {
    print $usage->text;
    exit;
}

if( defined $opt->aromatisation_method &&
    $opt->aromatisation_method eq 'external-list' ) {
    my @options_used = grep { $opt->_specified( $_ ) }
                            ( 'aromatic-atoms', 'aromatic-bonds' );
    if( @options_used == 2 ) {
        print STDERR "${basename}:: ERROR, aromatisation method 'external-list' " .
                     'expects a value for either --aromatic-atoms or --aromatic-bonds, ' .
                     'but not both.' . "\n";
        exit 1;
    }
    if( !@options_used ) {
        print STDERR "${basename}:: ERROR, aromatisation method 'external-list' " .
                     'expects a value for either --aromatic-atoms or --aromatic-bonds ' .
                     "(use \'\' for an empty list).\n";
        exit 1;
    }
}

my $errors = 0;
while (<>) {
    chomp;
    my $additional_position = '';
    if( s/\t([^\t]*)$// ) {
        $additional_position = ' ' . $1;
    }

    local $SIG{__WARN__} = sub {
        print STDERR "$basename: $ARGV($.)$additional_position: $_[0]";
    };

    my $parser = Chemistry::OpenSMILES::Parser->new;
    my @moieties;
    eval {
        @moieties = $parser->parse( $_, { raw => $opt->raw eq 'no_infer_hydrogens' } );
    };
    if( $@ ) {
        $@ =~ s/^[^:]+:\s*// if !index( $@, $0 );
        print STDERR "$basename: $ARGV($.)$additional_position: $@";
        $errors++;
    }

    my @smiles_parts;
    for my $moiety (@moieties) {
        if( $opt->aroma eq 'aromatise' ) {
            if(         $opt->aromatisation_method eq 'electron-cycles' ) {
                Chemistry::OpenSMILES::Aromaticity::aromatise( $moiety );
            } elsif( lc $opt->aromatisation_method eq 'huckel' ) {
                SmilesScripts::Aromaticity::aromatise( $moiety );
            } elsif(    $opt->aromatisation_method eq 'external-list' ) {
                my %atoms_by_number = map { $_->{number} => $_ } $moiety->vertices;
                my @aromatic_atoms;
                if( $opt->aromatic_atoms ) {
                    @aromatic_atoms = map   { $atoms_by_number{$_} }
                                      grep  { exists $atoms_by_number{$_} }
                                      map   { int }
                                      split ',', $opt->aromatic_atoms;
                }
                if( $opt->aromatic_bonds ) {
                    for my $bond (map { [ map { int } split '-', $_ ] } split ',', $opt->aromatic_bonds) {
                        next unless all { exists $atoms_by_number{$_} } @$bond;
                        push @aromatic_atoms, map { $atoms_by_number{$_} } @$bond;
                        $moiety->set_edge_attribute( $atoms_by_number{$bond->[0]},
                                                     $atoms_by_number{$bond->[1]},
                                                     'bond',
                                                     ':' );
                    }
                }
                for my $atom (@aromatic_atoms) {
                    next unless $atom->{symbol} =~ /^([BCNOPS]|Se|As)$/;
                    $atom->{symbol} = lcfirst $atom->{symbol};
                }
            } else {
                die sprintf "unknown aromatisation method '%s'",
                            $opt->aromatisation_method;
            }
        }
        acyclic_aromatic_bonds_to_single( $moiety ) if $opt->no_acyclic_aromatic_bonds;
        canonicalise_haloanions( $moiety )
                    if $opt->haloanions eq 'canonicalise_haloanions';
        canonicalise_nitro_groups( $moiety )
                    if $opt->nitro_groups eq 'canonicalise_nitro_groups';

        my @order;
        if( !$opt->preserve_order && !$opt->random_order ) {
            # copy() makes a shallow copy without edge attributes, thus they
            # have to be added later:
            my $copy = $moiety->copy;
            for my $bond ($moiety->edges) {
                next unless $moiety->has_edge_attribute( @$bond, 'bond' );
                $copy->set_edge_attribute( @$bond,
                                           'bond',
                                           $moiety->get_edge_attribute( @$bond, 'bond' ) );
            }
            cis_trans_to_pseudoedges( $copy );
            chirality_to_pseudograph( $copy );

            @order = canonical_order( $copy, \&represent_vertex );
            my %order;
            for (0..$#order) {
                $order{$order[$_]} = $_;
            }

            # Collect cis/trans bonds for marking them up
            my @cis_trans_bonds;
            for my $bond ($moiety->edges) {
                next unless is_double_bond( $moiety, @$bond );

                my $subgraph = $copy->subgraph( [ $moiety->neighbours( $bond->[0] ),
                                                  $moiety->neighbours( $bond->[1] ) ] );
                my $cis_trans_bond = first { $copy->has_edge_attribute( @$_, 'pseudo' ) }
                                           $subgraph->edges;
                next unless $cis_trans_bond;

                @$cis_trans_bond = reverse @$cis_trans_bond unless $subgraph->has_edge( $bond->[0], $cis_trans_bond->[0] );
                push @cis_trans_bonds, [ $cis_trans_bond->[0],
                                         $bond->[0],
                                         $bond->[1],
                                         $cis_trans_bond->[1],
                                         $copy->get_edge_attribute( @$cis_trans_bond, 'pseudo' ) ];
            }

            # Drop cis/trans markers from the input graph and mark them
            # anew.
            for my $bond ($moiety->edges) {
                next unless is_cis_trans_bond( $moiety, @$bond );
                $moiety->delete_edge_attribute( @$bond, 'bond' );
            }
            mark_all_double_bonds( $moiety,
                                   \@cis_trans_bonds,
                                   sub { $order{$_[0]} } );
        } elsif( $opt->preserve_order ) {
            @order = sort { $a->{number} <=> $b->{number} } $moiety->vertices;
        } else {
            @order = shuffle $moiety->vertices;
        }
        my %order;
        for (0..$#order) {
            $order{$order[$_]} = $_;
        }

        kekulise( $moiety, sub { $order{$_[0]} } ) if $opt->kekul eq 'kekulise';

        eval {
            my $part =
                 write_SMILES( $moiety,
                               {
                                    order_sub =>
                                        sub {
                                            my @sorted = sort { $order{$a} <=> $order{$b} }
                                                              keys %{$_[0]};
                                            return $_[0]->{shift @sorted};
                                        },
                                    raw => $opt->raw eq 'no_infer_hydrogens',
                               } );

            # In a SMILES descriptor, one can substitute all '/' with '\'
            # and vice versa, retaining correct cis/trans settings.
            # Similar rule is explained in O'Boyle (2012), Rule H.
            if( $part =~ /([\/\\])/ && $1 eq '\\' ) {
                $part =~ tr/\/\\/\\\//;
            }
            push @smiles_parts, $part;
        };
        if( $@ ) {
            print STDERR "$basename: $ARGV($.)$additional_position: $@";
            $errors++;
        }
    }

    @smiles_parts = sort @smiles_parts unless $opt->preserve_order;

    $additional_position =~ s/^ /\t/;
    print join( '.', @smiles_parts ), $additional_position, "\n";
}

exit( $errors > 0 );

sub represent_vertex
{
    my( $vertex ) = @_;

    return '' unless %$vertex;

    my %atom = %$vertex;
    delete $atom{chirality};
    delete $atom{class} if $opt->ignore_class;
    return write_SMILES( \%atom );
}

sub acyclic_aromatic_bonds_to_single
{
    my( $moiety ) = @_;

    my $aromatic = $moiety->copy_graph;
    $aromatic->delete_edges( map  { @$_ }
                             grep { !is_aromatic_bond( $moiety, @$_ ) }
                                  $moiety->edges );

    # Due to the issue in Graph, bridges() returns strings instead of real objects.
    # Graph issue: https://github.com/graphviz-perl/Graph/issues/29
    # The code below works on buggy (< 0.9727) as well as fixed (>= 0.9727) versions.
    my %vertices_by_name = map { $_ => $_ } $aromatic->vertices;
    my @bridges = map { [ map { $vertices_by_name{$_} } @$_ ] } $aromatic->bridges;
    for (@bridges) {
        $moiety->delete_edge_attribute( @$_, 'bond' );
    }

    # Make all aromatic atoms with less than two aromatic bonds nonaromatic.
    for my $atom (grep { $_->{symbol} ne ucfirst $_->{symbol} }
                       $moiety->vertices) {
        my $aromatic_bonds = grep { is_aromatic_bond( $moiety, $atom, $_ ) }
                                  $moiety->neighbours( $atom );
        next if $aromatic_bonds >= 2;
        $atom->{symbol} = ucfirst $atom->{symbol};
    }
}

# See https://projects.ibt.lt/repositories/issues/1622 for rationale and algorithm
sub canonicalise_haloanions
{
    my( $moiety ) = @_;

    return if ( $moiety->vertices < 5 || $moiety->vertices > 7 );

    my @anions = grep  { $moiety->degree($_) == 1 } $moiety->vertices;
    my $center = first { $moiety->degree($_)  > 3 } $moiety->vertices;

    return unless $center;
    return unless @anions == $moiety->vertices - 1;
    return unless $center->{symbol} =~ /^(As|Se|Si|[BPS])$/;
    return if any { $_->{symbol} !~ /^(At|Br|Cl|[FI])$/ } @anions;

    return unless any { $_->{charge} } @anions;
    return if any { $_->{charge} && $_->{charge} > 0 } @anions;

    for (@anions) {
        next unless exists $_->{charge};
        $center->{charge} += $_->{charge};
        delete $_->{charge};
    }
}
