/*
 Does a join of two rdbtables.

 Author: Carlo Strozzi <carlos@linux.it>
*/

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

#define MAX_JOIN_ARGS           32
#define MAX_ARG_LENGTH          1024
#define EMPTY                   ""

/*
 * Print the usage/help text for this operator on stdout.
 *
 * my_name is the program name; it is substituted twice, once in the
 * banner line and once in the "Usage:" line.
 *
 * The text is assembled from adjacent string literals, one per output
 * line, each ending in an explicit "\n".  A string literal may not contain
 * a raw newline in standard C, so the message cannot be written as one
 * literal spanning several source lines.
 */
void show_help( char *my_name)
{
    printf(
        "\n"
        "        NoSQL operator: %s\n"
        "\n"
        "Usage: %s [options] < rdbtable_1 'column_1 [column_2]' header_2 body_2\n"
        "\n"
        "Options:\n"
        "    -help  Print this help info.\n"
        "    -a     Do a \"Master/Detail\" join. The table from STDIN is the master.\n"
        "    -n     Strip header from output.\n"
        "    -x     Debug option.\n"
        "\n"
        "Does a join of two rdbtables on the (Key) column(s) specified.  The default is\n"
        "a \"natural\" join, with optional \"Master/Detail\" or cartesian (cross-product)\n"
        "type joins.\n"
        "\n"
        "Column names are in the form 'column_1 [column_2]'.\n"
        "For example, to join the input rdbtables on columns 'NAME' and 'JOB' the\n"
        "statement is:\n"
        "\n"
        "                        'NAME  JOB' \n"
        "\n"
        "Note how the list of columns must be quoted, i.e. it must be one\n"
        "single token. Characters that are special to the Unix shell must be quoted\n"
        "on the command line.\n"
        "\n"
        "Each item in the list of column name(s) specifys a key column, which may be\n"
        "different in the two rdbtables, i.e. 'column_2', if given, refers to a\n"
        "name in body_2 that corresponds to 'column' in rdbtable_1.  If 'column_2'\n"
        "is not given it means that the corresponding column name in both rdbtables\n"
        "is the same. There can be at most two column names in the list.\n"
        "\n"
        "If the key column names are different in the two rdbtables, the name of the\n"
        "key columns in the output rdbtable will be from rdbtable_1.\n"
        "\n"
        "Note that the two rdbtables must be sorted on the key columns in order for a\n"
        "join operation to function correctly.\n"
        "\n"
        "The order of columns in the output rdbtable will be: first the key columns,\n"
        "then the other columns from rdbtable_1, then the other columns from body_2.\n"
        "\n"
        "If both rdbtables contain table documentation (comment) lines then those from\n"
        "rdbtable_1 will be first in the output rdbtable.\n"
        "\n"
        "This operator reads the primary (Master) rdbtable via STDIN and the secondary\n"
        "rdbtable from two files; the two files must contain the header (header_2) and\n"
        "the body (body_2) of the secondary rdbtable, respectively. See join(1) for\n"
        "more details on how the Unix \"join\" works.\n"
        "\n"
        "The resulting rdbtable is written to STDOUT. If an invalid column is\n"
        "specified, then nothing is printed to STDOUT.\n"
        "\n"
        "\n"
        "'$Id: nsq-fjoin.c,v 1.1 1998/05/29 20:43:01 carlos Exp $'\n"
        "\n"
        "            ----------------------\n"
        "NoSQL RDBMS, Copyright (C) 1998 Carlo Strozzi.\n"
        "This program comes with ABSOLUTELY NO WARRANTY; for details\n"
        "refer to the GNU General Public License.\n"
        "\n"
        "You should have received a copy of the GNU General Public License\n"
        "along with this program;  if not, write to the Free Software\n"
        "Foundation, Inc., 59 Temple Place Suite 330, Boston, MA 02111-1307\n"
        "USA.\n"
        "            ----------------------\n",
        my_name, my_name);
}

/*
 * Fallbacks for the two build-time settings used below.
 * NOTE(review): neither AWK nor MAX_COMMAND_LENGTH is defined in this
 * file; presumably the build supplies them (e.g. with -D) -- confirm.
 * The values here only take effect when the build does not define them.
 */
#ifndef AWK
#define AWK                     "awk"
#endif
#ifndef MAX_COMMAND_LENGTH
#define MAX_COMMAND_LENGTH      16384
#endif

/*
 * Format the AWK join program into buf (capacity size bytes).
 *
 *   c_names  key column list, e.g. "NAME JOB"
 *   hdr_2    file holding the header of the secondary rdbtable
 *   body_2   file holding the body of the secondary rdbtable
 *   debug    non-zero: the AWK program prints its ARGV and the join(1)
 *            command line on /dev/stderr
 *   outerj   non-zero: add "-a 1" to the join(1) arguments
 *   no_hdr   non-zero: the AWK program does not print the table header
 *
 * Returns the value of snprintf(): the length the complete program needs
 * (so a result >= size means buf holds a truncated program), or a negative
 * value on an output error.
 *
 * NOTE(review): c_names, hdr_2 and body_2 are pasted verbatim into AWK
 * string literals, and body_2 also ends up in the command line that AWK
 * pipes to; quotes, backslashes or shell metacharacters in them are not
 * escaped.
 */
static int build_awk_program( char *buf, size_t size, const char *c_names,
                              const char *hdr_2, const char *body_2,
                              int debug, int outerj, int no_hdr )
{
  return snprintf(buf, size,
    "#\n"
    "#\n"
    "BEGIN {\n"
    "  NULL = \"\"\n"
    "  FS = \"\\t\"; OFS = FS;\n"
    "\n"
    "  # Get join field names and set defaults.\n"
    "  if( split( \"%s\", c_names, \" \" ) < 2 )\n"
    "    c_names[2] = c_names[1]\n"
    "\n"
    "  # Read secondary table header.\n"
    "  while( getline < \"%s\" > 0 ) {\n"
    "    # Table comments.\n"
    "    if( r == 0 && $0 ~ /^ *#/ ) cmt_2[++b] = $0\n"
    "    \n"
    "    # Column names and positions.\n"
    "    else if( r == 0 ) {\n"
    "      while( ++p <= NF ) {\n"
    "        # Make sure we pick the first occurrence of duplicated column\n"
    "        # names (it may happen after a join).\n"
    "        if( P2[$p] == NULL ) {\n"
    "          P2[$p]=p\n"
    "          N2[p]=$p\n"
    "        }\n"
    "        else N2[p]=\".\"\n"
    "      }\n"
    "      j2 = P2[c_names[2]]\n"
    "      # Exit on invalid column name.\n"
    "      if( j2 == NULL )  exit\n"
    "      r++\n"
    "    }\n"
    "    # Column definitions.\n"
    "    else if( r == 1 ) {\n"
    "      p=0\n"
    "      while( ++p <= NF ) {\n"
    "        if( D2[p] == NULL ) D2[p] = $p\n"
    "\t\telse D2[p] = \".\"\n"
    "\t  }\n"
    "      r++\n"
    "    }\n"
    "  }\n"
    "  # We are going to re-use this counter.\n"
    "  r=0\n"
    "\n"
    "  # Honour the 'debug' switch.\n"
    "  if( %d ) {\n"
    "    arg_vec = \"# ARGC = \" ARGC\n"
    "\n"
    "    for( arg in ARGV )\n"
    "      arg_vec = arg_vec \"\\n# ARGV[\" arg \"] = \" ARGV[arg]\n"
    "\n"
    "    print arg_vec > \"/dev/stderr\"\n"
    "  }\n"
    "}\n"
    "# Primary table read from STDIN.\n"
    "# Table comments.\n"
    "r == 0 && $0 ~ /^ *#/ {\n"
    "  out_hdr[++h] = $0\n"
    "  next\n"
    "}\n"
    "# Column names and positions.\n"
    "r == 0 {\n"
    "  # Add comments from secondary table to header.\n"
    "  b=0\n"
    "  while( cmt_2[++b] != NULL )  out_hdr[++h] = cmt_2[b] \n"
    "  p=0\n"
    "  while( ++p <= NF ) {\n"
    "    # Make sure we pick the first occurrence of duplicated column\n"
    "    # names (it may happen after a join).\n"
    "    if( P1[$p] == NULL ) {\n"
    "      P1[$p]=p\n"
    "      N1[p]=$p\n"
    "    }\n"
    "    else N1[p]=\".\"\n"
    "  }\n"
    "  j1 = P1[c_names[1]]\n"
    "  # Exit on invalid column name.\n"
    "  if( j1 == NULL )  exit\n"
    "  # Build join(1) arg list.\n"
    "  join_args = \"-j1 \" j1 \" -j2 \" j2\n"
    "  if( %d ) join_args = join_args \" -a 1\"\n"
    "  join_args = \"-t \\\"\\011\\\" \" join_args \" - %s\"\n"
    "  out_rec = N1[ j1 ]\n"
    "  field_list = \" 1.\" P1[out_rec]\n"
    "  while( N1[++c] != NULL )\n"
    "\tif( N1[c] != \".\" && c != j1 ) {\n"
    "\t  out_rec = out_rec OFS N1[c]\n"
    "\t  field_list = field_list \",1.\" P1[ N1[c] ]\n"
    "\t}\n"
    "  c=0\n"
    "  while( N2[++c] != NULL )\n"
    "\tif( N2[c] != \".\" && c != j2 ) {\n"
    "\t  out_rec = out_rec OFS N2[c]\n"
    "\t  field_list = field_list \",2.\" P2[ N2[c] ]\n"
    "\t}\n"
    "  if( ! %d && out_rec != NULL ) out_hdr[++h] = out_rec\n"
    "  join_cmd = \"join -o \" field_list \" \" join_args\n"
    "  if( %d )  print \"# join_cmd = \" join_cmd > \"/dev/stderr\"\n"
    "  r++\n"
    "  next\n"
    "}\n"
    "# Column definitions.\n"
    "r == 1 {\n"
    "  if( ! %d ) {\n"
    "    p=0\n"
    "    while( ++p <= NF ) {\n"
    "      if( D1[p] == NULL ) D1[p] = $p\n"
    "\t  else D1[p] = \".\"\n"
    "\t}\n"
    "    out_rec = D1[ j1 ]\n"
    "    c=0\n"
    "    while( D1[++c] != NULL )\n"
    "\t  if( D1[c] != \".\" && c != j1 ) out_rec = out_rec OFS D1[c]\n"
    "    c=0\n"
    "    while( D2[++c] != NULL )\n"
    "\t  if( D2[c] != \".\" && c != j2 ) out_rec = out_rec OFS D2[c]\n"
    "    out_hdr[++h] = out_rec\n"
    "    # Now print the final table header.\n"
    "    c=0\n"
    "    while( out_hdr[++c] != NULL ) print out_hdr[c]\n"
    "  }\n"
    "  r++\n"
    "  next\n"
    "}\n"
    "# Table body.\n"
    "{\n"
    "  print | join_cmd\n"
    "}",
    c_names, hdr_2, debug, outerj, body_2, no_hdr, debug, no_hdr);
}

/*
 * Entry point: parse the options, collect the three mandatory arguments
 * (key column list, header_2 file, body_2 file), generate the AWK program
 * and replace this process with AWK running it.
 *
 * Exit status: 0 after -h; 1 on a usage error, when the generated program
 * does not fit in MAX_COMMAND_LENGTH, or when AWK cannot be executed.
 * Otherwise the process image is replaced by AWK and this function never
 * returns.
 */
int main( int  argc, char *argv[] ) {

  /* getopt() state; a repeated declaration is harmless if <unistd.h>
     already provides it. */
  extern int optind;

  char *my_name = argv[0];
  char cmd_buf[MAX_COMMAND_LENGTH];
  const char *c_names = NULL;
  const char *hdr_2 = NULL;
  const char *body_2 = NULL;
  int opt;
  int prog_len;
  int no_hdr = 0, debug = 0, outerj = 0;

  /* getopt() signals the end of the options with -1. */
  while ((opt = getopt(argc, argv, "axnh")) != -1) {
    switch (opt) {
      case 'h':
        show_help(my_name);
        exit(0);
      case 'a':
        outerj = 1;   /* An outer (Master/Detail) join was requested */
        break;
      case 'n':
        no_hdr = 1;
        break;
      case 'x':
        debug = 1;
        break;
      default:
        show_help(my_name);
        exit(1);
    }
  }

  /* Positional arguments, used in place so they are never truncated. */
  if (optind < argc)
    c_names = argv[optind++];

  if (optind < argc)
    hdr_2 = argv[optind++];

  if (optind < argc)
    body_2 = argv[optind++];

  /* Check for mandatory arguments; an empty string counts as missing. */
  if (c_names == NULL || c_names[0] == '\0' ||
      hdr_2 == NULL   || hdr_2[0] == '\0'   ||
      body_2 == NULL  || body_2[0] == '\0') {
    show_help(my_name);
    exit(1);
  }

  prog_len = build_awk_program(cmd_buf, sizeof cmd_buf, c_names, hdr_2,
                               body_2, debug, outerj, no_hdr);

  /* Never hand AWK a program that was cut short. */
  if (prog_len < 0 || (size_t)prog_len >= sizeof cmd_buf) {
    fprintf(stderr, "%s: generated AWK program does not fit in %lu bytes\n",
            my_name, (unsigned long)sizeof cmd_buf);
    exit(1);
  }

  if (debug)
    fprintf(stderr,
            "Generated AWK program:\n"
            "      ----------\n%s\n----------\n", cmd_buf);

  /* The terminator of a variadic exec call must be a null *pointer*. */
  execlp(AWK, "awk", cmd_buf, (char *)NULL);

  /* execlp() only returns when it failed. */
  perror(AWK);
  exit(1);
}
