/*--------------------------------------------------------------------*/
/*--- Callgrind                                                    ---*/
/*---                                                    ct_dump.c ---*/
/*--------------------------------------------------------------------*/

/*
   This file is part of Callgrind, a Valgrind tool for call tracing.

   Copyright (C) 2002-2004, Josef Weidendorfer (Josef.Weidendorfer@gmx.de)

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
   02111-1307, USA.

   The GNU General Public License is contained in the file COPYING.
*/

#include "global.h"
#include "events.h"


/*------------------------------------------------------------*/
/*--- Support for signal handlers and multi-threading      ---*/
/*------------------------------------------------------------*/

/* Dump Part Counter.
 * Written into the "part:" header line and, for triggered dumps, used as
 * the ".<part>" suffix of the dump file name (see new_dumpfile).
 */
static Int out_counter = 0;

/* Prefix of every dump file name; new_dumpfile appends ".<pid>" etc. */
static Char* dump_file_base = 0;
/* NOTE(review): not referenced in the part of the file visible here;
 * presumably the directory dumps are written to -- confirm at its users. */
static Char* base_directory = 0;

/* Command: text written after the "cmd: " header of a new dump file */
static Char cmdbuf[BUF_LEN];

/* Total reads/writes/misses sum over all dumps and threads.
 * Updated during CC traversal at dump time.
 */
FullCost SK_(total_cost) = 0;
/* Accumulates every cost written by fprint_fcost */
static FullCost dump_total_cost = 0;

/* Event mapping used when printing costs (fprint_fcost, fprint_jcc) */
EventMapping* SK_(dumpmap) = 0;

/* Temporary output buffer for
 *  print_fn_pos, fprint_apos, fprint_fcost, fprint_jcc,
 *  fprint_fcc_ln, dump_run_info, dump_state_info
 * It is shared scratch space: the content is only valid until the next
 * function formats into it.
 */
static Char outbuf[FILENAME_LEN + FN_NAME_LEN + OBJ_NAME_LEN];

/* Return the current value of the dump part counter (out_counter). */
Int SK_(get_dump_counter)(void)
{
  return out_counter;
}

/* Return the dump file name prefix (dump_file_base); 0 while it is unset.
 * The returned pointer is the internal one, not a copy.
 */
Char* SK_(get_dump_file_base)()
{
  return dump_file_base;
}

/*------------------------------------------------------------*/
/*--- Output file related stuff                            ---*/
/*------------------------------------------------------------*/

/* Boolean dumping array.
 * One allocation (dump_array) holding a flag per object, file, function
 * and context; the four pointers below are views into consecutive
 * sections of it (set up in init_dump_array). A set flag means the name
 * belonging to that number was already written to the current dump.
 */
static Bool* dump_array = 0;
static Int   dump_array_size = 0;
static Bool* obj_dumped = 0;
static Bool* file_dumped = 0;
static Bool* fn_dumped = 0;
static Bool* cxt_dumped = 0;

static void reset_dump_array()
{
    int i;

    CT_ASSERT(dump_array != 0);

    for(i=0;i<dump_array_size;i++)
	dump_array[i] = False;
}

/* Allocate the dump flag array and set up its four sections
 * (objects, files, functions, contexts, in this order), all cleared.
 * Must not be called while an array is still allocated.
 */
static void init_dump_array()
{
    dump_array_size =
	SK_(stat).distinct_objs  + SK_(stat).distinct_files +
	SK_(stat).distinct_fns   + SK_(stat).context_counter;

    CT_ASSERT(dump_array == 0);
    dump_array = VG_(malloc)(dump_array_size * sizeof(Bool));

    /* each section starts where the previous one ends */
    obj_dumped  = dump_array;
    file_dumped = &obj_dumped[SK_(stat).distinct_objs];
    fn_dumped   = &file_dumped[SK_(stat).distinct_files];
    cxt_dumped  = &fn_dumped[SK_(stat).distinct_fns];

    reset_dump_array();

    CT_DEBUG(1, "  init_dump_array: size %d\n", dump_array_size);
}

/* Release the dump flag array and invalidate all section pointers,
 * which point into the freed block.
 */
static __inline__
void free_dump_array()
{
    CT_ASSERT(dump_array != 0);
    VG_(free)(dump_array);

    cxt_dumped = fn_dumped = file_dumped = obj_dumped = dump_array = 0;
}


/* Reset a function position: all members are zeroed, i.e. no file,
 * function, object or context and recursion index 0.
 */
static __inline__
void init_fpos(FnPos* p)
{
    p->rec_index = 0;
    p->cxt  = 0;
    p->obj  = 0;
    p->fn   = 0;
    p->file = 0;
}


#if 0
static __inline__
static void fwrite(Int fd, Char* buf, Int len)
{
	VG_(write)(fd, (void*)buf, len);
}
#else

/* Size of the staging buffer used by fwrite() below */
#define FWRITE_BUFSIZE 32000
/* Chunks longer than this bypass the staging buffer and are written
 * directly */
#define FWRITE_THROUGH 10000
static Char fwrite_buf[FWRITE_BUFSIZE];
/* Number of bytes currently pending in fwrite_buf */
static Int fwrite_pos;
/* Descriptor the pending bytes belong to; -1 before the first fwrite() */
static Int fwrite_fd = -1;

/* Write pending buffered bytes to the descriptor they were collected for
 * and mark the buffer empty. Nothing is written unless the descriptor is
 * > 0 and there is pending data; the buffer is emptied in any case.
 */
static __inline__
void fwrite_flush()
{
    if (fwrite_fd > 0) {
	if (fwrite_pos > 0)
	    VG_(write)(fwrite_fd, (void*)fwrite_buf, fwrite_pos);
    }
    fwrite_pos = 0;
}

/* Buffered write of <len> bytes from <buf> to <fd>.
 *
 * Data is collected in fwrite_buf and only handed to VG_(write) when the
 * buffer would overflow, when the target descriptor changes, or via
 * fwrite_flush(). Chunks larger than FWRITE_THROUGH are written directly
 * (after flushing, to keep the output order).
 *
 * NOTE(review): buffered data is copied with VG_(strncpy); this looks
 * byte-exact only for chunks without embedded NUL bytes -- the callers
 * visible in this file pass text only.
 */
static void fwrite(Int fd, Char* buf, Int len)
{
    /* pending bytes belong to the previous descriptor: drain them first */
    if (fd != fwrite_fd) {
	fwrite_flush();
	fwrite_fd = fd;
    }

    if (len <= FWRITE_THROUGH) {
	/* small chunk: make room if needed, then append to the buffer */
	if (len >= FWRITE_BUFSIZE - fwrite_pos)
	    fwrite_flush();
	VG_(strncpy)(fwrite_buf + fwrite_pos, buf, len);
	fwrite_pos += len;
    }
    else {
	/* big chunk: bypass the buffer */
	fwrite_flush();
	VG_(write)(fd, (void*)buf, len);
    }
}
#endif


/* Format the name of object <obj>, followed by a newline, into <buf>.
 *
 * With string compression, the first mention within a dump is written as
 * "(<number>) <name>" and later mentions as "(<number>)" only; without
 * compression the plain name is written.
 *
 * The object is marked as dumped in either mode (unlike print_file, which
 * only sets its flag when compressing), so obj_dumped has to be valid
 * even without compression.
 */
static void print_obj(Char* buf, obj_node* obj)
{
    if (SK_(clo).compress_strings) {
	CT_ASSERT(obj_dumped != 0);
	if (obj_dumped[obj->number])
	    VG_(sprintf)(buf, "(%d)\n", obj->number);
	else
	    VG_(sprintf)(buf, "(%d) %s\n",
			 obj->number, obj->name);
    }
    else
	VG_(sprintf)(buf, "%s\n", obj->name);

    obj_dumped[obj->number] = True;
}

/* Format the name of source file <file>, followed by a newline, into
 * <buf>.
 *
 * Without string compression this is the plain name. With compression,
 * the first mention within a dump is "(<number>) <name>" (and the file
 * gets marked as dumped); any later mention is just "(<number>)".
 */
static void print_file(Char* buf, file_node* file)
{
    if (!SK_(clo).compress_strings) {
	VG_(sprintf)(buf, "%s\n", file->name);
	return;
    }

    CT_ASSERT(file_dumped != 0);
    if (!file_dumped[file->number]) {
	VG_(sprintf)(buf, "(%d) %s\n",
		     file->number, file->name);
	file_dumped[file->number] = True;
	return;
    }

    VG_(sprintf)(buf, "(%d)\n", file->number);
}

/*
 * Write the line "<tag>=<function>" to <fd>, using <buf> as scratch
 * space. Tag can be "fn", "cfn", "jfn" (print_fn_pos also passes "frfn").
 *
 * With string compression the function is written as "(<number>) <name>"
 * the first time (and marked as dumped), afterwards as "(<number>)".
 */
static void print_fn(Int fd, Char* buf, Char* tag, fn_node* fn)
{
    int len = VG_(sprintf)(buf, "%s=",tag);

    if (!SK_(clo).compress_strings) {
	len += VG_(sprintf)(buf+len, "%s\n", fn->name);
    }
    else {
	CT_ASSERT(fn_dumped != 0);
	if (!fn_dumped[fn->number]) {
	    len += VG_(sprintf)(buf+len, "(%d) %s\n",
				fn->number, fn->name);
	    fn_dumped[fn->number] = True;
	}
	else
	    len += VG_(sprintf)(buf+len, "(%d)\n", fn->number);
    }

    fwrite(fd, buf, len);
}

/*
 * Write a "<tag>=" line naming a whole context to <fd>, using <buf> as
 * scratch space.
 *
 * The mangled name is the name of cxt->fn[0], followed by "'<rec_index+1>"
 * if rec_index > 0, followed by "'<name>" for each further function
 * cxt->fn[1..size-1] of the context.
 *
 * With string compression, the name is assigned the number
 * (cxt->base_number + rec_index); a later mention is written as
 * "(<number>)" only. If compress_mangled is set too, the definition
 * itself is built from the numbers of the pure contexts of the
 * participating functions instead of their names; the names of these pure
 * contexts are written first where still missing.
 */
static void print_mangled_fn(Int fd, Char* buf, Char* tag, 
			     Context* cxt, int rec_index)
{
    int p, i;

    if (SK_(clo).compress_strings && SK_(clo).compress_mangled) {

	int n;
	Context* last;

	CT_ASSERT(cxt_dumped != 0);
	/* already defined in this dump: the number is enough */
	if (cxt_dumped[cxt->base_number+rec_index]) {
	    p = VG_(sprintf)(buf, "%s=(%d)\n",
			     tag, cxt->base_number + rec_index);
	    fwrite(fd, buf, p);
	    return;
	}

	last = 0;
	/* make sure that for all context parts compressed data is written */
	/* (goes from the last function of the context back to fn[0]) */
	for(i=cxt->size;i>0;i--) {
	    CT_ASSERT(cxt->fn[i-1]->pure_cxt != 0);
	    n = cxt->fn[i-1]->pure_cxt->base_number;
	    if (cxt_dumped[n]) continue;
	    p = VG_(sprintf)(buf, "%s=(%d) %s\n",
			     tag, n, cxt->fn[i-1]->name);
	    fwrite(fd, buf, p);

	    cxt_dumped[n] = True;
	    last = cxt->fn[i-1]->pure_cxt;
	}
	/* If the last context was the context to print, we are finished */
	if ((last == cxt) && (rec_index == 0)) return;

	/* definition of the composed name via pure context numbers */
	p = VG_(sprintf)(buf, "%s=(%d) (%d)", tag,
			 cxt->base_number + rec_index,
			 cxt->fn[0]->pure_cxt->base_number);
	if (rec_index >0)
	    p += VG_(sprintf)(buf+p, "'%d", rec_index +1);
	for(i=1;i<cxt->size;i++)
	    p += VG_(sprintf)(buf+p, "'(%d)", 
			      cxt->fn[i]->pure_cxt->base_number);
	p += VG_(sprintf)(buf+p, "\n");
	fwrite(fd, buf, p);

	cxt_dumped[cxt->base_number+rec_index] = True;
	return;
    }


    /* no compression of the mangled name itself: spell out all names */
    p = VG_(sprintf)(buf, "%s=", tag);
    if (SK_(clo).compress_strings) {
	CT_ASSERT(cxt_dumped != 0);
	if (cxt_dumped[cxt->base_number+rec_index]) {
	    p += VG_(sprintf)(buf+p, "(%d)\n", cxt->base_number + rec_index);
	    fwrite(fd, buf, p);
	    return;
	}
	else {
	    /* first mention: number, followed by the full name below */
	    p += VG_(sprintf)(buf+p, "(%d) ", cxt->base_number + rec_index);
	    cxt_dumped[cxt->base_number+rec_index] = True;
	}
    }

    p += VG_(sprintf)(buf+p, "%s", cxt->fn[0]->name);
    if (rec_index >0)
	p += VG_(sprintf)(buf+p, "'%d", rec_index +1);
    for(i=1;i<cxt->size;i++)
	p += VG_(sprintf)(buf+p, "'%s", cxt->fn[i]->name);

    p += VG_(sprintf)(buf+p, "\n");
    fwrite(fd, buf, p);
}



/**
 * Print function position of the BBCC, but only print info differing to
 * the <last> position, update <last>
 * Return True if something changes.
 *
 * Lines possibly written, in this order:
 *  - "rec=", "frfn=" (only without name mangling): recursion level and
 *    calling function, i.e. the second function of the context
 *  - "ob=", "fl=": object and source file of the function
 *  - "fn=": the function itself; with name mangling this is the mangled
 *    context name, which also covers recursion level and callers
 *
 * Uses the shared outbuf as scratch space.
 */
static Bool print_fn_pos(int fd, FnPos* last, BBCC* bbcc)
{
    Bool res = False;

    CT_DEBUGIF(3) {
	CT_DEBUG(2, "+ print_fn_pos: ");
	SK_(print_cxt)(16, bbcc->cxt, bbcc->rec_index);
    }

    if (!SK_(clo).mangle_names) {
	if (last->rec_index != bbcc->rec_index) {
	    VG_(sprintf)(outbuf, "rec=%d\n\n", bbcc->rec_index);
	    fwrite(fd, (void*)outbuf, VG_(strlen)(outbuf));
	    last->rec_index = bbcc->rec_index;
	    last->cxt = 0; /* reprint context */
	    res = True;
	}
	
	if (last->cxt != bbcc->cxt) {
	    /* direct caller of each context; 0 if the context has none */
	    fn_node* last_from = (last->cxt && last->cxt->size>1) ?
				 last->cxt->fn[1] : 0;
	    fn_node* curr_from = (bbcc->cxt && bbcc->cxt->size>1) ?
				 bbcc->cxt->fn[1] : 0;
	    if (curr_from == 0) {
		if (last_from != 0) {
		    /* switch back to no context */
		    VG_(sprintf)(outbuf, "frfn=(spontaneous)\n");
		    fwrite(fd, (void*)outbuf, VG_(strlen)(outbuf));
		    res = True;
		}
	    }
	    else if (last_from != curr_from) {
		print_fn(fd,outbuf,"frfn", curr_from);
		res = True;
	    }
	    last->cxt = bbcc->cxt;
	}
    }

    if (last->obj != bbcc->cxt->fn[0]->file->obj) {
	/* the name is formatted right behind the 3 character tag */
	VG_(sprintf)(outbuf, "ob=");
	print_obj(outbuf+3, bbcc->cxt->fn[0]->file->obj);
	fwrite(fd, (void*)outbuf, VG_(strlen)(outbuf));
	last->obj = bbcc->cxt->fn[0]->file->obj;
	res = True;
    }

    if (last->file != bbcc->cxt->fn[0]->file) {
	VG_(sprintf)(outbuf, "fl=");
	print_file(outbuf+3, bbcc->cxt->fn[0]->file);
	fwrite(fd, (void*)outbuf, VG_(strlen)(outbuf));
	last->file = bbcc->cxt->fn[0]->file;
	res = True;
    }

    if (!SK_(clo).mangle_names) {
	if (last->fn != bbcc->cxt->fn[0]) {
	    print_fn(fd,outbuf, "fn", bbcc->cxt->fn[0]);
	    last->fn = bbcc->cxt->fn[0];
	    res = True;
	}
    }
    else {
	/* Print mangled name if context or rec_index changes */
	if ((last->rec_index != bbcc->rec_index) ||
	    (last->cxt != bbcc->cxt)) {

	    print_mangled_fn(fd, outbuf, "fn", bbcc->cxt, bbcc->rec_index);
	    last->fn = bbcc->cxt->fn[0];
	    last->rec_index = bbcc->rec_index;
	    res = True;
	}
    }

    /* in mangling mode, this is the only place where last->cxt is set */
    last->cxt = bbcc->cxt;

    CT_DEBUG(2, "- print_fn_pos: %s\n", res ? "changed" : "");
    
    return res;
}

/* the debug lookup cache is useful if BBCC for same BB are
 * dumped directly in a row. This is a direct mapped cache.
 *
 * The slot of an address is (address % DEBUG_CACHE_SIZE); a slot is
 * valid for an address if debug_cache_addr[] holds exactly this address.
 */
#define DEBUG_CACHE_SIZE 1777

/* address the slot was filled for */
static Addr       debug_cache_addr[DEBUG_CACHE_SIZE];
/* cached file node and line number for this address */
static file_node* debug_cache_file[DEBUG_CACHE_SIZE];
static int        debug_cache_line[DEBUG_CACHE_SIZE];
/* cached result of the debug info lookup (False: no debug info found) */
static Bool       debug_cache_info[DEBUG_CACHE_SIZE];

/* Zero all slots of the debug lookup cache. */
static __inline__
void init_debug_cache()
{
    int slot = DEBUG_CACHE_SIZE;

    while (slot-- > 0) {
	debug_cache_info[slot] = 0;
	debug_cache_line[slot] = 0;
	debug_cache_file[slot] = 0;
	debug_cache_addr[slot] = 0;
    }
}

/* Fill <p> with the source position of instruction address <addr>,
 * which belongs to <bbcc>.
 *
 * File and line come from the debug info (via the direct mapped debug
 * cache); without debug info the file "???" and line 0 are used.
 * p->addr is <addr> minus the offset of the object of the BB, p->bb_addr
 * is the offset of the BB.
 *
 * Returns True if debug info was found for <addr>.
 */
static __inline__
Bool get_debug_pos(BBCC* bbcc, Addr addr, AddrPos* p)
{
    Char fname[FILENAME_LEN];
    Bool found;
    int slot = addr % DEBUG_CACHE_SIZE;

    if (debug_cache_addr[slot] != addr) {
	/* cache miss: ask for the debug info and remember the answer */
	found = VG_(get_filename_linenum)(addr, fname,
					  FILENAME_LEN, &(p->line));
	if (!found) {
	    VG_(strcpy)(fname, "???");
	    p->line = 0;
	}
	p->file = SK_(get_file_node)(bbcc->bb->obj, fname);

	debug_cache_info[slot] = found;
	debug_cache_addr[slot] = addr;
	debug_cache_line[slot] = p->line;
	debug_cache_file[slot] = p->file;
    }
    else {
	/* cache hit */
	p->line = debug_cache_line[slot];
	p->file = debug_cache_file[slot];
	found   = debug_cache_info[slot];
    }

    /* addresses are stored relative to the object / as BB offset */
    p->addr    = addr - bbcc->bb->obj->offset;
    p->bb_addr = bbcc->bb->offset;

    CT_DEBUG(3, "  get_debug_pos(0x%x): BB 0x%x, fn '%s', file '%s', line %d\n",
	     addr, bb_addr(bbcc->bb), bbcc->cxt->fn[0]->name,
	     p->file->name, p->line);

    return found;
}


/* Set up position <p> from the given addresses and file; the line
 * number starts as 0.
 */
static void init_apos(AddrPos* p, Addr addr, Addr bbaddr, file_node* file)
{
    p->line    = 0;
    p->file    = file;
    p->bb_addr = bbaddr;
    p->addr    = addr;
}

/* Copy address, BB address, file and line of position <src> to <dst>. */
static void copy_apos(AddrPos* dst, AddrPos* src)
{
    dst->line    = src->line;
    dst->file    = src->file;
    dst->bb_addr = src->bb_addr;
    dst->addr    = src->addr;
}

/* Set up <c> for cost summation at the given position: the position is
 * initialized as by init_apos() (line 0), and a new, zeroed cost array
 * for the full event set is attached.
 */
static void init_fcost(AddrCost* c, Addr addr, Addr bbaddr, file_node* file)
{
    AddrPos* pos = &(c->p);

    pos->addr    = addr;
    pos->bb_addr = bbaddr;
    pos->file    = file;
    pos->line    = 0;

    /* FIXME: This is a memory leak as a AddrCost is inited multiple times */
    c->cost = SK_(get_eventset_cost)( SK_(sets).full );
    SK_(init_cost)( SK_(sets).full, c->cost );
}


/**
 * Print the position change inside of a BB when going from <last> to
 * <curr>. <last> is not updated here!
 *
 * A file change is written as "fe=" if <curr> is in the file of the
 * function itself (<func_file>), as "fi=" otherwise. When dumping BBs,
 * a changed line number is written as "ln=".
 */
static void fprint_apos(Int fd, AddrPos* curr, AddrPos* last, file_node* func_file)
{
    CT_ASSERT(curr->file != 0);
    CT_DEBUG(2, "    print_apos(file '%s', line %d, bb %p, addr %p) fnFile '%s'\n",
	     curr->file->name, curr->line, curr->bb_addr, curr->addr,
	     func_file->name);

    if (last->file != curr->file) {
	/* if we switch back to orig file, use fe=... */
	Char* tag = (curr->file == func_file) ? "fe=" : "fi=";

	VG_(sprintf)(outbuf, "%s", tag);
	print_file(outbuf+3, curr->file);
	fwrite(fd, (void*)outbuf, VG_(strlen)(outbuf));
    }

    if (SK_(clo).dump_bbs && (curr->line != last->line)) {
	VG_(sprintf)(outbuf, "ln=%d\n", curr->line);
	fwrite(fd, (void*)outbuf, VG_(strlen)(outbuf));
    }
}



/**
 * Print a position.
 * This prints out differences if allowed
 *
 * This doesn't set last to curr afterwards!
 */
static
void fprint_pos(Int fd, AddrPos* curr, AddrPos* last)
{
    if (SK_(clo).dump_bbs)
	VG_(sprintf)(outbuf, "%u ", curr->addr - curr->bb_addr);
    else {
	int p = 0;
	if (SK_(clo).dump_instr) {
	    int diff = curr->addr - last->addr;
	    if ( SK_(clo).compress_pos && (last->addr >0) && 
		 (diff > -100) && (diff < 100)) {
		if (diff >0)
		    p = VG_(sprintf)(outbuf, "+%d ", diff);
		else if (diff==0)
		    p = VG_(sprintf)(outbuf, "* ");
	        else
		    p = VG_(sprintf)(outbuf, "%d ", diff);
	    }
	    else
		p = VG_(sprintf)(outbuf, "%p ", curr->addr);
	}

	if (SK_(clo).dump_bb) {
	    int diff = curr->bb_addr - last->bb_addr;
	    if ( SK_(clo).compress_pos && (last->bb_addr >0) && 
		 (diff > -100) && (diff < 100)) {
		if (diff >0)
		    p += VG_(sprintf)(outbuf+p, "+%d ", diff);
		else if (diff==0)
		    p += VG_(sprintf)(outbuf+p, "* ");
	        else
		    p += VG_(sprintf)(outbuf+p, "%d ", diff);
	    }
	    else
		p += VG_(sprintf)(outbuf+p, "%p ", curr->bb_addr);
	}

	if (SK_(clo).dump_line) {
	    int diff = curr->line - last->line;
	    if ( SK_(clo).compress_pos && (last->line >0) && 
		 (diff > -100) && (diff < 100)) {

		if (diff >0)
		    VG_(sprintf)(outbuf+p, "+%d ", diff);
		else if (diff==0)
		    VG_(sprintf)(outbuf+p, "* ");
	        else
		    VG_(sprintf)(outbuf+p, "%d ", diff);
	    }
	    else
		VG_(sprintf)(outbuf+p, "%u ", curr->line);
	}
    }
    fwrite(fd, (void*)outbuf, VG_(strlen)(outbuf));
}


/**
 * Print events: the costs <cost>, formatted according to the event
 * mapping <es>, followed by a newline. Uses outbuf as scratch space.
 */

static
void fprint_cost(int fd, EventMapping* es, ULong* cost)
{
  Char* end = outbuf + SK_(sprint_mappingcost)(outbuf, es, cost);

  VG_(sprintf)(end, "\n");
  fwrite(fd, (void*)outbuf, VG_(strlen)(outbuf));
}



/* Write one cost line: the position of <c> (only that parts of the
 * source position are written that changed relative to the last written
 * position <last>), followed by the cost of <c>.
 *
 * Afterwards, <last> is set to the position of <c>, and the cost is
 * added to the dump total and zeroed in <c>.
 *
 * The cost is written unconditionally; this function does not test for
 * zero cost itself.
 */
static void fprint_fcost(Int fd, AddrCost* c, AddrPos* last)
{
  AddrPos* pos = &(c->p);

  CT_DEBUGIF(3) {
    CT_DEBUG(2, "   print_fcost(file '%s', line %d, bb %p, addr %p):\n",
	     pos->file->name, pos->line, pos->bb_addr, pos->addr);
    SK_(print_cost)(-5, SK_(sets).full, c->cost);
  }

  /* position first (relative to last), which then becomes the new last */
  fprint_pos(fd, pos, last);
  copy_apos( last, pos );

  fprint_cost(fd, SK_(dumpmap), c->cost);

  /* add cost to total */
  SK_(add_and_zero_cost)( SK_(sets).full, dump_total_cost, c->cost );
}


/* Write out the calls from jcc (at pos)
 *
 * <curr> is the source position the arc starts from, <last> the position
 * last written to the dump (not updated here).
 *
 * Two kinds of arcs are handled:
 * - followed conditional/boring jumps (only if jcc->from is set):
 *   optional "jfi="/"jfn=" lines, then "jcnd=<followed>/<executions>" or
 *   "jump=<count>", each followed by target and source position
 * - everything else is written as call: optional "cob="/"cfi=" lines,
 *   "cfn=", and, if the arc has cost, "calls=<count> <target>" followed
 *   by a line with source position and inclusive cost
 *
 * The call counter (and for calls, the cost) of a written arc is reset.
 */
static void fprint_jcc(Int fd, jCC* jcc, AddrPos* curr, AddrPos* last)
{
    static AddrPos target;
    file_node* file;
    obj_node*  obj;

    /* jcc->to is dereferenced on every path below, so check it first */
    CT_ASSERT(jcc->to !=0);

    CT_DEBUGIF(2) {
      CT_DEBUG(2, "   fprint_jcc (jkind %d)\n", (Int)jcc->jmpkind);
      SK_(print_jcc)(-10, jcc);
    }

    if (!get_debug_pos(jcc->to, bb_addr(jcc->to->bb), &target)) {
	/* if we don't have debug info, don't switch to file "???" */
	target.file = last->file;
    }

    if (jcc->from &&
	(jcc->jmpkind == JmpCond || jcc->jmpkind == JmpBoring)) {
	    
      /* this is a JCC for a followed conditional or boring jump. */
      CT_ASSERT(SK_(is_zero_cost)( SK_(sets).full, jcc->cost));
	
      /* objects among jumps should be the same.
       * Otherwise this jump would have been changed to a call
       *  (see setup_bbcc)
       */
      CT_ASSERT(jcc->from->bb->obj == jcc->to->bb->obj);

	/* only print if target position info is useful */
	if (!SK_(clo).dump_instr && !SK_(clo).dump_bb && target.line==0) {
	  jcc->call_counter = 0;
	  return;
	}

	/* Different files/functions are possible e.g. with longjmp's
	 * which change the stack, and thus context
	 */
	if (last->file != target.file) {
	    VG_(sprintf)(outbuf, "jfi=");
	    print_file(outbuf+4, target.file);
	    fwrite(fd, (void*)outbuf, VG_(strlen)(outbuf));
	}
	
	if (jcc->from->cxt != jcc->to->cxt) {
	    if (SK_(clo).mangle_names)
		print_mangled_fn(fd, outbuf, "jfn",
				 jcc->to->cxt, jcc->to->rec_index);
	    else
		print_fn(fd, outbuf, "jfn", jcc->to->cxt->fn[0]);
	}
	    
	if (jcc->jmpkind == JmpCond) {
	    /* format: jcnd=<followed>/<executions> <target> */
	    VG_(sprintf)(outbuf, "jcnd=%llu/%llu ",
			 jcc->call_counter,
			 jcc->from->exe_counter);
	}
	else {
	    /* format: jump=<jump count> <target> */
	    VG_(sprintf)(outbuf, "jump=%llu ",
			 jcc->call_counter);
	}
	fwrite(fd, (void*)outbuf, VG_(strlen)(outbuf));
		
	fprint_pos(fd, &target, last);
	fwrite(fd, "\n", 1);
	fprint_pos(fd, curr, last);
	fwrite(fd, "\n", 1);

	jcc->call_counter = 0;
	return;
    }

    file = jcc->to->cxt->fn[0]->file;
    obj  = jcc->to->bb->obj;
    
    /* object of called position different to object of this function?*/
    /* NOTE(review): jcc->from is tested for 0 in the jump case above, but
     * dereferenced unconditionally here -- confirm it is always set for
     * arcs reaching this point. */
    if (jcc->from->cxt->fn[0]->file->obj != obj) {
	VG_(sprintf)(outbuf, "cob=");
	print_obj(outbuf+4, obj);
	fwrite(fd, (void*)outbuf, VG_(strlen)(outbuf));
    }

    /* file of called position different to current file? */
    if (last->file != file) {
	VG_(sprintf)(outbuf, "cfi=");
	print_file(outbuf+4, file);
	fwrite(fd, (void*)outbuf, VG_(strlen)(outbuf));
    }

    if (SK_(clo).mangle_names)
	print_mangled_fn(fd, outbuf, "cfn", jcc->to->cxt, jcc->to->rec_index);
    else
	print_fn(fd, outbuf, "cfn", jcc->to->cxt->fn[0]);

    if (!SK_(is_zero_cost)( SK_(sets).full, jcc->cost)) {
      VG_(sprintf)(outbuf, "calls=%llu ", 
		   jcc->call_counter);
	fwrite(fd, (void*)outbuf, VG_(strlen)(outbuf));

	fprint_pos(fd, &target, last);
	fwrite(fd, "\n", 1);	

	fprint_pos(fd, curr, last);
	fprint_cost(fd, SK_(dumpmap), jcc->cost);

	/* cost and counter were dumped: start from zero for the next part */
	SK_(init_cost)( SK_(sets).full, jcc->cost );

	jcc->call_counter = 0;
    }
}



/* Cost summation of functions. We alternately use ccSum[0] and ccSum[1]:
 * ccSum[currSum] sums up the cost of the recently read instructions with
 * the same source position, while the other entry receives the position
 * of the next instruction (see fprint_bbcc).
 */
static AddrCost ccSum[2];
static int currSum;

/*
 * Print all costs of a BBCC:
 * - FCCs of instructions
 * - JCCs of the unique jump of this BB
 * returns True if something was written 
 *
 * Instruction costs are summed up in ccSum[currSum]; the sum is written
 * (and the two ccSum buffers are switched) when the source position of
 * the next instruction differs, or for every instruction when dumping
 * BBs or instructions. A sum still pending at the end stays in the
 * buffer, unless arcs/skipped cost have to be written or BB dumping
 * is active.
 *
 * <last> is the position last written; it is updated via fprint_fcost.
 * The execution and return counters of the BBCC are reset.
 */
static Bool fprint_bbcc(Int fd, BBCC* bbcc, AddrPos* last)
{
  InstrInfo* instr_info;
  Bool something_written = False;
  jCC* jcc;
  AddrCost *currCost, *newCost;
  Int jcc_count = 0, instr;
  BB* bb = bbcc->bb;

  CT_ASSERT(bbcc->cxt != 0);
  CT_DEBUGIF(1) {
    VG_(printf)("+ fprint_bbcc (Instr %d): ", bb->instr_count);
    SK_(print_bbcc)(15, bbcc, False);
  }

  CT_ASSERT(currSum == 0 || currSum == 1);
  currCost = &(ccSum[currSum]);
  newCost  = &(ccSum[1-currSum]);

  instr_info = &(bb->instr[0]);
  for(instr=0; instr<bb->instr_count; instr++, instr_info++) {

    /* get debug info of current instruction address and dump cost
     * if SK_(clo).dump_bbs or file/line has changed
     */
    if (!get_debug_pos(bbcc, bb_addr(bb) + instr_info->instr_offset, 
		       &(newCost->p))) {
      /* if we don't have debug info, don't switch to file "???" */
      newCost->p.file = bbcc->cxt->fn[0]->file;
    }

    if (SK_(clo).dump_bbs || SK_(clo).dump_instr ||
	(newCost->p.line != currCost->p.line) ||
	(newCost->p.file != currCost->p.file)) {
      
      /* flush the sum collected for the previous position */
      if (!SK_(is_zero_cost)( SK_(sets).full, currCost->cost )) {
	something_written = True;
	
	fprint_apos(fd, &(currCost->p), last, bbcc->cxt->fn[0]->file);
	fprint_fcost(fd, currCost, last);
      }
	   
      /* switch buffers */
      currSum = 1 - currSum;
      currCost = &(ccSum[currSum]);
      newCost  = &(ccSum[1-currSum]);
    }
       
    /* add line cost to current cost sum */
    (*SK_(cachesim).add_icost)(currCost->cost, bbcc, instr_info);
  }

  /* Some JCC output? If yes, dump cumulated line info first */
  for(jcc=bbcc->jcc_list; jcc; jcc=jcc->next_from) {
    /* yes, if JCC only counts jmp arcs or cost >0 */
    if ( ((jcc->jmpkind != JmpCall) && (jcc->call_counter >0)) ||
	 (!SK_(is_zero_cost)( SK_(sets).full, jcc->cost )))
      jcc_count++;
  }
  
  if ( (bbcc->skipped &&
	!SK_(is_zero_cost)(SK_(sets).full, bbcc->skipped)) || 
       (jcc_count>0) ) {
    
    if (!SK_(is_zero_cost)( SK_(sets).full, currCost->cost )) {
      /* no need to switch buffers, as position is the same */
      fprint_apos(fd, &(currCost->p), last, bbcc->cxt->fn[0]->file);
      fprint_fcost(fd, currCost, last);
    }
    
    /* skipped cost and arcs are attributed to the jump address of the BB */
    get_debug_pos(bbcc, bb_jmpaddr(bbcc->bb), &(currCost->p));
    fprint_apos(fd, &(currCost->p), last, bbcc->cxt->fn[0]->file);
    something_written = True;
    
    /* first, print skipped costs for calls */
    if (bbcc->skipped && !SK_(is_zero_cost)( SK_(sets).full,
					     bbcc->skipped )) {
      SK_(add_and_zero_cost)( SK_(sets).full,
			      currCost->cost, bbcc->skipped );
#if 0
      VG_(sprintf)(outbuf, "# Skipped\n");
      fwrite(fd, (void*)outbuf, VG_(strlen)(outbuf));
#endif
      fprint_fcost(fd, currCost, last);
    }
    
    /* same selection of JCCs as in the counting loop above */
    if (jcc_count > 0)
      for(jcc=bbcc->jcc_list; jcc; jcc=jcc->next_from)
	if ( ((jcc->jmpkind != JmpCall) && (jcc->call_counter >0)) ||
	     (!SK_(is_zero_cost)( SK_(sets).full, jcc->cost )))
	  
	  fprint_jcc(fd, jcc, &(currCost->p), last);
  }
  
  if (SK_(clo).dump_bbs || SK_(clo).dump_bb) {
    if (!SK_(is_zero_cost)( SK_(sets).full, currCost->cost )) {
      something_written = True;
      
      fprint_apos(fd, &(currCost->p), last, bbcc->cxt->fn[0]->file);
      fprint_fcost(fd, currCost, last);
    }
    if (SK_(clo).dump_bbs) fwrite(fd, (void*)"\n", 1);
    
    /* when every cost was immediately written, we must have done so,
     * as this function is only called when there's cost in a BBCC
     */
    CT_ASSERT(something_written);
  }
  
  bbcc->exe_counter = 0;
  bbcc->ret_counter = 0;
  
  CT_DEBUG(1, "- fprint_bbcc: JCCs %d\n", jcc_count);
  
  return something_written;
}

/* order by
 *  recursion,
 *  from->bb->obj, from->bb->fn
 *  obj, fn[0]->file, fn
 *  address
 */
static int my_cmp(BBCC** pbbcc1, BBCC** pbbcc2)
{
#if 0
    return (*pbbcc1)->bb->offset - (*pbbcc2)->bb->offset;
#else
    BBCC *bbcc1 = *pbbcc1;
    BBCC *bbcc2 = *pbbcc2;
    Context* cxt1 = bbcc1->cxt;
    Context* cxt2 = bbcc2->cxt;
    int off = 1;

    if (cxt1->fn[0]->file->obj != cxt2->fn[0]->file->obj)
	return cxt1->fn[0]->file->obj - cxt2->fn[0]->file->obj;

    if (cxt1->fn[0]->file != cxt2->fn[0]->file)
	return cxt1->fn[0]->file - cxt2->fn[0]->file;

    if (cxt1->fn[0] != cxt2->fn[0])
	return cxt1->fn[0] - cxt2->fn[0];

    if (bbcc1->rec_index != bbcc2->rec_index)
	return bbcc1->rec_index - bbcc2->rec_index;

    while((off < cxt1->size) && (off < cxt2->size)) {
	fn_node* ffn1 = cxt1->fn[off];
	fn_node* ffn2 = cxt2->fn[off];
	if (ffn1->file->obj != ffn2->file->obj)
	    return ffn1->file->obj - ffn2->file->obj;
	if (ffn1 != ffn2)
	    return ffn1 - ffn2;
	off++;
    }
    if      (cxt1->size > cxt2->size) return 1;
    else if (cxt1->size < cxt2->size) return -1;

    return bbcc1->bb->offset - bbcc2->bb->offset;
#endif
}





/* modified version of:
 *
 * qsort -- qsort interface implemented by faster quicksort.
 * J. L. Bentley and M. D. McIlroy, SPE 23 (1993) 1249-1265.
 * Copyright 1993, John Wiley.
*/

/* Exchange the <n> array entries starting at <a> with the ones starting
 * at <b>, one by one in ascending order.
 */
static __inline__
void swapfunc(BBCC** a, BBCC** b, int n)
{
    int k;

    for (k = 0; k < n; k++) {
	BBCC* tmp = a[k];
	a[k] = b[k];
	b[k] = tmp;
    }
}

/* Exchange the two array entries <a> and <b> point to. */
static __inline__
void swap(BBCC** a, BBCC** b)
{
    BBCC* saved = *b;

    *b = *a;
    *a = saved;
}

/* Smaller one of x and y. As this is a macro, the argument which gets
 * selected is evaluated twice: do not pass expressions with side effects.
 */
#define min(x, y) ((x)<=(y) ? (x) : (y))

/* Return the one of the array positions <a>, <b>, <c> whose entry is the
 * median of the three entries regarding <cmp>.
 *
 * <cmp> gets the positions, not the entries, exactly as in qsort(); it
 * is declared with a full prototype so that calls are type checked.
 */
static BBCC** med3(BBCC **a, BBCC **b, BBCC **c,
		   int (*cmp)(BBCC**,BBCC**))
{	return cmp(a, b) < 0 ?
		  (cmp(b, c) < 0 ? b : cmp(a, c) < 0 ? c : a)
		: (cmp(b, c) > 0 ? b : cmp(a, c) > 0 ? c : a);
}

/* Start of the array currently sorted; only used to print array indexes
 * in the debug output of qsort() */
static BBCC** qsort_start = 0;

/* Sort the <n> BBCC pointers starting at <a> in place, ascending
 * regarding <cmp>.
 *
 * Quicksort with 3-way partitioning: entries equal to the pivot are
 * collected at both ends of the array while partitioning, and moved
 * into the middle afterwards; only the parts with smaller and larger
 * entries are sorted recursively. Arrays with less than 7 entries are
 * sorted by insertion sort.
 */
static void qsort(BBCC **a, int n, int (*cmp)(BBCC**,BBCC**))
{
	BBCC **pa, **pb, **pc, **pd, **pl, **pm, **pn, **pv;
	int s, r;
	BBCC* v;

	CT_DEBUG(8, "  qsort(%d,%d)\n", a-qsort_start, n);

	if (n < 7) {	 /* Insertion sort on smallest arrays */
		for (pm = a+1; pm < a+n; pm++)
			for (pl = pm; pl > a && cmp(pl-1, pl) > 0; pl --)
				swap(pl, pl-1);

		CT_DEBUGIF(8) {
		    for (pm = a; pm < a+n; pm++) {
			VG_(printf)("   %3d BB 0x%x, ", pm - qsort_start,
				    bb_addr((*pm)->bb));      
			SK_(print_cxt)(9, (*pm)->cxt, (*pm)->rec_index);
		    }
		}
		return;
	}
	/* pivot selection */
	pm = a + n/2;    /* Small arrays, middle element */
	if (n > 7) {
		pl = a;
		pn = a + (n-1);
		if (n > 40) {    /* Big arrays, pseudomedian of 9 */
			s = n/8;
			pl = med3(pl, pl+s, pl+2*s, cmp);
			pm = med3(pm-s, pm, pm+s, cmp);
			pn = med3(pn-2*s, pn-s, pn, cmp);
		}
		pm = med3(pl, pm, pn, cmp); /* Mid-size, med of 3 */
	}


	/* compare against a copy of the pivot, as the array entry itself
	 * gets moved around by the swaps below */
	v = *pm;
	pv = &v;
	/* [a,pa): equal to pivot, [pa,pb): smaller,
	 * (pc,pd]: larger, (pd,a+n): equal to pivot */
	pa = pb = a;
	pc = pd = a + (n-1);
	for (;;) {
		while ((pb <= pc) && ((r=cmp(pb, pv)) <= 0)) {
		    if (r==0) {
			/* same as pivot, to start */
			swap(pa,pb); pa++; 
		    }
		    pb ++;
		}
		while ((pb <= pc) && ((r=cmp(pc, pv)) >= 0)) {
		    if (r==0) {
			/* same as pivot, to end */
			swap(pc,pd); pd--; 
		    }
		    pc --;
		}
		if (pb > pc) { break; }
		swap(pb, pc);
		pb ++;
		pc --;
	}
	/* now pb is the last of the smaller, pc the first of the larger */
	pb--;
	pc++;

	/* put pivot from start into middle */
	if ((s = pa-a)>0) { for(r=0;r<s;r++) swap(a+r, pb+1-s+r); }
	/* put pivot from end into middle */
	if ((s = a+n-1-pd)>0) { for(r=0;r<s;r++) swap(pc+r, a+n-s+r); }	    

	CT_DEBUGIF(8) {
	  VG_(printf)("   PV BB 0x%x, ", bb_addr((*pv)->bb));
	    SK_(print_cxt)(9, (*pv)->cxt, (*pv)->rec_index);

	    s = pb-pa+1;
	    VG_(printf)("    Lower %d - %d:\n", a-qsort_start, a+s-1-qsort_start);
	    for (r=0;r<s;r++) {
		pm = a+r;
		VG_(printf)("     %3d BB 0x%x, ", 
			    pm-qsort_start,bb_addr((*pm)->bb));
		SK_(print_cxt)(9, (*pm)->cxt, (*pm)->rec_index);
	    }

	    s = pd-pc+1;
	    VG_(printf)("    Upper %d - %d:\n", 
			a+n-s-qsort_start, a+n-1-qsort_start);
	    for (r=0;r<s;r++) {
		pm = a+n-s+r;
		VG_(printf)("     %3d BB 0x%x, ", 
			    pm-qsort_start,bb_addr((*pm)->bb));
		SK_(print_cxt)(9, (*pm)->cxt, (*pm)->rec_index);
	    }
	}

	/* sort the smaller entries (now at the start) and the larger
	 * entries (now at the end); the pivot block is already in place */
	if ((s = pb+1-pa) > 1) qsort(a,     s, cmp);
	if ((s = pd+1-pc) > 1) qsort(a+n-s, s, cmp);
}


/* Helpers for prepare_dump */

/* Number of BBCCs to dump, determined by the *_addCount callbacks */
static Int    prepare_count;
/* Next free entry of the BBCC array, advanced by the *_addPtr callbacks */
static BBCC** prepare_ptr;


/* Counting callback for prepare_dump: a BBCC is counted if its execution
 * or return counter is > 0.
 */
static void hash_addCount(BBCC* bbcc)
{
  if (!(bbcc->exe_counter>0) && !(bbcc->ret_counter>0))
    return;

  prepare_count++;
}

/* Collecting callback for prepare_dump: append the BBCC to the array
 * at prepare_ptr unless both its execution and return counter are 0.
 */
static void hash_addPtr(BBCC* bbcc)
{
  if ((bbcc->exe_counter != 0) ||
      (bbcc->ret_counter != 0)) {
    *prepare_ptr = bbcc;
    prepare_ptr++;
  }
}


/* Count the BBCCs which have to be dumped only because of an active call
 * in the current call stack, and update the cost sums of all active
 * calls with the cost spent since entering them.
 *
 * BBCCs with execution or return counter > 0 are skipped, as
 * hash_addCount already counted them. <ti> is not used; the function
 * works on the current call stack.
 */
static void cs_addCount(thread_info* ti)
{
  Int depth;

  for(depth = 0; depth < SK_(current_call_stack).sp; depth++) {
    call_entry* frame = &(SK_(current_call_stack).entry[depth]);
    BBCC* caller;

    if (frame->jcc != 0) {
      SK_(add_diff_cost_lz)( SK_(sets).full, &(frame->jcc->cost),
			     frame->enter_cost, SK_(current_state).cost);
      caller = frame->jcc->from;

      CT_DEBUG(1, " [%2d] (tid %d), added active: %s\n",
	       depth,SK_(current_tid),caller->cxt->fn[0]->name);

      /* only count what hash_addCount did not count */
      if (!(caller->exe_counter>0 || caller->ret_counter>0))
	prepare_count++;
    }
  }
}

/* Append the BBCCs with an active call in the current call stack to the
 * array at prepare_ptr, as far as they were not already added by
 * hash_addPtr (i.e. execution and return counter are not > 0).
 *
 * Has to select exactly the BBCCs counted by cs_addCount. <ti> is not
 * used; the function works on the current call stack.
 */
static void cs_addPtr(thread_info* ti)
{
  Int depth;

  for(depth = 0; depth < SK_(current_call_stack).sp; depth++) {
    call_entry* frame = &(SK_(current_call_stack).entry[depth]);
    BBCC* caller;

    if (frame->jcc != 0) {
      caller = frame->jcc->from;

      /* skip what hash_addPtr already inserted */
      if (!(caller->exe_counter>0 || caller->ret_counter>0)) {
	*prepare_ptr = caller;
	prepare_ptr++;
      }
    }
  }
}


/**
 * Put all BBCCs with costs into a sorted array.
 *
 * This is done in two passes over the same BBCCs: the first counts them
 * to get the array size, the second fills the array. Afterwards, the
 * array is sorted with my_cmp.
 *
 * The returned array ends with a null pointer.
 * Must be freed after dumping.
 */
static BBCC** prepare_dump()
{
    BBCC **sorted;

    /* pass 1: count number of BBCCs with >0 executions
     * (if we do not separate among threads, this gives all) */
    prepare_count = 0;
    SK_(forall_bbccs)(hash_addCount);

    /* even if we do not separate among threads,
     * call stacks are separated */
    if (!SK_(clo).separate_threads)
      SK_(forall_threads)(cs_addCount);
    else
      cs_addCount(0);

    CT_DEBUG(0, "prepare_dump: %d BBCCs\n", prepare_count);

    /* pass 2: allocate bbcc array (with room for the end mark)
     * and insert BBCCs */
    sorted = (BBCC**) VG_(malloc)((prepare_count+1) * sizeof(BBCC*));
    prepare_ptr = sorted;

    SK_(forall_bbccs)(hash_addPtr);

    if (!SK_(clo).separate_threads)
      SK_(forall_threads)(cs_addPtr);
    else
      cs_addPtr(0);

    /* both passes have to agree */
    CT_ASSERT(sorted + prepare_count == prepare_ptr);

    /* end mark */
    *prepare_ptr = 0;

    CT_DEBUG(0,"             BBCCs inserted\n");

    qsort_start = sorted;
    qsort(sorted, prepare_count, my_cmp);

    CT_DEBUG(0,"             BBCCs sorted\n");

    return sorted;
}




/* Write a line consisting of <prefix>, directly followed by the costs
 * <cost> formatted according to the event mapping <em>.
 * Uses outbuf as scratch space.
 */
static void fprint_cost_ln(int fd, Char* prefix,
			   EventMapping* em, ULong* cost)
{
    int used = VG_(sprintf)(outbuf, "%s", prefix);

    used += SK_(sprint_mappingcost)(outbuf + used, em, cost);
    VG_(sprintf)(outbuf + used, "\n");

    fwrite(fd, (void*)outbuf, VG_(strlen)(outbuf));
}

/* NOTE(review): not referenced in the part of the file visible here;
 * presumably the number of executed BBs at the time of the last dump --
 * confirm at its users. */
static ULong last_bbs_done = 0;
/* Buffer the name of the current dump file is formatted into
 * (see new_dumpfile); also used for the message in file_err() */
static Char* filename = 0;

/* Report that the dump file <filename> could not be opened, and
 * terminate via VG_(exit) with exit code 1.
 */
static void file_err()
{
   VG_(message)(Vg_UserMsg,
                "Error: can not open cache simulation output file `%s'",
                filename );
   VG_(exit)(1);
}

/* Write the NUL-terminated string <str> to the dump file <fd>. */
static void dump_write_str(int fd, Char* str)
{
    fwrite(fd, (void*)str, VG_(strlen)(str));
}

/**
 * Open the dump file for thread <tid> and write its header.
 *
 * File name without SK_(clo).combine_dumps:
 *     <dump_file_base>.<pid>[.<part>][-<tid>]
 *   <part> (out_counter) is only added when <trigger> is non-null,
 *          i.e. it is left out for the final dump;
 *   <tid>  is only added when SK_(clo).separate_threads is set.
 * With SK_(clo).combine_dumps the name is always <dump_file_base>.<pid>
 * and the file is opened for appending; from the second part on, the
 * file-level header (version/creator/pid/cmd and cache description)
 * is not repeated.
 *
 * <buf> is scratch space of BUF_LEN bytes used to format the header
 * lines; <trigger> is the text for the "desc: Trigger:" line, or null
 * for the dump at program termination.
 *
 * Returns the file descriptor, and -1 on error (no write permission).
 */
static int new_dumpfile(Char buf[BUF_LEN], int tid, Char* trigger)
{
    Bool appending = False;
    int i, fd;
    FullCost sum = 0;

    CT_ASSERT(filename != 0);

    if (SK_(clo).combine_dumps) {
        VG_(sprintf)(filename, "%s.%d", dump_file_base, VG_(getpid)());
        fd = VG_(open)(filename, VKI_O_WRONLY|VKI_O_APPEND, 0);
        /* only parts after the first one are real continuations */
        if (fd >= 0 && out_counter > 1)
            appending = True;
    }
    else {
        i = VG_(sprintf)(filename, "%s.%d", dump_file_base, VG_(getpid)());

        if (trigger)
            i += VG_(sprintf)(filename+i, ".%d", out_counter);

        if (SK_(clo).separate_threads)
            i += VG_(sprintf)(filename+i, "-%02d", tid);

        fd = VG_(open)(filename, VKI_O_WRONLY|VKI_O_TRUNC, 0);
    }

    if (fd < 0) {
        /* Opening the existing file failed: try to create it */
        fd = VG_(open)(filename, VKI_O_CREAT|VKI_O_WRONLY,
                       VKI_S_IRUSR|VKI_S_IWUSR);
        if (fd < 0) {
            /* If the file can not be opened for whatever reason (conflict
               between multiple supervised processes?), give up now.
               file_err() ends with VG_(exit)(1); the statements after
               it only matter should that call ever return. */
            file_err();
            VGP_POPCC(VgpCacheDump);
            return -1;
        }
    }

    CT_DEBUG(2, "  new_dumpfile '%s'\n", filename);

    if (!appending) {
        reset_dump_array();

        /* version */
        VG_(sprintf)(buf, "version: 1\n");
        dump_write_str(fd, buf);

        /* creator */
        VG_(sprintf)(buf, "creator: callgrind-" VERSION "\n");
        dump_write_str(fd, buf);

        /* "pid:" line */
        VG_(sprintf)(buf, "pid: %d\n", VG_(getpid)());
        dump_write_str(fd, buf);

        /* "cmd:" line; its newline comes from the "\npart:" line below */
        VG_(strcpy)(buf, "cmd: ");
        dump_write_str(fd, buf);
        dump_write_str(fd, cmdbuf);
    }

    /* per-part header */
    VG_(sprintf)(buf, "\npart: %d\n", out_counter);
    dump_write_str(fd, buf);
    if (SK_(clo).separate_threads) {
        VG_(sprintf)(buf, "thread: %d\n", tid);
        dump_write_str(fd, buf);
    }

    /* "desc:" lines */
    if (!appending) {
        fwrite(fd, "\n", 1);

#if 0
        /* Global options changing the tracing behaviour */
        VG_(sprintf)(buf, "\ndesc: Option: --skip-plt=%s\n",
                     SK_(clo).skip_plt ? "yes" : "no");
        dump_write_str(fd, buf);
        VG_(sprintf)(buf, "desc: Option: --collect-jumps=%s\n",
                     SK_(clo).collect_jumps ? "yes" : "no");
        dump_write_str(fd, buf);
        VG_(sprintf)(buf, "desc: Option: --separate-recs=%d\n",
                     SK_(clo).separate_recursions);
        dump_write_str(fd, buf);
        VG_(sprintf)(buf, "desc: Option: --separate-callers=%d\n",
                     SK_(clo).separate_callers);
        dump_write_str(fd, buf);

        VG_(sprintf)(buf, "desc: Option: --dump-bbs=%s\n",
                     SK_(clo).dump_bbs ? "yes" : "no");
        dump_write_str(fd, buf);
        VG_(sprintf)(buf, "desc: Option: --separate-threads=%s\n",
                     SK_(clo).separate_threads ? "yes" : "no");
        dump_write_str(fd, buf);
#endif

        /* description text is supplied by the cache simulator */
        (*SK_(cachesim).getdesc)(buf);
        dump_write_str(fd, buf);
    }

    VG_(sprintf)(buf, "\ndesc: Timerange: Basic block %llu - %llu\n",
                 last_bbs_done, VG_(bbs_done));
    dump_write_str(fd, buf);
    VG_(sprintf)(buf, "desc: Trigger: %s\n",
                 trigger ? trigger : (Char*)"Program termination");
    dump_write_str(fd, buf);

#if 0
    /* Output function specific config
     * FIXME */
    for (i = 0; i < N_FNCONFIG_ENTRIES; i++) {
        fnc = fnc_table[i];
        while (fnc) {
            if (fnc->skip) {
                VG_(sprintf)(buf, "desc: Option: --fn-skip=%s\n", fnc->name);
                dump_write_str(fd, buf);
            }
            if (fnc->dump_at_enter) {
                VG_(sprintf)(buf, "desc: Option: --fn-dump-at-enter=%s\n",
                             fnc->name);
                dump_write_str(fd, buf);
            }
            if (fnc->dump_at_leave) {
                VG_(sprintf)(buf, "desc: Option: --fn-dump-at-leave=%s\n",
                             fnc->name);
                dump_write_str(fd, buf);
            }
            if (fnc->separate_callers != SK_(clo).separate_callers) {
                VG_(sprintf)(buf, "desc: Option: --separate-callers%d=%s\n",
                             fnc->separate_callers, fnc->name);
                dump_write_str(fd, buf);
            }
            if (fnc->separate_recursions != SK_(clo).separate_recursions) {
                VG_(sprintf)(buf, "desc: Option: --separate-recs%d=%s\n",
                             fnc->separate_recursions, fnc->name);
                dump_write_str(fd, buf);
            }
            fnc = fnc->next;
        }
    }
#endif

    /* "positions:" line */
    VG_(sprintf)(buf, "\npositions:%s%s%s\n",
                 SK_(clo).dump_instr ? " instr" : "",
                 SK_(clo).dump_bb    ? " bb" : "",
                 SK_(clo).dump_line  ? " line" : "");
    dump_write_str(fd, buf);

    /* "events:" line */
    i = VG_(sprintf)(buf, "events: ");
    SK_(sprint_eventmapping)(buf+i, SK_(dumpmap));
    dump_write_str(fd, buf);
    fwrite(fd, "\n", 1);

    /* summary lines: start from a zeroed cost and add, per thread,
     * the contribution computed by SK_(add_diff_cost) from the cost
     * at the last dump and the current cost of state entry 0 */
    sum = SK_(get_eventset_cost)( SK_(sets).full );
    SK_(zero_cost)(SK_(sets).full, sum);
    if (!SK_(clo).separate_threads) {
        /* This function is called once for thread 1, where
         * all costs are summed up when not dumping separate per thread.
         * But this is not true for summary: we need to add all threads.
         */
        int t;
        thread_info** thr = SK_(get_threads)();
        for (t = 1; t < VG_N_THREADS; t++) {
            if (thr[t])
                SK_(add_diff_cost)(SK_(sets).full, sum,
                                   thr[t]->lastdump_cost,
                                   thr[t]->states.entry[0]->cost);
        }
    }
    else {
        thread_info* ti = SK_(get_current_thread)();
        SK_(add_diff_cost)(SK_(sets).full, sum, ti->lastdump_cost,
                           ti->states.entry[0]->cost);
    }
    fprint_cost_ln(fd, "summary: ", SK_(dumpmap), sum);

    /* (re)initialise the accumulator that close_dumpfile() prints
     * as "totals:" and adds to SK_(total_cost) */
    SK_(init_cost_lz)( SK_(sets).full, &dump_total_cost );

    fwrite(fd, "\n\n",2);

    if (VG_(clo_verbosity) > 1)
        VG_(message)(Vg_DebugMsg, "Dump to %s", filename);

    return fd;
}


/*
 * Finish the dump file <fd> opened by new_dumpfile().
 *
 * Writes the "totals:" line from dump_total_cost, adds that cost to
 * the global SK_(total_cost), flushes the write buffer and closes the
 * descriptor.  Does nothing when <fd> is negative.
 *
 * If the file name starts with '.', the file is renamed to the same
 * name without the leading dot; a failing rename is only reported.
 *
 * <buf> and <tid> are not used; they are kept so that callers stay
 * unchanged.
 */
static void close_dumpfile(Char buf[BUF_LEN], int fd, int tid)
{
    if (fd <0) return;

    fprint_cost_ln(fd, "totals: ", SK_(dumpmap),
                   dump_total_cost);
    SK_(add_cost_lz)(SK_(sets).full,
                     &SK_(total_cost), dump_total_cost);

    fwrite_flush();
    VG_(close)(fd);

    if (filename[0] == '.') {
        if (-1 == VG_(rename) (filename, filename+1)) {
            /* Can not rename to correct file name: give out warning.
             * The format supplies the leading dot of the source name
             * itself, so both arguments are the name without it. */
            VG_(message)(Vg_DebugMsg, "Warning: Can not rename .%s to %s",
                         filename+1, filename+1);
        }
    }
}


/* Helper for print_bbccs */

/* State shared between print_bbccs() and print_bbccs_of_thread(),
 * which is also used as a per-thread callback taking only the
 * thread as argument. */

/* Descriptor of the dump file being written; set to -1 by
 * print_bbccs() and to the result of new_dumpfile() per thread. */
static Int   print_fd;
/* Trigger text handed to new_dumpfile(); set by print_bbccs(). */
static Char* print_trigger;
/* Scratch buffer passed to new_dumpfile()/close_dumpfile() and used
 * for formatting individual output lines. */
static Char  print_buf[BUF_LEN];

/*
 * Write one dump for the thread <ti>.
 *
 * Opens a dump file for SK_(current_tid), walks the sorted,
 * null-terminated BBCC array from prepare_dump() and prints every
 * entry.  Whenever the context or recursion index changes (and once
 * more at the end of the array) the cost still held in
 * ccSum[currSum] is printed first.  Afterwards the file is closed
 * and SK_(current_state).cost is copied into ti->lastdump_cost.
 *
 * Returns without output if the dump file could not be opened.
 */
static void print_bbccs_of_thread(thread_info* ti)
{
    BBCC **sorted, **cur;
    FnPos lastFnPos;
    AddrPos lastAPos;

    CT_DEBUG(1, "+ print_bbccs(tid %d)\n", SK_(current_tid));

    print_fd = new_dumpfile(print_buf, SK_(current_tid), print_trigger);
    if (print_fd <0) {
        CT_DEBUG(1, "- print_bbccs(tid %d): No output...\n", SK_(current_tid));
        return;
    }

    sorted = prepare_dump();
    init_fpos(&lastFnPos);
    init_apos(&lastAPos, 0, 0, 0);

    /* the loop is left from inside, at the terminating null pointer */
    for (cur = sorted; ; cur++) {
        BBCC* bbcc = *cur;

        /* on context/function change, print old cost buffer before */
        if (lastFnPos.cxt &&
            (!bbcc ||
             (lastFnPos.cxt != bbcc->cxt) ||
             (lastFnPos.rec_index != bbcc->rec_index))) {

            if (!SK_(is_zero_cost)( SK_(sets).full, ccSum[currSum].cost )) {
                /* no need to switch buffers, as position is the same */
                fprint_apos(print_fd, &(ccSum[currSum].p), &lastAPos,
                            lastFnPos.cxt->fn[0]->file);
                fprint_fcost(print_fd, &ccSum[currSum], &lastAPos);
            }

            if (ccSum[currSum].p.file != lastFnPos.cxt->fn[0]->file) {
                /* switch back to file of function;
                 * the file name is written right after the 3-char "fe=" */
                VG_(sprintf)(print_buf, "fe=");
                print_file(print_buf+3, lastFnPos.cxt->fn[0]->file);
                fwrite(print_fd, (void*)print_buf, VG_(strlen)(print_buf));
            }
            fwrite(print_fd, "\n", 1);
        }

        if (!bbcc)
            break;

        if (print_fn_pos(print_fd, &lastFnPos, bbcc)) {
            /* new function: restart position tracking and both
             * cost sum buffers */
            init_apos(&lastAPos, 0, 0, bbcc->cxt->fn[0]->file);
            init_fcost(&ccSum[0], 0, 0, 0);
            init_fcost(&ccSum[1], 0, 0, 0);
            currSum = 0;
        }

        if (SK_(clo).dump_bbs) {
            /* FIXME: Specify Object of BB if different to object of fn */
            VG_(sprintf)(print_buf, "bb=0x%x %d %llu\n",
                         bbcc->bb->offset,
                         bbcc->bb->instr_count,
                         bbcc->exe_counter);
            fwrite(print_fd, (void*)print_buf, VG_(strlen)(print_buf));
        }

        fprint_bbcc(print_fd, bbcc, &lastAPos);
    }

    close_dumpfile(print_buf, print_fd, SK_(current_tid));
    VG_(free)(sorted);

    /* set counters of last dump */
    SK_(copy_cost)( SK_(sets).full, ti->lastdump_cost,
                    SK_(current_state).cost );

    CT_DEBUG(1, "- print_bbccs(tid %d)\n", SK_(current_tid));
}


/*
 * Dump the collected costs, using <trigger> as trigger text.
 *
 * With SK_(clo).separate_threads, either only the current thread
 * (<only_current_thread> set) or every thread gets its own dump.
 * Without it, a single dump is written while switched to thread 1;
 * the previously current thread is restored afterwards.
 */
static void print_bbccs(Char* trigger, Bool only_current_thread)
{
    init_dump_array();
    init_debug_cache();

    print_fd = -1;
    print_trigger = trigger;

    if (SK_(clo).separate_threads) {
        if (only_current_thread)
            print_bbccs_of_thread( SK_(get_current_thread)() );
        else
            SK_(forall_threads)(print_bbccs_of_thread);
    }
    else {
        /* All BBCC/JCC costs is stored for thread 1 */
        Int saved_tid = SK_(current_tid);

        SK_(switch_thread)(1);
        print_bbccs_of_thread( SK_(get_current_thread)() );
        SK_(switch_thread)(saved_tid);
    }

    free_dump_array();
}


/*
 * Write a profile dump.
 *
 * <trigger> is the text describing why the dump happens; null means
 * program termination.  <only_current_thread> is passed on to
 * print_bbccs().  Increments the part counter out_counter before
 * dumping and records VG_(bbs_done) in last_bbs_done afterwards.
 */
void SK_(dump_profile)(Char* trigger, Bool only_current_thread)
{
    Char* reason;

    VGP_PUSHCC(VgpCacheDump);

    /* label used in debug/verbose messages only */
    reason = trigger ? trigger : (Char*)"Prg.Term.";

    CT_DEBUG(2, "+ dump_profile(Trigger '%s')\n", reason);

    if (VG_(clo_verbosity) > 1)
        VG_(message)(Vg_DebugMsg, "Start dumping at BB %llu (%s)...",
                     SK_(stat).bb_executions, reason);

    out_counter++;

    print_bbccs(trigger, only_current_thread);

    last_bbs_done = VG_(bbs_done);

    if (VG_(clo_verbosity) > 1)
        VG_(message)(Vg_DebugMsg, "Dumping done.");

    VGP_POPCC(VgpCacheDump);
}

/* Copy the client command line into cmdbuf (the original arguments
 * could change later).  Null arguments are skipped, the others are
 * joined by single spaces.  The result is always NUL-terminated and
 * silently truncated to at most BUF_LEN-1 characters.
 */
static void init_cmdbuf()
{
    Int arg, used = 0;
    Char* src;

    for (arg = 0; arg < VG_(client_argc); arg++) {
        src = VG_(client_argv[arg]);
        if (!src) continue;

        /* separator, except before the first copied argument */
        if ((used > 0) && (used < BUF_LEN))
            cmdbuf[used++] = ' ';

        for (; *src != 0; src++) {
            if (used < BUF_LEN)
                cmdbuf[used++] = *src;
        }
    }

    /* buffer completely filled: give up the last character for the
     * terminator */
    if (used == BUF_LEN)
        used--;
    cmdbuf[used] = 0;
}

/*
 * Set up the names used for dump files.
 *
 * Uses SK_(clo).filename_base (DEFAULT_DUMPNAME if unset):
 *  - absolute base name: base_directory becomes its directory part
 *    (everything before the last '/', or "/" for a file directly
 *    under the root) and dump_file_base is the base name itself;
 *  - relative base name: base_directory becomes the current working
 *    directory and dump_file_base "<cwd>/<base name>".
 *
 * Also allocates the 'filename' buffer, checks that
 * "<dump_file_base>.<pid>" can be opened or created for writing
 * (file_err() is called if not), and fills cmdbuf via init_cmdbuf().
 *
 * On return *dir points to base_directory and *file to the filename
 * buffer; both stay owned by this file.
 */
void SK_(init_files)(Char** dir, Char** file)
{
   Int fd, size;

   if (!SK_(clo).filename_base)
     SK_(clo).filename_base = DEFAULT_DUMPNAME;

   /* get base directory for dump/command/result files */
   if (SK_(clo).filename_base[0] == '/') {
       Int i, lastSlash = 0, dirLen;

       /* find the last path separator; index 0 is the root slash */
       for (i = 1; SK_(clo).filename_base[i]; i++)
           if (SK_(clo).filename_base[i] == '/') lastSlash = i;

       /* Copy only the directory part, not the whole file name.
        * No trailing slash is kept, matching the getcwd() result
        * used below; a file directly under the root gives "/". */
       dirLen = (lastSlash > 0) ? lastSlash : 1;
       base_directory = VG_(malloc)(dirLen+1);
       VG_(strncpy)(base_directory, SK_(clo).filename_base, dirLen);
       base_directory[dirLen] = 0;

       dump_file_base = SK_(clo).filename_base;
   }
   else {
       size = 100;
       base_directory = 0;

       /* getcwd() fails if the buffer isn't big enough -- keep doubling size
          until it succeeds. */
       while (NULL == base_directory) {
           base_directory = VG_(malloc)(size);
           if (NULL == VG_(getcwd)(base_directory, size)) {
               VG_(free)(base_directory);
               base_directory = 0;
               size *= 2;
           }
       }

       /* +2: one for the '/' separator, one for the terminator */
       size = VG_(strlen)(base_directory) + VG_(strlen)(SK_(clo).filename_base) +2;
       dump_file_base = VG_(malloc)(size);
       CT_ASSERT(dump_file_base != 0);
       VG_(sprintf)(dump_file_base, "%s/%s",
                    base_directory, SK_(clo).filename_base);
   }

   /* allocate space big enough for final filenames
    * (32 extra bytes for the ".<pid>[.<part>][-<tid>]" suffix) */
   filename = VG_(malloc)(VG_(strlen)(dump_file_base)+32);
   CT_ASSERT(filename != 0);

   /* Make sure the output base file can be written.
    * This is used for the dump at program termination.
    * We stop with an error here if we can not create the
    * file: This is probably because of missing rights,
    * and trace parts wouldn't be allowed to be written, too.
    */
   VG_(sprintf)(filename, "%s.%d", dump_file_base, VG_(getpid)());
   fd = VG_(open)(filename, VKI_O_WRONLY|VKI_O_TRUNC, 0);
   if (fd <0) {
       fd = VG_(open)(filename, VKI_O_CREAT|VKI_O_WRONLY,
                      VKI_S_IRUSR|VKI_S_IWUSR);
       if (fd <0) {
           file_err();
       }
   }
   if (fd>=0) VG_(close)(fd);

   *dir  = base_directory;
   *file = filename;

   init_cmdbuf();
}
