/* Copyright (C) 2007 One Laptop Per Child
 * Author: Marc Maurer <uwog@uwog.net>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
 * 02111-1307, USA.
 */

#include <sys/types.h>
#include <stdlib.h>
#include <string.h>
#include <map>

#include "pd_Document.h"
#include "fl_BlockLayout.h"
#include "fp_Run.h"
#include "fp_TextRun.h"
#include "px_ChangeRecord.h"
#include "px_CR_Strux.h"
#include "px_CR_Span.h"
#include "pf_Frag_Strux.h"
#include "pf_Frag_Strux_Block.h"

#include "LanguagePattern.h"
#include "Highlighter.h"

using std::map;

// Creates a highlighter for the given layout and language definition and,
// when the layout has a document, registers this object as a listener on it.
// The listener id handed out by the document is stored in m_iListenerId.
Highlighter::Highlighter(FL_DocLayout* pDocLayout, const LanguageDefinition* pLangDef)
	: m_pDocLayout(pDocLayout),
	m_pLangDef(pLangDef),
	m_iListenerId(0)
{
	// FIXME: we have to be _sure_ that we are notified AFTER the FL_DocLayout
	// class we belong to, so that the layout classes have already been updated
	// by the time we get notified

	// nothing to listen to without a layout or a document
	if (!m_pDocLayout || !m_pDocLayout->getDocument())
		return;

	m_pDocLayout->getDocument()->addListener(this, &m_iListenerId);
}

// Unregisters this listener from the document, provided the layout and its
// document are still reachable.
Highlighter::~Highlighter()
{
	if (!m_pDocLayout || !m_pDocLayout->getDocument())
		return;

	m_pDocLayout->getDocument()->removeListener(m_iListenerId);
}

// Listener callback used while populating. Span insertions and deletions are
// forwarded to _highlight() in populating mode; every other change record
// type is accepted without doing anything.
// Returns false when sfh or pcr is missing, otherwise the result of
// _highlight() for span records and true for the rest.
bool Highlighter::populate(PL_StruxFmtHandle sfh, const PX_ChangeRecord *pcr)
{
	UT_return_val_if_fail(sfh, false);
	UT_return_val_if_fail(pcr, false);
	UT_DEBUGMSG(("Highlighter::populate() - sfh: 0x%x, pcr type: %d\n", sfh, pcr->getType()));

	switch (pcr->getType())
	{
		case PX_ChangeRecord::PXT_InsertSpan:
		case PX_ChangeRecord::PXT_DeleteSpan:
			return _highlight(sfh, pcr, true);
		default:
			// not a span change, nothing to highlight
			break;
	}

	return true;
}

// Listener callback used while populating, invoked for strux change records.
//
// For PXT_InsertStrux a fresh RegionMap is allocated and handed to
// _setupBlockMatches(); when that succeeds the map is stored in *psfh as this
// listener's format handle for the strux, otherwise the map is destroyed and
// *psfh is set to NULL. All other change record types leave *psfh untouched.
//
// sdh  - document handle of the strux; must not be NULL
// pcr  - change record describing the strux; must not be NULL
// psfh - out parameter receiving the format handle; must not be NULL
//
// Returns false when one of the arguments is NULL, true otherwise.
bool Highlighter::populateStrux(PL_StruxDocHandle sdh, const PX_ChangeRecord *pcr, PL_StruxFmtHandle *psfh)
{
	UT_DEBUGMSG(("Highlighter::populateStrux(sdh: 0x%x)\n", sdh));
	UT_return_val_if_fail(sdh, false);
	// pcr is dereferenced unconditionally below, so it has to be validated too
	UT_return_val_if_fail(pcr, false);
	UT_return_val_if_fail(psfh, false);

	switch (pcr->getType())
	{
		case PX_ChangeRecord::PXT_InsertStrux:
		{
			UT_DEBUGMSG(("Highlighter::populateStrux() -  insert\n"));

			// _setupBlockMatches() rejects everything that is not a block strux
			RegionMap* pNewRegionMap = new RegionMap();
			if (_setupBlockMatches(pcr, *pNewRegionMap))
			{
				*psfh = reinterpret_cast<PL_StruxFmtHandle>(pNewRegionMap);
			}
			else
			{
				DELETEP(pNewRegionMap);
				*psfh = NULL;
			}

			break;
		}
		case PX_ChangeRecord::PXT_DeleteStrux:
			UT_ASSERT_HARMLESS(UT_SHOULD_NOT_HAPPEN); // a PXT_DeleteStrux while populating is, erm, well, weird
			break;
		default:
			// ignore the rest, we don't need it for highlighting
			break;
	}

	return true;
}

// Listener callback for changes to an already populated document.
// - span insertions/deletions are re-highlighted through _highlight()
// - a deleted strux has its RegionMap (stored as our format handle) destroyed
// - everything else is ignored
// Returns false when sfh or pcr is NULL; otherwise the _highlight() result
// for span changes and true for all other record types.
bool Highlighter::change(PL_StruxFmtHandle sfh, const PX_ChangeRecord *pcr)
{
	UT_DEBUGMSG(("Highlighter::change()\n"));
	UT_return_val_if_fail(sfh, false);
	UT_return_val_if_fail(pcr, false);

	// changes in actual spans trigger a rematch
	if (pcr->getType() == PX_ChangeRecord::PXT_InsertSpan ||
		pcr->getType() == PX_ChangeRecord::PXT_DeleteSpan)
	{
		return _highlight(sfh, pcr);
	}

	if (pcr->getType() == PX_ChangeRecord::PXT_DeleteStrux)
	{
		UT_DEBUGMSG(("Highlighter::change() - PXT_DeleteStrux, sfh: 0x%x\n", sfh));

		// sfh was already verified to be non-NULL on entry; the handle is the
		// RegionMap that was bound to this strux, so release it here
		RegionMap* pDoomedMap = const_cast<RegionMap*>(reinterpret_cast<const RegionMap*>(sfh));
		DELETEP(pDoomedMap);
	}

	// any other change record is irrelevant for highlighting
	return true;
}

// Listener callback for a strux inserted after population. A new RegionMap is
// created and filled by _setupBlockMatches(); on success it is bound to the
// new strux through pfnBindHandles, otherwise it is destroyed and a NULL
// handle is bound instead. The binding always uses m_iListenerId; the sfh and
// lid arguments are not consulted.
// Returns false when pcr or sdhNew is NULL, true otherwise.
bool Highlighter::insertStrux(PL_StruxFmtHandle sfh, const PX_ChangeRecord *pcr, 
			PL_StruxDocHandle sdhNew, PL_ListenerId lid, 
			void(*pfnBindHandles)(PL_StruxDocHandle sdhNew, PL_ListenerId lid, PL_StruxFmtHandle sfhNew))
{
	UT_DEBUGMSG(("Highlighter::insertStrux()\n"));
//	UT_return_val_if_fail(sfh, false);
	UT_return_val_if_fail(pcr, false);
	UT_return_val_if_fail(sdhNew, false);

	RegionMap* pFreshMap = new RegionMap();
	if (!_setupBlockMatches(pcr, *pFreshMap))
	{
		// no region map is kept for this strux
		DELETEP(pFreshMap);
		pfnBindHandles(sdhNew, m_iListenerId, (void*)NULL);
		return true;
	}

	pfnBindHandles(sdhNew, m_iListenerId, pFreshMap);
	return true;
}

// Listener callback for document signals. The signal value is only announced
// in the debug log; it is otherwise ignored and the call always succeeds.
bool Highlighter::signal(UT_uint32 iSignal)
{
	const bool bHandled = true;
	UT_DEBUGMSG(("Highlighter::signal()\n"));
	return bHandled;
}

// Reports the listener category of this object: always PTL_DocLayout.
PLListenerType Highlighter::getType() const
{
	const PLListenerType eListenerType = PTL_DocLayout;
	return eListenerType;
}

bool Highlighter::_setupBlockMatches(const PX_ChangeRecord *pcr, RegionMap& matches)
{
	UT_return_val_if_fail(pcr, false);

	// FIXME: can't this be faster? ie, use the sfh from the normal layout listener
	UT_return_val_if_fail(pcr->getType() == PX_ChangeRecord::PXT_InsertStrux, false);
	const PX_ChangeRecord_Strux* pcrs = reinterpret_cast<const PX_ChangeRecord_Strux*>(pcr);
	if (pcrs->getStruxType() != PTX_Block)
	{
		UT_DEBUGMSG(("Ignoring insertion of non-block type strux\n"));
		return false;
	}

	// TODO: we assume that this will return the block before the new block; is this
	// always correct??	
	UT_DEBUGMSG(("Looking for previous block before cr pos: %d\n", pcr->getPosition()));
	fl_BlockLayout* pPrevBL = m_pDocLayout->findBlockAtPosition(pcr->getPosition()-1);
	UT_return_val_if_fail(pPrevBL, false);

	if (pPrevBL->getPosition() >= pcr->getPosition())
	{
		UT_DEBUGMSG(("Apparently we are the first block in the document, no need to set up a continuation marker\n"));
		return true; // apparently there is no block before our current block
	}

	UT_DEBUGMSG(("Prev block in document (pos: %d): 0x%x\n", pPrevBL->getPosition()-1, pPrevBL));

	UT_DEBUGMSG(("Looking at previous block to see if we should be a continuation\n"));
	RegionMap* pPrevMap = _getRegionMap(pPrevBL);
	if (pPrevMap)
	{
		if (_isOpen(pPrevBL->getLength()-1, *pPrevMap)) // -1 for the block itself, which is included in the block length
		{
			UT_DEBUGMSG(("The new block should be inserted as continuation section\n"));
			// insert a continuation section 'marker'
			// TODO: should we make this closed or not?
			RegExMatch cont_match;
			cont_match.byte_start = 0;
			cont_match.byte_end = 0;
			cont_match.char_start = 0;
			cont_match.char_end = 0;
			cont_match.continuation = true;
			cont_match.closed = false;
			cont_match.pattern = (*(--pPrevMap->end())).second.pattern;
			matches.insert(map<int, RegExMatch>::value_type(cont_match.byte_start, cont_match));
			
		}
		else
			UT_DEBUGMSG(("The new block is no continuation block\n"));
	}

	return true;
}

// Maps the style name of a language pattern to its foreground colour.
// Unknown style names trigger an assertion and fall back to pure red.
UT_RGBColor Highlighter::_getColor(const LanguagePattern& pattern)
{
	struct StyleColor
	{
		const char* szStyle;
		unsigned char red;
		unsigned char green;
		unsigned char blue;
	};

	// searched top to bottom; style names are compared case-sensitively
	static const StyleColor s_palette[] =
	{
		{ "String",         235,  64, 255 },
		{ "Decimal",        148,  64, 255 },
		{ "Keyword",        150,   0,   0 },
		{ "Types",            0, 150,  81 },
		{ "Comment",          0,  85, 213 },
		{ "Preprocessor",     0,   0, 213 },
		{ "Others",           0, 200,   0 },
		{ "Data Type",       64, 200,   0 },
		{ "Base-N Integer",  64,   0,   0 },
		{ "Character",       64,   0,  64 }
	};

	const char* szWanted = pattern.style.c_str();
	for (size_t i = 0; i < sizeof(s_palette) / sizeof(s_palette[0]); ++i)
	{
		const StyleColor& entry = s_palette[i];
		if (strcmp(szWanted, entry.szStyle) == 0)
			return UT_RGBColor(entry.red, entry.green, entry.blue);
	}

	UT_DEBUGMSG(("unknown pattern style: %s\n", pattern.style.c_str()));
	UT_ASSERT(UT_NOT_IMPLEMENTED);
	return UT_RGBColor(255,0,0);
}

// Re-runs the pattern matching for the block that contains the position of a
// span change record and recolors its runs. When not populating, it also
// walks on to following blocks whose continuation state may have become
// stale because of the change.
//
// sfh          - not referenced in the body; the region map is always looked
//                up through the block layout (_getRegionMap())
// pcr          - must be a PXT_InsertSpan or PXT_DeleteSpan record
// isPopulating - when true only the block at the change record position is
//                processed; the whole "next block" logic is skipped
//
// Returns false when pcr has the wrong type, or when a following block has no
// strux handle while searching for a closing regex; true in all other cases
// (including the early exits when no block or strux handle is found).
//
// NOTE(review): the first debug message tolerates a NULL pcr, but the type
// check right after it dereferences pcr unconditionally; the callers visible
// in this file validate pcr before calling.
bool Highlighter::_highlight(PL_StruxFmtHandle sfh, const PX_ChangeRecord *pcr, bool isPopulating)
{
	UT_DEBUGMSG(("Highlighter::_highlight() - pcr: 0x%x, pcr type: %d\n", pcr, pcr ? pcr->getType() : -1));
	UT_return_val_if_fail(pcr->getType() == PX_ChangeRecord::PXT_InsertSpan || pcr->getType() == PX_ChangeRecord::PXT_DeleteSpan, false);

	UT_DEBUGMSG(("Highlighter::_highlight() - cr is of type %s\n", pcr->getType() == PX_ChangeRecord::PXT_InsertSpan ? "insert" : "delete"));
	const PX_ChangeRecord_Span* pcrs = static_cast<const PX_ChangeRecord_Span*>(pcr);

	// calculate the correction needed to determine positions before the changerecord was applied:
	// minus the span length for an insertion, plus the span length for a deletion
	UT_sint32 crCorrection = (pcrs->getType() == PX_ChangeRecord::PXT_InsertSpan 
								? -pcrs->getLength() 
								: ( pcrs->getType() == PX_ChangeRecord::PXT_DeleteSpan 
										? pcrs->getLength() 
										: 0 ));
	
	UT_DEBUGMSG(("Highlighter::_highlight() insert/delete change: blockoffset: %d, bufindex: %u, length %d\n", pcrs->getBlockOffset(), pcrs->getBufIndex(), pcrs->getLength()));

	// get our current block
	// FIXME: can't this be faster? ie, use the sfh from the normal layout listener
	fl_BlockLayout* pBL = m_pDocLayout->findBlockAtPosition(pcr->getPosition());
	UT_return_val_if_fail(pBL, true);

	// FIXME: we should stop highlighting when a match is equal to the match
	// we already (might have) had

	//
	// Re-match all (potentially) damaged blocks
	//
	// The loop runs once per block that needs rematching. It terminates as
	// soon as an iteration leaves pBL unchanged (nothing further to recheck)
	// or pBL becomes NULL (ran past the last block).
	//

	RegionMap* pMatches = 0;
	PT_BlockOffset damageOffset = -1; // -1 (wrapped around) means "not determined yet"
	bool finished = false; // NOTE(review): never read in this function
	fl_BlockLayout* pPrevBL = 0;
	while (pBL != pPrevBL)
	{
		pPrevBL = pBL;
		if (pBL == 0)
			break;

		bool openEndAtStart = false;
		bool openEndAtEnd = false;
		UT_GrowBuf textBuffer;

		// get a text buffer for this block
		// TODO: optimize this
		PL_StruxDocHandle sdh = pBL->getStruxDocHandle();
		UT_return_val_if_fail(sdh, true);
		const pf_Frag_Strux_Block* fsb = reinterpret_cast<const pf_Frag_Strux_Block*>(sdh);
		m_pDocLayout->getDocument()->getBlockBuf(sdh, &textBuffer);
		
		pMatches = _getRegionMap(pBL);
		if (!pMatches)
		{
			UT_ASSERT_HARMLESS(UT_SHOULD_NOT_HAPPEN);
			break;
		}
		RegionMap& matches = *pMatches;

		// remember whether the block ended in an open match BEFORE rematching;
		// the existing matches still use pre-change offsets, hence the correction
		openEndAtStart = _isOpen(textBuffer.getLength() + crCorrection, matches);
		crCorrection = 0; // we only need to correct for the first block

		// find the offset in the block where the damage has occurred
		if (damageOffset == -1)
			damageOffset = _findDamageOffset(fsb, pcr, matches);

		// syntax highlight this block
		UT_DEBUGMSG(("Syntaxhighlighting from offset: %d\n", damageOffset));

		// first, check if we damaged a continuation match (from a match in the previous block)
		bool matchingFinished = false;
		if (damageOffset == 0 &&
			matches.begin() != matches.end() && 
			(*matches.begin()).second.continuation)
		{
			// work on a copy: the map entry itself is pruned/cleared below
			RegExMatch old_cont_match = (*matches.begin()).second;
			UT_DEBUGMSG(("Damage occurred to a continuation section, for pattern: %s!\n", old_cont_match.pattern->name.c_str()));

			PT_BlockOffset end_regex_offset = 0;
			// FIXME: remove ugly const cast
			bool foundEndRegEx = _matchEndRegex(textBuffer, *const_cast<LanguagePattern*>(old_cont_match.pattern), &end_regex_offset);
			if (foundEndRegEx)
			{
				UT_DEBUGMSG(("Found end regex for continuation match at pos: %d\n", end_regex_offset));
				_pruneRegionsUpTo(end_regex_offset, matches);

				// the continuation is closed inside this block: it covers
				// everything from the block start up to the end regex
				RegExMatch cont_closed_match;
				cont_closed_match.byte_start = 0;
				cont_closed_match.byte_end = end_regex_offset; // FIXME: THIS IS PLAIN WRONG (a character offset is stored as a byte offset)
				cont_closed_match.char_start = 0;
				cont_closed_match.char_end = end_regex_offset;
				cont_closed_match.continuation = true;
				cont_closed_match.closed = true;
				cont_closed_match.pattern = old_cont_match.pattern;
				matches.insert(map<int, RegExMatch>::value_type(cont_closed_match.byte_start, cont_closed_match));

				// regular matching resumes right after the closed continuation
				UT_DEBUGMSG(("Moving damageOffset forward to %d\n", end_regex_offset));
				damageOffset = end_regex_offset;
			}
			else
			{
				UT_DEBUGMSG(("Found no end regex for continuation match, marking whole block as a continuation and open\n"));

				// we apparently damaged the end regex match, and no other one exists in this
				// block. This means we'll make 1 match that spans this whole block, and is
				// both a continuation match and an open match
				matches.clear();
				RegExMatch cont_open_match;
				cont_open_match.byte_start = 0;
				cont_open_match.byte_end = textBuffer.getLength(); // FIXME: THIS IS PLAIN WRONG (a character count is stored as a byte offset)
				cont_open_match.char_start = 0;
				cont_open_match.char_end = textBuffer.getLength();
				cont_open_match.continuation = true;
				cont_open_match.closed = false;
				cont_open_match.pattern =  old_cont_match.pattern;
				matches.insert(map<int, RegExMatch>::value_type(cont_open_match.byte_start, cont_open_match));
				matchingFinished = true; // the whole block is covered, skip the regular matching
			}
		}					

		if (!matchingFinished)
		{
			// delete all regions after the damage spot, we can't be sure they won't
			// be touched in any shape or form
			_pruneRegions(damageOffset, matches);
			_matchRegions(textBuffer, damageOffset, matches);
			_pruneAndRematchRegions(textBuffer, damageOffset, matches);
		}

		// Update the run coloring information for this block
		_decorateRuns(pBL, damageOffset, matches);

		if (!isPopulating)
		{
			// the last match in this block could have become open during this rematching 
			// session, which means we have to continue matching on the next line
			// NOTE: openEndAtStart (computed above) describes the situation BEFORE the
			// changerecord was applied; openEndAtEnd describes the situation after rematching
			openEndAtEnd = _isOpen(textBuffer.getLength(), matches); 

			// now check if we need to rematch the next block

			if (!(openEndAtStart || openEndAtEnd))
			{
				UT_DEBUGMSG(("openEndAtStart = false, openEndAtEnd = false\n"));

				// do nothing
			}
			else if (!openEndAtStart && openEndAtEnd)
			{
				UT_DEBUGMSG(("openEndAtStart = false, openEndAtEnd = true\n"));

				// search for the ending regex of our open match in the next block(s)
				// (openEndAtEnd implies that 'matches' is not empty, so --end() is valid)
				RegionMap::iterator ilast = --matches.end();
				RegExMatch& open_match = (*ilast).second;
				if (open_match.pattern)
				{
					pBL = pBL->getNextBlockInDocument();
					while (pBL)
					{
						UT_GrowBuf nextTextBuffer;
						PT_BlockOffset end_regex_offset = 0;

						PL_StruxDocHandle next_sdh = pBL->getStruxDocHandle();
						UT_return_val_if_fail(next_sdh, false);
						m_pDocLayout->getDocument()->getBlockBuf(next_sdh, &nextTextBuffer);
						bool foundEndRegEx = _matchEndRegex(nextTextBuffer, *const_cast<LanguagePattern*>(open_match.pattern), &end_regex_offset);
						if (foundEndRegEx)
						{
							// the outer loop will rematch this block starting right after the end regex
							damageOffset = end_regex_offset;
							RegionMap* pNextBlockMatches = _getRegionMap(pBL);
							if (pNextBlockMatches)
							{
								// first, delete all matches in the next block up to the end of the ending regex
								_pruneRegionsUpTo(end_regex_offset, *pNextBlockMatches);

								// insert a new match, which closes the opened match
								RegExMatch closing_match;
								closing_match.byte_start = 0; // TODO: this isn't true when there are for example images in this block
								closing_match.byte_end = damageOffset; // FIXME, not utf8 safe
								closing_match.char_start = 0;
								closing_match.char_end = damageOffset;
								closing_match.continuation = true;
								closing_match.closed = true;
								closing_match.pattern = open_match.pattern;		
								pNextBlockMatches->insert(map<int, RegExMatch>::value_type(closing_match.byte_start, closing_match));

								// recolor the runs
								_decorateRuns(pBL, 0, *pNextBlockMatches);
							}
							else
								UT_ASSERT_HARMLESS(UT_SHOULD_NOT_HAPPEN);

							break;
						}
						else
						{
							// nope, this block does not have the ending regex we are looking for
							// this means we will erase all its current matches, and replace them
							// with one open match
							UT_DEBUGMSG(("No closing regex found for block at pos: %d, erasing all and inserting a signle continued, open match\n", pBL->getPosition()));

							RegionMap* pNextBlockMatches = _getRegionMap(pBL);
							if (pNextBlockMatches)
							{
								// clear and insert one continued, open match
								pNextBlockMatches->clear();
								RegExMatch cont_match;
								cont_match.byte_start = 0; // TODO: this isn't true when there are for example images in this block
								cont_match.byte_end = pBL->getLength(); // FIXME, not utf8 safe
								cont_match.char_start = 0;
								cont_match.char_end = pBL->getLength();
								cont_match.continuation = true;
								cont_match.closed = false;
								cont_match.pattern = open_match.pattern;
								pNextBlockMatches->insert(map<int, RegExMatch>::value_type(cont_match.byte_start, cont_match));

								// recolor the runs
								_decorateRuns(pBL, 0, *pNextBlockMatches);
							}
							else
								UT_ASSERT_HARMLESS(UT_SHOULD_NOT_HAPPEN);

							// now search again for a matching end regex in the next block; maybe
							// we are lucky this time
							pBL = pBL->getNextBlockInDocument();
						}
					}
				}
				else
					UT_ASSERT_HARMLESS(UT_SHOULD_NOT_HAPPEN);

				// pBL is now either the block containing the end regex (rematched by the
				// next outer iteration), NULL (no more blocks; the outer loop breaks), or
				// unchanged when the open match had no pattern (the outer loop ends)
				continue;
			}
			else if (openEndAtStart && openEndAtEnd)
			{
				UT_DEBUGMSG(("openEndAtStart = true, openEndAtEnd = true\n"));

				// FIXME: we're ONLY done here when the open match at the end of
				// this block didn't change its type during this rematching
				// iteration. If it's open with a different type now, then we should
				// recheck the next block for a closing match
			}

			// TODO: ALSO CHECK, WHEN WE _ARE_ OPEN, THAT THE NEXT CONTINUATION MATCH HAS THE SAME PATTERN

			// one final last case: the next block is a continuation, but we don't have 
			// any open match on the end of the current block, meaning we have found
			// some stale highlighting, force a recheck.

			if (!openEndAtEnd && _nextBlockIsContinuation(pBL)) // FIXME: suboptimal, as it requires 2 times fetching the next block in this document
			{
				UT_DEBUGMSG(("Detected stale continuation highlighting in the next block, continuing with next block\n"));

				pBL = pBL->getNextBlockInDocument();

				// NOTE: make sure to remove all existing matches, as a full recheck will never remove continuations
				RegionMap* pNextMatches = _getRegionMap(pBL);
				if (pNextMatches)
					pNextMatches->clear();
				else
					UT_ASSERT(UT_SHOULD_NOT_HAPPEN);

				// the next block gets rematched from its very beginning
				damageOffset = 0;
			}
		} /* if (!populating) */

	} /* while (pBL != pPrevBL) */

	return true;
}

// Intended to report where, relative to the start of the block, the change
// record damaged the text. The current implementation is a stub that always
// answers 0, i.e. the start of the block; 'matches' is not consulted.
PT_BlockOffset Highlighter::_findDamageOffset(const pf_Frag_Strux_Block* fsb, const PX_ChangeRecord *pcr, RegionMap& matches)
{
	// Position of the damage relative to the block start, using the metric
	// that existed BEFORE the changerecord was applied (all our matches are
	// indexed that way). The value is computed here but not used yet.
	PT_BlockOffset offsetInBlock = (pcr->getPosition() - fsb->getPos()) - 1;
	static_cast<void>(offsetInBlock);

	// TODO: rewrite this!!!

	return static_cast<PT_BlockOffset>(0);
}

// Tells whether the first match in the map is flagged as a continuation.
// An empty map is never a continuation.
bool Highlighter::_isContinuation(RegionMap& matches)
{
	UT_DEBUGMSG(("Highlighter::_isContinuation()\n"));

	if (matches.begin() == matches.end())
		return false;

	return matches.begin()->second.continuation;
}

// Tells whether the last match in the map is an open one that reaches exactly
// up to endOffset, i.e. it is not closed and its char_end equals endOffset.
// An empty map is never open.
bool Highlighter::_isOpen(PT_BlockOffset endOffset, RegionMap& matches)
{
	UT_DEBUGMSG(("Highlighter::_isOpen() - endOffset: %d\n", endOffset));

	if (matches.empty())
		return false;

	// TODO: only do this if the current match was actually modified/removed/whatever (can't check that here)
	RegionMap::iterator itLast = matches.end();
	--itLast;
	RegExMatch& lastMatch = itLast->second;
	UT_DEBUGMSG(("Found the last match, char_end: %d, type: %s\n", lastMatch.char_end, lastMatch.pattern->name.c_str()));

	return (lastMatch.char_end == endOffset && !lastMatch.closed);
}

// Tells whether the block following pBL starts with a continuation match.
// Returns false when pBL is NULL, when there is no next block, when the next
// block has no region map (asserted as unexpected), or when its map is empty.
bool Highlighter::_nextBlockIsContinuation(fl_BlockLayout* pBL)
{
	UT_return_val_if_fail(pBL, false);

	fl_BlockLayout* pFollowingBL = pBL->getNextBlockInDocument();
	if (!pFollowingBL)
		return false;

	RegionMap* pFollowingMap = _getRegionMap(pFollowingBL);
	if (!pFollowingMap)
	{
		UT_ASSERT_HARMLESS(UT_SHOULD_NOT_HAPPEN);
		return false;
	}

	if (pFollowingMap->begin() == pFollowingMap->end())
		return false;

	return (*pFollowingMap->begin()).second.continuation ? true : false;
}

// Fetches the RegionMap this listener bound to the strux of the given block,
// i.e. the format handle stored under m_iListenerId.
// Returns NULL when pBL is NULL or the block has no strux handle; otherwise
// whatever handle is stored, which may itself be NULL.
RegionMap* Highlighter::_getRegionMap(const fl_BlockLayout* pBL)
{
	UT_return_val_if_fail(pBL, NULL);

	// the piece table element (frag strux) backing this block
	PL_StruxDocHandle hStrux = pBL->getStruxDocHandle();
	UT_return_val_if_fail(hStrux, 0);
	const pf_Frag_Strux_Block* pBlockStrux = reinterpret_cast<const pf_Frag_Strux_Block*>(hStrux);

	const RegionMap* pStoredMap = reinterpret_cast<const RegionMap*>(pBlockStrux->getFmtHandle(m_iListenerId));

	// the handle is typed const, but we really do want a mutable RegionMap
	return const_cast<RegionMap*>(pStoredMap);
}

// Removes the leading matches whose map key lies before startOffset; the
// first match with a key at or beyond startOffset stops the pruning.
void Highlighter::_pruneRegionsUpTo(PT_BlockOffset startOffset, RegionMap& matches)
{
	// the map is ordered by key, so the candidates are always at the front
	while (matches.begin() != matches.end() && (*matches.begin()).first < startOffset)
	{
		matches.erase(matches.begin());
	}
}

// Drops the first match whose char_start is at or beyond startOffset together
// with every match that follows it in the map.
void Highlighter::_pruneRegions(PT_BlockOffset startOffset, RegionMap& matches)
{
	// walk forward to the first match that starts at/after the damage spot
	RegionMap::iterator itFirstDamaged = matches.begin();
	while (itFirstDamaged != matches.end() &&
		   !((*itFirstDamaged).second.char_start >= startOffset))
	{
		++itFirstDamaged;
	}

	// erasing an empty range (nothing found) is a no-op
	matches.erase(itFirstDamaged, matches.end());
}

void Highlighter::_matchRegions(const UT_GrowBuf& textBuffer, PT_BlockOffset startOffset, RegionMap& matches)
{
	UT_return_if_fail(m_pLangDef);
	
	gchar* utf8str = g_ucs4_to_utf8((const gunichar*)textBuffer.getPointer(0), textBuffer.getLength(), 0, 0, 0);
	int startByteOffset = (startOffset > 0 ? g_utf8_offset_to_pointer(utf8str, startOffset) - utf8str : 0);

	UT_DEBUGMSG(("Highlighter::_nextRegion() - utf8str: %s, startByteOffset: %d\n", utf8str, startByteOffset));
	
	RegExMatch match;
	for (vector<LanguagePattern*>::const_iterator pos = m_pLangDef->getPatterns().begin();
			pos != m_pLangDef->getPatterns().end(); pos++)
	{
		LanguagePattern* pPat = *pos;
		if (pPat)
		{
			match.pattern = pPat;
			
			switch (pPat->type)
			{
				case LanguagePattern::ESCAPE_CHAR:
					// do nothing
					break;
				default:
					{
						// try to match this pattern; every pattern may match multiple times (where applicable)
						int start = startByteOffset;
						int strlength = g_utf8_strlen(utf8str, -1);
						while (start >= 0 && start < strlength)
						{
							if (pPat->start_regex.size() > 0)
							{
								if (_search(pPat->comp_start_regex, pPat->start_regex, utf8str, strlength, start, &match) >= 0)
								{
									UT_DEBUGMSG(("start-regex match: %s at pos %d\n", pPat->start_regex.c_str(), match.byte_start));
									start = match.byte_end;
									
									if (pPat->end_regex.size() > 0)
									{
										// now try to find up the ending regex
										RegExMatch end_match = match;
										if (start < strlength &&
											_search(pPat->comp_end_regex, pPat->end_regex, utf8str, strlength, start, &end_match) >= 0)
										{
											UT_DEBUGMSG(("end-regex match: %s at pos: %d\n", pPat->end_regex.c_str(), end_match.byte_start));
											start = end_match.byte_end;
											
											// now resize the opening match to include the end match
											match.byte_end = end_match.byte_end;
											match.char_end = end_match.char_end;

											matches.insert(map<int, RegExMatch>::value_type(match.byte_start, match));
										}
										else
										{
											// we found no matching end regex for the opening regex
											if (pPat->endAtLineEnd)
											{
												UT_DEBUGMSG(("no end-regex match found, inflating to end of line and inserting at pos %d, type: %s\n", match.byte_start, pPat->name.c_str()));
												
												// just scale up this match to the end of the line, and be done with it
												match.byte_end = (int)g_utf8_strlen(utf8str, -1);
												match.char_end = g_utf8_pointer_to_offset(utf8str, utf8str + match.byte_end); // TODO: is it save to make the offset point after the string?
												matches.insert(map<int, RegExMatch>::value_type(match.byte_start, match));
												UT_DEBUGMSG(("inflated, byte_end: %d, char_end: %d\n", match.byte_end, match.char_end));
												break;
											}
											else
											{
												UT_DEBUGMSG(("no end-regex match found, inflating to end of line, marking unclosed and inserting\n"));

												// scale up this match to the end of the line; // furtermore mark this match as 
												// non-closed, so we can continue searching for the matching end regex on the next line;
												match.byte_end = (int)g_utf8_strlen(utf8str, -1);
												match.char_end = g_utf8_pointer_to_offset(utf8str, utf8str + match.byte_end); // TODO: is it save to make the offset point after the string?
												match.closed = false;
												matches.insert(map<int, RegExMatch>::value_type(match.byte_start, match));
												break;
											}
										}
									}
									else
									{
										// there is no end regex belonging to this start regex

										if (pPat->endAtLineEnd)
										{
											// scale up this match to the end of the line
											match.byte_end = strlength;
											match.char_end = g_utf8_pointer_to_offset(utf8str, utf8str + match.byte_end); // TODO: is it save to make the offset point after the string?
											
											matches.insert(map<int, RegExMatch>::value_type(match.byte_start, match));
											break;
										}
										else
										{
											// huh?! should we just scale this match up to the end of this document?
											UT_ASSERT_HARMLESS(UT_SHOULD_NOT_HAPPEN);
											
											// for now, to handle this we'll just scale this match up to the end of the line
											match.byte_end = strlength;
											match.char_end = g_utf8_pointer_to_offset(utf8str, utf8str + match.byte_end); // TODO: is it save to make the offset point after the string?
											
											matches.insert(map<int, RegExMatch>::value_type(match.byte_start, match));
											break;
										}
									}
								}
								else
									break;
							}
							else if (pPat->regex.size() > 0)
							{
								if (_search(pPat->comp_regex, pPat->regex, utf8str, strlength, start, &match) >= 0)
								{
									// TODO: is end-at-end-of-line important here?
									
									UT_DEBUGMSG(("regex match: %s\n", pPat->regex.c_str()));
									start = match.byte_end;
									matches.insert(map<int, RegExMatch>::value_type(match.byte_start, match));;
								}
								else
									break;
							}
							else
							{
								UT_ASSERT_HARMLESS(UT_SHOULD_NOT_HAPPEN);
								break;
							}
						}
					}
					break;
			} /* switch */
		} /* if */
	} /* for */
}

void Highlighter::_pruneAndRematchRegions(const UT_GrowBuf& textBuffer, PT_BlockOffset startOffset, RegionMap& matches)
{
	UT_DEBUGMSG(("pruneAndRematchRegions() - startOffset: %d\n", startOffset));

	// now prune all overlapping matches
	for (map<int, RegExMatch>::iterator cur = matches.begin(); cur != matches.end(); cur++)
	{
		RegExMatch cur_match = (*cur).second;
		UT_continue_if_fail(cur_match.pattern);
		
		// first, skip every match that is before our starting position
		if (cur_match.char_end <= startOffset)
		{
			UT_DEBUGMSG(("Skipping check on pos %d for pattern: %s\n", cur_match.byte_start, cur_match.pattern->name.c_str()));
			continue;
		}
		
		UT_DEBUGMSG(("\tKeep match at pos: %d-%d for pattern: %s\n", cur_match.byte_start, cur_match.byte_end, cur_match.pattern->name.c_str()));
		
		map<int, RegExMatch>::iterator next = cur;
		next++;
		if (next == matches.end())
		{
			UT_DEBUGMSG(("\tWe are the last match, auto-approving\n"));
			break;
		}

		RegExMatch next_match = (*next).second;
		bool checknext = true;
		while (checknext)
		{
			if ((next_match.byte_start <= cur_match.byte_end-1) && 
				(next_match.byte_end-1 <= cur_match.byte_end-1))
			{
				// the current match completely overlaps the next match, so we
				// can remove the next match from our list of valid matches
				UT_DEBUGMSG(("\tDelete match at pos: %d for pattern: %s\n", next_match.byte_start, next_match.pattern->name.c_str()));
	
				map<int, RegExMatch>::iterator del = next;
				next++;
				next_match = (*next).second;
				matches.erase(del);

				if (next == matches.end())
					checknext = false;
			}
			else if ((next_match.byte_start <= cur_match.byte_end-1) &&
					 next_match.byte_end-1 > cur_match.byte_end-1)
			{
				// the next match has its starting point within the current match,
				// and its end point after the end of the current match.
				// for example (c code): 
				//
				//   if (true) { pr/*intf("*/a"); printf("b"); }
				//
				// the closing " of string in this case should be interpreted as
				// the start of a string.
				// to reach that behavior, we invalidate every match after the 
				// current match, and start over from the end point of the current match
				
				UT_DEBUGMSG(("\tPartial overlap found at pos %d, INVALIDATING FROM %d, and auto-approving current\n", next_match.byte_start, cur_match.byte_end));
				matches.erase(next, matches.end());
				_matchRegions(textBuffer, cur_match.char_end, matches);
				UT_DEBUGMSG(("\tDone redoing partial overlap from pos %d\n", cur_match.byte_end));

				// note that we can continue validating from this point onwards, as none of the 
				// already validated matches will be touched by the previous _matchRegions() calls

				// we can auto-approve the current match now, as no other match can overlap this one
				// anymore, given the matches.erase() we just did
				checknext = false;
			}
			else
			{
				UT_DEBUGMSG(("\tNo overlap against current match anymore: %d-%d for pattern: %s\n", cur_match.byte_start, cur_match.byte_end, cur_match.pattern->name.c_str()));
				checknext = false;
			}
		} /* while */
	}
}

bool Highlighter::_matchEndRegex(const UT_GrowBuf& textBuffer, LanguagePattern& pattern, PT_BlockOffset* iOffset)
{
	UT_DEBUGMSG(("Highlighter::_matchEndRegex()\n"));
	if (pattern.end_regex.size() == 0)
		return false;
	
	gchar* utf8str = g_ucs4_to_utf8((const gunichar*)textBuffer.getPointer(0), textBuffer.getLength(), 0, 0, 0);
	UT_DEBUGMSG(("Highlighter::_matchEndRegex() - utf8str: %s\n", utf8str));

	RegExMatch match;
	if (_search(pattern.comp_end_regex, pattern.end_regex, utf8str, (int)g_utf8_strlen(utf8str, -1), 0, &match) >= 0)
	{
		UT_DEBUGMSG(("Found end regex ending at char offset: %d\n", match.char_end));
		*iOffset = match.char_end;
		return true;
	}

	return false;
}

// Rebuilds the text decorations (foreground colors) of the runs of a block
// from the block's region map.
//
// pBL       - block whose runs are recolored; nothing happens when NULL
// runOffset - block offset used to look up the first run to process; from
//             there on every run found by findRunAtOffset() is visited
// matches   - region map of the block; walked in key order, in step with the
//             runs
//
// Every visited text run has its decoration list cleared and is marked
// dirty; runs of other types are skipped. A match that spans several runs
// yields one decoration per run it touches.
void Highlighter::_decorateRuns(fl_BlockLayout* pBL, UT_uint32 runOffset, RegionMap& matches)
{
	UT_return_if_fail(pBL);

	// TODO: use getRun() ourselves, as we iterate way too much now using findRunAtOffset
	// the match iterator is shared by all runs: matches that lie completely
	// before the current run are never revisited
	RegionMap::const_iterator cmit = matches.begin();
	fp_Run* pRun = pBL->findRunAtOffset(runOffset);
	while (pRun)
	{
		runOffset = pRun->getBlockOffset(); // find the actual run offset

		if (pRun->getType() == FPRUN_TEXT)
		{
			// start from a clean slate for this run
			fp_TextRun* pTextRun = static_cast<fp_TextRun*>(pRun);
			vector<fp_TextRunDecoration>& vDecoration = pTextRun->getTextDecorations();
			vDecoration.clear();
			pTextRun->markAsDirty();

			UT_DEBUGMSG(("Got textrun: 0x%x, run block offset: %d\n", pRun, pTextRun->getBlockOffset()));

			while (cmit != matches.end())
			{
				RegExMatch match = (*cmit).second;
				if (match.char_end <= pRun->getBlockOffset())
				{
					// the match is before this run, we're done with the current match
					cmit++; 
				}
				else if (match.char_start >= pRun->getBlockOffset() + pRun->getLength())
				{
					// the match is after this run, we're done with the current run
					break;
				}
				else
				{
					// this match touches this run: clip the match to the run boundaries
					UT_uint32 hit_start = match.char_start > pRun->getBlockOffset() ? match.char_start : pRun->getBlockOffset();
					UT_uint32 hit_end = match.char_end > pRun->getBlockOffset() + pRun->getLength() ? pRun->getBlockOffset() + pRun->getLength() : match.char_end;

					// color the part of the run that is covered by the match
					fp_TextRunDecoration decor;
					decor.setStartOffset(hit_start - pRun->getBlockOffset()); // decorator offsets are relative to the run block offset
					decor.setEndOffset(hit_end - pRun->getBlockOffset());
					decor.setFgColor(_getColor(*match.pattern));
					vDecoration.push_back(decor);

					UT_DEBUGMSG(("Colored textrun: 0x%x, run block offset: %d, length: %d, decor.start: %d, decor.end: %d, match.char_end: %d\n", pRun, pTextRun->getBlockOffset(), pRun->getLength(), decor.getStartOffset(), decor.getEndOffset(), match.char_end));

					// a match that ends inside this run is finished; one that
					// extends beyond it is kept for the next run
					if (match.char_end <= pRun->getBlockOffset() + pRun->getLength())
						cmit++; // we're done with the current match

					if (hit_end == pRun->getBlockOffset() + pRun->getLength())
						break; // we're done with the current run
				}
			}
		}

		// continue with the run that starts right behind the current one
		runOffset = pRun->getBlockOffset() + pRun->getLength();
		UT_DEBUGMSG(("Finding run at offset: %d\n", runOffset));
		pRun = pBL->findRunAtOffset(runOffset);
	}
	UT_DEBUGMSG(("Done coloring runs\n"));
}

// Searches 'str' for the first hit of a regular expression, starting at byte
// offset 'start'.
//
// comp_regex - cache slot for the compiled expression; compiled on first use
//              and kept for later calls. Reset to NULL when compiling fails.
// regex      - the expression source (POSIX minimal extended syntax)
// str        - UTF-8 text to search; must not be NULL or empty
// length     - length of 'str' in BYTES
// start      - byte offset at which the search starts; 0 <= start < length
// pMatch     - on a hit receives the byte and character offsets of the hit;
//              the remaining members are left untouched
//
// Returns the byte offset of the hit, -1 when there is no hit, or -2 on
// invalid arguments or when the expression cannot be compiled.
int Highlighter::_search(regex_t*& comp_regex, const string& regex, gchar* str, int length, int start, RegExMatch* pMatch)
{
	//UT_DEBUGMSG(("Highlighter::_search() - regex: >%s<, str: >%s<, length: %d, start: %d\n", regex.c_str(), str, length, start));

	UT_return_val_if_fail(regex.size() > 0, -2);
	UT_return_val_if_fail(str != NULL && *str != '\0', -2);
	UT_return_val_if_fail(length > 0, -2);
	UT_return_val_if_fail(start >= 0 && start < length, -2);
	UT_return_val_if_fail(pMatch, -2);

	if (!comp_regex)
	{
		comp_regex = new regex_t();
		//UT_DEBUGMSG(("Compiling regex in 0x%x\n", comp_regex));

		re_syntax_options = RE_SYNTAX_POSIX_MINIMAL_EXTENDED;
		comp_regex->translate = NULL;
		comp_regex->fastmap = reinterpret_cast<char*>(g_malloc(256));
		comp_regex->allocated = 0;
		comp_regex->buffer = NULL;

		// precompile our regular expression, and cache the results	
		const char *res = re_compile_pattern(regex.c_str(), regex.size(), comp_regex);
		if (res != NULL)
		{
			UT_ASSERT_HARMLESS(UT_SHOULD_NOT_HAPPEN);
			FREEP(comp_regex->fastmap);
			DELETEP(comp_regex);
			return -2; /* internal error */
		}

		if (re_compile_fastmap(comp_regex) != 0)
		{
			UT_ASSERT_HARMLESS(UT_SHOULD_NOT_HAPPEN);
			regfree(comp_regex);
			DELETEP(comp_regex);
			return -2; /* internal error */
		}
	}
	//UT_DEBUGMSG(("Using compiled regex: 0x%x\n", comp_regex));

	// The register block lives on our stack and is fresh for every call, but
	// the pattern buffer remembers (in regs_allocated) that the registers of
	// the PREVIOUS call were already allocated. Left alone, a second hit on a
	// cached pattern makes the matcher reuse the uninitialized pointers below.
	// Resetting the state before every search makes the matcher allocate a new
	// register block each time, which we release again further down. This is
	// what allows the compiled expression to stay cached after a hit.
	re_registers regs;
	regs.num_regs = 0;
	regs.start = NULL;
	regs.end = NULL;
	comp_regex->regs_allocated = REGS_UNALLOCATED;

	int startpos = -1; /* no match */
	const int p = re_search(comp_regex, str, length, start, length, &regs);
	if (p >= 0)
	{
		// UT_DEBUGMSG(("match at pos %d for regex: %s\n", p, regex.c_str()));
		startpos = p;

		// byte index
		pMatch->byte_start = p;
		pMatch->byte_end = regs.end[0];

		// character index
		pMatch->char_start = g_utf8_pointer_to_offset(str, str + p); 
		pMatch->char_end = g_utf8_pointer_to_offset(str, str + regs.end[0]);

		//UT_DEBUGMSG(("  start: %d, end: %d   growbuf: start: %d, end: %d\n", pMatch->byte_start, pMatch->byte_end, pMatch->char_start, pMatch->char_end));
	}

	// the matcher allocates the register arrays with malloc(); they are only
	// set on a hit, and free(NULL) is harmless otherwise
	free(regs.start);
	free(regs.end);

	return startpos;
}
