#!/usr/bin/env python
# -*- Mode: python -*-
#
# Copyright (C) 2002-2003 Mark Ferrell <xrxgrok@yahoo.com>
# Copyright (C) 2000-2001 The ViewCVS Group. All Rights Reserved.
#
# By using this file, you agree to the terms and conditions set forth in
# the LICENSE.html file which can be found at the top level of the ViewCVS
# distribution or at http://viewcvs.sourceforge.net/license-1.html.
#
# Contact information:
#   Greg Stein, PO Box 760, Palo Alto, CA, 94302
#   gstein@lyra.org, http://viewcvs.sourceforge.net/
#
# -----------------------------------------------------------------------
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
#   1. Redistributions of source code must retain the above copyright notice,
#      this list of conditions and the following disclaimer.
#
#   2. Redistributions in binary form must reproduce the above copyright
#      notice, this list of conditions and the following disclaimer in the
#      documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
# -----------------------------------------------------------------------
#
# This file heavily borrows from revision 1.23 of the rlog.py script originally
# distributed with ViewCVS.  As such, this file is released under the original
# license it was distributed with.
#

import sys
import string
import re
import calendar
import time
import os

import StorageLayer
from StorageLayer import WIPRevision
import SCM

class error(Exception):
	"""Raised when the log output being parsed does not have the expected layout.

	This used to be the string exception 'Parser error'.  String
	exceptions can no longer be raised since Python 2.6 (the raise
	statement itself fails with a TypeError), so it is now a real
	exception class.  The lowercase name is kept so that existing
	``raise error, "message"`` statements and ``except error:`` clauses
	keep working unchanged.
	"""
class EndOfInputError(StandardError):
	"""Signals that the parser's input stream has no more lines to read."""

class Parser:
	"""Parser for the textual output of the CVS log / rlog commands.

	The whole input stream is consumed by the constructor.  Afterwards the
	parsed revisions are available through revisions(), authors() and
	author(), as well as through the public attributes revisions_list,
	revs_by_file and revs_by_author.
	"""

	# Verbosity levels accepted by the constructor's "verbose" argument.
	OUTPUT_NONE = 0
	OUTPUT_NORM = 1
	OUTPUT_VERB = 2
	OUTPUT_DEBG = 3

	# Branch name compared against rev.branch to detect revisions that
	# have not been assigned to a named branch yet.
	BRANCH_MAIN = 'MAIN'

	# Literal separator lines used in the log output: the first one sits
	# between two revisions of a file, the second one ends a file entry.
	_rlog_commit_sep = '----------------------------\n'
	_rlog_end = '=============================================================================\n'

	# Regular expressions used by the output parser.
	_re_unknown_file = re.compile(r"^\? .*$")
	_re_rcs_file = re.compile(r"^(?:RCS file: )?(.*?)(Attic/)?([^/]*),v$")
	_re_working_file = re.compile(r"^Working file: (.*)$")
	_re_symbolic_name = re.compile(r"\s+([^:]+):\s+(.+)$")
	_re_total_revs = re.compile(r"^total revisions:\s+(\d+);\s+selected\s+revisions:\s+(\d+)$")

	_re_rev_start = re.compile(r"^revision\s+([0-9.]+).*")

	# Data line of a revision that carries a "lines: +N -M" field ...
	_re_rev_data_add = re.compile(
		r"^date:\s+(\d+)/(\d+)/(\d+)\s+(\d+):(\d+):(\d+);\s+"
		r"author:\s+([^;]+);\s+"
		r"state:\s+([^;]+);\s+"
		r"lines:\s+\+(\d+)\s+\-(\d+)$")

	# ... and of a revision that does not.
	_re_rev_data = re.compile(
		r"^date:\s+(\d+)/(\d+)/(\d+)\s+(\d+):(\d+):(\d+);\s+"
		r"author:\s+([^;]+);\s+"
		r"state:\s+([^;]+);$")
	_re_rev_branches = re.compile(r"^branches:\s+([0-9.]+);")


	def __init__(self, config, pipe=None, verbose=OUTPUT_NONE):
		"""Read and parse the complete log stream available on pipe.

		config  -- object providing the "use_rlog" flag and, when that
		           flag is true, the "reposloc" repository location
		           (both are read by _parse_filename)
		pipe    -- file-like object offering readline() and close()
		verbose -- one of the OUTPUT_* levels

		Parsing stops when the input is exhausted; the pipe is closed
		at that point.  Malformed input makes the helpers raise
		"error" or ValueError, which propagate to the caller.
		"""
		# revision data stores
		self.revisions_list = []
		self.revs_by_file = {}
		self.revs_by_author = {}
		self.revs_on_branch = {}
		self._verbose = verbose

		# We try to only store a text segment once in memory, so we
		# check these for proper matches before going off and using the
		# ones we read from a file.  A bit slower, but worth the memory
		# savings in large archives.
		self._files = {}
		self._authors = {}
		self._branches = {}
		self._tags = {}
		self._logs = {}

		self._pipe = pipe
		# Lines handed back through putline(); served before the pipe.
		self._queued_lines = []

		if verbose >= Parser.OUTPUT_VERB:
			start_time = time.time()
			sys.stderr.write("Parser started @ %s\n" % time.ctime(start_time))
		try:
			# Parse out the entire repository history.  The loop is
			# only ever left through EndOfInputError, raised by
			# readline() once the stream is exhausted.
			while 1:
				filename = self._parse_filename(config)
				(branches, tags) = self._parse_sym_names()
				total_revs = self._parse_total_revs()
				if total_revs == 0:
					# Nothing selected for this file: skip
					# the remainder of its entry.
					self._debug("Skip to end of file")
					line = ''
					while not line == Parser._rlog_end:
						line = self.readline()
					self._debug("Done skipping")
					continue
				self._purge_description()
				# _parse_revs returns the number of revisions
				# still expected, or None at the end marker.
				while total_revs:
					self._debug("Reading next revision...")
					total_revs = self._parse_revs(filename, tags, branches, total_revs)
				self._debug("Ready for next file")
		except EndOfInputError:
			self._debug("End of input stream reached.")
			# Clean up.
			self._pipe.close()

		# Keep the revision list sorted.
		# NOTE(review): the resulting order is whatever comparison
		# WIPRevision defines -- confirm in StorageLayer.
		self.revisions_list.sort()

		if verbose >= Parser.OUTPUT_VERB:
			end_time = time.time()
			sys.stderr.write("Parser finished @ %s\n" % time.ctime(end_time))

	def readline(self):
		"""Return the next input line, including its line terminator.

		Lines pushed back with putline() are returned first, then the
		pipe is read.  Raises EndOfInputError when no line is left.
		"""
		if self._queued_lines:
			line = self._queued_lines.pop(0)
		else:
			line = self._pipe.readline()
		# Have we reached the end of our input?
		if not line:
			raise EndOfInputError
		self._debug("INPUT> %s" % string.rstrip(line))
		return line

	def putline(self, line):
		"""Queue line so that a later readline() call returns it again."""
		self._queued_lines.append(line)

	# Various Query interfaces
	def revisions(self):
		"""Return the list of all parsed revisions."""
		return self.revisions_list

	def authors(self):
		"""Return the names of all authors seen while parsing."""
		return (self.revs_by_author.keys())

	def author(self, author):
		"""Return the revisions recorded for author (KeyError if unknown)."""
		return self.revs_by_author[author]

	def _debug(self, string):
		"""Write a message to stderr when running at debug verbosity."""
		if self._verbose >= Parser.OUTPUT_DEBG: sys.stderr.write("%s\n" % string)

	def _parse_sym_names(self):
		"""Parse the "symbolic names:" table of the current file entry.

		Returns a (branch_hash, tag_hash) tuple.  Both map a symbolic
		name to a revision number; branch_hash holds the names that
		designate branches (with CVS magic branch numbers already
		normalised, e.g. 1.3.0.2 becomes 1.3.2), tag_hash holds the
		names that designate a single revision.
		"""
		self._debug("_parse_sym_names")

		# Find the start of the symbolic name table
		line = self.readline()
		while line:
			if line == "symbolic names:\n":
				self._debug("found symbolic names table")
				break
			line = self.readline()

		tag_hash = {}
		branch_hash = {}

		# Parse all the table entries; the table ends at the first
		# line that does not look like "<whitespace>name: revision".
		line = self.readline()
		while line:
			match = Parser._re_symbolic_name.match(line)

			if not match: break

			(tag, revision) = match.groups()
			self._debug("found symbolic name %s for %s" % (tag, revision))


			# "Technically" in CVS all branches have an odd
			# number of numeric positions, i.e. X.X.X has 3, so all
			# branches have an even number of '.'s.  In reality,
			# CVS only really holds this true if someone has gone
			# out of their way to use the CVS admin command to
			# associate a true branch number to a CVS magic branch,
			# most annoying.  CVS magic branches contain a 0 as the
			# second to last position, i.e. X.X.X.0.1.  CVS claims
			# they do this for performance reasons, though one
			# would assume that counting the number of '.'s would
			# work just as well, if not better considering that the
			# X.X.X.0.X format doesn't comply with RCS's handling
			# of branches.  So, if there is a .0, then we have
			# to remove it so we can check for even/odd number of
			# '.' in the version in order to decide if it is a
			# branch or a revision.
			index = string.rfind(revision, '.')
			if revision[index-2:index] == '.0':
				revision = revision[:index-2] + revision[index:]
				self._debug("magic branch mangled to %s" % revision)

			# check to see if we have an even or odd number of '.'s
			# Even means we have an odd number of X.X.X, so it's a
			# branch.  Odd is a revision on a branch (or on the MAIN)
			if (string.count(revision, '.') % 2) == 0:
				branch_hash[tag] = revision
				self._debug("Added symbolic name %s to branch hash" % tag)
			else:
				tag_hash[tag] = revision
				self._debug("Added symbolic name %s to tag hash" % tag)

			line = self.readline()
		return (branch_hash, tag_hash)


	def _parse_total_revs(self):
		"""Skip ahead to the "total revisions" line of the current entry.

		Returns the number of *selected* revisions as an integer.
		"""
		self._debug("_parse_total_revs")
		line = self.readline()
		while line:
			match = Parser._re_total_revs.match(line)

			if match:
				(total_revs, requested_revs) = match.groups()
				self._debug("total revs %s, requested revs %s" % (total_revs, requested_revs))
				return string.atoi(requested_revs)
			line = self.readline()


	def _parse_revs(self, filename, tags, branches, count):
		"""Parse a single revision entry of filename and record it.

		filename -- name of the file the revision belongs to
		tags     -- tag name -> revision mapping for this file; entries
		            are removed from it as they are attached to a revision
		branches -- branch name -> branch number mapping for this file
		count    -- number of revisions still expected, this one included

		Returns count - 1, or None when the end-of-entry marker is
		found where a revision was expected.  Raises "error" when the
		revision header or data line cannot be recognised and
		ValueError when the entry ends early or carries an unusable
		year.
		"""
		self._debug("_parse_revs")
		count -= 1
		line = self.readline()

		# Since FreeBSD's rlog outputs an extra "---...---\n" before
		# "===...===\n", _rlog_end may occur here.
		if not line or line == Parser._rlog_end:
			self._debug("EOF")
			return None

		# parse through revision entries
		match = Parser._re_rev_start.match(line)
		if not match:
			raise error, "bad rlog parser, no revision"

		(revision,) = match.groups()
		self._debug("found revision %s" % revision)

		# data line: try the form without a line count first, then
		# the form with one
		line = self.readline()
		match = Parser._re_rev_data.match(line)
		if not match:
			match = Parser._re_rev_data_add.match(line)

		if not match:
			raise error, "bad rlog parser, no cookie!"

		groups = match.groups()
		year = string.atoi(groups[0])
		month = string.atoi(groups[1])
		day = string.atoi(groups[2])
		hour = string.atoi(groups[3])
		minute = string.atoi(groups[4])
		second = string.atoi(groups[5])
		author = groups[6]
		state = groups[7]

		self._debug("groups %d %d %d %d:%d:%d %s %s" % (year, month, day, hour, minute, second, author, state))

		# very strange; here's the deal: if this is a newly added file,
		# then there is no plus/minus count of lines; if there is, then
		# this could be a "CHANGE" or "REMOVE", you can tell if the
		# file has been removed by looking if state == 'dead'.
		# Only _re_rev_data_add yields groups 8 and 9, so the
		# IndexError below means "no line count on the data line".
		try:
			pluscount = int(groups[8])
			minuscount = int(groups[9])
		except IndexError:
			pluscount = 0
			minuscount = 0
			if state == 'dead':
				cmit_type = SCM.Revision.Revision.PLACEHOLDER
			else:	cmit_type = SCM.Revision.Revision.ADD
		else:
			if state == 'dead':
				cmit_type = SCM.Revision.Revision.REMOVE
			else:	cmit_type = SCM.Revision.Revision.CHANGE

		# branch line: pretty much ignored if it's there FIXME we must
		# check the value of the "branches: " key here, if it exists to
		# see if this is a branch point in the repository.
		# When the line is not a "branches:" line it already is the
		# first line of the log message.
		log_list = []
		line = self.readline()
		branches_line = 0
		match = Parser._re_rev_branches.match(line)
		if not match:
			log_list.append(string.rstrip(line))
		else:
			branches_line = match.groups()[0]
			self._debug("found branch line \"%s\"" % branches_line)
			# ignore the 1.1.1 branch; AFAICT, it is the
			# initial revision's (the initial vendor
			# branch). Not ignoring it causes the initial
			# import to be split up into multiple
			# changesets - ugh.
			if branches_line == "1.1.1":
				branches_line = 0
				self._debug("	Ignoring Initial revision's branch line")

		# suck up the log
		while 1:
			line = self.readline()

			# the last line printed out by rlog is '===='...  or
			# '------'... between entries
			if line == Parser._rlog_commit_sep:
				# Peek at the line that follows the commit separator.
				peek_line = self.readline()
				self.putline(peek_line)
				match = Parser._re_rev_start.match(peek_line)
				if not match:
					# Apparently the author has included the CVS
					# revision separator line as part of the commit
					# message.  Continue slurping the commit message.
					pass
				elif log_list and log_list[-1] == '':
					# We have a revision separator line followed by a
					# new revision indicator, but the separator was
					# preceded by an empty line.  Treat it as part
					# of the commit message as well.
					pass
				else:
					break
			elif line == Parser._rlog_end:
				if count:
					raise ValueError("reached end of revs with %d revs remaining" % count)
				break

			# append line to the log list
			log_list.append(string.strip(line))

		log = string.join(log_list, '\n')
		self._debug("log:\n %s\n----\n" % log)

		# compute time using time routines in seconds from epoch GMT
		# timegm needs a four digit date. Be paranoid about it.
		EPOCH = 1970
		if year < EPOCH:
			if year < 70:
				year = year + 2000
			else:
				year = year + 1900
			if year < EPOCH:
				raise ValueError('invalid year')
		gmt_time = calendar.timegm((year, month, day, hour, minute, second, 0, 0, 0))

		self._debug("building new revision")

		# Build the revision making certain to reuse existing data
		# wherever possible: setdefault() hands back the string object
		# stored on first sight, so equal strings are kept only once.
		rev = WIPRevision()
		filename = self._files.setdefault(filename, filename)
		rev.filename = filename
		rev.type = cmit_type
		rev.revision = revision
		author = self._authors.setdefault(author, author)
		rev.author = author

		log = self._logs.setdefault(log, log)
		rev.log = log
		rev.time = gmt_time
		rev.addcount = pluscount
		rev.delcount = minuscount

		self._debug("finding tags")
		# Add any tags to this revision.  keys() is a separate list
		# here, so removing consumed tags inside the loop is safe.
		for tag in tags.keys():
			if tags[tag] == revision:
				tag = self._tags.setdefault(tag, tag)
				rev.tags.append(tag)
				del(tags[tag])

		# If this is the last revision for this file we need to check
		# our tags to validate that we have consumed them all.  Since a
		# tag can only reference a single revision, then by the time we
		# are at the last revision all the tags should be gone.  If not
		# we have an issue.
		#if not count and len(tags.keys()) > 0:
		#	sys.stderr.write('Unused tags %s on file %s\n' % (tags.keys(), rev.filename()))

		# Branches never show up as real revisions.  They only occur
		# as a sub-reference to an existing revision and in the
		# symnames table.  Similarly, we treat branches strictly as a
		# symbolic name reference.  Branches to us are just a hash
		# key, and off each key will be a list of revisions on that
		# branch.  All revisions must exist on a branch, if they are
		# not on any branch, we place them on the .MAIN branch.  All we
		# do here is validate that we indeed have a key entry for this
		# branch in the branch hash and then tell revision_add to add
		# it to a particular branch.
		self._debug("finding branches")
		index = string.rfind(revision, '.')
		for branch in branches.keys():
			branchRev    = branches[branch] ## ie. 1.3.2, not 1.3.0.2
			branchParent = branchRev[:branchRev.rfind('.')]
			# this branch was created from this revision
			if branchParent == revision:
				self._debug('Revision %s is parent of branch %s; creating fake revision' % (revision, branch))
				metaRev = WIPRevision()
				if rev.type == SCM.Revision.Revision.CHANGE:
					metaRev.type = SCM.Revision.Revision.ADD
				else:	metaRev.type = rev.type
				metaRev.tags[:]	 = rev.tags[:]
				metaRev.revision = rev.revision
				metaRev.filename = rev.filename
				metaRev.time	 = rev.time
				metaRev.author	 = 'none'
				metaRev.log	 = 'Creation of branch %s' % branch
				metaRev.branch	 = branch
				metaRev.addcount, metaRev.delcount = 0,0
				self.propagateRevision(metaRev)
			# this revision is directly on a branch
			if rev.branch == self.BRANCH_MAIN and branchRev == revision[:index]:
				self._debug("found revision %s on branch %s [%s]" % (revision, branch, branchRev))
				branch = self._branches.setdefault(branch, branch)
				rev.branch = branch

		self.propagateRevision(rev)

		return count

	def propagateRevision(self, rev):
		"""Record rev in the revision list and in the per-file and
		per-author indexes.

		rev must provide the "filename" and "author" attributes.
		"""
		self._debug("adding new revision to revision list")
		self.revisions_list.append(rev)
		self._debug("adding new revision to file hash")
		# Index by the revision's own file name.  (This used to index
		# by the builtin "file" type, which lumped the revisions of
		# every file together under a single meaningless key.)
		if self.revs_by_file.has_key(rev.filename):
			self.revs_by_file[rev.filename].append(rev)
		else:	self.revs_by_file[rev.filename] = [rev]
		if self.revs_by_author.has_key(rev.author):
			self.revs_by_author[rev.author].append(rev)
		else:	self.revs_by_author[rev.author] = [rev]

	def _parse_filename(self, config):
		"""Read the header of the next file entry and return its name.

		With config.use_rlog set, the name is derived from the
		"RCS file:" line, with the part it has in common with
		config.reposloc stripped off; otherwise the name is taken
		from the "Working file:" line.  Raises "error" when no file
		name can be found.
		"""
		self._debug("_parse_filename")
		line = self.readline()

		# validate we still have a file to read from
		if not line:
			self._debug("empty line")
			return None

		# Some checked out archives will have unknown files that the
		# user may have added to the directory.  These will be reported
		# by CVS as something like: ? foo.tmp
		match = Parser._re_unknown_file.match(line)
		while match:
			line = self.readline()
			match = Parser._re_unknown_file.match(line)

		# file history starts with a newline often enough
		if line == '\n':
			line = self.readline()

		if config.use_rlog:
			# Find the "RCS File" which starts an entry
			self._debug("finding rcsfile name")
			match = Parser._re_rcs_file.match(line)
			if match:
				# Directory and base name, leaving out "Attic/"
				# (group 2) and the ",v" suffix.
				filename = os.path.normpath(os.path.join(match.group(1), match.group(3)))
				filename = filename[len(os.path.commonprefix([filename, config.reposloc])):]
				if filename[0] == '/': filename = filename[1:]
				return filename
		else:
			# Ignore the "RCS File" and go for the "Working File" entry
			self._debug("finding working file name")
			match = Parser._re_rcs_file.match(line)
			if match:
				## TODO: actually look at the RCS filename and
				##       determine the delta between it and the working
				##       filename (for later rlog usage)
				line = self.readline()
			match = Parser._re_working_file.match(line)
			if match:
				filename = match.group(1)
				return filename
		raise error, 'Unable to find filename -- parser in bad state?'

	def _purge_description(self):
		"""Skip the "description:" section of the current file entry.

		Input is consumed up to and including the commit separator
		line that follows the description.
		"""
		self._debug("_purge_description")
		# Find and purge through the description.  Directly after the
		# description should be the commit separator
		line = self.readline()
		while line:
			if line[:12] == 'description:':
				self._debug("found start of description")
				break
			line = self.readline()

		# purge out the description
		self._debug("description purged")
		line = self.readline()
		while line:
			if line == Parser._rlog_commit_sep:
				break
			line = self.readline()

if __name__ == '__main__':
	import getopt
	import CVS

	config = CVS.Config()

	deltas = 0
	reverse = 0
	verbose = Parser.OUTPUT_NORM
	num = 0
	path = None
	
	class BadUsage: pass
	try:
		opts, args = getopt.getopt(sys.argv[1:], 'D:drvqh')

		for opt, val in opts:
			if opt == '-r':
				reverse = 1
			elif opt == '-v':
				verbose = Parser.OUTPUT_VERB
			elif opt == '-d':
				verbose = Parser.OUTPUT_DEBG
			elif opt == '-q':
				verbose = Parser.OUTPUT_NONE
			elif opt == '-D':
				class MyConfig(CVS.Config):
					reposloc = val
				config = MyConfig()
			elif opt == '-h':
				raise BadUsage

		if args:
			if len(args) > 1: raise BadUsage
			path = args[0]

		config.use_rlog = 1

		parser = Parser(config, sys.stdin, verbose)
		revisions = parser.revisions()

		if verbose >= Parser.OUTPUT_VERB:

			if reverse: revisions.reverse()

			for revision in revisions:
				print "==================="
				sys.stdout.write("%s: %s" % (revision.filename, revision.revision))

				if revision.type == SCM.Revision.Revision.REMOVE:
					print " [dead]"
				elif revision.type == SCM.Revision.Revision.PLACEHOLDER:
					print " [placeholder]"
				elif revision.type != SCM.Revision.Revision.ADD:
					print " +%d-%d" % (revision.addcount, revision.delcount)
				else:	sys.stdout.write("\n")

				if revision.tags:
					print  "Tags: %s" % revision.tags

				print "Author: %s" % revision.author
				print "Date: %s" % (time.asctime(time.gmtime(revision.time)))
				print "\nLog:\n%s\n" % revision.log
			print "-------------------------------------------------------------------"

		if verbose >= Parser.OUTPUT_NORM:
			print "total revisions %d" % len(revisions)
			for author in parser.authors():
				print " %d by %s " % (len(parser.byAuthor(author)), author)
			print "elapsed time: %d seconds" % time.clock()


	except (getopt.error, BadUsage):
		print """usage %s: [-drvh]

	-D root		module root in repository
	-r		reverse sort order
	-v		verbose output [default]
	-d		display debugging information
	-q		supress normal output
	-h		display this help
"""

# tag: Mark Ferrell Fri May 23 11:03:48 CDT 2003 (modules/CVS/Parser.py)
