;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: FLEXI-STREAMS; Base: 10 -*-
;;; $Header: /usr/local/cvsrep/flexi-streams/input.lisp,v 1.15 2006/06/12 23:26:12 edi Exp $

;;; Copyright (c) 2005-2006, Dr. Edmund Weitz.  All rights reserved.

;;; Redistribution and use in source and binary forms, with or without
;;; modification, are permitted provided that the following conditions
;;; are met:

;;;   * Redistributions of source code must retain the above copyright
;;;     notice, this list of conditions and the following disclaimer.

;;;   * Redistributions in binary form must reproduce the above
;;;     copyright notice, this list of conditions and the following
;;;     disclaimer in the documentation and/or other materials
;;;     provided with the distribution.

;;; THIS SOFTWARE IS PROVIDED BY THE AUTHOR 'AS IS' AND ANY EXPRESSED
;;; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
;;; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
;;; ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
;;; DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
;;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
;;; GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
;;; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
;;; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

(in-package :flexi-streams)

(defun read-byte* (flexi-input-stream &optional eof-value)
  "Reads one byte \(octet) from the underlying stream of
FLEXI-OUTPUT-STREAM \(or from the internal stack if it's not
empty)."
  (incf (flexi-stream-position flexi-input-stream))
  (or (pop (flexi-stream-octet-stack flexi-input-stream))
      ;; we use READ-SEQUENCE because READ-BYTE doesn't work with all
      ;; bivalent streams in LispWorks (4.4.6)
      (let* ((buffer (make-array 1 :element-type 'octet))
             (position (read-sequence buffer (flexi-stream-stream flexi-input-stream))))
        (cond ((zerop position)
               (decf (flexi-stream-position flexi-input-stream))
               eof-value)
              (t (aref buffer 0))))))

(defun read-char-8-bit (flexi-input-stream encoding-table)
  "Reads one octet from the stream underlying FLEXI-INPUT-STREAM
and converts it into a character code using the array
ENCODING-TABLE."
  (let* ((octet (or (read-byte* flexi-input-stream)
                    (return-from read-char-8-bit :eof)))
         (char-code (aref encoding-table octet)))
    (when (or (and (= char-code 65533)
                   (not *use-replacement-char*))
              (null char-code))
      (signal-encoding-error flexi-input-stream
                              "No character which correspondes to octet #x~X."
                              octet))
    char-code))

(defun read-char-utf-32 (flexi-input-stream)
  "Reads four octets from the stream underlying
FLEXI-INPUT-STREAM and converts them into the character code
determined by the UTF-32 encoding and the endianess of the
current external format of FLEXI-INPUT-STREAM."
  (let (first-octet-seen)
    (flet ((read-next-byte ()
             (prog1
                 (or (read-byte* flexi-input-stream)
                     (cond (first-octet-seen
                            (signal-encoding-error flexi-input-stream
                                                    "End of file while in UTF-32 sequence."))
                           (t (return-from read-char-utf-32 :eof))))
               (setq first-octet-seen t))))
      (loop for count in (if (external-format-little-endian
                              (flexi-stream-external-format flexi-input-stream))
                           '(0 8 16 24)
                           '(24 16 8 0))
            for octet = (read-next-byte)
            sum (ash octet count)))))

(defun read-char-utf-16 (flexi-input-stream)
  "Reads two or four octets from the stream underlying
FLEXI-INPUT-STREAM and converts them into the character code
determined by the UTF-16 encoding and the endianess of the
current external format of FLEXI-INPUT-STREAM."
  (let (first-octet-seen)
    (labels ((read-next-byte ()
               (prog1
                   (or (read-byte* flexi-input-stream)
                       (cond (first-octet-seen
                              (signal-encoding-error flexi-input-stream
                                                      "End of file while in UTF-16 sequence."))
                             (t (return-from read-char-utf-16 :eof))))
                 (setq first-octet-seen t)))
             (read-next-word ()
               (cond ((external-format-little-endian
                       (flexi-stream-external-format flexi-input-stream))
                      (+ (read-next-byte)
                         (ash (read-next-byte) 8)))
                     (t (+ (ash (read-next-byte) 8)
                           (read-next-byte))))))
      (let ((word (read-next-word)))
        (cond ((<= #xd800 word #xdfff)
               (let ((next-word (read-next-word)))
                 (unless (<= #xdc00 next-word #xdfff)
                   (signal-encoding-error flexi-input-stream
                                          "Unexpected UTF-16 word #x~X following #x~S."
                                          next-word word))
                 (+ (ash (logand #b1111111111 word) 10)
                    (logand #b1111111111 next-word)
                    #x10000)))
              (t word))))))

(defun read-char-utf-8 (flexi-input-stream)
  "Reads up to six octets from the stream underlying
FLEXI-INPUT-STREAM and converts them into the character code
determined by the UTF-8 encoding."
  (let (first-octet-seen)
    (flet ((read-next-byte ()
             (prog1
                 (or (read-byte* flexi-input-stream)
                     (cond (first-octet-seen
                            (signal-encoding-error flexi-input-stream
                                                   "End of file while in UTF-8 sequence."))
                           (t (return-from read-char-utf-8 :eof))))
               (setq first-octet-seen t))))
      (let ((octet (read-next-byte)))
        (multiple-value-bind (start count)
            (cond ((zerop (logand octet #b10000000))
                   (values octet 0))
                  ((= #b11000000 (logand octet #b11100000))
                   (values (logand octet #b00011111) 1))
                  ((= #b11100000 (logand octet #b11110000))
                   (values (logand octet #b00001111) 2))
                  ((= #b11110000 (logand octet #b11111000))
                   (values (logand octet #b00000111) 3))
                  ((= #b11111000 (logand octet #b11111100))
                   (values (logand octet #b00000011) 4))
                  ((= #b11111100 (logand octet #b11111110))
                   (values (logand octet #b00000001) 5))
                  (t (signal-encoding-error flexi-input-stream
                                            "Unexpected value #x~X at start of UTF-8 sequence."
                                             octet)))
          ;; note that we currently don't check for "overlong"
          ;; sequences or other illegal values
          (loop for result = start then (+ (ash result 6)
                                           (logand octet #b111111))
                repeat count
                for octet = (read-next-byte)
                unless (= #b10000000 (logand octet #b11000000))
                do (signal-encoding-error flexi-input-stream
                                          "Unexpected value #x~X in UTF-8 sequence."
                                           octet)
                finally (return result)))))))

(defmethod stream-check-eof-no-hang ((stream flexi-input-stream))
  "Simply calls the corresponding method for the underlying input
stream."
  (stream-check-eof-no-hang (flexi-stream-stream stream)))

(defmethod stream-clear-input ((stream flexi-input-stream))
  "Calls the corresponding method for the underlying input stream
and also clears the value of the OCTET-STACK slot."
  ;; note that we don't reset the POSITION slot
  (setf (flexi-stream-octet-stack stream) nil)
  (stream-clear-input (flexi-stream-stream stream)))

(defmethod stream-listen ((stream flexi-input-stream))
  "Calls the corresponding method for the underlying input stream
but first check if \(old) input is available in the OCTET-STACK
slot."
  (or (flexi-stream-octet-stack stream)
      (stream-listen (flexi-stream-stream stream))))

(defmethod stream-read-byte ((stream flexi-input-stream))
  "Reads one byte \(octet) from the underlying stream."
  ;; set LAST-CHAR slot to NIL because we can't UNREAD-CHAR after this
  ;; operation
  (setf (flexi-stream-last-char stream) nil)
  (let ((octet (read-byte* stream :eof)))
    (setf (flexi-stream-last-octet stream)
          (case octet
            (:eof nil)
            (otherwise octet)))
    octet))

(defun unread-char% (char-code flexi-input-stream)
  "Used internally to put a character denoted by the character code
CHAR-CODE which was already read back on the stream.  Uses the
OCTET-STACK slot and decrements the POSITION slot accordingly."
  (let ((octets-read (translate-char char-code flexi-input-stream)))
    (decf (flexi-stream-position flexi-input-stream)
          (length octets-read))
    (setf (flexi-stream-octet-stack flexi-input-stream)
          (append octets-read
                  (flexi-stream-octet-stack flexi-input-stream)))))

(defmethod stream-read-char ((stream flexi-input-stream))
  "Reads enough octets from the underlying stream to return one
character as encoded corresponding to the current external format
of STREAM.  Handles line endings correctly according to the
current EOL style."
  ;; set LAST-OCTET slot to NIL because we can't UNREAD-BYTE after
  ;; this operation
  (setf (flexi-stream-last-octet stream) nil)
  (let* ((external-format (flexi-stream-external-format stream))
         (external-format-name (external-format-name external-format))
         (external-format-eol-style (external-format-eol-style external-format)))
    (flet ((get-char-code ()
             "This internal function does all the work of reading
from the stream and converting to character codes but it doesn't
handle line endings."
             (cond ((ascii-name-p external-format-name)
                    (read-char-8-bit stream +ascii-table+))
                   ((iso-8859-name-p external-format-name)
                    (read-char-8-bit stream
                                     (cdr (assoc external-format-name +iso-8859-tables+
                                                 :test #'eq))))
                   ((code-page-name-p external-format-name)
                    (read-char-8-bit stream
                                     (cdr (assoc (external-format-id external-format)
                                                 +code-page-tables+))))
                   (t (case external-format-name
                        (:utf-8 (read-char-utf-8 stream))
                        (:utf-16 (read-char-utf-16 stream))
                        (:utf-32 (read-char-utf-32 stream)))))))
      (let ((char-code (get-char-code)))
        (when (eq char-code :eof)
          (return-from stream-read-char :eof))
        (let ((char (code-char char-code)))
          (case external-format-eol-style
            (:cr (when (eql char #\Return)
                   (setq char #\Newline)))
            (:lf (when (eql char #\Linefeed)
                   (setq char #\Newline)))
            ;; in the case :CRLF we have to look ahead one character
            (:crlf (when (eql char #\Return)
                     (let ((next-char-code (get-char-code)))
                       (cond ((eql (code-char next-char-code) #\Linefeed)
                              (setq char #\Newline))
                             ((eq next-char-code :eof))
                             ;; if the character we peeked at wasn't a
                             ;; linefeed character we push its
                             ;; constituents back onto our internal
                             ;; octet stack
                             (t (unread-char% next-char-code stream)))))))
          ;; remember this character and the current external format
          ;; for UNREAD-CHAR
          (setf (flexi-stream-last-char stream)
                (cons (if char (char-code char) char-code)
                      external-format))
          (or char char-code))))))

(defmethod stream-read-char-no-hang ((stream flexi-input-stream))
  "Reads one character if the underlying stream has at least one
octet available."
  ;; note that this may block for non-8-bit encodings - there's no
  ;; easy way to handle this correctly
  (and (stream-listen stream)
       (stream-read-char stream)))

(defmethod stream-read-sequence ((stream flexi-input-stream) sequence start end &key)
  "Reads enough input from STREAM to fill SEQUENCE from START to
END.  If SEQUENCE is an array which can store octets we use
STREAM-READ-BYTE to fill it, otherwise we use STREAM-READ-CHAR."
  (let ((reader (cond ((and (arrayp sequence)
                            (subtypep 'octet (array-element-type sequence)))
                       #'stream-read-byte)
                      (t #'stream-read-char))))
    (loop for index from start below end
          for element = (funcall reader stream)
          until (eq element :eof)
          do (setf (elt sequence index) element)
          finally (return index))))

(defmethod stream-unread-char ((stream flexi-input-stream) char)
  "Implements UNREAD-CHAR for streams of type FLEXI-INPUT-STREAM.
Makes sure CHAR will only be unread if it was the last character
read and if it was read with the same encoding that's currently
being used by the stream."
  (let* ((last-char (flexi-stream-last-char stream))
         (last-char-code (car last-char)))
    (unless last-char
      (error 'flexi-stream-simple-error
             :format-control "No character to unread from this stream \(or last reading operation was binary)."))
    (unless (= (char-code char) last-char-code)
      (error 'flexi-stream-simple-error
             :format-control "Last character read was different from ~S."
             :format-arguments (list char)))
    (unless (external-format-equal (cdr last-char)
                                   (flexi-stream-external-format stream))
      (error 'flexi-stream-simple-error
             :format-control "Can't unread because external format of stream has changed since last READ-CHAR."))
    (setf (flexi-stream-last-char stream) nil)
    (unread-char% last-char-code stream)
    nil))

(defun unread-byte (byte flexi-input-stream)
  "Similar to UNREAD-CHAR in that it `unreads' the last octet from
STREAM.  Note that you can only call UNREAD-BYTE after a corresponding
READ-BYTE."
  (let ((last-octet (flexi-stream-last-octet flexi-input-stream)))
    (unless last-octet
      (error 'flexi-stream-simple-error
             :format-control "No byte to unread from this stream \(or last reading operation read a character)."))
    (unless (= byte last-octet)
      (error 'flexi-stream-simple-error
             :format-control "Last byte read was different from #x~X."
             :format-arguments (list byte)))
    (setf (flexi-stream-last-octet flexi-input-stream) nil)
    (decf (flexi-stream-position flexi-input-stream))
    (push byte (flexi-stream-octet-stack flexi-input-stream))
    nil))
    
(defun peek-byte (flexi-input-stream &optional peek-type (eof-error-p t) eof-value)
  "PEEK-BYTE is like PEEK-CHAR, i.e. it returns an octet from
FLEXI-INPUT-STREAM without actually removing it.  If PEEK-TYPE is NIL
the next octet is returned, if PEEK-TYPE is T, the next octet which is
not 0 is returned, if PEEK-TYPE is an octet, the next octet which
equals PEEK-TYPE is returned.  EOF-ERROR-P and EOF-VALUE are
interpreted as usual."
  (loop for octet = (read-byte flexi-input-stream eof-error-p eof-value)
        until (cond ((null peek-type))
                    ((eql octet eof-value))
                    ((eq peek-type t)
                     (plusp octet))
                    (t (= octet peek-type)))
        finally (unless (eql octet eof-value)
                  (unread-byte octet flexi-input-stream))
                (return octet)))
