(define-ccl-program ccl-decode-mule-utf-8
  ;; Layout of the program list (see `define-ccl-program'):
  ;;   (BUFFER_MAGNIFICATION CCL_MAIN_CODE)
  ;; The leading 2 is the buffer magnification.
  ;;
  ;; Register usage in the main block:
  ;;   r0      first byte of the sequence, later a charset id
  ;;   r1-r3   following bytes, later scratch / code point to write
  ;;   r4      flag: non-zero when the bytes read must be kept raw
  ;;   r5      charset id of eight-bit-control (constant)
  ;;   r6      charset id of eight-bit-graphic (constant)
  ;;   r7      remainder left by the `//=' operator
  ;;
  ;; "Kept raw" means: a byte below #x80 is written as is, a byte in
  ;; #x80..#x9f is written as an eight-bit-control character and a
  ;; byte in #xa0..#xff as an eight-bit-graphic character.
  `(2
    ((r5 = ,(charset-id 'eight-bit-control))
     (r6 = ,(charset-id 'eight-bit-graphic))
     (loop
      (read r0)

      ;; 1-byte encoding, i.e. ascii.
      (if (r0 < #x80)
          (write r0)

        ;; #x80..#xbf is a continuation byte (10xxxxxx) and cannot
        ;; start a sequence.  Keep it raw and do NOT consume the next
        ;; byte, so that decoding resynchronizes immediately.
        (if (r0 < #xc0)
            (if (r0 < #xa0)
                (write-multibyte-character r5 r0)
              (write-multibyte-character r6 r0))

          ;; 2-byte encoding: 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx
          (if (r0 < #xe0)
              ((read r1)

               ;; The sequence is invalid when the second byte is not
               ;; a continuation byte, and overlong when the lead byte
               ;; is #xc0 or #xc1 (the scalar would be below #x80).
               (r4 = 0)
               (if ((r1 & #b11000000) != #b10000000)
                   (r4 = 1)
                 (if (r0 < #xc2)
                     (r4 = 1)))

               (if (r4 != 0)
                   ;; Keep both bytes raw.  Here #xc0 <= r0 < #xe0, so
                   ;; r0 is always eight-bit-graphic.
                   ((write-multibyte-character r6 r0)
                    (if (r1 < #x80)
                        (write r1)
                      (if (r1 < #xa0)
                          (write-multibyte-character r5 r1)
                        (write-multibyte-character r6 r1))))

                 ((r0 &= #x1f)
                  (r0 <<= 6)
                  (r1 &= #x3f)
                  (r1 += r0)
                  ;; Now r1 holds the scalar value, #x80 <= r1 < #x800.

                  ;; U+0080..U+009F: eight-bit-control
                  (if (r1 < 160)
                      (write-multibyte-character r5 r1)

                    ;; U+00A0..U+00FF: latin-iso8859-1
                    (if (r1 < 256)
                        ((r0 = ,(charset-id 'latin-iso8859-1))
                         (r1 -= 128)
                         (write-multibyte-character r0 r1))

                      ;; U+0100..U+07FF: mule-unicode-0100-24ff.
                      ;; The offset from #x0100 is split into a row
                      ;; (offset / 96) and a column (offset % 96),
                      ;; each biased by 32, combined as (row << 7) | col.
                      ((r0 = ,(charset-id 'mule-unicode-0100-24ff))
                       (r1 -= #x0100)
                       (r2 = (((r1 / 96) + 32) << 7))
                       (r1 %= 96)
                       (r1 += (r2 + 32))
                       (write-multibyte-character r0 r1)))))))

            ;; 3-byte encoding:
            ;; zzzzyyyyyyxxxxxx = 1110zzzz 10yyyyyy 10xxxxxx
            (if (r0 < #xf0)
                ((read r1 r2)

                 ;; r4 is set to 1 if the encoding is invalid.
                 (r4 = 0)

                 ;; r3 = top two bits of r1 and top two bits of r2
                 ;; packed together; both must be `10'.
                 (r3 = (r1 & #b11000000))
                 (r3 |= ((r2 >> 2) & #b00110000))
                 (if (r3 != #b10100000)
                     (r4 = 1)
                   ((r3 = ((r0 & #x0f) << 12))
                    (r3 += ((r1 & #x3f) << 6))
                    (r3 += (r2 & #x3f))
                    ;; Now r3 holds the scalar value; below #x0800 the
                    ;; sequence is overlong.
                    (if (r3 < #x0800)
                        (r4 = 1))))

                 (if (r4 != 0)
                     ;; Invalid 3-byte sequence: keep the bytes raw.
                     ;; Here #xe0 <= r0 < #xf0, so r0 is always
                     ;; eight-bit-graphic.
                     ((write-multibyte-character r6 r0)
                      (if (r1 < #x80)
                          (write r1)
                        (if (r1 < #xa0)
                            (write-multibyte-character r5 r1)
                          (write-multibyte-character r6 r1)))
                      (if (r2 < #x80)
                          (write r2)
                        (if (r2 < #xa0)
                            (write-multibyte-character r5 r2)
                          (write-multibyte-character r6 r2))))

                   ;; U+0800..U+24FF: mule-unicode-0100-24ff.
                   ;; `//=' leaves the quotient in r3 and the
                   ;; remainder in r7.
                   ((if (r3 < #x2500)
                        ((r0 = ,(charset-id 'mule-unicode-0100-24ff))
                         (r3 -= #x0100)
                         (r3 //= 96)
                         (r1 = (r7 + 32))
                         (r1 += ((r3 + 32) << 7))
                         (write-multibyte-character r0 r1))

                      ;; U+2500..U+33FF: mule-unicode-2500-33ff
                      (if (r3 < #x3400)
                          ((r0 = ,(charset-id 'mule-unicode-2500-33ff))
                           (r3 -= #x2500)
                           (r3 //= 96)
                           (r1 = (r7 + 32))
                           (r1 += ((r3 + 32) << 7))
                           (write-multibyte-character r0 r1))

                        ;; U+3400..U+DFFF: no target charset, keep the
                        ;; three bytes raw.  r1 and r2 are valid
                        ;; continuation bytes here, i.e. >= #x80.
                        (if (r3 < #xe000)
                            ((r3 = r6)
                             (write-multibyte-character r3 r0)
                             (if (r1 < #xa0)
                                 (r3 = r5))
                             (write-multibyte-character r3 r1)
                             (if (r2 < #xa0)
                                 (r3 = r5)
                               (r3 = r6))
                             (write-multibyte-character r3 r2))

                          ;; U+E000..U+FFFF: mule-unicode-e000-ffff
                          ((r0 = ,(charset-id 'mule-unicode-e000-ffff))
                           (r3 -= #xe000)
                           (r3 //= 96)
                           (r1 = (r7 + 32))
                           (r1 += ((r3 + 32) << 7))
                           (write-multibyte-character r0 r1))))))))

              ;; Lead byte >= #xf0 (4-byte encoding or longer): not
              ;; representable, keep the lead byte and the next three
              ;; bytes raw.  r0 >= #xf0, thus eight-bit-graphic.  The
              ;; trailing bytes are not validated, so each one may
              ;; also be plain ASCII and must then be written as is.
              ((read r1 r2 r3)
               (write-multibyte-character r6 r0)
               (if (r1 < #x80)
                   (write r1)
                 (if (r1 < #xa0)
                     (write-multibyte-character r5 r1)
                   (write-multibyte-character r6 r1)))
               (if (r2 < #x80)
                   (write r2)
                 (if (r2 < #xa0)
                     (write-multibyte-character r5 r2)
                   (write-multibyte-character r6 r2)))
               (if (r3 < #x80)
                   (write r3)
                 (if (r3 < #xa0)
                     (write-multibyte-character r5 r3)
                   (write-multibyte-character r6 r3))))))))

      (repeat))))
"CCL program to decode UTF-8.
Basic decoding is done into the charsets ascii, latin-iso8859-1 and
mule-unicode-*. Encodings of un-representable Unicode characters are
decoded asis into eight-bit-control and eight-bit-graphic
characters. Invalid or overlong byte sequences are preserved the
same way.")
(define-ccl-program ccl-encode-mule-utf-8
;; Layout of the program list (see `define-ccl-program'):
;;   (BUFFER_MAGNIFICATION CCL_MAIN_CODE CCL_EOF_CODE)
;; The leading 1 is the buffer magnification; the main block is the
;; loop below; the trailing `if' form after the main block is the
;; block run at end of input.
;;
;; Register usage:
;;   r0      charset id of the current character, later reused as the
;;           first output byte
;;   r1      code point of the current character, later the scalar
;;           value and then an output byte
;;   r2      last output byte of a 3-byte encoding, or the code point
;;           of the second look-ahead character
;;   r5/r6   charset id / code point of a character that was read
;;           ahead but not yet encoded; r5 = -1 means "none pending"
`(1
((r5 = -1)
(loop
;; Fetch the next character: either read a new one, or take the
;; pre-read character stashed in r5 (charset id) and r6 (code point).
;; r1 is preset to -1 before reading; the end-of-input block below
;; tests r1.
(if (r5 < 0)
((r1 = -1)
(read-multibyte-character r0 r1))
( (r0 = r5)
(r1 = r6)
(r5 = -1)))
;; ascii: one byte, written unchanged.
(if (r0 == ,(charset-id 'ascii))
(write r1)
;; latin-iso8859-1: always a 2-byte encoding.  The lead byte is #xc2
;; or #xc3 depending on bit 6 of r1; the second byte is the low six
;; bits of r1 with #x80 set.
(if (r0 == ,(charset-id 'latin-iso8859-1))
((r0 = (((r1 & #x40) >> 6) | #xc2))
(r1 &= #x3f)
(r1 |= #x80)
(write r0 r1))
;; mule-unicode-0100-24ff: r1 carries two 7-bit position codes.
;; Bits 7..13 (mask #x3f80) minus 32, times 96, give the row offset;
;; the low seven bits are the column.  Adding 224 (== #x0100 - 32)
;; turns that into the scalar value, which is then emitted as a
;; 2-byte encoding when below #x0800 and as a 3-byte encoding
;; otherwise.
(if (r0 == ,(charset-id 'mule-unicode-0100-24ff))
((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
(r1 &= #x7f)
(r1 += (r0 + 224)) (if (r1 < #x0800)
((r0 = (((r1 & #x07c0) >> 6) | #xc0))
(r1 &= #x3f)
(r1 |= #x80)
(write r0 r1))
((r0 = (((r1 & #xf000) >> 12) | #xe0))
(r2 = ((r1 & #x3f) | #x80))
(r1 &= #x0fc0)
(r1 >>= 6)
(r1 |= #x80)
(write r0 r1 r2))))
;; mule-unicode-2500-33ff: same position-code arithmetic, with base
;; 9440 (== #x2500 - 32).  Always written as a 3-byte encoding.
(if (r0 == ,(charset-id 'mule-unicode-2500-33ff))
((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
(r1 &= #x7f)
(r1 += (r0 + 9440)) (r0 = (((r1 & #xf000) >> 12) | #xe0))
(r2 = ((r1 & #x3f) | #x80))
(r1 &= #x0fc0)
(r1 >>= 6)
(r1 |= #x80)
(write r0 r1 r2))
;; mule-unicode-e000-ffff: same again, with base 57312
;; (== #xe000 - 32).  Always written as a 3-byte encoding.
(if (r0 == ,(charset-id 'mule-unicode-e000-ffff))
((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
(r1 &= #x7f)
(r1 += (r0 + 57312)) (r0 = (((r1 & #xf000) >> 12) | #xe0))
(r2 = ((r1 & #x3f) | #x80))
(r1 &= #x0fc0)
(r1 >>= 6)
(r1 |= #x80)
(write r0 r1 r2))
;; eight-bit-control: written as the two bytes #xc2 followed by r1.
(if (r0 == ,(charset-id 'eight-bit-control))
((write #xc2)
(write r1))
;; eight-bit-graphic: the byte itself is written unchanged, then up
;; to two following characters are read ahead.  A look-ahead
;; character that is neither eight-bit-graphic nor eight-bit-control
;; is stashed in r5/r6 and handled by the next loop iteration.
(if (r0 == ,(charset-id 'eight-bit-graphic))
((write r1)
(r1 = -1)
(read-multibyte-character r0 r1)
(if (r0 != ,(charset-id 'eight-bit-graphic))
(if (r0 != ,(charset-id 'eight-bit-control))
((r5 = r0)
(r6 = r1))))
;; r5 < 0 here means the first look-ahead character (now in r1) was
;; an eight-bit one; read a second look-ahead character into r2.
(if (r5 < 0)
((read-multibyte-character r0 r2)
(if (r0 != ,(charset-id 'eight-bit-graphic))
(if (r0 != ,(charset-id 'eight-bit-control))
((r5 = r0)
(r6 = r2))))
;; Both look-ahead characters were eight-bit ones: write both bytes
;; unchanged.  Otherwise only r1 is pending: it is written unchanged
;; when below #xa0 and prefixed with #xc2 otherwise.
;; NOTE(review): the end-of-input block at the bottom does the
;; opposite for a pending r1 (unchanged when >= #xa0, #xc2-prefixed
;; for #x80..#x9f) -- confirm which of the two is intended.
(if (r5 < 0)
(write r1 r2)
(if (r1 < #xa0)
(write r1)
((write #xc2)
(write r1)))))))
;; Any other charset is unsupported: write U+FFFD, which is the byte
;; sequence ef bf bd in UTF-8.
((write #xef)
(write #xbf)
(write #xbd)))))))))
(repeat)))
;; End-of-input block.  r1 is -1 unless input ended while the second
;; look-ahead read of the eight-bit-graphic branch was waiting, in
;; which case r1 still holds the first look-ahead byte: it is written
;; unchanged when >= #xa0 and as #xc2 followed by r1 when in
;; #x80..#x9f.
(if (r1 >= #xa0)
(write r1)
(if (r1 >= #x80)
((write #xc2)
(write r1)))))
"CCL program to encode into UTF-8.
Only characters from the charsets ascii, eight-bit-control,
eight-bit-graphic, latin-iso8859-1 and mule-unicode-* are recognized.
Others are encoded as U+FFFD.")
;; Define the `mule-utf-8' coding system on top of the two CCL
;; programs above.  Per `make-coding-system', the second argument (4)
;; selects a CCL-based coding system, the third is the mode-line
;; mnemonic, and the argument after the doc string is the
;; (DECODER . ENCODER) pair of CCL programs.
(make-coding-system
 'mule-utf-8 4 ?u
 "UTF-8 encoding for Emacs-supported Unicode characters.
The supported Emacs character sets are:
ascii
eight-bit-control
eight-bit-graphic
latin-iso8859-1
mule-unicode-0100-24ff
mule-unicode-2500-33ff
mule-unicode-e000-ffff
Unicode characters out of the ranges U+0000-U+33FF and U+E000-U+FFFF
are decoded into sequences of eight-bit-control and eight-bit-graphic
characters to preserve their byte sequences. Emacs characters out of
these ranges are encoded into U+FFFD.
Note that, currently, characters in the mule-unicode charsets have no
syntax and case information. Thus, for instance, upper- and
lower-casing commands won't work with them."
 '(ccl-decode-mule-utf-8 . ccl-encode-mule-utf-8)
 ;; Property list: the charsets this coding system can encode, its
 ;; MIME name, its coding category, and the byte values that may
 ;; appear in encoded text.
 '((safe-charsets
    ascii
    eight-bit-control
    eight-bit-graphic
    latin-iso8859-1
    mule-unicode-0100-24ff
    mule-unicode-2500-33ff
    mule-unicode-e000-ffff)
   (mime-charset . utf-8)
   (coding-category . coding-category-utf-8)
   (valid-codes (0 . 255))))

;; Make `utf-8' another name for `mule-utf-8'.
(define-coding-system-alias 'utf-8 'mule-utf-8)