note
	description: "UTF-16 encoding routines"
	library: "Gobo Eiffel Kernel Library"
	copyright: "Copyright (c) 2002-2018, Eric Bezault and others"
	license: "MIT License"
	date: "$Date: 2019-02-07 22:54:15 +0000 (Thu, 07 Feb 2019) $"
	revision: "$Revision: 102807 $"

class 
	UC_UTF16_ROUTINES

inherit
	UC_UNICODE_CONSTANTS

	KL_IMPORTED_ANY_ROUTINES

	KL_IMPORTED_INTEGER_ROUTINES

create 
	default_create

feature -- Status report

	valid_utf16 (a_string: STRING_8): BOOLEAN
			-- Does `a_string' hold a well-formed UTF-16 byte sequence?
			-- Each character of `a_string' is treated as one byte.
			-- A leading little-endian byte order mark selects little-endian
			-- decoding; in every other case big-endian is assumed.
		require
			a_string_not_void: a_string /= Void
			a_string_is_string: Any_.same_types (a_string, "")
		local
			l_msb: INTEGER_32
			l_pos, l_count: INTEGER_32
		do
			l_count := a_string.count
				-- Code units are two bytes wide, so an odd number
				-- of bytes can never be valid.
			Result := (l_count \\ 2) = 0
			if Result and l_count > 0 then
				from
						-- `l_pos' always designates the most significant
						-- byte of the code unit being examined.
					if is_endian_detection_character_least_first (a_string.item_code (1), a_string.item_code (2)) then
						l_pos := 2
					else
						l_pos := 1
					end
				until
					not Result or l_pos > l_count
				loop
					l_msb := a_string.item (l_pos).code
					if is_surrogate (l_msb) then
							-- A surrogate is only acceptable as a high one
							-- immediately followed by a low one.
						l_pos := l_pos + 2
						if is_high_surrogate (l_msb) and l_pos <= l_count then
							Result := is_low_surrogate (a_string.item (l_pos).code)
						else
							Result := False
						end
					end
					l_pos := l_pos + 2
				end
			end
		ensure
			instance_free: class
			empty_is_true: a_string.count = 0 implies Result
			utf16_even_count: Result implies ((a_string.count \\ 2) = 0)
		end

	valid_utf16be (a_string: STRING_8): BOOLEAN
			-- Does `a_string' hold a well-formed UTF-16BE byte sequence?
			-- Each character of `a_string' is treated as one byte.
		require
			a_string_not_void: a_string /= Void
			a_string_is_string: Any_.same_types (a_string, "")
		local
			l_msb: INTEGER_32
			l_pos, l_count: INTEGER_32
		do
			l_count := a_string.count
				-- Code units are two bytes wide, so an odd number
				-- of bytes can never be valid.
			Result := (l_count \\ 2) = 0
			if Result and l_count > 0 then
				from
						-- Big-endian: the most significant byte of each
						-- code unit is found at the odd positions.
					l_pos := 1
				until
					not Result or l_pos > l_count
				loop
					l_msb := a_string.item (l_pos).code
					if is_surrogate (l_msb) then
							-- A surrogate is only acceptable as a high one
							-- immediately followed by a low one.
						l_pos := l_pos + 2
						if is_high_surrogate (l_msb) and l_pos <= l_count then
							Result := is_low_surrogate (a_string.item (l_pos).code)
						else
							Result := False
						end
					end
					l_pos := l_pos + 2
				end
			end
		ensure
			instance_free: class
			empty_is_true: a_string.count = 0 implies Result
			utf16_even_count: Result implies ((a_string.count \\ 2) = 0)
		end

	valid_utf16le (a_string: STRING_8): BOOLEAN
			-- Are the bytes in `a_string' valid UTF-16LE?
			-- `a_string' has one byte per character.
			-- Valid means: an even number of bytes, and every surrogate
			-- code unit is a high surrogate immediately followed by
			-- a low surrogate.
		require
			a_string_not_void: a_string /= Void
			a_string_is_string: Any_.same_types (a_string, "")
		local
			a_most: INTEGER_32
			i, cnt: INTEGER_32
		do
				-- Code units are two bytes wide: an odd byte count is invalid.
			Result := (a_string.count \\ 2) = 0
			if Result and a_string.count > 0 then
				from
						-- Little-endian: the most significant byte of each
						-- code unit is found at the even positions.
					i := 2
						-- The loop bound must be set explicitly; left at its
						-- default value 0 the loop would never inspect a byte
						-- and every even-length string would be accepted.
					cnt := a_string.count
				until
					(i > cnt) or (not Result)
				loop
					a_most := a_string.item (i).code
					if is_surrogate (a_most) then
							-- Move to the next code unit, which has to exist
							-- and has to be a low surrogate.
						i := i + 2
						Result := is_high_surrogate (a_most) and ((i <= cnt) and then is_low_surrogate (a_string.item (i).code))
					end
					i := i + 2
				end
			end
		ensure
			instance_free: class
			empty_is_true: a_string.count = 0 implies Result
			utf16_even_count: Result implies ((a_string.count \\ 2) = 0)
		end
	
feature -- Endian-ness detection

	Bom_be: STRING_8
			-- Byte order mark in big-endian format:
			-- byte 254 (0xFE) followed by byte 255 (0xFF)
		once
				-- Written with numeric character codes instead of the literal
				-- characters, so that the two byte values do not depend on
				-- the encoding in which this source file is stored or read.
			Result := "%/254/%/255/"
		ensure
			instance_free: class
			bom_be_not_void: Result /= Void
			two_bytes: Result.count = 2
			first_byte: Result.item_code (1) = Hex_fe
			second_byte: Result.item_code (2) = Hex_ff
		end

	Bom_le: STRING_8
			-- Byte order mark in little-endian format:
			-- byte 255 (0xFF) followed by byte 254 (0xFE)
		once
				-- Written with numeric character codes instead of the literal
				-- characters, so that the two byte values do not depend on
				-- the encoding in which this source file is stored or read.
			Result := "%/255/%/254/"
		ensure
			instance_free: class
			bom_le_not_void: Result /= Void
			two_bytes: Result.count = 2
			first_byte: Result.item_code (1) = Hex_ff
			second_byte: Result.item_code (2) = Hex_fe
		end

	is_endian_detection_character_most_first (first, second: INTEGER_32): BOOLEAN
			-- Is the byte pair (`first', `second') the character 0xFEFF
			-- stored with its most significant byte in `first'?
		require
			a_byte_is_byte: is_byte (first)
			other_byte_is_byte: is_byte (second)
		do
				-- `Result' keeps its default value False unless
				-- the leading byte is 0xFE.
			if first = Hex_fe then
				Result := second = Hex_ff
			end
		ensure
			instance_free: class
			definition: Result = (is_endian_detection_character (first, second) and (first = Hex_fe))
		end

	is_endian_detection_character_least_first (first, second: INTEGER_32): BOOLEAN
			-- Is the byte pair (`first', `second') the character 0xFEFF
			-- stored with its least significant byte in `first'?
		require
			a_byte_is_byte: is_byte (first)
			other_byte_is_byte: is_byte (second)
		do
				-- `Result' keeps its default value False unless
				-- the leading byte is 0xFF.
			if first = Hex_ff then
				Result := second = Hex_fe
			end
		ensure
			instance_free: class
			definition: Result = (is_endian_detection_character (first, second) and (first = Hex_ff))
		end

	is_endian_detection_character (a_byte, other_byte: INTEGER_32): BOOLEAN
			-- Are `a_byte' and `other_byte', taken in either order, the two
			-- bytes of ZERO WIDTH NON-BREAKING SPACE (0xFEFF)?
			-- (Only 0xFEFF qualifies, because 0xFFFE is not a valid character.)
		require
			a_byte_is_byte: is_byte (a_byte)
			other_byte_is_byte: is_byte (other_byte)
		do
				-- One byte has to be 0xFE and the other one 0xFF;
				-- any other combination leaves `Result' at False.
			if a_byte = Hex_fe then
				Result := other_byte = Hex_ff
			elseif a_byte = Hex_ff then
				Result := other_byte = Hex_fe
			end
		ensure
			instance_free: class
			definition: Result = (a_byte.min (other_byte) = Hex_fe and a_byte.max (other_byte) = Hex_ff)
		end
	
feature -- Surrogate

	is_surrogate (a_most: INTEGER_32): BOOLEAN
			-- Is `a_most' the most significant byte of a surrogate
			-- code unit (high or low), i.e. in the range 0xD8 .. 0xDF?
		require
			byte: is_byte (a_most)
		do
			if a_most >= Hex_d8 then
				Result := a_most < Hex_e0
			end
		ensure
			instance_free: class
		end

	is_high_surrogate (a_most: INTEGER_32): BOOLEAN
			-- Is `a_most' the most significant byte of a high surrogate
			-- code unit, i.e. in the range 0xD8 .. 0xDB?
		require
			byte: is_byte (a_most)
		do
			if a_most >= Hex_d8 then
				Result := a_most < Hex_dc
			end
		ensure
			instance_free: class
		end

	is_low_surrogate (a_most: INTEGER_32): BOOLEAN
			-- Is `a_most' the most significant byte of a low surrogate
			-- code unit, i.e. in the range 0xDC .. 0xDF?
		require
			byte: is_byte (a_most)
		do
			if a_most >= Hex_dc then
				Result := a_most < Hex_e0
			end
		ensure
			instance_free: class
		end

	least_10_bits (msb, lsb: INTEGER_32): INTEGER_32
			-- Least significant 10 bits of the UTF-16 surrogate code unit
			-- whose most and least significant bytes are `msb' and `lsb'
		require
			msb_byte: is_byte (msb)
			lsb_byte: is_byte (lsb)
			surrogate: is_surrogate (msb)
		local
			l_top_two_bits: INTEGER_32
		do
				-- Only the two lowest bits of `msb' belong to the payload.
			l_top_two_bits := msb \\ 4
			Result := lsb + (l_top_two_bits * Hex_100)
		ensure
			instance_free: class
			ten_bits: Result >= 0 and Result < Hex_400
		end

	surrogate (a_high_10: INTEGER_32; a_low_10: INTEGER_32): INTEGER_32
			-- Supplementary code point built from the 10-bit payloads
			-- `a_high_10' and `a_low_10' of a surrogate pair
		require
			high_10: a_high_10 >= 0 and a_high_10 < 1024
			low_10: a_low_10 >= 0 and a_low_10 < 1024
		local
			l_offset: INTEGER_32
		do
				-- 20-bit offset: `a_high_10' supplies the upper ten bits,
				-- `a_low_10' the lower ten.
			l_offset := a_low_10 + (Hex_400 * a_high_10)
			Result := l_offset + Hex_10000
		ensure
			instance_free: class
			more_than_16bits: Result >= Hex_10000
		end

	surrogate_from_bytes (a_high_most, a_high_least, a_low_most, a_low_least: INTEGER_32): INTEGER_32
			-- Supplementary code point encoded by the four bytes of a
			-- surrogate pair: high surrogate (`a_high_most', `a_high_least')
			-- followed by low surrogate (`a_low_most', `a_low_least')
		require
			surrogate_high: is_high_surrogate (a_high_most)
			high_least_byte: is_byte (a_high_least)
			surrogate_low: is_low_surrogate (a_low_most)
			low_least_byte: is_byte (a_low_least)
		local
			l_high_payload, l_low_payload: INTEGER_32
		do
			l_high_payload := least_10_bits (a_high_most, a_high_least)
			l_low_payload := least_10_bits (a_low_most, a_low_least)
			Result := surrogate (l_high_payload, l_low_payload)
		ensure
			instance_free: class
			more_than_16bits: Result >= Hex_10000
		end

	is_byte (a: INTEGER_32): BOOLEAN
			-- Does `a' fit in a single byte, i.e. is it in 0 .. 255?
		do
				-- Negative values leave `Result' at its default False.
			if a >= 0 then
				Result := a < Hex_100
			end
		ensure
			instance_free: class
			definition: Result = (a >= 0 and a < Hex_100)
		end

	supplementary_to_high_surrogate (a_code: INTEGER_32): INTEGER_32
			-- 16-bit high surrogate code unit for the supplementary
			-- code point `a_code'
		require
			code_high_enough: a_code > Maximum_bmp_character_code
			code_low_enough: a_code <= Maximum_unicode_character_code
		local
			l_upper_bits: INTEGER_32
		do
				-- Drop the ten bits carried by the low surrogate.
			l_upper_bits := Integer_.bit_shift_right (a_code, 10)
			Result := Hex_d7c0 + l_upper_bits
		ensure
			instance_free: class
			high_surrogate: Result >= 256 * Hex_d8
			not_too_big: Result < 256 * Hex_dc
		end

	supplementary_to_low_surrogate (a_code: INTEGER_32): INTEGER_32
			-- 16-bit low surrogate code unit for the supplementary
			-- code point `a_code'
		require
			code_high_enough: a_code > Maximum_bmp_character_code
			code_low_enough: a_code <= Maximum_unicode_character_code
		local
			l_low_bits: INTEGER_32
		do
				-- Keep the ten least significant bits of `a_code' and
				-- combine them with `Hex_dc00'.
			l_low_bits := Integer_.bit_and (a_code, Hex_3ff)
			Result := Integer_.bit_or (l_low_bits, Hex_dc00)
		ensure
			instance_free: class
			low_surrogate: Result >= 256 * Hex_dc
			not_too_big: Result < 256 * Hex_e0
		end
	
feature {NONE} -- Constants

	Hex_400: INTEGER_32 = 0x400
			-- 2 ^ 10 (decimal 1024)

	Hex_100: INTEGER_32 = 0x100
			-- 2 ^ 8 (decimal 256)

	Hex_fe: INTEGER_32 = 0xFE
			-- One of the two bytes of the endian detection character
			-- (decimal 254)

	Hex_ff: INTEGER_32 = 0xFF
			-- One of the two bytes of the endian detection character
			-- (decimal 255)

	Hex_d8: INTEGER_32 = 0xD8
			-- Most significant byte of 0xD800: start of so-called
			-- high-half zone or high surrogate area (decimal 216)

	Hex_dc: INTEGER_32 = 0xDC
			-- Most significant byte of 0xDC00: start of so-called
			-- low-half zone or low surrogate area (decimal 220)

	Hex_e0: INTEGER_32 = 0xE0
			-- Most significant byte of 0xE000: end (exclusive)
			-- of surrogate area (decimal 224)

	Hex_10000: INTEGER_32 = 0x10000
			-- Base of surrogates (decimal 65536)

	Hex_d7c0: INTEGER_32 = 0xD7C0
			-- Hex D7C0 (decimal 55232)

	Hex_3ff: INTEGER_32 = 0x3FF
			-- Hex 3FF (decimal 1023)

	Hex_dc00: INTEGER_32 = 0xDC00
			-- Start of so-called low-half zone or low surrogate area
			-- (decimal 56320)
	
end -- class UC_UTF16_ROUTINES

Generated by ISE EiffelStudio