-- uc_utf8_routines Text

note
	description: "UTF-8 encoding routines"
	library: "Gobo Eiffel Kernel Library"
	copyright: "Copyright (c) 2001-2018, Eric Bezault and others"
	license: "MIT License"
	date: "$Date: 2019-02-07 22:54:15 +0000 (Thu, 07 Feb 2019) $"
	revision: "$Revision: 102807 $"

class 
	UC_UTF8_ROUTINES

inherit
	UC_IMPORTED_UNICODE_ROUTINES

	KL_IMPORTED_STRING_ROUTINES

	KL_IMPORTED_INTEGER_ROUTINES

	KL_IMPORTED_ANY_ROUTINES

	UC_STRING_HANDLER

create 
	default_create

feature -- Status report

	valid_utf8 (a_string: STRING_8): BOOLEAN
			-- Do the bytes of a_string form a valid UTF-8 encoding?
			-- (An empty string is considered valid.)
		require
			a_string_not_void: a_string /= Void
			a_string_is_string: Any_.same_types (a_string, "")
		local
			l_pos, l_count, l_last: INTEGER_32
			l_length, l_value: INTEGER_32
			l_lead, l_second: CHARACTER_8
		do
			l_count := a_string.count
			from
				Result := True
				l_pos := 1
			until
				not Result or l_pos > l_count
			loop
					-- l_pos is always on the first byte of an encoded character here.
				l_lead := a_string.item (l_pos)
				if not is_encoded_first_byte (l_lead) then
					Result := False
				else
					l_length := encoded_byte_count (l_lead)
					l_last := l_pos + l_length - 1
					if l_length = 1 then
							-- Single-byte (ASCII) character.
						l_pos := l_pos + 1
					elseif l_last > l_count then
							-- The sequence is truncated by the end of the string.
						Result := False
					else
						l_second := a_string.item (l_pos + 1)
						if not is_encoded_second_byte (l_second, l_lead) then
							Result := False
						else
								-- Value carried by the first two bytes; it must be
								-- large enough for the announced sequence length.
							l_value := encoded_first_value (l_lead) * 64 + encoded_next_value (l_second)
							inspect l_length
							when 2 then
								Result := l_value > Code_127
							when 3 then
								Result := l_value > Code_31
							when 4 then
								Result := l_value > Code_15
							end
								-- Remaining bytes (third and fourth, if any)
								-- must all be continuation bytes.
							from
								l_pos := l_pos + 2
							until
								not Result or l_pos > l_last
							loop
								if is_encoded_next_byte (a_string.item (l_pos)) then
									l_pos := l_pos + 1
								else
									Result := False
								end
							end
						end
					end
				end
			end
		ensure
			instance_free: class
		end

	is_encoded_first_byte (a_byte: CHARACTER_8): BOOLEAN
			-- Can a_byte be the first byte of a UTF-8 encoded character?
			-- (True for codes 0..127 and 194..244.)
		do
			if a_byte <= Byte_127 then
					-- ASCII byte: a complete character on its own.
				Result := True
			else
					-- Lead byte of a multi-byte sequence.
				Result := a_byte >= Byte_194 and a_byte <= Byte_244
			end
		ensure
			instance_free: class
		end

	is_encoded_next_byte (a_byte: CHARACTER_8): BOOLEAN
			-- Is a_byte a continuation byte (code 128..191)
			-- of a UTF-8 encoded character?
		do
			if a_byte > Byte_127 then
				Result := a_byte <= Byte_191
			end
		ensure
			instance_free: class
		end

	is_encoded_second_byte (a_byte, a_first_byte: CHARACTER_8): BOOLEAN
			-- Is a_byte acceptable as the byte following a_first_byte
			-- in a UTF-8 encoded character?
			-- The accepted range is 128..191, narrowed for the
			-- first bytes 224, 237, 240 and 244.
		require
			valid_first_byte: is_encoded_first_byte (a_first_byte)
		local
			l_below, l_top: CHARACTER_8
		do
				-- Default range: strictly above l_below, up to l_top.
			l_below := Byte_127
			l_top := Byte_191
			if a_first_byte = Byte_224 then
					-- Second byte must be in 160..191.
				l_below := Byte_159
			elseif a_first_byte = Byte_237 then
					-- Second byte must be in 128..159.
				l_top := Byte_159
			elseif a_first_byte = Byte_240 then
					-- Second byte must be in 144..191.
				l_below := Byte_143
			elseif a_first_byte = Byte_244 then
					-- Second byte must be in 128..143.
				l_top := Byte_143
			end
			Result := l_below < a_byte and a_byte <= l_top
		ensure
			instance_free: class
		end

	is_endian_detection_character (a_first, a_second, a_third: CHARACTER_8): BOOLEAN
			-- Do a_first, a_second and a_third make up the
			-- UTF-8 encoded Byte Order Marker (BOM)?
		do
			if a_third = Byte_bf then
				Result := is_endian_detection_character_start (a_first, a_second)
			end
		ensure
			instance_free: class
			result_start: Result implies is_endian_detection_character_start (a_first, a_second)
		end

	is_endian_detection_character_start (a_first, a_second: CHARACTER_8): BOOLEAN
			-- Are a_first and a_second the first two bytes of the
			-- UTF-8 encoded Byte Order Marker (BOM)?
		do
			if a_first = Byte_ef then
				Result := a_second = Byte_bb
			end
		ensure
			instance_free: class
		end
	
feature -- Access

	encoded_first_value (a_byte: CHARACTER_8): INTEGER_32
			-- Payload value carried by a_byte when it is the
			-- first byte of a UTF-8 encoded character
		require
			is_encoded_first_byte: is_encoded_first_byte (a_byte)
		local
			l_code: INTEGER_32
		do
			l_code := a_byte.code
			if a_byte <= Byte_127 then
					-- Single-byte character: the code itself.
				Result := l_code
			elseif a_byte <= Byte_223 then
					-- Two-byte sequence: keep the low 5 bits.
				Result := l_code \\ 32
			elseif a_byte <= Byte_239 then
					-- Three-byte sequence: keep the low 4 bits.
				Result := l_code \\ 16
			elseif a_byte <= Byte_244 then
					-- Four-byte sequence: keep the low 3 bits.
				Result := l_code \\ 8
			else
					-- Excluded by the precondition; code left untouched.
				Result := l_code
			end
		ensure
			instance_free: class
			value_positive: Result >= 0
			value_small_enough: Result < 128
		end

	encoded_next_value (a_byte: CHARACTER_8): INTEGER_32
			-- Payload value (low 6 bits) carried by the
			-- continuation byte a_byte
		require
			is_encoded_next_byte: is_encoded_next_byte (a_byte)
		local
			l_code: INTEGER_32
		do
			l_code := a_byte.code
			Result := l_code \\ 64
		ensure
			instance_free: class
			value_positive: Result >= 0
			value_small_enough: Result < 64
		end
	
feature -- Measurement

	encoded_byte_count (a_byte: CHARACTER_8): INTEGER_32
			-- Length in bytes of the UTF-8 encoded character
			-- whose first byte is a_byte
		require
			is_encoded_first_byte: is_encoded_first_byte (a_byte)
		do
				-- Ranges are tested from the highest one downwards.
			if a_byte > Byte_239 then
				Result := 4
			elseif a_byte > Byte_223 then
				Result := 3
			elseif a_byte > Byte_127 then
				Result := 2
			else
				Result := 1
			end
		ensure
			instance_free: class
			encoded_byte_code_large_enough: Result >= 1
			encoded_byte_code_small_enough: Result <= 4
		end

	substring_byte_count (a_string: READABLE_STRING_GENERAL; start_index, end_index: INTEGER_32): INTEGER_32
			-- Number of bytes needed to encode characters of
			-- a_string between start_index and end_index
			-- inclusive with the UTF-8 encoding
			-- (0 when the interval is empty, i.e. start_index = end_index + 1)
		require
			a_string_not_void: a_string /= Void
			valid_start_index: 1 <= start_index
			valid_end_index: end_index <= a_string.count
			meaningful_interval: start_index <= end_index + 1
		local
			s, e: INTEGER_32
			i: INTEGER_32
		do
			if start_index <= end_index then
				if Any_.same_types (a_string, Dummy_string) and then attached {STRING_8} a_string as l_string_8 then
						-- Plain STRING_8: visit every index of the interval
						-- exactly once, independently of the parity of
						-- start_index and end_index, and never read
						-- beyond end_index.
					from
						i := start_index
					until
						i > end_index
					loop
						Result := Result + character_byte_count (l_string_8.item (i))
						i := i + 1
					end
				elseif Any_.same_types (a_string, Dummy_uc_string) and then attached {UC_STRING} a_string as a_uc_string then
						-- UC_STRING: derive the answer from byte positions
						-- instead of inspecting each character.
					if start_index = 1 and end_index = a_uc_string.count then
							-- Whole string.
						Result := a_uc_string.byte_count
					else
						s := a_uc_string.byte_index (start_index)
						if end_index = a_uc_string.count then
								-- Interval runs to the end of the string.
							Result := a_uc_string.byte_count - s + 1
						else
								-- NOTE(review): e is presumably the byte index of the
								-- character just after end_index -- confirm against
								-- UC_STRING.shifted_byte_index.
							e := a_uc_string.shifted_byte_index (s, end_index - start_index + 1)
							Result := e - s
						end
					end
				elseif attached {UC_UTF8_STRING} a_string as a_utf8 then
						-- Same computation as above for UC_UTF8_STRING.
					if start_index = 1 and end_index = a_utf8.count then
						Result := a_utf8.byte_count
					else
						s := a_utf8.byte_index (start_index)
						if end_index = a_utf8.count then
							Result := a_utf8.byte_count - s + 1
						else
							e := a_utf8.shifted_byte_index (s, end_index - start_index + 1)
							Result := e - s
						end
					end
				else
						-- Any other kind of string: sum the encoded
						-- length of each character code.
					from
						i := start_index
					until
						i > end_index
					loop
						Result := Result + code_byte_count (a_string.code (i).to_integer_32)
						i := i + 1
					end
				end
			end
		ensure
			instance_free: class
			substring_byte_count_positive: Result >= 0
		end

	code_byte_count (a_code: INTEGER_32): INTEGER_32
			-- Length in bytes of the UTF-8 encoding of the
			-- unicode character of code a_code
		require
			valid_code: Unicode.valid_non_surrogate_code (a_code)
		do
				-- Thresholds are tested from the largest one downwards.
			if a_code >= 65536 then
				Result := 4
			elseif a_code >= 2048 then
				Result := 3
			elseif a_code >= 128 then
				Result := 2
			else
				Result := 1
			end
		ensure
			instance_free: class
			code_byte_count_large_enough: Result >= 1
			code_byte_count_small_enough: Result <= 4
		end

	character_byte_count (c: CHARACTER_8): INTEGER_32
			-- Number of bytes needed to encode character
			-- c with the UTF-8 encoding:
			-- 1 for codes 0..127, 2 for codes 128..255
		do
			if c <= Byte_127 then
				Result := 1
			else
					-- The code of a CHARACTER_8 never exceeds 255, so every
					-- non-ASCII character needs exactly two bytes.
				Result := 2
			end
		ensure
			instance_free: class
			character_byte_count_large_enough: Result >= 1
			character_byte_count_small_enough: Result <= 4
		end
	
feature -- Conversion

	to_utf8 (a_string: STRING_8): STRING_8
			-- Newly created STRING holding the bytes of the
			-- UTF-8 representation of a_string
		require
			a_string_not_void: a_string /= Void
		local
			l_index, l_count: INTEGER_32
		do
			if attached {UC_STRING} a_string as l_unicode then
					-- UC_STRING provides its own conversion.
				Result := l_unicode.to_utf8
			else
					-- Encode the characters one by one, in order.
				l_count := a_string.count
				create Result.make (l_count)
				from
					l_index := 0
				until
					l_index = l_count
				loop
					l_index := l_index + 1
					append_code_to_utf8 (Result, a_string.item_code (l_index))
				end
			end
		ensure
			instance_free: class
			to_utf8_not_void: Result /= Void
			string_type: Any_.same_types (Result, "")
			valid_utf8: valid_utf8 (Result)
		end
	
feature -- Element change

	append_code_to_utf8 (a_utf8: STRING_8; a_code: INTEGER_32)
			-- Append to a_utf8 the bytes of the UTF-8 encoding
			-- of the character of code a_code.
		require
			a_utf8_not_void: a_utf8 /= Void
			a_utf8_is_string: Any_.same_types (a_utf8, "")
			a_utf8_valid: valid_utf8 (a_utf8)
			valid_code: Unicode.valid_non_surrogate_code (a_code)
		local
			l_last, l_before_last, l_before_that: INTEGER_32
		do
				-- Successive 6-bit groups of a_code, starting from the
				-- least significant one, each tagged as a continuation byte
				-- (64 = 2^6, 4096 = 2^12, 262144 = 2^18).
			l_last := (a_code \\ 64) + 128
			l_before_last := ((a_code // 64) \\ 64) + 128
			l_before_that := ((a_code // 4096) \\ 64) + 128
			inspect code_byte_count (a_code)
			when 1 then
				a_utf8.append_character (Integer_.to_character (a_code))
			when 2 then
				a_utf8.append_character (Integer_.to_character ((a_code // 64) + 192))
				a_utf8.append_character (Integer_.to_character (l_last))
			when 3 then
				a_utf8.append_character (Integer_.to_character ((a_code // 4096) + 224))
				a_utf8.append_character (Integer_.to_character (l_before_last))
				a_utf8.append_character (Integer_.to_character (l_last))
			when 4 then
				a_utf8.append_character (Integer_.to_character ((a_code // 262144) + 240))
				a_utf8.append_character (Integer_.to_character (l_before_that))
				a_utf8.append_character (Integer_.to_character (l_before_last))
				a_utf8.append_character (Integer_.to_character (l_last))
			end
		ensure
			instance_free: class
			a_utf8_valid: valid_utf8 (a_utf8)
		end
	
feature {NONE} -- Constants

	Code_3: INTEGER_32 = 3
			-- 111110xx (2^2 - 1 = 3)

	Code_7: INTEGER_32 = 7
			-- 11110xxx (2^3 - 1 = 7)

	Code_15: INTEGER_32 = 15
			-- 00001111 (2^4 - 1 = 15)

	Code_31: INTEGER_32 = 31
			-- 00011111 (2^5 - 1 = 31)

	Byte_127: CHARACTER_8 = '%/127/'
			-- Highest ASCII character/1st UTF-8 byte

	Code_127: INTEGER_32 = 127
			-- 01111111

	Byte_143: CHARACTER_8 = '%/143/'
			-- 10001111

	Byte_159: CHARACTER_8 = '%/159/'
			-- 10011111

	Byte_191: CHARACTER_8 = '%/191/'
			-- 10111111

	Code_191: INTEGER_32 = 191
			-- 10111111

	Byte_194: CHARACTER_8 = '%/194/'
			-- 11000010
			-- Lowest non-ASCII byte accepted by is_encoded_first_byte

	Byte_223: CHARACTER_8 = '%/223/'
			-- 11011111

	Code_223: INTEGER_32 = 223
			-- 11011111

	Byte_224: CHARACTER_8 = '%/224/'
			-- 11100000

	Byte_237: CHARACTER_8 = '%/237/'
			-- 11101101

	Byte_239: CHARACTER_8 = '%/239/'
			-- 11101111

	Code_239: INTEGER_32 = 239
			-- 11101111

	Byte_240: CHARACTER_8 = '%/240/'
			-- 11110000

	Byte_244: CHARACTER_8 = '%/244/'
			-- 11110100
			-- Highest byte accepted by is_encoded_first_byte

	Byte_247: CHARACTER_8 = '%/247/'
			-- 11110111

	Code_247: INTEGER_32 = 247
			-- 11110111

	Byte_251: CHARACTER_8 = '%/251/'
			-- 11111011

	Code_251: INTEGER_32 = 251
			-- 11111011

	Byte_253: CHARACTER_8 = '%/253/'
			-- 11111101

	Code_253: INTEGER_32 = 253
			-- 11111101

	Byte_255: CHARACTER_8 = '%/255/'
			-- 11111111

	Byte_ef: CHARACTER_8 = '%/239/'
			-- UTF-8 BOM first: EF

	Byte_bb: CHARACTER_8 = '%/187/'
			-- UTF-8 BOM second: BB

	Byte_bf: CHARACTER_8 = '%/191/'
			-- UTF-8 BOM third: BF
	
feature {NONE} -- Implementation

	Dummy_string: STRING_8 = ""
			-- Empty STRING_8 whose dynamic type serves as the reference
			-- in the Any_.same_types test of substring_byte_count

	Dummy_uc_string: UC_STRING
			-- Empty UC_STRING, created once, whose dynamic type serves
			-- as the reference in the Any_.same_types test of
			-- substring_byte_count
		once
			create Result.make_empty
		ensure
			instance_free: class
			dummy_uc_string_not_void: Result /= Void
		end
	
end -- class UC_UTF8_ROUTINES
-- Generated by ISE EiffelStudio