note

	description: "UTF-8 encoding routines"
	library: "Gobo Eiffel Kernel Library"
	copyright: "Copyright (c) 2001-2018, Eric Bezault and others"
	license: "MIT License"
	date: "$Date: 2019-02-07 22:54:15 +0000 (Thu, 07 Feb 2019) $"
	revision: "$Revision: 102807 $"

class UC_UTF8_ROUTINES

inherit

	UC_IMPORTED_UNICODE_ROUTINES

	KL_IMPORTED_STRING_ROUTINES

	KL_IMPORTED_INTEGER_ROUTINES

	KL_IMPORTED_ANY_ROUTINES

	UC_STRING_HANDLER

create

	default_create

feature -- Status report

	valid_utf8 (a_string: STRING_8): BOOLEAN
			-- Are the bytes in `a_string' a valid UTF-8 encoding?
			-- Each character of `a_string' is examined as one byte.
		require
			a_string_not_void: a_string /= Void
			a_string_is_string: Any_.same_types (a_string, "")
		local
			i, nb, nb2: INTEGER_32
			bc, a_code: INTEGER_32
			a_byte, a_first_byte: CHARACTER_8
		do
			Result := True
			nb := a_string.count
			from
				i := 1
			until
				i > nb
			loop
				a_first_byte := a_string.item (i)
				if is_encoded_first_byte (a_first_byte) then
					bc := encoded_byte_count (a_first_byte)
					if bc = 1 then
						i := i + 1
					else
							-- `nb2' is the index of the last byte of this sequence.
						nb2 := i + bc - 1
						if nb2 > nb then
								-- Sequence is truncated by the end of the string.
							Result := False
							i := nb + 1
						else
							a_code := encoded_first_value (a_first_byte)
							i := i + 1
							a_byte := a_string.item (i)
							if not is_encoded_second_byte (a_byte, a_first_byte) then
								Result := False
								i := nb + 1
							else
								a_code := a_code * 64 + encoded_next_value (a_byte)
									-- Reject overlong forms: the value accumulated from
									-- the first two bytes must be too large to fit in a
									-- shorter encoding. When rejected here, `i' still
									-- designates the second byte, which fails
									-- `is_encoded_first_byte' on the next iteration and
									-- ends the loop.
								inspect bc
								when 2 then
									if a_code <= Code_127 then
										Result := False
									end
								when 3 then
									if a_code <= Code_31 then
										Result := False
									end
								when 4 then
									if a_code <= Code_15 then
										Result := False
									end
								end
								if Result then
										-- Remaining bytes must all be continuation bytes.
									from
										i := i + 1
									until
										i > nb2
									loop
										if is_encoded_next_byte (a_string.item (i)) then
											i := i + 1
										else
											Result := False
											i := nb + 1
										end
									end
								end
							end
						end
					end
				else
					Result := False
					i := nb + 1
				end
			end
		ensure
			instance_free: class
		end

	is_encoded_first_byte (a_byte: CHARACTER_8): BOOLEAN
			-- Is `a_byte' the first byte in UTF-8 encoding?
			-- (True for bytes up to 127 and for bytes in 194..244.)
		do
			Result := (a_byte <= Byte_127 or (Byte_194 <= a_byte and a_byte <= Byte_244))
		ensure
			instance_free: class
		end

	is_encoded_next_byte (a_byte: CHARACTER_8): BOOLEAN
			-- Is `a_byte' one of the next bytes in UTF-8 encoding?
			-- (True for bytes in 128..191.)
		do
			Result := (Byte_127 < a_byte and a_byte <= Byte_191)
		ensure
			instance_free: class
		end

	is_encoded_second_byte (a_byte, a_first_byte: CHARACTER_8): BOOLEAN
			-- Is `a_byte' a valid second byte in UTF-8 encoding
			-- when the first byte is `a_first_byte'?
			-- The accepted range is narrower than 128..191 for
			-- the first bytes 224, 237, 240 and 244.
		require
			valid_first_byte: is_encoded_first_byte (a_first_byte)
		do
			if a_first_byte = Byte_224 then
					-- 160..191
				Result := (Byte_159 < a_byte and a_byte <= Byte_191)
			elseif a_first_byte = Byte_237 then
					-- 128..159
				Result := (Byte_127 < a_byte and a_byte <= Byte_159)
			elseif a_first_byte = Byte_240 then
					-- 144..191
				Result := (Byte_143 < a_byte and a_byte <= Byte_191)
			elseif a_first_byte = Byte_244 then
					-- 128..143
				Result := (Byte_127 < a_byte and a_byte <= Byte_143)
			else
					-- 128..191
				Result := (Byte_127 < a_byte and a_byte <= Byte_191)
			end
		ensure
			instance_free: class
		end

	is_endian_detection_character (a_first, a_second, a_third: CHARACTER_8): BOOLEAN
			-- Is this sequence a UTF-8 Byte Order Marker (BOM)?
		do
			Result := is_endian_detection_character_start (a_first, a_second) and a_third = Byte_bf
		ensure
			instance_free: class
			result_start: Result implies is_endian_detection_character_start (a_first, a_second)
		end

	is_endian_detection_character_start (a_first, a_second: CHARACTER_8): BOOLEAN
			-- Are these characters the start of a UTF-8 encoded
			-- Byte Order Marker (BOM)?
		do
			Result := a_first = Byte_ef and a_second = Byte_bb
		ensure
			instance_free: class
		end

feature -- Access

	encoded_first_value (a_byte: CHARACTER_8): INTEGER_32
			-- Value encoded in first byte `a_byte'
			-- (the byte code with its length-marker bits removed)
		require
			is_encoded_first_byte: is_encoded_first_byte (a_byte)
		do
			Result := a_byte.code
			if a_byte <= Byte_127 then
					-- Single-byte sequence: the code is the value.
			elseif a_byte <= Byte_223 then
					-- First of 2 bytes: keep the 5 low bits.
				Result := Result \\ 32
			elseif a_byte <= Byte_239 then
					-- First of 3 bytes: keep the 4 low bits.
				Result := Result \\ 16
			elseif a_byte <= Byte_244 then
					-- First of 4 bytes: keep the 3 low bits.
				Result := Result \\ 8
			end
		ensure
			instance_free: class
			value_positive: Result >= 0
			value_small_enough: Result < 128
		end

	encoded_next_value (a_byte: CHARACTER_8): INTEGER_32
			-- Value encoded in `a_byte', one of the next bytes
			-- (the 6 low bits of the byte code)
		require
			is_encoded_next_byte: is_encoded_next_byte (a_byte)
		do
			Result := a_byte.code \\ 64
		ensure
			instance_free: class
			value_positive: Result >= 0
			value_small_enough: Result < 64
		end

feature -- Measurement

	encoded_byte_count (a_byte: CHARACTER_8): INTEGER_32
			-- Number of bytes which were necessary to encode
			-- the unicode character whose first byte is `a_byte'
		require
			is_encoded_first_byte: is_encoded_first_byte (a_byte)
		do
			if a_byte <= Byte_127 then
				Result := 1
			elseif a_byte <= Byte_223 then
				Result := 2
			elseif a_byte <= Byte_239 then
				Result := 3
			else
				Result := 4
			end
		ensure
			instance_free: class
			encoded_byte_code_large_enough: Result >= 1
			encoded_byte_code_small_enough: Result <= 4
		end

	substring_byte_count (a_string: READABLE_STRING_GENERAL; start_index, end_index: INTEGER_32): INTEGER_32
			-- Number of bytes needed to encode characters of
			-- `a_string' between `start_index' and `end_index'
			-- inclusive with the UTF-8 encoding
			-- (0 when the interval is empty, i.e. `start_index' = `end_index' + 1)
		require
			a_string_not_void: a_string /= Void
			valid_start_index: 1 <= start_index
			valid_end_index: end_index <= a_string.count
			meaningful_interval: start_index <= end_index + 1
		local
			s, e: INTEGER_32
			i: INTEGER_32
		do
			if start_index <= end_index then
				if Any_.same_types (a_string, Dummy_string) and then attached {STRING_8} a_string as l_string_8 then
						-- Plain STRING_8: add up the size of each character.
						-- Every index in `start_index' .. `end_index' is visited
						-- exactly once, whatever the parity of the bounds.
					from
						i := start_index
					until
						i > end_index
					loop
						Result := Result + character_byte_count (l_string_8.item (i))
						i := i + 1
					end
				elseif Any_.same_types (a_string, Dummy_uc_string) and then attached {UC_STRING} a_string as a_uc_string then
						-- UC_STRING: derive the size from byte indexes
						-- instead of visiting each character.
					if start_index = 1 and end_index = a_uc_string.count then
						Result := a_uc_string.byte_count
					else
						s := a_uc_string.byte_index (start_index)
						if end_index = a_uc_string.count then
							Result := a_uc_string.byte_count - s + 1
						else
							e := a_uc_string.shifted_byte_index (s, end_index - start_index + 1)
							Result := e - s
						end
					end
				elseif attached {UC_UTF8_STRING} a_string as a_utf8 then
						-- UC_UTF8_STRING: same computation as for UC_STRING.
					if start_index = 1 and end_index = a_utf8.count then
						Result := a_utf8.byte_count
					else
						s := a_utf8.byte_index (start_index)
						if end_index = a_utf8.count then
							Result := a_utf8.byte_count - s + 1
						else
							e := a_utf8.shifted_byte_index (s, end_index - start_index + 1)
							Result := e - s
						end
					end
				else
						-- Any other kind of string: go through character codes.
					from
						i := start_index
					until
						i > end_index
					loop
						Result := Result + code_byte_count (a_string.code (i).to_integer_32)
						i := i + 1
					end
				end
			end
		ensure
			instance_free: class
			substring_byte_count_positive: Result >= 0
		end

	code_byte_count (a_code: INTEGER_32): INTEGER_32
			-- Number of bytes needed to encode unicode character
			-- of code `a_code' with the UTF-8 encoding
		require
			valid_code: Unicode.valid_non_surrogate_code (a_code)
		do
			if a_code < 128 then
				Result := 1
			elseif a_code < 2048 then
				Result := 2
			elseif a_code < 65536 then
				Result := 3
			else
				Result := 4
			end
		ensure
			instance_free: class
			code_byte_count_large_enough: Result >= 1
			code_byte_count_small_enough: Result <= 4
		end

	character_byte_count (c: CHARACTER_8): INTEGER_32
			-- Number of bytes needed to encode character
			-- `c' with the UTF-8 encoding
			-- (1 for codes up to 127, 2 otherwise: the code of
			-- a CHARACTER_8 never exceeds 255)
		do
			if c <= Byte_127 then
				Result := 1
			else
				Result := 2
			end
		ensure
			instance_free: class
			character_byte_count_large_enough: Result >= 1
			character_byte_count_small_enough: Result <= 4
		end

feature -- Conversion

	to_utf8 (a_string: STRING_8): STRING_8
			-- New STRING made up of bytes corresponding to
			-- the UTF-8 representation of `a_string'
		require
			a_string_not_void: a_string /= Void
		local
			i, nb: INTEGER_32
		do
			if attached {UC_STRING} a_string as uc_string then
				Result := uc_string.to_utf8
			else
				nb := a_string.count
					-- `nb' is only the initial capacity; the result
					-- grows as multi-byte characters are appended.
				create Result.make (nb)
				from
					i := 1
				until
					i > nb
				loop
					append_code_to_utf8 (Result, a_string.item_code (i))
					i := i + 1
				end
			end
		ensure
			instance_free: class
			to_utf8_not_void: Result /= Void
			string_type: Any_.same_types (Result, "")
			valid_utf8: valid_utf8 (Result)
		end

feature -- Element change

	append_code_to_utf8 (a_utf8: STRING_8; a_code: INTEGER_32)
			-- Add UTF-8 encoded character of code `a_code'
			-- at the end of `a_utf8'.
		require
			a_utf8_not_void: a_utf8 /= Void
			a_utf8_is_string: Any_.same_types (a_utf8, "")
			a_utf8_valid: valid_utf8 (a_utf8)
			valid_code: Unicode.valid_non_surrogate_code (a_code)
		local
			b2, b3, b4: CHARACTER_8
			c: INTEGER_32
		do
				-- Trailing bytes are computed first, from the low-order
				-- 6-bit groups of the code (each offset by 128); what is
				-- left of the code goes in the first byte, offset by the
				-- marker for the sequence length (192, 224 or 240).
			inspect code_byte_count (a_code)
			when 1 then
				a_utf8.append_character (Integer_.to_character (a_code))
			when 2 then
				c := a_code
				b2 := Integer_.to_character ((c \\ 64) + 128)
				c := c // 64
				a_utf8.append_character (Integer_.to_character (c + 192))
				a_utf8.append_character (b2)
			when 3 then
				c := a_code
				b3 := Integer_.to_character ((c \\ 64) + 128)
				c := c // 64
				b2 := Integer_.to_character ((c \\ 64) + 128)
				c := c // 64
				a_utf8.append_character (Integer_.to_character (c + 224))
				a_utf8.append_character (b2)
				a_utf8.append_character (b3)
			when 4 then
				c := a_code
				b4 := Integer_.to_character ((c \\ 64) + 128)
				c := c // 64
				b3 := Integer_.to_character ((c \\ 64) + 128)
				c := c // 64
				b2 := Integer_.to_character ((c \\ 64) + 128)
				c := c // 64
				a_utf8.append_character (Integer_.to_character (c + 240))
				a_utf8.append_character (b2)
				a_utf8.append_character (b3)
				a_utf8.append_character (b4)
			end
		ensure
			instance_free: class
			a_utf8_valid: valid_utf8 (a_utf8)
		end

feature {NONE} -- Constants

		-- Note: byte constants are written with numeric character
		-- codes ('%/NNN/') so that their values do not depend on
		-- the encoding used to read this source file.

	Code_3: INTEGER_32 = 3
			-- 111110xx (2^2 - 1 = 3)

	Code_7: INTEGER_32 = 7
			-- 11110xxx (2^3 - 1 = 7)

	Code_15: INTEGER_32 = 15
			-- 1110xxxx (2^4 - 1 = 15)

	Code_31: INTEGER_32 = 31
			-- 110xxxxx (2^5 - 1 = 31)

	Byte_127: CHARACTER_8 = '%/127/'
			-- Highest ASCII character/1st UTF-8 byte

	Code_127: INTEGER_32 = 127
			-- 01111111

	Byte_143: CHARACTER_8 = '%/143/'
			-- 10001111

	Byte_159: CHARACTER_8 = '%/159/'
			-- 10011111

	Byte_191: CHARACTER_8 = '%/191/'
			-- 10111111

	Code_191: INTEGER_32 = 191
			-- 10111111

	Byte_194: CHARACTER_8 = '%/194/'
			-- 11000010

	Byte_223: CHARACTER_8 = '%/223/'
			-- 11011111

	Code_223: INTEGER_32 = 223
			-- 11011111

	Byte_224: CHARACTER_8 = '%/224/'
			-- 11100000

	Byte_237: CHARACTER_8 = '%/237/'
			-- 11101101

	Byte_239: CHARACTER_8 = '%/239/'
			-- 11101111

	Code_239: INTEGER_32 = 239
			-- 11101111

	Byte_240: CHARACTER_8 = '%/240/'
			-- 11110000

	Byte_244: CHARACTER_8 = '%/244/'
			-- 11110100

	Byte_247: CHARACTER_8 = '%/247/'
			-- 11110111

	Code_247: INTEGER_32 = 247
			-- 11110111

	Byte_251: CHARACTER_8 = '%/251/'
			-- 11111011

	Code_251: INTEGER_32 = 251
			-- 11111011

	Byte_253: CHARACTER_8 = '%/253/'
			-- 11111101

	Code_253: INTEGER_32 = 253
			-- 11111101

	Byte_255: CHARACTER_8 = '%/255/'
			-- 11111111

	Byte_ef: CHARACTER_8 = '%/239/'
			-- UTF-8 BOM first: EF

	Byte_bb: CHARACTER_8 = '%/187/'
			-- UTF-8 BOM second: BB

	Byte_bf: CHARACTER_8 = '%/191/'
			-- UTF-8 BOM third: BF

feature {NONE} -- Implementation

	Dummy_string: STRING_8 = ""
			-- Dummy string, used as type witness in `substring_byte_count'

	Dummy_uc_string: UC_STRING
			-- Dummy UC_STRING, used as type witness in `substring_byte_count'
		once
			create Result.make_empty
		ensure
			instance_free: class
			dummy_uc_string_not_void: Result /= Void
		end

end -- class UC_UTF8_ROUTINES
-- Generated by ISE EiffelStudio