note

	description: "UTF-32 encoding routines"
	library: "Gobo Eiffel Kernel Library"
	copyright: "Copyright (c) 2005-2018, Colin Adams and others"
	license: "MIT License"
	date: "$Date: 2019-02-07 22:54:15 +0000 (Thu, 07 Feb 2019) $"
	revision: "$Revision: 102807 $"

class UC_UTF32_ROUTINES

inherit

	UC_UNICODE_CONSTANTS

	UC_IMPORTED_UNICODE_ROUTINES

	KL_IMPORTED_ANY_ROUTINES

	KL_IMPORTED_INTEGER_ROUTINES

create

	default_create

feature -- Status report

	valid_utf32 (a_string: STRING_8): BOOLEAN
			-- Are the bytes in `a_string' a valid UTF-32 encoding?
			-- `a_string' has one byte per character.
			-- Default to big endian when no BOM.
			--
			-- The byte order is little-endian only when the first four
			-- bytes are a little-endian BOM; otherwise big-endian is used.
			-- Every group of four bytes (including a leading BOM, if any)
			-- must decode to a code accepted by
			-- `Unicode.valid_non_surrogate_code'.
		require
			a_string_not_void: a_string /= Void
			a_string_is_string: Any_.same_types (a_string, "")
		local
			i, nb: INTEGER_32
			least_endian: BOOLEAN
			high_byte: INTEGER_32
		do
			nb := a_string.count
			Result := (nb \\ 4) = 0
			if Result and nb > 0 then
				least_endian := is_endian_detection_character_least_first (a_string.item_code (1), a_string.item_code (2), a_string.item_code (3), a_string.item_code (4))
				from
					i := 1
				until
					i > nb or else not Result
				loop
						-- Pick the most significant byte of the current group.
					if least_endian then
						high_byte := a_string.item_code (i + 3)
					else
						high_byte := a_string.item_code (i)
					end
					if high_byte > 127 then
							-- 128 * `Three_byte_offset' = 2 ^ 31 does not fit in
							-- INTEGER_32, so `code' could not honour its
							-- `code_not_negative' postcondition. Such a value is
							-- far above any Unicode code point: reject it here
							-- without calling `code'.
						Result := False
					else
						Result := Unicode.valid_non_surrogate_code (code (a_string.item_code (i), a_string.item_code (i + 1), a_string.item_code (i + 2), a_string.item_code (i + 3), least_endian))
					end
					i := i + 4
				end
			end
		ensure
			instance_free: class
			empty_is_true: a_string.count = 0 implies Result
			utf32_count_multiple_of_four: Result implies ((a_string.count \\ 4) = 0)
		end

feature -- Endian-ness detection

	Bom_be: STRING_8
			-- BOM in big-endian format
			-- (bytes 0, 0, 254, 255).
		once
				-- Numeric character escapes are used so that the byte values
				-- do not depend on the encoding of this source file.
			Result := "%U%U%/254/%/255/"
		ensure
			instance_free: class
			bom_be_not_void: Result /= Void
			four_bytes: Result.count = 4
			first_byte: Result.item_code (1) = 0
			second_byte: Result.item_code (2) = 0
			third_byte: Result.item_code (3) = Hex_fe
			fourth_byte: Result.item_code (4) = Hex_ff
		end

	Bom_le: STRING_8
			-- BOM in little-endian format
			-- (bytes 255, 254, 0, 0).
		once
				-- Numeric character escapes are used so that the byte values
				-- do not depend on the encoding of this source file.
			Result := "%/255/%/254/%U%U"
		ensure
			instance_free: class
			bom_le_not_void: Result /= Void
			four_bytes: Result.count = 4
			first_byte: Result.item_code (1) = Hex_ff
			second_byte: Result.item_code (2) = Hex_fe
			third_byte: Result.item_code (3) = 0
			fourth_byte: Result.item_code (4) = 0
		end

	is_endian_detection_character_most_first (first, second, third, fourth: INTEGER_32): BOOLEAN
			-- Do the four bytes represent the character
			-- 0xFEFF with `first' being the most significant byte?
		require
			first_is_byte: is_byte (first)
			second_is_byte: is_byte (second)
			third_is_byte: is_byte (third)
			fourth_is_byte: is_byte (fourth)
		do
			Result := first = 0 and second = 0 and third = Hex_fe and fourth = Hex_ff
		ensure
			instance_free: class
			definition: Result = (first = 0 and second = 0 and third = Hex_fe and fourth = Hex_ff)
		end

	is_endian_detection_character_least_first (first, second, third, fourth: INTEGER_32): BOOLEAN
			-- Do the four bytes represent the character
			-- 0xFEFF with `first' being the least significant byte?
		require
			first_is_byte: is_byte (first)
			second_is_byte: is_byte (second)
			third_is_byte: is_byte (third)
			fourth_is_byte: is_byte (fourth)
		do
			Result := first = Hex_ff and second = Hex_fe and third = 0 and fourth = 0
		ensure
			instance_free: class
			definition: Result = (first = Hex_ff and second = Hex_fe and third = 0 and fourth = 0)
		end

	is_byte (a: INTEGER_32): BOOLEAN
			-- Is `a' a byte, i.e. in the range 0 .. 255?
		do
			Result := a >= 0 and a < Hex_100
		ensure
			instance_free: class
			definition: Result = (a >= 0 and a < Hex_100)
		end

feature -- Access

	code (first, second, third, fourth: INTEGER_32; least_endian: BOOLEAN): INTEGER_32
			-- Code point represented by four bytes.
			-- `first' is the least significant byte if `least_endian',
			-- the most significant byte otherwise.
			--
			-- NOTE(review): when the most significant byte is 128 or more,
			-- the mathematical value is at least 2 ^ 31 and cannot be
			-- represented as an INTEGER_32, so `code_not_negative' cannot
			-- be expected to hold; callers should check that byte first.
		require
			first_is_byte: is_byte (first)
			second_is_byte: is_byte (second)
			third_is_byte: is_byte (third)
			fourth_is_byte: is_byte (fourth)
		do
			if least_endian then
				Result := first + second * 256 + third * Two_byte_offset + fourth * Three_byte_offset
			else
				Result := fourth + third * 256 + second * Two_byte_offset + first * Three_byte_offset
			end
		ensure
			instance_free: class
			code_not_negative: Result >= 0
		end

feature {NONE} -- Constants

	Hex_100: INTEGER_32 = 256
			-- 2 ^ 8

	Hex_fe: INTEGER_32 = 254
			-- Endian detection character

	Hex_ff: INTEGER_32 = 255
			-- Endian detection character

	Two_byte_offset: INTEGER_32 = 65536
			-- 256 * 256

	Three_byte_offset: INTEGER_32 = 16777216
			-- 256 * 256 * 256

end -- class UC_UTF32_ROUTINES
-- Generated by ISE EiffelStudio