note

	description: "UTF-32 encoding routines"
	library: "Gobo Eiffel Kernel Library"
	copyright: "Copyright (c) 2005-2018, Colin Adams and others"
	license: "MIT License"
	date: "$Date: 2019-02-07 22:54:15 +0000 (Thu, 07 Feb 2019) $"
	revision: "$Revision: 102807 $"

class
	UC_UTF32_ROUTINES

create
	default_create

feature {NONE} -- Initialization

	default_create
			-- Process instances of classes with no creation clause.
			-- (Default: do nothing.)
			-- (from ANY)
		do
		end

feature -- Access

	Any_: KL_ANY_ROUTINES
			-- Routines that ought to be in class ANY
			-- (from KL_IMPORTED_ANY_ROUTINES)
		once
			create Result
		ensure -- from KL_IMPORTED_ANY_ROUTINES
			instance_free: class
			any_routines_not_void: Result /= Void
		end

	Canonical_decomposition_mapping: INTEGER_32 = 0
			-- Decomposition mapping is canonical
			-- (from UC_UNICODE_CONSTANTS)

	Close_punctuation_category: INTEGER_32 = 15
			-- Close punctuation
			-- (from UC_UNICODE_CONSTANTS)

	code (first, second, third, fourth: INTEGER_32; least_endian: BOOLEAN): INTEGER_32
			-- Code point represented by four bytes.
			-- When `least_endian' is True, `first' is the least significant
			-- byte; otherwise `first' is the most significant byte.
			-- Caution: if the most significant byte is 128 or more, the value
			-- does not fit in INTEGER_32 and `code_not_negative' cannot be
			-- satisfied; callers should reject such input before calling
			-- (as `valid_utf32' does).
		require
			first_is_byte: is_byte (first)
			second_is_byte: is_byte (second)
			third_is_byte: is_byte (third)
			fourth_is_byte: is_byte (fourth)
		do
			if least_endian then
				Result := first + second * Hex_100 + third * Two_byte_offset + fourth * Three_byte_offset
			else
				Result := fourth + third * Hex_100 + second * Two_byte_offset + first * Three_byte_offset
			end
		ensure
			instance_free: class
			code_not_negative: Result >= 0
		end

	Compatibility_decomposition_mapping: INTEGER_32 = 16
			-- Decomposition mapping for unspecified compatibility character
			-- (from UC_UNICODE_CONSTANTS)

	Connector_punctuation_category: INTEGER_32 = 12
			-- Connector punctuation
			-- (from UC_UNICODE_CONSTANTS)

	Control_other_category: INTEGER_32 = 26
			-- Control character
			-- (from UC_UNICODE_CONSTANTS)

	Currency_symbol_category: INTEGER_32 = 20
			-- Currency symbol
			-- (from UC_UNICODE_CONSTANTS)

	Dash_punctuation_category: INTEGER_32 = 13
			-- Dash punctuation
			-- (from UC_UNICODE_CONSTANTS)

	Decimal_digit_number_category: INTEGER_32 = 9
			-- Decimal digit number
			-- (from UC_UNICODE_CONSTANTS)

	Encircled_decomposition_mapping: INTEGER_32 = 7
			-- Decomposition mapping for encircled form
			-- (from UC_UNICODE_CONSTANTS)

	Enclosing_mark_category: INTEGER_32 = 8
			-- Enclosing mark
			-- (from UC_UNICODE_CONSTANTS)

	Final_decomposition_mapping: INTEGER_32 = 5
			-- Decomposition mapping for Arabic final presentation form
			-- (from UC_UNICODE_CONSTANTS)

	Final_quote_punctuation_category: INTEGER_32 = 17
			-- Final quote punctuation
			-- (from UC_UNICODE_CONSTANTS)

	Font_decomposition_mapping: INTEGER_32 = 1
			-- Decomposition mapping for font variant
			-- (from UC_UNICODE_CONSTANTS)

	Format_other_category: INTEGER_32 = 27
			-- Format character
			-- (from UC_UNICODE_CONSTANTS)

	Fraction_decomposition_mapping: INTEGER_32 = 15
			-- Decomposition mapping for vulgar fraction form
			-- (from UC_UNICODE_CONSTANTS)

	generating_type: TYPE [detachable UC_UTF32_ROUTINES]
			-- Type of current object
			-- (type of which it is a direct instance)
			-- (from ANY)
		external
			"built_in"
		ensure -- from ANY
			generating_type_not_void: Result /= Void
		end

	generator: STRING_8
			-- Name of current object's generating class
			-- (base class of the type of which it is a direct instance)
			-- (from ANY)
		external
			"built_in"
		ensure -- from ANY
			generator_not_void: Result /= Void
			generator_not_empty: not Result.is_empty
		end

	Initial_decomposition_mapping: INTEGER_32 = 3
			-- Decomposition mapping for Arabic initial presentation form
			-- (from UC_UNICODE_CONSTANTS)

	Initial_quote_punctuation_category: INTEGER_32 = 16
			-- Initial quote punctuation
			-- (from UC_UNICODE_CONSTANTS)

	Integer_: KL_INTEGER_ROUTINES
			-- Routines that ought to be in class INTEGER
			-- (from KL_IMPORTED_INTEGER_ROUTINES)
		once
			create Result
		ensure -- from KL_IMPORTED_INTEGER_ROUTINES
			instance_free: class
			integer_routines_not_void: Result /= Void
		end

	Isolated_decomposition_mapping: INTEGER_32 = 6
			-- Decomposition mapping for Arabic isolated presentation form
			-- (from UC_UNICODE_CONSTANTS)

	Letter_number_category: INTEGER_32 = 10
			-- Letter number
			-- (from UC_UNICODE_CONSTANTS)

	Line_separator_category: INTEGER_32 = 24
			-- Line separator
			-- (from UC_UNICODE_CONSTANTS)

	Lowercase_letter_category: INTEGER_32 = 2
			-- Lower case letter
			-- (from UC_UNICODE_CONSTANTS)

	Math_symbol_category: INTEGER_32 = 19
			-- Mathematics symbol
			-- (from UC_UNICODE_CONSTANTS)

	Maximum_ascii_character: CHARACTER_8 = '%/127/'
			-- Largest ASCII character
			-- ensure
			--	definition: Result.code = Maximum_ascii_character_code
			-- end
			-- (from UC_UNICODE_CONSTANTS)

	Maximum_ascii_character_code: INTEGER_32 = 127
			-- Largest code for ASCII characters
			-- (2^7 - 1)
			-- ensure
			--	definition: Result = 127
			--	small_enough: Result <= Platform.Maximum_byte_code
			-- end
			-- (from UC_UNICODE_CONSTANTS)

	Maximum_bmp_character_code: INTEGER_32 = 65535
			-- Largest code for unicode characters in Basic Multi-lingual Plane (FFFF);
			-- ensure
			--	definition: Result = 65535
			-- end
			-- (from UC_UNICODE_CONSTANTS)

	Maximum_unicode_character_code: INTEGER_32 = 1114111
			-- Largest code for unicode characters (10FFFF);
			-- Includes final two non-characters.
			-- ensure
			--	definition: Result = 1114111
			-- end
			-- (from UC_UNICODE_CONSTANTS)

	Maximum_unicode_surrogate_code: INTEGER_32 = 57343
			-- Highest unicode surrogate code-point (0xDFFF)
			-- ensure
			--	definition: Result = 57343
			-- end
			-- (from UC_UNICODE_CONSTANTS)

	Medial_decomposition_mapping: INTEGER_32 = 4
			-- Decomposition mapping for Arabic medial presentation form
			-- (from UC_UNICODE_CONSTANTS)

	Minimum_ascii_character: CHARACTER_8 = '%U'
			-- Smallest ASCII character
			-- ensure
			--	definition: Result.code = Minimum_ascii_character_code
			-- end
			-- (from UC_UNICODE_CONSTANTS)

	Minimum_ascii_character_code: INTEGER_32 = 0
			-- Smallest code for ASCII characters
			-- ensure
			--	definition: Result = 0
			-- end
			-- (from UC_UNICODE_CONSTANTS)

	Minimum_unicode_character_code: INTEGER_32 = 0
			-- Smallest code for unicode characters
			-- ensure
			--	definition: Result = 0
			-- end
			-- (from UC_UNICODE_CONSTANTS)

	Minimum_unicode_surrogate_code: INTEGER_32 = 55296
			-- Lowest unicode surrogate code-point (0xD800)
			-- ensure
			--	definition: Result = 55296
			-- end
			-- (from UC_UNICODE_CONSTANTS)

	Modifier_letter_category: INTEGER_32 = 4
			-- Modifier letter
			-- (from UC_UNICODE_CONSTANTS)

	Modifier_symbol_category: INTEGER_32 = 21
			-- Modifier symbol
			-- (from UC_UNICODE_CONSTANTS)

	Narrow_decomposition_mapping: INTEGER_32 = 12
			-- Decomposition mapping for narrow (hankaku) compatibility character
			-- (from UC_UNICODE_CONSTANTS)

	No_break_decomposition_mapping: INTEGER_32 = 2
			-- Decomposition mapping for no-break variant
			-- (from UC_UNICODE_CONSTANTS)

	Non_spacing_mark_category: INTEGER_32 = 6
			-- Non-spacing mark
			-- (from UC_UNICODE_CONSTANTS)

	Open_punctuation_category: INTEGER_32 = 14
			-- Open punctuation
			-- (from UC_UNICODE_CONSTANTS)

	Other_letter_category: INTEGER_32 = 5
			-- Other letter
			-- (from UC_UNICODE_CONSTANTS)

	Other_number_category: INTEGER_32 = 11
			-- Other number
			-- (from UC_UNICODE_CONSTANTS)

	Other_punctuation_category: INTEGER_32 = 18
			-- Other punctuation
			-- (from UC_UNICODE_CONSTANTS)

	Other_symbol_category: INTEGER_32 = 22
			-- Other symbol
			-- (from UC_UNICODE_CONSTANTS)

	Paragraph_separator_category: INTEGER_32 = 25
			-- Paragraph separator
			-- (from UC_UNICODE_CONSTANTS)

	Platform: KL_PLATFORM
			-- Platform-dependent properties
			-- (from KL_SHARED_PLATFORM)
		once
			create Result
		ensure -- from KL_SHARED_PLATFORM
			instance_free: class
			platform_not_void: Result /= Void
		end

	Private_other_category: INTEGER_32 = 29
			-- Private-use character
			-- (from UC_UNICODE_CONSTANTS)

	Small_decomposition_mapping: INTEGER_32 = 13
			-- Decomposition mapping for small variant form (CNS compatibility)
			-- (from UC_UNICODE_CONSTANTS)

	Space_separator_category: INTEGER_32 = 23
			-- Space separator
			-- (from UC_UNICODE_CONSTANTS)

	Spacing_combining_mark_category: INTEGER_32 = 7
			-- Spacing combining mark
			-- (from UC_UNICODE_CONSTANTS)

	Square_decomposition_mapping: INTEGER_32 = 14
			-- Decomposition mapping for CJK squared font variant
			-- (from UC_UNICODE_CONSTANTS)

	Subscript_decomposition_mapping: INTEGER_32 = 9
			-- Decomposition mapping for subscript form
			-- (from UC_UNICODE_CONSTANTS)

	Superscript_decomposition_mapping: INTEGER_32 = 8
			-- Decomposition mapping for superscript form
			-- (from UC_UNICODE_CONSTANTS)

	Surrogate_other_category: INTEGER_32 = 28
			-- Surrogate character
			-- (from UC_UNICODE_CONSTANTS)

	Titlecase_letter_category: INTEGER_32 = 3
			-- Title case letter
			-- (from UC_UNICODE_CONSTANTS)

	Unassigned_other_category: INTEGER_32 = 0
			-- Unassigned character
			-- (from UC_UNICODE_CONSTANTS)

	Unicode: UC_UNICODE_ROUTINES
			-- Unicode routines
			-- (from UC_IMPORTED_UNICODE_ROUTINES)
		once
			create Result
		ensure -- from UC_IMPORTED_UNICODE_ROUTINES
			instance_free: class
			unicode_not_void: Result /= Void
		end

	Uppercase_letter_category: INTEGER_32 = 1
			-- Upper case letter
			-- (from UC_UNICODE_CONSTANTS)

	Vertical_decomposition_mapping: INTEGER_32 = 10
			-- Decomposition mapping for vertical layout presentation form
			-- (from UC_UNICODE_CONSTANTS)

	Wide_decomposition_mapping: INTEGER_32 = 11
			-- Decomposition mapping for wide (zenkaku) compatibility character
			-- (from UC_UNICODE_CONSTANTS)

feature -- Comparison

	frozen deep_equal (a: detachable ANY; b: like a): BOOLEAN
			-- Are `a' and `b' either both void
			-- or attached to isomorphic object structures?
			-- (from ANY)
		do
			if a = Void then
				Result := b = Void
			else
				Result := b /= Void and then a.is_deep_equal (b)
			end
		ensure -- from ANY
			instance_free: class
			shallow_implies_deep: standard_equal (a, b) implies Result
			both_or_none_void: (a = Void) implies (Result = (b = Void))
			same_type: (Result and (a /= Void)) implies (b /= Void and then a.same_type (b))
			symmetric: Result implies deep_equal (b, a)
		end

	frozen equal (a: detachable ANY; b: like a): BOOLEAN
			-- Are `a' and `b' either both void or attached
			-- to objects considered equal?
			-- (from ANY)
		do
			if a = Void then
				Result := b = Void
			else
				Result := b /= Void and then a.is_equal (b)
			end
		ensure -- from ANY
			instance_free: class
			definition: Result = (a = Void and b = Void) or else ((a /= Void and b /= Void) and then a.is_equal (b))
		end

	frozen is_deep_equal (other: UC_UTF32_ROUTINES): BOOLEAN
			-- Are `Current' and `other' attached to isomorphic object structures?
			-- (from ANY)
		require -- from ANY
			other_not_void: other /= Void
		external
			"built_in"
		ensure -- from ANY
			shallow_implies_deep: standard_is_equal (other) implies Result
			same_type: Result implies same_type (other)
			symmetric: Result implies other.is_deep_equal (Current)
		end

	is_equal (other: UC_UTF32_ROUTINES): BOOLEAN
			-- Is `other' attached to an object considered
			-- equal to current object?
			-- (from ANY)
		require -- from ANY
			other_not_void: other /= Void
		external
			"built_in"
		ensure -- from ANY
			symmetric: Result implies other ~ Current
			consistent: standard_is_equal (other) implies Result
		end

	frozen standard_equal (a: detachable ANY; b: like a): BOOLEAN
			-- Are `a' and `b' either both void or attached to
			-- field-by-field identical objects of the same type?
			-- Always uses default object comparison criterion.
			-- (from ANY)
		do
			if a = Void then
				Result := b = Void
			else
				Result := b /= Void and then a.standard_is_equal (b)
			end
		ensure -- from ANY
			instance_free: class
			definition: Result = (a = Void and b = Void) or else ((a /= Void and b /= Void) and then a.standard_is_equal (b))
		end

	frozen standard_is_equal (other: UC_UTF32_ROUTINES): BOOLEAN
			-- Is `other' attached to an object of the same type
			-- as current object, and field-by-field identical to it?
			-- (from ANY)
		require -- from ANY
			other_not_void: other /= Void
		external
			"built_in"
		ensure -- from ANY
			same_type: Result implies same_type (other)
			symmetric: Result implies other.standard_is_equal (Current)
		end

feature -- Status report

	conforms_to (other: ANY): BOOLEAN
			-- Does type of current object conform to type
			-- of `other' (as per Eiffel: The Language, chapter 13)?
			-- (from ANY)
		require -- from ANY
			other_not_void: other /= Void
		external
			"built_in"
		end

	same_type (other: ANY): BOOLEAN
			-- Is type of current object identical to type of `other'?
			-- (from ANY)
		require -- from ANY
			other_not_void: other /= Void
		external
			"built_in"
		ensure -- from ANY
			definition: Result = (conforms_to (other) and other.conforms_to (Current))
		end

	valid_utf32 (a_string: STRING_8): BOOLEAN
			-- Are the bytes in `a_string' a valid UTF-32 encoding?
			-- `a_string' has one byte per character.
			-- Default to big endian when no BOM.
			-- A four-byte group whose most significant byte is not zero
			-- is rejected without being decoded.
		require
			a_string_not_void: a_string /= Void
			a_string_is_string: Any_.same_types (a_string, "")
		local
			i, nb: INTEGER_32
			least_endian: BOOLEAN
			l_top_byte: INTEGER_32
		do
			nb := a_string.count
			Result := (nb \\ 4) = 0
			if Result and nb > 0 then
					-- Only a little-endian BOM switches the byte order;
					-- a big-endian BOM or no BOM leaves the default.
				least_endian := is_endian_detection_character_least_first (a_string.item_code (1), a_string.item_code (2), a_string.item_code (3), a_string.item_code (4))
				from
					i := 1
				until
					i > nb or not Result
				loop
					if least_endian then
						l_top_byte := a_string.item_code (i + 3)
					else
						l_top_byte := a_string.item_code (i)
					end
					if l_top_byte /= 0 then
							-- Such a value is at least `Three_byte_offset', which is
							-- above `Maximum_unicode_character_code'. Rejecting it here
							-- also keeps `code' away from top bytes of 128 or more,
							-- for which its result does not fit in INTEGER_32.
						Result := False
					else
						Result := Unicode.valid_non_surrogate_code (code (a_string.item_code (i), a_string.item_code (i + 1), a_string.item_code (i + 2), a_string.item_code (i + 3), least_endian))
					end
					i := i + 4
				end
			end
		ensure
			instance_free: class
			empty_is_true: a_string.count = 0 implies Result
			utf32_count_multiple_of_four: Result implies ((a_string.count \\ 4) = 0)
		end

feature -- Duplication

	frozen clone (other: detachable ANY): like other
		obsolete
			"Use `twin' instead. [2017-05-31]"
			-- Void if `other' is void; otherwise new object
			-- equal to `other'
			--
			-- For non-void `other', `clone' calls `copy';
			-- to change copying/cloning semantics, redefine `copy'.
			-- (from ANY)
		do
			if other /= Void then
				Result := other.twin
			end
		ensure -- from ANY
			instance_free: class
			equal: Result ~ other
		end

	copy (other: UC_UTF32_ROUTINES)
			-- Update current object using fields of object attached
			-- to `other', so as to yield equal objects.
			-- (from ANY)
		require -- from ANY
			other_not_void: other /= Void
			type_identity: same_type (other)
		external
			"built_in"
		ensure -- from ANY
			is_equal: Current ~ other
		end

	frozen deep_clone (other: detachable ANY): like other
		obsolete
			"Use `deep_twin' instead. [2017-05-31]"
			-- Void if `other' is void: otherwise, new object structure
			-- recursively duplicated from the one attached to `other'
			-- (from ANY)
		do
			if other /= Void then
				Result := other.deep_twin
			end
		ensure -- from ANY
			instance_free: class
			deep_equal: deep_equal (other, Result)
		end

	frozen deep_copy (other: UC_UTF32_ROUTINES)
			-- Effect equivalent to that of:
			--		copy (other.deep_twin)
			-- (from ANY)
		require -- from ANY
			other_not_void: other /= Void
		do
			copy (other.deep_twin)
		ensure -- from ANY
			deep_equal: deep_equal (Current, other)
		end

	frozen deep_twin: UC_UTF32_ROUTINES
			-- New object structure recursively duplicated from `Current'.
			-- (from ANY)
		external
			"built_in"
		ensure -- from ANY
			deep_twin_not_void: Result /= Void
			deep_equal: deep_equal (Current, Result)
		end

	frozen standard_clone (other: detachable ANY): like other
		obsolete
			"Use `standard_twin' instead. [2017-05-31]"
			-- Void if `other' is void; otherwise new object
			-- field-by-field identical to `other'.
			-- Always uses default copying semantics.
			-- (from ANY)
		do
			if other /= Void then
				Result := other.standard_twin
			end
		ensure -- from ANY
			instance_free: class
			equal: standard_equal (Result, other)
		end

	frozen standard_copy (other: UC_UTF32_ROUTINES)
			-- Copy every field of `other' onto corresponding field
			-- of current object.
			-- (from ANY)
		require -- from ANY
			other_not_void: other /= Void
			type_identity: same_type (other)
		external
			"built_in"
		ensure -- from ANY
			is_standard_equal: standard_is_equal (other)
		end

	frozen standard_twin: UC_UTF32_ROUTINES
			-- New object field-by-field identical to `Current'.
			-- Always uses default copying semantics.
			-- (from ANY)
		external
			"built_in"
		ensure -- from ANY
			standard_twin_not_void: Result /= Void
			equal: standard_equal (Result, Current)
		end

	frozen twin: UC_UTF32_ROUTINES
			-- New object equal to `Current'
			-- `twin' calls `copy'; to change copying/twinning semantics, redefine `copy'.
			-- (from ANY)
		external
			"built_in"
		ensure -- from ANY
			twin_not_void: Result /= Void
			is_equal: Result ~ Current
		end

feature -- Basic operations

	frozen as_attached: attached UC_UTF32_ROUTINES
		obsolete
			"Remove calls to this feature. [2017-05-31]"
			-- Attached version of `Current'.
			-- (Can be used during transitional period to convert
			-- non-void-safe classes to void-safe ones.)
			-- (from ANY)
		do
			Result := Current
		end

	frozen default: detachable UC_UTF32_ROUTINES
			-- Default value of object's type
			-- (from ANY)
		do
		end

	frozen default_pointer: POINTER
			-- Default value of type POINTER
			-- (Avoid the need to write `p.default' for
			-- some `p' of type POINTER.)
			-- (from ANY)
		do
		ensure -- from ANY
			instance_free: class
		end

	default_rescue
			-- Process exception for routines with no Rescue clause.
			-- (Default: do nothing.)
			-- (from ANY)
		do
		end

	frozen do_nothing
			-- Execute a null action.
			-- (from ANY)
		do
		ensure -- from ANY
			instance_free: class
		end

feature {NONE} -- Constants

	Hex_100: INTEGER_32 = 256
			-- 2 ^ 8

	Hex_fe: INTEGER_32 = 254
			-- Endian detection character

	Hex_ff: INTEGER_32 = 255
			-- Endian detection character

	Three_byte_offset: INTEGER_32 = 16777216
			-- 256 * 256 * 256

	Two_byte_offset: INTEGER_32 = 65536
			-- 256 * 256

feature -- Endian-ness detection

	Bom_be: STRING_8
			-- BOM in big-endian format
		once
				-- Bytes 0, 0, 254, 255 written as numeric escapes so that the
				-- value does not depend on the encoding of this source file.
			Result := "%U%U%/254/%/255/"
		ensure
			instance_free: class
			bom_be_not_void: Result /= Void
			four_bytes: Result.count = 4
			first_byte: Result.item_code (1) = 0
			second_byte: Result.item_code (2) = 0
			third_byte: Result.item_code (3) = Hex_fe
			fourth_byte: Result.item_code (4) = Hex_ff
		end

	Bom_le: STRING_8
			-- BOM in little-endian format
		once
				-- Bytes 255, 254, 0, 0 written as numeric escapes so that the
				-- value does not depend on the encoding of this source file.
			Result := "%/255/%/254/%U%U"
		ensure
			instance_free: class
			bom_le_not_void: Result /= Void
			four_bytes: Result.count = 4
			first_byte: Result.item_code (1) = Hex_ff
			second_byte: Result.item_code (2) = Hex_fe
			third_byte: Result.item_code (3) = 0
			fourth_byte: Result.item_code (4) = 0
		end

	is_byte (a: INTEGER_32): BOOLEAN
			-- Is `a' a byte?
		do
			Result := a >= 0 and a < Hex_100
		ensure
			instance_free: class
			definition: Result = (a >= 0 and a < Hex_100)
		end

	is_endian_detection_character_least_first (first, second, third, fourth: INTEGER_32): BOOLEAN
			-- Do the four bytes represent the character
			-- 0xFEFF with `first' being the least significant byte?
		require
			first_is_byte: is_byte (first)
			second_is_byte: is_byte (second)
			third_is_byte: is_byte (third)
			fourth_is_byte: is_byte (fourth)
		do
			Result := first = Hex_ff and second = Hex_fe and third = 0 and fourth = 0
		ensure
			instance_free: class
			definition: Result = (first = Hex_ff and second = Hex_fe and third = 0 and fourth = 0)
		end

	is_endian_detection_character_most_first (first, second, third, fourth: INTEGER_32): BOOLEAN
			-- Do the four bytes represent the character
			-- 0xFEFF with `first' being the most significant byte?
		require
			first_is_byte: is_byte (first)
			second_is_byte: is_byte (second)
			third_is_byte: is_byte (third)
			fourth_is_byte: is_byte (fourth)
		do
			Result := first = 0 and second = 0 and third = Hex_fe and fourth = Hex_ff
		ensure
			instance_free: class
			definition: Result = (first = 0 and second = 0 and third = Hex_fe and fourth = Hex_ff)
		end

feature -- Output

	Io: STD_FILES
			-- Handle to standard file setup
			-- (from ANY)
		once
			create Result
			Result.set_output_default
		ensure -- from ANY
			instance_free: class
			io_not_void: Result /= Void
		end

	out: STRING_8
			-- New string containing terse printable representation
			-- of current object
			-- (from ANY)
		do
			Result := tagged_out
		ensure -- from ANY
			out_not_void: Result /= Void
		end

	print (o: detachable ANY)
			-- Write terse external representation of `o'
			-- on standard output.
			-- (from ANY)
		do
			if o /= Void then
				Io.put_string (o.out)
			end
		ensure -- from ANY
			instance_free: class
		end

	frozen tagged_out: STRING_8
			-- New string containing terse printable representation
			-- of current object
			-- (from ANY)
		external
			"built_in"
		ensure -- from ANY
			tagged_out_not_void: Result /= Void
		end

feature -- Platform

	Operating_environment: OPERATING_ENVIRONMENT
			-- Objects available from the operating system
			-- (from ANY)
		once
			create Result
		ensure -- from ANY
			instance_free: class
			operating_environment_not_void: Result /= Void
		end

feature {NONE} -- Retrieval

	frozen internal_correct_mismatch
			-- Called from runtime to perform a proper dynamic dispatch on `correct_mismatch'
			-- from MISMATCH_CORRECTOR.
			-- (from ANY)
		local
			l_msg: STRING_8
			l_exc: EXCEPTIONS
		do
			if attached {MISMATCH_CORRECTOR} Current as l_corrector then
				l_corrector.correct_mismatch
			else
				create l_msg.make_from_string ("Mismatch: ")
				create l_exc
				l_msg.append (generating_type.name)
				l_exc.raise_retrieval_exception (l_msg)
			end
		end

invariant
		-- from ANY
	reflexive_equality: standard_is_equal (Current)
	reflexive_conformance: conforms_to (Current)

end -- class UC_UTF32_ROUTINES
-- Generated by ISE EiffelStudio